Java正则表达式解析HTML示例分享

时间:2023-06-13 15:53:42 

package work;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class chuanboyi {

 public static void main(String[] args){
  // TODO Auto-generated method stub
  StringBuffer html = new StringBuffer();
  HttpClient httpclient = new HttpClient();
  //创建GET方法实例
  GetMethod getMethod = new GetMethod("https://www.jb51.net");
  //使用系统提供的默认恢复策略
  getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
  try{
   //执行GET方法
   int statusCode = httpclient.executeMethod(getMethod);
   if(statusCode != HttpStatus.SC_OK){
    System.out.println("Method is wrong " + getMethod.getStatusLine());
   }
   InputStream responseBody = getMethod.getResponseBodyAsStream();
   BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody,"utf-8"));
   String line = reader.readLine();
   while(line != null){
    html.append(line).append("\n");
    line = reader.readLine();
   }
   reader.close();
   //正则表达式
   String regex = "<form name=\"compareForm\"[\\s\\S]+>[\\s\\S]+</form>.*<script.*>";
   String regexa ="(?<=<li>)[\\s\\S]+?(?=</li>)";
   Pattern pattern = Pattern.compile(regex);
         Matcher m = pattern.matcher(html);
         StringBuffer str = new StringBuffer();
         int i = 0;
         while(m.find()){
          str.append(m.group());
         }
         pattern = Pattern.compile(regexa);
         m = pattern.matcher(str);
         while(m.find()){
          attrs(m.group());
          i++;
         }
         System.out.println("共有"+i+"条数据!");
  }catch (HttpException e) {
   // TODO: handle exception
   System.out.println("Please check your provided http address!");
   e.printStackTrace();
  }catch (IOException e) {
   // TODO: handle exception
   System.out.println("the line is wrong!");
   e.printStackTrace();
  }finally{
   getMethod.releaseConnection();//释放链接
  }
 }
 public static void attrs(String str){
  //获取url的正则表达式
  String regexURL = "[a-z]+-[0-9]+\\.html";
  //获取Name的正则表达式
  String regexName = "(?<=title=\")[[\\w-\\s][^x00-xff]]+(?=\")";
  //获取图片的正则表达式
  String regexPicture = "images.*\\.jpg";
  Pattern patternURL = Pattern.compile(regexURL);
  Pattern patternName = Pattern.compile(regexName);
  Pattern patternPicture = Pattern.compile(regexPicture);
  Matcher mURL = patternURL.matcher(str);
  Matcher mName = patternName.matcher(str);
  Matcher mPicture = patternPicture.matcher(str);
  if(mName.find()){
   System.out.println("名字:"+mName.group());
  }
  if(mURL.find()){
   System.out.println("链接:"+mURL.group());
  }
  if(mPicture.find()){
   System.out.println("图片:"+mPicture.group());
  }
 } 
}

标签:java,正则表达式
0
投稿

猜你喜欢

  • SQL Server查询条件IN中能否使用变量的示例详解

    2024-01-15 17:55:55
  • Python利用tkinter实现一个简易番茄钟的示例代码

    2021-03-02 17:20:59
  • 解决tensorflow/keras时出现数组维度不匹配问题

    2023-01-01 04:16:24
  • python 通过 pybind11 使用Eigen加速代码的步骤

    2021-03-07 06:09:30
  • python神经网络MobileNet模型的复现详解

    2022-09-24 19:54:14
  • 谈谈网页设计中的字体应用 (4) 实战应用篇·下

    2009-11-24 13:13:00
  • Linux上通过binlog文件恢复mysql数据库详细步骤

    2024-01-27 17:56:01
  • JS图片懒加载的优点及实现原理

    2024-04-18 09:45:34
  • 通过Python中的CGI接口讲解什么是WSGI

    2022-08-10 15:04:24
  • asp如何制作一个文本文件编辑器?

    2010-07-07 12:26:00
  • python 包实现JSON 轻量数据操作

    2022-11-13 05:40:32
  • 一文教你如何优雅处理Golang中的异常

    2024-02-13 21:32:33
  • tensorflow实现tensor中满足某一条件的数值取出组成新的tensor

    2021-02-18 02:09:22
  • python之从文件读取数据到list的实例讲解

    2021-11-11 08:04:26
  • Python中time与datetime模块使用方法详解

    2022-11-06 11:03:59
  • 网页图片按钮的生成与美化

    2008-12-12 13:03:00
  • python双向链表实例详解

    2023-10-28 08:25:09
  • Python 的可变和不可变对象详情

    2021-11-24 22:28:33
  • 全面解析Bootstrap表单使用方法(表单按钮)

    2024-05-10 14:08:49
  • 浅谈Python中的可变对象和不可变对象

    2021-07-17 16:58:49
  • asp之家 网络编程 m.aspxhome.com