基于C#实现网页爬虫

作者:秋荷雨翔 时间:2021-10-30 08:13:44 

本文实例为大家分享了基于C#实现网页爬虫的详细代码,供大家参考,具体内容如下:

HTTP请求工具类:

功能:

1、获取网页html

2、下载网络图片


using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP helper with two operations: fetch a page's HTML as a string, and
    /// download a remote file into the "download" folder located under the
    /// application start-up path.
    /// </summary>
    public class HttpRequestUtil
    {
        /// <summary>User-Agent header sent with every request made by this class.</summary>
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Fetches <paramref name="url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute http/https address of the page.</param>
        /// <returns>The complete response body as text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> is not an http/https address.</exception>
        /// <exception cref="WebException">The request fails or the server returns an error status.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // GetResponse() is the call that actually sends the request. The method is
            // never changed, so this is HttpWebRequest's default GET.
            // Response, stream and reader are all disposed here: an undisposed response
            // keeps its connection checked out of the per-host pool, and a multi-threaded
            // caller then stalls waiting for a free connection.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(responseStream, Encoding.UTF8))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into the "download" folder under
        /// <c>Application.StartupPath</c>, naming the file after the last path segment
        /// of the URL. Does nothing when a file with that name already exists.
        /// </summary>
        /// <param name="url">Absolute http/https address of the file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="url"/> is not an http/https address, or it ends with '/' so
        /// no file name can be derived from it.
        /// </exception>
        /// <exception cref="WebException">The request fails or the server returns an error status.</exception>
        /// <exception cref="IOException">The local file cannot be written.</exception>
        public static void HttpDownloadFile(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            // Everything after the last '/' is the file name; characters that are not
            // legal in a file name (for example '?' from a query string) become '_'.
            string fileName = ToSafeFileName(url.Substring(url.LastIndexOf('/') + 1));
            if (fileName.Length == 0)
            {
                throw new ArgumentException("The URL does not end in a file name.", "url");
            }

            string path = Path.Combine(Application.StartupPath, "download");
            Directory.CreateDirectory(path); // no-op when the folder already exists

            string filePathName = Path.Combine(path, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            // Write to a uniquely named temporary file and rename it only after the whole
            // body has arrived. An interrupted download therefore never leaves a truncated
            // file under the final name, which the File.Exists shortcut above would
            // otherwise accept as complete on every later call.
            string tempPathName = filePathName + "." + Guid.NewGuid().ToString("N") + ".part";
            bool completed = false;
            try
            {
                // GetResponse() is the call that actually sends the request.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (Stream stream = new FileStream(tempPathName, FileMode.Create))
                {
                    responseStream.CopyTo(stream);
                }

                // Another caller may have finished the same file in the meantime; keep
                // that copy and let the finally block discard the temporary one.
                if (!File.Exists(filePathName))
                {
                    File.Move(tempPathName, filePathName);
                    completed = true;
                }
            }
            finally
            {
                if (!completed)
                {
                    File.Delete(tempPathName); // does not throw when the file is absent
                }
            }
        }

        /// <summary>
        /// Creates the request object for <paramref name="url"/> and applies the shared
        /// User-Agent header. Nothing is sent until GetResponse() is called on the result.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            // WebRequest.Create returns other request types for non-http schemes; the
            // "as" cast would yield null for those, so fail with a clear message instead.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new ArgumentException("Only http/https URLs are supported.", "url");
            }

            request.UserAgent = UserAgent;
            return request;
        }

        /// <summary>
        /// Returns <paramref name="name"/> with every character that is invalid in a
        /// file name replaced by an underscore.
        /// </summary>
        private static string ToSafeFileName(string name)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder safe = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                safe.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            return safe.ToString();
        }
    }
}

多线程爬取网页代码:


using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Crawler window. button1 starts a run of ten background worker threads; each
    /// worker fetches ten list pages from tt.mop.com, follows every user link found
    /// on them and downloads the matching images through <c>HttpRequestUtil</c>.
    /// button2 stops the run, button3 pauses it and button4 resumes it.
    /// </summary>
    /// <remarks>
    /// Stop and pause are cooperative: workers check a per-run cancel flag and a
    /// shared pause signal between network operations, so a request that is already
    /// in flight finishes before the worker reacts. This replaces Thread.Suspend,
    /// Thread.Resume and Thread.Abort, which are obsolete and can freeze or kill a
    /// thread while it holds a lock or an open file.
    /// </remarks>
    public partial class Form1 : Form
    {
        /// <summary>Number of worker threads started per run.</summary>
        private const int WorkerCount = 10;

        /// <summary>Number of list pages each worker fetches.</summary>
        private const int PagesPerWorker = 10;

        /// <summary>Scheme and host that list pages and relative user links are resolved against.</summary>
        private const string SiteRoot = "http://tt.mop.com";

        // Built once and shared by all workers; Regex instances are immutable and safe
        // for concurrent matching.
        /// <summary>Matches a user link on a list page; group 2 is the href value.</summary>
        private static readonly Regex UserLinkRegex = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");

        /// <summary>Matches an image on a user page; group 1 is the src value.</summary>
        private static readonly Regex ImageRegex = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

        /// <summary>
        /// Signalled while workers may run, reset while the crawl is paused.
        /// Deliberately never disposed: a worker may still be waiting on it while the
        /// window closes.
        /// </summary>
        private readonly ManualResetEvent resumeSignal = new ManualResetEvent(true);

        /// <summary>State of the most recently started run; touched only on the UI thread.</summary>
        private CrawlRun activeRun;

        /// <summary>
        /// Counters and cancel flag of one run. Each run gets its own instance so that
        /// workers left over from a stopped run can never be revived by, or report
        /// into, a later run.
        /// </summary>
        private sealed class CrawlRun
        {
            public readonly DateTime StartedUtc = DateTime.UtcNow;
            public volatile bool Cancelled;
            public int PagesDone;
            public int PersonsDone;
            public int ImagesDone;
        }

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Starts a new crawl run.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Retire any earlier run so its workers stop and no longer touch the labels.
            StopCrawl();

            CrawlRun run = new CrawlRun();
            activeRun = run;

            button3.Enabled = true;
            button2.Enabled = true;
            button1.Enabled = false;
            lblPage.Text = "已完成页数:0";

            for (int i = 1; i <= WorkerCount; i++)
            {
                // Per-iteration copy: the loop variable itself is one variable shared
                // by every closure created in this loop.
                int workerNo = i;
                Thread worker = new Thread(new ThreadStart(delegate() { CrawlPages(run, workerNo); }));
                worker.IsBackground = true; // never keeps the process alive after the window closes
                worker.Start();
            }
        }

        /// <summary>Stops the current run and resets the buttons.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            StopCrawl();
            button1.Enabled = true;
            button2.Enabled = false;
            button3.Enabled = false;
            button4.Enabled = false;
        }

        /// <summary>Stops the workers when the window is closing.</summary>
        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            StopCrawl();
        }

        /// <summary>Pauses the run: workers block at their next checkpoint.</summary>
        private void button3_Click(object sender, EventArgs e)
        {
            resumeSignal.Reset();
            button3.Enabled = false;
            button4.Enabled = true;
        }

        /// <summary>Resumes a paused run.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            resumeSignal.Set();
            button3.Enabled = true;
            button4.Enabled = false;
        }

        /// <summary>
        /// Cancels the active run, if any, and releases the pause signal so that paused
        /// workers wake up, notice the cancellation and exit.
        /// </summary>
        private void StopCrawl()
        {
            CrawlRun run = activeRun;
            if (run != null)
            {
                run.Cancelled = true;
            }
            activeRun = null;
            resumeSignal.Set();
        }

        /// <summary>
        /// Worker thread body: processes the list pages assigned to worker
        /// <paramref name="workerNo"/> (1-based) and reports progress to the UI.
        /// </summary>
        private void CrawlPages(CrawlRun run, int workerNo)
        {
            for (int j = 1; j <= PagesPerWorker; j++)
            {
                if (!WaitIfPaused(run))
                {
                    return;
                }

                // Kept in a local: a page number shared between workers could be
                // overwritten by another thread before it is used in the URL.
                int pageNo = (workerNo - 1) * PagesPerWorker + j;
                try
                {
                    CrawlListPage(run, pageNo);
                }
                catch (Exception ex)
                {
                    // Best effort: one unreachable or malformed page must not end the worker.
                    System.Diagnostics.Trace.WriteLine("Page " + pageNo.ToString() + " failed: " + ex.Message);
                }

                if (run.Cancelled)
                {
                    return;
                }

                // Atomic increment, so exactly one worker observes the final count.
                int pagesDone = Interlocked.Increment(ref run.PagesDone);
                PostToUi(run, delegate()
                {
                    lblPage.Text = "已完成页数:" + ReadCounter(ref run.PagesDone).ToString();
                });

                if (pagesDone == WorkerCount * PagesPerWorker)
                {
                    PostToUi(run, delegate()
                    {
                        button1.Enabled = true;
                        MessageBox.Show("完成!");
                    });
                }
            }
        }

        /// <summary>
        /// Fetches one list page, visits every site-relative user link on it and
        /// downloads the images found on each user page.
        /// </summary>
        private void CrawlListPage(CrawlRun run, int pageNo)
        {
            string pageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + "/c44/0/1_" + pageNo.ToString() + ".html");

            foreach (Match userLink in UserLinkRegex.Matches(pageHtml))
            {
                if (!WaitIfPaused(run))
                {
                    return;
                }

                string url = userLink.Groups[2].Value;
                if (!url.StartsWith("/", StringComparison.Ordinal))
                {
                    continue; // only site-relative links are followed
                }

                string imgPageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + url);
                Interlocked.Increment(ref run.PersonsDone);
                PostToUi(run, delegate()
                {
                    lblPerson.Text = "已完成条数:" + ReadCounter(ref run.PersonsDone).ToString();
                });

                foreach (Match image in ImageRegex.Matches(imgPageHtml))
                {
                    if (!WaitIfPaused(run))
                    {
                        return;
                    }

                    string imgUrl = image.Groups[1].Value;
                    if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    DownloadImage(run, imgUrl);
                    Thread.Sleep(1);
                }
            }
        }

        /// <summary>
        /// Downloads one image and, on success, updates the image counter and the
        /// images-per-second label. A failed download is logged and skipped.
        /// </summary>
        private void DownloadImage(CrawlRun run, string imgUrl)
        {
            try
            {
                HttpRequestUtil.HttpDownloadFile(imgUrl);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.WriteLine("Download of " + imgUrl + " failed: " + ex.Message);
                return;
            }

            Interlocked.Increment(ref run.ImagesDone);
            PostToUi(run, delegate()
            {
                int images = ReadCounter(ref run.ImagesDone);
                lblNum.Text = "已下载图片数" + images.ToString();
                double seconds = DateTime.UtcNow.Subtract(run.StartedUtc).TotalSeconds;
                if (seconds > 0)
                {
                    lblSpeed.Text = "速度:" + (images / seconds).ToString("0.0") + "张/秒";
                }
            });
        }

        /// <summary>
        /// Checkpoint called by workers between network operations. Blocks while the
        /// crawl is paused.
        /// </summary>
        /// <returns>true to carry on; false when the run has been cancelled.</returns>
        private bool WaitIfPaused(CrawlRun run)
        {
            while (!run.Cancelled)
            {
                // Short timeout so a cancellation is noticed even if the signal is
                // never set again.
                if (resumeSignal.WaitOne(100))
                {
                    return !run.Cancelled;
                }
            }
            return false;
        }

        /// <summary>
        /// Queues <paramref name="update"/> on the UI thread without blocking the
        /// worker. The update is dropped when the run has been cancelled or the
        /// window is gone.
        /// </summary>
        private void PostToUi(CrawlRun run, Action update)
        {
            if (run.Cancelled || IsDisposed || !IsHandleCreated)
            {
                return;
            }

            try
            {
                BeginInvoke(new Action(delegate()
                {
                    // Re-checked on the UI thread: the run may have been stopped
                    // while this update was waiting in the message queue.
                    if (!run.Cancelled)
                    {
                        update();
                    }
                }));
            }
            catch (InvalidOperationException)
            {
                // The window was closed between the check above and the call
                // (ObjectDisposedException derives from InvalidOperationException);
                // there is nothing left to update.
            }
        }

        /// <summary>Reads a counter that other threads modify through Interlocked.</summary>
        private static int ReadCounter(ref int counter)
        {
            return Interlocked.CompareExchange(ref counter, 0, 0);
        }
    }
}

截图:

(此处原为程序运行截图:基于C#实现网页爬虫)

标签:C#,爬虫
0
投稿

猜你喜欢

  • Java如何跳过https的ssl证书验证详解

    2023-08-24 11:34:56
  • java设计模式--原型模式详解

    2023-11-25 05:08:24
  • 如何利用Java输出链表中倒数第k个结点

    2022-11-29 10:13:28
  • JVM类加载,垃圾回收

    2022-08-16 02:44:36
  • jsp如何获取Session中的值

    2023-07-01 05:59:28
  • java~springboot~ibatis数组in查询的实现方法

    2023-03-31 21:31:13
  • 从 JVM 中深入探究 Synchronized作用及原理

    2023-07-28 17:22:33
  • SpringCloud消息总线Bus配置中心实现过程解析

    2023-02-14 10:05:18
  • 详解Java分布式事务的 6 种解决方案

    2022-06-19 13:33:49
  • 详解springboot和vue前后端分离开发跨域登陆问题

    2023-08-07 00:48:38
  • Spring中事务几个常见的问题解决

    2022-04-07 00:52:01
  • java的Arrays工具类实战

    2023-08-21 15:46:36
  • spring boot 集成 shiro 自定义密码验证 自定义freemarker标签根据权限渲染不同页面(推荐

    2023-07-28 17:39:16
  • SpringBoot整合rockerMQ消息队列详解

    2021-10-03 10:55:14
  • java Stream流常见操作方法(反射,类加载器,类加载,反射)

    2022-03-24 06:54:56
  • java解析json数组方式

    2023-08-10 17:22:14
  • Spring Cloud Ribbon配置详解

    2023-11-25 01:32:50
  • Spring Boot配置线程池拒绝策略的场景分析(妥善处理好溢出的任务)

    2022-08-05 07:12:24
  • Java Socket使用加密协议进行传输对象的方法

    2023-11-28 12:47:44
  • Android自定义Banner轮播效果

    2023-08-05 23:34:06
  • asp之家 软件编程 m.aspxhome.com