C#实现抓取和分析网页类实例

作者:陌香 时间:2023-09-21 04:27:32 

本文实例讲述了C#实现抓取和分析网页类。分享给大家供大家参考。具体分析如下:

这里介绍了抓取和分析网页的类。

其主要功能有:

1、提取网页的纯文本,去所有html标签和javascript代码
2、提取网页的链接,包括href和frame及iframe
3、提取网页的title等(其它的标签可依此类推,正则是一样的)
4、可以实现简单的表单提交及cookie保存


/*
* Author:Sunjoy at CCNU
* 如果您改进了这个类请发一份代码给我(ccnusjy 在gmail.com)
*/
using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
/// <summary>
/// 网页类
/// </summary>
public class WebPage
{
#region 私有成员
private Uri m_uri; //网址
private List<Link> m_links; //此网页上的链接
private string m_title;  //此网页的标题
private string m_html;   //此网页的HTML代码
private string m_outstr;  //此网页可输出的纯文本
private bool m_good;   //此网页是否可用
private int m_pagesize;  //此网页的大小
private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有网页的Cookie
private string m_post; //此网页的登陆页需要的POST数据
private string m_loginurl; //此网页的登陆页
#endregion
#region 私有方法
/// <summary>
/// 这私有方法从网页的HTML代码中分析出链接信息
/// </summary>
/// <returns>List<Link></returns>
private List<Link> getLinks()
{
 if (m_links.Count == 0)
 {
  Regex[] regex = new Regex[2];
  regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
  regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
  for (int i = 0; i < 2; i++)
  {
   Match match = regex[i].Match(m_html);
   while (match.Success)
   {
    try
    {
     string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
     string text = "";
     if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");
     Link link = new Link(url, text);
     m_links.Add(link);
    }
    catch(Exception ex){Console.WriteLine(ex.Message); };
    match = match.NextMatch();
   }
  }
 }
 return m_links;
}
/// <summary>
/// 此私有方法从一段HTML文本中提取出一定字数的纯文本
/// </summary>
/// <param name="instr">HTML代码</param>
/// <param name="firstN">提取从头数多少个字</param>
/// <param name="withLink">是否要链接里面的字</param>
/// <returns>纯文本</returns>
private string getFirstNchar(string instr, int firstN, bool withLink)
{
 if (m_outstr == "")
 {
  m_outstr = instr.Clone() as string;
  m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");
  m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");
  m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, "");
  if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
  Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
  m_outstr = objReg.Replace(m_outstr, "");
  Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
  m_outstr = objReg2.Replace(m_outstr, " ");
 }
 return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
}
/// <summary>
/// 此私有方法返回一个IP地址对应的无符号整数
/// </summary>
/// <param name="x">IP地址</param>
/// <returns></returns>
private uint getuintFromIP(IPAddress x)
{
 Byte[] bt = x.GetAddressBytes();
 uint i = (uint)(bt[0] * 256 * 256 * 256);
 i += (uint)(bt[1] * 256 * 256);
 i += (uint)(bt[2] * 256);
 i += (uint)(bt[3]);
 return i;
}
#endregion
#region 公有文法
/// <summary>
/// 此公有方法提取网页中一定字数的纯文本,包括链接文字
/// </summary>
/// <param name="firstN">字数</param>
/// <returns></returns>
public string getContext(int firstN)
{
 return getFirstNchar(m_html, firstN, true);
}
/// <summary>
/// 此公有方法提取网页中一定字数的纯文本,不包括链接文字
/// </summary>
/// <param name="firstN"></param>
/// <returns></returns>
public string getContextWithOutLink(int firstN)
{
 return getFirstNchar(m_html, firstN, false);
}
/// <summary>
/// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式
/// </summary>
/// <param name="pattern">正则式</param>
/// <param name="count">返回的链接的个数</param>
/// <returns>List<Link></returns>
public List<Link> getSpecialLinksByUrl(string pattern,int count)
{
 if(m_links.Count==0)getLinks();
 List<Link> SpecialLinks = new List<Link>();
 List<Link>.Enumerator i;
 i = m_links.GetEnumerator();
 int cnt = 0;
 while (i.MoveNext() && cnt<count)
 {
  if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.url).Success)
  {
   SpecialLinks.Add(i.Current);
   cnt++;
  }
 }
 return SpecialLinks;
}
/// <summary>
/// 此公有方法从本网页的链接中提取一定数量的链接,该链接的文字满足某正则式
/// </summary>
/// <param name="pattern">正则式</param>
/// <param name="count">返回的链接的个数</param>
/// <returns>List<Link></returns>
public List<Link> getSpecialLinksByText(string pattern,int count)
{
 if (m_links.Count == 0) getLinks();
 List<Link> SpecialLinks = new List<Link>();
 List<Link>.Enumerator i;
 i = m_links.GetEnumerator();
 int cnt = 0;
 while (i.MoveNext() && cnt < count)
 {
  if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.text).Success)
  {
   SpecialLinks.Add(i.Current);
   cnt++;
  }
 }
 return SpecialLinks;
}
/// <summary>
/// 此公有方法获得所有链接中在一定IP范围的链接
/// </summary>
/// <param name="_ip_start">起始IP</param>
/// <param name="_ip_end">终止IP</param>
/// <returns></returns>
public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
{
 IPAddress ip_start = IPAddress.Parse(_ip_start);
 IPAddress ip_end = IPAddress.Parse(_ip_end);
 if (m_links.Count == 0) getLinks();
 List<Link> SpecialLinks = new List<Link>();
 List<Link>.Enumerator i;
 i = m_links.GetEnumerator();
 while (i.MoveNext())
 {
  IPAddress ip;
  try
  {
   ip = Dns.GetHostEntry(new Uri(i.Current.url).Host).AddressList[0];
  }
  catch { continue; }
  if(getuintFromIP(ip)>=getuintFromIP(ip_start) && getuintFromIP(ip)<=getuintFromIP(ip_end))
  {
   SpecialLinks.Add(i.Current);
  }
 }
 return SpecialLinks;
}
/// <summary>
/// 这公有方法提取本网页的纯文本中满足某正则式的文字
/// </summary>
/// <param name="pattern">正则式</param>
/// <returns>返回文字</returns>
public string getSpecialWords(string pattern)
{
 if (m_outstr == "") getContext(Int16.MaxValue);
 Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase );
 Match mc=regex.Match(m_outstr);
 if (mc.Success)
  return mc.Groups[1].Value;
 return string.Empty;
}
#endregion
#region 构造函数
private void Init(string _url)
{
 try
 {
  m_uri = new Uri(_url);
  m_links = new List<Link>();
  m_html = "";
  m_outstr = "";
  m_title = "";
  m_good = true;
  if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
  {
   m_good = false;
   return;
  }
  HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
  rqst.AllowAutoRedirect = true;
  rqst.MaximumAutomaticRedirections = 3;
  rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
  rqst.KeepAlive = true;
  rqst.Timeout = 30000;
  lock (WebPage.webcookies)
  {
   if (WebPage.webcookies.ContainsKey(m_uri.Host))
    rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
   else
   {
    CookieContainer cc = new CookieContainer();
    WebPage.webcookies[m_uri.Host] = cc;
    rqst.CookieContainer = cc;
   }
  }
  HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
  Stream sm = rsps.GetResponseStream();
  if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
  {
   rsps.Close();
   m_good = false;
   return;
  }
  Encoding cding = System.Text.Encoding.Default;
  string contenttype=rsps.ContentType.ToLower();
  int ix = contenttype.IndexOf("charset=");
  if (ix != -1)
  {
   try
   {
    cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
   }
   catch
   {
    cding = Encoding.Default;
   }
   m_html = new StreamReader(sm, cding).ReadToEnd();
  }
  else
  {
   m_html = new StreamReader(sm, cding).ReadToEnd();
   Regex regex = new Regex("charset=(?<cding>[^=]+)?\"",RegexOptions.IgnoreCase);
   string strcding = regex.Match(m_html).Groups["cding"].Value;
   try
   {
    cding = Encoding.GetEncoding(strcding);
   }
   catch{
    cding = Encoding.Default;
   }
   byte[] bytes=Encoding.Default.GetBytes(m_html.ToCharArray());
   m_html = cding.GetString(bytes);
   if (m_html.Split('?').Length > 100)
   {
    m_html=Encoding.Default.GetString(bytes);
   }
  }

m_pagesize = m_html.Length;
  m_uri = rsps.ResponseUri;
  rsps.Close();
 }
 catch (Exception ex)
 {
  Console.WriteLine(ex.Message+m_uri.ToString());
  m_good = false;
 }
}
public WebPage(string _url)
{
 string uurl = "";
 try
 {
  uurl = Uri.UnescapeDataString(_url);
  _url = uurl;
 }
 catch { };
 Regex re = new Regex("(?<h>[^\x00-\xff]+)");
 Match mc = re.Match(_url);
 if (mc.Success)
 {
  string han = mc.Groups["h"].Value;
  _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
 }
 Init(_url);
}
public WebPage(string _url, string _loginurl, string _post)
{
 string uurl = "";
 try
 {
  uurl = Uri.UnescapeDataString(_url);
  _url = uurl;
 }
 catch { };
 Regex re = new Regex("(?<h>[^\x00-\xff]+)");
 Match mc = re.Match(_url);
 if (mc.Success)
 {
  string han = mc.Groups["h"].Value;
  _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
 }
 if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))
 {
  Init(_url);
 }
 else
 {
  #region 登陆
  string indata = _post;
  m_post = _post;
  m_loginurl = _loginurl;
  byte[] bytes = Encoding.Default.GetBytes(_post);
  CookieContainer myCookieContainer = new CookieContainer();
  try
  {
   //新建一个CookieContainer来存放Cookie集合
   HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
   //新建一个HttpWebRequest
   myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
   myHttpWebRequest.AllowAutoRedirect = false;
   myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
   myHttpWebRequest.Timeout = 60000;
   myHttpWebRequest.KeepAlive = true;
   myHttpWebRequest.ContentLength = bytes.Length;
   myHttpWebRequest.Method = "POST";
   myHttpWebRequest.CookieContainer = myCookieContainer;
   //设置HttpWebRequest的CookieContainer为刚才建立的那个myCookieContainer
   Stream myRequestStream = myHttpWebRequest.GetRequestStream();
   myRequestStream.Write(bytes, 0, bytes.Length);
   myRequestStream.Close();
   HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
   foreach (Cookie ck in myHttpWebResponse.Cookies)
   {
    myCookieContainer.Add(ck);
   }
   myHttpWebResponse.Close();
  }
  catch
  {
   Init(_url);
   return;
  }
  #endregion
  #region 登陆后再访问页面
  try
  {
   m_uri = new Uri(_url);
   m_links = new List<Link>();
   m_html = "";
   m_outstr = "";
   m_title = "";
   m_good = true;
   if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
   {
    m_good = false;
    return;
   }
   HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
   rqst.AllowAutoRedirect = true;
   rqst.MaximumAutomaticRedirections = 3;
   rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
   rqst.KeepAlive = true;
   rqst.Timeout = 30000;
   rqst.CookieContainer = myCookieContainer;
   lock (WebPage.webcookies)
   {
    WebPage.webcookies[m_uri.Host] = myCookieContainer;
   }
   HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
   Stream sm = rsps.GetResponseStream();
   if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
   {
    rsps.Close();
    m_good = false;
    return;
   }
   Encoding cding = System.Text.Encoding.Default;
   int ix = rsps.ContentType.ToLower().IndexOf("charset=");
   if (ix != -1)
   {
    try
    {
     cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
    }
    catch
    {
     cding = Encoding.Default;
    }
   }
   m_html = new StreamReader(sm, cding).ReadToEnd();
   m_pagesize = m_html.Length;
   m_uri = rsps.ResponseUri;
   rsps.Close();
  }
  catch (Exception ex)
  {
   Console.WriteLine(ex.Message+m_uri.ToString());
   m_good = false;
  }
  #endregion
 }
}
#endregion
#region 属性
/// <summary>
/// 通过此属性可获得本网页的网址,只读
/// </summary>
public string URL
{
 get
 {
  return m_uri.AbsoluteUri;
 }
}
/// <summary>
/// 通过此属性可获得本网页的标题,只读
/// </summary>
public string Title
{
 get
 {
  if (m_title == "")
  {
   Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase );
   Match mc = reg.Match(m_html);
   if (mc.Success)
    m_title= mc.Groups["title"].Value.Trim();
  }
  return m_title;
 }
}
/// <summary>
/// 此属性获得本网页的所有链接信息,只读
/// </summary>
public List<Link> Links
{
 get
 {
  if (m_links.Count == 0) getLinks();
  return m_links;
 }
}
/// <summary>
/// 此属性返回本网页的全部纯文本信息,只读
/// </summary>
public string Context
{
 get
 {
  if (m_outstr == "") getContext(Int16.MaxValue);
  return m_outstr;
 }
}
/// <summary>
/// 此属性获得本网页的大小
/// </summary>
public int PageSize
{
 get
 {
  return m_pagesize;
 }
}
/// <summary>
/// 此属性获得本网页的所有站内链接
/// </summary>
public List<Link> InsiteLinks
{
 get
 {
  return getSpecialLinksByUrl("^http://"+m_uri.Host,Int16.MaxValue);
 }
}
/// <summary>
/// 此属性表示本网页是否可用
/// </summary>
public bool IsGood
{
 get
 {
  return m_good;
 }
}
/// <summary>
/// 此属性表示网页的所在的网站
/// </summary>
public string Host
{
 get
 {
  return m_uri.Host;
 }
}
/// <summary>
/// 此网页的登陆页所需的POST数据
/// </summary>
public string PostStr
{
 get
 {
  return m_post;
 }
}
/// <summary>
/// 此网页的登陆页
/// </summary>
public string LoginURL
{
 get
 {
  return m_loginurl;
 }
}
#endregion
}
/// <summary>
/// 链接类
/// </summary>
public class Link
{
public string url; //链接网址
public string text; //链接文字
public Link(string _url, string _text)
{
 url = _url;
 text = _text;
}
}

希望本文所述对大家的C#程序设计有所帮助。

标签:C#,抓取,分析,网页
0
投稿

猜你喜欢

  • Java类的定义以及执行顺序学习教程

    2023-01-14 10:44:29
  • maven手动上传jar包示例及图文步骤过程

    2023-11-13 22:10:57
  • Android Glide的简单使用

    2022-12-22 01:37:14
  • Android进阶事件分发机制解决事件冲突

    2021-08-22 06:12:34
  • Java如何解决发送Post请求报Stream closed问题

    2021-12-12 04:20:10
  • Android中的Notification机制深入理解

    2021-12-03 14:14:22
  • C#实现简单串口通信

    2023-05-14 08:15:27
  • SpringBoot 2.x整合Log4j2日志的详细步骤

    2022-08-05 09:50:19
  • Java开发岗位面试被问到反射怎么办

    2023-01-07 00:09:12
  • JAVA使用hutool工具实现查询树结构数据(省市区)

    2021-11-04 18:30:09
  • C#函数式编程中的部分应用详解

    2023-12-01 23:02:44
  • Java经典面试题最全汇总208道(四)

    2023-11-08 23:59:26
  • Java贪吃蛇游戏完善版

    2023-04-12 03:07:53
  • 5个主流的Java开源IDE工具详解

    2021-10-13 06:06:50
  • Android 异步获取网络图片并处理导致内存溢出问题解决方法

    2022-10-11 07:58:14
  • Android 显示GIF图片实例详解

    2023-08-06 09:11:52
  • Java中Future和FutureTask的示例详解及使用

    2023-01-29 11:48:42
  • 如何使用ByteArrayOutputStream下载文件

    2021-07-29 11:32:45
  • C#算法之散列表

    2022-07-30 19:05:52
  • 使用DateTime的ParseExact方法实现特殊日期时间的方法详解

    2021-10-06 01:48:54
  • asp之家 软件编程 m.aspxhome.com