A Detailed Guide to Implementing a Vertical Crawler System in Python

Author: bwt_D    Date: 2023-11-17 13:33:38

html_downloader

from urllib import request

def download(url):
    # Fetch the raw HTML of a page; return None for a missing URL or a non-200 response
    if url is None:
        return None
    response = request.urlopen(url)
    if response.getcode() != 200:
        return None
    return response.read()
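
The downloader can be exercised on its own. A minimal sketch, assuming the code above is saved as html_downloader.py (the slice length and the decode call below are only illustrative):

import html_downloader

html_bytes = html_downloader.download('http://news.zzuli.edu.cn/')
if html_bytes:
    # download() returns raw bytes, so decode before printing
    print(html_bytes[:200].decode('utf-8', errors='ignore'))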

html_outputer

data_list = []

def collect_data(data):
    # Buffer one parsed record (a dict with url/title/datetime/visitcount)
    if data is None:
        return
    data_list.append(data)

def output_html():
    # Write every collected record into a simple HTML table
    fout = open('output.html', 'w', encoding='utf-8')
    fout.write('<html>')
    fout.write('<head><meta charset="utf-8"></head>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
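
A quick way to try the outputer in isolation; the record below is made up purely for illustration:

import html_outputer

html_outputer.collect_data({
    'url': 'http://news.zzuli.edu.cn/example/page.htm',   # hypothetical record
    'title': 'Example title',
    'datetime': '2022-03-01',
    'visitcount': '100',
})
html_outputer.output_html()   # writes output.html in the current directory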

html_parser

import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    # Collect in-site article links that match the news-page URL pattern
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    # Extract the title, publish time and visit count of an article page
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text() if datetime_node else ''
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text() if visitcount_node else ''
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    # Return (new URLs to crawl, data extracted from this page);
    # return empty containers on bad input so the caller can always unpack the result
    if page_url is None or html_cont is None:
        return set(), {}
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
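
A small self-contained check of the parser; the HTML below is a hand-written snippet that mimics the class names the parser expects, not a real page:

import html_parser

sample = b'''
<html><body>
<h1 class="arti-title">Sample article</h1>
<span class="arti-update">2022-03-01</span>
<span class="WP_VisitCount">42</span>
<a href="/2022/0301/c123a456/page.htm">next</a>
</body></html>
'''
new_urls, new_data = html_parser.parse('http://news.zzuli.edu.cn/index.htm', sample)
print(new_urls)   # {'http://news.zzuli.edu.cn/2022/0301/c123a456/page.htm'}
print(new_data)   # title/datetime/visitcount/url of the sample page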

spider_main

import urls_manager, html_downloader, html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # Start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        try:
            html_cont = html_downloader.download(new_url)
            new_urls, new_data = html_parser.parse(new_url, html_cont)
            urls_manager.add_new_urls(new_urls)
            if new_data:
                html_outputer.collect_data(new_data)
        except Exception as e:
            # Skip pages that fail to download or parse instead of aborting the crawl
            print('craw failed: %s' % e)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
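
To run the crawler, the modules need to sit in one directory with file names matching the imports; the layout below is an assumed example:

# Assumed project layout (hypothetical directory name):
#   crawler/
#       urls_manager.py
#       html_downloader.py
#       html_parser.py
#       html_outputer.py
#       spider_main.py
# Run the entry script; the crawl results are written to output.html:
#   python spider_main.py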

test_64

from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())
print('Get the lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())
print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())
print('Get the p paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())

urls_manager

new_urls = set()    # URLs waiting to be crawled
old_urls = set()    # URLs that have already been crawled

def add_new_url(url):
    # Add a single URL, skipping duplicates and already-crawled pages
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    # Add a batch of URLs
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    # Pop one pending URL and mark it as crawled
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
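
A brief sketch of the URL manager's deduplication behavior (the URL is just the site root used elsewhere in the article):

import urls_manager

urls_manager.add_new_url('http://news.zzuli.edu.cn/')
urls_manager.add_new_url('http://news.zzuli.edu.cn/')   # duplicate, silently ignored
print(urls_manager.has_new_url())   # True
print(urls_manager.get_new_url())   # http://news.zzuli.edu.cn/
print(urls_manager.has_new_url())   # False, the URL has moved to old_urls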

Source: https://blog.csdn.net/bwt_D/article/details/123238555

Tags: python, vertical, crawler, system