Python如何使用队列方式实现多线程爬虫

作者：Norni 时间：2022-03-24 08:56:51　

说明：糗事百科段子的爬取，采用了队列和多线程的方式。其中关键点是Queue.task_done()和Queue.join()：工作线程每处理完一个任务就调用一次task_done()，主线程通过join()阻塞，直到队列中所有任务都被处理完毕后才退出。

代码如下

import json
import threading
from queue import Queue
from urllib.parse import urljoin

import requests
from lxml import etree

class Qsbk(object):
    """Crawl qiushibaike.com text pages with a queue-driven thread pipeline.

    Three queues connect the stages:
    ``url_queue`` (page URLs) -> ``html_queue`` (parsed lxml trees) ->
    ``content_queue`` (lists of item dicts).

    The worker methods (``parse_url``, ``get_content``, ``save_items``) loop
    forever and are meant to run as daemon threads. ``run`` waits on
    ``Queue.join()`` for each stage rather than joining the threads, so every
    worker must call ``task_done()`` exactly once per item it takes, even
    when processing that item fails.
    """

    # Base used to resolve relative / protocol-relative links found in pages.
    BASE_URL = "https://www.qiushibaike.com/"
    URL_TEMPLATE = "https://www.qiushibaike.com/text/page/{}/"

    def __init__(self):
        """Create the request headers and the three pipeline queues."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
            "Referer": "https://www.qiushibaike.com/",
        }
        # One queue per pipeline stage.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_total_url(self):
        """Put the URLs of pages 1 through 12 on ``url_queue``.

        Nothing is returned; the URLs are only stored in the queue.
        """
        for page in range(1, 13):
            self.url_queue.put(self.URL_TEMPLATE.format(page))

    def parse_url(self):
        """Worker loop: fetch each queued URL and queue its parsed HTML tree.

        Takes a URL from ``url_queue``, downloads it with ``requests``,
        parses the body with ``etree.HTML`` and puts the result on
        ``html_queue``. Never returns; run it in a daemon thread.
        """
        # Loop forever on purpose. ``Queue.not_empty`` is a Condition object
        # and is always truthy, so it cannot be used as an emptiness test;
        # the blocking ``get()`` below is what paces the loop.
        while True:
            url = self.url_queue.get()
            try:
                print("parsing url:", url)
                response = requests.get(url, headers=self.headers, timeout=10)
                html = etree.HTML(response.content.decode())
                # Enqueue downstream work *before* task_done() so that when
                # url_queue.join() returns, html_queue already holds it.
                self.html_queue.put(html)
            except Exception as exc:
                # Worker boundary: report and keep the thread alive so the
                # remaining URLs are still processed.
                print("failed to fetch url:", url, exc)
            finally:
                # Always balance get(), otherwise url_queue.join() hangs.
                self.url_queue.task_done()

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or None when it is empty."""
        return values[0] if values else None

    def _parse_item(self, div):
        """Build one item dict from a single post ``div`` element.

        Every field is None when its XPath query matches nothing.
        """
        first = self._first

        author_img = first(div.xpath(".//a[@rel='nofollow']/img/@src"))
        if author_img is not None:
            # urljoin handles protocol-relative ("//host/x"), root-relative
            # and absolute values; plain "https" + src never formed a URL.
            author_img = urljoin(self.BASE_URL, author_img)

        author_name = first(div.xpath(".//a[@rel='nofollow']/img/@alt"))

        author_href = first(div.xpath("./a/@href"))
        if author_href is not None:
            # Avoids the doubled slash that string concatenation produced
            # for root-relative hrefs.
            author_href = urljoin(self.BASE_URL, author_href)

        author_gender = first(div.xpath("./div[1]/div/@class"))
        if author_gender is not None:
            # Last CSS class with the "Icon" part removed, e.g. "manIcon" -> "man".
            author_gender = author_gender.split(" ")[-1].replace("Icon", "").strip()

        author_age = first(div.xpath("./div[1]/div/text()"))

        content = first(div.xpath("./a/div/span/text()"))
        if content is not None:
            content = content.strip()

        content_vote = first(
            div.xpath("./div[@class='stats']/span[@class='stats-vote']/i/text()")
        )
        content_comment_numbers = first(
            div.xpath("./div[@class='stats']/span[@class='stats-comments']/a/i/text()")
        )

        return {
            "author_name": author_name,
            "author_age": author_age,
            "author_gender": author_gender,
            "author_img": author_img,
            "author_href": author_href,
            "content": content,
            "content_vote": content_vote,
            "content_comment_numbers": content_comment_numbers,
        }

    def get_content(self):
        """Worker loop: turn each queued HTML tree into a list of item dicts.

        Takes a tree from ``html_queue``, extracts one dict per post ``div``
        and puts the resulting list (possibly empty) on ``content_queue``.
        Never returns; run it in a daemon thread.
        """
        while True:
            html = self.html_queue.get()
            try:
                total_div = html.xpath("//div[@class='col1 old-style-col1']/div")
                items = [self._parse_item(div) for div in total_div]
                self.content_queue.put(items)
            except Exception as exc:
                # Worker boundary: one bad page must not stop the stage.
                print("failed to extract content:", exc)
            finally:
                # Always balance get(), otherwise html_queue.join() hangs.
                self.html_queue.task_done()

    def save_items(self):
        """Worker loop: append each queued batch of items to quishibaike.txt.

        Each item is written as indented JSON followed by a newline so that
        consecutive records do not run together. Never returns; run it in a
        daemon thread.
        """
        while True:
            items = self.content_queue.get()
            try:
                with open("quishibaike.txt", 'a', encoding='utf-8') as f:
                    for item in items:
                        json.dump(item, f, ensure_ascii=False, indent=2)
                        f.write("\n")
            except Exception as exc:
                # Worker boundary: keep consuming so content_queue.join()
                # can still complete.
                print("failed to save items:", exc)
            finally:
                self.content_queue.task_done()

    def run(self):
        """Run the whole pipeline and block until every queue is drained."""
        # Fill the URL queue synchronously. If a thread did this, the
        # url_queue.join() below could run while the queue had no unfinished
        # tasks yet and return immediately.
        self.get_total_url()

        # 10 fetchers, 1 extractor, 1 writer.
        targets = [self.parse_url] * 10 + [self.get_content, self.save_items]
        for target in targets:
            # Daemon threads: the workers never return, so they must not
            # keep the process alive once the main thread is done.
            threading.Thread(target=target, daemon=True).start()

        # Join in pipeline order. Each stage enqueues downstream work before
        # calling task_done(), so when one join() returns the next queue
        # already holds everything it will ever receive.
        self.url_queue.join()
        self.html_queue.join()
        self.content_queue.join()

if __name__ == "__main__":
    # Script entry point: build the crawler and run the full pipeline.
    Qsbk().run()

来源：https://www.cnblogs.com/nuochengze/p/12861358.html

标签：Python,队列,多线程,爬虫

投稿

Python如何使用队列方式实现多线程爬虫

猜你喜欢

python模块之re正则表达式详解

python实现壁纸批量下载代码实例

SQL Server复制功能巧妙选择避开Bug

如何在Windows中安装多个python解释器

Python实现PS滤镜中马赛克效果示例

利用rpm打包上线部署golang代码的方法教程

Mysql写入数据十几秒后被自动删除了如何解决

js链表操作(实例讲解)

golang gorm的预加载及软删硬删的数据操作示例

django用户登录和注销的实现方法

CSS兼容性(IE和Firefox)技巧大全

Python 列表去重去除空字符的例子

python有证书的加密解密实现方法

在asp.net中KindEditor编辑器的使用方法小结

Golang定时器Timer与Ticker的使用详解

uniapp项目打包为桌面应用的方法步骤

win2003 Server配置SQL Server 2005远程连接的方法

PHP常量及变量区别原理详解

python 操作 mongodb 数据库详情

解决pycharm同一目录下无法import其他文件

Python如何使用队列方式实现多线程爬虫

猜你喜欢

python模块之re正则表达式详解

python实现壁纸批量下载代码实例

SQL Server复制功能 巧妙选择避开Bug

如何在Windows中安装多个python解释器

Python实现PS滤镜中马赛克效果示例

利用rpm打包上线部署golang代码的方法教程

Mysql写入数据十几秒后被自动删除了如何解决

js链表操作(实例讲解)

golang gorm的预加载及软删硬删的数据操作示例

django用户登录和注销的实现方法

CSS兼容性(IE和Firefox)技巧大全

Python 列表去重去除空字符的例子

python有证书的加密解密实现方法

在asp.net中KindEditor编辑器的使用方法小结

Golang定时器Timer与Ticker的使用详解

uniapp项目打包为桌面应用的方法步骤

win2003 Server配置SQL Server 2005远程连接的方法

PHP常量及变量区别原理详解

python 操作 mongodb 数据库详情

解决pycharm同一目录下无法import其他文件

SQL Server复制功能巧妙选择避开Bug