Scraping book covers with Python

Author: 萌妹子哦哦  Date: 2023-06-08 16:40:44

This post shares a complete working example of scraping book cover images with Python; the code is given below for reference.

kongfuzi.py

The downloader works around the site's anti-scraping measures by rotating proxy IPs, backing off between retries, and randomizing the request headers.
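
The key requests features this relies on are custom headers and the proxies mapping. Here is a minimal sketch of that core call before the full class; the proxy address is just one entry from the pool below:


import requests

headers = {'User-Agent': 'Mozilla/5.0'}           # spoofed browser header
proxies = {'http': 'http://191.33.179.242:8080'}  # route http traffic through a proxy
resp = requests.get('http://search.kongfz.com', headers=headers,
                    proxies=proxies, timeout=20)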


import requests
import random
import time


class DownLoad:
    def __init__(self):
        # Pool of proxy addresses (host:port) to rotate through
        self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80', '189.8.88.125:65301',
                        '36.66.55.181:8080', '170.84.102.5:8080', '177.200.72.214:20183', '115.229.115.190:9000']

        # Pool of User-Agent values; one is picked at random per request
        self.user_agent_list = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        ]

    def get(self, url, proxy=None, timeout=20, num=5):
        print("Requesting %s" % url)
        headers = {'User-Agent': random.choice(self.user_agent_list)}

        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    # Back off, then retry without a proxy
                    time.sleep(10)
                    return self.get(url, timeout=timeout, num=num - 1)
                else:
                    # Direct retries exhausted: back off and switch to a random proxy
                    time.sleep(10)
                    proxy = {'http': 'http://' + random.choice(self.ip_list)}
                    return self.get(url, proxy=proxy, timeout=timeout)
        else:
            try:
                proxy = {'http': 'http://' + random.choice(self.ip_list)}
                # requests takes the proxy mapping via the `proxies` keyword
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    proxy = {'http': 'http://' + random.choice(self.ip_list)}
                    print("Switching proxy")
                    print("Current proxy: %s" % proxy)
                    return self.get(url, proxy=proxy, timeout=timeout, num=num - 1)
                # All retries exhausted
                return None
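
For a quick standalone sanity check of the downloader you can call DownLoad.get() directly. This is only a minimal sketch; the search URL matches the one main.py builds, and the status-code print is for illustration:


if __name__ == '__main__':
    d = DownLoad()
    # Same search URL that main.py constructs for the keyword "python"
    resp = d.get("http://search.kongfz.com/product_result/?select=0&key=python")
    if resp is not None:
        print(resp.status_code)

Because of the __main__ guard, this block does not run when main.py imports kongfuzi.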

main.py

Saves the scraped cover images locally, then displays them in a tkinter window.


import kongfuzi
import os
import requests
import bs4
from tkinter import *
from PIL import Image, ImageTk

# Download the images and build the image-path and book-info lists
def download():
    baseUrl = "http://search.kongfz.com"
    keyword = e1.get()
    url = baseUrl + "/product_result/?select=0&key=" + keyword
    print("Search URL: " + url)
    show(url)


# Parse the response body with bs4
def changesoup(html):
    html_doc = html.content.decode('utf-8')
    soup = bs4.BeautifulSoup(html_doc, "html.parser")
    return soup

# Collect the book information from the parsed page
def bookinfo(soup):
    # Book prices
    price = []
    soupprice = soup.select(".first-info .f_right .bold")
    for i in soupprice:
        price.append(i.string)

    # Store names; filter out tags whose .string is None first
    # (removing items from a list while iterating over it skips elements)
    storename = []
    soupstorename = [each for each in soup.select(".text a span") if each.string is not None]
    for i in soupstorename:
        storename.append(i.string)

    # Seller locations
    place = []
    soupplace = soup.select(".user-place")
    for i in soupplace:
        place.append(i.string)

    # Book titles; .get_text() flattens any markup nested inside the link
    bookname = []
    bookname1 = soup.select(
        ".search-wrap .search-main .search-main-result .result-content .result-list .item .item-info .title .link")
    for each in bookname1:
        bookname.append(each.get_text())

    return bookname, price, place, storename

# Save the cover images to a local folder and return their file paths
def imgsave(soup):
    dirName = "image"
    os.makedirs(dirName, exist_ok=True)
    filePathList = []
    imgUrl = soup.select(".search-main-result .result-content .result-list .item .item-img .img-box img")

    if not imgUrl:
        print("No images found under the current node")
    else:
        i = 0
        for imageUrls in imgUrl:
            # Pull the image address out of the src attribute
            # (note: placeholder covers may come back as /searchfront/img/error.jpg)
            downloadUrl = imageUrls.get('src')
            print("Image URL to download:", downloadUrl)
            # Keep only the last path component as the file name
            fileName = str(i) + "-" + downloadUrl.split("/")[-1]
            print("File name: " + fileName)
            # Build the destination path
            filePath = os.path.join(dirName, fileName)
            filePathList.append(filePath)
            if not os.path.exists(filePath):
                imageUrlPath = requests.get(downloadUrl)
                # Raise if the request failed
                imageUrlPath.raise_for_status()
                # Open in 'wb' (binary) mode, as images require
                with open(filePath, 'wb') as imageFile:
                    for image in imageUrlPath.iter_content(10000):
                        # Write each chunk of the image to the file
                        imageFile.write(image)
            i = i + 1
    return filePathList

# Fetch the page and render the results
def show(url):
    xz = kongfuzi.DownLoad()

    # Add any user-supplied proxy IP to ip_list before requesting
    add_ip = e2.get()
    if add_ip:
        xz.ip_list.append(add_ip)

    html = xz.get(url)
    soup = changesoup(html)
    bookname, price, place, storename = bookinfo(soup)
    filePathList = imgsave(soup)

    root1 = Toplevel()
    root1.geometry("1720x800")
    root1.title("Kongfz cover scraper")

    # Convert the saved images into a form tkinter can display
    photo = []
    for each in filePathList:
        temp = Image.open(each)
        photo.append(ImageTk.PhotoImage(temp))

    canvas = Canvas(root1, width=1700, height=800, scrollregion=(0, 0, 0, 4000))  # create the canvas
    canvas.place(x=10, y=10)  # position the canvas

    frame = Frame(canvas)  # put the frame inside the canvas
    frame.place(width=1680, height=800)

    # Lay the results out in a 5-column grid, capped at 50 items
    for i in range(min(len(photo), 50)):
        rownum = int(i / 5)
        columnnum = i % 5

        imgLabel1 = Label(frame, image=photo[i], width=280, height=280)
        imgLabel1.grid(row=rownum * 5, column=columnnum, padx=10, pady=5)

        infoLabel1 = Label(frame, text="Title: " + bookname[i], bg="#FFF8DC", justify=LEFT)
        infoLabel1.grid(row=rownum * 5 + 1, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel2 = Label(frame, text="Price: " + price[i] + " yuan", bg="#FFF8DC", justify=LEFT)
        infoLabel2.grid(row=rownum * 5 + 2, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel3 = Label(frame, text="Ships from: " + place[i], bg="#FFF8DC", justify=LEFT)
        infoLabel3.grid(row=rownum * 5 + 3, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel4 = Label(frame, text="Store: " + storename[i], bg="#FFF8DC", justify=LEFT)
        infoLabel4.grid(row=rownum * 5 + 4, column=columnnum, padx=45, pady=2, sticky=W)

    vbar = Scrollbar(canvas, orient=VERTICAL)  # vertical scrollbar
    vbar.place(x=1680, width=20, height=800)
    vbar.configure(command=canvas.yview)
    canvas.config(yscrollcommand=vbar.set)
    canvas.create_window((800, 2000), window=frame)

    mainloop()

if __name__ == '__main__':
    # Build the search window
    root = Tk()
    root.title("Kongfz cover scraper")
    e1 = Entry(root)
    e2 = Entry(root)
    e1.grid(row=0, column=0, padx=20, pady=20)
    e2.grid(row=0, column=2, padx=20, pady=20)
    Label(root, text="Keyword", width=10).grid(row=0, column=1, padx=10, pady=5)
    Label(root, text="Add proxy IP", width=10).grid(row=0, column=3, padx=10, pady=5)
    Button(root, text="Search", width=10, command=download).grid(row=1, column=1, padx=10, pady=5)
    mainloop()
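
To see why bookinfo() calls .get_text() for titles but reads .string elsewhere, here is a minimal, self-contained bs4 sketch; the HTML below is made up to mimic the page structure, not taken from the real site:


import bs4

html_doc = '''
<div class="item">
  <div class="item-info">
    <div class="title"><a class="link">Python <b>Cookbook</b></a></div>
  </div>
  <span class="user-place">Beijing</span>
</div>
'''
soup = bs4.BeautifulSoup(html_doc, "html.parser")

# .get_text() flattens nested tags, so the <b> inside the link is no problem
print(soup.select(".item .item-info .title .link")[0].get_text())  # Python Cookbook

# .string only yields text when the tag has a single text child; it is None
# otherwise, which is why bookinfo() filters those entries out for store names
print(soup.select(".user-place")[0].string)  # Beijing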

Source: https://blog.csdn.net/Bancroft_boy/article/details/80904322

Tags: python, scraping, books