Scrapy Project in Practice: Scraping User Details from a Community Site

Author: hankleo  Posted: 2022-04-16 11:36:31

This article walks through a hands-on Scrapy project that scrapes user details from a community site (SegmentFault) and is shared here for reference. The project consists of the following files:

get_cookies.py


from selenium import webdriver
from pymongo import MongoClient
# from segmentfault import settings
import time
import settings


class GetCookies(object):
    def __init__(self):
        # Configure webdriver options
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # Load the account list
        self.user_list = settings.USER_LIST
        # Set up the MongoDB connection
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """
        Log in with the given account and return the session cookies.
        :param username:
        :param password:
        :return: cookies
        """
        # Create the driver with the configured options
        driver = webdriver.Chrome(executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver", options=self.opt)
        driver.get("https://segmentfault.com/user/login")
        driver.find_element_by_name("username").send_keys(username)
        driver.find_element_by_name("password").send_keys(password)
        driver.find_element_by_xpath("//button[@type='submit']").click()
        time.sleep(2)
        driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
        # Grab the cookies from the logged-in session
        cookies = driver.get_cookies()
        driver.quit()

        return cookies

    def format_cookies(self, cookies):
        """
        driver.get_cookies() returns a list of dicts such as:
        [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
          'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
         {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
          'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/', 'secure': False,
          'value': '1550066940'},
         {'domain': '.segmentfault.com', 'httpOnly': False,
          'name': 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26',
          'path': '/', 'secure': False, 'value': '1550066940'},
         {'domain': '.segmentfault.com', 'expiry': 1550067000, 'httpOnly': False,
          'name': '_gat', 'path': '/', 'secure': False, 'value': '1'},
         {'domain': '.segmentfault.com', 'expiry': 1550153340, 'httpOnly': False,
          'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.783265084.1550066940'},
         {'domain': '.segmentfault.com', 'expiry': 1613138940, 'httpOnly': False, 'name': '_ga',
          'path': '/', 'secure': False, 'value': 'GA1.2.1119166665.1550066940'}]
        Only the name and value of each entry are needed.
        :param cookies:
        :return: dict mapping cookie names to values
        """
        c = dict()
        for item in cookies:
            c[item['name']] = item['value']

        return c

    def save(self):
        print("Fetching cookies....")
        # Log in with each account in the user list and collect its cookies
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Insert the formatted cookies into MongoDB
            self.collection.insert_one(f_cookies)

        # s = self.db["cookies"].find()
        # for i in s:
        #     print(i)


if __name__ == '__main__':

    cookies = GetCookies()
    for i in range(20):
        cookies.save()
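The commented-out lines at the end of save() hint at checking what was written. Below is a minimal standalone sketch of that check (a hypothetical helper, not part of the original project), assuming it is run from the project directory so that import settings resolves to the same settings.py:

# check_cookies.py -- hypothetical helper for inspecting the stored cookies
from pymongo import MongoClient

import settings

client = MongoClient(settings.MONGO_URI)
db = client[settings.MONGO_DB]

# Print every cookie document that GetCookies.save() inserted
for doc in db["cookies"].find():
    print(doc)

client.close()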

items.py


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class SegmentfaultItem(scrapy.Item):
    # define the fields for your item here like:
    # Profile attributes
    # Name
    name = scrapy.Field()
    # Reputation
    rank = scrapy.Field()
    # School
    school = scrapy.Field()
    # Major
    majors = scrapy.Field()
    # Company
    company = scrapy.Field()
    # Job title
    job = scrapy.Field()
    # Blog
    blog = scrapy.Field()
    # Social activity data
    # Number of users followed
    following = scrapy.Field()
    # Number of followers
    fans = scrapy.Field()
    # Number of answers
    answers = scrapy.Field()
    # Number of questions
    questions = scrapy.Field()
    # Number of articles
    articles = scrapy.Field()
    # Number of lives (talks)
    lives = scrapy.Field()
    # Number of badges
    badges = scrapy.Field()
    # Skill attributes
    # Number of likes received
    like = scrapy.Field()
    # Skill tags
    skills = scrapy.Field()
    # Registration date
    register_date = scrapy.Field()
    # Q&A statistics
    # Highest score among the user's answers
    answers_top_score = scrapy.Field()
    # Title of the question that the top-voted answer belongs to
    answers_top_title = scrapy.Field()
    # Tags of the question that the top-voted answer belongs to
    answers_top_tags = scrapy.Field()
    # Content of the question that the top-voted answer belongs to
    answers_top_question = scrapy.Field()
    # Content of the top-voted answer
    answers_top_content = scrapy.Field()
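Each field here is just a scrapy.Field() marker; the item itself supports dict-style access, which is what lets the pipeline shown next call dict(item) before inserting it into MongoDB. A tiny illustrative sketch (the values are hypothetical, and it assumes SegmentfaultItem is importable):

item = SegmentfaultItem()
item['name'] = 'hankleo'   # hypothetical value
item['fans'] = '10'        # hypothetical value
print(dict(item))          # -> {'name': 'hankleo', 'fans': '10'}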

pipelines.py


# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class SegmentfaultPipeline(object):
    # Name of the MongoDB collection used for user data
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the MongoDB connection info defined in settings.py via the crawler
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB', 'segmentfault')
        )

    # Connect to MongoDB when the spider starts
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Close the MongoDB connection when the spider stops
    def close_spider(self, spider):
        self.client.close()

    # Insert each item into the database
    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item

settings.py


# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#  https://doc.scrapy.org/en/latest/topics/settings.html
#  https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#  https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

RETRY_ENABLED = False

REDIRECT_ENABLED = False

DOWNLOAD_TIMEOUT = 5

# HTTPALLOW

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
    'segmentfault.middlewares.SegmentfaultUserAgentMiddleware': 643,
    'segmentfault.middlewares.SegmentfaultCookiesMiddleware': 743,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB configuration
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Account list used for logging in and harvesting cookies: (username, password)
USER_LIST = [
    ("798549150@qq.com", "guoqing1010"),
    ("learnscrapy@163.com", "guoqing1010"),
]

# Proxy list
PROXY_LIST = [
'http://115.182.212.169:8080',
'http://121.61.25.149:9999',
'http://180.118.247.189:9000',
'http://115.151.3.12:9999',
'http://183.154.213.160:9000',
'http://113.128.9.106:9999',
'http://124.42.68.152:90',
'http://49.70.48.50:9999',
'http://113.128.11.172:9999',
'http://111.177.177.40:9999',
'http://59.62.83.253:9999',
'http://39.107.84.185:8123',
'http://124.94.195.107:9999',
'http://111.177.160.132:9999',
'http://120.25.203.182:7777'
]

USER_AGENT_LIST = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Opera/8.0 (Windows NT 5.1; U; en)',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]
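PROXY_LIST and USER_AGENT_LIST are consumed by the custom downloader middlewares shown further down, which read them through scrapy.conf.settings; that module is deprecated and removed in newer Scrapy releases. A minimal sketch of the equivalent from_crawler pattern (an illustrative variant with a hypothetical class name, not the project's original code):

import random

class RandomUserAgentMiddleware(object):
    def __init__(self, useragent_list):
        self.useragent_list = useragent_list

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.useragent_list)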

userinfo.py


# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem

class UserinfoSpider(CrawlSpider):
    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    rules = (
        # User profile pages: follow and parse them
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # The user's "following" list: follow it to collect more profile URLs
        # Rule(LinkExtractor(allow=r'/users/followed$'), follow=True),
        # The user's follower list: follow it to collect more profile URLs
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
        # Follow other paginated list pages
        # Rule(LinkExtractor(allow=r'/users/[followed|following]?page=\d+'), follow=True),
    )

    def start_requests(self):
        # Fetch one cookie document from MongoDB and attach it to the initial request
        client = MongoClient(self.crawler.settings['MONGO_URI'])
        db = client[self.crawler.settings['MONGO_DB']]
        cookies_collection = db.cookies
        # Take a single cookie document
        cookies = cookies_collection.find_one()
        # 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26' holds a 10-digit Unix timestamp, so refresh it with the current time
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))

        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    # After logging in, start crawling from start_urls
    def after_login(self, response):
        for url in self.start_urls:
            return self.make_requests_from_url(url)
    # def after_login(self, response):
    #     yield Request(self.start_urls[0],
    #                   meta={'cookiejar': response.meta['cookiejar']},
    #                   callback=self.parse_item)

    def parse_item(self, response):
        """
        Parse a user profile page.
        :param response:
        :return:
        """
        item = SegmentfaultItem()
        # Profile section
        profile_head = response.css('.profile__heading')
        # Name
        item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
        # Reputation
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School and major
        school_info = profile_head.css('.profile__school::text').extract()
        if school_info:
            # School
            item['school'] = school_info[0]
            # Major
            item['majors'] = school_info[1].strip()
        else:
            item['school'] = ''
            item['majors'] = ''
        # Company and job title
        company_info = profile_head.css('.profile__company::text').extract()
        if company_info:
            # Company
            item['company'] = company_info[0]
            # Job title
            item['job'] = company_info[1].strip()
        else:
            item['company'] = ''
            item['job'] = ''
        # Personal blog
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # Activity statistics panel
        profile_active = response.xpath("//div[@class='col-md-2']")
        # Number of users followed
        item['following'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[0]
        # Number of followers
        item['fans'] = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')[1]
        # Number of answers
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        # Number of questions
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        # Number of articles
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        # Number of lives (talks)
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        # Number of badges
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        # URL of the badge detail page
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # Skills panel
        profile_skill = response.xpath("//div[@class='col-md-3']")
        # Skill tag list
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        # Number of likes received
        item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
        # Registration date
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()
        # if register_time:
        #     item['register_date'] = ''.join(re.findall(r'\d+', register_time))
        # else:
        #     item['register_date'] = ''

        # Output panel (answers, articles, etc.)
        profile_work = response.xpath("//div[@class='col-md-7']")
        # Highest score among the user's answers
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        # Title of the question that the top-voted answer belongs to
        item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
        # URL of the question that the top-voted answer belongs to
        answer_url = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        # Pass the item, together with the URLs that still need to be crawled, to the next callback
        request = scrapy.Request(
            # Question detail page URL
            url=response.urljoin(answer_url),
            meta={
                # The partially filled item
                'item': item,
                # Badge detail page URL
                'badge_url': response.urljoin(badge_url)},
            # Continue processing in parse_answer
            callback=self.parse_answer)
        yield request

    def parse_answer(self, response):
        # Retrieve the item passed via meta
        item = response.meta['item']
        # Retrieve the badge detail page URL passed via meta
        badge_url = response.meta['badge_url']
        # Question tag list
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Collect the strings that make up the question body
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        # Join them and store the result in the item
        item['answers_top_question'] = ''.join(question_content)
        # Collect the strings that make up the answer body
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        # Join them and store the result in the item
        item['answers_top_content'] = ''.join(answer_content)

        # After the question page, crawl the badge page and keep passing the updated item along
        request = scrapy.Request(url=badge_url,
                                 meta={'item': item},
                                 callback=self.parse_badge)
        yield request

    def parse_badge(self, response):
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        name_count = {}
        for i in range(len(badge_count)):
            name_count[badge_name[i]] = badge_count[i]
        item['badges'] = name_count
        yield item
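parse_item passes the relative answer and badge links through response.urljoin before requesting them. A tiny self-contained sketch of what that resolution does (the paths are purely illustrative):

# Illustrative only: how response.urljoin resolves relative links found on a profile page
from scrapy.http import HtmlResponse

resp = HtmlResponse(url='https://segmentfault.com/u/someuser', body=b'<html></html>', encoding='utf-8')
print(resp.urljoin('/q/1010000000123456'))   # -> https://segmentfault.com/q/1010000000123456
print(resp.urljoin('/u/someuser/badges'))    # -> https://segmentfault.com/u/someuser/badges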

middlewares.py


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)

class SegmentfaultSpiderMiddleware(object):
    """
    Normalizes the three registration-date formats that can appear in an item:
    1. 注册于 2015年12月12日  (registered on 2015-12-12)
    2. 注册于 3 天前          (registered 3 days ago)
    3. 注册于 5 小时前         (registered 5 hours ago)
    """

    def process_spider_output(self, response, result, spider):
        """
        Called on the spider's output; rewrites register_date in each item.
        :param response:
        :param result: iterable containing the items
        :param spider:
        :return: items with a normalized register_date
        """
        for item in result:
            # Only process scrapy.Item objects (requests pass through untouched)
            if isinstance(item, scrapy.Item):
                # Current time
                now = datetime.datetime.now()
                register_date = item['register_date']
                logger.info("Raw register_date: {}".format(register_date))
                # Extract the digits, e.g. '注册于2015年12月12日' => '20151212'
                day = ''.join(re.findall(r'\d+', register_date))
                # More than 4 digits means the '注册于2015年12月12日' form
                if len(day) > 4:
                    date = day
                # If '时' appears in the string, it is the '注册于8小时前' form
                elif '时' in register_date:
                    d = now - datetime.timedelta(hours=int(day))
                    date = d.strftime("%Y%m%d")
                # Otherwise it is the '注册于3天前' form
                else:
                    d = now - datetime.timedelta(days=int(day))
                    date = d.strftime("%Y%m%d")

                # Store the normalized value
                item['register_date'] = date
            yield item


class SegmentfaultHttpProxyMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        proxy = random.choice(self.proxy_list)
        logger.info('Using proxy: {}'.format(proxy))
        request.meta['proxy'] = proxy


class SegmentfaultUserAgentMiddleware(object):
    def __init__(self):
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        user_agent = random.choice(self.useragent_list)
        # logger.info('Using User-Agent: {}'.format(user_agent))
        request.headers['User-Agent'] = user_agent


class SegmentfaultCookiesMiddleware(object):
    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    def get_cookies(self):
        """
        Pick a random cookie document from MongoDB.
        :return:
        """
        cookies = random.choice([cookie for cookie in self.collection.find()])
        # Drop the unneeded "_id" and "_gat" entries
        cookies.pop('_id')
        cookies.pop('_gat')
        # Refresh "Hm_lpvt_e23800c454aa573c0ccb16b52665ac26" with the current timestamp
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """
        Delete cookies that are no longer valid.
        :param cookies:
        :return:
        """
        # Pop a random key/value pair from the cookies; the result is a tuple
        i = cookies.popitem()
        # Delete the matching document
        try:
            logger.info("Removing cookies {}".format(cookies))
            self.collection.remove({i[0]: i[1]})
        except Exception as e:
            logger.info("No such cookies: {}".format(cookies))

    def process_request(self, request, spider):
        """
        Attach a cookie to every request.
        :param request:
        :param spider:
        :return:
        """
        cookies = self.get_cookies()
        request.cookies = cookies

    def process_response(self, request, response, spider):
        """
        If the session has expired, the site redirects to the login page; in that case
        attach a fresh cookie and return the request so it goes back to the scheduler.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        if response.status in [301, 302]:
            logger.info("Redirect response: {}".format(response))
            redirect_url = response.headers['location']
            if b'/user/login' in redirect_url:
                logger.info("Cookies expired")
                # Get a new cookie, attach it to the request, stop further middleware
                # processing of this request, and send it back to the scheduler
                new_cookie = self.get_cookies()
                logger.info("New cookie: {}".format(new_cookie))
                # Remove the stale cookies from MongoDB
                self.remove_cookies(request.cookies)
                request.cookies = new_cookie
            return request

        return response
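The date handling in SegmentfaultSpiderMiddleware.process_spider_output is easiest to see in isolation. Below is a small standalone sketch of the same conversion (normalize_register_date is a hypothetical helper name, and the outputs noted in the comments depend on when it is run):

import re
import datetime

def normalize_register_date(register_date, now=None):
    """Standalone version of the conversion used in process_spider_output."""
    now = now or datetime.datetime.now()
    day = ''.join(re.findall(r'\d+', register_date))
    if len(day) > 4:                # '注册于 2015年12月12日' -> '20151212'
        return day
    elif '时' in register_date:     # '注册于 5 小时前' -> usually today's date
        return (now - datetime.timedelta(hours=int(day))).strftime("%Y%m%d")
    else:                           # '注册于 3 天前' -> the date three days ago
        return (now - datetime.timedelta(days=int(day))).strftime("%Y%m%d")

print(normalize_register_date("注册于 2015年12月12日"))  # 20151212
print(normalize_register_date("注册于 3 天前"))           # e.g. 20190210 when run on 2019-02-13
print(normalize_register_date("注册于 5 小时前"))          # today's date in most cases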

run.py


from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Refresh the cookie pool, then launch the spider
    cookies = GetCookies()
    cookies.save()
    name = 'userinfo'
    cmd = 'scrapy crawl {}'.format(name)
    cmdline.execute(cmd.split())
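run.py refreshes the cookies and then hands control to Scrapy's cmdline, which replaces the current process. An alternative that stays inside Python is Scrapy's CrawlerProcess API; a minimal sketch (an illustrative variant, not the author's script):

# Hypothetical alternative to run.py using CrawlerProcess
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from get_cookies import GetCookies

if __name__ == '__main__':
    GetCookies().save()                       # refresh cookies first, as run.py does
    process = CrawlerProcess(get_project_settings())
    process.crawl('userinfo')                 # spider name defined in userinfo.py
    process.start()                           # blocks until the crawl finishes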

Source: https://www.cnblogs.com/hankleo/p/12994207.html
