Python实现博客文章爬虫示例

时间:2022-06-30 08:20:40 


#!/usr/bin/python
#-*-coding:utf-8-*-
# JCrawler
# Author: Jam <810441377@qq.com>

import time
import urllib2
from bs4 import BeautifulSoup

# Target site
TargetHost = "http://adirectory.blog.com"
# User-Agent header value
UserAgent = ('Mozilla/5.0 (X11; Linux x86_64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) '
             'Chrome/33.0.1750.117 Safari/537.36')
# Link extraction rules: each list is a chain of steps applied in order.
# Each step names a search mode ('find' / 'findAll'), a tag, and an
# attribute filter.
# Rule chain for category links
CategoryFind = [
    dict(findMode='find', findTag='div', rule={'id': 'cat-nav'}),
    dict(findMode='findAll', findTag='a', rule={}),
]
# Rule chain for article links
ArticleListFind = [
    dict(findMode='find', findTag='div', rule={'id': 'content'}),
    dict(findMode='findAll', findTag='h2', rule={'class': 'title'}),
    dict(findMode='findAll', findTag='a', rule={}),
]
# Pagination URL pattern; the "#page" placeholder stands for the page number
PageUrl = 'page/#page/'
# Page numbering: first page and step
PageStart = 1
PageStep = 1
# Text whose presence in a page marks the end of pagination
PageStopHtml = '404: Page Not Found'

def GetHtmlText(url):
    """Fetch *url* over HTTP and return the raw response body.

    The request carries a browser-like ``Accept`` header and the module-level
    ``UserAgent`` string.

    Args:
        url: Absolute URL to download.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        urllib2.HTTPError: propagated unchanged from ``urlopen`` so callers
            can inspect ``e.code``.
    """
    request = urllib2.Request(url)
    request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp")
    # urllib2 does not decompress gzip/deflate bodies, so explicitly ask the
    # server for an unencoded response instead of advertising "*".
    request.add_header('Accept-Encoding', "identity")
    request.add_header('User-Agent', UserAgent)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the underlying socket even if read() fails.
        response.close()

def ArrToStr(varArr):
    """Concatenate the ``str()`` form of every item in *varArr*.

    Args:
        varArr: Any iterable, or ``None``.

    Returns:
        One string made of ``str(item)`` for each item, in iteration order.
        ``None`` and empty iterables both give ``""``.
    """
    if varArr is None:
        # A failed lookup hands us None; treat it as "nothing to serialize".
        return ""
    return "".join(str(item) for item in varArr)


def GetHtmlFind(htmltext, findRule):
    """Apply a chain of find / findAll steps to *htmltext*.

    Each step in *findRule* is a dict with the keys ``findMode`` (``'find'``
    or ``'findAll'``), ``findTag`` and ``rule`` (attribute filter). After a
    step, its result is serialized with ``ArrToStr`` and, when that text is
    non-empty, re-parsed so that the next step searches only that markup.

    Args:
        htmltext: HTML markup to search.
        findRule: Ordered list of step dicts as described above.

    Returns:
        The result of the last step. If any step matches nothing, an empty
        list-like result is returned immediately, so callers can always
        iterate over the return value.
    """
    # NOTE(review): no parser is named, so bs4 picks one from what is
    # installed -- consider pinning it.
    findReturn = BeautifulSoup(htmltext)
    returnText = ""
    for f in findRule:
        if returnText != "":
            findReturn = BeautifulSoup(returnText)
        if f['findMode'] == 'find':
            findReturn = findReturn.find(f['findTag'], f['rule'])
            if findReturn is None:
                # Nothing matched: later steps would fail on None.
                return []
        elif f['findMode'] == 'findAll':
            findReturn = findReturn.findAll(f['findTag'], f['rule'])
            if not findReturn:
                # Empty result set: it has no find/findAll for later steps.
                return findReturn
        returnText = ArrToStr(findReturn)
    return findReturn

def GetCategory():
    """Collect the category links found on the ``TargetHost`` front page.

    Downloads the page, runs the ``CategoryFind`` rule chain over it and logs
    every accepted link.

    Returns:
        A list of ``{'name': <link text>, 'url': <href>}`` dicts, one per
        matched anchor that has both a plain-text label and an ``href``.
    """
    categorys = []
    htmltext = GetHtmlText(TargetHost)

    for tag in GetHtmlFind(htmltext, CategoryFind):
        name = tag.string
        url = tag.get('href')
        # .string is None for anchors without a single text child, and an
        # anchor may lack href; skip those instead of failing on them (same
        # guard GetArticleList applies to its links).
        if name is None or url is None:
            continue
        print("[G]->Category:" + name + "|Url:" + url)
        categorys.append({'name': name, 'url': url})
    return categorys

def GetArticleList(categoryUrl):
    """Walk the paginated listing of one category and collect article links.

    Pages are requested as ``categoryUrl`` + ``PageUrl`` with ``#page``
    replaced by the page number, starting at ``PageStart`` and advancing by
    ``PageStep``. Paging ends when a page answers 404 or contains
    ``PageStopHtml``, or when a page cannot be fetched.

    Args:
        categoryUrl: Base URL of the category; the page path is appended
            to it as-is.

    Returns:
        A list of ``{'name': <link text>, 'url': <href>}`` dicts for every
        anchor with plain text whose href contains ``TargetHost``.
    """
    articles = []
    page = PageStart
    while True:
        pageUrl = PageUrl.replace("#page", str(page))
        print("[G]->PageUrl:" + categoryUrl + pageUrl)

        htmltext = ""
        while True:
            try:
                htmltext = GetHtmlText(categoryUrl + pageUrl)
                break
            except urllib2.HTTPError as e:
                print("[E]->HTTP Error:" + str(e.code))
                if e.code == 404:
                    # A missing page is the normal end-of-listing signal.
                    htmltext = PageStopHtml
                    break
                if e.code == 504:
                    # Gateway time-out: wait and retry the same page
                    # (retries are unbounded).
                    print("[E]->HTTP Error 504: Gateway Time-out, Wait")
                    time.sleep(5)
                else:
                    # Any other HTTP error: give up on this page.
                    break

        if not htmltext:
            # Nothing was downloaded, so there is nothing to parse and no way
            # to tell whether further pages exist; stop rather than loop.
            print("[E]->Fetch failed, stop paging.")
            break
        if PageStopHtml in htmltext:
            print("End Page.")
            break

        for tag in GetHtmlFind(htmltext, ArticleListFind):
            title = tag.string
            href = tag.get('href')
            # Keep only on-site links that have a plain-text title.
            if title is not None and href is not None and TargetHost in href:
                print("[G]->Article:" + title + "|Url:" + href)
                articles.append({'name': title, 'url': href})

        page += PageStep

    return articles

# Driver: fetch the category list, then crawl every category's article pages.
print("[G]->GetCategory")
Mycategorys = GetCategory()
print("[G]->GetCategory->Success.")
# Short pause between the category request and the article crawl.
time.sleep(3)
for category in Mycategorys:
    print("[G]->GetArticleList:" + category['name'])
    GetArticleList(category['url'])

标签:python,爬虫
0
投稿

猜你喜欢

  • 解决Tensorflow使用pip安装后没有model目录的问题

    2023-08-09 22:58:05
  • Python实现1-9数组形成的结果为100的所有运算式的示例

    2023-04-09 10:52:37
  • TinkerPop框架查询Gremlin图实现过程详解

    2024-01-29 11:26:45
  • Python实现炸金花游戏的示例代码

    2022-01-15 05:24:17
  • 实现文字放大效果Javascript源码

    2010-03-17 20:46:00
  • golang中的时间格式化

    2024-04-25 15:29:58
  • Mac上安装Mysql的详细步骤及配置

    2024-01-29 12:20:46
  • Python如何获取多线程返回结果

    2024-01-01 23:34:28
  • Python实现利用最大公约数求三个正整数的最小公倍数示例

    2022-12-30 09:04:16
  • Python向日志输出中添加上下文信息

    2021-01-09 15:04:42
  • Pytorch随机数生成常用的4种方法汇总

    2022-02-07 09:25:34
  • python查找指定文件夹下所有文件并按修改时间倒序排列的方法

    2023-08-08 01:11:57
  • python中的闭包用法实例详解

    2022-01-06 00:47:07
  • Python基于有道实现英汉字典功能

    2021-05-23 19:35:57
  • Ajax改造:使用Ajax和jQuery改进现有站点

    2010-04-02 12:50:00
  • vue项目打包之后接口出现错误的问题及解决

    2024-05-09 15:11:25
  • asp 类型转换函数大全第1/2页

    2011-04-07 11:06:00
  • python切片操作方法的实例总结

    2021-02-28 12:51:01
  • 为SQL Server数据库传数组参数的变通办法

    2009-10-23 09:26:00
  • Python 打印不带括号的元组的实现

    2022-08-12 15:47:45
  • asp之家 网络编程 m.aspxhome.com