python定向爬虫校园论坛帖子信息

作者：lannooooooooooo 时间：2022-02-18 19:46:29　

引言

写这个小爬虫主要是为了爬校园论坛上的实习信息，主要采用了Requests库

源码

URLs.py

主要功能是根据一个初始url（其中包含page页码参数），生成从该url中的当前页码到pageNum（含）为止的所有页面url列表

import re

def getURLs(url, attr, pageNum=1):
    """Build the page URLs from the page number found in ``url`` up to ``pageNum``.

    Args:
        url: Start URL containing a parameter of the form ``<attr>=<digits>``.
        attr: Name of the page-number parameter (a string), e.g. ``"page"``.
        pageNum: Last page number to generate, inclusive.

    Returns:
        A list of URLs in ascending page order, each one being ``url`` with
        the page number replaced. The list is empty when ``url`` contains no
        ``<attr>=<digits>`` part or when its page number is already greater
        than ``pageNum``. When a ``TypeError`` occurs (for example ``attr``
        is not a string) a message is printed and ``None`` is returned.
    """
    try:
        # Escape attr so regex metacharacters in the name are matched literally.
        pattern = re.compile(re.escape(attr) + r'=(\d+)')
        found = pattern.search(url)
        if found is None:
            return []
        first_page = int(found.group(1))
        # A function replacement keeps backslashes in attr from being read as
        # regex escapes; ``page=page`` binds the current value for each call.
        return [
            pattern.sub(lambda _m, page=page: '%s=%d' % (attr, page), url)
            for page in range(first_page, pageNum + 1)
        ]
    except TypeError:
        print("arguments TypeError:attr should be string.")
        return None

uni_2_native.py

由于从论坛爬取到的网页中，中文都是以 &#XXXX; 形式的数字字符引用（XXXX为字符的Unicode码点）表示的，所以在爬得网页内容后还需要把它们转换成实际的字符

import sys
import re
# Python 2-only idiom: reload sys so that setdefaultencoding can be called,
# then make UTF-8 the default encoding for implicit str/unicode conversions.
# Neither reload() as a builtin nor sys.setdefaultencoding exists in Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    """Replace numeric character references in ``raw`` with the real characters.

    Both decimal (``&#20013;``) and hexadecimal (``&#x4e2d;``) references are
    decoded. Anything that is not a valid reference, or whose code point is
    out of range, is left in the text unchanged.

    The text is decoded in a single pass, so a character produced by one
    reference is never re-read as the start of another reference.

    Args:
        raw: Text possibly containing ``&#NNNN;`` style references.

    Returns:
        The text with every valid reference replaced by its character.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def _decode(match):
        digits = match.group(1)
        try:
            if digits[0] in 'xX':
                return to_char(int(digits[1:], 16))
            return to_char(int(digits))
        except (ValueError, OverflowError):
            # Code point outside the supported range: keep the original text.
            return match.group(0)

    # A function replacement is used so decoded characters such as a
    # backslash are inserted literally rather than parsed as regex escapes.
    return re.sub(r'&#([xX][0-9a-fA-F]+|[0-9]+);', _decode, raw)

存入MySQL数据库：saveInfo.py

# -*- coding: utf-8 -*-

import MySQLdb

class saveSqlite(object):
    """Collect post records in memory and write them to a MySQL table.

    Despite its name, this class talks to MySQL through ``MySQLdb``; the
    records go into a table named ``info``.

    Args:
        host, user, passwd, port, db, charset: Connection settings passed to
            ``MySQLdb.connect`` by ``toMySQL``.
    """

    def __init__(self, host='localhost', user='root', passwd='', port=3306,
                 db='db_name', charset='utf8'):
        # Records queued by saveSingle(); each one is a dict with the keys
        # author, title, date, url, reply and view.
        self.infoList = []
        self._connect_args = dict(host=host, user=user, passwd=passwd,
                                  port=port, db=db, charset=charset)

    def saveSingle(self, author=None, title=None, date=None, url=None,reply=0, view=0):
        """Queue one record; nothing is queued if a required field is None.

        ``author``, ``title``, ``date`` and ``url`` are required. When any of
        them is ``None`` a message is printed and the record is dropped.
        """
        if any(field is None for field in (author, title, date, url)):
            print("No info saved!")
            return
        self.infoList.append({
            'author': author,
            'title': title,
            'date': date,
            'url': url,
            'reply': reply,
            'view': view,
        })

    def toMySQL(self):
        """Replace the contents of the ``info`` table with the queued records.

        The delete and the inserts are committed together, so a failed insert
        rolls back and leaves the previous table contents in place. The
        cursor and the connection are closed even when an error occurs; the
        error itself is re-raised.
        """
        rows = [
            (each['title'], each['author'], each['url'],
             each['date'], each['reply'], each['view'])
            for each in self.infoList
        ]
        conn = MySQLdb.connect(**self._connect_args)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("delete from info")
                if rows:
                    cursor.executemany(
                        "insert into info(title,author,url,date,reply,view) "
                        "values (%s,%s,%s,%s,%s,%s)",
                        rows,
                    )
                conn.commit()
            except Exception:
                conn.rollback()
                raise
            finally:
                cursor.close()
        finally:
            conn.close()

    def show(self):
        """Print every queued record, one field per line."""
        for each in self.infoList:
            print("author: " + each['author'])
            print("title: " + each['title'])
            print("date: " + each['date'])
            print("url: " + each['url'])
            print("reply: " + str(each['reply']))
            print("view: " + str(each['view']))
            print('\n')

if __name__ == '__main__':
    # Manual smoke test: queue one sample record and write it to the database.
    demo = saveSqlite()
    demo.saveSingle(
        author='网',
        title='aaa',
        date='2008-10-10 10:10:10',
        url='www.baidu.com',
        reply=1,
        view=1,
    )
    # demo.show()
    demo.toMySQL()

主要爬虫代码

import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Request headers for the site being crawled. Fill in the values copied from
# a real browser session; entries left empty are not sent with the request.
headers ={
'Accept': '',
'Accept-Encoding': '',
'Accept-Language': '',
'Connection': '',
'Cookie': '',
'Host': '',
'Referer': '',
'Upgrade-Insecure-Requests': '',
'User-Agent': ''
}
url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

# Seconds to wait for the server before giving up on one page.
REQUEST_TIMEOUT = 30


def _parse_title_attr(title_attr):
    """Split a link's title attribute into ``(title, author, date)``.

    The attribute is expected to hold at least three newline-separated
    lines. The first line loses its first and last character; the second and
    third lines lose their first three characters (presumably a label prefix
    -- verify against the page markup). Returns ``None`` when there are
    fewer than three lines.
    """
    lines = title_attr.split('\n')
    if len(lines) < 3:
        return None
    return lines[0][1:-1], lines[1][3:], lines[2][3:]


def _parse_row(row):
    """Return the records (keyword dicts for ``saveSingle``) found in one row.

    A row without a link, without a usable title attribute, or without a
    ``reply/view`` cell yields an empty list, so values from a previous row
    are never reused.
    """
    hrefs = row.xpath('./td[2]/a/@href')
    titles = row.xpath('./td[2]/a/@title')
    if not hrefs or not titles:
        return []
    # A row is expected to carry one link; if there are several, the last wins.
    parsed = _parse_title_attr(titles[-1])
    if parsed is None:
        return []
    title, author, date = parsed
    link = cc98 + hrefs[-1]

    records = []
    for text in row.xpath('./td[4]/text()'):
        parts = text.strip().split('/')
        if len(parts) < 2:
            # Not a "reply/view" pair (e.g. a whitespace-only text node).
            continue
        records.append(dict(author=author, title=title, date=date, url=link,
                            reply=parts[0], view=parts[1]))
    return records


print("get infomation from cc98...")

# getURLs returns None when its arguments are invalid; treat that as no pages.
urls = URLs.getURLs(url, "page", 50) or []
savetools = saveInfo.saveSqlite()
sent_headers = dict((key, value) for key, value in headers.items() if value)

for page_url in urls:
    try:
        r = requests.get(page_url, headers=sent_headers, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page should not abort the whole crawl.
        print("skip %s: %s" % (page_url, err))
        continue
    html = uni_2_native.get_native(r.text)

    selector = etree.HTML(html)
    # NOTE(review): this XPath needs a literal <tbody> in the served HTML;
    # confirm against the page source if no rows are found.
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')

    for each in content_tr_list:
        for record in _parse_row(each):
            savetools.saveSingle(**record)

print("All got! Now saving to Database...")
# savetools.show()
savetools.toMySQL()
print("ALL CLEAR! Have Fun!")

来源：https://blog.csdn.net/qq_22187919/article/details/60466283

标签：python,爬虫,论坛

投稿

python定向爬虫校园论坛帖子信息

猜你喜欢

pytorch 输出中间层特征的实例

win7 x64系统中安装Scrapy的方法

Windows中安装使用Virtualenv来创建独立Python环境

HTML标签tbody的用法

设计手机端应用时的一些建议

J2EE基础应用：J2EE中SQL语句自动构造方法

html风格tooltip效果的实现

利用Python自动化操作AutoCAD的实现

python轻松查到删除自己的微信好友

Go语言中的Array、Slice、Map和Set使用详解

python math模块的基本使用教程

Python实现判断一个字符串是否包含子串的方法总结

ASP脚本变量、函数、过程和条件语句

Python3.5以上版本lxml导入etree报错的解决方案

FrontPage服务器扩展

CSS Hack经验总结

python数据结构leetcode338比特位计数算法

python利用wx实现界面按钮和按钮监听和字体改变的方法

使用PHP Socket 编程模拟Http post和get请求

Bootstrap select多选下拉框实现代码