A targeted Python crawler for campus forum post information

Author: lannooooooooooo  Date: 2022-02-18 19:46:29

Introduction

I wrote this little crawler mainly to scrape internship postings from the campus forum; it is built mostly on the Requests library.

Source code

URLs.py

Its main job is to take an initial URL (one that contains a page parameter) and build the list of URLs running from the current page number up to pageNum.


import re

def getURLs(url, attr, pageNum=1):
    """Build the list of page URLs from the current page number up to pageNum."""
    all_links = []
    try:
        # Pull the current page number out of the URL, e.g. "page=1" -> 1
        now_page_number = int(re.search(attr + r'=(\d+)', url).group(1))
        for i in range(now_page_number, pageNum + 1):
            # Substitute page number i into the URL
            new_url = re.sub(attr + r'=\d+', attr + '=%d' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be a string."

uni_2_native.py

The Chinese text in the pages fetched from the forum comes back as HTML numeric character references of the form &#XXXX;, so the scraped content has to be converted back to native characters after each fetch.


import sys
import re
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so the converted text can be printed/encoded as UTF-8

def get_native(raw):
    """Replace HTML numeric character references (&#XXXX;) with the characters they encode."""
    tostring = raw
    while True:
        obj = re.search('&#(.*?);', tostring, flags=re.S)
        if obj is None:
            break
        # group(0) is the whole reference, group(1) its decimal code point
        ref, code = obj.group(0), obj.group(1)
        tostring = tostring.replace(ref, unichr(int(code)))
    return tostring
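
A quick check of get_native (20320 and 22909 are the decimal code points of 你 and 好). Note that the standard library can do this conversion too: HTMLParser().unescape in Python 2, or html.unescape in Python 3, also handle numeric character references.


text = get_native(u'Hello, &#20320;&#22909;!')
print text  # -> Hello, 你好!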

Saving to a MySQL database: saveInfo.py (despite the name of the saveSqlite class, the code writes to MySQL via MySQLdb)


# -*- coding: utf-8 -*-

import MySQLdb

class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        # Refuse to queue a record that is missing any required field
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # Clear out the old records before inserting the fresh crawl
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()

        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)

        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
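
toMySQL assumes the info table already exists in db_name. The article never shows the schema, so the following one-off setup script is only a reconstruction from the INSERT statement above; the column names come from that statement, but the types are my assumption:


# Hypothetical schema, inferred from insert into info(title,author,url,date,reply,view)
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS info (
        title   VARCHAR(255),
        author  VARCHAR(64),
        url     VARCHAR(255),
        `date`  VARCHAR(32),
        reply   INT,
        `view`  INT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()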

Main crawler code


import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a request header that matches the target site; fill in the values
# (especially Cookie and User-Agent) from your own logged-in browser session
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}
url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."

urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    # Decode the &#XXXX; character references before parsing
    html = uni_2_native.get_native(r.text)

    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')

    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            # Not a topic row (e.g. a header or separator row); skip it
            continue
        # xpath() returns a plain list, so the single match is href[0]
        link = cc98 + href[0]

        # The a element's title attribute packs title, author and date, one per line
        title_author_time = each.xpath('./td[2]/a/@title')
        for info in title_author_time:
            info_split = info.split('\n')
            title = info_split[0][1:-1]   # drop the enclosing characters
            author = info_split[1][3:]    # drop the 3-character label prefix
            date = info_split[2][3:]

        # The fourth cell holds "replies/views"
        hot = each.xpath('./td[4]/text()')
        for hot_num in hot:
            reply_view = hot_num.strip().split('/')
            reply, view = reply_view[0], reply_view[1]
        savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"

Source: https://blog.csdn.net/qq_22187919/article/details/60466283

Tags: python, crawler, forum
