对Python3 解析html的几种操作方式小结

作者:极客点儿 时间:2021-04-07 20:13:12 

解析html是爬虫抓取之后处理数据的一个重要环节。以下记录解析html的几种方式。

先介绍基础的辅助函数,主要用于获取html并输出解析后的结果


# The parse function is passed in as an argument so the examples below can swap it.
def get_html(url, paraser=None):
    """Download ``url`` and return the result of running ``paraser`` on the body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the response body (gunzipped when the
            server sent it gzip-compressed) and returns the parsed value.
            When omitted, ``bs4_paraser`` is used; it is looked up at call
            time because it is defined further down in this file.

    Returns:
        Whatever ``paraser`` returns, or ``None`` when the response status is
        not 200.
    """
    import contextlib
    import gzip
    import io

    try:  # Python 3
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2
        from urllib2 import Request, urlopen

    if paraser is None:
        paraser = bs4_paraser

    # No explicit 'Host' header: urllib derives it from ``url``, so the
    # function also works for pages that are not on www.360kan.com.
    headers = {
        'Accept': '*/*',
        # Only advertise the encoding that is actually decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = Request(url, headers=headers)
    # closing() releases the connection even if reading fails.
    with contextlib.closing(urlopen(request)) as response:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''

    # Decompress only when the server says the body is gzip-compressed.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    return paraser(body)

# Demo: fetch one review page and print every parsed review.
# NOTE: this runs at module level, so ``get_html`` and ``lxml_parser`` must
# already be defined when these lines execute.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response; treat that as "no rows".
for row in value or []:
    print(row)

1,lxml.html的方式进行解析,

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官网](http://lxml.de/)


def lxml_parser(page):
    """Extract the review entries from ``page`` using lxml XPath queries.

    Args:
        page: HTML document handed to ``etree.HTML``.

    Returns:
        A list with one dict per ``div.item`` found under a
        ``div.yingping-list-wrap``. A complete entry has the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``. An item without a
        title block yields an empty dict, matching ``bs4_paraser``.

    Raises:
        IndexError, AttributeError: when a title block exists but one of the
            expected child nodes is missing or holds no digits.
    """
    data = []
    doc = etree.HTML(page)
    for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every div.item is one review.
        for item in wrap.xpath('.//div[@class="item"]'):
            value = {}
            # Title block of the review; guard against items that lack one.
            titles = item.xpath('.//div[@class="g-clear title-wrap"][1]')
            if titles:
                title = titles[0]
                value['title'] = title.xpath('./a/text()')[0]
                value['title_href'] = title.xpath('./a/@href')[0]
                # The score is derived from the first number in the style attribute.
                score_text = title.xpath('./div/span/span/@style')[0]
                score_text = re.search(r'\d+', score_text).group()
                # NOTE(review): floors on Python 2, yields a float on Python 3.
                value['score'] = int(score_text) / 20
                # Review time.
                value['time'] = title.xpath('./div/span[@class="time"]/text()')[0]
                # Number of people who liked the review.
                likes_text = title.xpath('./div[@class="num"]/span/text()')[0]
                value['people'] = int(re.search(r'\d+', likes_text).group())
            data.append(value)
    return data

2,使用BeautifulSoup,不多说了,大家网上找资料看看


def bs4_paraser(html):
    """Extract the review entries from ``html`` using BeautifulSoup.

    Args:
        html: HTML document handed to ``BeautifulSoup`` with the
            ``'html.parser'`` backend.

    Returns:
        A list with one dict per ``div.item`` inside the first
        ``div.yingping-list-wrap``. A complete entry has the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``; an item without a
        title block yields an empty dict.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Review section: only the first wrapper is inspected (limit=1).
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every div.item is one review.
        for r in row.find_all('div', attrs={'class': 'item'}):
            # A fresh dict per item instead of one shared dict reset by hand.
            value = {}
            # Title block of the review.
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if title:
                head = title[0]
                value['title'] = head.a.string
                value['title_href'] = head.a['href']
                # The score is derived from the first number in the style attribute.
                score_text = head.div.span.span['style']
                score_text = re.search(r'\d+', score_text).group()
                # NOTE(review): floors on Python 2, yields a float on Python 3.
                value['score'] = int(score_text) / 20
                # Review time.
                value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
                # Number of people who liked the review.
                likes_text = head.find_all('div', attrs={'class': 'num'})[0].span.string
                value['people'] = int(re.search(r'\d+', likes_text).group())
            all_value.append(value)
    return all_value

3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法,(哈哈)


class CommentParaser(SGMLParser):
    """SGML event parser that collects review entries into ``self.data``.

    Boolean flags record which of the recognised ``div`` classes are
    currently open. The entry being built is appended to ``self.data`` when a
    ``</div>`` arrives while an item is open and its title block is closed.
    """

    def __init__(self):
        # Explicit base call kept as in the original (no super()).
        SGMLParser.__init__(self)
        # Which recognised <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # <span> state: 0 = none, 1 = inside span.rating, 2 = inside
        # span.time, 3 = score span handled, 4 = span inside div.num.
        self.__span_state = 0
        # Entry being built and the list of finished entries.
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        """Set the flag that matches the ``class`` of an opening ``<div>``."""
        for k, v in attrs:
            if k == 'class' and v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif k == 'class' and v == 'item':
                self.__start_div_item = True
            elif k == 'class' and v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif k == 'class' and v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif k == 'class' and v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Clear the innermost open flag; finish the entry when an item closes."""
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    # Item open, title block closed: store the entry.
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def start_a(self, attrs):
        """Record the link target of an ``<a>`` inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    # NOTE(review): lxml_parser / bs4_paraser use the key
                    # 'title_href' for this value.
                    self.__value['href'] = v

    def end_a(self):
        """Leave the title link."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Advance the span state and read the score from the style attribute."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # Span nested in span.rating: its style carries the score.
                    score_text = None
                    for k, v in attrs:
                        if k == 'style':
                            match = re.search(r'\d+', v or '')
                            if match:
                                score_text = match.group()
                    # Skip the score when no usable style attribute exists
                    # instead of failing on an unbound name.
                    if score_text is not None:
                        # NOTE(review): floors on Python 2, float on Python 3.
                        self.__value['score'] = int(score_text) / 20
                    self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def end_span(self):
        """Reset the span state on every closing ``</span>``."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text may arrive in several chunks; ignore chunks without digits.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def sgl_parser(html):
    """Parse ``html`` with ``CommentParaser`` and return its collected entries.

    Args:
        html: HTML document to feed to the parser.

    Returns:
        The parser's ``data`` list.
    """
    parser = CommentParaser()
    parser.feed(html)
    # Flush anything the parser is still buffering.
    parser.close()
    return parser.data

4,HTMLParser,与3原理相似,就是调用的方法不太一样,基本上可以共用,


class CommentHTMLParser(HTMLParser.HTMLParser):
    """HTML event parser that collects review entries into ``self.data``.

    Boolean flags record which of the recognised ``div`` classes are
    currently open. The entry being built is appended to ``self.data`` when a
    ``</div>`` arrives while an item is open and its title block is closed.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Which recognised <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # <span> state: 0 = none, 1 = inside span.rating, 2 = inside
        # span.time, 3 = score span handled, 4 = span inside div.num.
        self.__span_state = 0
        # Entry being built and the list of finished entries.
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Update flags, span state and entry fields for an opening tag."""
        if tag == 'div':
            for k, v in attrs:
                if k == 'class' and v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif k == 'class' and v == 'item':
                    self.__start_div_item = True
                elif k == 'class' and v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif k == 'class' and v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif k == 'class' and v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        # NOTE(review): lxml_parser / bs4_paraser use the key
                        # 'title_href' for this value.
                        self.__value['href'] = v
        elif tag == 'span':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                if self.__start_div_ratingwrap:
                    if self.__span_state != 1:
                        for k, v in attrs:
                            if k == 'class' and v == 'rating':
                                self.__span_state = 1
                            elif k == 'class' and v == 'time':
                                self.__span_state = 2
                    else:
                        # Span nested in span.rating: its style carries the score.
                        score_text = None
                        for k, v in attrs:
                            if k == 'style':
                                # A valueless attribute is reported as None.
                                match = re.search(r'\d+', v or '')
                                if match:
                                    score_text = match.group()
                        # Skip the score when no usable style attribute exists
                        # instead of failing on an unbound name.
                        if score_text is not None:
                            # NOTE(review): floors on Python 2, float on Python 3.
                            self.__value['score'] = int(score_text) / 20
                        self.__span_state = 3
                elif self.__start_div_num:
                    self.__span_state = 4

    def handle_endtag(self, tag):
        """Clear flags for a closing tag; finish the entry when an item closes."""
        if tag == 'div':
            if self.__start_div_yingping:
                if self.__start_div_item:
                    if self.__start_div_gclear:
                        if self.__start_div_num or self.__start_div_ratingwrap:
                            if self.__start_div_num:
                                self.__start_div_num = False
                            if self.__start_div_ratingwrap:
                                self.__start_div_ratingwrap = False
                        else:
                            self.__start_div_gclear = False
                    else:
                        # Item open, title block closed: store the entry.
                        self.data.append(self.__value)
                        self.__value = {}
                        self.__start_div_item = False
                else:
                    self.__start_div_yingping = False
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text may arrive in several chunks; ignore chunks without digits.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def html_parser(html):
    """Parse ``html`` with ``CommentHTMLParser`` and return its collected entries.

    Args:
        html: HTML document to feed to the parser.

    Returns:
        The parser's ``data`` list.
    """
    parser = CommentHTMLParser()
    parser.feed(html)
    # Flush anything the parser is still buffering.
    parser.close()
    return parser.data

3,4对于该案例来说确实是不太适合,趁现在有空记录下来,供学习使用!

来源:https://blog.csdn.net/yilovexing/article/details/79675672

标签:Python3,解析,html
0
投稿

猜你喜欢

  • 利用JavaScript正则表达式模拟Google Talk的文本处理

    2007-12-04 18:43:00
  • python装饰器代码深入讲解

    2023-08-17 20:59:19
  • 删除数据库中重复数据的两个方法

    2008-01-01 19:16:00
  • 将图片读入到Dom中,并将其存为xml文件

    2008-09-04 11:24:00
  • php返回相对时间(如:20分钟前,3天前)的方法

    2023-10-26 11:20:38
  • Python pandas如何向excel添加数据

    2021-07-14 17:37:41
  • 仿迅雷焦点广告效果(JQuery版)

    2009-08-03 14:18:00
  • SQL Server 查询分析器快捷键集合

    2007-08-17 09:42:00
  • javascript 获取中文字符串长度

    2009-10-18 12:06:00
  • 如何用METADATA替换ADOVBS.INC?

    2010-06-12 12:54:00
  • Python使用pymongo库操作MongoDB数据库的方法实例

    2023-06-04 06:20:22
  • ubutu 16.04环境下,PHP与mysql数据库,网页登录验证实例讲解

    2023-11-22 08:18:27
  • 15个滑动门效果CSS导航菜单实例教程

    2010-02-20 13:02:00
  • 剖析SQL Server 事务日志的收缩和截断

    2009-01-15 13:04:00
  • Thinking XML: 创建 XML 的好建议

    2008-05-29 11:25:00
  • ASP充分利用Err.Description

    2009-06-24 11:12:00
  • Python实现读取文件最后n行的方法

    2023-08-02 10:33:32
  • JS实现动画中的布局转换

    2023-10-14 15:58:04
  • Javascript"篱式"条件判断(翻译)

    2008-08-01 12:21:00
  • python代码 FTP备份交换机配置脚本实例解析

    2023-05-08 22:47:25
  • asp之家 网络编程 m.aspxhome.com