python实现的一个火车票转让信息采集器
作者:junjie 时间:2023-09-05 11:42:53
好吧,我承认:晚上看到一张合适的转让票,打电话过去却被告知已经被别人买走了,这件事让我很蛋疼。所以写了这个采集器,直接上代码吧。
#coding: utf-8
'''
春运查询火车票转让信息
Author: piglei2007@gmail.com
Date: 2011.01.25
'''
import re
import os
import time
import urlparse
import datetime
import traceback
import urllib2
import socket
# Apply a 20-second default timeout to every socket created after this point,
# so a stalled HTTP or SMTP connection cannot hang the script forever.
socket.setdefaulttimeout(20)
# Matches any run of whitespace; used to strip whitespace out of link text.
BLANK_RE = re.compile(r"\s+")
# Build a cookie-aware opener that sends browser-like request headers and
# install it globally, so plain urllib2.urlopen() calls go through it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [
("User-agent", "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5"),
("Accept", "*/*"),
]
urllib2.install_opener(opener)
from BeautifulSoup import BeautifulSoup
# Listing-page URL templates keyed by source name. Each template is filled
# with a dict providing "train" and "date" (see main); the key is passed to
# parse_content() to select the matching parser.
SOURCE = {
"58": "http://bj.58.com/huochepiao/?Num=%(train)s&StartTime=%(date)s00",
"ganji": "http://bj.ganji.com/piao/cc_%(train)s/%(date)s/",
}
# File holding the URLs that have already been reported, one per line.
RECORD_FILE = "/tmp/ticket_records.txt"
def parse_record(path=None):
    """Load the set of ticket URLs that have already been reported.

    Args:
        path: File to read, one URL per line. Defaults to ``RECORD_FILE``.

    Returns:
        A set containing every non-blank line of the file with surrounding
        whitespace stripped. If the file cannot be opened, an empty file is
        created at ``path`` and an empty set is returned.
    """
    # io.open behaves the same on Python 2 and 3 and makes the encoding
    # explicit instead of depending on the interpreter default.
    import io

    if path is None:
        path = RECORD_FILE
    try:
        with io.open(path, "r", encoding="utf-8") as record_file:
            stripped = (line.strip() for line in record_file)
            # Skip blank lines so "" never ends up in the set of known URLs.
            return set(line for line in stripped if line)
    except IOError:
        # Typically the first run: create the file so it exists from now on.
        with io.open(path, "w", encoding="utf-8"):
            pass
        return set()
def flush_record(records, path=None):
    """Overwrite the record file with the given URLs, one per line.

    Args:
        records: Iterable of URL strings to persist.
        path: File to write. Defaults to ``RECORD_FILE``.

    The URLs are written in sorted order so the file content is
    deterministic, joined by newlines with no trailing newline.
    """
    # io.open gives explicit UTF-8 output on both Python 2 and 3; the plain
    # Python 2 file object would implicitly ASCII-encode unicode text.
    import io

    if path is None:
        path = RECORD_FILE
    # The context manager guarantees the data is flushed and the handle closed.
    with io.open(path, "w", encoding="utf-8") as record_file:
        record_file.write(u"\n".join(sorted(records)))
def main(config):
    """Fetch every configured listing page and mail newly seen sleeper tickets.

    Args:
        config: Mapping with three keys:
            "trains" - iterable of train numbers,
            "dates"  - iterable of date strings substituted into the URL
                       templates in ``SOURCE``,
            "people" - recipient addresses passed to ``simple_mail``.

    For each train/date/source combination the listing page is downloaded
    and parsed with ``parse_content``. Listings whose text contains u"卧"
    and whose URL is not yet recorded are collected, mailed in one message,
    and then added to the record file. Any download, parse or mail error
    propagates to the caller, in which case the record file is not updated.
    """
    existed = parse_record()
    to_email = []
    for train in config["trains"]:
        for date in config["dates"]:
            for source, url_template in SOURCE.items():
                page_url = url_template % dict(train=train, date=date)
                response = urllib2.urlopen(page_url)
                try:
                    content = response.read()
                finally:
                    # Release the connection even if the read fails.
                    response.close()
                soup = BeautifulSoup(content)
                for href, text in parse_content(source, soup, train):
                    # Resolve against the page that was actually fetched, not
                    # the template, so relative links never contain a literal
                    # "%(train)s" placeholder.
                    ticket_url = urlparse.urljoin(page_url, href)
                    # Sleeper berths only!
                    if ticket_url not in existed and u"卧" in text:
                        to_email.append([text, ticket_url])
                        existed.add(ticket_url)
    if to_email:
        # The mail is sent as HTML, so separate the entries with line breaks;
        # otherwise several tickets run together on a single line.
        lines = [" | ".join(entry) for entry in to_email]
        content = "<br/>\n".join(lines).encode("utf-8")
        simple_mail(config["people"], content)
        # Persist only after the mail went out, so a failed send is retried on
        # the next run. With nothing new there is nothing to persist.
        flush_record(existed)
def parse_content(type, soup, train):
    """Extract ticket links from a parsed listing page.

    Args:
        type: Source key selecting the parser, "58" or "ganji".
        soup: Parsed page (a BeautifulSoup document).
        train: Train number; used to filter the "58" listing.

    Returns:
        A list of ``[href, text]`` pairs, one per listing link found. The
        list is empty for an unknown ``type`` or when the page has none of
        the expected markup.
    """
    result = []
    if type == "58":
        info_table = soup.find("table", id="infolist")
        if info_table:
            # Escape the train number so it is matched literally. The negative
            # lookahead skips timetable links ("<train>时刻表").
            pattern = re.compile(u"%s(?!时刻表)" % re.escape(train), re.I)
            # NOTE(review): with text=, BeautifulSoup 3 appears to return the
            # matching text nodes rather than <tr> tags, hence .parent below -
            # confirm against the BeautifulSoup 3 documentation.
            for node in info_table.findAll("tr", text=pattern):
                link = node.parent
                href = link.get("href")
                if not href:
                    # The matched text is not directly inside a link.
                    continue
                result.append([href, BLANK_RE.sub("", link.text)])
    elif type == "ganji":
        for item in soup.findAll("dl", {"class": "list_piao"}):
            link = item.dt.a if item.dt else None
            href = link.get("href") if link else None
            if not href:
                # Skip entries that lack the expected <dt><a href=...> markup.
                continue
            result.append([href, link.text])
    return result
# SMTP settings used by simple_mail(): server host, login account (also used
# as the sender address), account password and server port.
# NOTE(review): the account and password look like placeholders to be filled
# in before use; avoid committing real credentials in source.
EMAIL_HOST = 'smtp.sohu.com'
EMAIL_HOST_USER = 'yourname@sohu.com'
EMAIL_HOST_PASSWORD = 'yourpassword'
EMAIL_PORT = 25
def simple_mail(to, content):
    """Send an HTML notification mail through the configured SMTP account.

    Args:
        to: Sequence of recipient addresses.
        content: UTF-8 HTML body of the message.

    Raises:
        smtplib.SMTPException or a socket error if connecting, logging in or
        sending fails. The connection is closed in every case.
    """
    import smtplib
    from email.header import Header
    from email.mime.text import MIMEText

    msgRoot = MIMEText(content, 'html', 'UTF-8')
    subject = "[%s]有票来啦!!!!" % datetime.datetime.today().isoformat(" ")
    # The subject is not ASCII, so wrap it in Header to get an RFC 2047
    # encoded header instead of raw 8-bit bytes.
    msgRoot['Subject'] = Header(subject, 'UTF-8')
    msgRoot['From'] = EMAIL_HOST_USER
    msgRoot['To'] = ", ".join(to)

    s = smtplib.SMTP(EMAIL_HOST, EMAIL_PORT)
    try:
        s.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD)
        s.sendmail(EMAIL_HOST_USER, to, msgRoot.as_string())
        # End the SMTP session politely once the message has been accepted.
        s.quit()
    finally:
        # Always release the socket, including when login or send raised.
        s.close()
def switch_time_zone():
    """Switch the process's local time zone to Asia/Shanghai.

    Sets the ``TZ`` environment variable and, where the platform supports
    it, calls ``time.tzset()`` so the change takes effect immediately.
    """
    os.environ["TZ"] = "Asia/Shanghai"
    # time.tzset() is only available on Unix; guard it so that importing this
    # module does not raise AttributeError on other platforms.
    if hasattr(time, "tzset"):
        time.tzset()


# Applied at import time so every timestamp produced below uses Beijing time.
switch_time_zone()
if __name__ == '__main__':
    # Script entry point: which trains and dates to watch, and whom to notify.
    config = {
        "trains": ("k471",),
        "dates": ("20110129",),
        "people": (
            "youremail@sohu.com",
        ),
    }
    try:
        main(config)
    except Exception:
        # Top-level boundary: report the failure on stdout (which cron can
        # capture) instead of letting the traceback terminate the script.
        print(traceback.format_exc())
    else:
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("%s: ok" % datetime.datetime.today())
然后把这个脚本加入cron定时执行,你懂的。
标签:python,火车票,采集器


猜你喜欢
Python使用pandas导入csv文件内容的示例代码
2022-07-25 15:34:16

python编程进阶之异常处理用法实例分析
2023-01-27 16:39:24
adox 的vbs类,提取表名,列名等
2008-07-02 12:37:00
如何使用微信公众平台开发模式实现多客服
2023-11-14 17:48:37
Python借助with语句实现代码段只执行有限次
2022-08-07 15:52:29
python 请求服务器的实现代码(http请求和https请求)
2023-07-10 08:23:58
分享8点超级有用的Python编程建议(推荐)
2022-03-31 08:05:57

WPF自定义搜索框代码分享
2023-07-18 23:31:04

asp如何自动反馈电子邮件?
2002-01-01 06:54:00
python3字符串输出常见面试题总结
2021-01-13 08:39:23
Python 类,property属性(简化属性的操作),@property,property()用法示例
2022-01-04 19:21:53
关于Kotlin中SAM转换的那些事
2022-02-09 15:14:38
Dephi逆向工具Dede导出函数名MAP导入到IDA中的实现方法
2023-04-09 06:31:40

详解如何使用beego orm在postgres中存储图片
2024-04-25 15:14:46
python 使用 requests 模块发送http请求 的方法
2021-06-02 17:29:19
asp文章上一篇,下一篇实现代码
2008-03-24 20:15:00
表格梳理解析python内置时间模块看完就懂
2023-10-21 08:10:27
使用递归删除树形结构的所有子节点(java和mysql实现)
2024-01-12 23:22:16
oracle 的表空间实例详解
2023-06-25 11:39:37
Python实现光速定位并提取两个文件的不同之处
2023-11-01 10:42:25
