python手机号前7位归属地爬虫代码实例

作者：wanli001 时间：2021-01-23 05:20:34　

需求分析

项目上需要用手机号前7位来判断号码是否合法，并查询归属地。旧的数据已经是几年前的了，过于陈旧，打算用 Python 爬虫重新爬取一份。

单线程版本

# coding:utf-8
import requests
from datetime import datetime

class PhoneInfoSpider:
    """Look up phone-number segments for a list of number prefixes.

    For every prefix in ``phoneSections`` a run of numbers is generated
    (prefix + 4-digit zero-padded counter + "0000"), each one is requested
    from the Taobao ``mobile_tel_segment`` endpoint, and the fields parsed
    from the response are appended to ``./result.txt``.
    """

    # Seconds to wait on one HTTP lookup; without a timeout a single stalled
    # connection would block the whole (single-threaded) crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, phoneSections):
        """Remember the prefixes to crawl.

        Args:
            phoneSections: iterable of prefix strings, e.g. ``['133', '149']``.
        """
        self.phoneSections = phoneSections

    def phoneInfoHandler(self, textData):
        """Parse one response body and append the result to ``./result.txt``.

        The body is split into lines. If it has at least 9 lines, the text
        between the first pair of single quotes on lines 1, 2, 3 and 5
        (0-based) is taken as number, province, mobile_area and postcode,
        printed, and appended to the result file as one comma-separated line.
        Bodies with fewer than 9 lines are ignored.
        NOTE(review): which response fields those line positions actually
        carry depends on the endpoint's layout -- confirm against a live
        response.

        Args:
            textData: raw response text.

        Raises:
            IndexError: one of the used lines has no single-quoted value.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return

        number, province, mobile_area, postcode = (
            text[idx].split("'")[1] for idx in (1, 2, 3, 5)
        )
        line_text = ",".join((number, province, mobile_area, postcode))
        print(line_text)

        try:
            # ``with`` closes the handle after every write; the explicit
            # encoding keeps non-ASCII field values writable on any locale.
            with open('./result.txt', 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except (OSError, UnicodeError) as e:
            print(type(e).__name__, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Request one phone number and pass the response text to the parser.

        Any failure (network error, timeout, unparsable response) is printed
        and swallowed so that one bad lookup does not abort the crawl.

        Args:
            phoneNum: the full phone number as a string.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

    def requestAllSections(self, last=0, count=10):
        """Crawl every configured prefix, printing begin/end times for each.

        Args:
            last: counter value to resume from, applied to the FIRST prefix
                only (useful after an abnormal exit); every later prefix
                starts again at 0.
            count: exclusive upper bound of the 4-digit counter per prefix.
                The default of 10 matches the previously hard-coded value;
                pass 10000 to cover a whole prefix.
        """
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            for i in range(last, count):
                # prefix + zero-padded counter + "0000" for the last four digits
                phoneNum = head + str(i).zfill(4) + "0000"
                self.requestPhoneInfo(phoneNum)
            # the resume offset only applies to the first prefix
            last = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))

if __name__ == '__main__':
    # Single-threaded entry point: crawl every known prefix and print the
    # overall start and end times.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefixes by carrier: China Telecom, China Unicom, China Mobile,
    # virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice here, which crawled that prefix twice.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    # dict.fromkeys drops any repeated prefix while keeping first-seen order,
    # so a prefix is never crawled (and written to the result file) twice.
    all_num = list(dict.fromkeys(dx + lt + yd + add))

    print(len(all_num))

    # Prefixes to crawl
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

实测发现，爬取一个号段共需10000次查询，单线程版大概需要1个半小时，太慢了。

多线程版本

# coding:utf-8
import requests
from datetime import datetime
import queue
import threading

# Number of worker threads started for each number prefix.
threadNum = 32

class MyThread(threading.Thread):
    """Thread whose body is a caller-supplied zero-argument callable."""

    def __init__(self, func):
        """Create the thread.

        Args:
            func: callable invoked with no arguments when the thread runs.
        """
        super().__init__()
        self.func = func

    def run(self):
        """Invoke the stored callable in the new thread."""
        self.func()

def requestPhoneInfo():
    """Worker loop: claim tasks from the shared queue until it is empty.

    Reads the module-level names ``q`` (task queue), ``lock`` (guards the
    emptiness check together with the dequeue) and ``phone_head`` (prefix
    currently being crawled). For each claimed task a phone number is built
    as prefix + 4-digit zero-padded counter + "0000", requested from the
    Taobao endpoint, and the response text is passed to ``phoneInfoHandler``.
    Request/parse failures are printed and the loop continues.
    """
    while True:
        # ``with`` guarantees the lock is released on every path, including
        # the ``break`` below and any exception raised while it is held.
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # claim one task

        # The entry point enqueues task numbers starting at 1, so the counter
        # is p - 1. It is derived from the task this thread actually claimed:
        # reading q.qsize() here, after the lock is released, races with the
        # other workers and yields duplicated / skipped numbers.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)

        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Timeout so one stalled connection cannot hang this worker forever.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

def phoneInfoHandler(textData):
    """Parse one response body and append the result to ``./result.txt``.

    The body is split into lines. If it has at least 9 lines, the text
    between the first pair of single quotes on lines 1, 2, 3 and 5 (0-based)
    is taken as number, province, mobile_area and postcode, printed, and
    appended to the result file as one comma-separated line. Bodies with
    fewer than 9 lines are ignored.
    NOTE(review): which response fields those line positions actually carry
    depends on the endpoint's layout -- confirm against a live response.

    Args:
        textData: raw response text.

    Raises:
        IndexError: one of the used lines has no single-quoted value.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return

    number, province, mobile_area, postcode = (
        text[idx].split("'")[1] for idx in (1, 2, 3, 5)
    )
    line_text = ",".join((number, province, mobile_area, postcode))
    print(line_text)

    try:
        # ``with`` closes the handle after every write instead of leaking one
        # open file per result; the explicit encoding keeps non-ASCII field
        # values writable regardless of the platform's default locale.
        with open('./result.txt', 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except (OSError, UnicodeError) as e:
        print(type(e).__name__, ":", e)

if __name__ == '__main__':
    # Multi-threaded entry point: for each prefix, fill a queue with 10000
    # tasks and drain it with ``threadNum`` worker threads.
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice here, which crawled that prefix twice.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    # dict.fromkeys drops any repeated prefix while keeping first-seen order.
    all_num = list(dict.fromkeys(dx + lt + yd))
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # q, lock and phone_head are module-level names that the worker
        # function requestPhoneInfo reads; they are rebuilt for every prefix.
        q = queue.Queue()
        lock = threading.Lock()
        phone_head = head

        # Task numbers 1..10000, one per counter value of this prefix.
        for p in range(10000):
            q.put(p + 1)

        print(q.qsize())

        threads = []
        for _ in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Finish this prefix completely before starting the next one, since
        # the workers share the single global phone_head.
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

多线程版爬取1个号码段（10000条数据）大概2～3分钟就能完成，期间 CPU 使用率飙升，大概维持在70%左右。

总共40多个号段，全部爬完大概需要1～2个小时，总数据量在41万条左右。

来源：https://www.cnblogs.com/wanli002/p/11413281.html

标签：python,手机,归属地,爬虫

投稿

python手机号前7位归属地爬虫代码实例

猜你喜欢

5款Python程序员高频使用开发工具推荐

MySQL中ROUND函数进行四舍五入操作陷阱分析

oracle下巧用bulk collect实现cursor批量fetch的sql语句

matplotlib 输出保存指定尺寸的图片方法

Python在字典中查找元素的3种方式

Flask框架使用DBUtils模块连接数据库操作示例

Python 绘制酷炫的三维图步骤详解

Python虚拟环境venv用法详解

Opencv中的cv2.calcHist()函数的作用及返回值说明

Z-Blog实现摘要图文混排效果的方法

简单理解Python中基于生成器的状态机

Python Flask 实现 HTML 文件压缩案例代码(9 级压缩)

Golang使用Consul详解

Vue前端表格导出Excel文件的图文教程

Golang json 库中的RawMessage功能原理

php析构函数的具体用法小结

Python 从subprocess运行的子进程中实时获取输出的例子

SQLServer2005重建索引前后对比分析

oracle删除表字段和oracle表增加字段

浅谈Pytorch中的torch.gather函数的含义