对python读写文件去重、RE、set的使用详解
作者:IBoyMan 时间:2022-09-25 04:33:48
下面的示例脚本逐行读取以制表符分隔的日志文件,用正则表达式(re 模块)从各站点的记录中提取用户 ID,并借助 set 去重后写入结果文件。代码如下所示:
# -*- coding:utf-8 -*-
from datetime import datetime
import re
# Default input log and output UID list used when Main() is called without arguments.
_DEFAULT_SOURCE = '/data/u_lx_data/fudan/muying/muying_11yue_all.txt'
_DEFAULT_TARGET = '/data/u_lx_data/fudan/muying/python/uid_regular_get.txt'

# Index of the tab-separated column that is compared against the site domain.
_DOMAIN_FIELD = 2

# One entry per supported site: (domain, completion message, extraction rules).
# Each rule is (field index, substring the field must contain or None, pattern).
# NOTE(review): field 3 looks like a cookie/query string and field 4 like the
# request URL -- confirm against whatever produces the input file.
# The leading ".*" is greedy, so when a field contains several candidates the
# LAST one is captured; this matches what re.findall(...)[0] returned before.
_SITES = (
    # Babytree
    ('babytree.com', "宝宝树已完成", (
        (3, None, re.compile(r'.*NL=u%02(u\d+)', re.I)),
    )),
    # Youbaobao
    ('youzibuy.com', "柚宝宝已完成", (
        (4, "yunqi.youzibuy.com/tae_top_notify",
         re.compile(r'.*myuid=(\d+)', re.I)),
    )),
    # Mmbang
    ('mmbang.com', "妈妈帮已完成", (
        (3, None, re.compile(r'.*uid=(\d+)', re.I)),
    )),
    # Mama.cn: two independent URL shapes, both checked in this order.
    ('mama.cn', "妈妈网已完成", (
        (4, "mapi.mama.cn/feed/users/show",
         re.compile(r'.*friend_uid=(\d+)', re.I)),
        (4, "mamaquan/mmq_thread",
         re.compile(r'.*uid=(\d+)', re.I)),
    )),
    # ci123 (parenting site)
    ('ci123.com', "育儿网已完成", (
        (3, None, re.compile(r'.*ci123js=([a-zA-Z]+\d+)', re.I)),
    )),
)

_RULES_BY_DOMAIN = {domain: rules for domain, _message, rules in _SITES}


def _match_rules(rules, fields):
    """Yield the UID captured by each rule in *rules* that applies to *fields*."""
    for index, marker, pattern in rules:
        # A truncated record simply has nothing to offer this rule.
        if index >= len(fields):
            continue
        text = fields[index]
        if marker is not None and marker not in text:
            continue
        found = pattern.search(text)
        if found:
            yield found.group(1)


def Main(source_path=_DEFAULT_SOURCE, target_path=_DEFAULT_TARGET,
         encoding='utf-8'):
    """Extract de-duplicated user IDs from a tab-separated log file.

    Every line of *source_path* is split on tabs. When the third column names
    one of the supported sites, that site's regular expressions are applied
    and each UID not seen before (across all sites) is written to
    *target_path*, one per line, in first-seen order.

    Differences from the previous version:
      * records with too few columns are skipped instead of raising
        IndexError;
      * only the line terminator is stripped, so an empty first column no
        longer shifts the remaining columns;
      * the input is opened before the output, so a missing input no longer
        truncates an existing result file;
      * each site's completion message is printed once, after the whole file
        has been read, instead of from inside the per-record loop.

    Args:
        source_path: Path of the tab-separated input file.
        target_path: Path of the UID list to (over)write.
        encoding: Text encoding used for both files.

    Raises:
        OSError: If either file cannot be opened.
    """
    seen_uids = set()      # UIDs already written, shared by all sites
    seen_domains = set()   # supported sites that occurred in the input

    print("开始。。。。。")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # errors='replace': one undecodable byte elsewhere in a record should not
    # abort the whole run.
    with open(source_path, 'r', encoding=encoding, errors='replace') as source:
        with open(target_path, 'w', encoding=encoding) as target:
            for raw in source:
                fields = raw.rstrip('\r\n').split('\t')
                if len(fields) <= _DOMAIN_FIELD:
                    continue
                domain = fields[_DOMAIN_FIELD]
                rules = _RULES_BY_DOMAIN.get(domain)
                if rules is None:
                    continue
                seen_domains.add(domain)
                for uid in _match_rules(rules, fields):
                    if uid not in seen_uids:
                        seen_uids.add(uid)
                        target.write(uid + "\n")

    # Report each site only once the whole input has been consumed.
    for domain, message, _rules in _SITES:
        if domain in seen_domains:
            print(message)

    print("完成。。。。。")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    Main()
来源:https://blog.csdn.net/IBoyMan/article/details/79401596
标签:python,RE,set,去重
0
投稿
猜你喜欢
对python添加模块路径的三种方法总结
2023-11-26 04:01:35
用Python手把手教你实现2048小游戏
2023-02-22 23:27:57
php使用curl抓取qq空间的访客信息示例
2023-10-30 05:50:32
Python使用re模块验证危险字符
2023-10-29 13:46:55
scrapy-redis源码分析之发送POST请求详解
2021-05-19 05:24:03
Django DRF认证组件流程实现原理详解
2021-01-03 08:48:49
pytorch + visdom 处理简单分类问题的示例
2022-08-17 12:56:58
聚焦 DreamWeaver MX 2004
2010-03-25 12:22:00
如何使用python读取Excel指定范围并转为数组
2023-06-13 15:21:10
Python通过TensorFLow进行线性模型训练原理与实现方法详解
2022-11-10 16:17:27
.Net中控件的命名规则
2024-06-05 09:25:15
详解Python牛顿插值法
2023-03-05 05:58:27
mysql自动增量备份的实例方法(本地备份与远程备份)
2024-01-16 12:28:32
从源码解析Python的Flask框架中request对象的用法
2021-02-20 02:15:57
python导入csv文件出现SyntaxError问题分析
2023-12-12 04:29:57
使用rpclib进行Python网络编程时的注释问题
2022-12-26 23:44:47
Express无法通过req.body获取请求传递的数据解决方法
2024-06-05 09:52:06
python线程、进程和协程详解
2023-03-02 14:00:39
Mysql中复制详细解析
2024-01-13 20:46:25
mysql unique key在查询中的使用与相关问题
2024-01-18 20:00:54