Python实现快速保存微信公众号文章中的图片

作者：用余生去守护　时间：2021-02-18 23:03:25

一、实现效果(以槿泉壁纸为例)

二、实现过程

1.新建一个link.txt文本文件，将需要下载的文章链接逐行保存（每行一个链接）；

2.新建一个.py文件，将下面的源码复制进去；

3.新建一个pic文件夹，用来保存图片；

4.运行即可；

三、源码

source code

代码如下（示例）：

import requests
from re import findall
from bs4 import BeautifulSoup
import time
import os
import sys

weixin_title=""
weixin_time=""

#获取微信公众号内容,保存标题和时间
def get_weixin_html(url):
    """Fetch a WeChat article page and return its body HTML with images enabled.

    Side effects: stores the article title in the module-level
    ``weixin_title`` (empty string when the page has no ``<h1>``) and the
    first ``YYYY-M-D`` date found in the page source in the module-level
    ``weixin_time`` (empty string when no date is found).

    Args:
        url: Address of the article page.

    Returns:
        The markup of the ``id="js_content"`` element as a string, with its
        first ``<div>`` forced visible and ``data-src="<url>"`` attributes
        rewritten to ``src="<url>"``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The page has no usable ``id="js_content"`` container.
    """
    global weixin_time, weixin_title

    # A timeout keeps a stalled connection from hanging the whole run.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # Validate the article body first so the globals are only updated for
    # pages that can actually be processed.
    content = soup.find(id='js_content')
    if content is None:
        raise ValueError("no element with id 'js_content' found in " + str(url))
    soup2 = BeautifulSoup(str(content), "html.parser")
    if soup2.div is None:
        raise ValueError("'js_content' contains no <div> in " + str(url))

    # Title: get_text() also works when <h1> wraps nested tags, a case in
    # which .string would be None.
    heading = soup.find('h1')
    weixin_title = heading.get_text().strip() if heading is not None else ""

    # Publication date: first date-looking token anywhere in the page source.
    dates = findall(r"(\d{4}-\d{1,2}-\d{1,2})", res.text)
    weixin_time = dates[0] if dates else ""

    # The body is forced visible so the saved HTML renders on its own.
    soup2.div['style'] = 'visibility: visible;'
    html = str(soup2)

    # NOTE: inside the character class `=-_` is a range ('=' through '_'),
    # and neither '&' nor '%' is matched, so a URL stops at the first such
    # character.
    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'

    # Rename data-src to src for every URL that fills a whole attribute value.
    for link in set(findall(pattern, html)):
        html = html.replace('data-src="' + link + '"', 'src="' + link + '"')

    return html

# Download the images referenced by the article and save them locally
def download_pic(content):
    """Download every URL found in *content* and point the markup at the local copies.

    Files are written to ``pic/<path>/``, where ``path`` is the module-level
    variable set by the caller. Each file is named after the 1-based position
    of its URL in *content*; the extension is ``.png`` or ``.gif`` when the
    URL contains that substring and ``.jpg`` otherwise. A URL is attempted at
    most 10 times with a one-second pause between attempts.

    Args:
        content: HTML text to scan for URLs.

    Returns:
        *content* with each successfully downloaded URL replaced by the
        relative path of its local file. URLs that could not be downloaded
        are left unchanged and reported on stdout.
    """
    pic_path = 'pic/' + str(path) + '/'
    os.makedirs(pic_path, exist_ok=True)

    # Find every link that should be downloaded.
    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    pic_list = findall(pattern, content)

    for index, item in enumerate(pic_list, 1):
        pic_url = str(item)

        if 'png' in pic_url:
            file_name = str(index) + '.png'
        elif 'gif' in pic_url:
            file_name = str(index) + '.gif'
        else:
            file_name = str(index) + '.jpg'

        for _attempt in range(10):
            try:
                data = requests.get(pic_url, timeout=30)
                # Treat HTTP errors as failures instead of saving the
                # error page as an image.
                data.raise_for_status()
                with open(pic_path + file_name, "wb") as f:
                    f.write(data.content)
            except (requests.RequestException, OSError):
                time.sleep(1)
            else:
                # Replace the remote link with the local one.
                content = content.replace(pic_url, pic_path + file_name)
                print('已下载第' + str(index) + '张图片.')
                # Short pause between downloads.
                time.sleep(1)
                break
        else:
            # Reached only when all 10 attempts failed; a success on the
            # last attempt is no longer misreported as an error.
            print("下载出错：", pic_url)

    return content

def get_link(dir):
    """Read article links from a text file, one link per line.

    Args:
        dir: Path of the text file to read (a file path, despite the name).

    Returns:
        The non-empty lines of the file in order, with surrounding
        whitespace removed. Blank lines are skipped so they are not
        later treated as (invalid) links.
    """
    # utf-8-sig drops a leading byte-order mark if the editor wrote one, and
    # makes the result independent of the platform's default encoding.
    with open(dir, 'r', encoding='utf-8-sig') as file_to_read:
        stripped = (line.strip() for line in file_to_read)
        return [line for line in stripped if line]

# Load the article links once at start-up.
path = 'link.txt'
linklist = get_link(path)
print(linklist)
s = len(linklist)

if __name__ == "__main__":
    # Each link is handled exactly once. Nothing prompts for new input, so
    # there is no retry loop around this: an empty or all-invalid link list
    # simply ends the run.
    for j, weixin_url in enumerate(linklist, 1):
        # download_pic() reads the module-level `path` to choose the
        # pic/<n>/ folder, so it is rebound for every article.
        path = j

        matches = findall(r'http[s]?:\/\/mp.weixin.qq.com\/s\/[0-9a-zA-Z_]+', weixin_url)
        if not matches:
            print("链接有误，请重新输入!")
            continue

        # Fetch the article HTML, then download its images.
        content = get_weixin_html(weixin_url)
        content = download_pic(content)

        # Characters that are not allowed in file names on common file
        # systems are replaced so the title can be used as a file name.
        safe_title = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in weixin_title)

        # Save the result locally, both as text and as HTML.
        for extension in ('.txt', '.html'):
            with open(safe_title + extension, 'w+', encoding="utf-8") as f:
                f.write(content)

        print()
        print("标题：《" + weixin_title + "》")
        print("发布时间：" + weixin_time)

四、Python正则表达式匹配日期与时间

import re
from datetime import datetime

# Sample sentences containing dates and date-times.
test_date = '小明的生日是2016-12-12 14:34,小张的生日是2016-12-21 11:34 .'
test_datetime = '小明的生日是2016-12-12 14:34,.小晴的生日是2016-12-21 11:34,好可爱的.'

# date: first match
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
print(mat.groups())
# ('2016-12-12',)
print(mat.group(0))
# 2016-12-12

# date: all matches
date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
for item in date_all:
    print(item)
# 2016-12-12
# 2016-12-21

# datetime: first match
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
print(mat.groups())
# ('2016-12-12 14:34',)
print(mat.group(0))
# 2016-12-12 14:34

# datetime: all matches
date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
for item in date_all:
    print(item)
# 2016-12-12 14:34
# 2016-12-21 11:34

## Valid dates

# The pattern only checks the shape, so an impossible date such as
# 2016-12-35 is matched as well:
test_err_date = '如这样的日期2016-12-35也可以匹配到.测试如下.'
print(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0))
# 2016-12-35

# 可以加个判断
def validate(date_text):
    """Return True when *date_text* is a real calendar date written as YYYY-MM-DD.

    The text must parse as a date and must round-trip unchanged through
    the same format, so non-zero-padded forms such as ``2016-1-5`` are
    rejected along with impossible dates such as ``2016-12-35``.

    Args:
        date_text: The candidate date string.

    Returns:
        True for a valid, zero-padded ``YYYY-MM-DD`` date, False otherwise.
    """
    try:
        # The directives must use the ASCII percent sign; a full-width one
        # is not a format directive, so no date would ever parse.
        parsed = datetime.strptime(date_text, "%Y-%m-%d")
    except ValueError:
        return False
    return date_text == parsed.strftime("%Y-%m-%d")

# The impossible date found above is rejected by validate().
print(validate(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0)))
# False

# Matching other formats, e.g. both 2016-12-24 and 2016/12/24.
# A raw string keeps the \d escapes intact for the regex engine.
date_reg_exp = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')

test_str = """
平安夜圣诞节2016-12-24的日子与去年2015/12/24的是有不同哦.
"""
# Find every date that matches the pattern.
matches_list = date_reg_exp.findall(test_str)

# Print each matched date.
for match in matches_list:
    print(match)

# 2016-12-24
# 2015/12/24

来源：https://blog.csdn.net/qq_45365214/article/details/125405386

标签：Python,保存,公众号,文章,图片

投稿

Python实现快速保存微信公众号文章中的图片

一、实现效果(以槿泉壁纸为例)

二、实现过程

三、源码

四、Python正则表达式匹配日期与时间

猜你喜欢

python的open函数使用案例代码

不要放弃使用CSS中的新技术

MYSQL5 下的兼容说明(my.ini my.conf)

使用keras实现非线性回归(两种加激活函数的方式)

Golang使用lua脚本实现redis原子操作

asp如何用FSO对象显示一个文本文件？

oracle 日期函数集合(集中版本)第1/2页

解决asp中ADODB.Stream 0x800A0C93 错误

escape,encodeURI,encodeURIComponent函数比较

Python分析彩票记录并预测中奖号码过程详解

怎样修改 MySQL数据库中的密码

Python使用numpy产生正态分布随机数的向量或矩阵操作示例

利用XML实现通用WEB报表打印实际使用中的例子

MySQL6.0新增特性

优化次数过多的循环

交互设计规范原则

我喜欢你抖音表白程序python版

python字符串拼接.join()和拆分.split()详解

ASP分页技术详解

PHP判断密码强度的方法详解

Python实现快速保存微信公众号文章中的图片

一、实现效果(以槿泉壁纸为例)

二、实现过程

三、源码

四、Python正则表达式匹配日期与时间

猜你喜欢

python的open函数使用案例代码

不要放弃使用CSS中的新技术

MYSQL5 下的兼容说明(my.ini my.conf)

使用keras实现非线性回归(两种加激活函数的方式)

Golang使用lua脚本实现redis原子操作

asp如何用FSO对象显示一个文本文件？

oracle 日期函数集合(集中版本)第1/2页

解决asp中ADODB.Stream 0x800A0C93 错误

escape,encodeURI,encodeURIComponent函数比较

Python分析彩票记录并预测中奖号码过程详解

怎样修改 MySQL数据库中的密码

Python使用numpy产生正态分布随机数的向量或矩阵操作示例

利用XML实现通用WEB报表打印实际使用中的例子

MySQL6.0新增特性

优化次数过多的循环

交互设计规范原则

我喜欢你 抖音表白程序python版

python字符串拼接.join()和拆分.split()详解

ASP分页技术详解

PHP判断密码强度的方法详解

我喜欢你抖音表白程序python版