为了把之前写的博客整理到doc文档中,写了个脚本,把标题和对应链接一起爬出来了,这里做个存档。
> [!NOTE]
> 仅适用于 Typecho 博客引擎。
import requests
import re
# Scrape every post title and its permalink from the paginated index of a
# Typecho blog and save them, title first and link second, to a text file.

SITE_ROOT = 'https://linjoey.cn/index.php'
MAX_PAGES = 26  # index pages 1..26 are visited, same range as the original loop
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the script
OUTPUT_FILE = 'mytitle_out.txt'

# Group 1 is the numeric post id, group 2 the link text (the post title).
# The title group is non-greedy so that surrounding whitespace is left to the
# neighbouring [\s]* and the match stops at the first closing </a>; the id is
# \d+ so posts with an id of 1000 or more are not skipped.
ENTRY_PATTERN = re.compile(
    r"<a \n href=\"https://linjoey\.cn/index\.php/archives/(\d+)/\">[\s]*(.*?)[\s]*</a>"
)


def extract_entries(html_str):
    """Return a list of ``(post_id, title)`` string pairs found in *html_str*.

    Only anchors matching ``ENTRY_PATTERN`` are reported; an input without
    such anchors yields an empty list.
    """
    return ENTRY_PATTERN.findall(html_str)


def format_entries(entries):
    """Render ``(post_id, title)`` pairs as the text written to the output file.

    Each entry becomes a blank line, the title, and the post permalink, each
    terminated by a newline. An empty iterable yields an empty string.
    """
    # A single join avoids rebuilding the whole string for every entry.
    return ''.join(
        f'\n{title}\n{SITE_ROOT}/archives/{post_id}/\n'
        for post_id, title in entries
    )


def fetch_page(index):
    """Download index page number *index* and return its decoded HTML.

    Returns ``None`` (after printing the error or the response) when the
    request raises or the status code is not 200, so the caller can stop
    paging while keeping what it has collected so far.
    """
    url = f'{SITE_ROOT}/page/{index}'
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(err)
        return None
    if html.status_code != 200:
        print(html)
        return None
    # bytes.decode() defaults to UTF-8.
    return html.content.decode()


def main():
    """Collect entries from every index page, write them out, then wait for input."""
    entries = []
    for index in range(1, MAX_PAGES + 1):
        html_str = fetch_page(index)
        if html_str is None:
            # First failing page ends the crawl; earlier results are still saved.
            break
        entries.extend(extract_entries(html_str))

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(format_entries(entries))

    # Blocks until the user presses Enter (presumably to keep a console
    # window open when the script is launched by double-click).
    input("Press Any Key")


if __name__ == '__main__':
    main()