Python爬取typecho博客的标题链接

为了把之前写的博客整理到 doc 文档中，我写了个脚本，把每篇文章的标题和对应链接一起爬取出来，这里做个存档。

> [!NOTE]
> 仅适用于 Typecho 博客引擎。
import requests
import re

# Highest listing page to request; crawling stops earlier at the first page
# that cannot be fetched.
MAX_PAGES = 26
# Seconds to wait for the server before giving up on a single page.
REQUEST_TIMEOUT = 10
# File that receives the collected titles and links.
OUTPUT_FILE = 'mytitle_out.txt'

# Matches a post link exactly as the blog's HTML lays it out (note the literal
# newline + four spaces between "<a" and "href").
# Group 1 is the archive id, group 2 is the title. The title group is
# non-greedy so surrounding whitespace is left to the [\s]* parts instead of
# being captured, and the id accepts any number of digits.
regex = r"<a \n    href=\"https://linjoey\.cn/index\.php/archives/(\d+)/\">[\s]*(.*?)[\s]*</a>"


def extract_posts(html_str):
    """Return a list of (archive_id, title) string pairs found in *html_str*."""
    return re.findall(regex, html_str)


def format_entries(posts):
    """Render (archive_id, title) pairs as the text written to the output file.

    Each post becomes a blank line, the title, and the post URL.
    """
    return ''.join(
        f'\n{title}\nhttps://linjoey.cn/index.php/archives/{post_id}/\n'
        for post_id, title in posts
    )


def fetch_page(index):
    """Download listing page *index* and return its HTML as a string.

    Returns None (after printing the reason) when the request fails, the
    server answers with a non-200 status, or the body cannot be decoded.
    """
    url = f'https://linjoey.cn/index.php/page/{index}'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(err)
        return None
    if response.status_code != 200:
        print(response)
        return None
    try:
        return response.content.decode()
    except UnicodeDecodeError as err:
        print(err)
        return None


def main():
    """Crawl the listing pages and save every post title with its link."""
    posts = []
    pages_fetched = 0
    for index in range(1, MAX_PAGES + 1):
        html_str = fetch_page(index)
        if html_str is None:
            # Stop at the first page that cannot be read; keep what we have.
            break
        pages_fetched += 1
        posts.extend(extract_posts(html_str))

    # Write once, after crawling. Skip the write when nothing was fetched so
    # an existing output file is not replaced by an empty one.
    if pages_fetched:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write(format_entries(posts))

    # Keeps a console window open when the script is launched by double-click.
    input("Press Any Key")


if __name__ == "__main__":
    main()
无标签