# -*- coding:utf-8 -*- # 
import os
import re
import urllib.request

import bs4
import requests


urlList = []
urlListZuiHou = []
urlPurpose = 'http://blog.sina.com.cn/twocold'


#下载韩寒博客网页
res = requests.get(urlPurpose)

#检查是否下载成功
try:
    res.raise_for_status()
except Exception as exc:
    print('There was a problem:%s'%(exc))

#通过正则表达式在下载网页中匹配博文地址存放在urlList列表中    
patternUrl = r'http://blog.sina\.com\.cn/s/blog.+\.html'
regex = re.compile(patternUrl)
urlList = re.findall(regex,res.text)


#去掉重复的URl路径项,存放在列表urlListZuiHou中 
for i in range(0,len(urlList),4):
    urlListZuiHou.append(urlList[i])




#循环遍历所有博客地址,并下载保存到本地文件
for i in range(len(urlListZuiHou)):
    resBlog = urllib.request.urlopen(urlListZuiHou[i])
    #转化下载文件的编码格式utf-8
    html = str(resBlog.read(),'utf-8')

#创建BeautifulSoup对象,并查找匹配博客标题    
    bs4BOne = bs4.BeautifulSoup(html)
    bs4BlogBiaoTi = bs4BOne.select('title')

#以博客标题为文件名称,将每一篇博客保存为html文件
    openBlogFileHTML = open('C:\\Users\\Nick\\Desktop\\python\\drawing\\2\\quiz\\'+str(bs4BlogBiaoTi[0].getText())+'.html','at')
    openBlogFileHTML.write(html)
    openBlogFileHTML.close()


# Source: reposted from a CSDN blog article