# -*- coding: utf-8 -*-
import re
import urllib.request

import bs4
import requests
urlPurpose = 'http://blog.sina.com.cn/twocold'
# Download the front page of Han Han's blog
res = requests.get(urlPurpose)
# Check whether the download succeeded
try:
    res.raise_for_status()
except Exception as exc:
    print('There was a problem: %s' % exc)
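# Note: raise_for_status() raises requests.exceptions.HTTPError for
# 4xx/5xx responses, so a failed download is reported instead of being
# silently parsed below.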
# Use a regular expression to collect every blog post URL from the
# downloaded page into the urlList list
patternUrl = r'http://blog\.sina\.com\.cn/s/blog.+?\.html'
regex = re.compile(patternUrl)
urlList = regex.findall(res.text)
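# A matched URL has the general shape below (hypothetical placeholder ID
# for illustration; the real IDs come from the downloaded page):
#   http://blog.sina.com.cn/s/blog_xxxxxxxx.html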
# Remove duplicate URL entries (the front page lists each post several
# times), keeping the first occurrence of each, in urlListZuiHou
urlListZuiHou = list(dict.fromkeys(urlList))
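# dict.fromkeys keeps first-seen order on Python 3.7+, e.g.
#   list(dict.fromkeys(['a', 'b', 'a'])) == ['a', 'b']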
# Loop over every post URL; download each page and save it to a local file
for url in urlListZuiHou:
    resBlog = urllib.request.urlopen(url)
    # Decode the downloaded page from UTF-8 bytes to text
    html = str(resBlog.read(), 'utf-8')
    # Build a BeautifulSoup object and pull out the blog post title
    bs4BOne = bs4.BeautifulSoup(html, 'html.parser')
    bs4BlogBiaoTi = bs4BOne.select('title')
    # Strip characters that Windows does not allow in file names
    fileName = re.sub(r'[\\/:*?"<>|]', '_', bs4BlogBiaoTi[0].getText())
    # Save each post as an HTML file named after its title
    openBlogFileHTML = open('C:\\Users\\Nick\\Desktop\\python\\drawing\\2\\quiz\\' + fileName + '.html', 'w', encoding='utf-8')
    openBlogFileHTML.write(html)
    openBlogFileHTML.close()
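The script above mixes urllib.request with requests. As a design note, a minimal sketch of the same download-and-save loop done entirely with requests follows; the timeout value, the per-URL error handling, and the file-name sanitising are assumptions added here, not part of the original post:

# Sketch (assumed alternative, not the original author's code): fetch and
# save each post with requests instead of urllib.request. Reuses
# urlListZuiHou, requests, bs4, and re from above.
for url in urlListZuiHou:
    try:
        resBlog = requests.get(url, timeout=10)
        resBlog.raise_for_status()
    except Exception as exc:
        print('Skipping %s: %s' % (url, exc))
        continue
    resBlog.encoding = 'utf-8'  # the blog pages are UTF-8
    soup = bs4.BeautifulSoup(resBlog.text, 'html.parser')
    fileName = re.sub(r'[\\/:*?"<>|]', '_', soup.title.get_text())
    with open('C:\\Users\\Nick\\Desktop\\python\\drawing\\2\\quiz\\' + fileName + '.html', 'w', encoding='utf-8') as f:
        f.write(resBlog.text)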
Downloading all articles from Han Han's blog with Python and storing them locally
This article is reposted from: CSDN blog