### 1. Techniques behind the crawler design
1) Data acquisition: fetch the site's pages over HTTP with modules such as urllib, urllib2, or requests;
2) Data extraction: parse the fetched pages and pull out the fields you need, commonly with regular expressions (re), BeautifulSoup, or XPath;
3) Data storage: persist the extracted data; common targets include plain files, CSV, Excel, MongoDB, and MySQL (a minimal end-to-end sketch of the three stages follows this list).
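To make the three stages concrete before the full script in section 3, here is a minimal sketch that runs them end to end. It reuses the same requests/lxml/pymongo stack, the list-page URL, and the XPath expression from that script; the `demo` collection name is only a placeholder for this example.

```python
# Minimal fetch -> extract -> store pipeline, mirroring the full script below.
# The 'demo' collection name is a placeholder.
import requests
import pymongo
from lxml import etree

html = requests.get('https://music.douban.com/top250',
                    headers={"User-Agent": "Mozilla/5.0"}).text    # 1) acquisition
links = etree.HTML(html).xpath('//div[@class="pl2"]/a/@href')      # 2) extraction
client = pymongo.MongoClient('localhost', 27017)
client.db['demo'].insert_one({"links": links})                     # 3) storage
```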
### 2. Environment
1) Python 2.7
2) MongoDB 2.6
3) Modules used: re, requests, lxml, pymongo (a quick import check is sketched after this list)
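Before starting a crawl it can be worth confirming that the third-party packages import cleanly; a minimal check, assuming only the modules listed above, might look like this:

```python
# Sanity check (Python 2.7): verify the required modules import and
# print their versions before running the crawler.
import re
import requests
import pymongo
from lxml import etree

print "requests", requests.__version__
print "pymongo ", pymongo.version
print "lxml    ", etree.__version__
```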
### 3. Code
# -*- coding: utf-8 -*-
# The source contains Chinese text (the regex patterns below), so Python 2
# needs an explicit encoding declaration.
import re
import sys
import requests
import pymongo
from time import sleep
from lxml import etree
# Python 2 only: make UTF-8 the default string encoding for the Chinese content
reload(sys)
sys.setdefaultencoding('utf8')

def get_web_html(url):
    '''
    @params: url - fetch the page's HTML source with requests and return it
    '''
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"
    }
    response = ''  # fall back to an empty string if the request fails
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def get_music_url(url):
    '''
    @params: url - a Top-250 list page; return the detail-page URLs of the
    albums on it, extracted with XPath
    '''
    music_url_list = []
    selector = etree.HTML(get_web_html(url))
    music_urls = selector.xpath('//div[@class="pl2"]/a/@href')
    for music_url in music_urls:
        music_url_list.append(music_url)
    sleep(1)  # be polite: pause between list pages
    return music_url_list

def get_music_info(url):
    '''
    @params: url - an album detail page; scrape its fields and write them to MongoDB
    '''
    print "Fetching music info from %s ..." % (url)
    response = get_web_html(url)
    selector = etree.HTML(response)
    music_name = selector.xpath('//div[@id="wrapper"]/h1/span/text()')[0].strip()
    author = selector.xpath('//div[@id="info"]/span/span/a/text()')[0].strip()
    # Genre, release date and publisher are pulled from the raw HTML with regex
    styles = re.findall(r'<span class="pl">流派:</span> (.*?)<br />', response, re.S | re.M)
    if len(styles) == 0:
        style = 'Unknown'
    else:
        style = styles[0].strip()
    publish_times = re.findall(r'<span class="pl">发行时间:</span> (.*?)<br />', response, re.S | re.M)
    if len(publish_times) == 0:
        publish_time = 'Unknown'
    else:
        publish_time = publish_times[0].strip()
    publish_users = re.findall(r'<span class="pl">出版者:</span> (.*?)<br />', response, re.S | re.M)
    if len(publish_users) == 0:
        publish_user = 'Unknown'
    else:
        publish_user = publish_users[0].strip()
    scores = selector.xpath('//strong[@class="ll rating_num"]/text()')[0].strip()
    music_info_data = {
        "music_name": music_name,
        "author": author,
        "style": style,
        "publish_time": publish_time,
        "publish_user": publish_user,
        "scores": scores
    }
    write_into_mongo(music_info_data)

def write_into_mongo(data):
    '''
    @params: data - a dict of album fields to insert into MongoDB
    '''
    print "Inserting document %s" % (data)
    try:
        client = pymongo.MongoClient('localhost', 27017)
        db = client.db
        table = db['douban_book']
        table.insert_one(data)
    except Exception as e:
        print e

def main():
    '''Iterate over the 10 Top-250 list pages (25 albums per page)'''
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 230, 25)]
    for url in urls:
        for u in get_music_url(url):
            get_music_info(u)

if __name__ == "__main__":
    main()
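Once a run finishes, the stored documents can be checked with a short query. This sketch assumes the same local MongoDB instance, database (`db`) and collection (`douban_book`) used by the script above:

```python
# Verify the crawl: count the stored documents and print a few of them.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
table = client.db['douban_book']
print "documents stored:", table.count()
for doc in table.find().limit(3):
    print doc["music_name"], doc["scores"]
```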