创建项目
scrapy startproject douyu
编写items.py
import scrapy


class DouyuItem(scrapy.Item):
    """Container for one Douyu room scraped from the vertical-room API."""

    # Streamer's display name (the spider copies the API's "nickname" value).
    nickname = scrapy.Field()
    # URL of the room's image (the spider copies the API's "vertical_src" value).
    imagelink = scrapy.Field()
    # Local path of the downloaded image; set by the images pipeline.
    imagePath = scrapy.Field()
创建基础类的爬虫
scrapy genspider douyutupian capi.douyucdn.cn
手机抓包得到API接口,返回JSON格式数据
douyutupian.py
import scrapy
from douyu.items import DouyuItem
import json
'''
遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
'''
class DouyumeinvSpider(scrapy.Spider):
    """Walk the Douyu vertical-room API page by page, yielding one item per room.

    The API returns JSON; each page is requested with ``limit=20`` and an
    increasing ``offset`` until a page comes back without any rooms.
    """

    name = "douyutupian"
    allowed_domains = ["capi.douyucdn.cn"]
    # Offset of the page currently being requested; advanced by 20 per page
    # to match the ``limit=20`` baked into ``url``.
    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield a ``DouyuItem`` per room on this page, then request the next page.

        Args:
            response: Scrapy response whose body is the API's JSON payload.

        Yields:
            ``DouyuItem`` objects with ``nickname`` and ``imagelink`` set,
            followed by a ``scrapy.Request`` for the next page when this page
            contained at least one room.
        """
        # Convert the JSON body to Python objects; "data" is expected to be a
        # list of room dicts.
        data = json.loads(response.text).get("data")

        # An empty, missing or non-list "data" means there is nothing left to
        # scrape. Stop here instead of requesting further offsets forever.
        if not isinstance(data, list) or not data:
            return

        for each in data:
            item = DouyuItem()
            item["nickname"] = each["nickname"]
            item["imagelink"] = each["vertical_src"]
            yield item

        # Advance to the next page and parse it with this same callback.
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
管道文件pipelines.py
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline
import os
class ImagesPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the streamer.

    NOTE: this class intentionally keeps the same name as the Scrapy base
    class it extends, because the project's ``ITEM_PIPELINES`` setting refers
    to it as ``douyu.pipelines.ImagesPipeline``.
    """

    # Image storage directory, read from the project's settings file.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL."""
        image_url = item["imagelink"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

        Args:
            result: Iterable of ``(ok, info_dict)`` pairs, one per requested
                image; ``info_dict["path"]`` is the stored file's path
                relative to ``IMAGES_STORE``.
            item: The item whose image was requested.
            info: Pipeline bookkeeping object (unused here).

        Returns:
            The item, with ``imagePath`` set to the renamed file's path.

        Raises:
            DropItem: If no image was downloaded successfully for the item.
        """
        # Imported locally so this block does not rely on extra file-level imports.
        from scrapy.exceptions import DropItem

        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Without this guard a failed download crashes with IndexError.
            raise DropItem("Item contains no images")

        # The nickname comes from a remote API; strip path separators so it
        # cannot point outside IMAGES_STORE or into a missing sub-directory.
        safe_name = item["nickname"].replace("/", "_").replace("\\", "_")

        source = os.path.join(self.IMAGES_STORE, image_path[0])
        target = os.path.join(self.IMAGES_STORE, safe_name + ".jpg")
        os.rename(source, target)

        # Store the real on-disk path, including the ".jpg" suffix.
        item["imagePath"] = target
        return item
settings.py
# Scrapy project settings for the "douyu" project.
BOT_NAME = 'douyu'

# Where Scrapy looks for spiders and where `genspider` creates new ones.
SPIDER_MODULES = ['douyu.spiders']
NEWSPIDER_MODULE = 'douyu.spiders'

# Headers sent with every request.
# NOTE(review): the User-Agent presumably mimics the mobile app the API was
# captured from — confirm before changing it.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "DYZB/1 CFNetwork/808.2.16 Darwin/16.3.0",
}

# Enable the project's image pipeline (lower numbers run earlier).
ITEM_PIPELINES = {
    'douyu.pipelines.ImagesPipeline': 300,
}

# Directory where downloaded images are stored.
IMAGES_STORE = "../../Images"