Crawl http://tech.china.com/articles/
Scrape the news detail pages behind every page of the news list, extracting the title, body text, publish time, source, and related fields.
Create the project
scrapy startproject China
cd China
scrapy genspider -t crawl chinatech tech.china.com
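After these two commands the project skeleton looks roughly like this (auto-generated files such as middlewares.py and the __init__.py files are omitted; loaders.py is the extra module we add by hand below):

China/
├── scrapy.cfg
└── China/
    ├── items.py
    ├── loaders.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── chinatech.py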
items.py
from scrapy import Field, Item


class ChinaItem(Item):
    # Fields scraped from each news detail page
    title = Field()     # headline
    text = Field()      # article body
    datetime = Field()  # publish time
    source = Field()    # original source of the article
    url = Field()       # page URL
    website = Field()   # site name, fixed to 中华网
chinatech.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from China.items import ChinaItem
from China.loaders import ChinaLoader
class ChinatechSpider(CrawlSpider):
    name = 'chinatech'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Links to article detail pages go to parse_item
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # No callback: just follow the "下一页" (next page) link to walk the whole list
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]')),
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=ChinaItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                         re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
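Before launching the full crawl, the XPaths and regexes are easy to sanity-check against a single article in scrapy shell. The URL below is only a placeholder; substitute any article link copied from the list page:

scrapy shell http://tech.china.com/article/xxx.html
>>> response.xpath('//h1[@id="chan_newsTitle"]/text()').extract_first()
>>> response.xpath('//div[@id="chan_newsInfo"]/text()').re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')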
loaders.py
from scrapy.loader import ItemLoader
# On newer Scrapy versions these processors also live in itemloaders.processors
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    # By default keep only the first extracted value for each field
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    # Join the extracted text fragments into one string, then strip whitespace
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
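To make the processor chain concrete, here is a minimal standalone sketch (the sample strings are made up; the comments show what each call returns):

from scrapy.loader.processors import TakeFirst, Join, Compose

clean = Compose(Join(), lambda s: s.strip())

# Join() glues the fragments together with spaces,
# then the lambda strips the leftover leading/trailing whitespace
print(clean(['\n  5G networks', 'are rolling out  \n']))  # '5G networks are rolling out'

# TakeFirst() returns the first value that is neither None nor ''
print(TakeFirst()(['', None, '中华网']))  # '中华网'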
pipelines.py
import json


class ChinaPipeline:
    def __init__(self):
        # ensure_ascii=False writes Chinese characters directly,
        # so the file needs an explicit utf-8 encoding
        self.file = open('china.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines), which is easy to parse back
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
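If dumping items to a file is all the pipeline needs to do, Scrapy's built-in feed exports can replace it entirely:

scrapy crawl chinatech -o china.jl

This writes one JSON object per line with no custom code; -o china.json would produce a single JSON array instead.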
settings.py
BOT_NAME = 'China'

SPIDER_MODULES = ['China.spiders']
NEWSPIDER_MODULE = 'China.spiders'

# Ignore robots.txt so the article pages can be fetched
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'China.pipelines.ChinaPipeline': 300,
}
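With all of the pieces in place, start the crawl from the project root; the pipeline writes every scraped article to china.json:

scrapy crawl chinatech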