Crawling http://tech.china.com/articles/

Crawl every page of the paginated news list and scrape the detail page of each article, extracting the title, body text, publication time, source, and related fields.

Create the project

scrapy startproject China
scrapy genspider -t crawl chinatech tech.china.com

items.py

from scrapy import Field, Item


class ChinaItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    title = Field()
    text = Field()
    datetime = Field()
    source = Field()
    url = Field()
    website = Field()
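
As a quick sanity check, a ChinaItem behaves like a dict restricted to its declared fields; a minimal sketch (assuming the China package is importable, e.g. when run from the project root):

from China.items import ChinaItem

item = ChinaItem()
item['title'] = 'Example headline'  # declared fields work like dict keys
print(dict(item))                   # {'title': 'Example headline'}
# item['author'] = '...'            # undeclared field -> raises KeyError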

chinatech.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from China.items import ChinaItem
from China.loaders import ChinaLoader

class ChinatechSpider(CrawlSpider):
    name = 'chinatech'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Follow links to article detail pages and hand them to parse_item
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # Follow the "下一页" (next page) link to walk through the paginated list
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=ChinaItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re=r'来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
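
The allow pattern and restrict_xpaths of the first Rule can be exercised offline against a hand-made response; a minimal sketch (the HTML fragment below is fabricated for illustration):

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Fabricated list-page fragment: one article link, one non-article link
body = b'''<div id="left_side"><div class="con_item">
<a href="http://tech.china.com/article/20180301/123.html">hit</a>
<a href="http://tech.china.com/about/">miss</a>
</div></div>'''

response = HtmlResponse(url='http://tech.china.com/articles/', body=body, encoding='utf-8')
extractor = LinkExtractor(allow=r'article/.*\.html',
                          restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]')
print([link.url for link in extractor.extract_links(response)])
# ['http://tech.china.com/article/20180301/123.html']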

loaders.py

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
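
The output processors are plain callables, so their effect is easy to check in isolation; a minimal sketch with made-up values:

from scrapy.loader.processors import TakeFirst, Join, Compose

take_first = TakeFirst()                           # default_output_processor in NewsLoader
clean_join = Compose(Join(), lambda s: s.strip())  # text_out / source_out in ChinaLoader

print(take_first(['2018-03-01 10:00:00', 'ignored']))  # '2018-03-01 10:00:00'
print(clean_join(['  first line', 'second line  ']))   # 'first line second line'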

pipelines.py

import json


class ChinaPipeline(object):

    def __init__(self):
        # ensure_ascii=False writes raw Chinese characters, so open the file as UTF-8
        self.file = open("china.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(text)
        return item

    def close_spider(self, spider):
        self.file.close()
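
Opening the file in __init__ works, but Scrapy also provides an open_spider hook, which ties the file's lifetime to the crawl; an equivalent variant of the same pipeline:

import json


class ChinaPipeline(object):

    def open_spider(self, spider):
        # called once when the crawl starts
        self.file = open("china.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + ",\n")
        return item

    def close_spider(self, spider):
        # called once when the crawl finishes
        self.file.close()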

settings.py

BOT_NAME = 'China'

SPIDER_MODULES = ['China.spiders']
NEWSPIDER_MODULE = 'China.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'China.pipelines.ChinaPipeline': 300,
}
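
With the pipeline registered, run the crawl from the project root:

scrapy crawl chinatech

The extracted items are written to china.json as they are scraped.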
