Crawl https://it.ithome.com/ityejie/: fetch each list page from the site's AJAX endpoint, follow every article link into its detail page, and extract the title, date, body, author, and comment count, saving the results both to a JSON file and to MongoDB.
import requests
import json
from lxml import etree
from pymongo import MongoClient
url = 'https://it.ithome.com/ithome/getajaxdata.aspx'
headers = {
    # Headers captured from Chrome DevTools. The HTTP/2 pseudo-headers
    # ('authority', 'method', 'path', 'scheme') and the hard-coded
    # 'content-length' have been dropped: requests sets those itself, and a
    # stale Content-Length can corrupt the POST when the form data changes.
    'accept': 'text/html, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # This cookie comes from the original capture and has almost certainly
    # expired; replace it with a fresh one if the server rejects the request.
    'cookie': 'BAIDU_SSP_lcr=https://www.hao123.com/link/https/?key=http%3A%2F%2Fwww.ithome.com%2F&&monkey=m-kuzhan-group1&c=B329C2F33C91DEACCFAEB1680305F198; Hm_lvt_f2d5cbe611513efcf95b7f62b934c619=1530106766; ASP.NET_SessionId=tyxenfioljanx4xwsvz3s4t4; Hm_lvt_cfebe79b2c367c4b89b285f412bf9867=1530106547,1530115669; BEC=228f7aa5e3abfee5d059195ad34b4137|1530117889|1530109082; Hm_lpvt_f2d5cbe611513efcf95b7f62b934c619=1530273209; Hm_lpvt_cfebe79b2c367c4b89b285f412bf9867=1530273261',
    'origin': 'https://it.ithome.com',
    'referer': 'https://it.ithome.com/ityejie/',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
# Local MongoDB: database 'ithome', collection 'ithome'
client = MongoClient()
db = client['ithome']
collection = db['ithome']
# Upper bound on the number of list pages to crawl
max_page = 1000
def get_page(page):
    formData = {
        'categoryid': '31',
        'type': 'pccategorypage',
        'page': page,
    }
    try:
        r = requests.post(url, data=formData, headers=headers)
        if r.status_code == 200:
            # The response body is an HTML fragment; parse it into a DOM tree
            html = r.text
            text = etree.HTML(html)
            # Collect the link of every article on this list page
            link_list = text.xpath('//h2/a/@href')
            print("Extracting articles from page " + str(page))
            for index, link in enumerate(link_list, start=1):
                print("Parsing article " + str(index) + " of page " + str(page))
                print("Link: " + link)
                loadpage(link)
    except requests.ConnectionError as e:
        print('Error', e.args)
def loadpage(link):
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36'}
    try:
        response = requests.get(link, headers=headers)
        if response.status_code == 200:
            html = response.text
            # Parse the detail page
            node = etree.HTML(html)
            ithome = {}
            # Each xpath() call returns a list; every query below matches a
            # single element, so take it by index. Title:
            ithome['title'] = node.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1')[0].text
            # Publication date
            ithome['date'] = node.xpath('//*[@id="pubtime_baidu"]')[0].text
            # Body: join the text of every <p> inside the article container
            ithome['content'] = "".join(node.xpath('//*[@id="paragraph"]/p/text()')).strip()
            # Author
            ithome['author'] = node.xpath('//*[@id="author_baidu"]/strong')[0].text
            # Comment count. Note: this counter is filled in by JavaScript on
            # the live page, so the static HTML often yields an empty value.
            ithome['commentcount'] = node.xpath('//span[@id="commentcount"]')[0].text
            write_to_file(ithome)
            save_to_mongo(ithome)
    except requests.ConnectionError as e:
        print('Error', e.args)
def write_to_file(content):
    # Append one JSON object per line; the 'with' block closes the file,
    # so no explicit close() is needed
    with open('ithome.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
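# Usage note (not part of the original script): ithome.json ends up in JSON
# Lines format, one object per line, so it can be read back with e.g.:
#     with open('ithome.json', encoding='utf-8') as f:
#         articles = [json.loads(line) for line in f]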
def save_to_mongo(result):
    # Collection.insert() is deprecated since PyMongo 3.0 (removed in 4.0);
    # insert_one() is the current API
    if collection.insert_one(result):
        print('Saved to Mongo')
if __name__ == '__main__':
    for page in range(1, max_page + 1):
        get_page(page)
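To spot-check what the crawler stored, here is a minimal sketch (a separate script, assuming a local MongoDB instance with the same 'ithome' database and collection used above):

from pymongo import MongoClient

client = MongoClient()
collection = client['ithome']['ithome']
# How many articles have been saved so far
print(collection.count_documents({}))
# Show the title and date of the first few records
for doc in collection.find().limit(5):
    print(doc['title'], doc.get('date'))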