Using XPath

The scripts below fetch Zhihu's Explore page with requests, parse the response with lxml, extract each question, author, and answer via XPath, and save the results as JSON, TXT, CSV, and finally into MongoDB.
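Before the full script, here is a minimal, self-contained sketch of the lxml + XPath pattern used throughout. The HTML snippet is made up for illustration only; Zhihu's real markup differs.

from lxml import etree

# Hypothetical snippet, for illustration only
snippet = '<div class="feed-item"><h2><a>A question</a></h2><span class="content">An answer</span></div>'
tree = etree.HTML(snippet)
# xpath() always returns a list of matching nodes; take the first by index
print(tree.xpath('//div[@class="feed-item"]//h2/a')[0].text)  # A question
print(tree.xpath('//*[@class="content"]')[0].text)            # An answer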
import requests
import json
from lxml import etree
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
for node in node_list:
    # xpath() returns a list; there is only one match here, so take it by index
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }
    with open("explore.json", "a", encoding="utf-8") as f:
        # f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")  # Python 2 style; not needed on Python 3
        f.write(json.dumps(items, ensure_ascii=False) + "\n")
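Each line of explore.json now holds one JSON object (JSON Lines format), so the file can be read back one record at a time. A minimal sketch:

import json

with open("explore.json", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)  # one dict per line
        print(item["question"])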
#### Save as TXT
import requests
from lxml import etree
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
for node in node_list:
    # xpath() returns a list; there is only one match here, so take it by index
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
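Since each record ends with a line of 50 '=' characters, the file can be split back into records. A small sketch, assuming no answer itself contains such a run of '=' characters:

with open('explore.txt', 'r', encoding='utf-8') as file:
    records = [r.strip() for r in file.read().split('=' * 50) if r.strip()]
    for record in records:
        # First line is the question, second the author, the rest the answer
        question, author, answer = record.split('\n', 2)
        print(question)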
#### Save as CSV
import requests
from lxml import etree
import csv
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
# Open the file once and write the header a single time;
# newline='' lets the csv module manage line endings (avoids blank rows on Windows)
with open('explore.csv', 'w', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['question', 'author', 'answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for node in node_list:
        # xpath() returns a list; there is only one match here, so take it by index
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
        # Answer; for easier display, keep only the first 10 characters (text[:10])
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]
        # answer = node.xpath('.//*[@class="content"]')[0].text
        # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
        # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
        writer.writerow({'question': question, 'author': author, 'answer': answer})
#### Read the CSV
import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
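Because the file was written with csv.DictWriter, csv.DictReader can also map each row back to a dict keyed by the header row; a small alternative sketch:

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['question'], row['author'])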
#### Save to MongoDB
import requests
from lxml import etree
from pymongo import MongoClient
# Connect to a local MongoDB server (default localhost:27017), then pick database and collection
client = MongoClient()
db = client['explore']
collection = db['explore']
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select every feed-item node on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')
for node in node_list:
    # xpath() returns a list; there is only one match here, so take it by index
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    # answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]
    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }
    # insert() was removed in modern PyMongo; use insert_one() instead
    result = collection.insert_one(items)
    if result.acknowledged:
        print('Saved to Mongo')
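To check the stored documents, find() with no filter iterates over everything in the collection; a minimal sketch, assuming the same local MongoDB instance:

from pymongo import MongoClient

collection = MongoClient()['explore']['explore']
# find() returns a cursor over every document in the collection
for item in collection.find():
    print(item['question'])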