Preface
Author: 你想要 极客猫喵
Code Implementation
1. The request function
Sets a random request header (and, optionally, a random proxy; that part is left commented out below) and returns the response.text of the page.
# Imports used across the whole script
import random
import re
import time

import requests
import pandas as pd
from lxml import etree


def get_requests(url):
    # Pool of User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
    ]
    # # Proxy IP list
    # ip_list = ['60.216.101.46:59351', '117.69.201.116:9999', '113.128.10.77:9999']
    #
    # # Pick a random proxy
    # random_proxies = {
    #     'http': random.choice(ip_list)
    # }

    # Build a request header with a randomly chosen User-Agent
    random_header = {
        "User-Agent": random.choice(user_agent_list),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }
    # Pass proxies=random_proxies here as well to use the random proxy together with the random header
    response = requests.get(url, headers=random_header)
    return response.text
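If you want to turn on the commented-out proxy logic, requests accepts a proxies mapping alongside headers. Below is a minimal sketch under that assumption; the helper name and the proxy addresses are placeholders, not verified working proxies:

import random
import requests

def get_requests_with_proxy(url):
    # Placeholder proxies -- substitute live ones of your own
    ip_list = ['60.216.101.46:59351', '117.69.201.116:9999']
    random_proxies = {'http': 'http://' + random.choice(ip_list)}
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; demo)'}
    # timeout keeps a dead proxy from hanging the crawl
    response = requests.get(url, headers=headers, proxies=random_proxies, timeout=10)
    return response.text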
2. The get_fakeurl(start, end) function
Returns the list of fake_url values taken from the href attributes on each listing page. We then loop over this list and request each entry to obtain the real URL, i.e. the redirected URL that carries the extra query parameters.
def get_fakeurl(start, end):
    fakeurl_list = []
    for i in range(start, end):
        # Listing pages follow the pattern <domain>/house/i3<page>/
        new_url = domain + '/house/i3' + str(i) + '/'
        response_re = get_requests(new_url)
        selector = etree.HTML(response_re, etree.HTMLParser())
        # Collect the href of every listing on the page
        href_list = selector.xpath('//dl[@class="clearfix"]//dd//h4/a/@href')
        for href_url in href_list:
            fake_detail_url = domain + href_url
            fakeurl_list.append(fake_detail_url)
    return fakeurl_list
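As a quick illustration of the string concatenation above, the listing-page URLs can be previewed without sending any requests (domain is taken from the main block further down):

domain = 'http://huaibei.esf.fang.com/'
for i in range(1, 4):
    print(domain + '/house/i3' + str(i) + '/')
# http://huaibei.esf.fang.com//house/i31/
# http://huaibei.esf.fang.com//house/i32/
# http://huaibei.esf.fang.com//house/i33/

The double slash comes from the trailing slash on domain; servers generally tolerate it, but you can also drop the trailing slash from domain if you prefer cleaner URLs.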
3. Get the redirected detail-page URL
def get_realurl(fakeurl):
    try:
        detail_text = requests.get(fakeurl).text
        # Extract the query-string parameters with a regex and append them to the URL
        real_url = fakeurl + '?' + re.findall(r't3=\'(.*?)\'', detail_text)[0]
        return real_url
    except:
        print("信息丢失")
4. Parse the detail page
XPath expressions match the fields of interest and store them in a dictionary; the dictionaries are then collected into a list for pandas to process and write to an Excel file.
def parse_infor(real_url):
    dic = {}
    dic["url"] = real_url
    response = get_requests(real_url)
    selector = etree.HTML(response, etree.HTMLParser())
    # Title and price at the top of the page
    title = selector.xpath('//div[@class="title rel"]/h1/text()')[0]
    price = selector.xpath('//div[@class="trl-item_top"]/div/i/text()')[0]
    dic["标题"] = "".join(title.split())
    dic["价格"] = price
    # First block of attributes: values in .tt paired with labels in .font14
    infors1 = selector.xpath('//div[@class="tr-line clearfix"]//div/div[@class="tt"]/text()')
    lab1 = selector.xpath('//div[@class="tr-line clearfix"]//div/div[@class="font14"]/text()')
    dic.update(dict(zip(lab1[:6], infors1[:6])))
    dic[lab1[-1]] = "".join(infors1[-1].split())
    # Second block of attributes: values in .rcont paired with labels in .lab
    infors2 = selector.xpath('//div[@class="cont clearfix"]/div[@class="text-item clearfix"]/span[@class="rcont"]/text()')
    lab2 = selector.xpath('//div[@class="cont clearfix"]/div[@class="text-item clearfix"]/span[@class="lab"]/text()')
    dic.update(dict(zip(lab2, infors2)))
    dic[lab2[-1]] = "".join(infors2[-1].split())
    return dic
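The label/value pairing relies on dict(zip(...)): each label returned by one XPath query is matched with the value at the same position in the other. A self-contained illustration with made-up labels and values:

labels = ['户型', '建筑面积', '朝向']    # made-up example labels
values = ['3室2厅', '120平米', '南北']   # made-up example values
record = {}
record.update(dict(zip(labels, values)))
print(record)
# {'户型': '3室2厅', '建筑面积': '120平米', '朝向': '南北'}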
5. Orchestrate the functions defined above so the full crawl runs successfully
def get_allinfor(fakeurl_list):
    house_arry = []
    for fakeurl in fakeurl_list:
        try:
            real_url = get_realurl(fakeurl)
            dic = parse_infor(real_url)
            house_arry.append(dic)
            print("-------------------恭喜你,此房源信息爬取成功!-------------------")
        except:
            print("!!!!!未爬取到,信息丢失!!!!!")
        # Pause between listings to avoid hammering the site
        time.sleep(2)
    return house_arry


def papline():
    # Unused placeholder
    pass
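time.sleep(2) pauses a fixed two seconds between listings. If you prefer a less regular request pattern, one possible variation (not part of the original script) is a random pause:

import random
import time

# Hypothetical variant: sleep between 1 and 3 seconds instead of a fixed 2
time.sleep(random.uniform(1, 3))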
6. Main function: the crawler's entry point
if __name__ == '__main__':
    domain = 'http://huaibei.esf.fang.com/'
    start = 1
    end = 16
    fakeurl_list = get_fakeurl(start, end)
    houseinfor = get_allinfor(fakeurl_list)
    df = pd.DataFrame(houseinfor)
    df.to_excel("huaibei_house.xlsx", index=False)
Result
In the end the data is scraped to a local Excel file, which makes it easy to load later with pandas or numpy for cleaning, analysis, and summary statistics.
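For example, the exported file can be loaded back with pandas for further analysis (assuming the script above has already written huaibei_house.xlsx to the working directory; reading .xlsx files requires the openpyxl package):

import pandas as pd

df = pd.read_excel("huaibei_house.xlsx")
print(df.shape)   # number of listings and columns
print(df.head())  # first few rows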