This article covers the six basic modules of a crawler framework: the spider scheduler, URL downloader, URL manager, HTML downloader, HTML parser, and data store.
Their responsibilities are as follows:
- Spider scheduler: coordinates the work of the other five modules.
- URL downloader: collects the URL links of the pages whose data will be crawled.
- URL manager: manages the URL links, maintains the sets of crawled and uncrawled URLs, and provides an interface for fetching new URLs.
- HTML downloader: takes uncrawled URLs from the URL manager and downloads the corresponding HTML pages.
- HTML parser: takes the downloaded HTML pages from the HTML downloader, extracts the useful data, and hands it to the data store.
- Data store: persists the data extracted by the HTML parser to a file or a database.
- To make this easier to follow, here is a diagram of how the basic crawler framework runs (a minimal code sketch of the same flow follows the diagram).
![Basic crawler framework workflow](https://upload-images.jianshu.io/upload_images/13406307-0c0e1059769c587d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
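The same flow as a minimal code sketch. This is illustrative only: it reuses the class names introduced in the sections below, and seed_urls, engine and table_name are assumed to have been prepared elsewhere.
manager = UrlManager()                      # URL manager: tracks crawled / uncrawled URLs
downloader = HtmlDownloader()               # HTML downloader: fetches page content
manager.add_new_urls(seed_urls)             # seed_urls: links collected by the URL downloader
while manager.has_new_url():
    url = manager.get_new_url()             # take one uncrawled URL
    html = downloader.download(url)         # download the page
    if html is not None:
        table = HtmlParser(html).get_dataframe()        # parse the tabular data on the page
        DataOutput(engine, table, table_name).output()  # persist it to the database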
1. URL Downloader
The URL downloader works in two steps: it first downloads the URLs in the site's left-hand navigation bar, then uses those navigation URLs to collect the list of page links under each sub-column.
![URL downloader workflow](https://upload-images.jianshu.io/upload_images/13406307-35f58d52456b03c9.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
Below is the code that collects all links from the left-hand navigation bar and writes them to a navigation file.
# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
import os
class get_catalog(object):
'''生成和操作导航文件'''
def save_catalog(self):
'''获得证券之星左侧自导航的内容和网址并保存'''
#获取网页内容
url = 'http://quote.stockstar.com'
request =urllib.request.Request(url = url)
response = urllib.request.urlopen(request)
content = response.read().decode('gbk')
#截取左侧导航内容
soup = BeautifulSoup(content,"lxml")
soup = BeautifulSoup(str(soup.find_all('div',class_ = "subMenuBox")),"lxml")
#初始化一级子目录和二级子目录的数据框
catalog1 = pd.DataFrame(columns = ["cata1","cata2","url2"])
catalog2 = pd.DataFrame(columns = ["url2","cata3","url3"])
#整理目录内容和其对应的链接
index1 = 0;index2 = 0
for content1 in soup.find_all('div',class_ = re.compile("list submenu?")):
cata1 = re.findall('>(.*?)<',str(content1.h3.a))
for content2 in content1.find_all('dl'):
cata2 = re.findall('>(.*?)<',str(content2.dt.a).replace('\r\n',''))
url2 = url + content2.dt.a['href']
catalog1.loc[index1] = {'cata1':cata1[0],'cata2':cata2[0].split()[0],'url2':url2}
index1 += 1
for content3 in content2.find_all('li'):
cata3 = re.findall('·(.*?)<',str(content3.a))
url3 = url + content3.a['href']
catalog2.loc[index2] = {'url2':url2,'cata3':cata3[0],'url3':url3}
index2 += 1
#对一级子目录表和二级子目录表做表连接并保存
catalog = pd.merge(catalog1,catalog2,on='url2',how='left')
catalog.to_csv('catalog.csv')
def load_catalog(self):
'''判断导航文件是否存在并载入'''
if 'catalog.csv' not in os.listdir():
self.save_catalog()
print('网址导航文件已生成')
else:
print('网址导航文件已存在')
catalog = pd.read_csv('catalog.csv',encoding='gbk',usecols=range(1,6))
print("网址导航文件已载入")
return(catalog)
def index_info(self,catalog,index):
'''创建每行的行名,作为存入数据库的表名,并获取每行终端的网址链接'''
if str(catalog.loc[index]['cata3'])=='nan':
table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2']
url = catalog.loc[index]['url2']
else:
#+、()等符号不能作为数据库表名,得替换或剔除
if '+' in catalog.loc[index]['cata3']:
cata3 = catalog.loc[index]['cata3'].replace('+','')
table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
elif '(' in catalog.loc[index]['cata3']:
cata3 = catalog.loc[index]['cata3'].replace('(','').replace(')','')
table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
else:
table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + catalog.loc[index]['cata3']
url = catalog.loc[index]['url3']
return(table_name,url)
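The modules below import this class with from get_catalog import get_catalog, so the code above is assumed to live in get_catalog.py. A short usage sketch:
getcata = get_catalog()
catalog = getcata.load_catalog()                    # builds catalog.csv on the first run, then loads it
table_name, url = getcata.index_info(catalog, 0)    # table name and target URL of the first row
print(table_name, url)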
Below is the code that collects all page links for each sub-column.
import pandas as pd
from selenium import webdriver
import time
import re
import math
from get_catalog import get_catalog
class get_urls(object):
'''获取每个栏目的链接列表'''
def __init__(self,browser,url):
self.browser = browser #浏览器对象
self.url = url #待爬取的URL
def get_browser(self):
'''连接URL'''
state = 0
test = 0
while state == 0 and test < 5:
try:
self.browser.get(self.url)
state = 1
print('成功连接 %s'%self.url)
except:
test += 1
def get_element(self):
'''获取翻页相关按钮的链接列表'''
self.get_browser()
element_list=[]
for i in range(1,8):
try:
element = self.browser.find_element_by_xpath('//*[@id="divPageControl1"]/a[%d]'%i).get_attribute('href')
element_list.append(element)
except:
time.sleep(0.2)
return(element_list)
def get_urllist(self):
'''通过翻页相关按钮生成有效的页码链接列表'''
element_list = self.get_element()
if len(element_list)<=1:
urls = [self.url]
else:
try:
max_number = re.search('_(\d*)\.',element_list[len(element_list)-3])
begin = max_number.start() + 1
end = max_number.end() - 1
int_max_number = int(element_list[len(element_list)-3][begin:end])
urls = []
for i in range(1,int_max_number + 1):
url = element_list[len(element_list)-3][:begin] + str(i) + element_list[len(element_list)-3][end:]
urls.append(url)
except:
urls = [self.url]
return(urls)
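A usage sketch. Note that get_element relies on find_element_by_xpath, so this assumes Selenium 3.x (the method was deprecated and later removed in Selenium 4), and the column URL below is only an example; in practice the URLs come from the navigation file built in the previous step.
browser = webdriver.Chrome()                          # needs a matching chromedriver on the PATH
geturls = get_urls(browser, 'http://quote.stockstar.com/stock/ranklist_a.shtml')   # example column URL
urls = geturls.get_urllist()                          # all page URLs of that column
print(len(urls), urls[:3])
browser.quit()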
2. URL Manager
The URL manager keeps two collections: the set of URLs that have already been crawled and the set that have not. Python's set type is used mainly for its built-in deduplication.
Besides the two URL sets, the URL manager exposes the following interface to the other modules:
- Check whether any uncrawled URLs remain: has_new_url().
- Add new URLs to the uncrawled set: add_new_url(url) and add_new_urls(urls).
- Fetch one uncrawled URL: get_new_url().
Below is the code for the URL manager module.
# coding:utf-8
class UrlManager(object):
'''URL管理器'''
def __init__(self):
self.new_urls = set() #未爬取URL集合
self.old_urls = set() #已爬取URL
def has_new_url(self):
'''判断是否有未爬取的URL'''
return(self.new_url_size()!=0)
def get_new_url(self):
'''获取一个未爬取的URL'''
new_url = self.new_urls.pop()
self.old_urls.add(new_url)
return(new_url)
def add_new_url(self,url):
'''将新的URL添加到未爬取的URL集合中'''
if url is None:
return
if url not in self.new_urls and url not in self.old_urls:
self.new_urls.add(url)
def add_new_urls(self,urls):
'''将新的URL列表添加到未爬取的URL集合中'''
if urls is None or len(urls)==0:
return
for url in urls:
self.add_new_url(url)
def new_url_size(self):
        '''获取未爬取URL集合的大小'''
return(len(self.new_urls))
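A quick self-contained check of the interface; the URLs are made up for the example, and the duplicate is silently dropped by the set:
manager = UrlManager()
manager.add_new_urls(['http://quote.stockstar.com/a.shtml',
                      'http://quote.stockstar.com/a.shtml',    # duplicate, ignored
                      'http://quote.stockstar.com/b.shtml'])
while manager.has_new_url():
    print(manager.get_new_url())                               # each URL comes out exactly once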
3. HTML Downloader
The HTML downloader first requests each page with the local IP and, when that fails, falls back to a pool of validated proxy IPs. Below is the code that builds and validates the proxy IP pool.
import urllib.request
import re
import time
import random
import socket
import threading
class proxy_ip(object):
'''获取有效代理IP并保存'''
def __init__(self,url,total_page):
self.url = url #打算爬取的网址
self.total_page = total_page #遍历代理IP网页的页数
def get_proxys(self):
'''抓取代理IP'''
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
ip_totle=[]
for page in range(1,self.total_page+1):
#url = 'http://www.httpsdaili.com/?page='+str(page)
#url='http://www.kuaidaili.com/free/inha/'+str(page)+'/'
url='http://www.xicidaili.com/nn/'+str(page) #西刺代理
headers={"User-Agent":random.choice(user_agent)}
try:
request=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(request)
content=response.read().decode('utf-8')
print('get page',page)
pattern=re.compile('<td>(\d.*?)</td>') #截取<td>与</td>之间第一个数为数字的内容
ip_page=re.findall(pattern,str(content))
ip_totle.extend(ip_page)
except Exception as e:
print(e)
time.sleep(random.choice(range(1,5)))
#打印抓取内容
print('代理IP地址 ','\t','端口','\t','速度','\t','验证时间')
for i in range(0,len(ip_totle),4):
print(ip_totle[i],' ','\t',ip_totle[i+1],'\t',ip_totle[i+2],'\t',ip_totle[i+3])
#整理代理IP格式
proxys = []
for i in range(0,len(ip_totle),4):
proxy_host = ip_totle[i]+':'+ip_totle[i+1]
proxy_temp = {"http":proxy_host}
proxys.append(proxy_temp)
return(proxys)
def test(self,lock,proxys,i,f):
'''验证代理IP有效性'''
socket.setdefaulttimeout(15) #设置全局超时时间
url = self.url
try:
proxy_support = urllib.request.ProxyHandler(proxys[i])
opener = urllib.request.build_opener(proxy_support)
opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64)")]
urllib.request.install_opener(opener)
#res = urllib.request.urlopen(url).read().decode('gbk')
res = urllib.request.urlopen(url).read().decode('utf-8')
print(res)
lock.acquire() #获得锁
print(proxys[i],'is OK')
f.write('%s\n' %str(proxys[i])) #写入该代理IP
lock.release() #释放锁
except Exception as e:
lock.acquire()
print(proxys[i],e)
lock.release()
def get_ip(self):
'''多线程验证'''
f = open('proxy_ip.txt','a+') #新建一个储存有效IP的文档
lock=threading.Lock() #建立一个锁
#多线程验证
proxys = self.get_proxys()
threads=[]
for i in range(len(proxys)):
thread=threading.Thread(target=self.test,args=[lock,proxys,i,f])
threads.append(thread)
thread.start()
#阻塞主进程,等待所有子线程结束
for thread in threads:
thread.join()
f.close() #关闭文件
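A usage sketch. The free proxy site hard-coded above (xicidaili) may no longer be reachable, so treat the proxy source as replaceable; the point here is the interface:
ip_pool = proxy_ip('http://quote.stockstar.com', 5)   # validate proxies against the target site, scan 5 listing pages
ip_pool.get_ip()                                      # working proxies are appended to proxy_ip.txt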
Below is the code for the HTML downloader module itself.
# _*_ coding:utf-8 _*_
from firstSpider.get_proxy_ip import proxy_ip
import urllib.request
import random
import os
import socket
import time
import re
class HtmlDownloader(object):
'''获取网页内容'''
def download(self,url):
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
state = 0;test = 0
socket.setdefaulttimeout(20) #设置全局超时时间
while state == 0 and test < 5:
try:
request = urllib.request.Request(url=url,headers={"User-Agent":random.choice(user_agent)})#随机从user_agent列表中抽取一个元素
response = urllib.request.urlopen(request)
readhtml = response.read()
content = readhtml.decode('gbk') #读取网页内容
time.sleep(random.randrange(1,6))
if re.search('Auth Result',content) == None:
state = 1
except Exception as e:
print('系统IP获取网页失败','',e)
if 'proxy_ip.txt' not in os.listdir() or os.path.getsize('proxy_ip.txt') == 0:
print('代理IP池不存在,新建代理IP池')
pool = proxy_ip(url,5)
pool.get_ip()
print('代理IP池创建完毕')
else:
f = open('proxy_ip.txt','r')
proxys_ip = f.readlines()
f.close()
random.shuffle(proxys_ip)
for i in range(len(proxys_ip)):
try:
proxy_support = urllib.request.ProxyHandler(eval(proxys_ip[i][:-1]))
opener = urllib.request.build_opener(proxy_support)
opener.addheaders=[("User-Agent",random.choice(user_agent))]
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
readhtml = response.read()
content = readhtml.decode('gbk')
time.sleep(random.randrange(1,6))
if re.search('Auth Result',content) == None: #排除被判别为无效用户的情况
state = 1
print('成功接入代理IP',proxys_ip[i])
break
                        except urllib.error.HTTPError as e:
                            print(proxys_ip[i],'请求失败',e.code)
                        except urllib.error.URLError as e:
                            print(proxys_ip[i],'请求失败',e.reason)
                        except Exception as e:
                            print(proxys_ip[i],'请求失败',e)
try:
if i == len(proxys_ip)-1:
os.remove('proxy_ip.txt')
print('代理IP池失效,已删除')
except: #i不存在的情况
os.remove('proxy_ip.txt')
print('代理IP池为空,文件已删除')
time.sleep(60)
test += 1
if test == 5:
print('未成功获取 %s 页面内容'%url)
content = None
return(content)
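A usage sketch of the downloader (the URL is only an example):
downloader = HtmlDownloader()
html = downloader.download('http://quote.stockstar.com/stock/ranklist_a.shtml')   # example page
if html is not None:                       # download returns None after five failed attempts
    print(html[:200])                      # first 200 characters of the decoded page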
4. HTML Parser
The HTML parser extracts the table header, the data rows, and the data date from a downloaded page and assembles them into a pandas DataFrame.
# coding:utf-8
import re
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np
import time
import datetime
class HtmlParser(object):
'''解析网页内容'''
def __init__(self,content):
self.soup = BeautifulSoup(content,"lxml") #待解析内容
def get_header(self):
'''获取表格标题'''
try:
header = []
for tag in self.soup.thead.find_all('td'):
title = str(tag)
title = title.replace(' ','')
title = title.replace('\n','')
header.extend(re.findall('>(.*?)<',title))
header_name = []
for data in header:
if data != '':
header_name.append(data.strip())
header_name.append('数据时间')
except: #无标题返回空列表,标记了该内容是否有效
header_name = []
return(header_name)
h2_len = len(self.soup.thead.find_all('td',class_ = "h2"))
datalist_len = len(self.soup.find_all('tbody',id="datalist") + self.soup.find_all('tbody',id="datalist1") + self.soup.find_all('tbody',id="datalist2"))
if h2_len >= 6 or datalist_len == 0: #排除了标题格式不统一和没数据的两种情况
header_name = []
return(header_name)
def get_header2(self):
'''获取表格标题(标题存在两层)'''
stati_date = []
for date in self.soup.thead.find_all('td',class_ = "double align_center"):
stati_date.extend(re.findall('>(.*?)<',str(date)))
header_total = self.get_header()
header_name = header_total[:-5]
header_name = header_name[:2] + header_total[-5:-1] + header_name[2:]
if stati_date[0] in header_name:
header_name.remove(stati_date[0])
if stati_date[1] in header_name:
header_name.remove(stati_date[1])
header_name.append('三四列统计时间')
header_name.append('五六列统计时间')
header_name.append('数据时间')
return(header_name,stati_date)
def get_datatime(self):
'''获取数据时间'''
try:
date = re.findall('数据时间:(.*?)<',str(self.soup.find_all('span',class_ = "fl")))[0][0:10]
except: #若不存在,根据系统时间推断
now_time = time.localtime()
if time.strftime("%w",now_time) in ['1','2','3','4','5']:
date = time.strftime("%Y-%m-%d",now_time)
elif time.strftime("%w",now_time) == '6':
dt = (datetime.datetime.now() - datetime.timedelta(days = 1))
date = dt.strftime("%Y-%m-%d")
else:
dt = (datetime.datetime.now() - datetime.timedelta(days = 2))
date = dt.strftime("%Y-%m-%d")
return(date)
def get_datalist(self):
'''获取数据内容'''
if len(self.soup.find_all('tbody',id="datalist")) >= 1:
soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist")[0]),"lxml")
elif len(self.soup.find_all('tbody',id="datalist1")) >= 1:
soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist1")[0]),"lxml")
else:
soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist2")[0]),"lxml")
date = self.get_datatime()
row = len(soup.tbody.find_all('tr'))
#初始化正常标题和双重标题时的数组
if len(self.soup.thead.find_all('td',class_ = "double align_center")) == 0:
header_name = self.get_header()
col = len(header_name)
datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
flag = 1
else:
header_name = self.get_header2()[0]
col = len(header_name)
datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
flag = 2
for i in range(row): #提取数据并写入数组
detail = re.findall('>(.*?)<',str(soup.find_all('tr')[i]))
for blank in range(detail.count('')):
detail.remove("")
try:
if flag == 1:
detail.append(date)
datalist[i] = detail
elif flag == 2:
stati_date = self.get_header2()[1]
detail.append(stati_date[0])
detail.append(stati_date[1])
detail.append(date)
datalist[i] = detail
except:
datalist[i][0] = detail[0]
datalist[i][col-1] = date
return(datalist,header_name)
def get_dataframe(self):
'''组合标题和数据数据为数据框并输出'''
datalist,header_name = self.get_datalist()
table = pd.DataFrame(datalist ,columns = header_name)
return(table)
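A usage sketch chaining the downloader and the parser, where url is assumed to be one of the column page links gathered earlier:
html = HtmlDownloader().download(url)        # url: one page of a column, e.g. from get_urls
if html is not None:
    parser = HtmlParser(html)
    if len(parser.get_header()) > 0:         # an empty header marks the page as having no usable table
        table = parser.get_dataframe()
        print(table.head())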
5. Data Store
The data store writes each parsed DataFrame into a MySQL table through SQLAlchemy.
import pymysql
from sqlalchemy import create_engine
import pandas as pd
from firstSpider.HtmlParser import HtmlParser
class DataOutput(object):
'''把数据存入MYSQL数据库'''
def __init__(self,engine,table,table_name):
self.engine = engine #数据库连接引擎
self.table = table #要储存的表
self.table_name = table_name #表名
def output(self):
self.table.to_sql(name = self.table_name,con = self.engine,if_exists = 'append',index = False,index_label = False)
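A usage sketch. The connection string is a placeholder (replace user, password and database with your own MySQL settings), table is a DataFrame produced by HtmlParser, and the table name is hypothetical:
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:password@localhost:3306/yourdb?charset=utf8')   # placeholder credentials
DataOutput(engine, table, 'stock_ranklist_a').output()     # appends the DataFrame to the MySQL table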
6. Spider Scheduler
The scheduler ties the modules together: for each column it feeds the page links into the URL manager and uses a thread pool to download, parse, and store every page.
from firstSpider.UrlManager import UrlManager
from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser
from firstSpider.DataOutput import DataOutput
from sqlalchemy import create_engine
import threadpool,time
class SpiderMan(object):
'''爬虫机器人'''
def __init__(self,engine,table_name):
self.engine = engine #数据库连接引擎
self.table_name = table_name #表名
self.manager = UrlManager() #URL管理器
self.downloader = HtmlDownloader() #HTML下载器
def spider(self,url):
'''单网页爬虫组件'''
# HTML下载器下载网页
        html = self.downloader.download(url)
        if html is None:                  # 下载失败时返回None,跳过该页,避免写入和解析出错
            return(None)
        f = open('stock.txt','w')
        f.write(html)
        f.close()
# HTML解析器抽取网页数据
parser = HtmlParser(html)
if len(parser.get_header()) > 0:
data = parser.get_dataframe()
# 数据储存器储存文件
out = DataOutput(self.engine,data,self.table_name)
out.output()
print('%s 的数据已存入表 %s'%(url,self.table_name))
time.sleep(1)
return(parser.get_datatime())
def crawl(self,urls):
'''爬取一个栏目连接列表的内容'''
self.manager.add_new_urls(urls)
# 判断url管理器中是否有新的url
pool = threadpool.ThreadPool(10)
while(self.manager.has_new_url()):
# 从URL管理器获取新的url
new_url = self.manager.get_new_url()
requests = threadpool.makeRequests(self.spider,(new_url,))
pool.putRequest(requests[0])
pool.wait()
Complete Code
Finally, the complete driver script, which crawls all of the day's data from stockstar.com following the left-hand sub-navigation.
from firstSpider.get_proxy_ip import proxy_ip
from firstSpider.get_catalog import get_catalog
from firstSpider.get_urls import get_urls
from firstSpider.SpiderMan import SpiderMan
from selenium import webdriver
from sqlalchemy import create_engine
import time
'''根据左侧子导航下载证券之星当天所有数据'''
if __name__ == "__main__":
print('获取代理IP并验证有效性')
ip_pool = proxy_ip('http://quote.stockstar.com',8)
ip_pool.get_ip()
print('代理IP池建立完毕')
getcata = get_catalog()
catalog = getcata.load_catalog()
start = 0
end = len(catalog)
catalog = catalog[start : end]
print('初始化浏览器')
browser = webdriver.Chrome()
engine = create_engine('mysql+pymysql://root:Jwd116875@localhost:3306/scott?charset=utf8')
for index in range(start,end):
table_name,url = getcata.index_info(catalog,index)
stop_url = ['http://quote.stockstar.com/gold/globalcurrency.shtml'] #想过滤掉的网页链接
if url not in stop_url:
geturls = get_urls(browser,url)
urls = geturls.get_urllist()
print('已获取 %s 的链接列表'%table_name)
Spider_man = SpiderMan(engine,table_name)
Spider_man.crawl(urls)
datatime = Spider_man.spider(urls[0])
print('%s: %s 栏目 %s 的增量数据爬取完毕'%(index,table_name,datatime))