阳光采购平台每月初会把当月的价格挂到平台上,现模拟用户登录平台,将需要的数据保存到csv文件和数据库,并且发送给指定人员。
开发环境搭建:
网上教程一大堆,不赘述了。安装好后需要安装一些必须的库,如下:
bs4(页面html解析)
csv(用于保存csv文件)
smtplib(用于发送邮件)
mysql.connector(用于连接数据库)
写了一个gl.py,用于保存全局变量:
'''
遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
'''
import time
timeStr = time.strftime('%Y%m%d', time.localtime(time.time()))
monthStr = time.strftime('%m', time.localtime(time.time()))
yearStr = time.strftime('%Y', time.localtime(time.time()))
LOG_FILE = "log/" + timeStr + '.log'
csvFileName = "csv/" + timeStr + ".csv"
fileName = timeStr + ".csv"
fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(message)s'
loginUrl = "http://yourpath/Login.aspx"
productUrl = 'http://yourpath/aaa.aspx'
username = 'aaaa'
password = "aaa"
preCodeurl = "yourpath"
host="yourip"
user="aaa"
passwd="aaa"
db="mysql"
charset="utf8"
postData={
'__VIEWSTATE':'',
'__EVENTTARGET':'',
'__EVENTARGUMENT':'',
'btnLogin':"登录",
'txtUserId':'aaaa',
'txtUserPwd':'aaa',
'txtCode':'',
'hfip':'yourip'
}
tdd={
'__VIEWSTATE':'',
'__EVENTTARGET':'ctl00$ContentPlaceHolder1$AspNetPager1',
'ctl00$ContentPlaceHolder1$AspNetPager1_input':'1',
'ctl00$ContentPlaceHolder1$AspNetPager1_pagesize':'50',
'ctl00$ContentPlaceHolder1$txtYear':'',
'ctl00$ContentPlaceHolder1$txtMonth':'',
'__EVENTARGUMENT':'',
}
vs={
'__VIEWSTATE':''
}
主代码中设置日志,csv,数据库连接,cookie:
# --- logging setup ---
# NOTE(review): `handler` is created outside this excerpt — presumably a
# FileHandler on gl.LOG_FILE; confirm against the full source.
formatter = logging.Formatter(gl.fmt)
handler.setFormatter(formatter)
logger = logging.getLogger('tst')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# --- csv output (utf_8_sig writes a BOM so Excel opens the file correctly) ---
csvFile = codecs.open(gl.csvFileName, 'w+', 'utf_8_sig')
writer = csv.writer(csvFile)
# --- database connection shared by the save routines ---
conn = mysql.connector.connect(host=gl.host, user=gl.user, passwd=gl.passwd, db=gl.db, charset=gl.charset)
cursor = conn.cursor()
# --- cookie-aware urllib2 opener, installed globally so the login session
#     cookie is sent on every subsequent request ---
cookiejar = cookielib.MozillaCookieJar()
cookieSupport = urllib2.HTTPCookieProcessor(cookiejar)
httpsHandLer = urllib2.HTTPSHandler(debuglevel=0)
opener = urllib2.build_opener(cookieSupport, httpsHandLer)
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')]
urllib2.install_opener(opener)
登录方法:
首先是识别验证码,转为数字。然后用(密码+用户名+验证)提交到登录方法,可能会失败,因为识别验证码有时候识别的不正确。如果登录失败,那么重新获取验证码,再次识别,再次登录,直到登录成功。
def get_logined_Data(opener,logger,views):
    """Log in to the platform, retrying up to 15 times.

    Each attempt fetches a fresh captcha, OCRs it, and posts the login
    form (credentials from gl.postData plus the supplied __VIEWSTATE).
    Success is detected by loading the product page and checking for the
    'tabcontent' element, which only exists after login.

    :param opener: cookie-aware urllib2 opener (keeps the session cookie)
    :param logger: shared logger
    :param views:  current __VIEWSTATE value scraped from the login page
    :return: BeautifulSoup of the product page on success, None after 15 failures
    """
    print "get_logined_Data"
    indexCount = 1
    retData = None
    while indexCount <= 15:
        print "begin login ", str(indexCount), " time"
        logger.info("begin login " + str(indexCount) + " time")
        # random suffix defeats any caching of the captcha image
        vrifycodeUrl = gl.preCodeurl + str(random.random())
        text = get_image(vrifycodeUrl)  # helper: downloads the captcha and OCRs it into digits
        postData = gl.postData
        postData["txtCode"] = text
        postData["__VIEWSTATE"]=views
        data = urllib.urlencode(postData)
        try:
            headers22 = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
            }
            request = urllib2.Request(gl.loginUrl, data, headers22)
            opener.open(request)
        except Exception as e:
            # a failed POST is tolerated: the success check below decides
            print "catch Exception when login"
            print e
        request = urllib2.Request(gl.productUrl)
        response = opener.open(request)
        dataPage = response.read().decode('utf-8')
        bsObj = BeautifulSoup(dataPage,'html.parser')
        # the page only contains the "tabcontent" element after a successful
        # login, so its presence is the login-success test
        tabcontent = bsObj.find(id="tabcontent")
        if (tabcontent is not None):
            print "login succesfully"
            logger.info("login succesfully")
            retData = bsObj
            break
        else:
            print "enter failed,try again"
            logger.info("enter failed,try again")
            time.sleep(3)
            indexCount += 1
    return retData
分析代码发现,每次请求获取数据都需要带上'__VIEWSTATE'这个参数,这个参数存放在页面中,所以需要把'__VIEWSTATE'提取出来,在访问下一页的时候带到请求参数里面去。
验证码解析:
通过验证码的url地址,将验证码保存到本地。因为验证码是彩色的,所以需要先把验证码置灰,然后再调用图像识别转为数字。这个验证码为4位数字,但是调用图像识别的时候,可能会转成字母,所以手动将字母转为数字,转换后识别率还能接受。
def get_image(codeurl):
print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + " begin get code num")
index = 1
while index<=15:
file = urllib2.urlopen(codeurl).read()
im = cStringIO.StringIO(file)
img = Image.open(im)
imgName = "vrifycode/" + gl.timeStr + "_" + str(index) + ".png"
print 'begin get vrifycode'
text = convert_image(img, imgName)
print "vrifycode", index, ":", text
# logger.info('vrifycode' + str(index) + ":" + text)
if (len(text) != 4 or text.isdigit() == False): # 如果验证码不是4位那么肯定是错误的。
print 'vrifycode:', index, ' is wrong'
index += 1
time.sleep(2)
continue
return text
#将图片转为数字
def convert_image(image,impName):
print "enter convert_image"
image = image.convert('L') # 灰度
image2 = Image.new('L', image.size, 255)
for x in range(image.size[0]):
for y in range(image.size[1]):
pix = image.getpixel((x, y))
if pix < 90: # 灰度低于120 设置为 0
image2.putpixel((x, y), 0)
print "begin save"
image2.save(impName) # 将灰度图存储下来看效果
print "begin convert"
text = pytesseract.image_to_string(image2)
print "end convert"
snum = ""
for j in text:#进行简单转换
if (j == 'Z'):
snum += "2"
elif (j == 'T'):
snum += "7"
elif (j == 'b'):
snum += "5"
elif (j == 's'):
snum += "8"
elif (j == 'S'):
snum += "8"
elif (j == 'O'):
snum += "0"
elif (j == 'o'):
snum += "0"
else:
snum += j
return snum
数据转换:
将html数据转换为数组,供保存csv文件和数据库时使用
def paras_data(nameList,logger):
data = []
mainlist = nameList
rows = mainlist.findAll("tr", {"class": {"row", "alter"}})
try:
if (len(rows) != 0):
for name in rows:
tds = name.findAll("td")
if tds == None:
print "get tds is null"
logger.info("get tds is null")
else:
item = []
for index in range(len(tds)):
s_span = (tds[index]).find("span")
if (s_span is not None):
tmp = s_span["title"]
else:
tmp = (tds[index]).get_text()
# tmp=(tds[index]).get_text()
item.append(tmp.encode('utf-8')) # gb2312 utf-8
item.append(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))#本条数据获取时间
data.append(tuple(item))
except Exception as e:
print "catch exception when save csv", e
logger.info("catch exception when save csv" + e.message)
return data
保存csv文件:
def save_to_csv(data, writer):
    """Write every non-None row of *data* to the csv *writer*."""
    writer.writerows(row for row in data if row is not None)
保存数据库:
def save_to_mysql(data,conn,cursor):
try:
cursor.executemany(
"INSERT INTO `aaa`(aaa,bbb) VALUES (%s,%s)",
data)
conn.commit()
except Exception as e:
print "catch exception when save to mysql",e
else:
pass
保存指定页数据:
def get_appointed_page(snum,opener,vs,logger):
    """Fetch page *snum* of the product list and return its data table.

    Posts the AspNetPager post-back form (page number in __EVENTARGUMENT,
    plus the current __VIEWSTATE) to gl.productUrl, extracts the
    <table class="mainlist"> from the response, and stores the page's new
    __VIEWSTATE back into *vs* so the next request can carry it.

    :param snum:   page number to fetch
    :param opener: cookie-aware urllib2 opener (already logged in)
    :param vs:     dict holding the current __VIEWSTATE (updated in place)
    :param logger: shared logger
    :return: BeautifulSoup Tag of the table, or None so the caller retries
    """
    tdd = get_tdd()
    tdd["__VIEWSTATE"] = vs['__VIEWSTATE']
    tdd["__EVENTARGUMENT"] = snum
    tdd = urllib.urlencode(tdd)
    op = opener.open(gl.productUrl, tdd)
    if (op.getcode() != 200):
        # str(snum): snum is an int at the call site; "the" + snum raised
        # TypeError instead of printing the intended message
        print("the" + str(snum) + " page ,state not 200,try connect again")
        return None
    data = op.read().decode('utf-8', 'ignore')
    bsObj = BeautifulSoup(data,"lxml")
    nameList = bsObj.find("table", {"class": "mainlist"})
    # find() returns None when the table is missing; the original bare
    # len(nameList) call crashed with TypeError in that case
    if nameList is None or len(nameList) == 0:
        return None
    viewState = bsObj.find(id="__VIEWSTATE")
    if viewState is None:
        logger.info("the other page,no viewstate,try connect again")
        print("the other page,no viewstate,try connect again")
        return None
    vs['__VIEWSTATE'] = viewState["value"]
    return nameList
Main方法:
while flag == True and logintime <50:
try:
print "global login the ", str(logintime), " times"
logger.info("global login the " + str(logintime) + " times")
bsObj = get_logined_Data(opener, logger,views)
if bsObj is None:
print "try login 15 times,but failed,exit"
logger.info("try login 15 times,but failed,exit")
exit()
else:
print "global login the ", str(logintime), " times succesfully!"
logger.info("global login the " + str(logintime) + " times succesfully!")
viewState_Source = bsObj.find(id="__VIEWSTATE")
if totalNum == -1:
totalNum = get_totalNum(bsObj)
print "totalNum:",str(totalNum)
logger.info("totalnum:"+str(totalNum))
vs = gl.vs
if viewState_Source != None:
vs['__VIEWSTATE'] = viewState_Source["value"]
# 获取指定snum页的数据
# while snum<=totalNum:
while snum<=totalNum:
print "begin get the ",str(snum)," page"
logger.info("begin get the "+str(snum)+" page")
nameList = get_appointed_page(snum, opener, vs, logger)
if nameList is None:
print "get the nameList failed,connect agian"
logger.info("get the nameList failed,connect agian")
raise Exception
else:
print "get the ", str(snum), " successfully"
logger.info("get the " + str(snum) + " successfully")
mydata = paras_data(nameList,logger)
#保存CSV文件
save_to_csv(mydata, snum, writer)
#保存到数据库
save_to_mysql(mydata, conn, cursor)
snum+=1
time.sleep(3)
flag = False
except Exception as e:
logintime+=1
print "catch exception",e
logger.error("catch exception"+e.message)
定时任务设置:
cd /var/spool/cron/
crontab –e#编辑定时任务
输入:1 1 1 * * /yourpath/normal_script.sh>>/yourpath/cronlog.log 2>&1
(上面定时任务的意思是每月1号1点1分执行文件normal_script.sh,日志存放在cronlog.log)
目录结构: