#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pymongo.errors import ConfigurationError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery
from config_vip import *
from multiprocessing import Pool
from selenium.webdriver.chrome.options import Options
import os
import pymongo
import requests
import hashlib
import time
if browser_method == 0:
    browser = webdriver.Chrome()
    print('Using the Chrome() driver...')
elif browser_method == 1:
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('Using the PhantomJS() driver...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('Using the headless Chrome driver...')
browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)
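# NOTE: webdriver.PhantomJS() was removed in Selenium 4, so browser_method == 1
# assumes a Selenium 3.x install with a PhantomJS binary on PATH. Because the
# script targets Windows (see the taskkill calls at the bottom), each Pool
# worker re-imports this module and therefore drives its own browser instance.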
try:
    client = pymongo.MongoClient(mongo_url)
    database = client[mongo_database]
    # database handle shared by save_to_mongodb() below
except (TypeError, ConfigurationError):
    print('Failed to create the database connection'.center(130, '*'))
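# In save_to_mongodb() the search keyword itself is used as the collection
# name, so each search term gets its own collection inside this database.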
def drop_down_scrollbar():
    # Scroll the page down step by step so lazy-loaded content gets rendered.
    times = 1
    while times < total_times:
        js = "var q=document.documentElement.scrollTop={}".format(times * size)
        browser.execute_script(js)
        time.sleep(1)
        times += 1
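# Each step nudges document.documentElement.scrollTop down by `size` pixels so
# lazy-loaded product images enter the viewport; an equivalent injection would
# be browser.execute_script("window.scrollTo(0, {})".format(times * size)).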
def get_search(search_word):
    # Open the home page, run an on-site search, scrape result page 1 and
    # return the total page count as a string.
    browser.get(main_url)
    time.sleep(3)
    try:
        search_bar = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))
        enter_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))
        # Wait until the search box and search button are usable.
        search_bar.send_keys(search_word)
        time.sleep(1)
        enter_button.click()
        # Type the keyword and trigger the search.
        time.sleep(5)
        drop_down_scrollbar()
        pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))
        # Total page count; main() uses it as the loop bound.
        print('Search for {} returned {} pages of results'.format(search_word, pages.text))
        time.sleep(3)
        print('Fetching page 1 of {}...'.format(search_word))
        get_page_detail(search_word)
        print('Finished page 1 of {}...'.format(search_word))
        return pages.text
    except TimeoutException as e:
        print('The page did not finish loading; search aborted!', e.args)
    except WebDriverException as e:
        print(e.args)
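# NOTE: '#J_pagingCt > a:nth-child(6)' assumes the sixth pager link carries the
# last page number; if vip.com reshuffles its pager markup this lookup breaks.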
def get_next_page(search_word, page):
    # Jump straight to result page `page` via the URL template.
    try:
        url = url_search.format(search_word, str(page))
        print('Fetching page {} of {}...\n'.format(page, search_word))
        browser.get(url)
        drop_down_scrollbar()
        get_page_detail(search_word)
        print('Finished page {} of {}...\n'.format(page, search_word))
    except TimeoutException as e:
        print('Timed out while navigating!', e.args)
    except WebDriverException as e:
        print(e.args)
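# Page 1 is scraped via the on-site search in get_search(); pages 2 and up are
# reached directly through the url_search template defined in config_vip.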
def get_page_detail(search_word):
    # Parse the current result page and extract every product on it.
    try:
        source = browser.page_source
        html = PyQuery(source)
        print('Parsed the page source'.center(130, '*'))
        # Parse the rendered source with PyQuery.
        good_items = html('.goods-list .goods-list-item').items()
        for item in good_items:
            # Pull each field out with find() and a CSS selector.
            goods = {
                'good-title': item.find('.goods-title-info').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                'good-market-price': item.find('.goods-info .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find('.goods-slide .goods-image-link').attr('href'))
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # Only download when the image URL actually responded.
                download_image(content, search_word, image_url)
            save_to_mongodb(goods, search_word)
    except TimeoutException as e:
        print('Timed out while scraping the page!', e.args)
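# A scraped document ends up shaped roughly like this (values illustrative):
# {'good-title': 'Some product', 'good-sells-price': '199.00',
#  'good-market-price': '399.00', 'good-discount': '5.0折', 'good-brand': '...',
#  'image': 'http://.../abc.jpg', 'detail': 'http://...'}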
def save_to_mongodb(goods, database_table):
    # Store one product document in a collection named after the keyword.
    try:
        result = database[database_table].insert_one(goods)
        if result.acknowledged:
            # insert_one() raises on failure, so reaching here means success.
            print('Stored the document'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        print('Failed to write to the database!', e.args)
def get_image_content(url):
    # Download the raw image bytes; any failure yields a falsy value.
    try:
        # The timeout keeps one dead image URL from hanging the whole worker.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.content
        print('Requesting the image URL failed!')
        # Falls through to an implicit None; the caller only checks truthiness.
    except requests.RequestException as e:
        # requests wraps urllib3's NewConnectionError/MaxRetryError into its own
        # ConnectionError, so one RequestException handler covers them all.
        print(e.args)
        return False
def download_image(content, folder, image_url):
    # Save the image bytes under the configured per-keyword directory.
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    os.makedirs(path, exist_ok=True)
    # The hex MD5 of the raw bytes doubles as the filename, so identical
    # images are only written once.
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        # 'wb' writes raw bytes; the with block closes the file for us.
        f.write(content)
    print(' {} image downloaded'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)
def main(search_word):
    pages_text = get_search(search_word)
    if pages_text is None:
        # The search itself failed, so there is nothing to paginate.
        return
    pages = min(int(pages_text), end)
    # Cap at `end`: later pages are often empty and tend to raise exceptions.
    try:
        for page in range(2, pages + 1):
            get_next_page(search_word, page)
    except TimeoutException as e:
        print(e.args)
if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.map(main, keywords)
    pool.close()
    # Stop accepting new tasks, then wait for the workers to finish.
    pool.join()
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
    # Kill any leftover chromedriver/chrome processes (Windows-only command).
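# -----------------------------------------------------------------------------
# config_vip.py -- the settings pulled in above via `from config_vip import *`
# -----------------------------------------------------------------------------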
mongo_url = 'localhost'
mongo_database = 'vip'
# MongoDB host and database name
main_url = 'https://www.vip.com/'
total_times = 16
size = 500
# number of scroll steps and pixels per step for drop_down_scrollbar()
browser_method = 2
# browser driver: 0 = Chrome, 1 = PhantomJS, anything else = headless Chrome
start = 1
end = 45
# last result page to fetch; pages beyond this are often empty and raise exceptions
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'
# URL template for jumping straight to a result page
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'
# image directory layout and filename pattern
keywords = ['苹果', '雪梨', '香蕉']
# search keywords: apple, snow pear, banana
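# With these settings, an image scraped for keyword '苹果' lands at, e.g.,
#   H:/Python_download/vip/20240101/image/苹果/<md5-of-bytes>.jpg
# since file_type simply appends '<md5>.jpg' to the formatted file_path.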