解决网站爬取时,页面中的数字显示为类似 &#x12E0; 的字符实体,且每次请求字体文件都会变化的问题。
下载FontCreator

image

用FontCreator打开base.woff,查看字形编号(如 uniF3BA)与数字的对应关系

image

初始化时将对应关系写入字典中。

#!/usr/bin/env python
# coding:utf-8
import requests
import re
import os
from fontTools.ttLib import TTFont
'''
遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!
'''
#下载字体
class MaoYan(object):
    """Scrape a Maoyan film page and decode its font-obfuscated star rating.

    The page writes digits as hexadecimal character references (for example
    ``&#xe2a3;``) that are drawn by a downloadable web font. The font file,
    and with it the glyph names, changes between page loads, so digits are
    recovered by comparing each glyph of the page's current font against the
    glyphs of a reference font (``base.woff``) whose glyph-to-digit mapping
    was read off by hand in a font editor.
    """

    # Directory holding base.woff and every font file downloaded so far.
    FONT_DIR = './fonts'
    # URL prefix the page's font files are downloaded from.
    FONT_URL_PREFIX = 'http://vfile.meituan.net/colorstone/'
    # Matches one hexadecimal character reference such as "&#xe2a3;".
    _ENTITY_RE = re.compile(r'&#[xX]([0-9a-fA-F]+);')

    def __init__(self):
        """Load the reference font and build the glyph lookup tables.

        Raises whatever ``TTFont`` raises when ``./fonts/base.woff`` cannot
        be opened, and ``KeyError`` when a glyph named in ``base_num`` is
        missing from that font.
        """
        self.url = 'http://maoyan.com/films/1198214'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }
        # base.woff is one font file previously saved from the site.
        self.base_font_file = TTFont(os.path.join(self.FONT_DIR, 'base.woff'))
        # Glyph name -> digit, for base.woff only. These pairs have to be
        # looked up manually in a font editor (e.g. FontCreator).
        self.base_num = {
            "uniF3BA": "0",
            "uniF2A9": "1",
            "uniE6A5": "2",
            "uniF680": "3",
            "uniE69C": "4",
            "uniE710": "5",
            "uniE07D": "6",
            "uniE5A7": "7",
            "uniEC7A": "8",
            "uniE2A3": "9",
        }
        # Glyph name -> glyph object of base.woff, filled by baseobj().
        self.base_obj = {}
        self.baseobj()

    def baseobj(self):
        """Refresh and return the glyph-name -> glyph-object mapping.

        Every key of ``self.base_num`` is looked up in the ``glyf`` table of
        the reference font and stored in ``self.base_obj``.

        Returns:
            ``self.base_obj`` (the same dict object, updated in place).
        """
        base_glyphs = self.base_font_file['glyf']
        for key in self.base_num:
            self.base_obj[key] = base_glyphs[key]
        # Return only after all keys are refreshed, not after the first one.
        return self.base_obj

    def get_html(self, url, timeout=10):
        """Fetch ``url`` and return the raw response body as bytes.

        Args:
            url: Address to request with ``self.headers``.
            timeout: Seconds to wait before giving up on the request.

        Raises:
            requests.RequestException: on connection problems, timeouts, or
                an HTTP error status. Failing here keeps an error page from
                being saved to disk as if it were a font file.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        response.raise_for_status()
        return response.content

    def create_font(self, re_font_file):
        """Open the page's current font, downloading it first if needed.

        Args:
            re_font_file: File name of the font (e.g. ``abc123.woff``) as
                extracted from the page.

        Side effects:
            May write the font into ``FONT_DIR``; always sets
            ``self.font_file`` to the opened ``TTFont``.
        """
        font_path = os.path.join(self.FONT_DIR, re_font_file)
        # Fonts already saved in the font directory are reused as a cache.
        if not os.path.exists(font_path):
            print('不在字体库中, 下载:', re_font_file)
            new_file = self.get_html(self.FONT_URL_PREFIX + re_font_file)
            with open(font_path, 'wb') as f:
                f.write(new_file)

        self.font_file = TTFont(font_path)

    def get_num_from_font_file(self, re_star):
        """Decode an obfuscated number such as ``&#xe2a3;.&#xf680;``.

        Each character reference is turned into a glyph name (``&#xe2a3;``
        becomes ``uniE2A3``), the glyph is fetched from the current font and
        compared with the reference glyphs; the digit of the first equal
        reference glyph replaces the reference. Every other character (the
        decimal point, for instance) is kept as it is, so values with any
        number of digits, such as ``10.0``, decode correctly.

        Args:
            re_star: Text containing hexadecimal character references.

        Returns:
            The decoded text, e.g. ``"9.3"``.

        Raises:
            KeyError: if a referenced glyph is not in the current font.
            ValueError: if a glyph equals none of the reference glyphs.
        """
        current_glyphs = self.font_file['glyf']

        def decode_entity(match):
            glyph_name = 'uni' + match.group(1).upper()
            glyph = current_glyphs[glyph_name]
            for key, base_glyph in self.base_obj.items():
                if glyph == base_glyph:
                    return self.base_num[key]
            raise ValueError(
                'glyph {} matches no reference glyph'.format(glyph_name))

        return self._ENTITY_RE.sub(decode_entity, re_star)

    def start_crawl(self):
        """Download the film page, decode its star rating and print it.

        Raises:
            IndexError: if the font file name or the rating markup cannot be
                found in the page.
        """
        html = self.get_html(self.url).decode('utf-8')

        # Name of the font file the page currently references.
        re_font_file = re.findall(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)[0]
        self.create_font(re_font_file)
        # Obfuscated star rating as it appears in the markup.
        re_star_rating = re.findall(r'<span class="index-left info-num ">\s+<span class="stonefont">(.*?)</span>\s+</span>', html)[0]
        star_rating = self.get_num_from_font_file(re_star_rating)
        print("星级评分:", star_rating)


if __name__ == '__main__':
    # Script entry point: build the scraper and run a single crawl.
    MaoYan().start_crawl()

本文转载:CSDN博客