# (Gitee page artifact, kept as a comment so the file parses:
#  "Code pull complete; the page will refresh automatically.")
import re
import threading
import time
from lxml import etree
from queue import Queue
from selenium import webdriver
from pymysql import connect
class WxSpider:
    """Three-stage scraping pipeline for a chushu.la photo-book page.

    Stage 1 (``put_html_quene``): a Selenium producer pages through every
    month and enqueues the page HTML.
    Stage 2 (``get_html_quene``): an lxml parser turns queued HTML into
    per-entry dicts.
    Stage 3 (``save_item``): a MySQL writer persists each dict.
    The stages run as daemon threads connected by two ``queue.Queue``s.
    """

    def __init__(self, start_url):
        self.start_url = start_url
        self.html_quene = Queue()  # raw page HTML waiting to be parsed
        self.item_quene = Queue()  # extracted items waiting to be saved
        self.driver = webdriver.Chrome()
        # Raw string so the backslash escapes reach the regex engine
        # unchanged; strips newlines/tabs from extracted text.
        self.conntent_pattern = re.compile(r'\n|\t')
        self.conn = connect(host='localhost', port=3306, user='root',
                            password='root', database='boss_spider',
                            charset='utf8')
        self.cursor = self.conn.cursor()
        # Instance attribute (the original declared this as a class
        # attribute but mutated it through ``self``, which left a stale
        # shadow value on the class itself).
        self.first_click = True

    def put_html_quene(self):
        """Producer: load the start page, click through every month, then
        enqueue the final page source and quit the browser.

        NOTE(review): only the *final* ``page_source`` is enqueued — this
        assumes earlier months stay in the DOM as new ones load; confirm
        against the site's behavior.
        """
        while True:
            if self.first_click:
                self.driver.get(self.start_url)
                self.first_click = False
            else:
                # Selenium 3 API, consistent with the rest of the file.
                next_btn = self.driver.find_element_by_class_name('next-month')
                if next_btn.get_attribute('class').find('disable') == -1:
                    next_btn.click()
                else:
                    # "Next" is disabled: last month reached. Hand the
                    # accumulated HTML to the parser and shut down Chrome.
                    self.html_quene.put(self.driver.page_source)
                    self.driver.quit()
                    break
            time.sleep(2)  # let the page render before the next action

    def _extract_item(self, li, col):
        """Parse one column of a calendar ``<li>`` (col=1: left page,
        col=2: right page); return an item dict, or None when empty."""
        content = li.xpath(
            f'./div[{col}]/div/div/div/div[@class="text-warp-top"]/div[2]//text()')
        if not content:
            return None
        return dict(
            publish_time=li.xpath(f'./div[{col}]/div/p/text()')[0],
            publish_week=li.xpath(f'./div[{col}]/div/p/span/text()')[0],
            publish_content=self.conntent_pattern.sub('', ''.join(content)),
        )

    def get_html_quene(self):
        """Parser: take HTML off ``html_quene``, extract every left/right
        page entry, and push the item dicts onto ``item_quene``."""
        while True:
            html = self.html_quene.get()
            element = etree.HTML(html)
            for li in element.xpath('//li[contains(@class, "v_2018")]'):
                for col in (1, 2):  # left and right page of the spread
                    item = self._extract_item(li, col)
                    if item is not None:
                        self.item_quene.put(item)
            self.html_quene.task_done()

    def save_item(self):
        """Writer: persist queued items into the ``wx_content`` table.

        Uses a parameterized query instead of f-string interpolation,
        which removes the SQL-injection hole and the fragile manual
        single-quote escaping the original needed.
        """
        sql = 'insert into wx_content (date,week,content) values (%s,%s,%s)'
        while True:
            item = self.item_quene.get()
            self.cursor.execute(sql, (
                item['publish_time'].replace('/', '-'),  # '2018/01/01' -> '2018-01-01'
                item['publish_week'],
                item['publish_content'],
            ))
            self.conn.commit()
            self.item_quene.task_done()

    def create_wc_img(self):
        """Render a word-cloud PNG from all saved content rows."""
        import matplotlib as mpl
        mpl.use("Agg")
        import matplotlib.pyplot as plt  # plotting
        import jieba  # Chinese word segmentation
        from wordcloud import WordCloud
        from wordcloud import ImageColorGenerator
        import numpy as np
        from PIL import Image
        sql = 'select * from wx_content'
        self.cursor.execute(sql)
        items = self.cursor.fetchall()
        # Column 3 is the content field; join all rows into one corpus.
        wx_contents = ';'.join([i[3] for i in items])
        words = jieba.cut(wx_contents)
        # WordCloud needs tokens separated by "/" or spaces.
        result = "/".join(words)
        # Custom background/mask image.
        bg_img = "wx.png"
        image = Image.open(bg_img)
        graph = np.array(image)
        wc = WordCloud(
            font_path=r"C:\Windows\Fonts\msyhbd.ttc",  # raw string: Windows path
            background_color='white',
            width=600,   # match the background image width
            height=600,  # match the background image height
            max_font_size=70, min_font_size=10,
            # stopwords=no_name,  # words to hide from the cloud
            max_words=2000,
            mode='RGBA'
        )
        wc.generate(result)
        # Color the words from the background image's palette.
        image_color = ImageColorGenerator(graph)
        wc.recolor(color_func=image_color)
        img_name = 'wc.png'
        wc.to_file(img_name)
        # Display the result on screen as well.
        plt.figure("词云图")
        plt.imshow(wc)
        plt.axis("off")
        plt.show()

    def run(self):
        """Start the pipeline and block until all work is done.

        Only the producer thread ever terminates on its own; the two
        consumer loops run forever.  The original code joined all three
        threads, so ``t.join()`` on a consumer blocked forever and the
        queue joins / final print were unreachable.  Instead: join the
        producer, then wait on both queues; the daemon consumers die
        with the process.
        """
        producer = threading.Thread(target=self.put_html_quene)
        consumers = [
            threading.Thread(target=self.get_html_quene),
            threading.Thread(target=self.save_item),
        ]
        for t in [producer] + consumers:
            t.daemon = True  # t.setDaemon(True) is deprecated
            t.start()
        producer.join()  # scraping finished; all HTML is enqueued
        for q in (self.html_quene, self.item_quene):
            q.join()  # every queued unit parsed / saved
        print('over')
if __name__ == '__main__':
    # Entry point: scrape the photo-book page and persist its entries.
    # Requires a local Chrome/chromedriver and a MySQL server with the
    # `boss_spider` database reachable at localhost:3306.
    url = 'https://chushu.la/book/chushula-966439005'
    spider = WxSpider(url)
    spider.run()
# (Gitee moderation-notice artifact, kept as a comment so the file parses:
#  "Content that may be inappropriate to display is hidden on this page.
#   If you confirm the content contains no improper language, pure ad
#   redirection, violence, vulgarity, infringement, piracy, falsehoods,
#   valueless material, or violations of national laws and regulations,
#   you may submit an appeal and we will process it promptly.")