# (Gitee page artifact, kept as a comment so the file parses:
#  "Code pull complete; the page will refresh automatically.")
import re
import threading
import time
from lxml import etree
from queue import Queue
from selenium import webdriver
from pymysql import connect
class WxSpider:
    """Three-stage scraping pipeline for a chushu.la photo-book page.

    Stage 1 (``put_html_quene``): a Selenium producer pages through every
    month and enqueues the page HTML.
    Stage 2 (``get_html_quene``): an lxml parser turns queued HTML into
    per-entry dicts.
    Stage 3 (``save_item``): a MySQL writer persists each dict.
    The stages run as daemon threads connected by two ``queue.Queue``s.
    """

    def __init__(self, start_url):
        self.start_url = start_url
        self.html_quene = Queue()  # raw page HTML waiting to be parsed
        self.item_quene = Queue()  # extracted items waiting to be saved
        self.driver = webdriver.Chrome()
        # Raw string so the backslash escapes reach the regex engine
        # unchanged; strips newlines/tabs from extracted text.
        self.conntent_pattern = re.compile(r'\n|\t')
        self.conn = connect(host='localhost', port=3306, user='root',
                            password='root', database='boss_spider',
                            charset='utf8')
        self.cursor = self.conn.cursor()
        # Instance attribute (the original declared this as a class
        # attribute but mutated it through ``self``, which left a stale
        # shadow value on the class itself).
        self.first_click = True

    def put_html_quene(self):
        """Producer: load the start page, click through every month, then
        enqueue the final page source and quit the browser.

        NOTE(review): only the *final* ``page_source`` is enqueued — this
        assumes earlier months stay in the DOM as new ones load; confirm
        against the site's behavior.
        """
        while True:
            if self.first_click:
                self.driver.get(self.start_url)
                self.first_click = False
            else:
                # Selenium 3 API, consistent with the rest of the file.
                next_btn = self.driver.find_element_by_class_name('next-month')
                if next_btn.get_attribute('class').find('disable') == -1:
                    next_btn.click()
                else:
                    # "Next" is disabled: last month reached. Hand the
                    # accumulated HTML to the parser and shut down Chrome.
                    self.html_quene.put(self.driver.page_source)
                    self.driver.quit()
                    break
            time.sleep(2)  # let the page render before the next action

    def _extract_item(self, li, col):
        """Parse one column of a calendar ``<li>`` (col=1: left page,
        col=2: right page); return an item dict, or None when empty."""
        content = li.xpath(
            f'./div[{col}]/div/div/div/div[@class="text-warp-top"]/div[2]//text()')
        if not content:
            return None
        return dict(
            publish_time=li.xpath(f'./div[{col}]/div/p/text()')[0],
            publish_week=li.xpath(f'./div[{col}]/div/p/span/text()')[0],
            publish_content=self.conntent_pattern.sub('', ''.join(content)),
        )

    def get_html_quene(self):
        """Parser: take HTML off ``html_quene``, extract every left/right
        page entry, and push the item dicts onto ``item_quene``."""
        while True:
            html = self.html_quene.get()
            element = etree.HTML(html)
            for li in element.xpath('//li[contains(@class, "v_2018")]'):
                for col in (1, 2):  # left and right page of the spread
                    item = self._extract_item(li, col)
                    if item is not None:
                        self.item_quene.put(item)
            self.html_quene.task_done()

    def save_item(self):
        """Writer: persist queued items into the ``wx_content`` table.

        Uses a parameterized query instead of f-string interpolation,
        which removes the SQL-injection hole and the fragile manual
        single-quote escaping the original needed.
        """
        sql = 'insert into wx_content (date,week,content) values (%s,%s,%s)'
        while True:
            item = self.item_quene.get()
            self.cursor.execute(sql, (
                item['publish_time'].replace('/', '-'),  # '2018/01/01' -> '2018-01-01'
                item['publish_week'],
                item['publish_content'],
            ))
            self.conn.commit()
            self.item_quene.task_done()

    def create_wc_img(self):
        """Render a word-cloud PNG from all saved content rows."""
        import matplotlib as mpl
        mpl.use("Agg")
        import matplotlib.pyplot as plt  # plotting
        import jieba  # Chinese word segmentation
        from wordcloud import WordCloud
        from wordcloud import ImageColorGenerator
        import numpy as np
        from PIL import Image
        sql = 'select * from wx_content'
        self.cursor.execute(sql)
        items = self.cursor.fetchall()
        # Column 3 is the content field; join all rows into one corpus.
        wx_contents = ';'.join([i[3] for i in items])
        words = jieba.cut(wx_contents)
        # WordCloud needs tokens separated by "/" or spaces.
        result = "/".join(words)
        # Custom background/mask image.
        bg_img = "wx.png"
        image = Image.open(bg_img)
        graph = np.array(image)
        wc = WordCloud(
            font_path=r"C:\Windows\Fonts\msyhbd.ttc",  # raw string: Windows path
            background_color='white',
            width=600,   # match the background image width
            height=600,  # match the background image height
            max_font_size=70, min_font_size=10,
            # stopwords=no_name,  # words to hide from the cloud
            max_words=2000,
            mode='RGBA'
        )
        wc.generate(result)
        # Color the words from the background image's palette.
        image_color = ImageColorGenerator(graph)
        wc.recolor(color_func=image_color)
        img_name = 'wc.png'
        wc.to_file(img_name)
        # Display the result on screen as well.
        plt.figure("词云图")
        plt.imshow(wc)
        plt.axis("off")
        plt.show()

    def run(self):
        """Start the pipeline and block until all work is done.

        Only the producer thread ever terminates on its own; the two
        consumer loops run forever.  The original code joined all three
        threads, so ``t.join()`` on a consumer blocked forever and the
        queue joins / final print were unreachable.  Instead: join the
        producer, then wait on both queues; the daemon consumers die
        with the process.
        """
        producer = threading.Thread(target=self.put_html_quene)
        consumers = [
            threading.Thread(target=self.get_html_quene),
            threading.Thread(target=self.save_item),
        ]
        for t in [producer] + consumers:
            t.daemon = True  # t.setDaemon(True) is deprecated
            t.start()
        producer.join()  # scraping finished; all HTML is enqueued
        for q in (self.html_quene, self.item_quene):
            q.join()  # every queued unit parsed / saved
        print('over')
if __name__ == '__main__':
    # Entry point: scrape the photo-book page and persist its entries.
    # Requires a local Chrome/chromedriver and a MySQL server with the
    # `boss_spider` database reachable at localhost:3306.
    url = 'https://chushu.la/book/chushula-966439005'
    spider = WxSpider(url)
    spider.run()
# (Gitee moderation-notice artifact, kept as a comment so the file parses:
#  "Content that may be inappropriate to display is hidden on this page.
#   If you confirm the content contains no improper language, pure ad
#   redirection, violence, vulgarity, infringement, piracy, falsehoods,
#   valueless material, or violations of national laws and regulations,
#   you may submit an appeal and we will process it promptly.")