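# Stray banner from the code-hosting web page, not part of the script:
# "代码拉取完成,页面将自动刷新" (code pull complete; the page will refresh automatically).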
__author__ = 'yzh'
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ChromeOptions
from pyquery import PyQuery as pq
import re,sys,time
# Location of the chromedriver executable on the author's machine.  A raw
# string keeps every backslash literal; the previous spelling relied on the
# invalid escape sequence '\c', which newer Python versions warn about.
path = r'D:\code\bigdatacenter-master\bigdatacenter-master\chromedriver\chromedriver.exe'

# Chrome profile preferences passed through to the browser.
# NOTE(review): value 2 for this content setting presumably blocks image
# loading to speed up page fetches -- confirm against the Chrome prefs docs.
option = ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
option.add_experimental_option("prefs", prefs)

# `options=` is the supported keyword; `chrome_options=` is deprecated.
# NOTE(review): recent Selenium 4 releases also stop accepting the driver path
# as a positional argument (it has to be wrapped in a Service object) --
# confirm against the installed Selenium version.
browser = webdriver.Chrome(path, options=option)

# Number of listing pages to crawl (pages 1..page_numbers).
page_numbers = 3

# Accumulators filled by the crawl: relative article links, then one
# title / date / source entry per accepted article (kept index-aligned).
zytzb_herf_list = []
title_list = []
date_list = []
source_list = []
# Crawl the paginated listing and collect the relative link of every article.

# Captures hrefs of the form /zytzb/YYYY-MM/DD/article_<id>.shtml from the
# listing markup.  Compiled once instead of on every loop iteration.
herf_pattern = re.compile(
    r'</span> <a href="(/zytzb/\d{4}-\d{2}/\d{2}/article_.*?\.shtml)" target="_blank" title=',
    re.S,
)

for num in range(1, page_numbers + 1):
    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(1)
    # The first listing page has a fixed name; later pages carry the page
    # number in the file name.
    if num == 1:
        url = 'https://www.zytzb.gov.cn/zytzb/index/tzsx/index.shtml'
    else:
        url = 'https://www.zytzb.gov.cn/zytzb/index/tzsx/e85c86fb-' + str(num) + '.shtml'
    browser.get(url)
    # Container element(s) holding the article list on the listing page.
    content_1 = browser.find_elements(By.XPATH, '//*[@id="e85c86fb66234b43a762088eb82b4c04"]/div[2]/div/div[1]')
    for content_list in content_1:
        get_html1 = content_list.get_attribute('innerHTML')
        print(get_html1)
        # The link regex is applied to pyquery's re-serialised markup, not to
        # the raw innerHTML, so the pattern must match that serialisation.
        temp_info = pq(get_html1)
        print(temp_info)
        herf = herf_pattern.findall(str(temp_info))
        print(herf)
        print(len(herf))
        zytzb_herf_list.extend(herf)

# Summary of every article link collected across all listing pages.
print(zytzb_herf_list)
print(len(zytzb_herf_list))
# Visit every collected article and keep title / source / date for those that
# pass both the title keyword filter and the source whitelist.

# Title must contain at least one of the listed keywords.
title_pattern = re.compile(
    r'.*?台.*?|.*?统战.*?|.*?党.*?|.*?暨南大学.*?|.*?二十大.*?|.*?统一战线.*?|.*?两岸.*?',
    re.S,
)
# Source text must start with one of the whitelisted outlet names.
source_pattern = re.compile(
    r'^中国和平统一促进会网站.*?|^中国新闻网|^中央社院.*?|^中新.*?|^全国台联网站|^台胞之家.*?',
    re.S,
)
# Publication date as rendered on the article page.
date_pattern = re.compile(r'发布时间: (\d{4}年\d{2}月\d{2}日)', re.S)
# Year / month markers that become '-' in the normalised date.  (The earlier
# character class also listed '"' by accident; it can never occur in the
# text captured by date_pattern, so dropping it changes nothing.)
year_month_pattern = re.compile(r'[年月]')

try:
    for temp_herf in zytzb_herf_list:
        url = 'https://www.zytzb.gov.cn' + str(temp_herf)
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(1)
        browser.get(url)
        title_temp = browser.find_elements(By.XPATH, '//*[@id="2ca8ea9dfb394b768ed8cdafefbb63b7"]/div[2]/div[2]/div[1]')
        for title1 in title_temp:
            get_title = title1.get_attribute('innerHTML')
            title = get_title.strip().replace('<br>', '')
            if title_pattern.search(title) is None:
                # Title has none of the keywords: ignore this article.
                continue
            print(title)
            source_temp = browser.find_elements(By.XPATH, '/html/body/div[5]/div/div[2]/div[2]/div[2]/span[3]/b')
            for source1 in source_temp:
                get_source = source1.get_attribute('innerHTML')
                source = get_source.strip()
                if source_pattern.search(source) is None:
                    # Outlet is not on the whitelist: ignore this article.
                    continue
                print(source)
                date_temp = browser.find_elements(By.XPATH, '/html/body/div[5]/div/div[2]/div[2]/div[2]/span[2]')
                for date1 in date_temp:
                    get_date = date1.get_attribute('innerHTML')
                    date_match = date_pattern.search(get_date)
                    if date_match is None:
                        # No recognisable date: skip the record instead of
                        # failing with IndexError, so one odd page does not
                        # abort the whole crawl.
                        continue
                    # 'YYYY年MM月DD日' -> 'YYYY-MM-DD'
                    date = year_month_pattern.sub('-', date_match.group(1)).replace('日', '')
                    print(date)
                    # Append all three together so the lists stay aligned.
                    title_list.append(title)
                    date_list.append(date)
                    source_list.append(source)
finally:
    # Last use of the browser: always shut it down so the Chrome and
    # chromedriver processes are not left running, even after an error.
    browser.quit()

# Summary of everything collected.
print(title_list)
print(len(title_list))
print(date_list)
print(len(date_list))
print(source_list)
print(len(source_list))
# Announce the database-write phase.
# NOTE(review): the insert code below is commented out, so nothing is actually
# written to a database after this message is printed.
print('---------开始写入数据库------------')
# Disabled MySQL export (pymysql).  As written it would insert one row per
# scraped article into `crawl_db`.`china_govmt_mov_news`, taking the values
# from source_list / title_list / date_list at the same index.
# NOTE(review): before re-enabling this,
#   * fill in the connection settings (`port=,` is not valid Python as is);
#   * restore the indentation of the loop body;
#   * pass the values as query parameters to cursor.execute() instead of
#     %-formatting them into the SQL text -- scraped titles can contain
#     quotes, which would break the statement or allow SQL injection.
# import pymysql
#
# db = pymysql.connect(
# host='',
# port=,
# user='',
# password='',
# database=''
# )
# cursor = db.cursor()
# for list_num in tqdm((range(len(title_list)))):
# sql = "INSERT INTO `crawl_db`.`china_govmt_mov_news` (`prov_name`, `city_name`, `area_name`, `media_name`, `news_title`, `issu_time`, `create_person`)" \
# " VALUES ('0', '0', '0', '%s', '%s', '%s', 'yzh')" % (
# source_list[list_num], title_list[list_num],date_list[list_num])
# cursor.execute(sql)
# db.commit()
# print("============插入成功================")
# Stray content-moderation footer from the code-hosting web page, not part of the script:
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。