# [Web-page residue captured with this file] Code pull complete; the page will refresh automatically.
# -*- coding: utf-8 -*-
import requests
from config import *
import re
import json
from docx import Document
class Spider(object):
    """Download the text of a Baidu Wenku document, one page at a time.

    The document's view page embeds a JSON list of per-page content URLs
    (``WkInfo.htmlUrls``).  Each of those URLs returns a payload wrapping a
    JSON object whose ``body`` items carry the text fragments of that page.

    Request headers are read from the module-level names ``HEADER_FOR_LINK``
    and ``HEADER_FOR_CON`` (expected to come from ``config``).

    Attributes:
        url: Address of the document's view page.
        timeout: Timeout in seconds handed to every ``requests.get`` call.
        title: Title detected by the last ``getDocLink`` call, or ``None``.
    """

    DEFAULT_URL = "https://wenku.baidu.com/view/1d64ae0250d380eb6294dd88d0d233d4b14e3f4e"

    def __init__(self, url=DEFAULT_URL, timeout=30):
        """Create a spider for one document.

        Args:
            url: View-page URL of the document; defaults to the URL that was
                previously hard-coded, so ``Spider()`` keeps working.
            timeout: Seconds to wait on each HTTP request; ``None`` disables
                the timeout (the ``requests`` default, which may block
                forever).
        """
        self.url = url
        self.timeout = timeout
        self.title = None

    @staticmethod
    def _parse_page_links(html):
        """Extract the per-page download descriptors from the view page.

        Args:
            html: Text of the document's view page.

        Returns:
            A list of dicts with keys ``"page_index"`` and ``"download_url"``.

        Raises:
            ValueError: If the ``WkInfo.htmlUrls`` assignment is not present.
        """
        match = re.search(r"WkInfo.htmlUrls = '(.*)'", html)
        if match is None:
            raise ValueError("WkInfo.htmlUrls not found in the document page")
        # The embedded JSON has its double quotes written as the literal
        # four characters \x22; restore them before decoding.
        payload = json.loads(match.group(1).replace("\\x22", '"'))
        return [
            {
                "page_index": item["pageIndex"],
                # Drop the backslashes left over from the escaped URL.
                "download_url": item["pageLoadUrl"].replace("\\", ""),
            }
            for item in payload["json"]
        ]

    @staticmethod
    def _extract_text(payload_text, prefix=""):
        """Concatenate the text fragments found in one page payload.

        Args:
            payload_text: Raw response text containing a ``{...}`` JSON object
                with a ``"body"`` list.
            prefix: Text to place in front of the extracted fragments.

        Returns:
            ``prefix`` plus all kept fragments, with every space replaced by
            a newline.

        Raises:
            ValueError: If no ``{...}`` object is present in ``payload_text``.
        """
        match = re.search(r"({.*})", payload_text)
        if match is None:
            raise ValueError("no JSON object found in the page content response")
        items = json.loads(match.group(1))

        pieces = [prefix]
        for item in items["body"]:
            # Skip items without an "r" key, and one-element items whose
            # "r" is exactly [0].
            if "r" not in item or (item["r"] == [0] and len(item["c"]) == 1):
                continue
            content = item["c"]
            # Dict-valued content is not text; only string fragments are kept.
            if isinstance(content, dict):
                continue
            pieces.append(content)
        return "".join(pieces).replace(" ", "\n")

    @staticmethod
    def _pick_title(first_page):
        """Return the first line of ``first_page``, or the second if the first is empty.

        Returns an empty string when neither line is available.
        """
        lines = first_page.split("\n")
        if lines[0] or len(lines) < 2:
            return lines[0]
        return lines[1]

    # Fetch the page list and collect the text of every page.
    def getDocLink(self, articles=None):
        """Download every page of the document and detect its title.

        Args:
            articles: Optional list to which the page texts are appended.  A
                fresh list is created when omitted, so separate calls no
                longer share state.

        Returns:
            The list of page texts (``articles``).  It is returned unchanged
            when the view page does not answer with HTTP 200.

        Raises:
            ValueError: If the view page or a content response cannot be
                parsed.

        Side effects:
            Sets ``self.title`` and, for backward compatibility, the
            module-level global ``title`` when at least one page was read.
        """
        global title  # legacy module-level name, still set for existing users

        if articles is None:
            articles = []

        r = requests.get(url=self.url, headers=HEADER_FOR_LINK, timeout=self.timeout)
        if r.status_code != 200:
            return articles

        for data in self._parse_page_links(r.text):
            # Fetch the content of this page.
            doc = self.getDocData(data)
            articles.append(doc.replace("。\n", "\n"))

        if articles:
            self.title = title = self._pick_title(articles[0])

        # NOTE: exporting through writeDocText() was disabled (commented out)
        # in the original code and stays disabled here; callers can join the
        # returned pages and call writeDocText() themselves.
        return articles

    # Fetch the text content of a single page.
    def getDocData(self, data=None, string=""):
        """Download one page and return its text.

        Args:
            data: Dict with a ``"download_url"`` entry, as produced for each
                page by ``getDocLink``.
            string: Text to prepend to the extracted content.

        Returns:
            The page text with spaces replaced by newlines.  When the server
            does not answer with HTTP 206, only ``string`` (with the same
            replacement applied) is returned, so callers always get a ``str``.

        Raises:
            ValueError: If a 206 response contains no JSON object.
        """
        r = requests.get(url=data["download_url"], headers=HEADER_FOR_CON, timeout=self.timeout)
        # Only a 206 response is treated as page content.
        # NOTE(review): presumably HEADER_FOR_CON sends a Range header -- confirm.
        if r.status_code != 206:
            return string.replace(" ", "\n")
        return self._extract_text(r.text, string)

    # Write the text into a Word document.
    def writeDocText(self, text):
        """Save ``text`` to ``"<title>.doc"`` with the title as heading.

        Args:
            text: Body text written as a single paragraph.

        Raises:
            NameError: If no title is known, i.e. ``getDocLink`` has not set
                one and the legacy global ``title`` does not exist.
        """
        heading = self.title
        if heading is None:
            # Fall back to the legacy module-level global.
            heading = title
        doc = Document()
        doc.add_heading(heading)
        doc.add_paragraph(text)
        # NOTE(review): python-docx presumably writes .docx data regardless of
        # the ".doc" extension -- confirm before relying on the file type.
        doc.save("{0}.doc".format(heading))
# Script entry point: crawl the default document when run directly.
if __name__ == "__main__":
    Spider().getDocLink()
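# [Web-page residue captured with this file] Content that may be unsuitable for display was detected here, so the page does not show it. You can review and revise it using the relevant editing features.
# If you confirm that the content contains no inappropriate language, pure advertising or traffic diversion, violence, vulgar or pornographic material, infringement, piracy, false information, worthless content, or anything that violates applicable national laws and regulations, you may click Submit to file an appeal, and we will handle it as soon as possible.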