1 Star 0 Fork 0

seven/baiduwenku

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
spider.py 2.31 KB
一键复制 编辑 原始数据 按行查看 历史
seven 提交于 2020-04-21 17:31 . doc排版问题有待处理
# -*- coding: utf-8 -*-
import requests
from config import *
import re
import json
from docx import Document
class Spider(object):
    """Scrape the text of a Baidu Wenku document, one page at a time.

    ``getDocLink`` fetches the document's view page, reads the per-page
    download URLs embedded in it and collects each page's text through
    ``getDocData``.  ``writeDocText`` saves text into a Word file named
    after the document title found by ``getDocLink``.
    """

    DEFAULT_URL = "https://wenku.baidu.com/view/1d64ae0250d380eb6294dd88d0d233d4b14e3f4e"

    def __init__(self, url=DEFAULT_URL):
        """Create a spider.

        Args:
            url: Address of the document view page.  Defaults to the
                document that used to be hard-coded here.
        """
        self.url = url
        # Filled in by getDocLink(); kept per instance so that several
        # spiders do not overwrite each other's title.
        self.title = None

    def getDocLink(self, articles=None):
        """Collect the text of every page of the document.

        Args:
            articles: Optional list to append the page texts to.  A new
                list is created when omitted.

        Returns:
            The list of page texts (the same object as ``articles`` when
            one was passed in).  It is returned unchanged when the view
            page does not answer with HTTP 200.

        Raises:
            ValueError: The view page answered 200 but does not contain
                the ``WkInfo.htmlUrls`` assignment.

        Side effects:
            When at least one text is available, the title is stored in
            ``self.title`` and, for backward compatibility, in the
            module-level ``title`` variable.
        """
        global title

        # A fresh list per call: a mutable default would be shared by all
        # calls and keep accumulating pages of earlier documents.
        if articles is None:
            articles = []

        r = requests.get(url=self.url, headers=HEADER_FOR_LINK)
        if r.status_code != 200:
            return articles

        match = re.search(r"WkInfo.htmlUrls = '(.*)'", r.text)
        if match is None:
            raise ValueError(
                "WkInfo.htmlUrls not found in the page at {0}".format(self.url)
            )
        # The JSON is embedded in a JavaScript string literal in which the
        # double quotes appear as the escape sequence \x22.
        result = json.loads(match.group(1).replace("\\x22", '"'))

        for item in result["json"]:
            data = {
                "page_index": item["pageIndex"],
                # Dropping the backslashes is all that is needed here; the
                # former eval(repr(...)) round-trip returned the very same
                # string and ran eval on remote content for nothing.
                "download_url": item["pageLoadUrl"].replace("\\", ""),
            }
            # Fetch the text of this page.
            doc = self.getDocData(data)
            if doc is None:
                # getDocData() yields None for a non-206 response; skip
                # the page instead of failing on None.replace().
                continue
            articles.append(doc.replace("。\n", "\n"))

        if articles:
            # The title is the first line of the first text, or the second
            # line when the first one is empty (and a second one exists).
            lines = articles[0].split("\n")
            heading = lines[0]
            if not heading and len(lines) > 1:
                heading = lines[1]
            title = heading
            self.title = heading

        # NOTE: the post-processing that used to follow (strip the title
        # from every page, join the pages, drop a boilerplate line and call
        # writeDocText) was already commented out in the original; callers
        # can apply it to the returned list.
        return articles

    def getDocData(self, data=None, string=""):
        """Download one page and return its text.

        Args:
            data: Mapping with a ``"download_url"`` entry, as built by
                ``getDocLink``.  Must be supplied despite the ``None``
                default.
            string: Text placed in front of the extracted page text.

        Returns:
            The extracted text with every space replaced by a newline, or
            ``None`` when the response status is not 206.

        Raises:
            ValueError: The response contains no ``{...}`` payload.
        """
        r = requests.get(url=data["download_url"], headers=HEADER_FOR_CON)
        # Only a 206 answer is processed (presumably HEADER_FOR_CON asks
        # for a byte range -- confirm against config).
        if r.status_code != 206:
            return None

        # The JSON object is wrapped in other text; take the first
        # "{...}" span of the body.
        match = re.search(r"({.*})", r.text)
        if match is None:
            raise ValueError(
                "no JSON payload in the response from {0}".format(
                    data["download_url"]
                )
            )
        items = json.loads(match.group(1))

        parts = [string]
        for item in items["body"]:
            # Skip entries without an "r" key, and entries whose "r" is
            # [0] while "c" has length 1.
            if "r" not in item or (item["r"] == [0] and len(item["c"]) == 1):
                continue
            # A dict-valued "c" is not text (presumably an embedded
            # object such as a picture -- confirm) and is ignored.
            if isinstance(item["c"], dict):
                continue
            parts.append(item["c"])
        return "".join(parts).replace(" ", "\n")

    def writeDocText(self, text):
        """Write ``text`` to ``<title>.doc`` below a heading with the title.

        The title found by this instance's ``getDocLink`` is used; when
        there is none, the module-level ``title`` variable is used instead.

        Args:
            text: Body text, added as a single paragraph.

        Raises:
            RuntimeError: No title is available yet.
        """
        heading = getattr(self, "title", None)
        if heading is None:
            heading = globals().get("title")
        if heading is None:
            raise RuntimeError(
                "no document title available; call getDocLink() first"
            )

        doc = Document()
        doc.add_heading(heading)
        doc.add_paragraph(text)
        # NOTE(review): python-docx produces the .docx format; the ".doc"
        # extension is kept so the output file name does not change.
        doc.save("{0}.doc".format(heading))
# Script entry point: crawl the default document when run directly.
if __name__ == "__main__":
    spider = Spider()
    spider.getDocLink()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/xuyanzhao/baiduwenku.git
[email protected]:xuyanzhao/baiduwenku.git
xuyanzhao
baiduwenku
baiduwenku
master

搜索帮助