# NOTE: removed a page-scrape artifact ("代码拉取完成,页面将自动刷新") that was not part of the source.
# encoding = utf-8
'''
Author: Du
Blog:https://www.cnblogs.com/Du704/
Date: 2019/12/9 16:53
'''
import console
import requests, re, os, threading
from bs4 import BeautifulSoup, NavigableString
from concurrent.futures import ThreadPoolExecutor
# Output directory for the converted posts; change as you like.
download_path = os.path.join(os.path.dirname(__file__), '博客园随笔md格式')
# Create it on first run so nothing has to be set up manually.
if not os.path.exists(download_path):
    os.mkdir(download_path)
# Browser-like User-Agent so cnblogs serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Password posted with each request for password-protected posts
# (harmless for unprotected ones).
blog_data = {
    "tb_password": 'xiaoyuanqujing@666'
}
# Heading-tag regex (h1..h6); raw string fixes the invalid '\d' escape
# that triggers a SyntaxWarning/DeprecationWarning on modern Python.
h_re = re.compile(r'^h(\d)')
def remove_div(body):
    '''Strip every nested <div> wrapper, hoisting its children in place.'''
    while True:
        wrapper = body.div
        if not wrapper:
            break
        wrapper.unwrap()
def md_toc(body):
    '''Replace the page's auto-generated table of contents with a [TOC] marker.'''
    toc_node = body.select_one(".toc")
    if toc_node:
        toc_node.replace_with('[TOC]')
def md_h(body):
    '''Convert heading tags (h1-h6) to Markdown "#" headings.'''
    for tag in body.find_all(h_re):
        level = int(tag.name[1])
        tag.replace_with(f"{'#' * level} {tag.get_text()}")
def md_table(body):
    '''Convert every <table> to a Markdown pipe table.

    Column alignment comes from each header cell's inline style
    ("left" / "right"; anything else centers). Tables without a
    <thead> use their first body row as the header.
    '''
    def _align(style):
        # Map a cell's style attribute to the Markdown alignment marker.
        style = style if style else 'center'
        if 'left' in style:
            return ' :--- |'
        if 'right' in style:
            return ' ---: |'
        return ' :--: |'

    for t in body.find_all('table'):
        ths = t.select_one('thead > tr')  # header row
        trs = t.select('tbody > tr')      # data rows
        if not ths:
            # Malformed table with no <thead>: promote the first body row to
            # header AND drop it from the data rows. (The original computed
            # the trs[1:] slice into an unused variable but still iterated
            # the full list, duplicating the header as a data row.)
            ths, trs = trs[0], trs[1:]
        head = mid = '|'
        for th in ths.find_all(re.compile('th|td')):
            head += ' ' + th.get_text() + ' |'
            mid += _align(th.get('style'))
        md = head + '\n' + mid + '\n'
        # Data rows render identically regardless of alignment (the original
        # had three identical branches here).
        for tr in trs:
            row = "|"
            for td in tr.find_all('td'):
                row += ' ' + td.get_text() + ' |'
            md += row + '\n'
        t.replace_with(md)
def md_pre(body):
    '''Convert code blocks (<pre> and .cnblogs_code wrappers) to fenced Markdown code blocks.'''
    for pre in body.find_all('pre') + body.select('.cnblogs_code'):
        code_type = pre.get('class')  # determine the code block's language from the class attribute
        code_type = code_type if code_type else ['python']  # default to python when there is no class
        pre_replace = pre.parent
        hide_sta = ''
        if pre_replace:  # later nodes may already have been replaced, leaving this None
            if not pre_replace.get('class'):  # inline/nested code block: replace the <pre> itself
                pre_replace = pre
            if code_type and 'cnblogs_code' in code_type:  # handle non-<pre> code blocks
                code_type = ['css']  # arbitrary fallback language, adjust freely
                pre_replace = pre
            if '```' in pre.text:  # guard against multi-level nesting (unclear how such markup arises)
                pre.replace_with(pre.text)
                continue
            # handle collapsed ("hidden") code blocks — Markdown has no real equivalent
            elif pre_replace.get('class') and pre_replace.get('class')[0] == 'cnblogs_code_hide':
                pre_replace = pre_replace.parent
                pre_div = pre_replace.find_all(True)[-1]
                pre_title = pre_div.string.lstrip()
                # the last element under the hidden block's parent is the collapsed
                # title; render it bold+italic unless it is the default "View Code"
                hide_sta = '' if pre_title == 'View Code' else f' ***{pre_title}***\n'
            else:
                # choose which node gets replaced by the fenced block
                # pre_replace = pre_replace if pre_replace.get('class')[0] == 'cnblogs_code' else pre
                if not pre_replace.get('class') or pre_replace.get('class')[0] != 'cnblogs_code':
                    pre_replace = pre
        # body of the fenced code block
        # NOTE(review): the replace below is a no-op as written — the text looks
        # garbled; it presumably meant to escape backticks. Confirm intent.
        content = f'```{code_type[0] if code_type else ""}\n' \
            + pre.get_text().lstrip().rstrip().replace('```', '```') + '\n```'
        if hide_sta:  # only relevant for hidden code blocks
            content = hide_sta + content
        if pre_replace in body:  # make sure the node is still in the document tree
            pre_replace.replace_with(content)
def md_coed(body):
    '''Wrap inline <code> elements (those not inside a <pre>) in backticks.'''
    for code in body.find_all('code'):
        if code.parent.name != 'pre':
            code.replace_with(f"`{code.get_text()}`")
def md_img(body):
    '''Rewrite <img> tags as Markdown image links.'''
    for img in body.find_all('img'):
        # must use .get('src') — attribute access (img.src) yields None
        src_url = img.get('src')
        img.replace_with('![]({})'.format(src_url))
def md_link(body):
    '''Rewrite <a> tags as Markdown links.'''
    for anchor in body.find_all('a'):
        # must use .get('href') — attribute access (anchor.href) yields None,
        # same caveat as img.src above
        target = anchor.get('href')
        anchor.replace_with('[{}]({})'.format(anchor.text, target))
        # recursing into linked posts was considered here but left disabled
def md_b_i(body):
    '''Convert <strong>/<em> to **bold** / *italic*, skipping those inside headings.'''
    for tag in body.find_all(re.compile('strong|em')):
        marker = '**' if tag.name == 'strong' else '*'
        # headings are already rewritten; don't double-mark their contents
        if not h_re.match(tag.parent.name):
            tag.replace_with(marker + tag.get_text() + marker)
def md_ol_ul(body):
    '''Convert <ul>/<ol> lists to Markdown bullet / numbered lists.

    Bug fix: the empty-list fallback called the misspelled ``unwarp()``,
    which raised AttributeError whenever a list had no <li> children.
    '''
    for lst in body.find_all(re.compile('^ul|ol')):
        # bullet lists get "* ", ordered lists count up from 1
        counter = None if lst.name == 'ul' else 1
        items = lst.find_all('li')
        if items:
            for li in items:
                text = li.get_text().lstrip()
                if counter is None:
                    li.replace_with('* ' + text)
                else:
                    li.replace_with(f"{counter}. " + text)
                    counter += 1
            lst.replace_with(lst.get_text())
        elif lst.li:
            # degenerate list: just unwrap the lone <li>
            lst.li.unwrap()  # fixed: was unwarp()
def md_p(body):
    '''Unwrap every <span> and then every <p>, leaving their children in place.'''
    for tag_attr in ('span', 'p'):
        while getattr(body, tag_attr):
            getattr(body, tag_attr).unwrap()
def to_md(body):
    '''Render one top-level node to its final Markdown string.

    <br> becomes a Markdown hard line break, <hr> a horizontal rule;
    anything else is serialized as-is with the angle-bracket HTML
    entities decoded back to literal characters.
    '''
    item_name = body.name
    if not isinstance(body, NavigableString):  # drop leftover attributes; mostly cosmetic
        body.attrs.clear()
    if item_name == 'br':  # trailing two-space + newline = Markdown hard break
        return ' \n'
    if item_name == 'hr':
        return '\n----\n'
    # Fixed: the replace calls were no-ops ('<' -> '<'); the intent is to
    # decode the '&lt;'/'&gt;' entities that str() escapes on serialization.
    return str(body).replace('&lt;', '<').replace('&gt;', '>')
def transform(body):
    '''Run every HTML-to-Markdown pass over the tree.

    The pass order matters — do not rearrange it.
    '''
    passes = (md_toc, md_h, md_table, md_pre, remove_div,
              md_img, md_link, md_coed, md_b_i, md_ol_ul, md_p)
    for step in passes:
        step(body)
def get_one_md(url='', down_path=download_path):
    '''Download a single cnblogs post, convert it to Markdown, and save it.

    url: full post URL, or empty to prompt interactively (then a path like
         "Du704/p/11270106.html" is expected and the domain is prefixed).
    down_path: directory the resulting .md file is written into.
    Returns False on request/input failure; otherwise writes the file as a
    side effect (conversion or save errors are reported, not raised).
    '''
    if not url:
        url = input("请粘贴要转化的博客地址:\
\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\
只需要输入:\033[1;35mDu704/p/11270106.html \033[0m即可。\n\t>>").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
        url = 'https://www.cnblogs.com/' + url
    print(f"正在读取... {url} ")
    # POST carries the password for protected posts; harmless for normal ones
    res = requests.post(url, data=blog_data, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    # some posts show a second password-verification page: follow the form
    # action and retry once with the real URL
    if '博文阅读密码验证' in soup.select_one('head > title').get_text():
        url = 'https://www.cnblogs.com' + soup.select_one('body > form').get('action')
        return get_one_md(url, down_path)
    blog_name = soup.select_one('#cb_post_title_url').get_text().rstrip()  # the post's title
    blog_name += '.md'
    # for offline debugging you can dump the raw page source:
    # with open(os.path.join(download_path, blog_name + '.html'), 'w', encoding='utf-8') as f:
    # f.write(soup.prettify())
    # guard against posts that were themselves pasted (nested body wrapper)
    body = soup.select_one("#cnblogs_post_body > #cnblogs_post_body")
    if not body:
        body = soup.select_one('#cnblogs_post_body')
    try:
        # convert the tree to Markdown in place
        transform(body)
    except Exception as e:
        print(f"\033[31m解析{url}失败,\033[35m该博客可能包含无法解析的内容!\033[0m")
        print("异常信息:", e)
    try:
        with open(os.path.join(down_path, blog_name), 'w', encoding='utf-8') as f:
            for i in body:
                data = to_md(i)
                f.write(data)
        print(f"\n\033[1;36m{blog_name} \033[33m下载完成!\033[0m\n\n")
    except Exception as e:
        # most likely an illegal character in the post title / filename
        print(f"保存时异常终止,可能是文件名出错:\033[31m{blog_name}\033[0m\n\t请查看是否有特殊字符!")
        print("异常信息:", e)
def get_all_url(url=None):
    '''Collect every cnblogs post link from an index post.

    Prompts for a URL when none is given. Returns a tuple
    (list_of_hrefs, download_dir) on success, or False on failure.

    Fix: anchors without an href attribute made ``.startswith`` raise
    AttributeError on None; such anchors are now skipped.
    '''
    if not url:
        url = input("请粘贴要群爬的博客地址:\
\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\
只需要输入:\033[1;35mDu704/p/11270106.html \033[0m即可。\n\t>>").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
        url = 'https://www.cnblogs.com/' + url
    res = requests.get(url, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    dir_name = soup.select_one('#cb_post_title_url').get_text().rstrip()  # collection title
    dir_name = os.path.join(download_path, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # if creation fails, check the directory name for illegal characters
    # print(dir_name)
    # guard against posts that were themselves pasted (nested body wrapper)
    body = soup.select_one("#cnblogs_post_body > #cnblogs_post_body")
    if not body:
        body = soup.select_one('#cnblogs_post_body')
    # links = body.select('a[target="-Blank"]') or body.select('a[target="_blank"]')
    hrefs = (a.get('href') for a in body.find_all('a'))
    return [h for h in hrefs if h and h.startswith("https://www.cnblogs.com")], dir_name
def get_result(future):
    '''Thread-pool done-callback placeholder: prints a blank line per finished task.'''
    print()
def get_all_of_one():
    '''Crawl every post linked from an index post, downloading them in parallel.

    Fix: ``get_all_url()`` returns False on failure; the original unpacked
    the result unconditionally and raised TypeError in that case.
    '''
    result = get_all_url()
    if not result:
        return
    urls, down_path = result
    print(urls)
    # thread pool speeds up the (I/O-bound) downloads
    with ThreadPoolExecutor(max_workers=4) as pool:
        for href in urls:
            pool.submit(get_one_md, href, down_path).add_done_callback(get_result)
# menu of available actions: key -> [description, handler]
func_dict = {
    '1': ['爬单页面', get_one_md],
    '2': ['爬单页面的所有文章', get_all_of_one]
}
if __name__ == '__main__':
    print("\033[1;31m这是转换博客园文章为md格式的小工具,请勿用于非法用途!!\033[0m")
    while 1:
        # show the menu on every pass
        for k, v in func_dict.items():
            print(k, v[0])
        func_c = input('选择功能:')
        if func_c in func_dict:
            while 1:
                # NOTE(review): deliberate inner infinite loop — re-runs the
                # chosen action (each call prompts again) until the process
                # is interrupted; there is no way back to the menu.
                func_dict.get(func_c)[1]()
        elif func_c.lower() == 'q':
            exit('Bye~')
        else:
            print("输入有误")
# NOTE: removed a trailing page-scrape artifact (a code-hosting site's content-moderation notice) that was not part of the source.