# NOTE: removed a page-scrape artifact ("代码拉取完成,页面将自动刷新") that was not part of the source.
# encoding = utf-8
'''
Author: Du
Blog:https://www.cnblogs.com/Du704/
Date: 2019/12/9 16:53
'''
import console
import requests, re, os, threading
from bs4 import BeautifulSoup, NavigableString
from concurrent.futures import ThreadPoolExecutor
# Output directory for the converted posts; change as you like.
download_path = os.path.join(os.path.dirname(__file__), '博客园随笔md格式')
# Create it on first run so nothing has to be set up manually.
if not os.path.exists(download_path):
    os.mkdir(download_path)
# Browser-like User-Agent so cnblogs serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Password posted with each request for password-protected posts
# (harmless for unprotected ones).
blog_data = {
    "tb_password": 'xiaoyuanqujing@666'
}
# Heading-tag regex (h1..h6); raw string fixes the invalid '\d' escape
# that triggers a SyntaxWarning/DeprecationWarning on modern Python.
h_re = re.compile(r'^h(\d)')
def remove_div(body):
    '''Strip every nested <div> wrapper, hoisting its children in place.'''
    while True:
        wrapper = body.div
        if not wrapper:
            break
        wrapper.unwrap()
def md_toc(body):
    '''Replace the page's auto-generated table of contents with a [TOC] marker.'''
    toc_node = body.select_one(".toc")
    if toc_node:
        toc_node.replace_with('[TOC]')
def md_h(body):
    '''Convert heading tags (h1-h6) to Markdown "#" headings.'''
    for tag in body.find_all(h_re):
        level = int(tag.name[1])
        tag.replace_with(f"{'#' * level} {tag.get_text()}")
def md_table(body):
    '''Convert every <table> to a Markdown pipe table.

    Column alignment comes from each header cell's inline style
    ("left" / "right"; anything else centers). Tables without a
    <thead> use their first body row as the header.
    '''
    def _align(style):
        # Map a cell's style attribute to the Markdown alignment marker.
        style = style if style else 'center'
        if 'left' in style:
            return ' :--- |'
        if 'right' in style:
            return ' ---: |'
        return ' :--: |'

    for t in body.find_all('table'):
        ths = t.select_one('thead > tr')  # header row
        trs = t.select('tbody > tr')      # data rows
        if not ths:
            # Malformed table with no <thead>: promote the first body row to
            # header AND drop it from the data rows. (The original computed
            # the trs[1:] slice into an unused variable but still iterated
            # the full list, duplicating the header as a data row.)
            ths, trs = trs[0], trs[1:]
        head = mid = '|'
        for th in ths.find_all(re.compile('th|td')):
            head += ' ' + th.get_text() + ' |'
            mid += _align(th.get('style'))
        md = head + '\n' + mid + '\n'
        # Data rows render identically regardless of alignment (the original
        # had three identical branches here).
        for tr in trs:
            row = "|"
            for td in tr.find_all('td'):
                row += ' ' + td.get_text() + ' |'
            md += row + '\n'
        t.replace_with(md)
def md_pre(body):
    '''Convert code blocks (<pre> and .cnblogs_code wrappers) to fenced Markdown code blocks.'''
    for pre in body.find_all('pre') + body.select('.cnblogs_code'):
        code_type = pre.get('class')  # determine the code block's language from the class attribute
        code_type = code_type if code_type else ['python']  # default to python when there is no class
        pre_replace = pre.parent
        hide_sta = ''
        if pre_replace:  # later nodes may already have been replaced, leaving this None
            if not pre_replace.get('class'):  # inline/nested code block: replace the <pre> itself
                pre_replace = pre
            if code_type and 'cnblogs_code' in code_type:  # handle non-<pre> code blocks
                code_type = ['css']  # arbitrary fallback language, adjust freely
                pre_replace = pre
            if '```' in pre.text:  # guard against multi-level nesting (unclear how such markup arises)
                pre.replace_with(pre.text)
                continue
            # handle collapsed ("hidden") code blocks — Markdown has no real equivalent
            elif pre_replace.get('class') and pre_replace.get('class')[0] == 'cnblogs_code_hide':
                pre_replace = pre_replace.parent
                pre_div = pre_replace.find_all(True)[-1]
                pre_title = pre_div.string.lstrip()
                # the last element under the hidden block's parent is the collapsed
                # title; render it bold+italic unless it is the default "View Code"
                hide_sta = '' if pre_title == 'View Code' else f' ***{pre_title}***\n'
            else:
                # choose which node gets replaced by the fenced block
                # pre_replace = pre_replace if pre_replace.get('class')[0] == 'cnblogs_code' else pre
                if not pre_replace.get('class') or pre_replace.get('class')[0] != 'cnblogs_code':
                    pre_replace = pre
        # body of the fenced code block
        # NOTE(review): the replace below is a no-op as written — the text looks
        # garbled; it presumably meant to escape backticks. Confirm intent.
        content = f'```{code_type[0] if code_type else ""}\n' \
            + pre.get_text().lstrip().rstrip().replace('```', '```') + '\n```'
        if hide_sta:  # only relevant for hidden code blocks
            content = hide_sta + content
        if pre_replace in body:  # make sure the node is still in the document tree
            pre_replace.replace_with(content)
def md_coed(body):
    '''Wrap inline <code> elements (those not inside a <pre>) in backticks.'''
    for code in body.find_all('code'):
        if code.parent.name != 'pre':
            code.replace_with(f"`{code.get_text()}`")
def md_img(body):
    '''Rewrite <img> tags as Markdown image links.'''
    for img in body.find_all('img'):
        # must use .get('src') — attribute access (img.src) yields None
        src_url = img.get('src')
        img.replace_with('![]({})'.format(src_url))
def md_link(body):
    '''Rewrite <a> tags as Markdown links.'''
    for anchor in body.find_all('a'):
        # must use .get('href') — attribute access (anchor.href) yields None,
        # same caveat as img.src above
        target = anchor.get('href')
        anchor.replace_with('[{}]({})'.format(anchor.text, target))
        # recursing into linked posts was considered here but left disabled
def md_b_i(body):
    '''Convert <strong>/<em> to **bold** / *italic*, skipping those inside headings.'''
    for tag in body.find_all(re.compile('strong|em')):
        marker = '**' if tag.name == 'strong' else '*'
        # headings are already rewritten; don't double-mark their contents
        if not h_re.match(tag.parent.name):
            tag.replace_with(marker + tag.get_text() + marker)
def md_ol_ul(body):
    '''Convert <ul>/<ol> lists to Markdown bullet / numbered lists.

    Bug fix: the empty-list fallback called the misspelled ``unwarp()``,
    which raised AttributeError whenever a list had no <li> children.
    '''
    for lst in body.find_all(re.compile('^ul|ol')):
        # bullet lists get "* ", ordered lists count up from 1
        counter = None if lst.name == 'ul' else 1
        items = lst.find_all('li')
        if items:
            for li in items:
                text = li.get_text().lstrip()
                if counter is None:
                    li.replace_with('* ' + text)
                else:
                    li.replace_with(f"{counter}. " + text)
                    counter += 1
            lst.replace_with(lst.get_text())
        elif lst.li:
            # degenerate list: just unwrap the lone <li>
            lst.li.unwrap()  # fixed: was unwarp()
def md_p(body):
    '''Unwrap every <span> and then every <p>, leaving their children in place.'''
    for tag_attr in ('span', 'p'):
        while getattr(body, tag_attr):
            getattr(body, tag_attr).unwrap()
def to_md(body):
    '''Render one top-level node to its final Markdown string.

    <br> becomes a Markdown hard line break, <hr> a horizontal rule;
    anything else is serialized as-is with the angle-bracket HTML
    entities decoded back to literal characters.
    '''
    item_name = body.name
    if not isinstance(body, NavigableString):  # drop leftover attributes; mostly cosmetic
        body.attrs.clear()
    if item_name == 'br':  # trailing two-space + newline = Markdown hard break
        return ' \n'
    if item_name == 'hr':
        return '\n----\n'
    # Fixed: the replace calls were no-ops ('<' -> '<'); the intent is to
    # decode the '&lt;'/'&gt;' entities that str() escapes on serialization.
    return str(body).replace('&lt;', '<').replace('&gt;', '>')
def transform(body):
    '''Run every HTML-to-Markdown pass over the tree.

    The pass order matters — do not rearrange it.
    '''
    passes = (md_toc, md_h, md_table, md_pre, remove_div,
              md_img, md_link, md_coed, md_b_i, md_ol_ul, md_p)
    for step in passes:
        step(body)
def get_one_md(url='', down_path=download_path):
    '''Download a single cnblogs post, convert it to Markdown, and save it.

    url: full post URL, or empty to prompt interactively (then a path like
         "Du704/p/11270106.html" is expected and the domain is prefixed).
    down_path: directory the resulting .md file is written into.
    Returns False on request/input failure; otherwise writes the file as a
    side effect (conversion or save errors are reported, not raised).
    '''
    if not url:
        url = input("请粘贴要转化的博客地址:\
\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\
只需要输入:\033[1;35mDu704/p/11270106.html \033[0m即可。\n\t>>").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
        url = 'https://www.cnblogs.com/' + url
    print(f"正在读取... {url} ")
    # POST carries the password for protected posts; harmless for normal ones
    res = requests.post(url, data=blog_data, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    # some posts show a second password-verification page: follow the form
    # action and retry once with the real URL
    if '博文阅读密码验证' in soup.select_one('head > title').get_text():
        url = 'https://www.cnblogs.com' + soup.select_one('body > form').get('action')
        return get_one_md(url, down_path)
    blog_name = soup.select_one('#cb_post_title_url').get_text().rstrip()  # the post's title
    blog_name += '.md'
    # for offline debugging you can dump the raw page source:
    # with open(os.path.join(download_path, blog_name + '.html'), 'w', encoding='utf-8') as f:
    # f.write(soup.prettify())
    # guard against posts that were themselves pasted (nested body wrapper)
    body = soup.select_one("#cnblogs_post_body > #cnblogs_post_body")
    if not body:
        body = soup.select_one('#cnblogs_post_body')
    try:
        # convert the tree to Markdown in place
        transform(body)
    except Exception as e:
        print(f"\033[31m解析{url}失败,\033[35m该博客可能包含无法解析的内容!\033[0m")
        print("异常信息:", e)
    try:
        with open(os.path.join(down_path, blog_name), 'w', encoding='utf-8') as f:
            for i in body:
                data = to_md(i)
                f.write(data)
        print(f"\n\033[1;36m{blog_name} \033[33m下载完成!\033[0m\n\n")
    except Exception as e:
        # most likely an illegal character in the post title / filename
        print(f"保存时异常终止,可能是文件名出错:\033[31m{blog_name}\033[0m\n\t请查看是否有特殊字符!")
        print("异常信息:", e)
def get_all_url(url=None):
    '''Collect every cnblogs post link from an index post.

    Prompts for a URL when none is given. Returns a tuple
    (list_of_hrefs, download_dir) on success, or False on failure.

    Fix: anchors without an href attribute made ``.startswith`` raise
    AttributeError on None; such anchors are now skipped.
    '''
    if not url:
        url = input("请粘贴要群爬的博客地址:\
\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\
只需要输入:\033[1;35mDu704/p/11270106.html \033[0m即可。\n\t>>").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
        url = 'https://www.cnblogs.com/' + url
    res = requests.get(url, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    dir_name = soup.select_one('#cb_post_title_url').get_text().rstrip()  # collection title
    dir_name = os.path.join(download_path, dir_name)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # if creation fails, check the directory name for illegal characters
    # print(dir_name)
    # guard against posts that were themselves pasted (nested body wrapper)
    body = soup.select_one("#cnblogs_post_body > #cnblogs_post_body")
    if not body:
        body = soup.select_one('#cnblogs_post_body')
    # links = body.select('a[target="-Blank"]') or body.select('a[target="_blank"]')
    hrefs = (a.get('href') for a in body.find_all('a'))
    return [h for h in hrefs if h and h.startswith("https://www.cnblogs.com")], dir_name
def get_result(future):
    '''Thread-pool done-callback placeholder: prints a blank line per finished task.'''
    print()
def get_all_of_one():
    '''Crawl every post linked from an index post, downloading them in parallel.

    Fix: ``get_all_url()`` returns False on failure; the original unpacked
    the result unconditionally and raised TypeError in that case.
    '''
    result = get_all_url()
    if not result:
        return
    urls, down_path = result
    print(urls)
    # thread pool speeds up the (I/O-bound) downloads
    with ThreadPoolExecutor(max_workers=4) as pool:
        for href in urls:
            pool.submit(get_one_md, href, down_path).add_done_callback(get_result)
# menu of available actions: key -> [description, handler]
func_dict = {
    '1': ['爬单页面', get_one_md],
    '2': ['爬单页面的所有文章', get_all_of_one]
}
if __name__ == '__main__':
    print("\033[1;31m这是转换博客园文章为md格式的小工具,请勿用于非法用途!!\033[0m")
    while 1:
        # show the menu on every pass
        for k, v in func_dict.items():
            print(k, v[0])
        func_c = input('选择功能:')
        if func_c in func_dict:
            while 1:
                # NOTE(review): deliberate inner infinite loop — re-runs the
                # chosen action (each call prompts again) until the process
                # is interrupted; there is no way back to the menu.
                func_dict.get(func_c)[1]()
        elif func_c.lower() == 'q':
            exit('Bye~')
        else:
            print("输入有误")
# NOTE: removed a trailing page-scrape artifact (a code-hosting site's content-moderation notice) that was not part of the source.