master

分支 (1)

管理

管理

master

Keyword_get_article
/
exclude.py

# -*- coding:utf-8 -*-
import re

# Load the noise keywords, one per line. The file is closed deterministically,
# and blank lines are skipped: an empty alternative in the joined pattern
# would match at every position of any text.
with open('exclude.txt', encoding='utf-8') as _badwords_file:
    _badword_list = [w for w in map(str.strip, _badwords_file) if w]
# Entries are joined as-is (not escaped), so each line of exclude.txt is
# interpreted as a regular-expression fragment. With no keywords at all, fall
# back to `(?!)`, which never matches, instead of an always-matching ''.
badwords = r'|'.join(_badword_list) if _badword_list else r'(?!)'
# Regex that detects any of the noise keywords (case-insensitive).
rm_bad_sentence = re.compile(badwords, re.I | re.S)
# Regex that detects URLs in the content. Raw string so that `\.` reaches the
# regex engine without being an invalid string escape. The scheme is optional,
# so any `x.y` token built from the listed characters (e.g. "3.14") matches.
rm_links = re.compile(r'((https?|ftp|file)://)?[-a-z0-9+&@#/%=~|?!:,\.。;]+\.[-a-z0-9+&@#/%=~|]+', re.I | re.S)
# Regex that extracts paragraph bodies: the text between <p> and </p>,
# provided that text contains no '>' character.
exp = re.compile(r'<p>([^>]+)</p>', re.I | re.S)


# Common ways of handling scraped content:
# 1. Use the content directly after removing the noise
# 2. Concatenate the pieces directly
# 3. On top of 1, add one or two paragraphs of your own at the beginning and
#    at the end; these usually follow a template
def filter_text(content):
    """
    Print ``content`` followed by an HTML-paragraph rendering of it.

    The raw text is echoed first, then a separator line of 50 asterisks.
    The text is then split on CRLF line endings and each piece is printed
    wrapped in ``<p>...</p>``; an empty piece is printed as ``<p><br></p>``.

    NOTE: the intended cleaning steps (dropping sentences that contain a
    noise keyword, and removing URLs) are currently disabled, so nothing is
    filtered here and the function returns None.
    """
    print(content)
    print('*' * 50)
    # A previous implementation (now disabled) extracted the <p> paragraphs
    # with `exp`, skipped those matching `rm_bad_sentence`, rebuilt the HTML
    # and returned it with links stripped via `rm_links`.
    for segment in content.split('\r\n'):
        print('<p><br></p>' if segment == '' else f'<p>{segment}</p>')

# Suggestions for titles:
# 1. Use the keyword directly as the title
# 2. Use the keyword plus related words as the title
# 3. Rework the original title

def test(text):
    """Print ``text`` with noise keywords and link-like tokens removed.

    Every match of ``rm_bad_sentence`` is deleted first, then every match of
    ``rm_links``. The cleaned text is printed; nothing is returned.
    """
    without_keywords = rm_bad_sentence.sub('', text)
    print(rm_links.sub('', without_keywords))

if __name__ == '__main__':
    # Manual check: run the cleaning over a sample article stored next to
    # this script.
    with open('china.txt', encoding='utf-8') as f:
        text = f.read()
    # test() prints the cleaned text itself and returns None, so its result
    # is deliberately not printed (doing so only emitted a stray "None").
    test(text)