2 Star 0 Fork 0

leimiemie/py-jieba-export

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
main.py 2.58 KB
一键复制 编辑 原始数据 按行查看 历史
feebee 提交于 2021-03-05 12:03 . update main.py.
import pandas as pd
import os
# import jieba
from snownlp import SnowNLP
# Header of the extra "word count" column. build_row_counts treats this entry
# specially: instead of a substring count it stores the number of words that
# SnowNLP segments the file content into. count_and_export places it first in
# the list of columns.
CONST_COLUMN_WORDS_COUNT = '词数'
def read_keywords(keywords_xlsx_file):
    """Load the keyword phrases from an Excel workbook.

    The workbook is read with ``pandas.read_excel`` and the values of its
    ``Phrase`` column are returned as a plain list, in row order.
    """
    phrases = pd.read_excel(keywords_xlsx_file)['Phrase']
    return list(phrases)
def build_df(lst_cols, rows):
    """Assemble the result table from per-row dictionaries.

    *lst_cols* supplies the column labels and the keys of *rows* supply the
    index labels.  Each value of *rows* is a mapping of column label to cell
    value; it is converted to a ``pandas.Series`` and assigned to the row
    with the matching index label.
    """
    frame = pd.DataFrame(index=rows.keys(), columns=lst_cols)
    for label in rows:
        # Assign row by row so the Series is aligned on the column labels.
        frame.loc[label] = pd.Series(rows[label])
    return frame
def read_file(file):
    """Return the entire content of *file* as text, decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are dropped silently
    (``errors='ignore'``) rather than raising a decode error.
    """
    with open(file, mode='r', encoding='utf8', errors='ignore') as handle:
        text = handle.read()
    return text
def build_row_counts(txt_file, keywords):
    """Count keyword occurrences in a single text file.

    The file is loaded with ``read_file``.  For each entry of *keywords* the
    content is searched with ``str.count`` (non-overlapping substring
    matches).  The special entry ``CONST_COLUMN_WORDS_COUNT`` is not searched
    for; its value is the number of words SnowNLP segments the content into.

    Returns a one-item dict ``{file_name: {keyword: count, ...}}`` where
    *file_name* is the base name of *txt_file* without a trailing ``.txt``.
    """
    content = read_file(txt_file)
    counts = {}
    for keyword in keywords:
        if keyword == CONST_COLUMN_WORDS_COUNT:
            counts[keyword] = len(SnowNLP(content).words)
        else:
            counts[keyword] = content.count(keyword)

    file_name = os.path.basename(txt_file)
    # Strip only the trailing extension: str.replace would also delete a
    # '.txt' in the middle of the name (e.g. 'a.txt.v2.txt' -> 'a.v2').
    if file_name.endswith('.txt'):
        file_name = file_name[:-len('.txt')]
    return {file_name: counts}
def build_rows(txt_files_dir, keywords):
    """Collect the keyword counts of every file in *txt_files_dir*.

    Each regular file directly inside the directory is passed to
    ``build_row_counts`` and the resulting one-item dicts are merged.

    Returns a dict ``{file_name: {keyword: count, ...}}`` with one entry per
    processed file; it is empty when the directory holds no regular files.
    """
    rows = {}
    for entry in os.listdir(txt_files_dir):
        full_name = os.path.join(txt_files_dir, entry)
        # Sub-directories (and other non-regular entries) cannot be opened as
        # text files; skip them instead of aborting the whole run.
        if not os.path.isfile(full_name):
            continue
        rows.update(build_row_counts(full_name, keywords))
    return rows
def count_and_export(keywords_xlsx_file, txt_files_dir, export_xlsx_file):
    """Count keywords in every text file and write the table to Excel.

    The column list is ``CONST_COLUMN_WORDS_COUNT`` followed by the phrases
    returned by ``read_keywords``.  One row per file is produced by
    ``build_rows``, assembled into a DataFrame by ``build_df`` and written to
    *export_xlsx_file* with ``DataFrame.to_excel``.
    """
    columns = [CONST_COLUMN_WORDS_COUNT, *read_keywords(keywords_xlsx_file)]
    rows = build_rows(txt_files_dir, columns)
    build_df(columns, rows).to_excel(export_xlsx_file)
if __name__ == '__main__':
    # Script entry point.  The three paths can be given on the command line;
    # when omitted, each falls back to the location that used to be
    # hard-coded here, so running the script without arguments is unchanged.
    import argparse

    cli = argparse.ArgumentParser(
        description='Count keyword phrases in text files and export the '
                    'counts to an Excel workbook.')
    cli.add_argument(
        'keywords_xlsx_file', nargs='?',
        default=r'C:\Users\YANG.LEI\Downloads\test\Keyword_Phrases.xlsx',
        help="workbook whose 'Phrase' column lists the keywords")
    cli.add_argument(
        'txt_files_dir', nargs='?',
        default=r'C:\Users\YANG.LEI\Downloads\test\txt实验文件',
        help='directory containing the text files to scan')
    cli.add_argument(
        'export_xlsx_file', nargs='?',
        default=r'C:\Users\YANG.LEI\Downloads\test\导出.xlsx',
        help='path of the workbook to write')
    args = cli.parse_args()

    count_and_export(args.keywords_xlsx_file,
                     args.txt_files_dir,
                     args.export_xlsx_file)
    print('main end')
#!/usr/bin/python
# -*- coding: utf-8 -*-
import textract, re
import PyPDF2
from snownlp import SnowNLP
import pandas as pd
import numpy as np
import os
import jieba.posseg as pseg
import jieba
from tika import parser
os.chdir('annual reports')
# First, extract the text of every PDF and save it as a .txt file, so later
# processing can work on plain text instead of re-parsing the PDFs.
# NOTE(review): the PDFs are read from 'pdf/' relative to the new working
# directory, so that is the directory listed here; listing 'annual reports'
# again after the chdir would look for a nested folder of the same name.
# Confirm against the actual folder layout.
for filename in os.listdir('pdf'):
    if not filename.endswith('.pdf'):
        continue
    # Drop only the trailing extension, not every '.pdf' inside the name.
    stem = os.path.splitext(filename)[0]
    try:
        text = parser.from_file('pdf/' + filename)
        content = text['content']
        if content is None:
            # Presumably the parser found no extractable text; there is
            # nothing to write, so report it and move on.
            print("can't write " + filename + ": no text extracted")
            continue
        # 'with' closes the file even if the write fails; the explicit
        # encoding keeps the output independent of the platform default.
        with open('text_doc/' + stem + ".txt", "w", encoding="utf-8") as text_file:
            text_file.write(content)
    except Exception as exc:
        # Best effort: report which file failed and why, then continue with
        # the remaining PDFs.
        print("can't write " + filename + ": " + repr(exc))
    else:
        print("finished writing" + stem)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/leimiemie/py-jieba-export.git
[email protected]:leimiemie/py-jieba-export.git
leimiemie
py-jieba-export
py-jieba-export
master

搜索帮助