代码拉取完成,页面将自动刷新
同步操作将从 sslsrl/巨潮网年报爬取 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
# -*- coding: utf-8 -*-
import os
import xlwt
class KeyWordStat:
    """Count keyword occurrences in report text files and export them to an .xls sheet.

    Attributes:
        encoding: Text encoding used when reading the report files ("utf-8").
        kw: Sequence of keyword strings to count.
        folder: Folder that is scanned by ``run`` and that receives the
            generated workbook.
    """

    def __init__(self, kw, folder):
        """Store the keywords and the working folder.

        Args:
            kw: Sequence of keyword strings (it is iterated several times).
            folder: Path of the folder holding the ``.txt`` reports.
        """
        self.encoding = "utf-8"
        self.kw = kw
        self.folder = folder

    def _count_keywords(self, txt_folder, kwList):
        """Collect per-file keyword counts for every usable ``.txt`` file.

        File names are split on "-": the first four characters of part 0 are
        taken as the year and part 1 as the stock code. At least three parts
        are required, as in the original parsing; files that do not satisfy
        this are skipped with a message instead of raising ``IndexError``.

        Args:
            txt_folder: Folder to scan (not recursive).
            kwList: Keywords to count in each file.

        Returns:
            A list of ``(word_freq, stock_code, year)`` tuples, where
            ``word_freq`` maps each keyword to its number of occurrences.
        """
        rows = []
        # os.listdir returns names in arbitrary order; sorting keeps the
        # row order of the resulting sheet reproducible between runs.
        for file in sorted(os.listdir(txt_folder)):
            if os.path.splitext(file)[-1] != ".txt":
                continue
            parts = file.split("-")
            if len(parts) < 3:
                print(f"文件名格式不符,已跳过:{file}")
                continue
            txt_path = os.path.join(txt_folder, file)
            # errors='ignore' drops undecodable bytes rather than failing.
            with open(txt_path, "r", encoding=self.encoding, errors='ignore') as fp:
                # Removing newlines lets keywords broken across lines match.
                alltext = fp.read().replace("\n", "")
            # len(alltext) approximates the total character count of the
            # report; it can be used to normalise the keyword counts.
            print(len(alltext))
            word_freq = {word: alltext.count(word) for word in kwList}
            rows.append((word_freq, parts[1], parts[0][0:4]))
        return rows

    def _write_workbook(self, rows, kwList):
        """Write the collected rows to the workbook inside ``self.folder``.

        Layout: column 0 is the year, column 1 the stock code, and one
        column per keyword follows in the order of ``kwList``.

        Args:
            rows: Tuples as returned by ``_count_keywords``.
            kwList: Keywords, used for the header and the column order.
        """
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('年报关键词词频统计', cell_overwrite_ok=True)
        sheet.write(0, 0, '年份')
        sheet.write(0, 1, '企业代码')
        for col, word in enumerate(kwList, start=2):
            sheet.write(0, col, word)
        for row, (word_freq, stock_code, year) in enumerate(rows, start=1):
            sheet.write(row, 0, year)
            sheet.write(row, 1, stock_code)
            for col, word in enumerate(kwList, start=2):
                sheet.write(row, col, word_freq[word])
        # os.path.join instead of a hard-coded backslash keeps the output
        # path valid on every platform.
        book.save(os.path.join(self.folder, '年报关键词词频统计.xls'))

    def matchKeyWords2(self, txt_folder, kwList):
        """Count keywords in the ``.txt`` files of a folder and save the sheet.

        Args:
            txt_folder: Folder containing the report text files.
            kwList: Keywords to count.

        Returns:
            None. The workbook is saved into ``self.folder``.
        """
        rows = self._count_keywords(txt_folder, kwList)
        self._write_workbook(rows, kwList)

    def run(self):
        """Run the keyword statistics over ``self.folder`` with ``self.kw``."""
        print('开始统计词频!!!!!')
        # matchKeyWords2 strips newlines before counting, so keywords that
        # are broken across lines are still matched.
        self.matchKeyWords2(self.folder, self.kw)
        print('统计结束')
if __name__ == '__main__':
    # The keyword dictionary is a single string whose entries are separated
    # by the Chinese enumeration comma "、".
    keywords = "核心、价值、数字经济、提升".split("、")
    # Scan the "年报" folder (relative to the working directory) and export
    # the keyword statistics.
    KeyWordStat(keywords, "年报").run()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。