1 Star 0 Fork 1

jason_udu/巨潮网年报爬取

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
KeyWordStat.py 2.71 KB
一键复制 编辑 原始数据 按行查看 历史
# -*- coding: utf-8 -*-
import os
import xlwt
class KeyWordStat:
    """Count keyword occurrences in report ``.txt`` files and export them to .xls.

    ``kw`` is the list of keywords to count. ``folder`` is the directory that
    is scanned for ``.txt`` files by :meth:`run` and that also receives the
    generated workbook.
    """

    def __init__(self, kw, folder):
        """Store the keyword list and the working folder."""
        self.encoding = "utf-8"  # encoding used when reading the text files
        self.kw = kw
        self.folder = folder

    def _count_keywords(self, txt_folder, kwList):
        """Return per-file keyword counts for every ``.txt`` file in *txt_folder*.

        File names are split on ``-``: the first four characters of the first
        part are taken as the year, the second part as the stock code and the
        third part as the stock name. Files whose name has fewer than three
        ``-``-separated parts are skipped instead of aborting the whole run.

        Returns a list of ``(word_freq, stock_code, stock_name, year)`` tuples,
        where ``word_freq`` maps each keyword to its occurrence count. Files
        are processed in sorted name order so the result is deterministic.
        """
        results = []
        for file in sorted(os.listdir(txt_folder)):
            stem, ext = os.path.splitext(file)
            if ext != ".txt":
                continue
            # Split the stem (not the full name) so the ".txt" suffix does not
            # leak into the stock name.
            parts = stem.split("-")
            if len(parts) < 3:
                print('跳过文件名不符合格式的文件:', file)
                continue
            txt_path = os.path.join(txt_folder, file)
            with open(txt_path, "r", encoding=self.encoding, errors='ignore') as fp:
                # Drop line breaks so that a keyword wrapped across two lines
                # is still counted.
                alltext = fp.read().replace("\n", "")
            # len(alltext) approximates the total character count of the
            # report; it can be used to turn the counts into a ratio.
            print(len(alltext))
            word_freq = {word: alltext.count(word) for word in kwList}
            results.append((word_freq, parts[1], parts[2], parts[0][0:4]))
        return results

    def matchKeyWords2(self, txt_folder, kwList):
        """Count *kwList* in the ``.txt`` files of *txt_folder* and save an .xls.

        The sheet has one header row (year, stock code, then one column per
        keyword) followed by one row per processed file. The workbook is
        written into ``self.folder``.
        """
        words_num = self._count_keywords(txt_folder, kwList)

        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet('年报关键词词频统计', cell_overwrite_ok=True)
        sheet.write(0, 0, '年份')
        sheet.write(0, 1, '企业代码')
        # Keyword columns start after the two fixed columns.
        for col, word in enumerate(kwList, 2):
            sheet.write(0, col, word)

        for row, (word_f, stock_code, _stock_name, year) in enumerate(words_num, 1):
            sheet.write(row, 0, year)
            sheet.write(row, 1, stock_code)
            for col, word in enumerate(kwList, 2):
                sheet.write(row, col, word_f[word])

        # os.path.join instead of a hard-coded backslash: portable, and avoids
        # the invalid "\<char>" escape sequence in the string literal.
        book.save(os.path.join(self.folder, '年报关键词词频统计.xls'))

    def run(self):
        """Run the keyword statistics over ``self.folder`` with ``self.kw``."""
        print('开始统计词频!!!!!')
        # NOTE: an earlier matchKeyWords variant could not count keywords that
        # were split by a line break; matchKeyWords2 can, but may still have
        # unknown bugs.
        self.matchKeyWords2(self.folder, self.kw)
        print('统计结束')
if __name__ == '__main__':
    # Keywords are given as one string; the default dictionary separator is
    # the ideographic comma "、".
    keywords = "核心、价值、数字经济、提升".split("、")
    # "年报" is the folder that holds the report text files.
    KeyWordStat(keywords, "年报").run()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/jason-udu/cninfo-crawl.git
[email protected]:jason-udu/cninfo-crawl.git
jason-udu
cninfo-crawl
巨潮网年报爬取
master

搜索帮助