3 Star 0 Fork 0

曾杨龙/实习

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
music3.py 5.54 KB
一键复制 编辑 原始数据 按行查看 历史
zyl 提交于 2023-09-16 21:28 . zyl-爬取网易云热评
import matplotlib.pyplot as plt
import pandas as pd
import jieba
import jieba.analyse
import numpy as np
from PIL import Image
from wordcloud import WordCloud
from pyecharts import Line
# Use the SimHei font so Chinese text in matplotlib figures is not garbled.
plt.rcParams["font.sans-serif"] = ["SimHei"]

# Load the comment dump. '\n' is the only row terminator, so a '\r' inside a
# field does not break the row.
df3 = pd.read_csv('hotComments_06p.csv', index_col=0, lineterminator='\n')

# Split every 'time' value on '-': the first piece is stored as the year,
# the second as the month.
time_pieces = df3['time'].map(lambda stamp: str(stamp).split('-'))
df3['year'] = time_pieces.map(lambda pieces: int(pieces[0]))
df3['month'] = time_pieces.map(lambda pieces: int(pieces[1]))

# Number of comments per year, flattened back to regular columns.
date_message = df3.groupby(['year'])
date_com = date_message['year'].agg(['count']).reset_index()

# Draw the trend chart (pyecharts Line): years on the x axis, counts on y.
attr = date_com['year']
v1 = date_com['count']
line = Line(
    "歌曲发布后评论的日期分布",
    title_pos='center',
    title_top='22',
    width=1000,
    height=600,
)
line.add(
    "",
    attr,
    v1,
    is_smooth=True,
    is_fill=True,
    area_color="#000",
    is_xaxislabel_align=True,
    xaxis_min="dataMin",
    area_opacity=0.3,
    mark_point=["max"],
    mark_point_symbol="pin",
    mark_point_symbolsize=55,
)
line.render("歌曲评论的日期分布.html")
# Load the freshly scraped hot-comment file.
# Reading it as pd.read_csv('hotComments_06.csv', index_col=0) failed with
# "ERROR: Buffer overflow caught" because single fields contain '\r'
# (carriage return). Passing lineterminator='\n' makes '\n' the only row
# separator and avoids the error.
df3 = pd.read_csv('hotComments.csv', index_col=0, lineterminator='\n')

# Print a summary of the raw data.
df3.info()

# drop_duplicates() and dropna() return new frames instead of modifying the
# caller, so the results have to be kept for the cleaning to take effect.
# .copy() gives an independent frame that can safely receive a new column.
df3 = df3.drop_duplicates().dropna().copy()

# Clean the comment text in one pass:
#  * '\r' is not part of the text (rows are terminated by '\n'), remove it;
#  * leading/trailing whitespace would make otherwise identical comments
#    compare as different, strip it.
df3['content1'] = df3['content'].str.replace('\r', '', regex=False).str.strip()

# Replace the raw column by the cleaned one. The cleaned text deliberately
# ends up as the LAST column of df4 (drop + rename rather than overwriting in
# place), because later code reads it by position.
df4 = df3.drop(['content'], axis=1)
df4.rename(columns={'content1': 'content'}, inplace=True)

# Ten most-liked comments.
df5 = df4.sort_values(by='likecount', ascending=False)
df5.head(10).to_csv('strat_TOP10.csv', index=False, encoding='utf-8-sig')

# Fifteen most frequently repeated comment texts.
df6 = (
    df4.groupby('content')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
# index=False: the row index is not exported.
df6.head(15).to_csv('hot_copy01.csv', index=False, encoding='utf-8-sig')

# Users ranked by their number of hot comments.
df10 = df4.groupby('userid').count().sort_values(by='content', ascending=False)

# The user id with the most hot comments was found to be 1313672474
# (hard-coded from an earlier inspection of df10).
df11 = df4[df4['userid'] == 1313672474]
# Like-count distribution of user 1313672474.
# plt.hist arguments: bins = number of bars, density=True = show a density
# instead of raw counts.
df12 = df11['likecount']
plt.hist(df12, bins=200, density=True)
plt.xlim(0, 6000)
plt.title('1313672474用户的点赞分布')
plt.savefig('start_1313672474.png', dpi=100)
plt.show()

# Comment-length distribution of the same user.
# Note: len(df11['content']) would return the length of the Series itself
# (133 when this was written), so len is applied to every cell instead.
df12 = df11['content'].apply(len)
plt.hist(df12, bins=20, density=True)
plt.title('1313672474用户的评论长度分布')
plt.savefig('len_1313672474.png', dpi=100)
plt.show()
# Keyword extraction with jieba's TextRank implementation.
# See the official documentation: https://github.com/fxsjy/jieba
segments = []
# The text is read from column position 5, exactly as before, but through
# .iloc: integer keys on a label-indexed row (row[5]) are deprecated in pandas.
# NOTE(review): position 5 is presumably the cleaned 'content' column — confirm
# against the CSV schema and consider selecting it by name.
for content in df4.iloc[:, 5]:
    words = jieba.analyse.textrank(
        content,
        topK=3,
        withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v'),
    )
    # Pair every extracted keyword with a 1 so occurrences can be counted later.
    segments.extend({'word': w, 'counts': 1} for w in words)

# Fix the column names so the frame (and the CSV header) keeps its 'word' and
# 'counts' columns even when no keyword was extracted.
df_w = pd.DataFrame(segments, columns=['word', 'counts'])
df_w.to_csv('jieba_01.csv', index=False, encoding='utf-8-sig')
# Build a word cloud with the wordcloud library.
# Join the extracted keywords into one space-separated string, the input
# format WordCloud.generate() is given here.
text = ' '.join(df_w['word'])

# chinese.jpeg is the mask image. It is opened with PIL and converted to a
# numpy array, which is then passed as WordCloud's `mask` argument.
mask_cir = np.array(Image.open('chinese.jpeg'))
wordc = WordCloud(
    background_color='white',
    mask=mask_cir,
    # A font containing Chinese glyphs is needed for Chinese words to render;
    # a downloaded SimHei.ttf would work as well.
    font_path='C:/Windows/Fonts/STCAIYUN.TTF',
    max_words=1000,
).generate(text)

plt.imshow(wordc)
plt.axis('off')  # hide the axes for a cleaner picture
# bbox_inches='tight' trims the surrounding white space.
# The JPEG quality goes through pil_kwargs: the former `quality=` keyword of
# savefig was removed from matplotlib and raises an error there.
plt.savefig('词云图.jpg', dpi=600, bbox_inches='tight', pil_kwargs={'quality': 95})
plt.show()
import pandas as pd
# Bar is the bar-chart class of pyecharts.
from pyecharts import Bar

# Count how often each extracted keyword occurs, most frequent first.
df = pd.read_csv('jieba_01.csv')
df2 = (
    df.groupby('word')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
print(df2)

# Chart data: the ten most frequent keywords and their counts.
top_rows = df2.head(10)
value = list(top_rows['count'])
attr = list(top_rows['word'])

# Create the bar chart with its main title.
bar = Bar("词评出现统计")
# add() supplies the data series and the per-series options.
bar.add(
    "次数",
    attr,
    value,
    mark_line=["average"],
    mark_point=["max", "min"],
)
# Print all configuration options of the chart.
bar.show_config()
# Write the chart to an HTML file; path and file name are free to choose.
bar.render('bar.html')
import pandas as pd
from pyecharts import Pie

# Read back the ten most-liked comments exported earlier.
df = pd.read_csv('strat_TOP10.csv')
print(df)

# Chart data: like counts as values, song names as labels.
# The label column is looked up under the key 'name\r' (with a trailing
# carriage return), exactly as the original code does.
value = list(df['likecount'])
attr = list(df['name\r'])

pie = Pie("歌名和点赞量对比", title_pos='left', width=1600)
pie.add("", attr, value, is_label_show=True)
# Print all configuration options of the chart.
pie.show_config()
pie.render('pie.html')
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Java
1
https://gitee.com/zyl512/practice.git
[email protected]:zyl512/practice.git
zyl512
practice
实习
master

搜索帮助