1 Star 0 Fork 0

zhangxipeng/bert-bilstm-in-Sentiment-classification

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
data_clean.py 2.69 KB
一键复制 编辑 原始数据 按行查看 历史
zhangxipeng 提交于 2024-07-13 13:00 . init
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw labelled file and keep only the post text and its sentiment
# label, exposed under the short column names used by the rest of the script.
df = pd.read_csv('data/nCoV_100k_train.labled.csv')[
    ['微博中文内容', '情感倾向']
].rename(columns={'微博中文内容': 'text', '情感倾向': 'label'})
print(df)
# Class balance: absolute counts first, then each label as a percentage of
# all rows (rows with a missing label still count towards the denominator).
label_counts = df['label'].value_counts()
print(label_counts)
print(label_counts / len(df) * 100)
plt.figure(figsize=(8, 4))
sns.countplot(x='label', data=df)
plt.show()
# Label values treated as invalid and removed from the dataset.
# They are matched as strings, so this assumes the label column was parsed
# as text rather than as numbers -- TODO confirm against the raw file.
INVALID_LABELS = ['4', '-', '·', '-2', '10', '9']
df.drop(df[df['label'].isin(INVALID_LABELS)].index, inplace=True, axis=0)
df.reset_index(inplace=True, drop=True)
# Re-check the per-label distribution after the removal. Counting on the
# label column (not the whole frame) is what makes this a class-balance view.
print(df['label'].value_counts())
sns.countplot(x='label', data=df)
plt.show()
# Missing values per column before cleaning.
print(df.isna().sum())
# Discard every row that lacks a text or a label, then renumber the index.
df = df.dropna(axis=0, how='any').reset_index(drop=True)
# Missing values per column after cleaning.
print(df.isna().sum())
# Rows that repeat an earlier row exactly (same text and same label).
repeated_rows = df.duplicated()
print(repeated_rows.sum())
print(df[repeated_rows])
# Keep the first occurrence of each repeated row and renumber the index.
df = df.drop_duplicates().reset_index(drop=True)
print(df.duplicated().sum())
# Exact duplicate rows were removed above, so any text that still repeats
# appears with more than one label.
text_repeats = df['text'].duplicated()
print(text_repeats.sum())
print(df[text_repeats])
# Show up to two conflicting examples. They are picked from the duplicated
# texts themselves instead of fixed row positions, so this keeps working
# when the input data (and therefore the row numbering) changes.
for sample_text in df.loc[text_repeats, 'text'].unique()[:2]:
    print(df[df['text'] == sample_text])
# Drop the later occurrences of each repeated text.
# NOTE(review): the first occurrence is kept, so one of the conflicting
# labels survives for each such text; use duplicated(keep=False) instead if
# ambiguous texts should be removed entirely.
df.drop(df[text_repeats].index, axis=0, inplace=True)
df.reset_index(inplace=True, drop=True)
# checking
print(df['text'].duplicated().sum())  # 0
print(df)
# Final look at the cleaned frame: its last rows and its overall shape.
print("======data-clean======")
print(df.tail())
print(df.shape)
# Character length of every text in ascending order, so the longest texts
# are listed last.
text_lengths = df['text'].str.len()
print(text_lengths.sort_values())
# Split dataset. 0.6/0.2/0.2
import os  # local import: only needed to create the output directory below

# Fixed seed so that re-running the script reproduces the same split;
# otherwise rows move between train and test from one run to the next.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
# 0.25 of the remaining 80% is 20% of the full dataset.
train, val = train_test_split(train, test_size=0.25, random_state=SPLIT_SEED)
print(train.shape)
print(test.shape)
print(val.shape)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./data/clean', exist_ok=True)
# index=False: the row index is not written to the files.
train.to_csv('./data/clean/train.csv', index=False)
val.to_csv('./data/clean/val.csv', index=False)
test.to_csv('./data/clean/test.csv', index=False)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/linux-ape/bert-bilstm-in-sentiment-classification.git
[email protected]:linux-ape/bert-bilstm-in-sentiment-classification.git
linux-ape
bert-bilstm-in-sentiment-classification
bert-bilstm-in-Sentiment-classification
master

搜索帮助