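# NOTE: page-notice text captured with this file, not part of the script: "Code pull complete; the page will refresh automatically."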
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw CSV and keep only the post-text and sentiment-label columns,
# exposed under the short English names `text` and `label`.
column_names = {'微博中文内容': 'text', '情感倾向': 'label'}
df = pd.read_csv('data/nCoV_100k_train.labled.csv')[list(column_names)].rename(
    columns=column_names
)
print(df)
# Class balance: absolute count per label, then each label's share of all
# rows (in percent), followed by a bar chart of the same counts.
label_counts = df['label'].value_counts()
print(label_counts)
print(label_counts / len(df) * 100)
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='label')
plt.show()
# print(df_train[df_train.label > 5.0])
# print(df_train[(df_train.label < -1.1)])
# # discarding outliers
# df_train.drop(df_train[(df_train.label < -1.1) | (df_train.label > 5)].index, inplace=True, axis=0)
# df_train.reset_index(inplace=True, drop=True)
# print(df_train.label.value_counts())
# sns.countplot(x='label', data=df_train)
# plt.show()
# Discard every row whose label is one of these stray values. The labels are
# compared as strings, exactly as they are stored in the frame.
invalid_labels = ['4', '-', '·', '-2', '10', '9']
df = df[~df['label'].isin(invalid_labels)].reset_index(drop=True)
# Show the label distribution that remains after the cleanup (per-label
# counts, not counts of unique text/label rows), then plot it.
print(df['label'].value_counts())
sns.countplot(x='label', data=df)
plt.show()
# Report how many missing values each column holds
print(df.isna().sum())
# Remove every row containing at least one missing value, renumber the rows
# from zero, and confirm that nothing is missing any more
df = df.dropna(how='any').reset_index(drop=True)
print(df.isna().sum())
# Count and list the rows that repeat an earlier row in every column
repeated_rows = df.duplicated()
print(repeated_rows.sum())
print(df[repeated_rows])
# Keep only the first occurrence of each row, renumber the rows from zero,
# and verify that no exact duplicates remain
df = df.drop_duplicates().reset_index(drop=True)
print(df.duplicated().sum())
# Exact duplicates are handled separately; what is targeted here are rows whose
# text repeats an earlier row's text (i.e. the same text carrying another label).
text_dup_mask = df['text'].duplicated()
print(text_dup_mask.sum())
print(df[text_dup_mask])
# Viewing examples: show every row for up to two of the repeated texts.
# The examples are taken from the data itself rather than from fixed row
# positions, so this works for any dataset size (and prints nothing when
# there are no repeated texts).
for sample_text in df.loc[text_dup_mask, 'text'].unique()[:2]:
    print(df[df['text'] == sample_text])
# Remove the repeated texts and renumber the rows from zero.
# NOTE(review): duplicated() marks all occurrences except the first, so one
# row per text is kept with whichever label happened to come first. Use
# duplicated(keep=False) if conflicting texts should be removed entirely.
df = df[~text_dup_mask].reset_index(drop=True)
# checking
print(df['text'].duplicated().sum())  # 0
print(df)
# Final look at the cleaned frame: last rows (with their index values) and
# the overall (rows, columns) shape
print("======data-clean======")
last_rows = df.tail()
print(last_rows)
print(df.shape)
# Character length of every text, sorted ascending so the longest texts are
# listed at the end
text_lengths = df['text'].str.len()
print(text_lengths.sort_values())
# Split dataset 0.6/0.2/0.2: hold out 20% as the test set, then take 25% of
# the remaining 80% (= 20% of the total) as the validation set.
# A fixed seed makes the split reproducible from run to run.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.25, random_state=SPLIT_SEED)
print(train.shape)
print(test.shape)
print(val.shape)
# index=False keeps the shuffled row index out of the CSV files.
# NOTE: to_csv does not create directories; ./data/clean must already exist.
train.to_csv('./data/clean/train.csv', index=False)
val.to_csv('./data/clean/val.csv', index=False)
test.to_csv('./data/clean/test.csv', index=False)
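# NOTE: page-notice text captured with this file, not part of the script:
# "This section may contain content unsuitable for display and is not shown on the page. You can review and modify it using the editing features."
# "If you confirm the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, or content violating national laws and regulations, you can click submit to appeal and we will handle it as soon as possible."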