1 Star 1 Fork 0

zhangxipeng/BiLSTM情感分类模型

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
BiLSTM.py 4.08 KB
一键复制 编辑 原始数据 按行查看 历史
zhangxipeng 提交于 2024-07-21 13:24 . init
import torch
import torch.nn as nn
import torch.optim as optim
import collections
import torch.utils.data as Data
from torch.autograd import Variable
import jieba
# 1. Dataset: 9 samples in total.
# Features: one raw sentence per sample.
seq = [
    "我喜欢你",
    "我恨你",
    "我今天很开心",
    "我最近很沮丧",
    "我很难过",
    "我讨厌你",
    "你非常的勤奋",
    "我特别懒惰",
    "我特别痛苦",
]
# Labels, aligned by position with `seq` (test() reports class 0 as "Bad"
# and any other class as "Good").
label = [1, 0, 1, 0, 0, 0, 1, 0, 0]
# Tokenization
def tokenize(seq):
    """Segment every sentence with jieba and build a frequency-ranked vocabulary.

    Args:
        seq: Iterable of raw sentences (strings).

    Returns:
        A 4-tuple ``(word2index, seq_cut_list, seq_length, vocab_size)``:

        * ``word2index`` -- dict mapping each token to an integer id. Ids
          start at 1 and follow descending corpus frequency (ties keep
          first-seen order); the key ``"PAD"`` is mapped to 0.
        * ``seq_cut_list`` -- list of token lists, one per input sentence.
        * ``seq_length`` -- token count of the longest sentence, or 0 when
          ``seq`` is empty.
        * ``vocab_size`` -- ``len(word2index)`` (includes the ``"PAD"`` entry).

    Note:
        A corpus token that is literally ``"PAD"`` is overwritten by the
        padding entry and therefore ends up with id 0.
    """
    seq_cut_list = []
    word_counts = collections.Counter()
    for sentence in seq:
        tokens = list(jieba.cut(sentence))
        # Count incrementally instead of re-building one ever-growing list
        # on every iteration.
        word_counts.update(tokens)
        seq_cut_list.append(tokens)

    # most_common() sorts by count, descending, e.g. ('我', 8) means the
    # token appeared 8 times; the most frequent token gets id 1.
    word2index = {
        word: rank
        for rank, (word, _) in enumerate(word_counts.most_common(), start=1)
    }
    word2index["PAD"] = 0

    # Vocabulary size, including the padding entry.
    vocab_size = len(word2index)
    # Longest sentence length, used later to right-pad shorter sentences.
    # default=0 keeps an empty corpus from raising ValueError.
    seq_length = max((len(tokens) for tokens in seq_cut_list), default=0)
    return word2index, seq_cut_list, seq_length, vocab_size
# Convert tokens to integer ids
def convert_token_to_id(seq, label, vocab=None, max_len=None):
    """Turn tokenized sentences and their labels into LongTensors.

    Args:
        seq: Iterable of token lists (one list per sentence).
        label: Iterable of integer class labels.
        vocab: Optional token -> id mapping. Defaults to the module-level
            ``word2index``.
        max_len: Optional length that shorter sentences are right-padded to
            with id 0. Defaults to the module-level ``seq_length``.
            Sentences longer than ``max_len`` are left as they are.

    Returns:
        ``(input_batch, target_batch)`` -- two ``torch.LongTensor`` objects
        built from the padded id lists and from ``label``.

    Raises:
        KeyError: If a token is not present in ``vocab``.
    """
    if vocab is None:
        vocab = word2index
    if max_len is None:
        max_len = seq_length

    inputs = []
    for tokens in seq:
        # Map each token to its id, e.g. '我' -> 1.
        ids = [vocab[word] for word in tokens]
        # Right-pad with 0 so sentences share one length; a non-positive
        # repeat count yields an empty list, so long sentences are untouched.
        ids += [0] * (max_len - len(ids))
        inputs.append(ids)

    targets = list(label)
    # Plain tensors: the autograd Variable wrapper is deprecated and simply
    # returns the tensor it is given.
    return torch.LongTensor(inputs), torch.LongTensor(targets)
# Build the vocabulary and the tokenized corpus once, at module level.
# convert_token_to_id reads `word2index` and `seq_length` as globals, so
# this line has to run before that function is called.
word2index, seq_cut_list, seq_length, vocab_size = tokenize(seq)
# Convert the token lists and labels into tensors.
input_batch, target_batch = convert_token_to_id(seq_cut_list, label)
# Hyperparameters.
# NOTE(review): batch_size is not referenced anywhere else in the visible
# code; train() feeds the whole dataset in a single batch.
batch_size = 3
# Dimension of each word embedding vector (also the LSTM input size).
embedding_size = 3
# Number of output classes produced by the final linear layer.
num_classes = 2
# Hidden size of the LSTM, per direction.
n_hidden = 5
# BiLSTM model
class BiLSTM(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> linear classifier.

    The sentence representation is the concatenation of the final hidden
    states of the forward and backward directions.

    Args:
        vocab: Number of embedding rows. Defaults to the module-level
            ``vocab_size``.
        embed_dim: Embedding dimension. Defaults to the module-level
            ``embedding_size``.
        hidden: LSTM hidden size per direction. Defaults to the
            module-level ``n_hidden``.
        classes: Number of output classes. Defaults to the module-level
            ``num_classes``.
    """

    def __init__(self, vocab=None, embed_dim=None, hidden=None, classes=None):
        super().__init__()
        # Fall back to the module-level hyperparameters so that the
        # zero-argument call BiLSTM() keeps working.
        vocab = vocab_size if vocab is None else vocab
        embed_dim = embedding_size if embed_dim is None else embed_dim
        hidden = n_hidden if hidden is None else hidden
        classes = num_classes if classes is None else classes

        # Embedding table, weight shape (vocab, embed_dim).
        self.word_vec = nn.Embedding(vocab, embed_dim)
        # One-layer bidirectional LSTM.
        self.bilstm = nn.LSTM(embed_dim, hidden, 1, bidirectional=True)
        # Linear layer, weight shape (classes, hidden * 2): both directions
        # are concatenated before classification.
        self.fc = nn.Linear(hidden * 2, classes)

    def forward(self, input):
        """Compute class logits for a batch of token ids.

        Args:
            input: LongTensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of logits, shape (batch, classes).
        """
        # (batch, seq_len) -> (batch, seq_len, embed_dim)
        embedded = self.word_vec(input)
        # The LSTM is not batch_first, so swap to (seq_len, batch, embed_dim).
        embedded = embedded.permute(1, 0, 2)
        # Only the final hidden states are needed; the per-step outputs and
        # the cell state are discarded.
        _, (h_n, _) = self.bilstm(embedded)
        # h_n[0] is the forward direction's final state, h_n[1] the backward
        # direction's; concatenate them along the feature axis.
        sentence_vec = torch.cat([h_n[0], h_n[1]], dim=1)
        return self.fc(sentence_vec)
def train():
    """Fit a fresh BiLSTM on the module-level batch and save it to disk.

    Runs 5000 full-batch Adam steps (lr=0.001) with cross-entropy loss on
    ``input_batch`` / ``target_batch``, printing the loss every 1000 steps,
    then pickles the whole model to ``./BiLSTM.pt``.

    Returns:
        The trained model.
    """
    net = BiLSTM()
    print(net)
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    adam = optim.Adam(net.parameters(), lr=0.001)
    # Training loop; `step` is 1-based so it can be printed directly.
    for step in range(1, 5001):
        logits = net(input_batch)
        loss = loss_fn(logits, target_batch)
        if step % 1000 == 0:
            print('Epoch:', f'{step:04d}', 'cost =', f'{loss:.6f}')
        adam.zero_grad()
        loss.backward()
        adam.step()
    torch.save(net, "./BiLSTM.pt")
    return net
def test():
    """Load the saved model and print the predicted sentiment of one sentence.

    Reads ``./BiLSTM.pt`` (written by ``train``), tokenizes a fixed test
    sentence with jieba, converts it with ``convert_token_to_id`` and prints
    whether the predicted class is 0 ("Bad") or anything else ("Good").

    Raises:
        KeyError: Propagated from ``convert_token_to_id`` when the test
            sentence contains a token that is not in the vocabulary.
    """
    # Load the model. The checkpoint is a fully pickled nn.Module, so
    # weights_only must be False: PyTorch >= 2.6 defaults it to True and
    # would refuse to unpickle the module.
    # NOTE(security): unpickling executes arbitrary code -- only load
    # checkpoints produced by this script, never untrusted files.
    # NOTE(review): the weights_only keyword needs PyTorch >= 1.13.
    model = torch.load("./BiLSTM.pt", weights_only=False)
    # Switch to inference mode.
    model.eval()
    # Test sentence
    test_text = '我今天很痛苦'
    # Tokenize
    test_cut = list(jieba.cut(test_text))
    # Convert to ids; the label passed here is a placeholder and is ignored.
    test_batch, _ = convert_token_to_id([test_cut], [1])
    # No gradients are needed for prediction.
    with torch.no_grad():
        out = model(test_batch)
    # Index of the highest logit along the class dimension.
    predict = torch.max(out, 1)[1]
    if predict.item() == 0:
        print(test_text, "is Bad Mean...")
    else:
        print(test_text, "is Good Mean!!")
# Script entry: train the model (this also writes ./BiLSTM.pt), then run the
# single-sentence prediction, which reloads the checkpoint from disk.
model = train()
test()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/linux-ape/BiLSTM.git
[email protected]:linux-ape/BiLSTM.git
linux-ape
BiLSTM
BiLSTM情感分类模型
master

搜索帮助