
zhangxipeng/BiLSTM情感分类模型 (BiLSTM sentiment classification model)

This repository has not declared an open-source license (LICENSE); before using it, check the project description and the upstream dependencies of its code.
BiLSTM - 副本.py 5.15 KB
zhangxipeng committed on 2024-07-21 13:24 . init
import torch
import torch.nn as nn
import torch.optim as optim
import collections
import torch.utils.data as Data
import jieba
# Dataset: 9 sentences
# feature
seq = ["我喜欢你", "我恨你", "我今天很开心", "我最近很沮丧", "我很难过", "我讨厌你", "你非常的勤奋", "我特别懒惰", "我特别痛苦"]
# label
label = [1, 0, 1, 0, 0, 0, 1, 0, 0]
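# English gloss, in order: "I like you", "I hate you", "I'm very happy today",
# "I've been feeling depressed lately", "I'm very sad", "I despise you",
# "You are very diligent", "I'm especially lazy", "I'm in great pain"
# label: 1 = positive sentiment, 0 = negative sentiment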
# Tokenize each sentence with jieba
seq_cut = []
seq_cut_list = []
for i in seq:
    print("i = ", i)
    cut_res = list(jieba.cut(i))
    print("cut_res = ", cut_res)
    seq_cut = seq_cut + cut_res
    print("seq_cut = ", seq_cut)
    seq_cut_list.append(cut_res)
    print("seq_cut_list = ", seq_cut_list)
# seq_cut = ['我', '喜欢', '你', '我', '恨', '你', '我', '今天', '很', '开心', '我', '最近', '很', '沮丧', '我', '很', '难过', '我', '讨厌', '你', '你', '非常', '的', '勤奋', '我', '特别', '懒惰', '我', '特别', '痛苦']
# seq_cut_list = [['我', '喜欢', '你'], ['我', '恨', '你'], ['我', '今天', '很', '开心'], ['我', '最近', '很', '沮丧'], ['我', '很', '难过'], ['我', '讨厌', '你'], ['你', '非常', '的', '勤奋'], ['我', '特别', '懒惰'], ['我', '特别', '痛苦']]
# Count occurrences of each word, sorted by frequency in descending order
word2num = sorted(collections.Counter(seq_cut).items(), key=lambda item: item[1], reverse=True)
# word2num = [('我', 8), ('你', 4), ('很', 3), ('特别', 2), ('喜欢', 1), ('恨', 1), ('今天', 1), ('开心', 1), ('最近', 1), ('沮丧', 1), ('难过', 1), ('讨厌', 1), ('非常', 1), ('的', 1), ('勤奋', 1), ('懒惰', 1), ('痛苦', 1)]
# ('我', 8) means 我 occurred 8 times
print("word2num = ", word2num)
# Deduplicated word list (only printed; word2index below is built from word2num instead)
vocab = list(set(seq_cut))
print("vocab = ", vocab)
# vocab = ['痛苦', '很', '喜欢', '特别', '懒惰', '非常', '讨厌', '恨', '的', '开心', '我', '今天', '沮丧', '勤奋', '最近', '难过', '你']
# Map each word to an index, 1-based in descending frequency order; 0 is reserved for PAD
word2index = {w[0]: i+1 for i, w in enumerate(word2num)}
word2index["PAD"] = 0
# word2index = {'我': 1, '你': 2, '很': 3, '特别': 4, '喜欢': 5, '恨': 6, '今天': 7, '开心': 8, '最近': 9, '沮丧': 10, '难过': 11, '讨厌': 12, '非常': 13, '的': 14, '勤奋': 15, '懒惰': 16, '痛苦': 17, 'PAD': 0}
print("word2index = ", word2index)
# Vocabulary size (including PAD)
vocab_size = len(word2index)
# Number of samples
seq_size = len(seq)
# Maximum number of words in a sentence (the padded length)
seq_length = max([len(i) for i in seq_cut_list])
batch_size = 3        # only used by the commented-out DataLoader below
embedding_size = 3    # dimension of each word embedding
num_classes = 2       # positive / negative
n_hidden = 5          # hidden size of each LSTM direction
# Convert words to indices and pad every sentence to the same length
def make_data(seq, label):
    inputs = []
    for i in seq:
        # Map each word to its index, e.g. 我 -> 1
        seq_index = [word2index[word] for word in i]
        # Pad with 0 (PAD) so all sentences share the same length
        if len(seq_index) != seq_length:
            seq_index = seq_index + [0] * (seq_length - len(seq_index))
        inputs.append(seq_index)
    targets = [i for i in label]
    return inputs, targets
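# Sanity check: with the word2index shown above (我=1, 喜欢=5, 你=2) and
# seq_length = 4, the first sentence maps to a padded index list:
#     make_data([['我', '喜欢', '你']], [1])  ->  ([[1, 5, 2, 0]], [1])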
input_batch, target_batch = make_data(seq_cut_list, label)
# Convert the lists to LongTensors (torch.autograd.Variable is deprecated;
# plain tensors have carried autograd information since PyTorch 0.4)
input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)
# dataset = Data.TensorDataset(input_batch, target_batch)
# loader = Data.DataLoader(dataset, batch_size, shuffle=True)
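# If mini-batch training were desired, the two lines above could be enabled and the
# training loop below changed to iterate over the loader. A minimal sketch, not part
# of the original full-batch setup:
#     for epoch in range(5000):
#         for x, y in loader:
#             pred = model(x)
#             loss = criterion(pred, y)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()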
# Build the model
class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        # Embedding layer, weight shape (vocab_size, embedding_size)
        self.word_vec = nn.Embedding(vocab_size, embedding_size)
        # Single-layer bidirectional LSTM
        self.bilstm = nn.LSTM(embedding_size, n_hidden, 1, bidirectional=True)
        # Linear classifier, weight shape (n_hidden * 2, num_classes)
        self.fc = nn.Linear(n_hidden * 2, num_classes)

    def forward(self, input):
        embedding_input = self.word_vec(input)
        # Swap the first two dimensions: nn.LSTM expects (seq_len, batch, feature) by default
        embedding_input = embedding_input.permute(1, 0, 2)
        output, (h_n, c_n) = self.bilstm(embedding_input)
        # Concatenate the final hidden states of the forward and backward directions
        encoding1 = torch.cat([h_n[0], h_n[1]], dim=1)  # dim=1 concatenates along the feature axis
        # Alternative encoding (computed but unused): concatenate the outputs at the
        # first and last time steps
        encoding2 = torch.cat([output[0], output[-1]], dim=1)
        fc_out = self.fc(encoding1)
        return fc_out
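# Shape trace for the full training batch (9 sentences, seq_length = 4), assuming
# the hyperparameters defined above:
#   input                 (9, 4)      word indices
#   word_vec(input)       (9, 4, 3)   embedding_size = 3
#   permute(1, 0, 2)      (4, 9, 3)   (seq_len, batch, feature)
#   output                (4, 9, 10)  n_hidden * 2 = 10 because bidirectional=True
#   h_n                   (2, 9, 5)   (num_layers * num_directions, batch, n_hidden)
#   encoding1             (9, 10)     forward and backward final states concatenated
#   fc_out                (9, 2)      logits over num_classes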
def train():
    model = BiLSTM()
    print(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
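    # Note: nn.CrossEntropyLoss applies log-softmax internally, so the raw logits
    # from the final Linear layer are passed to it directly.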
    # Full-batch training: all 9 samples in every step
    for epoch in range(5000):
        pred = model(input_batch)
        loss = criterion(pred, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return model
def test(model):
    # Evaluate on a sentence that is not in the training set
    test_text = '我今天很痛苦'
    # Tokenize
    test_cut = list(jieba.cut(test_text))
    # Convert to padded indices (the dummy label is discarded)
    test_batch, _ = make_data([test_cut], [1])
    test_batch = torch.LongTensor(test_batch)
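    # Caveat: make_data raises KeyError for words outside the training vocabulary.
    # A defensive variant (an assumption, not in the original) would map unknown
    # words to the PAD index:
    #     seq_index = [word2index.get(word, 0) for word in i]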
    out = model(test_batch)
    # The predicted class is the index of the larger logit
    predict = torch.max(out, 1)[1]
    if predict.item() == 0:
        print(test_text, "is negative...")
    else:
        print(test_text, "is positive!!")
model = train()
test(model)
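# To persist the trained weights for later reuse (not in the original script;
# the filename is hypothetical):
#     torch.save(model.state_dict(), 'bilstm_sentiment.pt')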