Text Sentiment Classification with a CNN Model (PyTorch)

Imports
import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
Basic configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = r"E:\data"
Pooling
# Global max pooling over the time (sequence) dimension
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()

    def forward(self, x):
        # x shape: [batch_size, channel, seq_len]
        # return shape: [batch_size, channel, 1]
        return F.max_pool1d(x, kernel_size=x.shape[2])
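As a quick illustration (not part of the original post), pooling a small dummy tensor keeps only the maximum value of each channel over the sequence dimension:

# dummy input: 1 example, 2 channels, 4 time steps
x = torch.tensor([[[1.0, 3.0, 2.0, 0.0],
                   [5.0, 4.0, 6.0, 1.0]]])
print(GlobalMaxPool1d()(x).shape)  # torch.Size([1, 2, 1]), with values 3. and 6.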
Read the data
# Read the IMDb dataset
def read_imdb(folder='train', data_root=r'E:\data\aclImdb'):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)  # build the path, e.g. E:\data\aclImdb\train\pos\
        for file in tqdm(os.listdir(folder_name)):  # os.listdir(folder_name) lists all file names under that path
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', ' ').lower()
                data.append([review, 1 if label == 'pos' else 0])  # store each review text together with its label
    random.shuffle(data)  # shuffle the order of the examples in data
    return data
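For reference (illustrative, not output from the original post): each element of the returned list is a [review_text, label] pair, and the IMDb training split contains 25,000 reviews, so a quick sanity check could look like this:

# sample = read_imdb('train', data_root=r'E:\data\aclImdb')
# print(len(sample))     # 25000
# print(sample[0][1])    # 1 for a positive review, 0 for a negative one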
Whitespace tokenization
# Tokenize on whitespace
def get_tokenized_imdb(data):
    '''
    :param data: list of [string, label]
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]  # read only the review text (not the label) and tokenize it
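For example (a minimal illustration, not from the original post), a single review is lowercased and split on spaces:

print(get_tokenized_imdb([["This movie is GREAT", 1]]))  # [['this', 'movie', 'is', 'great']]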
Create the vocabulary
# Create the vocabulary
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)  # tokenize the data with get_tokenized_imdb()
    counter = collections.Counter([tk for st in tokenized_data for tk in st])  # flatten every token of every sentence into one list;
    # collections.Counter() counts the occurrences of each distinct token
    return Vocab.Vocab(counter, min_freq=5)  # drop tokens that appear fewer than 5 times
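Once the vocabulary has been built from the training data (see below), it can be inspected like this (illustrative; the exact size depends on the data and the min_freq threshold):

# print(len(vocab))           # vocabulary size
# print(vocab.stoi['movie'])  # integer index assigned to the token 'movie'
# print(vocab.itos[100])      # token stored at index 100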
Process each example in data: convert words to indices and pad/truncate every example to the same length
def process_imdb(data, vocab):
    max_len = 500  # truncate or pad each review so that its length becomes 500
    def pad(x):
        # x[:max_len] keeps only the first max_len tokens;
        # x + [0]*(max_len - len(x)) pads with 0 up to max_len when the review is shorter
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))
    tokenized_data = get_tokenized_imdb(data)  # tokenize the data
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])  # map each word to its index in vocab
    labels = torch.tensor([score for _, score in data])
    return features, labels
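To make the padding/truncation concrete, here is how pad behaves (worked out by hand, with max_len shortened to 5 for readability) and the shapes the function ultimately returns for the 25,000 training reviews:

# with max_len = 5:
#   [3, 8, 2]             -> [3, 8, 2, 0, 0]   (shorter: padded with index 0)
#   [3, 8, 2, 7, 9, 4, 1] -> [3, 8, 2, 7, 9]   (longer: truncated)
# features, labels = process_imdb(train_data, vocab)
# print(features.shape)  # torch.Size([25000, 500])
# print(labels.shape)    # torch.Size([25000])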
Build the data iterators
# Build the data iterators
batch_size = 64
train_data = read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
vocab = get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*process_imdb(train_data, vocab))
test_set = Data.TensorDataset(*process_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
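A quick look at one batch from the training iterator (illustrative check, not from the original post):

# for X, y in train_iter:
#     print('X', X.shape, 'y', y.shape)  # X torch.Size([64, 500]) y torch.Size([64])
#     break
# print('#batches:', len(train_iter))    # 391 (= ceil(25000 / 64))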
Define the TextCNN model
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # embedding layer that does not participate in training (its weights stay fixed)
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # the max-over-time pooling layer has no weights, so a single instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # one 1-D convolution layer per kernel size
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        # concatenate the outputs of the two embedding layers, each of shape
        # (batch_size, seq_len, embed_size), along the word-vector dimension
        embeddings = torch.cat((
            self.embedding(inputs),
            self.constant_embedding(inputs)), dim=2)
        # Conv1d expects the channel dimension (here the word-vector dimension)
        # right after the batch dimension, so move it forward
        embeddings = embeddings.permute(0, 2, 1)
        # each 1-D convolution followed by max-over-time pooling yields a tensor of shape
        # (batch_size, out_channels, 1); squeeze the last dimension and concatenate along the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs
Create a TextCNN instance with 3 convolution layers, kernel widths 3, 4 and 5, and 100 output channels each
# Create a TextCNN instance with 3 convolution layers, kernel widths 3, 4 and 5, and 100 output channels each
embed_size, kernel_sizes, num_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, num_channels)
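Before training, a forward pass with a dummy batch of word indices confirms that the network maps (batch_size, 500) index tensors to (batch_size, 2) class scores (a sketch, not from the original post):

# dummy = torch.randint(0, len(vocab), (2, 500))  # 2 fake reviews of 500 word indices
# print(net(dummy).shape)                         # torch.Size([2, 2])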
Load the pretrained 100-dimensional GloVe word vectors and use them to initialize the two embedding layers, embedding and constant_embedding; the former is trained, while the latter keeps fixed weights.
# Load the pretrained 100-dimensional GloVe word vectors and initialize embedding and constant_embedding
glove_vocab = Vocab.GloVe(name='6B', dim=100,
                          cache=os.path.join(DATA_ROOT, "glove"))
# print(len(glove_vocab.stoi)) # 400000
# print(glove_vocab[0].shape)
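The GloVe object exposes a token-to-index map (stoi) and a matrix of vectors, which is exactly what load_pretrained_embedding below relies on (illustrative lookups):

# print(glove_vocab.vectors.shape)                            # torch.Size([400000, 100])
# print(glove_vocab.vectors[glove_vocab.stoi['movie']].shape) # torch.Size([100])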
Load the pretrained word vectors into an embedding matrix
# Load the pretrained word vectors
def load_pretrained_embedding(words, pretrained_vocab):
    '''extract the word vectors for words from the pretrained vocab'''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # pretrained_vocab.vectors[0].shape is torch.Size([100])
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]  # replace row i with the pretrained vector for this word
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False  # keep this embedding fixed during training
Train and evaluate the model
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
Evaluation function
# Evaluation function
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, nn.Module):
        # if no device was specified, use the device of net
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, nn.Module):
                net.eval()  # evaluation mode, which turns off dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # a custom model (not used after section 3.13; GPU not considered)
                if 'is_training' in net.__code__.co_varnames:  # if the model takes an is_training argument
                    # set is_training to False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
Training function
# Training function
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
Train the model
# Train the model
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
training on  cuda
epoch 1, loss 0.4792, train acc 0.763, test acc 0.844, time 7.9 sec
epoch 2, loss 0.1616, train acc 0.862, test acc 0.872, time 7.9 sec
epoch 3, loss 0.0682, train acc 0.918, test acc 0.881, time 7.9 sec
epoch 4, loss 0.0289, train acc 0.959, test acc 0.876, time 7.9 sec
epoch 5, loss 0.0131, train acc 0.978, test acc 0.868, time 7.9 sec
Prediction function
# Prediction function
def predict_sentiment(net, vocab, sentence):
    '''sentence is a list of words'''
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'
Predict
# Predict
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])  # positive
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])  # negative
Full code
import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = r"E:\data"
# 1-D cross-correlation
def corr1d(X, K):
    w = K.shape[0]
    Y = torch.zeros(X.shape[0] - w + 1)
    for i in range(Y.shape[0]):
        Y[i] = (X[i: i + w] * K).sum()
    return Y
# X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
# res = corr1d(X, K)
# print(res)
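Worked out by hand (not output from the original post): with X = [0, 1, 2, 3, 4, 5, 6] and K = [1, 2], each output element is Y[i] = X[i]*1 + X[i+1]*2, so:

# corr1d(X, K)  -> tensor([ 2.,  5.,  8., 11., 14., 17.])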
def corr1d_multi_in(X, K):
    # first traverse dimension 0 (the channel dimension) of X and K and compute the 1-D cross-correlation
    # for each channel, then stack the results and sum them along dimension 0
    return torch.stack([corr1d(x, k) for x, k in zip(X, K)]).sum(dim=0)
# X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
#              [1, 2, 3, 4, 5, 6, 7],
