Text Sentiment Classification with a CNN Model (PyTorch)

Imports
import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
Basic configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = r"E:\data"
Pooling
# Global max pooling over the time (sequence) dimension
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()

    def forward(self, x):
        # x shape: [batch_size, channel, seq_len]
        # return shape: [batch_size, channel, 1]
        return F.max_pool1d(x, kernel_size=x.shape[2])
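As a quick illustration (not part of the original post), pooling a small dummy tensor keeps only the maximum value of each channel over the sequence dimension:

# dummy input: 1 example, 2 channels, 4 time steps
x = torch.tensor([[[1.0, 3.0, 2.0, 0.0],
                   [5.0, 4.0, 6.0, 1.0]]])
print(GlobalMaxPool1d()(x).shape)  # torch.Size([1, 2, 1]), with values 3. and 6.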
Read the data
# Read the IMDb dataset
def read_imdb(folder='train', data_root=r'E:\data\aclImdb'):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)  # build the path, e.g. E:\data\aclImdb\train\pos\
        for file in tqdm(os.listdir(folder_name)):  # os.listdir(folder_name) lists all file names under that path
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', ' ').lower()
                data.append([review, 1 if label == 'pos' else 0])  # store each review text together with its label
    random.shuffle(data)  # shuffle the order of the examples in data
    return data
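For reference (illustrative, not output from the original post): each element of the returned list is a [review_text, label] pair, and the IMDb training split contains 25,000 reviews, so a quick sanity check could look like this:

# sample = read_imdb('train', data_root=r'E:\data\aclImdb')
# print(len(sample))     # 25000
# print(sample[0][1])    # 1 for a positive review, 0 for a negative one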
Whitespace tokenization
# Tokenize on whitespace
def get_tokenized_imdb(data):
    '''
    :param data: list of [string, label]
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]  # read only the review text (not the label) and tokenize it
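For example (a minimal illustration, not from the original post), a single review is lowercased and split on spaces:

print(get_tokenized_imdb([["This movie is GREAT", 1]]))  # [['this', 'movie', 'is', 'great']]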
Create the vocabulary
# Create the vocabulary
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)  # tokenize the data with get_tokenized_imdb()
    counter = collections.Counter([tk for st in tokenized_data for tk in st])  # flatten every token of every sentence into one list;
    # collections.Counter() counts the occurrences of each distinct token
    return Vocab.Vocab(counter, min_freq=5)  # drop tokens that appear fewer than 5 times
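Once the vocabulary has been built from the training data (see below), it can be inspected like this (illustrative; the exact size depends on the data and the min_freq threshold):

# print(len(vocab))           # vocabulary size
# print(vocab.stoi['movie'])  # integer index assigned to the token 'movie'
# print(vocab.itos[100])      # token stored at index 100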
Process each example in data: convert words to indices and pad/truncate every example to the same length
def process_imdb(data, vocab):
    max_len = 500  # truncate or pad each review so that its length becomes 500
    def pad(x):
        # x[:max_len] keeps only the first max_len tokens;
        # x + [0]*(max_len - len(x)) pads with 0 up to max_len when the review is shorter
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))
    tokenized_data = get_tokenized_imdb(data)  # tokenize the data
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])  # map each word to its index in vocab
    labels = torch.tensor([score for _, score in data])
    return features, labels
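To make the padding/truncation concrete, here is how pad behaves (worked out by hand, with max_len shortened to 5 for readability) and the shapes the function ultimately returns for the 25,000 training reviews:

# with max_len = 5:
#   [3, 8, 2]             -> [3, 8, 2, 0, 0]   (shorter: padded with index 0)
#   [3, 8, 2, 7, 9, 4, 1] -> [3, 8, 2, 7, 9]   (longer: truncated)
# features, labels = process_imdb(train_data, vocab)
# print(features.shape)  # torch.Size([25000, 500])
# print(labels.shape)    # torch.Size([25000])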
Build the data iterators
# Build the data iterators
batch_size = 64
train_data = read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
vocab = get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*process_imdb(train_data, vocab))
test_set = Data.TensorDataset(*process_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
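A quick look at one batch from the training iterator (illustrative check, not from the original post):

# for X, y in train_iter:
#     print('X', X.shape, 'y', y.shape)  # X torch.Size([64, 500]) y torch.Size([64])
#     break
# print('#batches:', len(train_iter))    # 391 (= ceil(25000 / 64))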
Define the TextCNN model
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # embedding layer that does not participate in training (its weights stay fixed)
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # the max-over-time pooling layer has no weights, so a single instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # one 1-D convolution layer per kernel size
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        # concatenate the outputs of the two embedding layers, each of shape
        # (batch_size, seq_len, embed_size), along the word-vector dimension
        embeddings = torch.cat((
            self.embedding(inputs),
            self.constant_embedding(inputs)), dim=2)
        # Conv1d expects the channel dimension (here the word-vector dimension)
        # right after the batch dimension, so move it forward
        embeddings = embeddings.permute(0, 2, 1)
        # each 1-D convolution followed by max-over-time pooling yields a tensor of shape
        # (batch_size, out_channels, 1); squeeze the last dimension and concatenate along the channel dimension
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs
Create a TextCNN instance with 3 convolution layers, kernel widths 3, 4 and 5, and 100 output channels each
# Create a TextCNN instance with 3 convolution layers, kernel widths 3, 4 and 5, and 100 output channels each
embed_size, kernel_sizes, num_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, num_channels)
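Before training, a forward pass with a dummy batch of word indices confirms that the network maps (batch_size, 500) index tensors to (batch_size, 2) class scores (a sketch, not from the original post):

# dummy = torch.randint(0, len(vocab), (2, 500))  # 2 fake reviews of 500 word indices
# print(net(dummy).shape)                         # torch.Size([2, 2])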
Load the pretrained 100-dimensional GloVe word vectors and use them to initialize the two embedding layers, embedding and constant_embedding; the former is trained, while the latter keeps fixed weights.
# Load the pretrained 100-dimensional GloVe word vectors and initialize embedding and constant_embedding
glove_vocab = Vocab.GloVe(name='6B', dim=100,
                          cache=os.path.join(DATA_ROOT, "glove"))
# print(len(glove_vocab.stoi)) # 400000
# print(glove_vocab[0].shape)
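The GloVe object exposes a token-to-index map (stoi) and a matrix of vectors, which is exactly what load_pretrained_embedding below relies on (illustrative lookups):

# print(glove_vocab.vectors.shape)                            # torch.Size([400000, 100])
# print(glove_vocab.vectors[glove_vocab.stoi['movie']].shape) # torch.Size([100])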
Load the pretrained word vectors into an embedding matrix
# Load the pretrained word vectors
def load_pretrained_embedding(words, pretrained_vocab):
    '''extract the word vectors for words from the pretrained vocab'''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # pretrained_vocab.vectors[0].shape is torch.Size([100])
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]  # replace row i with the pretrained vector for this word
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False  # keep this embedding fixed during training
Train and evaluate the model
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
Evaluation function
# Evaluation function
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, nn.Module):
        # if no device was specified, use the device of net
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, nn.Module):
                net.eval()  # evaluation mode, which turns off dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # a custom model (not used after section 3.13; GPU not considered)
                if 'is_training' in net.__code__.co_varnames:  # if the model takes an is_training argument
                    # set is_training to False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
Training function
# Training function
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
Train the model
# Train the model
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
training on  cuda
epoch 1, loss 0.4792, train acc 0.763, test acc 0.844, time 7.9 sec
epoch 2, loss 0.1616, train acc 0.862, test acc 0.872, time 7.9 sec
epoch 3, loss 0.0682, train acc 0.918, test acc 0.881, time 7.9 sec
epoch 4, loss 0.0289, train acc 0.959, test acc 0.876, time 7.9 sec
epoch 5, loss 0.0131, train acc 0.978, test acc 0.868, time 7.9 sec
Prediction function
# Prediction function
def predict_sentiment(net, vocab, sentence):
    '''sentence is a list of words'''
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'
Predict
# Predict
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])  # positive
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])  # negative
Full code
import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = r"E:\data"
# 1-D cross-correlation
def corr1d(X, K):
    w = K.shape[0]
    Y = torch.zeros(X.shape[0] - w + 1)
    for i in range(Y.shape[0]):
        Y[i] = (X[i: i + w] * K).sum()
    return Y
# X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
# res = corr1d(X, K)
# print(res)
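Worked out by hand (not output from the original post): with X = [0, 1, 2, 3, 4, 5, 6] and K = [1, 2], each output element is Y[i] = X[i]*1 + X[i+1]*2, so:

# corr1d(X, K)  -> tensor([ 2.,  5.,  8., 11., 14., 17.])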
def corr1d_multi_in(X, K):
    # first traverse dimension 0 (the channel dimension) of X and K and compute the 1-D cross-correlation
    # for each channel, then stack the results and sum them along dimension 0
    return torch.stack([corr1d(x, k) for x, k in zip(X, K)]).sum(dim=0)
# X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
#              [1, 2, 3, 4, 5, 6, 7],
