I want to implement a recurrent neural network for natural language inference. I'm new to this topic and this is a task from a university module, so I had some existing code that I tried to adapt for this task. The problem is that with my current implementation I get an accuracy of 0.5633 virtually every time. So far I've implemented this code:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam, SGD, Adagrad
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn.utils.rnn as rnn_utils
from sklearn.metrics import accuracy_score
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
s_words = set(stopwords.words('english'))  # stopwords as a set for fast membership tests
puncts = set(string.punctuation)
word2index = {}  # shared vocabulary; index 0 is reserved for padding
def data_preprocessing(fname):
    data = pd.read_csv(fname, sep='\t')
    # tokenize, lowercase and drop stopwords/punctuation for both sentences
    sentences_1 = [word_tokenize(s) for s in data['sentence1'].tolist()]
    sentences_1 = [[w.lower() for w in sent if w.lower() not in s_words and w.lower() not in puncts]
                   for sent in sentences_1]
    sentences_2 = [word_tokenize(s) for s in data['sentence2'].tolist()]
    sentences_2 = [[w.lower() for w in sent if w.lower() not in s_words and w.lower() not in puncts]
                   for sent in sentences_2]
    labels = data['label'].astype(int).tolist()
    # extend the shared vocabulary; new indices continue after the existing ones
    for s1, s2 in zip(sentences_1, sentences_2):
        for w in s1 + s2:
            if w not in word2index:
                word2index[w] = len(word2index) + 1
    return sentences_1, sentences_2, labels
class data(Dataset):
    def __init__(self, sent_1, sent_2, labels, padding):
        self.padding = padding  # fixed sequence length: shorter sentences are padded, longer ones truncated
        self.sent_1 = self._pad(sent_1)
        self.sent_2 = self._pad(sent_2)
        self.labels = labels

    def __len__(self):
        return len(self.sent_1)

    def __getitem__(self, idx):
        # map words to indices; unknown words and padding tokens map to 0
        s1 = [word2index.get(word, 0) for word in self.sent_1[idx]]
        s2 = [word2index.get(word, 0) for word in self.sent_2[idx]]
        label = self.labels[idx]
        return torch.tensor(s1), torch.tensor(s2), torch.tensor(label)

    def _pad(self, sentences):
        sents = []
        for sent in sentences:
            if len(sent) < self.padding:
                sent = sent + [0] * (self.padding - len(sent))  # avoid mutating the caller's list
            else:
                sent = sent[:self.padding]
            sents.append(sent)
        return sents
class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, emb_dim, num_layers):
        super().__init__()
        self.vocab_size = vocab_size + 1  # +1 for the padding index 0
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.emb = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.gru = nn.GRU(self.emb_dim, self.hidden_size, num_layers=self.num_layers,
                          batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(self.hidden_size * 2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, inp_seq1, inp_seq2):
        batch_size = inp_seq1.size(0)
        inp1 = self.emb(inp_seq1)
        inp2 = self.emb(inp_seq2)
        # random initial hidden state, shape (num_layers * num_directions, batch, hidden)
        h_0 = torch.rand(self.num_layers * 2, batch_size, self.hidden_size)
        output1, last_hidden1 = self.gru(inp1, h_0)
        output2, last_hidden2 = self.gru(inp2, h_0)
        # last_hidden[-1] is the top layer's final (backward-direction) state; concatenate both sentences
        out = torch.cat((last_hidden1[-1], last_hidden2[-1]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out
def validate(model, dev_loader):
    label_pred = []
    label_original = []
    with torch.no_grad():
        for X1, X2, y in dev_loader:
            out = model(X1, X2)
            label = torch.argmax(out, dim=1)  # predicted class per example
            label_pred.extend(label.numpy().tolist())
            label_original.extend(y.numpy().tolist())
    return accuracy_score(label_original, label_pred)
def train(model, train_loader, dev_loader):
    val_accuracy = 0
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.0001)
    epochs = 10
    for e in range(epochs):
        for X1, X2, y in tqdm(train_loader):
            if X1.shape[0] != y.shape[0]:  # skip batches where inputs and labels are mismatched
                continue
            out = model(X1, X2)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        acc = validate(model, dev_loader)
        print(f"accuracy at the end of epoch {e}: {acc}")
        # keep the checkpoint with the best dev accuracy
        if acc > val_accuracy:
            torch.save(model.state_dict(), 'rnn_model.pt')
            val_accuracy = acc
        model.train()
train_sentences_1, train_sentences_2, train_labels = data_preprocessing('WNLI/train.tsv')
test_sentences_1, test_sentences_2, test_labels = data_preprocessing('WNLI/test.tsv')
dev_sentences_1, dev_sentences_2, dev_labels = data_preprocessing('WNLI/dev.tsv')

# pad/truncate every sentence to the longest training sentence
max_len = max(len(s) for s in train_sentences_1 + train_sentences_2)
train_data = data(train_sentences_1, train_sentences_2, train_labels, padding=max_len)
test_data = data(test_sentences_1, test_sentences_2, test_labels, padding=max_len)
dev_data = data(dev_sentences_1, dev_sentences_2, dev_labels, padding=max_len)

# fixed-length padding means the default collate function can stack the tensors
train_loader = DataLoader(train_data, batch_size=32, drop_last=True)
test_loader = DataLoader(test_data, batch_size=32, drop_last=True)
dev_loader = DataLoader(dev_data, batch_size=32, drop_last=True)
vocab_size = len(word2index)
hidden_size = 1024
emb_dim = 250
num_layers = 3
model = RNN(vocab_size, hidden_size, emb_dim, num_layers)
train(model, train_loader, dev_loader)
So far I've tried different batch sizes, numbers of layers, numbers of epochs, hidden sizes, embedding dimensions and learning rates, but I still get the same accuracy or worse. So I don't really know where the problem is, and at this point I think I might be going in the wrong direction with my implementation.
From my experience, one thing that is currently very hot in natural language processing is self-attention. I recommend using MultiHeadAttention as the first layer after the input embedding. Study it a bit to get the parameters right for your problem, but it might give you the edge you are looking for.
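As a minimal sketch (not a tested drop-in for your model; the vocabulary size, sequence length, batch size and head count below are placeholders), this is roughly how PyTorch's nn.MultiheadAttention could be applied to the embedded tokens before they go into the recurrent layer:

import torch
import torch.nn as nn

emb_dim = 250                                       # same embedding size as in your model
emb = nn.Embedding(1000, emb_dim, padding_idx=0)    # vocab size 1000 is just for this example
attn = nn.MultiheadAttention(embed_dim=emb_dim, num_heads=5, batch_first=True)

tokens = torch.randint(0, 1000, (32, 20))           # (batch, seq_len) of word indices
x = emb(tokens)                                     # (batch, seq_len, emb_dim)

# query, key and value are all the same sequence, i.e. self-attention;
# key_padding_mask tells the layer to ignore padding positions (index 0)
attn_out, attn_weights = attn(x, x, x, key_padding_mask=(tokens == 0))
# attn_out has shape (batch, seq_len, emb_dim) and could be fed to the GRU instead of x

The idea is that each token representation is enriched with information from the other tokens in the sentence before the recurrent layer sees it.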
This article might help you to understand how it works for your problem.
Also, using a weighted cross-entropy loss function might help, considering that text usually favors certain words with a much greater frequency than others.
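If you try that, nn.CrossEntropyLoss accepts a per-class weight tensor. A minimal sketch, assuming the weights are derived from the label distribution of the training set (the label list below is made up for illustration):

import torch
import torch.nn as nn
from collections import Counter

train_labels = [0, 0, 0, 1, 0, 1, 0]   # placeholder; in your code this would come from data_preprocessing

counts = Counter(train_labels)
total = sum(counts.values())
# inverse-frequency weights: the rarer class contributes more to the loss
weights = torch.tensor([total / counts[c] for c in sorted(counts)], dtype=torch.float)

criterion = nn.CrossEntropyLoss(weight=weights)   # drop-in replacement for the unweighted criterion in train()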