How can I improve my recurrent neural network?

87 views Asked by At

I want to implement a recurrent neural network for natural language inference. I'm new to this topic, and this is a task from a module at my university, so I had some code beforehand which I tried to adapt for this task. The problem is that with my current implementation I get an accuracy of 0.5633 almost every time. So far I've implemented this code.

import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam, SGD, Adagrad
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn.utils.rnn as rnn_utils
from sklearn.metrics import accuracy_score

from nltk import word_tokenize
from nltk.corpus import stopwords
import string 

# Vocabulary mapping (token -> integer index); its size is later used as the
# model's vocab_size.
word2index_combined = dict()

# Token filters consulted while preprocessing the sentences.
puncts = string.punctuation
s_words = stopwords.words('english')

def data_preprocessing(fname):
    """Load a tab-separated sentence-pair file and extend the vocabulary.

    Reads the ``sentence1``, ``sentence2`` and ``label`` columns of *fname*,
    tokenizes and lower-cases both sentences, drops English stopwords and
    punctuation-only tokens, and registers every remaining token in the
    module-level ``word2index_combined`` mapping.

    Index 0 is never assigned to a word, so it stays free for padding and
    unknown tokens.

    Args:
        fname: Path of the TSV file to read.

    Returns:
        A ``(sentences_1, sentences_2, labels)`` tuple: two parallel lists of
        token lists and the list of integer labels.
    """
    frame = pd.read_csv(fname, sep='\t')
    # Set membership is O(1); the stopword list would be scanned per token.
    stop_words = set(s_words)

    def clean(sentence):
        lowered = (token.lower() for token in word_tokenize(sentence))
        return [
            token for token in lowered
            if token not in stop_words
            # `token in puncts` would be a substring test against the
            # punctuation string and would let tokens such as "''" or "..."
            # through, so check character by character instead.
            and not all(char in puncts for char in token)
        ]

    sentences_1 = [clean(sentence) for sentence in frame['sentence1'].tolist()]
    sentences_2 = [clean(sentence) for sentence in frame['sentence2'].tolist()]
    labels = frame['label'].astype(int).tolist()

    # Number new words after the ones already registered. Restarting at 1 on
    # every call would hand indices that are already taken to words first seen
    # in a later file. This relies on the mapping being filled only here, so
    # that its indices are exactly 1..len(mapping).
    for first, second in zip(sentences_1, sentences_2):
        for token in first + second:
            if token not in word2index_combined:
                word2index_combined[token] = len(word2index_combined) + 1
    return sentences_1, sentences_2, labels

class data(Dataset):
    """Dataset of fixed-length sentence pairs encoded as vocabulary indices.

    Both sentence lists are padded or truncated to ``padding`` tokens when the
    dataset is built; tokens are mapped to indices lazily in ``__getitem__``
    through the module-level ``word2index_combined`` mapping.

    Args:
        sent_1: List of token lists (first sentence of each pair).
        sent_2: List of token lists (second sentence of each pair).
        labels: List of integer labels, parallel to the sentence lists.
        padding: Number of tokens every sentence is padded/truncated to.
    """

    def __init__(self, sent_1, sent_2, labels, padding):
        self.padding = padding
        self.sent_1 = self._pad(sent_1)
        self.sent_2 = self._pad(sent_2)
        self.labels = labels

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.sent_1)

    def __getitem__(self, idx):
        """Return ``(sentence1_ids, sentence2_ids, label)`` tensors for *idx*.

        Words missing from the vocabulary, and the padding marker ``0``
        inserted by ``_pad``, both map to index 0.
        """
        s1 = [word2index_combined.get(word, 0) for word in self.sent_1[idx]]
        s2 = [word2index_combined.get(word, 0) for word in self.sent_2[idx]]
        label = self.labels[idx]
        return torch.tensor(s1), torch.tensor(s2), torch.tensor(label)

    def _pad(self, sentences):
        """Return copies of *sentences*, each exactly ``self.padding`` long.

        Shorter sentences are filled up with the marker ``0``; longer ones are
        cut. The input lists are left untouched.
        """
        padded = []
        for sent in sentences:
            # Slicing copies, so the caller's token list is never extended.
            fixed = list(sent[:self.padding])
            fixed.extend([0] * (self.padding - len(fixed)))
            padded.append(fixed)
        return padded
        

class RNN(nn.Module):
    """Sentence-pair classifier built on a shared bidirectional GRU encoder.

    Both sentences are embedded with the same embedding table, encoded by the
    same GRU, and their final hidden states are concatenated and passed
    through a two-layer feed-forward head that produces two class logits.

    Args:
        vocab_size: Number of words in the vocabulary; one extra embedding row
            is added for the padding index 0.
        hidden_size: Size of the GRU hidden state and of the hidden
            feed-forward layer.
        emb_dim: Dimension of the word embeddings.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, vocab_size, hidden_size, emb_dim, num_layers):
        super().__init__()
        self.vocab_size = vocab_size+1
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        self.emb = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        # The attribute is a GRU despite its name; the name is kept so that
        # existing state_dict checkpoints still load.
        self.lstm = nn.GRU(self.emb_dim, self.hidden_size, num_layers=self.num_layers, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(self.hidden_size*2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, inp_seq1, inp_seq2):
        """Return class logits of shape ``(batch, 2)`` for a batch of pairs.

        Args:
            inp_seq1: Integer tensor of word indices, ``(batch, seq_len)``.
            inp_seq2: Integer tensor of word indices, ``(batch, seq_len)``.
        """
        inp1 = self.emb(inp_seq1)
        inp2 = self.emb(inp_seq2)
        # No explicit initial state: the GRU then starts from zeros on the
        # inputs' device. A freshly drawn random h_0 per call made the output
        # for identical inputs change from call to call, even in eval mode.
        _, last_hidden1 = self.lstm(inp1)
        _, last_hidden2 = self.lstm(inp2)
        # last_hidden[-1] is the final state of the top layer's backward
        # direction.
        out = torch.cat((last_hidden1[-1], last_hidden2[-1]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

def validate(model, dev_loader):
    """Return the accuracy of *model* over all batches of *dev_loader*.

    Each batch is expected to unpack into ``(X1, X2, y)``; the predicted class
    is the arg-max of the model output along dimension 1. Gradients are
    disabled while predicting.
    """
    predictions, targets = [], []
    with torch.no_grad():
        for first, second, gold in dev_loader:
            logits = model(first, second)
            predicted = logits.argmax(dim=1)
            predictions += predicted.numpy().tolist()
            targets += gold.numpy().tolist()

    return accuracy_score(targets, predictions)

def train(model, train_loader, dev_loader, epochs=10, lr=0.0001,
          checkpoint_path='rnn_model.pt'):
    """Train *model* with Adam and cross-entropy, validating after each epoch.

    After every epoch the model is evaluated with ``validate`` on
    *dev_loader*; whenever the accuracy improves on the best value seen so
    far, the model's ``state_dict`` is saved to *checkpoint_path*.

    Args:
        model: Module called as ``model(X1, X2)`` that returns class logits.
        train_loader: Iterable of ``(X1, X2, y)`` training batches.
        dev_loader: Iterable of ``(X1, X2, y)`` validation batches.
        epochs: Number of passes over *train_loader*.
        lr: Learning rate handed to Adam.
        checkpoint_path: File the best ``state_dict`` is written to.

    Returns:
        The best validation accuracy observed (0 if no epoch improved on it).
    """
    best_accuracy = 0
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=lr)
    # Make sure the first epoch also runs in training mode, even if the caller
    # passed in a model that was left in eval mode.
    model.train()
    for e in range(epochs):
        for X1, X2, y in tqdm(train_loader):
            # Skip batches whose inputs and labels disagree on batch size.
            if X1.shape[0] != y.shape[0]:
                continue
            optimizer.zero_grad()
            loss = criterion(model(X1, X2), y)
            loss.backward()
            optimizer.step()

        model.eval()
        acc = validate(model, dev_loader)
        print(f"accuracy at the end of epoch - {e}: {acc}")
        if acc > best_accuracy:
            torch.save(model.state_dict(), checkpoint_path)
            best_accuracy = acc
        model.train()
    return best_accuracy

# Every sentence is padded or truncated to this many tokens.
max_len = 40

# data_preprocessing returns three values: both sentence lists and the labels.
train_sent_1, train_sent_2, train_labels = data_preprocessing('WNLI/train.tsv')
# NOTE(review): the publicly distributed GLUE WNLI test split is usually
# unlabeled - confirm that this file has a 'label' column.
test_sent_1, test_sent_2, test_labels = data_preprocessing('WNLI/test.tsv')
dev_sent_1, dev_sent_2, dev_labels = data_preprocessing('WNLI/dev.tsv')

train_data = data(train_sent_1, train_sent_2, train_labels, max_len)
test_data = data(test_sent_1, test_sent_2, test_labels, max_len)
dev_data = data(dev_sent_1, dev_sent_2, dev_labels, max_len)

# Every item is a tuple of equally sized tensors, so the DataLoader's default
# collation is enough. Only the training loader shuffles and drops the last
# incomplete batch; evaluation must see every sample.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=32)
dev_loader = DataLoader(dev_data, batch_size=32)

vocab_size = len(word2index_combined)
hidden_size = 1024
emb_dim = 250
num_layers = 3
model = RNN(vocab_size, hidden_size, emb_dim, num_layers)
train(model, train_loader, dev_loader)

So far I've tried different batch sizes, numbers of layers, numbers of epochs, hidden sizes, embedding dimensions and learning rates, but I still get the same accuracy or worse. So I don't really know where the problem could be, and at this point I think I may be going in the wrong direction with my implementation.

1

There is 1 answer

2
Hugofac On

From my experience, one thing that is currently very hot in natural language processing is using self-attention. I recommend using MultiHeadAttention as the first layer after input. Study it a bit to get the parameters right for your problem but it might give you the edge you seek.

This article might help you to understand how it works for your problem.

Also, using a weighted cross-entropy loss function might help, considering how text usually favors certain words with a greater frequency than others.