PyTorch BCE loss not decreasing for word sense disambiguation task

I am performing word sense disambiguation and have created my own vocabulary of the top 300k most common English words. My model is very simple: each word in a sentence (i.e. its index in the vocabulary) is passed through an embedding layer, the resulting embeddings are averaged, and the averaged embedding is then sent through a linear layer, as shown in the model below.

import torch
import torch.nn as nn

class TestingClassifier(nn.Module):
  def __init__(self, vocabSize, features, embeddingDim):
      super(TestingClassifier, self).__init__()
      self.embeddings = nn.Embedding(vocabSize, embeddingDim)
      self.linear = nn.Linear(features, 2)
      self.sigmoid = nn.Sigmoid()

  def forward(self, inputs):
      embeds = self.embeddings(inputs)
      avged = torch.mean(embeds, dim=-1)
      output = self.linear(avged)
      output = self.sigmoid(output)
      return output
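
For reference, here is a quick shape check of the forward pass. The batch size of 32 and sequence length of 50 below are placeholder values, not taken from my data; note that for the shapes to line up as written, features has to equal the (padded) sentence length, since the mean is taken over the last dimension:

model = TestingClassifier(vocabSize=300_000, features=50, embeddingDim=24)
batch = torch.randint(0, 300_000, (32, 50))  # (batchSize, seqLen) of word indices
output = model(batch)                        # embeds (32, 50, 24) -> mean over dim=-1 (32, 50) -> linear (32, 2)
print(output.shape)                          # torch.Size([32, 2]), sigmoid outputs in [0, 1]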

I am using BCELoss as the loss function and SGD as the optimizer. My problem is that the loss barely decreases as training goes on, almost as if it has converged at a very high value. I have tried different learning rates (0.0001, 0.001, 0.01 and 0.1) but I get the same issue.
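
The loss and optimizer are set up along these lines (a sketch of my setup; model is the TestingClassifier instance from above and the exact learning rate varies between runs):

import torch.nn as nn
import torch.optim as optim

lossFunction = nn.BCELoss()                          # expects targets with the same shape as the (batch, 2) sigmoid output
optimizer = optim.SGD(model.parameters(), lr=0.001)  # also tried 0.0001, 0.01 and 0.1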

My training function is as follows:

from tqdm import tqdm

def train_model(model, 
                optimizer,
                lossFunction,
                batchSize, 
                epochs, 
                isRnnModel, 
                trainDataLoader, 
                validDataLoader, 
                earlyStop = False, 
                maxPatience = 1
):

  validationAcc = []
  patienceCounter = 0
  stopTraining = False
  model.train()

  # Train network
  for epoch in range(epochs):
    losses = []
    if(stopTraining):
      break

    for inputs, labels in tqdm(trainDataLoader, position=0, leave=True):

      optimizer.zero_grad()

      # Predict and calculate loss
      prediction = model(inputs)
      loss = lossFunction(prediction, labels)
      losses.append(loss)

      # Backward propagation
      loss.backward()

      # Readjust weights
      optimizer.step()

    print(sum(losses) / len(losses))
    curValidAcc = check_accuracy(validDataLoader, model, isRnnModel) # Check accuracy on validation set
    curTrainAcc = check_accuracy(trainDataLoader, model, isRnnModel)
    print("Epoch", epoch + 1, "Training accuracy", curTrainAcc, "Validation accuracy:", curValidAcc)

    # Control early stopping
    if(earlyStop):
      if(patienceCounter == 0):
        if(len(validationAcc) > 0 and curValidAcc < validationAcc[-1]):
          benchmark = validationAcc[-1]
          patienceCounter += 1
          print("Patience counter", patienceCounter)
      
      elif(patienceCounter == maxPatience):
        print("EARLY STOP. Patience level:", patienceCounter)
        stopTraining = True

      else:
        if(curValidAcc < benchmark):
          patienceCounter += 1
          print("Patience counter", patienceCounter)
        
        else:
          benchmark = curValidAcc
          patienceCounter = 0

      validationAcc.append(curValidAcc)
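
(check_accuracy is not shown above; a rough sketch of what it does is below. This is a hypothetical reconstruction, assuming the labels are one-hot float tensors of shape (batchSize, 2) to match the BCELoss targets.)

def check_accuracy(loader, model, isRnnModel):
  # Hypothetical sketch -- the real check_accuracy is not shown in the question.
  # Assumes labels are one-hot floats of shape (batchSize, 2); isRnnModel is ignored here.
  model.eval()
  correct, total = 0, 0
  with torch.no_grad():
    for inputs, labels in loader:
      predictions = model(inputs)  # (batchSize, 2) sigmoid outputs
      correct += (predictions.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
      total += labels.size(0)
  model.train()
  return correct / total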

Batch size is 32 (training set contains 8000 rows), vocabulary size is 300k, embedding dimension is 24. I have tried adding more linear layers to the network, but it makes no difference. The prediction accuracy on the training and validation sets stays at around 50% (which is horrible) even after many epochs of training. Any help is much appreciated!
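
For completeness, training is kicked off roughly like this (the dataset objects, epoch count and early-stopping settings below are placeholders, not values from my actual run):

from torch.utils.data import DataLoader

trainDataLoader = DataLoader(trainDataset, batch_size=32, shuffle=True)  # trainDataset has ~8000 rows
validDataLoader = DataLoader(validDataset, batch_size=32)

train_model(model,
            optimizer,
            lossFunction,
            batchSize=32,
            epochs=20,        # placeholder
            isRnnModel=False,
            trainDataLoader=trainDataLoader,
            validDataLoader=validDataLoader,
            earlyStop=True,   # placeholder
            maxPatience=2)    # placeholder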
