PyTorch model not learning - Loss remains unchanged


I'm currently working on a recommendation system using PyTorch and a DeepFM model architecture. Despite applying proper weight initialization and following common troubleshooting steps, I'm facing an issue where the model is not learning, and the loss remains unchanged during training.

The data I'm using is the MovieLens dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip. I used scikit-learn's OneHotEncoder for userId and movieId (the categorical features), min-max normalization for the timestamp, and for the ratings I simply scale them so the values lie between 0 and 1:

import torch
from sklearn.preprocessing import OneHotEncoder

# Categorical data: one column per unique userId / movieId
categorical_features = ["userId", "movieId"]
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_features = torch.from_numpy(encoder.fit_transform(data[categorical_features])).type(torch.float)

# Dense data: min-max normalize the timestamp into [0, 1]
data["timestamp"] = (data["timestamp"] - data["timestamp"].min()) / (data["timestamp"].max() - data["timestamp"].min())
encoded_dense_features = torch.tensor(data["timestamp"].values).type(torch.float).unsqueeze(1)

# Output: ratings are 0.5-5.0, so multiplying by 0.2 maps them into (0, 1]
data["rating"] = data["rating"] * 0.2
y = torch.tensor(data["rating"].values).type(torch.float)

After concatenating with concatenated_linear_input = torch.cat((encoded_categorical_features, encoded_dense_features), dim=1) I get something like this (I only kept the first 10000 samples):

concatenated_linear_input[0]     # tensor([1.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.1907])
concatenated_linear_input.shape  # torch.Size([10000, 3285])
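
As a quick sanity check on that shape (just a sketch, assuming data here is the same 10000-sample subset the encoder was fit on), the width should be the number of distinct userIds plus the number of distinct movieIds plus the single timestamp column:

# Quick width check: the one-hot block for userId plus the one for movieId
# plus the one dense timestamp column should add up to the 3285 features above.
n_user_cols = data["userId"].nunique()
n_movie_cols = data["movieId"].nunique()
print(n_user_cols + n_movie_cols + 1)      # expected 3285
print(concatenated_linear_input.shape[1])  # actual   3285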

The model follows the DeepFM (Factorization Machine + deep network) architecture:

import torch
from torch import nn
from sklearn.decomposition import PCA
import pdb

device = "cuda" if torch.cuda.is_available() else "cpu"

class DeepFM(nn.Module):
  def __init__(self, input_dim, split_numbers, embedding_dimension, device=device, n_components = 64, hidden_units = 10):
    super().__init__()

    self.input_dim = input_dim
    self.embedding_dimension = embedding_dimension
    self.split_numbers = split_numbers
    self.embeddings = nn.ModuleList([
        nn.Embedding(self.split_numbers[i], embedding_dimension) for i in range(len(split_numbers)-1)
    ]).to(device)

    self.classifier = nn.Sequential(
        nn.Dropout(0.2),
        nn.Linear(embedding_dimension, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
        nn.Sigmoid(),  # Add Sigmoid activation for the desired output range
    ).to(device)

    self.output_unit = nn.Sigmoid().to(device)

    self.regressor = nn.Linear(input_dim, 1).to(device)

  def forward(self, X):

    ### FM Layer ###
    # Linear
    regression = self.regressor(X).squeeze()
    # Cross Feature interactions
    tensors = torch.split(X, self.split_numbers, dim=1)
    tensors = [tensor.requires_grad_(True) for tensor in tensors]
    embeddings = [self.embeddings[i](tensors[i].long()) for i in range(len(tensors)-1)]
    cross_feature = torch.sum(torch.matmul(embeddings[0],embeddings[1].transpose(1,2)), dim=(1,2))

    ### Deep Layer ###
    deep_concat = torch.cat((embeddings[0], embeddings[1], tensors[2].unsqueeze(2).expand(tensors[2].size(0),tensors[2].size(1), self.embedding_dimension)), dim=1)

    deep_reduced = torch.mean(deep_concat, dim=1)

    #print(deep_reduced.shape)
    #pdb.set_trace()
    #reduced_input = torch.from_numpy(self.pca.transform(flattened_input.to("cpu").detach())).to(device).type(torch.float).requires_grad_(True)
    deep_output = self.classifier(deep_reduced).squeeze()

    #print(list(self.classifier.parameters()))

    #print(regression.requires_grad and cross_feature.requires_grad and deep_output.requires_grad)

    ## Final Output ##
    return self.output_unit(regression + cross_feature + deep_output)

Then comes a standard train-test split and the training loop, but apparently the model isn't updating its weights:

### Split ###
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    concatenated_linear_input, y, test_size=0.2, random_state=44, shuffle=True)

# Device-agnostic code
X_train, X_test, y_train, y_test = X_train.to(device), X_test.to(device), y_train.to(device), y_test.to(device)

### Model ###

import copy
from torch import optim

# Model creation
linear_input_shape = concatenated_linear_input.shape[1]  # 3285 input features
model = DeepFM(input_dim = linear_input_shape, split_numbers = [len(data["userId"].unique()), len(data["movieId"].unique()), 1], embedding_dimension = 10, hidden_units = 10)

# Embedding weights initialization
def init_weights(m):
    if isinstance(m, nn.Embedding):
        # Initialize the embedding layer using Xavier/Glorot initialization
        nn.init.xavier_uniform_(m.weight)
model.apply(init_weights)

# Loss function and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.RMSprop(params= model.parameters(), lr=0.01)

# Train the model
num_epochs = 10
batch_size = 32
num_batches = len(X_train) // batch_size

# Snapshot of the initial weights, for checking later whether the model is learning
model1 = copy.deepcopy(model)

for epoch in range(num_epochs):
    running_loss = 0.0

    # Mini-batch training
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size

        X = X_train[start_idx:end_idx]
        y_tensor = y_train[start_idx:end_idx]

        # Forward pass
        outputs = model(X.requires_grad_(True))
        loss = loss_fn(outputs, y_tensor)
        #loss = loss_fn(y_tensor, outputs)

        # Zero the gradients
        optimizer.zero_grad()

        # Backward pass and optimization
        loss.backward(retain_graph=True)

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)  # Clip gradients with a max_norm value

        # Optimizer step
        optimizer.step()

        # Accumulate the batch loss (detached, so the graph isn't kept around)
        running_loss += loss.item()

    # Print epoch loss
    epoch_loss = running_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Not learning :c
parameters_equal = all(
    torch.all(p1.data == p2.data)
    for p1, p2 in zip(model1.parameters(), model.parameters())
)
print(parameters_equal)

I tried multiple different strategies:

  • Smaller/greater sample sizes
  • debugging techniques (e.g., the gradient-norm check sketched after this list)
  • different optimizers and learning rates
  • different loss functions
  • different batch_size, embedding_dim and other hyperparameters
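
For the second point, this is the kind of minimal gradient-flow check I mean (a sketch reusing model, loss_fn and one mini-batch from the setup above): run a single forward/backward pass and print each parameter's gradient norm; a gradient of None or essentially zero means that parameter receives no learning signal.

# Minimal gradient-flow check: one forward/backward pass, then inspect
# the gradient norm of every parameter in the model.
model.zero_grad()
outputs = model(X_train[:batch_size])
loss = loss_fn(outputs, y_train[:batch_size])
loss.backward()

for name, param in model.named_parameters():
    grad_norm = None if param.grad is None else param.grad.norm().item()
    print(f"{name}: grad norm = {grad_norm}")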

The output of the training loop looks like this:

Epoch [1/10], Loss: 70.7113
Epoch [2/10], Loss: 70.7113
Epoch [3/10], Loss: 70.7113
Epoch [4/10], Loss: 70.7113
Epoch [5/10], Loss: 70.7113
Epoch [6/10], Loss: 70.7113
Epoch [7/10], Loss: 70.7113
Epoch [8/10], Loss: 70.7113
Epoch [9/10], Loss: 70.7113
Epoch [10/10], Loss: 70.7113
True

I would highly appreciate any help.
