I'm currently working on a recommendation system using PyTorch and a DeepFM model architecture. Despite applying proper weight initialization and following common troubleshooting steps, I'm facing an issue where the model is not learning, and the loss remains unchanged during training.
The data I'm using is the MovieLens dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip. I used a OneHotEncoder() for userId and movieId (the categorical features), min-max normalization for the timestamp, and for the ratings I simply rescale them so the values lie between 0 and 1:
```python
import torch
from sklearn.preprocessing import OneHotEncoder

# `data` is the ratings DataFrame (userId, movieId, rating, timestamp) from the zip above
# Categorical data
categorical_features = ["userId", "movieId"]
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_features = torch.from_numpy(encoder.fit_transform(data[categorical_features])).type(torch.float)
# Dense data: min-max normalize the timestamp
data["timestamp"] = (data["timestamp"] - data["timestamp"].min()) / (data["timestamp"].max() - data["timestamp"].min())
encoded_dense_features = torch.tensor(data["timestamp"]).type(torch.float).unsqueeze(1)
# Output: ratings are on a 0.5-5 scale, so * 0.2 maps them into (0, 1]
data["rating"] = data["rating"] * 0.2
y = torch.tensor(data["rating"]).type(torch.float)
```
After concatenating with `concatenated_linear_input = torch.cat((encoded_categorical_features, encoded_dense_features), dim=1)` I get something like this (I only took the first 10000 samples):
```
concatenated_linear_input[0]     # tensor([1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1907])
concatenated_linear_input.shape  # torch.Size([10000, 3285])
```
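For reference, the 3285 columns should be the number of distinct userId values plus the number of distinct movieId values plus the single dense timestamp column. A quick sanity check (just a sketch, assuming `data` here is the 10000-row slice used above):

```python
# Sanity check (hypothetical): the concatenated width should equal
# (# unique users) + (# unique movies) + 1 dense timestamp column.
n_users = data["userId"].nunique()
n_movies = data["movieId"].nunique()
assert concatenated_linear_input.shape[1] == n_users + n_movies + 1
```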
The model is based on the Factorization Machine (DeepFM) architecture:
```python
import torch
from torch import nn
from torch.nn.modules import dropout
from sklearn.decomposition import PCA
import pdb


class DeepFM(nn.Module):
    def __init__(self, input_dim, split_numbers, embedding_dimension, device=device, n_components=64, hidden_units=10):
        # `device` is defined globally earlier (e.g. "cuda" or "cpu")
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dimension = embedding_dimension
        self.split_numbers = split_numbers
        # One embedding table per sparse field (userId, movieId)
        self.embeddings = nn.ModuleList([
            nn.Embedding(self.split_numbers[i], embedding_dimension) for i in range(len(split_numbers) - 1)
        ]).to(device)
        # Deep component: MLP on top of the averaged embeddings
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(embedding_dimension, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),  # Sigmoid activation for the desired output range
        ).to(device)
        self.output_unit = nn.Sigmoid().to(device)
        # Linear (first-order) part of the FM on the raw concatenated input
        self.regressor = nn.Linear(input_dim, 1).to(device)

    def forward(self, X):
        ### FM Layer ###
        # Linear (first-order) term
        regression = self.regressor(X).squeeze()
        # Cross-feature (second-order) interactions
        tensors = torch.split(X, self.split_numbers, dim=1)
        tensors = [tensor.requires_grad_(True) for tensor in tensors]
        embeddings = [self.embeddings[i](tensors[i].long()) for i in range(len(tensors) - 1)]
        cross_feature = torch.sum(torch.matmul(embeddings[0], embeddings[1].transpose(1, 2)), dim=(1, 2))
        ### Deep Layer ###
        deep_concat = torch.cat((embeddings[0], embeddings[1],
                                 tensors[2].unsqueeze(2).expand(tensors[2].size(0), tensors[2].size(1), self.embedding_dimension)), dim=1)
        deep_reduced = torch.mean(deep_concat, dim=1)
        # print(deep_reduced.shape)
        # pdb.set_trace()
        # reduced_input = torch.from_numpy(self.pca.transform(flattened_input.to("cpu").detach())).to(device).type(torch.float).requires_grad_(True)
        deep_output = self.classifier(deep_reduced).squeeze()
        # print(list(self.classifier.parameters()))
        # print(regression.requires_grad and cross_feature.requires_grad and deep_output.requires_grad)
        ### Final Output ###
        return self.output_unit(regression + cross_feature + deep_output)
```
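For context on the forward pass: `torch.split` with a list of sizes cuts the input along `dim=1` into chunks whose widths must sum to the full width, which is how `split_numbers` is meant to separate the user one-hot block, the movie one-hot block and the timestamp column. A minimal, standalone illustration (toy sizes, not the real field widths):

```python
import torch

# Toy example of torch.split with explicit chunk sizes along dim=1;
# the sizes [3, 2, 1] must sum to the tensor's width (6).
x = torch.randn(4, 6)
users_part, movies_part, dense_part = torch.split(x, [3, 2, 1], dim=1)
print(users_part.shape, movies_part.shape, dense_part.shape)
# torch.Size([4, 3]) torch.Size([4, 2]) torch.Size([4, 1])
```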
Then comes a standard train-test split and the training loop, but apparently the model isn't updating its weights:
```python
from sklearn.model_selection import train_test_split
from torch import optim
import copy

### Split ###
X_train, X_test, y_train, y_test = train_test_split(
    concatenated_linear_input, y, test_size=0.2, random_state=44, shuffle=True)
# Device-agnostic code
X_train, X_test, y_train, y_test = X_train.to(device), X_test.to(device), y_train.to(device), y_test.to(device)

### Model ###
# Model creation
linear_input_shape = concatenated_linear_input.shape[1]  # 3285 in the example above
model = DeepFM(input_dim=linear_input_shape,
               split_numbers=[len(data["userId"].unique()), len(data["movieId"].unique()), 1],
               embedding_dimension=10, hidden_units=10)

# Embedding weight initialization
def init_weights(m):
    if isinstance(m, nn.Embedding):
        # Initialize the embedding layers using Xavier/Glorot initialization
        nn.init.xavier_uniform_(m.weight)

model.apply(init_weights)

# Loss function and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.RMSprop(params=model.parameters(), lr=0.01)

# Train the model
num_epochs = 10
batch_size = 32
num_batches = len(X_train) // batch_size

# Keep a copy of the initial weights for checking why it isn't learning
model1 = copy.deepcopy(model)

for epoch in range(num_epochs):
    running_loss = 0.0
    # Mini-batch training
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        X = X_train[start_idx:end_idx]
        y_tensor = y_train[start_idx:end_idx]
        # Forward pass
        outputs = model(X.requires_grad_(True))
        loss = loss_fn(outputs, y_tensor)
        # loss = loss_fn(y_tensor, outputs)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass and optimization
        loss.backward(retain_graph=True)
        # Gradient clipping with a max_norm value
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
        # Optimizer step
        optimizer.step()
        # Total loss
        running_loss += loss
    # Print epoch loss
    epoch_loss = running_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Not learning :c
parameters_equal = all(
    torch.all(p1.data == p2.data)
    for p1, p2 in zip(model1.parameters(), model.parameters())
)
print(parameters_equal)
```
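For what it's worth, as an additional check (hypothetical, not part of the run above) I could also inspect the range of the predictions after training, since the loss never moves:

```python
# Hypothetical extra diagnostic: look at the spread of predictions on the test split.
preds = model(X_test).detach()
print(preds.min().item(), preds.max().item(), preds.mean().item())
```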
I tried multiple different strategies:
- smaller/greater sample sizes
- debugging techniques (see the gradient-flow sketch after this list)
- different optimizers and learning rates
- different loss functions
- different batch_size, embedding_dimension and other hyperparameters
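The gradient-flow sketch mentioned above is just a generic diagnostic I would drop into the loop right after `loss.backward()` (hypothetical code, not something the model requires):

```python
# Hypothetical debugging snippet: print per-parameter gradient norms
# right after loss.backward() to see which parameters receive gradients.
for name, param in model.named_parameters():
    grad_norm = None if param.grad is None else param.grad.norm().item()
    print(f"{name:45s} grad norm: {grad_norm}")
```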
The output of the training loop looks like this:
```
Epoch [1/10], Loss: 70.7113
Epoch [2/10], Loss: 70.7113
Epoch [3/10], Loss: 70.7113
Epoch [4/10], Loss: 70.7113
Epoch [5/10], Loss: 70.7113
Epoch [6/10], Loss: 70.7113
Epoch [7/10], Loss: 70.7113
Epoch [8/10], Loss: 70.7113
Epoch [9/10], Loss: 70.7113
Epoch [10/10], Loss: 70.7113
True
```
I would highly appreciate any help.