How to feed or mask missing data to an RNN, LSTM, or GRU in PyTorch?


I have a dataset that contains ~10% randomly missing values. The data consist of repeated measures indexed by the variable DIM: each row is one day of data for an individual, and each individual has data from DIM = 3 up to DIM = 21-28. There is also a variable parity, which takes the value 1 or 2, and the target (label_train, label_val) is binary. To run the RNN, LSTM, and GRU I had to fill in the missing points, so I imputed each feature (sensor data for daily physical activity, plus other continuous parameters) with its mean computed per day and parity. Now I would like to skip the imputation and train the models with the missing data left in, since missing data is very common in real life; the model should either ignore the specific data point or learn from its absence. In other words, I want to feed the data without any imputation, interpolation, etc. I have tried masking the missing values, so far without success.
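For illustration, the kind of approach I have in mind is to make the missingness explicit to the network instead of imputing, e.g. zero-fill the NaNs and concatenate a binary "observed" indicator per feature. This is only a minimal sketch of the idea, not working code from my notebook:

import torch

# Minimal sketch (hypothetical helper, not my actual pipeline): zero-fill NaNs
# and append a per-feature "observed" indicator, so the network can tell a
# genuine 0.0 apart from a missing value.
def add_missingness_indicator(x):
    observed = (~torch.isnan(x)).float()        # 1 where a value exists, 0 where NaN
    x_filled = torch.nan_to_num(x, nan=0.0)     # replace NaN so gradients stay finite
    return torch.cat([x_filled, observed], dim=-1)  # doubles the feature dimension

x = torch.tensor([[float('nan'), 68.70],
                  [376.6, float('nan')]])
print(add_missingness_indicator(x))
# tensor([[  0.0000,  68.7000,   0.0000,   1.0000],
#         [376.6000,   0.0000,   1.0000,   0.0000]])

With this scheme the input_size of the RNN would double to account for the indicator columns.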

I'm working in a JupyterLab notebook. The dataset looks like this:

Column A   Column B   Column C   Column D   Column E   Column F   DIM   Parity
NaN        68.70      5.68       284.0      2.02       55.1       3.0   2
376.6      73.05      NaN        272.0      1.72       54.4       4.0   2
301.5      NaN        5.11       187.0      NaN        NaN        5.0   2
333.0      82.48      5.13       NaN        1.87       67.2       6.0   2
321.5      NaN        5.01       487.0      1.78       NaN        7.0   2
295.5      85.73      NaN        680.0      NaN        60.3       8.0   2
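Each individual therefore contributes a variable-length sequence of daily rows. For reference, the way I would group the rows into per-individual sequences (a sketch; animal_id is a hypothetical name standing in for however individuals are labelled in my frame) is roughly:

import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence

# Sketch only: one row per individual per day, ordered by DIM;
# "animal_id" is a placeholder name for the individual identifier column.
def frame_to_sequences(df, feature_cols, id_col="animal_id"):
    seqs, lengths = [], []
    for _, group in df.sort_values("DIM").groupby(id_col):
        seqs.append(torch.tensor(group[feature_cols].values, dtype=torch.float32))
        lengths.append(len(group))
    # pad to the longest sequence: (n_individuals, max_days, n_features)
    return pad_sequence(seqs, batch_first=True), torch.tensor(lengths)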

My code so far is:

import math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_curve, auc, confusion_matrix

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(data_train_pd.values, dtype=torch.float32)
y_train_tensor = torch.tensor(label_train_pd.values, dtype=torch.float32).unsqueeze(1)  # Add a singleton dimension for compatibility

X_val_tensor = torch.tensor(data_val_pd.values, dtype=torch.float32)
y_val_tensor = torch.tensor(label_val_pd.values, dtype=torch.float32).unsqueeze(1)

# Define a simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(SimpleRNN, self).__init__()
        # Pass num_layers through to nn.RNN (it was previously accepted but ignored)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        # Return raw logits: BCEWithLogitsLoss applies the sigmoid internally,
        # so adding nn.Sigmoid here would apply it twice
        return self.fc(out[:, -1, :])

# Instantiate models
input_size = data_train_pd.shape[1]
hidden_size = 64
output_size = 1
num_layers = 4

rnn_model = SimpleRNN(input_size, hidden_size, output_size, num_layers)

# Assuming your imbalance ratio is 10
imbalance_ratio = 10.0
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(imbalance_ratio))
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=1e-4, weight_decay=1e-5)

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
val_dataset = CustomDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

def cal_print_stat(FP, TP, FN, TN, prefix=""):
    sensitivity = TP / (TP + FN) if (TP + FN) != 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    npv = TN / (TN + FN) if (TN + FN) != 0 else 0

    # TPR/TNR are aliases of sensitivity/specificity; FPR/FNR are their complements.
    # Reusing the guarded values above avoids ZeroDivisionError on degenerate batches.
    TPR = sensitivity
    FPR = 1 - specificity
    FNR = 1 - sensitivity
    TNR = specificity
    F1S = 2 * precision * sensitivity / (precision + sensitivity) if (precision + sensitivity) != 0 else 0
    # MCC via the identity sqrt(TPR*TNR*PPV*NPV) - sqrt(FNR*FPR*FOR*FDR)
    MCC = ((math.sqrt(sensitivity * specificity * precision * npv)
            - math.sqrt(FNR * FPR * (FN / (FN + TN)) * (FP / (FP + TP))))
           if ((TP + FN) * (FP + TN) * (FN + TN) * (FP + TP)) != 0 else 0)

    print("{}-sensitivity = {:.2f}%".format(prefix, sensitivity * 100))
    print("{}-specificity = {:.2f}%".format(prefix, specificity * 100))
    print("{}-precision   = {:.2f}%".format(prefix, precision * 100))
    print("{}-npv         = {:.2f}%".format(prefix, npv * 100))
    print("{}-TPR = {:.2f}%".format(prefix, TPR * 100))
    print("{}-FPR = {:.2f}%".format(prefix, FPR * 100))
    print("{}-FNR = {:.2f}%".format(prefix, FNR * 100))
    print("{}-TNR = {:.2f}%".format(prefix, TNR * 100))
    print("{}-F1  = {:.2f}".format(prefix, F1S))
    print("{}-MCC = {:.2f}".format(prefix, MCC))

    return sensitivity, specificity, precision, npv, TPR, FPR, FNR, TNR, F1S, MCC  # PLR, NLR, ACC, DOR

# Lists to store training and validation metrics
train_roc_aucs = []
val_roc_aucs = []

# Training loop
EPOCH = 1000
LOG_EVERY = 50

models = [rnn_model] 

optimizers = [rnn_optimizer]

for model, optimizer in zip(models, optimizers):
    model_name = type(model).__name__

    train_roc_aucs = []
    val_roc_aucs = []
    
    for epoch in range(EPOCH):
        model.train()
        avg_loss = 0.0
        
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features.unsqueeze(1))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

        avg_loss /= len(train_loader)

        if (epoch + 1) % LOG_EVERY == 0:
            # Training evaluation
            model.eval()
            train_all_preds = []
            train_all_probs = []
            train_all_labels = []

            with torch.no_grad():
                for features, labels in train_loader:
                    outputs = model(features.unsqueeze(1))
                    probs = torch.sigmoid(outputs)      # model now returns logits
                    preds = (probs >= 0.5).float()
                    train_all_preds.extend(preds.cpu().numpy())
                    train_all_probs.extend(probs.cpu().numpy())
                    train_all_labels.extend(labels.cpu().numpy())

                # Convert predictions and labels to numpy arrays
                train_all_preds = np.array(train_all_preds)
                train_all_probs = np.array(train_all_probs)
                train_all_labels = np.array(train_all_labels)

                # Calculate training ROC curve from probabilities, not hard 0/1
                # predictions, so the curve has more than one operating point
                fpr_train, tpr_train, _ = roc_curve(train_all_labels, train_all_probs)
                roc_auc_train = auc(fpr_train, tpr_train)
                train_roc_aucs.append(roc_auc_train)

                # Calculate metrics
                train_conf_matrix = confusion_matrix(train_all_labels, train_all_preds)
                train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()

                # Print training metrics
                print(f"\nTraining Metrics after epoch {epoch + 1} (Model: {model_name}):")
                cal_print_stat(train_FP, train_TP, train_FN, train_TN, prefix="train")
                print(f'Training ROCAUC: {roc_auc_train}\n')

            # Evaluation
            model.eval()
            all_preds = []
            all_probs = []
            all_labels = []

            with torch.no_grad():
                for features, labels in val_loader:
                    outputs = model(features.unsqueeze(1))
                    probs = torch.sigmoid(outputs)      # model now returns logits
                    preds = (probs >= 0.5).float()
                    all_preds.extend(preds.cpu().numpy())
                    all_probs.extend(probs.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                # Convert predictions and labels to numpy arrays
                all_preds = np.array(all_preds)
                all_probs = np.array(all_probs)
                all_labels = np.array(all_labels)

                # Calculate metrics
                conf_matrix = confusion_matrix(all_labels, all_preds)
                TN, FP, FN, TP = conf_matrix.ravel()

                # Calculate validation ROC curve from probabilities
                fpr_val, tpr_val, _ = roc_curve(all_labels, all_probs)
                roc_auc_val = auc(fpr_val, tpr_val)
                val_roc_aucs.append(roc_auc_val)

                # Print validation metrics
                print(f"\nValidation Metrics after epoch {epoch + 1} (Model: {model_name}):")
                cal_print_stat(FP, TP, FN, TN, prefix="val")
                print(f'Validation ROCAUC: {roc_auc_val}\n')
                
                
                plt.figure(figsize=(8, 8))
                plt.plot(fpr_train, tpr_train, color='blue', lw=2, label=f'Training ROC curve (area = {roc_auc_train:.2f})')
                plt.plot(fpr_val, tpr_val, color='darkorange', lw=2, label=f'Validation ROC curve (area = {roc_auc_val:.2f})')
                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'Training and Validation ROC Curves - Model: {model_name}')
                plt.legend(loc='lower right')
                plt.show()

I have already tried a few ways of masking, so far without success, for example:

from torch.nn.utils.rnn import pad_sequence

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        sequence = self.features[idx]
        label = self.labels[idx]

        # Create a mask tensor
        mask = torch.ones_like(sequence)
        mask[torch.isnan(sequence)] = 0  # Set 0 for missing values

        return sequence, label, mask

def collate_fn(batch):
    sequences, labels, masks = zip(*batch)

    # Pad sequences
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=float('nan'))
    
    # Stack masks
    stacked_masks = torch.stack(masks, dim=0)

    # labels are already (1,)-shaped tensors, so stack them instead of re-wrapping
    return padded_sequences, torch.stack(labels), stacked_masks

# Convert data to DataLoader
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
val_dataset = CustomDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
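I also looked at packing the padded sequences so that the RNN skips the padding entirely. A sketch of what I mean, adapted from the pack_padded_sequence docs (assuming the NaNs are already zero-filled and lengths holds the true number of days per individual; I have not gotten this working either):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

def run_rnn_on_padded(rnn, x, lengths):
    # x: (batch, max_len, n_features); lengths: true sequence lengths (LongTensor)
    packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True,
                                  enforce_sorted=False)
    packed_out, hidden = rnn(packed)
    out, _ = pad_packed_sequence(packed_out, batch_first=True)
    # take each sequence's last *valid* step, not the padded final column
    last_idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, out.size(2))
    return out.gather(1, last_idx).squeeze(1)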

and a few other ways, but without success, getting errors like:

  • RuntimeError: expand(torch.FloatTensor{[32, 26, 1, 1]}, size=[32, 1, 64]): the number of sizes provided (3) must be greater or equal to the number of dimensions in the tensor (4)
  • RuntimeError: The size of tensor a (32) must match the size of tensor b (26) at non-singleton dimension 1
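I suspect these come from shape mismatches between the mask and what the RNN actually consumes: the training loop feeds features.unsqueeze(1), i.e. a length-1 sequence of features, while the mask keeps the flat per-row shape, and the RNN output has hidden_size = 64 in its last dimension. A minimal reproduction of the shapes as I understand them (26 is assumed to be my feature count):

import torch

features = torch.randn(32, 26)    # one batch: 32 rows x 26 features
mask = torch.ones(32, 26)         # mask built per feature, same shape as the rows
x_seq = features.unsqueeze(1)     # what the training loop feeds the RNN
rnn_out = torch.randn(32, 1, 64)  # shape of the RNN output for that input

print(x_seq.shape, mask.shape, rnn_out.shape)
# torch.Size([32, 1, 26]) torch.Size([32, 26]) torch.Size([32, 1, 64])
# multiplying rnn_out by this mask cannot broadcast, which matches the errors above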