"RuntimeError: You must train on the training inputs!" when trying to use mini-batches to train a Gaussian Process Regression model

220 views Asked by At

I have written the following code to train a Gaussian Process Regression model that predicts age, and it runs well:

import numpy as np
import pandas as pd
import h5py
import torch
import gpytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time


# Integer code assigned to each disease label ('control' is 0).
# NOTE(review): this table is not referenced by the code visible here — confirm it is still needed.
disease_mapping = {
    'control': 0,
    "Alzheimer's disease": 1,
    "Graves' disease": 2,
    "Huntington's disease": 3,
    "Parkinson's disease": 4,
    'rheumatoid arthritis': 5,
    'schizophrenia': 6,
    "Sjogren's syndrome": 7,
    'stroke': 8,
    'type 2 diabetes': 9
}
# Integer code for each value of the `sample_type` column; applied in load_idmap().
sample_type_mapping = {'control': 0, 'disease tissue': 1}


def load_idmap(idmap_dir):
    '''
    Read a sample map CSV and return its ages and encoded sample types.

    Parameters:
    ------------
    idmap_dir: path of the comma-separated map file; it must provide the
        `age` and `sample_type` columns.

    Return:
    ------------
    age: the `age` column as a float32 numpy array.
    sample_type: the `sample_type` column (pandas Series) with its values
        substituted through `sample_type_mapping`.
    '''
    table = pd.read_csv(idmap_dir, sep=',')
    ages = table.age.to_numpy().astype(np.float32)
    encoded_types = table.sample_type.replace(sample_type_mapping)
    return ages, encoded_types


def load_methylation_h5(prefix):
    '''
    Load methylation data from .h5 file.

    The file 'encoded_<prefix>data.h5' is opened read-only, its 'data'
    dataset is copied into memory, and the file is closed before returning.

    Parameters:
    ------------
    prefix: 'train' or 'test'

    Return:
    ------------
    The full contents of the 'data' dataset, read into memory.
    '''
    path = 'encoded_' + prefix + 'data.h5'
    # Open the file exactly once and read inside the context manager, so the
    # handle that backs the dataset is always closed (also on errors).
    with h5py.File(path, 'r') as h5_file:
        # To experiment on a subset of the columns, slice here instead,
        # e.g. h5_file['data'][:, :10000].
        return h5_file['data'][:, :]


def evaluate_ml(y_true, y_pred, sample_type):
    '''
    Score age predictions with the MAE-based evaluation metric.

    Control samples are scored with the plain mean absolute error. For case
    samples, the absolute error of a prediction that is greater than or equal
    to the true age is halved, while the error of a prediction below the true
    age counts in full. The final score is the mean of the two group scores.

    Parameters:
    ------------
    y_true: true age
    y_pred: predicted age
    sample_type: sample type, 0 for control, 1 for case

    Return:
    ------------
    mae: mean of mae_control and mae_case.
    mae_control: mean absolute error of control samples.
    mae_case: weighted mean absolute error of case samples.
    '''
    is_control = sample_type == 0
    is_case = sample_type == 1

    # Control group: ordinary mean absolute error.
    control_errors = np.abs(y_true[is_control] - y_pred[is_control])
    mae_control = np.mean(control_errors)

    # Case group: over-estimates (pred >= true) cost half as much.
    case_true = y_true[is_case]
    case_pred = y_pred[is_case]
    case_errors = np.abs(case_true - case_pred)
    overestimated = case_pred >= case_true
    underestimated = case_pred < case_true

    ae_above = np.sum(case_errors[overestimated]) / 2
    ae_below = np.sum(case_errors[underestimated])
    mae_case = (ae_above + ae_below) / len(case_true)

    mae = np.mean([mae_control, mae_case])
    return mae, mae_control, mae_case

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Locations of the sample maps.
idmap_train_dir = 'trainmap.csv'
idmap_test_dir = 'testmap.csv'

# Read both methylation matrices, then the training ages and sample types.
methylation = load_methylation_h5('train')
methylation_test = load_methylation_h5('test')
age, sample_type = load_idmap(idmap_train_dir)

print('Load data done')

# Hold out 20% of the training samples (shuffled) for validation. The split is
# made on row indices so the same rows can be selected from every array.
indices = np.arange(len(age))
indices_train, indices_valid, age_train, age_valid = train_test_split(
    indices, age, test_size=0.2, shuffle=True)

methylation_train = methylation[indices_train]
methylation_valid = methylation[indices_valid]
sample_type_train = sample_type[indices_train]
sample_type_valid = sample_type[indices_valid]
feature_size = methylation_train.shape[1]

# The unsplit matrix is not used again, so drop the reference to it.
del methylation

# Build float32 tensors on the selected device; the validation split is
# stored under the test_x / test_y names.
train_x = torch.tensor(methylation_train, dtype=torch.float32).to(device)
train_y = torch.tensor(age_train, dtype=torch.float32).to(device)
test_x = torch.tensor(methylation_valid, dtype=torch.float32).to(device)
test_y = torch.tensor(age_valid, dtype=torch.float32).to(device)
# Mini-batch loading (currently disabled):
#dataset = torch.utils.data.TensorDataset(train_x, train_y)
#data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# define Gaussian Process model
class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP regression model with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data and likelihood, then build mean and kernel."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel at `x`."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

# Gaussian likelihood and the exact GP model, both placed on the selected device.
likelihood = gpytorch.likelihoods.GaussianLikelihood().to(device)
model = GPRegressionModel(train_x, train_y, likelihood).to(device)  # built on the full training tensors

# put the model and the likelihood in training mode
model.train()
likelihood.train()

# use Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # lr: learning rate
# the training loss below is the negative of this exact marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device)

# begin training
## settings parameters
num_epochs = 2  # number of optimisation steps (one full-data step per epoch)
target_loss = 0.5  # stop early once the loss is at or below this value
print('Start training...')
for epoch in range(num_epochs):
    start = time.time()
    optimizer.zero_grad()
    # NOTE(review): the forward pass uses the same train_x the model was
    # constructed with. gpytorch's ExactGP appears to require this in training
    # mode; calling it on other inputs (e.g. a mini-batch) is presumably what
    # raises "You must train on the training inputs!" — confirm in the docs.
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}', f'Training time: {time.time() - start}s')
    # check if target loss is achieved
    if loss.item() <= target_loss:
        print(f"Terminating training at iteration {epoch} as target loss {target_loss} is achieved.")
        break
# switch the model and the likelihood to evaluation mode
model.eval()
likelihood.eval()

# predict on the validation split with gradients disabled
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(test_x))
    age_valid_pred = observed_pred.mean
#print(age_valid_pred)
age_valid_pred = age_valid_pred.cpu().numpy()
#print(age_valid_pred)
# evaluate_ml returns the tuple (mae, mae_control, mae_case), so the whole tuple is printed
mae = evaluate_ml(age_valid, age_valid_pred, sample_type_valid)
print(f'Validation MAE: {mae}')
# predict on the test set
pred_x = torch.tensor(methylation_test, dtype=torch.float32).to(device)
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(pred_x))
    age_pred = observed_pred.mean
age_pred = age_pred.cpu().numpy()
# naive post-processing to ensure age >= 0
age_pred[age_pred < 0] = 0

# round to two decimals and format every prediction as a string
age_pred = np.around(age_pred, decimals=2)
age_pred = ['%.2f' % i for i in age_pred]
sample_id = pd.read_csv(idmap_test_dir, sep=',').sample_id
# Note: sample_id in submission should be the same as the order in testmap.csv.
# We do not provide the matching procedure for disordered sample_id in evaluation.

# Writing the submission file is currently disabled:
#submission = pd.DataFrame({'sample_id': sample_id, 'age': age_pred})
#submission_file = 'submit7.txt'
#submission.to_csv(submission_file, index=False)

However, I noticed that the same data is fed to the model in every epoch, which I think may cause overfitting, so I want to train the model with mini-batches. I edited my code as follows.

import numpy as np
import pandas as pd
import h5py
import torch
import gpytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time


# Integer code assigned to each disease label ('control' is 0).
# NOTE(review): this table is not referenced by the code visible here — confirm it is still needed.
disease_mapping = {
    'control': 0,
    "Alzheimer's disease": 1,
    "Graves' disease": 2,
    "Huntington's disease": 3,
    "Parkinson's disease": 4,
    'rheumatoid arthritis': 5,
    'schizophrenia': 6,
    "Sjogren's syndrome": 7,
    'stroke': 8,
    'type 2 diabetes': 9
}
# Integer code for each value of the `sample_type` column; applied in load_idmap().
sample_type_mapping = {'control': 0, 'disease tissue': 1}


def load_idmap(idmap_dir):
    '''
    Read a sample map CSV and return its ages and encoded sample types.

    Parameters:
    ------------
    idmap_dir: path of the comma-separated map file; it must provide the
        `age` and `sample_type` columns.

    Return:
    ------------
    age: the `age` column as a float32 numpy array.
    sample_type: the `sample_type` column (pandas Series) with its values
        substituted through `sample_type_mapping`.
    '''
    table = pd.read_csv(idmap_dir, sep=',')
    ages = table.age.to_numpy().astype(np.float32)
    encoded_types = table.sample_type.replace(sample_type_mapping)
    return ages, encoded_types


def load_methylation_h5(prefix):
    '''
    Load methylation data from .h5 file.

    The file 'encoded_<prefix>data.h5' is opened read-only, its 'data'
    dataset is copied into memory, and the file is closed before returning.

    Parameters:
    ------------
    prefix: 'train' or 'test'

    Return:
    ------------
    The full contents of the 'data' dataset, read into memory.
    '''
    path = 'encoded_' + prefix + 'data.h5'
    # Open the file exactly once and read inside the context manager, so the
    # handle that backs the dataset is always closed (also on errors).
    with h5py.File(path, 'r') as h5_file:
        # To experiment on a subset of the columns, slice here instead,
        # e.g. h5_file['data'][:, :10000].
        return h5_file['data'][:, :]


def evaluate_ml(y_true, y_pred, sample_type):
    '''
    Score age predictions with the MAE-based evaluation metric.

    Control samples are scored with the plain mean absolute error. For case
    samples, the absolute error of a prediction that is greater than or equal
    to the true age is halved, while the error of a prediction below the true
    age counts in full. The final score is the mean of the two group scores.

    Parameters:
    ------------
    y_true: true age
    y_pred: predicted age
    sample_type: sample type, 0 for control, 1 for case

    Return:
    ------------
    mae: mean of mae_control and mae_case.
    mae_control: mean absolute error of control samples.
    mae_case: weighted mean absolute error of case samples.
    '''
    is_control = sample_type == 0
    is_case = sample_type == 1

    # Control group: ordinary mean absolute error.
    control_errors = np.abs(y_true[is_control] - y_pred[is_control])
    mae_control = np.mean(control_errors)

    # Case group: over-estimates (pred >= true) cost half as much.
    case_true = y_true[is_case]
    case_pred = y_pred[is_case]
    case_errors = np.abs(case_true - case_pred)
    overestimated = case_pred >= case_true
    underestimated = case_pred < case_true

    ae_above = np.sum(case_errors[overestimated]) / 2
    ae_below = np.sum(case_errors[underestimated])
    mae_case = (ae_above + ae_below) / len(case_true)

    mae = np.mean([mae_control, mae_case])
    return mae, mae_control, mae_case

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Locations of the sample maps.
idmap_train_dir = 'trainmap.csv'
idmap_test_dir = 'testmap.csv'

# Read both methylation matrices, then the training ages and sample types.
methylation = load_methylation_h5('train')
methylation_test = load_methylation_h5('test')
age, sample_type = load_idmap(idmap_train_dir)

print('Load data done')

# Hold out 20% of the training samples (shuffled) for validation. The split is
# made on row indices so the same rows can be selected from every array.
indices = np.arange(len(age))
indices_train, indices_valid, age_train, age_valid = train_test_split(
    indices, age, test_size=0.2, shuffle=True)

methylation_train = methylation[indices_train]
methylation_valid = methylation[indices_valid]
sample_type_train = sample_type[indices_train]
sample_type_valid = sample_type[indices_valid]
feature_size = methylation_train.shape[1]

# The unsplit matrix is not used again, so drop the reference to it.
del methylation

# Build float32 tensors on the selected device; the validation split is
# stored under the test_x / test_y names.
train_x = torch.tensor(methylation_train, dtype=torch.float32).to(device)
train_y = torch.tensor(age_train, dtype=torch.float32).to(device)
test_x = torch.tensor(methylation_valid, dtype=torch.float32).to(device)
test_y = torch.tensor(age_valid, dtype=torch.float32).to(device)
# Mini-batch loading (currently disabled):
#dataset = torch.utils.data.TensorDataset(train_x, train_y)
#data_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# define Gaussian Process models
class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP regression model with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data and likelihood, then build mean and kernel."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel at `x`."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

# Gaussian likelihood and the exact GP model, both placed on the selected device.
likelihood = gpytorch.likelihoods.GaussianLikelihood().to(device)
model = GPRegressionModel(train_x, train_y, likelihood).to(device)  # built on the full training tensors

# put the model and the likelihood in training mode
model.train()
likelihood.train()

# use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # lr: learning rate
# the training loss below is the negative of this exact marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device)

# begin training
## settings parameters
num_epochs = 2  # number of optimisation steps (one full-data step per epoch)
target_loss = 0.5  # stop early once the loss is at or below this value
print('Start training...')
for epoch in range(num_epochs):
    start = time.time()
    optimizer.zero_grad()
    # NOTE(review): the forward pass uses the same train_x the model was
    # constructed with. gpytorch's ExactGP appears to require this in training
    # mode; calling it on other inputs (e.g. a mini-batch) is presumably what
    # raises "You must train on the training inputs!" — confirm in the docs.
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}', f'Training time: {time.time() - start}s')
    # check if the target loss is achieved
    if loss.item() <= target_loss:
        print(f"Terminating training at iteration {epoch} as target loss {target_loss} is achieved.")
        break
# switch the model and the likelihood to evaluation mode
model.eval()
likelihood.eval()

# predict on the validation split with gradients disabled
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(test_x))
    age_valid_pred = observed_pred.mean
#print(age_valid_pred)
age_valid_pred = age_valid_pred.cpu().numpy()
#print(age_valid_pred)
# evaluate_ml returns the tuple (mae, mae_control, mae_case), so the whole tuple is printed
mae = evaluate_ml(age_valid, age_valid_pred, sample_type_valid)
print(f'Validation MAE: {mae}')
# predict on the test set
pred_x = torch.tensor(methylation_test, dtype=torch.float32).to(device)
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(pred_x))
    age_pred = observed_pred.mean
age_pred = age_pred.cpu().numpy()
# naive post-processing to ensure age >= 0
age_pred[age_pred < 0] = 0

# round to two decimals and format every prediction as a string
age_pred = np.around(age_pred, decimals=2)
age_pred = ['%.2f' % i for i in age_pred]
sample_id = pd.read_csv(idmap_test_dir, sep=',').sample_id
# Note: sample_id in submission should be the same as the order in testmap.csv.
# We do not provide the matching procedure for disordered sample_id in evaluation.

# Writing the submission file is currently disabled:
#submission = pd.DataFrame({'sample_id': sample_id, 'age': age_pred})
#submission_file = 'submit7.txt'
#submission.to_csv(submission_file, index=False)

This time the program raises an error, traced back to line 147, `output = model(train_x)`:

RuntimeError: You must train on the training inputs!

Is this method unable to use mini-batches, or have I done something wrong in my code?

0

There are 0 answers