I am running neural network training on sequential data. Hyperopt gets through roughly 20 trials before TensorFlow runs out of memory (OOM). Training uses a sliding window and runs in an HPC environment, so changing the hardware/instance configuration is not possible. Is there anything I can do in my code to avoid the TensorFlow OOM?
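For context, the sliding-window arrays are built roughly like this (a simplified sketch; the window length and helper names are placeholders, my actual preprocessing differs in details):

import numpy as np

def make_windows(series, targets, window_size):
    # Illustrative only: stack overlapping windows of shape (window_size, n_features)
    X, Y = [], []
    for i in range(len(series) - window_size):
        X.append(series[i:i + window_size])
        Y.append(targets[i + window_size])
    return np.asarray(X, dtype=np.float32), np.asarray(Y, dtype=np.float32)

# X_train_array: (n_samples, window_size, n_features)
# Y_train_array: (n_samples, n_targets)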
Below is my code.
def data_generator(data, labels, batch_size):
    num_samples = len(data)
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_data = data[offset:offset + batch_size]
            batch_labels = labels[offset:offset + batch_size]
            yield batch_data, batch_labels
Callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=Patience)
# build model
def hybrid_model(params):
    keras.backend.clear_session()
    train_generator = None
    model_input = Input(shape=(len(X_train_array[0]), len(X_train_array[0][0])))
    conv_1 = Conv1D(params['CNN1_neurons'], kernel_size=3, activation='relu')(model_input)
    pooling_1 = MaxPooling1D(3, strides=2)(conv_1)
    drop_cnn1 = tf.keras.layers.Dropout(params['dropout_rate'])(pooling_1)
    flatten_1 = Flatten()(drop_cnn1)
    dense_cnnflat = tf.keras.layers.Dense(params['CNN_dense'], activation='relu')(flatten_1)
    drop_cnnflat = tf.keras.layers.Dropout(params['dropout_rate'])(dense_cnnflat)
    reshape_1 = tf.keras.layers.Reshape((-1, params['CNN_dense']))(drop_cnnflat)
    lstm_1 = LSTM(params['LSTM1_neurons'], activation=mish, return_sequences=True)(reshape_1)
    attn = tf.keras.layers.Attention()([lstm_1, lstm_1])
    dense_lstm1 = tf.keras.layers.Dense(params['LSTM1_dense'], activation='relu')(attn)
    drop_lstm1 = tf.keras.layers.Dropout(params['dropout_rate'])(dense_lstm1)
    model_output = tf.keras.layers.Dense(len(Y_train_array[0]), activation='relu')(drop_lstm1)
    model = tf.keras.models.Model(inputs=model_input, outputs=model_output)
    model.compile(optimizer='adam', loss=rmse, metrics=['mse', 'mae', r2_scores, 'mape'])
    # model.summary(print_fn=myprint)

    train_generator = data_generator(X_train_array, Y_train_array, batch_size=params['batch_size'])
    train_start = time.time()
    # Train the model using the generator
    history = model.fit_generator(generator=train_generator,
                                  steps_per_epoch=len(X_train_array) // params['batch_size'],
                                  epochs=EPOCHS,
                                  callbacks=[Callback], verbose=2)
    train_time = str(round(time.time() - train_start, 3))
    test_result = model.evaluate(X_test_array, Y_test_array, verbose=2)
    return {'loss': test_result[0], 'mse': test_result[1], 'mae': test_result[2],
            'r2': test_result[3], 'mape': test_result[4], 'status': STATUS_OK}
space = {
    'batch_size': hp.randint('batch_size', 50, 300),
    'CNN1_neurons': hp.randint('CNN1_neurons', 2000),
    'CNN_dense': hp.randint('CNN_dense', 2000),
    'LSTM1_neurons': hp.randint('LSTM1_neurons', 2000),
    'LSTM1_dense': hp.randint('LSTM1_dense', 2000),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.8)
}
print('hyperparameter optimizing!')
t2 = time.time()
trials = Trials()
best = fmin(fn=hybrid_model,
            space=space,
            algo=tpe.suggest,
            max_evals=300,
            trials=trials)
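One change I have been considering is replacing the Python generator with a tf.data pipeline so batches are produced lazily from the in-memory arrays, although I am not sure whether it actually changes the memory behaviour. A minimal sketch (assuming X_train_array / Y_train_array are float32 NumPy arrays and the batch size comes from params):

import tensorflow as tf

def make_dataset(data, labels, batch_size):
    # Slice the in-memory arrays and batch them lazily, repeating indefinitely
    ds = tf.data.Dataset.from_tensor_slices((data, labels))
    ds = ds.batch(batch_size).repeat()
    return ds.prefetch(tf.data.AUTOTUNE)

# train_ds = make_dataset(X_train_array, Y_train_array, params['batch_size'])
# history = model.fit(train_ds,
#                     steps_per_epoch=len(X_train_array) // params['batch_size'],
#                     epochs=EPOCHS, callbacks=[Callback], verbose=2)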
I have tried freeing some memory at the start of every trial.
Clearing the Keras session:
keras.backend.clear_session()
Resetting the train generator:
train_generator = None
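I could also extend this to a fuller cleanup at the end of each trial, something like the sketch below (I have not confirmed this is sufficient):

import gc
from tensorflow import keras

def cleanup_after_trial(model):
    # Drop the Python reference to the model, reset the Keras/TensorFlow
    # graph state, then force a garbage-collection pass before the next trial
    del model
    keras.backend.clear_session()
    gc.collect()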
Is there any other way to trace the memory leak? Or what needs to be done to prevent the TensorFlow OOM?
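For tracing, I was thinking of logging device memory per trial, roughly like this (a sketch; tf.config.experimental.get_memory_info may behave differently on my TF version, so treat the details as an assumption):

import tensorflow as tf

def log_gpu_memory(tag):
    # Current/peak device memory in MB as reported by TensorFlow (TF >= 2.5)
    info = tf.config.experimental.get_memory_info('GPU:0')
    print(f"{tag}: current={info['current'] / 1e6:.1f} MB, peak={info['peak'] / 1e6:.1f} MB")

# e.g. call log_gpu_memory("trial start") at the top of hybrid_model
# and log_gpu_memory("trial end") just before returning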