Data preprocessing tuning with keras BayesianOptimization tuner

46 views Asked by At

I'm working on a multivariate timeseries vanillaLSTM model where the input shape is (n_samples,past_steps, n_features), It's a single step output model. I'm trying to do hyperparameter tuning for it ( number of units, learning rate, activation functions) also i want to include past_steps in the search space. how to do that? i'm using this function to prepare input data.

def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i - history_size, i, step)
        data.append(dataset.iloc[indices][selected_cols].values)

        if single_step:
            labels.append(target.iloc[i + target_size])
        else:
            labels.append(target.iloc[i:i + target_size].values)

    return np.array(data), np.array(labels)

And i tried doing this but it didn't work since because the x_train_multi, y_train_multi is not returned from the build_model_and_prprocess function, and I can't return them because the tuner doesn't accept anything other than the model. how can i do it then?

def build_model_and_preprocess(hp):
    feature_cols = ['T2M','NO','WS10M','WD10M','PS','QV2M']
    target_col = 'PM10'
    selected_cols = feature_cols + [target_col]
    dataset = df[selected_cols]
    future_target = 1
    ##################
    past_history = hp.Choice('Input window',values=[5,8,10,15,20,30,45,60],default=10)
    ##################
    STEP = 1
    TRAIN_SPLIT = int(len(dataset)*0.7)
    TEST_SPLIT = int(len(dataset)*0.85)
    BATCH_SIZE = 512
    BUFFER_SIZE = 10000
    dataset,data_min,data_max = normalize_dataset(dataset)

    x_train_multi, y_train_multi = multivariate_data(dataset, dataset[target_col], 0,
                                                 TRAIN_SPLIT, past_history,
                                                 future_target, STEP)
    x_val_multi, y_val_multi = multivariate_data(dataset, dataset[target_col],
                                             TRAIN_SPLIT, TEST_SPLIT, past_history,
                                             future_target, STEP)
    train_data_multi = tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
    train_data_multi = train_data_multi.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

    val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
    val_data_multi = val_data_multi.batch(BATCH_SIZE)
    # Build the model
    model = keras.Sequential()
    model.add(layers.LSTM(units=hp.Int('units', min_value=10, max_value=100, step=20),
                          return_sequences=True,
                          input_shape=x_train_multi.shape[-2:]))
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(layers.Dense(1))
    model.compile(optimizer=keras.optimizers.Adam(
        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4],default=0.001)),
        loss='mean_squared_error')

    return model

tuner = BayesianOptimization(
    build_model_and_preprocess,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=1, restore_best_weights=True)
x_train_multi, y_train_multi, x_val_multi, y_val_multi = preprocess_data(past_history=tuner.get_space().get('Input window'),df=um_imputed_no_anom)
tuner.search(x_train_multi, y_train_multi,
             epochs=EPOCHS,
             validation_data=(x_val_multi, y_val_multi),callbacks=[early_stopping])

best_model = tuner.get_best_models(num_models=1)[0]
0

There are 0 answers