1D CNN predictions plot mismatch with actual time series plot

68 views Asked by At

I want to predict the time series of Dengue case counts from a set of covariates using a 1D CNN model. The loss and the metrics (MSE and MAE) look satisfactory. However, the predicted curves for both the training and the testing sets do not match the actual data, and I'm unsure what went wrong. Here is the complete code:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Fetch the tab-separated Dengue dataset; the 'Date' column is parsed and used as the index
file_path = 'https://gist.githubusercontent.com/JishanAhmed2019/a7666a3651d27bf03dc93e63aac896b0/raw/f93f28aeaa41418689744a4fbb5bde29114f9872/Dengue.csv'
data = pd.read_csv(file_path, sep='\t', index_col='Date', parse_dates=True)

# Hold-out by row position (no shuffling): the first 85% of rows train the model,
# the remaining rows form the test set. Scaling happens afterwards.
split_fraction = 0.85
split_point = int(split_fraction * len(data))
train_data, test_data = data.iloc[:split_point], data.iloc[split_point:]

# Separate the target column ('Dhaka_Dengue') from the covariates in each partition
y_train = train_data['Dhaka_Dengue']
X_train = train_data.drop(columns='Dhaka_Dengue')
y_test = test_data['Dhaka_Dengue']
X_test = test_data.drop(columns='Dhaka_Dengue')

# Min-max scale covariates and target with separate scalers. Each scaler is
# fitted on the training partition only and then reused unchanged on the test
# partition, so no test-set statistics leak into the fit.
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

X_train_scaled = scaler_features.fit_transform(X_train)
X_test_scaled = scaler_features.transform(X_test)

# MinMaxScaler works on 2-D input, so the target goes in as a single column
# and is brought back to 1-D afterwards.
y_train_scaled = scaler_target.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test_scaled = scaler_target.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

# Carve a validation set out of the tail of the scaled training rows:
# the last 15% (by position) is held out, the rest is used for fitting.
val_fraction = 0.15
val_split_point = int(len(X_train_scaled) * (1 - val_fraction))

X_train_final, X_val = X_train_scaled[:val_split_point], X_train_scaled[val_split_point:]
y_train_final, y_val = y_train_scaled[:val_split_point], y_train_scaled[val_split_point:]

# Conv1D expects (samples, steps, channels); add a trailing channel axis of size 1.
# NOTE(review): the "steps" axis here is the covariate axis of a single row, so
# the convolution slides across features rather than across time — confirm
# this is the intended formulation.
X_train_final_reshaped = X_train_final[..., np.newaxis]
X_val_reshaped = X_val[..., np.newaxis]
X_test_reshaped = X_test_scaled[..., np.newaxis]

# Define the 1D CNN model: one convolution + max-pooling stage, flattened,
# regularised with dropout and mapped to a single linear output (regression).
# The input shape is (number of covariates, 1), taken from the reshaped
# training array.
model = Sequential([
    Conv1D(32, 5, padding='same', activation=LeakyReLU(alpha=0.1), input_shape=(X_train_final_reshaped.shape[1], 1)),
    MaxPooling1D(2, padding="same"),
   # Conv1D(16, 5, padding='same', activation=LeakyReLU(alpha=0.1)),
   # MaxPooling1D(2, padding="same"),
    Flatten(),
    #Dense(32, activation='relu'),
    Dropout(0.20),
    Dense(1)
])

# Compile the model: MSE is the optimised loss, MAE is reported alongside it
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

# Early stopping callback: stop once val_loss has not improved for 10 epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the predictions and metrics computed after training
# would come from weights up to `patience` epochs past the best one.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

# Train the model; the explicit validation set drives early stopping
history = model.fit(
    X_train_final_reshaped, y_train_final,
    epochs=500,
    batch_size=32,
    verbose=1,
    validation_data=(X_val_reshaped, y_val),
    callbacks=[early_stopping]
)

# For each split: predict in scaled space, map predictions and ground truth
# back to the original target scale with scaler_target, then score RMSE and MAE
# on the original scale.

# Training split
train_predictions = model.predict(X_train_final_reshaped)
# Validation split
val_predictions = model.predict(X_val_reshaped)
# Test split
test_predictions = model.predict(X_test_reshaped)

# Undo the min-max scaling of the model outputs
train_predictions_inverse = scaler_target.inverse_transform(train_predictions).ravel()
val_predictions_inverse = scaler_target.inverse_transform(val_predictions).ravel()
test_predictions_inverse = scaler_target.inverse_transform(test_predictions).ravel()

# Undo the min-max scaling of the ground truth (the scaler needs a column vector)
y_train_inverse = scaler_target.inverse_transform(y_train_final[:, np.newaxis]).ravel()
y_val_inverse = scaler_target.inverse_transform(y_val[:, np.newaxis]).ravel()
y_test_inverse = scaler_target.inverse_transform(y_test_scaled[:, np.newaxis]).ravel()

# Error metrics per split
train_rmse = np.sqrt(mean_squared_error(y_train_inverse, train_predictions_inverse))
train_mae = mean_absolute_error(y_train_inverse, train_predictions_inverse)

val_rmse = np.sqrt(mean_squared_error(y_val_inverse, val_predictions_inverse))
val_mae = mean_absolute_error(y_val_inverse, val_predictions_inverse)

test_rmse = np.sqrt(mean_squared_error(y_test_inverse, test_predictions_inverse))
test_mae = mean_absolute_error(y_test_inverse, test_predictions_inverse)

# Report
print("Training RMSE:", train_rmse, "MAE:", train_mae)
print("Validation RMSE:", val_rmse, "MAE:", val_mae)
print("Testing RMSE:", test_rmse, "MAE:", test_mae)

# Learning curves: per-epoch loss on the training and validation data
loss_fig, loss_ax = plt.subplots(figsize=(10, 4))
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()
plt.show()

# Test set: ground truth vs. model output on the original scale,
# drawn against the test partition's date index
test_fig, test_ax = plt.subplots(figsize=(10, 4))
test_ax.plot(test_data.index, y_test_inverse, label='Actual')
test_ax.plot(test_data.index, test_predictions_inverse, label='Predicted')
test_ax.set_title('Test Set Actual vs. Predicted')
test_ax.legend()
plt.show()


# X-axis values for the two plots below. `train_dates` / `test_dates` were
# never defined in the original script (NameError). The training curves only
# cover the rows left after the validation hold-out, so the training dates are
# truncated to the same length as the training targets.
train_dates = train_data.index[:len(y_train_inverse)]
test_dates = test_data.index

# Plot the actual vs. predicted values for the training set
plt.figure(figsize=(14, 5))
plt.plot(train_dates, y_train_inverse, label='Train Actual')
plt.plot(train_dates, train_predictions_inverse, label='Train Predictions')
plt.title('Training Predictions vs Actual')
plt.ylabel('Dengue Incidents')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# Plot the actual vs. predicted values for the testing set
plt.figure(figsize=(14, 5))
plt.plot(test_dates, y_test_inverse, label='Test Actual')
plt.plot(test_dates, test_predictions_inverse, label='Test Predictions')
plt.title('Testing Predictions vs Actual')
plt.ylabel('Dengue Incidents')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# Reference plot: the complete observed 'Dhaka_Dengue' series over the full date index
series_fig, series_ax = plt.subplots(figsize=(14, 7))  # larger canvas for readability
series_ax.plot(data.index, data['Dhaka_Dengue'], label='Actual Dengue Count', color='blue')
series_ax.set_title('Actual Dengue Count Time Series')
series_ax.set_xlabel('Date')
series_ax.set_ylabel('Dengue Count')
series_ax.legend()
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
plt.tight_layout()
plt.show()

1

There is 1 answer

2
Anonymous On

You are not passing the dates to your model. I don't know what the optimal approach is, but converting the dates to Unix time, or simply adding a plain index column as a feature, seems to work:

from datetime import timezone

from sklearn.model_selection import train_test_split  # used below; was missing

data = pd.read_csv(file_path, index_col='Date', sep='\t', parse_dates=True)
# Move 'Date' out of the index so it becomes an ordinary (first) column
data.reset_index(level=0, inplace=True)
# Encode each date as a Unix timestamp, treating the naive date as UTC
data["Date"] = data["Date"].map(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Separate the covariates (now including the numeric date) from the target
features = data.drop("Dhaka_Dengue", axis=1).values
target = data["Dhaka_Dengue"].values

# Scale features and target with separate scalers. Re-using one MinMaxScaler
# for both overwrites the feature fit with the target fit, so the features
# could no longer be transformed or inverted consistently afterwards.
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()
features_scaled = scaler_features.fit_transform(features)
target_scaled = scaler_target.fit_transform(target.reshape(-1, 1)).flatten()

# Splitting the dataset into training and testing sets.
# NOTE(review): train_test_split shuffles rows by default, so this is a random
# 80/20 split rather than a chronological hold-out — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features_scaled, target_scaled, test_size=0.2, random_state=42
)

enter image description here

# The test rows were shuffled by the split; column 0 of X_test is the scaled
# date, so ordering by it restores chronological order for plotting.
# A stable argsort on the date column avoids the tuple-sort fallback of
# comparing the y / prediction values whenever two dates tie, and keeps the
# results as flat arrays instead of tuples of 1-element arrays.
chronological_order = np.argsort(X_test[:, 0], kind="stable")
y_test_sorted = np.asarray(y_test)[chronological_order]
y_pred_sorted = np.ravel(test_predictions)[chronological_order]

plt.plot(y_test_sorted)
plt.plot(y_pred_sorted)

enter image description here