I have two directories, train_data_npy and valid_data_npy, which contain 3013 and 1506 *.npy files, respectively. Each *.npy file has 11 columns of floats: the first eight columns are features, and the last three columns are the one-hot-encoded labels (characters) of three classes.
----------------------------------------------------------------------
 f1      f2      f3      f4   f5   f6   f7   f8   ---classes---
----------------------------------------------------------------------
 0.0     0.0     0.0     1.0  1.0  1.0  1.0  1.0  0.0  0.0  1.0
 6.559   9.22    0.0     1.0  1.0  1.0  1.0  1.0  0.0  0.0  1.0
 5.512   6.891   10.589  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0
 7.082   8.71    7.227   0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 6.352   9.883   12.492  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 6.711   10.422  13.44   0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0
 7.12    9.283   12.723  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 6.408   9.277   12.542  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 6.608   9.686   12.793  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 6.723   8.602   12.168  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 ...
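For reference, here is a minimal sketch of how one such file splits into features and labels (the file name is a placeholder):

import numpy as np

# Hypothetical file name; every *.npy file has the same layout.
data = np.load('train_data_npy/sample_0001.npy')
x = data[:, :8]   # first eight columns: features f1..f8
y = data[:, 8:]   # last three columns: one-hot-encoded class label
print(x.shape, y.shape)   # (n_rows, 8) (n_rows, 3)
print(y[0])               # e.g. [0. 0. 1.]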
Given the format of the data, I have written two scripts: cnn_autokeras_by_chunk_with_ohe.py uses the OHE labels as they are, and cnn_autokeras_by_chunk_without_ohe.py converts the OHE labels into integers. The first one achieves an accuracy of 0.40, and the second one achieves an accuracy of 0.97. Why does one-hot encoding give worse accuracy in this case? Each script's task is to load the *.npy files in chunks so that memory does not overflow while searching for the best model.
# File: cnn_autokeras_by_chunk_with_ohe.py
import os

import numpy as np
import tensorflow as tf
import autokeras as ak

# Update these values to match your actual data
N_FEATURES = 8
N_CLASSES = 3  # Number of classes
BATCH_SIZE = 100


def get_data_generator(folder_path, batch_size, n_features, n_classes):
    """Get a generator returning batches of data from the .npy files in the specified folder.

    The shape of the features is (batch_size, n_features).
    The shape of the labels is (batch_size, n_classes).
    """
    def data_generator():
        files = os.listdir(folder_path)
        npy_files = [f for f in files if f.endswith('.npy')]
        for npy_file in npy_files:
            data = np.load(os.path.join(folder_path, npy_file))
            x = data[:, :n_features]
            y = data[:, n_features:]
            for i in range(0, len(x), batch_size):
                yield x[i:i + batch_size], y[i:i + batch_size]
    return data_generator


train_data_folder = '/home/my_user_name/original_data/train_data_npy'
validation_data_folder = '/home/my_user_name/original_data/valid_data_npy'

train_dataset = tf.data.Dataset.from_generator(
    get_data_generator(train_data_folder, BATCH_SIZE, N_FEATURES, N_CLASSES),
    output_signature=(
        tf.TensorSpec(shape=(None, N_FEATURES), dtype=tf.float32),
        tf.TensorSpec(shape=(None, N_CLASSES), dtype=tf.float32)  # Labels are 2D one-hot vectors
    )
)
validation_dataset = tf.data.Dataset.from_generator(
    get_data_generator(validation_data_folder, BATCH_SIZE, N_FEATURES, N_CLASSES),
    output_signature=(
        tf.TensorSpec(shape=(None, N_FEATURES), dtype=tf.float32),
        tf.TensorSpec(shape=(None, N_CLASSES), dtype=tf.float32)  # Labels are 2D one-hot vectors
    )
)

# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(max_trials=10)  # Set max_trials to any value you desire.

# Feed the tensorflow Dataset to the classifier.
clf.fit(train_dataset, epochs=100)

# Get and print the best hyperparameters.
best_hps = clf.tuner.get_best_hyperparameters()[0]
print(best_hps)

# Export the best model.
model = clf.export_model()

# Save the model in TensorFlow SavedModel format (note the lack of a .h5 extension).
model.save("heca_v2_model_with_ohe", save_format='tf')

# Evaluate the best model on the validation data.
print(clf.evaluate(validation_dataset))
# File: cnn_autokeras_by_chunk_without_ohe.py
import os

import numpy as np
import tensorflow as tf
import autokeras as ak

N_FEATURES = 8
N_CLASSES = 3  # Number of classes
BATCH_SIZE = 100


def get_data_generator(folder_path, batch_size, n_features):
    """Get a generator returning batches of data from the .npy files in the specified folder.

    The shape of the features is (batch_size, n_features).
    The shape of the labels is (batch_size,).
    """
    def data_generator():
        files = os.listdir(folder_path)
        npy_files = [f for f in files if f.endswith('.npy')]
        for npy_file in npy_files:
            data = np.load(os.path.join(folder_path, npy_file))
            x = data[:, :n_features]
            y = data[:, n_features:]
            # Convert one-hot-encoded labels back to integers; the cast keeps
            # the dtype consistent with the int32 output_signature below.
            y = np.argmax(y, axis=1).astype(np.int32)
            for i in range(0, len(x), batch_size):
                yield x[i:i + batch_size], y[i:i + batch_size]
    return data_generator


train_data_folder = '/home/my_user_name/original_data/train_data_npy'
validation_data_folder = '/home/my_user_name/original_data/valid_data_npy'

train_dataset = tf.data.Dataset.from_generator(
    get_data_generator(train_data_folder, BATCH_SIZE, N_FEATURES),
    output_signature=(
        tf.TensorSpec(shape=(None, N_FEATURES), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.int32)  # Labels are 1D integers
    )
)
validation_dataset = tf.data.Dataset.from_generator(
    get_data_generator(validation_data_folder, BATCH_SIZE, N_FEATURES),
    output_signature=(
        tf.TensorSpec(shape=(None, N_FEATURES), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.int32)  # Labels are 1D integers
    )
)

# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(max_trials=10)  # Set max_trials to any value you desire.

# Feed the tensorflow Dataset to the classifier.
clf.fit(train_dataset, epochs=100)

# Get and print the best hyperparameters.
best_hps = clf.tuner.get_best_hyperparameters()[0]
print(best_hps)

# Export the best model.
model = clf.export_model()

# Save the model in TensorFlow SavedModel format (note the lack of a .h5 extension).
model.save("heca_v2_model_without_ohe", save_format='tf')

# Evaluate the best model on the validation data.
print(clf.evaluate(validation_dataset))
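A quick way to check what each run actually optimized is to reload the two exported models and inspect their compiled loss. This is a sketch; it assumes the two SavedModel directories produced by the scripts above exist:

import autokeras as ak
from tensorflow.keras.models import load_model

for path in ('heca_v2_model_with_ohe', 'heca_v2_model_without_ohe'):
    model = load_model(path, custom_objects=ak.CUSTOM_OBJECTS)
    print(path, '->', model.loss)  # e.g. 'categorical_crossentropy'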
EDIT:
0 MSE C 0.000 0.000 0.000 1 1 1 1 1 0
1 ASN C 7.042 9.118 0.000 1 1 1 1 1 0
2 LEU H 5.781 5.488 7.470 0 0 0 0 1 0
3 THR H 5.399 5.166 6.452 0 0 0 0 0 0
4 GLU H 5.373 4.852 6.069 0 0 0 0 1 0
5 LEU H 5.423 5.164 6.197 0 0 0 0 2 0
(1) - residue number, for debugging purposes only (NOT A FEATURE)
(2) - residue type, for debugging purposes only (NOT A FEATURE)
(3) - secondary structure (TRUE LABEL)
(4) - r13
(5) - r14
(6) - r15
(7) - neighbor count within 4 Å
(8) - neighbor count within 4.5 Å
(9) - neighbor count within 5 Å
(10) - neighbor count within 6 Å
(11) - neighbor count within 8 Å
(12) - hydrogen bond count
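To make the mapping from these raw rows to the 11-column *.npy layout concrete, here is a hedged sketch. The set and order of the one-hot classes is an assumption (secondary structure is commonly labelled H/E/C), and so is the choice of which eight numeric columns become f1..f8; verify both against the code that actually generated the *.npy files:

import numpy as np

CLASSES = ['H', 'E', 'C']  # hypothetical class order for the OHE columns

def raw_row_to_npy_row(ss, features):
    """Map a secondary-structure letter plus eight feature values
    to one 11-column row (8 features + 3 one-hot label columns)."""
    one_hot = np.eye(len(CLASSES), dtype=np.float32)[CLASSES.index(ss)]
    return np.concatenate([np.asarray(features, dtype=np.float32), one_hot])

# Example with the 'LEU H' row above, dropping the two debug columns
# and (as an assumption) the last numeric column:
print(raw_row_to_npy_row('H', [5.781, 5.488, 7.470, 0, 0, 0, 0, 1]))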
Assumption 1: With OHE preprocessing the labels have 3 columns, so, based on the StructuredDataClassifier documentation, autokeras will pick categorical_crossentropy.

Assumption 2: Without OHE the labels have 1 column (with possible integer values 0, 1, or 2), so, based on the documentation mentioned above, autokeras will now pick binary_crossentropy (I guess).

So without the full data (or at least a minimal viable dataset) we have a hard time helping. I would recommend debugging as follows: try autokeras' StructuredDataRegressor.

Note: From my experience (but who am I to judge), I would expect the classification problem to perform better, as it is the easier task to begin with. So I would not trust your current regression performance, and you did right to ask here.
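If the loss inference is indeed the culprit, one way to take the guesswork out is to pin the task explicitly instead of letting autokeras infer it from the label shape. The constructor arguments below do exist on ak.StructuredDataClassifier; whether they change the result here is an assumption:

import autokeras as ak

# Force 3-class, single-label classification with an explicit loss,
# rather than letting autokeras infer the task from the label columns.
clf = ak.StructuredDataClassifier(
    num_classes=3,
    multi_label=False,
    loss='categorical_crossentropy',
    max_trials=10,
)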