I have no coding experience and am new to Python. Task: use a CNN for binary image classification. Problem: memory error.
The data are confidential, so an example image is attached: [example image][1]

There are two classes of images: 294 images of class 'e' and 5057 images of class 'l'. Because the dataset is imbalanced, the original plan was to set batch_size=500 in datagen.flow_from_directory for each class, so that every batch fed to the model contained the whole 'e' dataset (only 294 images) together with 500 images of class 'l'. However, Google Colab kept crashing because it ran out of RAM. Reducing batch_size to 50 still failed.
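For scale, here is a rough back-of-the-envelope estimate of what the whole dataset would take if it were all held in RAM at once as float32 arrays (my own assumption, just to show the order of magnitude):

# Rough estimate only: assumes all 294 + 5057 images sit in RAM at once
# as float32 arrays of shape (128, 128, 3).
n_images = 294 + 5057
bytes_per_image = 128 * 128 * 3 * 4        # float32 = 4 bytes per value
print(n_images * bytes_per_image / 1e9)    # ~1.05 GB before any extra copies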
# x = image data, y = labels, bs = batch size
import random

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import CSVLogger

bs = 50

# Yield random (x, y) batches forever.
def generate_batch_data_random(x, y, bs):
    ylen = len(y)
    loopcount = ylen // bs
    while True:
        # randint is inclusive at both ends, so use loopcount - 1 to avoid slicing past the end
        i = random.randint(0, loopcount - 1)
        yield x[i * bs:(i + 1) * bs], y[i * bs:(i + 1) * bs]

def train_and_validate_model(model, x, y):
    (trainX, testX, trainY, testY) = train_test_split(x, y, test_size=0.25, random_state=6)
    trainY = to_categorical(trainY, num_classes=2)
    testY = to_categorical(testY, num_classes=2)
    logger = CSVLogger(kfold_train_and_validate, append=True)  # CSV log file name (defined elsewhere)
    H = model.fit_generator(generator=generate_batch_data_random(trainX, trainY, bs),
                            steps_per_epoch=len(trainX) // bs,
                            epochs=10,
                            validation_data=generate_batch_data_random(testX, testY, bs),
                            validation_steps=len(testX) // bs,
                            callbacks=[checkpoint])
    return H, testX, testY
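To check my understanding of the generator, this is how I expect it to be consumed (a small sketch only; trainX and trainY here are placeholders for a NumPy image array of shape (N, 128, 128, 3) and its one-hot labels):

# Sketch: pull one random batch from the generator and inspect its shape.
gen = generate_batch_data_random(trainX, trainY, bs)
x_batch, y_batch = next(gen)   # one random batch
print(x_batch.shape)           # expected: (50, 128, 128, 3)
print(y_batch.shape)           # expected: (50, 2)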
To save memory, I wanted to use ImageDataGenerator. K-fold splits seemed more appropriate than fixed training and validation groups, so the dataset structure was built around the image classes (one folder of images per class) rather than around training/validation folders. The plan was to use ImageDataGenerator to deliver images in batches, then use k-fold to split each batch into training and validation groups.
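For comparison, my understanding of the usual pattern is that flow_from_directory can read both class subfolders in one call and stream batches lazily, so only one batch of images sits in RAM at a time (a minimal sketch using the standard validation_split/subset options, not the code I actually ran):

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Minimal sketch: infer the two classes ('e', 'l') from the subfolder names
# and stream 50 images at a time with 0/1 labels.
path = '/content/drive/MyDrive/er_lr/erlr_vs_er'
sketch_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.25)
train_gen = sketch_datagen.flow_from_directory(directory=path,
                                               target_size=(128, 128),
                                               batch_size=50,
                                               class_mode='binary',
                                               subset='training')
val_gen = sketch_datagen.flow_from_directory(directory=path,
                                             target_size=(128, 128),
                                             batch_size=50,
                                             class_mode='binary',
                                             subset='validation')

What I actually ran is below: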
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import StratifiedKFold

path = '/content/drive/MyDrive/er_lr/erlr_vs_er'
datagen = ImageDataGenerator(rescale=1./255)
data_e = datagen.flow_from_directory(directory=path,
                                     target_size=(128, 128),
                                     classes=['e'],
                                     batch_size=50,
                                     class_mode='categorical')
x_e, y_e = next(data_e)
data_l = datagen.flow_from_directory(directory=path,
                                     classes=['l'],
                                     target_size=(128, 128),
                                     batch_size=50,
                                     class_mode='categorical')
x_l, y_l = next(data_l)
# relabel: class 'e' -> 0, class 'l' -> 1
for i in range(len(y_e)):
    y_e[i] = 0
for j in range(len(y_l)):
    y_l[j] = 1
x = []
y = []
x.extend(np.array(data_e)[0][0])   # images from the first batch of each iterator
x.extend(np.array(data_l)[0][0])
y.extend(np.array(y_e))
y.extend(np.array(y_l))
x = np.array(x)   # convert to arrays so the x[train] / y[train] indexing below works
y = np.array(y)
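As a sanity check, I also looked at what ends up in x and y at this point; my expectation is that only one 50-image batch per class is collected here:

print(x.shape)   # I expect roughly (100, 128, 128, 3): one 50-image batch per class
print(y.shape)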
seed = 10
np.random.seed(seed)
filepath = '/content/drive/MyDrive/er_lr/hdf5/my_best_model.epoch{epoch:02d}-loss{val_loss:.2f}.hdf5'
fold = 1
training_ACCs, training_loses, val_ACCs, val_loses = [], [], [], []   # collect per-fold history
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for train, test in kfold.split(x, y):
    model = create_model()   # create_model() builds the CNN (defined elsewhere)
    checkpoint = keras.callbacks.ModelCheckpoint(filepath,
                                                 monitor='val_loss',
                                                 save_weights_only=True,
                                                 verbose=1,
                                                 save_best_only=True,
                                                 save_freq='epoch')
    H, validationX, validationY = train_and_validate_model(model, x[train], y[train])
    training_ACCs.append(H.history['accuracy'])
    training_loses.append(H.history['loss'])
    val_ACCs.append(H.history['val_accuracy'])
    val_loses.append(H.history['val_loss'])
    labels_test_cat = to_categorical(y[test], num_classes=2)
    scores = model.evaluate(x[test], labels_test_cat, verbose=0)
    fold = fold + 1
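Separately, I also wondered whether Keras's class_weight argument to fit would be a simpler way to handle the 294-vs-5057 imbalance than balancing batches by hand (a hedged sketch only, not something I have tested on this data):

# Sketch: weight the loss per class instead of rebalancing the batches.
# Labels as above: class 'e' -> 0 (294 images), class 'l' -> 1 (5057 images).
n_e, n_l = 294, 5057
total = n_e + n_l
class_weight = {0: total / (2 * n_e),   # ~9.10 for the rare class 'e'
                1: total / (2 * n_l)}   # ~0.53 for the frequent class 'l'
# model.fit(..., class_weight=class_weight)

My main problem for now, though, is the memory error: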
Google Colab crashed repeatedly with an out-of-RAM error. A batch_size of 50 and an image shape of (128, 128, 3) per image do not seem large. Any thoughts?

  [1]: https://i.stack.imgur.com/Lp1H9.png