Slow training despite using a tf.data pipeline

I am training an image classification model on a dataset of 21,000 images, and I have built the input pipeline with TensorFlow's tf.data API. My issue is that training is still too slow despite using this API, even though I have the GPU build of TensorFlow enabled. At first I thought the Keras ImageDataGenerator was slowing down my training time, but now that I have switched to a tf.data pipeline, the GPU is still not being utilized. Please help me out. Below is my whole code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50, EfficientNetB3, InceptionV3, DenseNet121
from tensorflow.keras.optimizers import Adam

# ignoring warnings
import warnings
warnings.simplefilter("ignore")
import os, cv2


base_dir = "D:/cassava-leaf-disease-classification/"
train_csv = pd.read_csv("D:/cassava-leaf-disease-classification/train.csv")
# print(train_csv.head())
df_sample = pd.read_csv("D:/cassava-leaf-disease-classification/sample_submission.csv")
train_images = "D:/cassava-leaf-disease-classification/train_images/"+train_csv['image_id']
# print(train_images)

# print(os.listdir(train_images))
train_labels = pd.read_csv(os.path.join(base_dir, "train.csv"))

# print(train_labels)
BATCH_SIZE = 16
EPOCHS = 25
STEPS_PER_EPOCH = int(len(train_labels) * 0.8 / BATCH_SIZE)  # model.fit expects an integer step count
TARGET_SIZE = 300
# train_labels['label'] = train_labels.label.astype('str')
labels = train_labels.iloc[:,-1].values
# print(labels)

def build_decoder(with_labels=True, target_size=(TARGET_SIZE, TARGET_SIZE), ext='jpg'):
    def img_decode(img_path):
        file_bytes = tf.io.read_file(img_path)
        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        elif ext in ['jpg', 'jpeg']:
            img = tf.image.decode_jpeg(file_bytes, channels=3)
        else:
            raise ValueError("Image extension not supported")

        # Note: the tf.keras EfficientNet models rescale inputs internally
        # and expect pixel values in [0, 255], so this /255.0 is likely
        # redundant for the EfficientNetB3 backbone used below.
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)

        return img

    def decode_with_labels(img_path, label):
        return img_decode(img_path), label

    if with_labels:
        return decode_with_labels
    else:
        return img_decode


def build_augmenter(with_labels=True):
    def augment(img):
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        img = tf.image.random_brightness(img, 0.1)
        img = tf.image.random_contrast(img, 0.9, 1.1)
        img = tf.image.random_saturation(img, 0.9, 1.1)
        return img

    def augment_with_labels(img, label):
        return augment(img), label

    if with_labels:
        return augment_with_labels
    else:
        return augment


def build_dataset(paths, labels=None, bsize=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)

    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)

    if augment_fn is None:
        augment_fn = build_augmenter(labels is not None)

    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
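    # Note: with cache() commented out below, the `cache` and `cache_dir`
    # arguments have no effect, and every image is re-read and re-decoded
    # on each epoch.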
    # dset = dset.cache(cache_dir) if cache else dset
    dset = dset.map(augment_fn, num_parallel_calls=AUTO) if augment else dset
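    # Note: shuffle() here runs after repeat(), which mixes examples across
    # epoch boundaries; the usual tf.data ordering is shuffle before repeat.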
    dset = dset.repeat() if repeat else dset
    dset = dset.shuffle(shuffle) if shuffle else dset
    dset = dset.batch(bsize).prefetch(AUTO)

    return dset

# Train test split
(train_img, valid_img, train_labels, valid_labels) = train_test_split(
    train_images, labels, train_size=0.8, random_state=0)
# print(train, valid)


# Tensorflow datasets
train_df = build_dataset(
    train_img, train_labels, bsize=BATCH_SIZE,
    cache=True)

valid_df = build_dataset(
    valid_img, valid_labels, bsize=BATCH_SIZE,
    repeat=False, shuffle=False, augment=False,
    cache=True)


def create_model():
    model = models.Sequential()
    model.add(EfficientNetB3(include_top=False, weights='imagenet',
                             input_shape=(TARGET_SIZE, TARGET_SIZE, 3)))
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dense(5, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001),  # `lr` is deprecated; use `learning_rate`
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

model = create_model()
model.summary()

model_save = ModelCheckpoint('C:/Users/rosha/PycharmProjects/CLDD/saved_Models/EffNetB3_300_16_best_weights.h5',
                             save_best_only=True,
                             save_weights_only=True,
                             monitor='val_accuracy',
                             mode='max',
                             verbose=1
                             )

early_stop = EarlyStopping(monitor='val_accuracy',
                           min_delta=0.001,
                           patience=5,
                           mode='max',
                           verbose=1)

reduce_lr = ReduceLROnPlateau(monitor='val_accuracy',
                              factor=0.3,
                              patience=2,
                              min_delta=0.001,
                              mode='max',
                              verbose=1)

history = model.fit(
    train_df,
    validation_data=valid_df,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    callbacks=[model_save, early_stop, reduce_lr],
    verbose=1,
)


plt.rcParams.update({'font.size': 16})
hist = pd.DataFrame(history.history)
fig, (ax1, ax2) = plt.subplots(figsize=(12, 12), nrows=2, ncols=1)
hist['loss'].plot(ax=ax1, c='k', label='training loss')
hist['val_loss'].plot(ax=ax1, c='r', linestyle='--', label='validation loss')
ax1.legend()
hist['accuracy'].plot(ax=ax2, c='k', label='training accuracy')
hist['val_accuracy'].plot(ax=ax2, c='r', linestyle='--', label='validation accuracy')
ax2.legend()
plt.show()

model.save('./EffNetB3_300_16.h5')

1 Answer

Answered by VBence:

So here is a small checklist I like to go over:

  1. Execute the following code to check whether the GPU is found by TensorFlow:

     import tensorflow as tf
     print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

  2. If the output is "Num GPUs Available: 0", check that you actually have tensorflow-gpu installed; you might also want to check that the supporting libraries (CUDA toolkit, cuDNN) are the GPU versions as well (a quick check is sketched after this list).

  3. If your libraries are correct, you will need to verify that your CUDA driver installation is correct. This step is somewhat OS-dependent, but there are many tutorials online for each platform. My favourite guide for TF can be found on the official website: https://www.tensorflow.org/install/gpu
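To make steps 2 and 3 concrete, here is a minimal sketch (assuming a TF 2.x eager setup; it uses only standard tf.test, tf.config, and tf.debugging calls) that checks whether the installed build was compiled with CUDA and whether ops are actually placed on the GPU:

import tensorflow as tf

# False means the CPU-only package is installed; no driver fix will
# help until the GPU build of TensorFlow is installed.
print("Built with CUDA:", tf.test.is_built_with_cuda())

# Which physical GPUs can TensorFlow see right now?
print("Visible GPUs:", tf.config.list_physical_devices('GPU'))

# Log the device every op is placed on, then run a small matmul;
# the printed device should be /device:GPU:0 if the GPU is in use.
tf.debugging.set_log_device_placement(True)
a = tf.random.uniform((1024, 1024))
b = tf.random.uniform((1024, 1024))
print(tf.matmul(a, b).device)

If the matmul lands on the CPU even though a GPU is listed, the usual culprit is a CUDA/cuDNN version mismatch with the installed TensorFlow wheel.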