1.Introduction

I wrote a simple MNIST 3D CNN classifier in TensorFlow eager execution mode.

I simply repeat each 28x28-pixel grayscale image 10 times to build a new 4D image with shape = (D, W, H, C), that is (1, 28, 28, 10), and then use tf.dataset.TFRecord to wrap them into batches of samples with shape = (batch_size, 1, 28, 28, 10).

I split the 60000 MNIST samples into train_dataset and val_dataset, and my 3D CNN Keras model can train and validate on them respectively (on ONE GPU).

2.Problem:

My server has 10 GPUs.

1) When I use a single GPU to train my Keras model, GPU-Util (the "Volatile Uncorr. ECC / GPU-Util / Compute M." column) is ~80% and Memory-Usage is ~3.4 GB.

2) When I use multi_gpu_model() to train my model on multiple GPUs (GPU 0 and 1, for example), nvidia-smi shows that only GPU 0 is working (GPU-Util in the "Volatile Uncorr. ECC / GPU-Util / Compute M." column is ~60% and Memory-Usage is ~3.4 GB), while for GPU 1 GPU-Util is 0% and Memory-Usage is only ~137 MB, as shown in Image 1.

Image 1

I have searched Google many times but could not find a solution. Is there something wrong in my code?

Looking forward to any suggestions. ^ ^

3.My code

My main.py as follows

import tensorflow as tf
import yaml
import os
from sample.models.CNN_3D import get_model
from yaml import CLoader as Loader
from sample.dataset import load_tf_records
# import matplotlib.pyplot as plt
import tensorflow.contrib.eager as tfe
# import numpy as np
from tensorflow.python.keras.utils import multi_gpu_model

# from sklearn.metrics import confusion_matrix

# Load the dataset configuration first, so the device settings below can be
# applied before TensorFlow is switched into eager mode.
_cfg_path = os.path.abspath(os.path.join(os.getcwd(), "../config/mnist_config.yml"))
# Use a context manager so the config file handle is closed after parsing.
# NOTE(review): yaml.load with a full Loader can construct arbitrary objects;
# only use it on trusted config files.
with open(_cfg_path) as _cfg_file:
    cfg = yaml.load(_cfg_file, Loader=Loader)["DATASET"][0]

print("cfg =\n", cfg)

# NOTE(review): TF_CPP_MIN_LOG_LEVEL is presumably read when TensorFlow is
# loaded; it may have no effect when set after `import tensorflow` - confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # ignore warning
# Restrict the visible GPUs before eager execution is enabled.
os.environ["CUDA_VISIBLE_DEVICES"] = cfg["DEVICES_IDS"]
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Eager execution does not run ops through a tf.Session, so the GPU options
# must be handed to enable_eager_execution to take effect in eager mode.
tf.enable_eager_execution(config=config)  # start eager mode
# Kept for backward compatibility: the Keras backend still gets a session
# configured with the same options.
session = tf.Session(config=config)
tf.keras.backend.set_session(session)


def loss(net, inputs, gts):
    """
    Compute the summed softmax cross-entropy of a batch

    :param net: model, called on ``inputs`` to produce the logits
    :param inputs: a batch of input tensor
    :param gts: a batch of ground truth labels
    :return: scalar loss value (cross-entropy summed over all entries)
    """
    # Forward pass: the model output is treated as unnormalised logits.
    logits = net(inputs)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=gts,
        logits=logits
    )
    # Sum (not mean) over the batch, so the value scales with batch size.
    return tf.reduce_sum(cross_entropy)


def train_step(loss_f, net, opt, x, y):
    """
    Run one optimizer update on a single batch

    :param loss_f: loss function, called as ``loss_f(net, x, y)``
    :param net: network model
    :param opt: optimizer whose ``minimize`` performs the update
    :param x: a batch of input tensor
    :param y: a batch of ground truth labels
    :return: None
    """

    def batch_loss():
        # Zero-argument callable: minimize() evaluates the loss itself.
        return loss_f(net, x, y)

    # The global step is passed to minimize() together with the loss callable.
    step_counter = tf.train.get_or_create_global_step()
    opt.minimize(batch_loss, global_step=step_counter)


if __name__ == '__main__':
    # Script entry point: build the model (optionally wrapped for several
    # GPUs), then train and validate it while logging scalars to TensorBoard.
    # TODO save/load model; add AUC; add early stop
    train_dataset, val_dataset = load_tf_records()

    # DEVICES_IDS is a comma-separated string of GPU ids (e.g. "4,5");
    # count the entries once instead of re-splitting the string.
    num_gpus = len(cfg["DEVICES_IDS"].split(","))
    if num_gpus > 1:  # use multi_GPUs
        # Build the template model on the CPU before replicating it.
        with tf.device('/cpu:0'):
            train_model = get_model(summary=True, data_format=cfg["DATA_FORMAT"])
        # NOTE(review): multi_gpu_model may not distribute work when the model
        # is called under eager execution (only one GPU shows load) - confirm
        # against the TensorFlow documentation for the version in use.
        train_model = multi_gpu_model(train_model, gpus=num_gpus)
    else:  # use single GPU
        train_model = get_model(summary=True, data_format=cfg["DATA_FORMAT"])

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

    # use TensorBoard
    tb_path = os.path.join(cfg["ROOT"], cfg["TB_FOLDER"])
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(tb_path, exist_ok=True)
    writer = tf.contrib.summary.create_file_writer(tb_path)
    global_step = tf.train.get_or_create_global_step()
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        # Loop over the epochs
        for epoch in range(cfg["EPOCHS"]):
            # Initialize the metrics (fresh accumulators every epoch)
            train_acc = tfe.metrics.Accuracy(name="train_acc")
            val_acc = tfe.metrics.Accuracy(name="val_acc")

            for xb, yb in tfe.Iterator(train_dataset.batch(cfg["BATCH_SIZE"])):
                # Save the loss on disk (evaluated before the update step)
                tf.contrib.summary.scalar("train_loss", loss(train_model, xb, yb))
                # Make a training step
                train_step(loss, train_model, optimizer, xb, yb)
                # Accuracy is measured with the already-updated weights; this
                # is a separate forward pass from the two above.
                train_acc(tf.argmax(train_model(tf.constant(xb)), axis=1), tf.argmax(tf.constant(yb), axis=1))
                train_acc.result(write_summary=True)
                if (global_step.numpy() + 1) % 10 == 0:
                    break  # TODO need to remove
            for xb, yb in tfe.Iterator(val_dataset.batch(cfg["BATCH_SIZE"])):
                tf.contrib.summary.scalar("val_loss", loss(train_model, xb, yb))
                # Save the validation accuracy on the batch
                val_acc(tf.argmax(train_model(tf.constant(xb)), axis=1), tf.argmax(tf.constant(yb), axis=1))
                val_acc.result(write_summary=True)
                break  # TODO need to remove

            # Save the overall accuracy in our vector
            # acc_history[epoch] = accuracy.result().numpy()

    # # At the end, plot the evolution of the training accuracy
    # plt.figure()
    # plt.plot(acc_history)
    # plt.xlabel('Epoch')
    # plt.ylabel('Accuracy')
    # plt.show()

The output of print("cfg =\n", cfg) is

cfg = {'NAME': 'MNIST', 'ROOT': '/home/csy/Jupyters/MNIST', 'TFR_FOLDER': 'tfrecords', 'PIX_BOUND': [0.0, 1.0], 'ZERO_CENTER': 0.5, 'CUBE_SIZE': [10, 32, 32, 1], 'NUM_PARALLEL_CALLS': 32, 'PREFETCH_BUFFER_SIZE': 3200, 'NUM_CLASSES': 10, 'BATCH_SIZE': 32, 'SUM_OF_ALL_DATASAMPLES': 60000, 'DEVICES_IDS': '4,5', 'EPOCHS': 10, 'LEARNING_RATE': '1e-1', 'DATA_FORMAT': 'channels_last', 'TB_FOLDER': 'tensorboard'}

0 Answers