TensorFlow for MultiGPU

585 views Asked by At

If someone can help me understand the situation it would be great. Thanks in advance. My setup: OS: Ubuntu 16.04, 2 Titan X GPUs. TensorFlow (version 0.12.1) installed in a conda environment using pip as on TF docs. Python 3.5.

Code: I ran the following code to test my 2-GPU setup, once with random_matrix = tf.zeros(...) and once with random_matrix = tf.random_uniform(...). The outputs are shown below.

Questions: 1) When I run with tf.zeros, the timings on CPU and GPU are identical, but with tf.random_uniform I see that the GPU is faster (as I had expected). Why is tf.zeros slower on GPU? What am I missing? 2) I have fixed the global seed and the local seed. Why are the outputs on the GPUs different in the tf.random_uniform case?

Thanks a lot for any insights in advance.

import sys
import numpy as np
import tensorflow as tf
from datetime import datetime

# Devices to benchmark and the square-matrix shapes run on each of them.
device_names = ["/cpu:0", "/gpu:0", "/gpu:1"]
shapes = [(3000, 3000), (6000, 6000), (9000, 9000), (12000, 12000)]

# One list of result tuples per device, in the order of device_names.
all_timings = []
tf.set_random_seed(1234)
for device_name in device_names:
    device_timings = []
    for shape in shapes:
        print("device_name:::::::::{}".format(device_name))
        with tf.device(device_name):
            # Swap in the next line to benchmark an all-zeros matrix instead.
            # random_matrix = tf.zeros(shape)
            random_matrix = tf.random_uniform(shape=shape,
                                              minval=0,
                                              maxval=1,
                                              seed=1234)
            result_op = tf.reduce_sum(
                tf.matmul(random_matrix, tf.transpose(random_matrix)))

        # The clock starts before the session is constructed, so the reported
        # time covers session creation as well as the single run of the op.
        start_time = datetime.now()
        with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as session:
            result = session.run(result_op)
            time_diff = datetime.now() - start_time
            device_timings.append((device_name,
                                   shape,
                                   "time_taken (secs): {}".format(time_diff.total_seconds()),
                                   "result: {}".format(result)))
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n")
    # Must happen once per device, inside the device loop; otherwise only the
    # last device's timings would be kept.
    all_timings.append(device_timings)

print("\n\n")
for device_timings in all_timings:
    for t in device_timings:
        print(t)
    # Separator after each device's group of results.
    print("---------------------------------------------------------\n\n")

Timings with tf.random_uniform():

('/cpu:0', (3000, 3000), 'time_taken (secs): 1.146831', 'result:     6754431488.0')
('/cpu:0', (6000, 6000), 'time_taken (secs): 2.816985', 'result: 54023852032.0')
('/cpu:0', (9000, 9000), 'time_taken (secs): 9.372665', 'result: 184425938944.0')
('/cpu:0', (12000, 12000), 'time_taken (secs): 21.718614', 'result: 439655661568.0')
--------------------------------------------------------


('/gpu:0', (3000, 3000), 'time_taken (secs): 0.39667', 'result: 6754406912.0')
('/gpu:0', (6000, 6000), 'time_taken (secs): 0.085984', 'result: 54006796288.0')
('/gpu:0', (9000, 9000), 'time_taken (secs): 0.221407', 'result: 182251880448.0')
('/gpu:0', (12000, 12000), 'time_taken (secs): 0.444187', 'result: 431996174336.0')
---------------------------------------------------------


('/gpu:1', (3000, 3000), 'time_taken (secs): 0.399159', 'result: 6754401792.0')
('/gpu:1', (6000, 6000), 'time_taken (secs): 0.102889', 'result: 54006857728.0')
('/gpu:1', (9000, 9000), 'time_taken (secs): 0.262842', 'result: 182251585536.0')
('/gpu:1', (12000, 12000), 'time_taken (secs): 0.469139', 'result: 431996141568.0')
---------------------------------------------------------

Timings with tf.zeros():

('/cpu:0', (3000, 3000), 'time_taken (secs): 1.040602', 'result: 0.0')
('/cpu:0', (6000, 6000), 'time_taken (secs): 2.760587', 'result: 0.0')
('/cpu:0', (9000, 9000), 'time_taken (secs): 9.134257', 'result: 0.0')
('/cpu:0', (12000, 12000), 'time_taken (secs): 21.410583', 'result: 0.0')
---------------------------------------------------------


('/gpu:0', (3000, 3000), 'time_taken (secs): 0.394707', 'result: 0.0')
('/gpu:0', (6000, 6000), 'time_taken (secs): 2.750311', 'result: 0.0')
('/gpu:0', (9000, 9000), 'time_taken (secs): 9.141721', 'result: 0.0')
('/gpu:0', (12000, 12000), 'time_taken (secs): 21.441183', 'result: 0.0')
 --------------------------------------------------------


('/gpu:1', (3000, 3000), 'time_taken (secs): 0.390197', 'result: 0.0')
('/gpu:1', (6000, 6000), 'time_taken (secs): 2.788815', 'result: 0.0')
('/gpu:1', (9000, 9000), 'time_taken (secs): 9.335516', 'result: 0.0')
('/gpu:1', (12000, 12000), 'time_taken (secs): 21.654866', 'result: 0.0')
2

There are 2 answers

0
Prabu On

Thanks Yaroslav! I provide the code and results from my run, just in case somebody else is interested. If you try the code please be patient for a few minutes.

Code:

import sys
import numpy as np
import tensorflow as tf
from datetime import datetime


# Devices the benchmark op is placed on, in the order they are run.
device_names = ["/cpu:0"] + ["/gpu:{}".format(gpu_index) for gpu_index in range(2)]
# Square matrix shapes, smallest first: 3000, 6000, 9000 and 12000 per side.
shapes = [(side, side) for side in range(3000, 12001, 3000)]
# Report entries (header strings and result tuples) accumulated across runs.
messages = ["RESULTS\n"]

def timing_run(matrix_type, config_name, warmup):
    """Time ``reduce_sum(matmul(M, transpose(M)))`` on every device and shape.

    One op is built and one session is opened per (device, shape) pair taken
    from the module-level ``device_names`` and ``shapes``.  The first
    ``session.run`` is always timed and recorded.  When ``warmup`` is true the
    same op is run and timed a second time in the same session and recorded
    with a ``*****WARMED UP*****`` marker.  All records are appended to the
    module-level ``messages`` list; nothing is returned.

    Args:
        matrix_type: ``"random_uniform"`` builds M with ``tf.random_uniform``;
            any other value builds M with ``tf.zeros``.
        config_name: Key selecting the session config, ``"simple"`` or
            ``"optim"``.
        warmup: If true, also time a second run of the same op.

    Raises:
        KeyError: If ``config_name`` is not a known config key.  The lookup
            happens before anything is appended to ``messages``.
    """
    # NOTE(review): despite its name, "optim" selects opt_level L0, which the
    # TensorFlow OptimizerOptions docs describe as "no optimizations" -- confirm.
    configs = {"simple": tf.ConfigProto(log_device_placement=False),
               "optim": tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))}
    session_config = configs[config_name]

    messages.append("matrix={}+config={}+warmup={}".format(matrix_type, config_name, warmup))
    tf.set_random_seed(1234)
    for device_name in device_names:
        for shape in shapes:
            with tf.device(device_name):
                if matrix_type == "random_uniform":
                    random_matrix = tf.random_uniform(shape=shape,
                                                      minval=0,
                                                      maxval=1,
                                                      seed=1234)
                else:
                    random_matrix = tf.zeros(shape)
                result_op = tf.reduce_sum(tf.matmul(random_matrix, tf.transpose(random_matrix)))

            # The context manager closes the session even if a run raises.
            with tf.Session(config=session_config) as session:
                # First (cold) run; only session.run itself is timed.
                start_time1 = datetime.now()
                result1 = session.run(result_op)
                time_diff1 = datetime.now() - start_time1
                messages.append((device_name,
                                 "shape = {}".format(shape),
                                 "times = {} seconds".format(time_diff1.total_seconds()),
                                 "result = {}".format(result1)))
                if warmup:
                    # Second run of the same op in the same session.
                    start_time2 = datetime.now()
                    result2 = session.run(result_op)
                    time_diff2 = datetime.now() - start_time2
                    # Report the second run's own result, not the first run's.
                    messages.append((device_name,
                                     "shape = {}".format(shape),
                                     "times = {} seconds".format(time_diff2.total_seconds()),
                                     "result = {}".format(result2),
                                     "*****WARMED UP*****"))
        messages.append("++++++++++++++++++++++++++++++++++++++++++++++++++++")
    messages.append("\n\n")



if __name__ == "__main__":
    # Same six runs in the same order as before: for each matrix type, a cold
    # run and a warmed-up run with the "simple" config, then a cold run with
    # the "optim" config.
    run_plan = (("simple", False), ("simple", True), ("optim", False))
    for kind in ("random_uniform", "zeros"):
        for cfg, warm in run_plan:
            timing_run(matrix_type=kind, config_name=cfg, warmup=warm)

    # Dump every collected report entry, one per line.
    for entry in messages:
        print(entry)

Summary:

matrix=random_uniform+config=simple+warmup=False
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.428429 seconds', 'result = 6754431488.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.806464 seconds', 'result = 54023852032.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.36232 seconds', 'result = 184425938944.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 22.376751 seconds', 'result = 439655661568.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.392957 seconds', 'result = 6754390016.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.082889 seconds', 'result = 54006833152.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.221844 seconds', 'result = 182251814912.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 0.438476 seconds', 'result = 431995879424.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.483864 seconds', 'result = 6754393088.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.097571 seconds', 'result = 54006833152.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.250176 seconds', 'result = 182252044288.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.473314 seconds', 'result = 431996567552.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++



matrix=random_uniform+config=simple+warmup=True
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.475717 seconds', 'result = 6754431488.0')
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.379542 seconds', 'result = 6754431488.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.856803 seconds', 'result = 54023852032.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.798967 seconds', 'result = 54023852032.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.447787 seconds', 'result = 184425938944.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.385646 seconds', 'result = 184425938944.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.752967 seconds', 'result = 439655661568.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.832136 seconds', 'result = 439655661568.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.067066 seconds', 'result = 6754394624.0')
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.008072 seconds', 'result = 6754394624.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.123611 seconds', 'result = 54006833152.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.057391 seconds', 'result = 54006833152.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.248432 seconds', 'result = 182251913216.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.18535 seconds', 'result = 182251913216.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (12000, 12000)', 'times = 0.48081 seconds', 'result = 431996043264.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 0.412447 seconds', 'result = 431996043264.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.105071 seconds', 'result = 6754395648.0')
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.008107 seconds', 'result = 6754395648.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.137264 seconds', 'result = 54006849536.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.064462 seconds', 'result = 54006849536.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.280302 seconds', 'result = 182251831296.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.191399 seconds', 'result = 182251831296.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.509208 seconds', 'result = 431996534784.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.4263 seconds', 'result = 431996534784.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++



matrix=random_uniform+config=optim+warmup=False
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.552631 seconds', 'result = 6754431488.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.894024 seconds', 'result = 54023852032.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.394226 seconds', 'result = 184425938944.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.870817 seconds', 'result = 439655661568.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.107416 seconds', 'result = 6754392576.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.163633 seconds', 'result = 54006804480.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.304741 seconds', 'result = 182251667456.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 0.526494 seconds', 'result = 431995944960.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.119625 seconds', 'result = 6754394624.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.203158 seconds', 'result = 54006800384.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.317646 seconds', 'result = 182251978752.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.544184 seconds', 'result = 431996076032.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++



matrix=zeros+config=simple+warmup=False
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.632157 seconds', 'result = 0.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.901679 seconds', 'result = 0.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.345713 seconds', 'result = 0.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.707619 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.498451 seconds', 'result = 0.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 2.900121 seconds', 'result = 0.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 9.4296 seconds', 'result = 0.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 21.750406 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.523286 seconds', 'result = 0.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 2.887522 seconds', 'result = 0.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 9.377383 seconds', 'result = 0.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 21.639043 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++



matrix=zeros+config=simple+warmup=True
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.520212 seconds', 'result = 0.0')
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.000172 seconds', 'result = 0.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.914485 seconds', 'result = 0.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 0.000166 seconds', 'result = 0.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.346122 seconds', 'result = 0.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 0.000207 seconds', 'result = 0.0', '*****WARMED UP*****')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.715376 seconds', 'result = 0.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 0.0002 seconds', 'result = 0.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.556841 seconds', 'result = 0.0')
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.000234 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (6000, 6000)', 'times = 2.936608 seconds', 'result = 0.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.000244 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (9000, 9000)', 'times = 9.34956 seconds', 'result = 0.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.000246 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:0', 'shape = (12000, 12000)', 'times = 21.634354 seconds', 'result = 0.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 0.000221 seconds', 'result = 0.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.562244 seconds', 'result = 0.0')
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.000255 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (6000, 6000)', 'times = 2.961658 seconds', 'result = 0.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.000237 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (9000, 9000)', 'times = 9.308582 seconds', 'result = 0.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.000239 seconds', 'result = 0.0', '*****WARMED UP*****')
('/gpu:1', 'shape = (12000, 12000)', 'times = 21.707127 seconds', 'result = 0.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.000261 seconds', 'result = 0.0', '*****WARMED UP*****')
++++++++++++++++++++++++++++++++++++++++++++++++++++



matrix=zeros+config=optim+warmup=False
('/cpu:0', 'shape = (3000, 3000)', 'times = 0.560451 seconds', 'result = 0.0')
('/cpu:0', 'shape = (6000, 6000)', 'times = 2.978946 seconds', 'result = 0.0')
('/cpu:0', 'shape = (9000, 9000)', 'times = 9.3279 seconds', 'result = 0.0')
('/cpu:0', 'shape = (12000, 12000)', 'times = 21.694664 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:0', 'shape = (3000, 3000)', 'times = 0.249778 seconds', 'result = 0.0')
('/gpu:0', 'shape = (6000, 6000)', 'times = 0.365332 seconds', 'result = 0.0')
('/gpu:0', 'shape = (9000, 9000)', 'times = 0.663667 seconds', 'result = 0.0')
('/gpu:0', 'shape = (12000, 12000)', 'times = 1.032716 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
('/gpu:1', 'shape = (3000, 3000)', 'times = 0.299856 seconds', 'result = 0.0')
('/gpu:1', 'shape = (6000, 6000)', 'times = 0.294592 seconds', 'result = 0.0')
('/gpu:1', 'shape = (9000, 9000)', 'times = 0.55067 seconds', 'result = 0.0')
('/gpu:1', 'shape = (12000, 12000)', 'times = 0.806868 seconds', 'result = 0.0')
++++++++++++++++++++++++++++++++++++++++++++++++++++
2
Yaroslav Bulatov On

I suspect this is related to GPU kernel optimization. If you "pre-warm" your GPU by running the same computation shape, the next execution is much faster. There's PTX compilation that adds a couple of seconds to the first usage of a kernel on a GPU in a process, but it's peculiar that your runtime increases with the size of the matrix; perhaps there's some profiling going on as well.

Note that without tf.OptimizerOptions.L0 it becomes implausibly fast, so there's some caching happening as well.

# Build the same zeros-matrix reduction twice: once placed on the first GPU
# and once on the CPU.
shape = (6000, 6000)
with tf.device("/gpu:0"):
    random_matrix_gpu = tf.zeros(shape)
    result_op_gpu = tf.reduce_sum(tf.matmul(random_matrix_gpu, tf.transpose(random_matrix_gpu)))
with tf.device("/cpu:0"):
    random_matrix_cpu = tf.zeros(shape)
    # The assignment and its expression must stay in one statement.
    result_op_cpu = tf.reduce_sum(tf.matmul(random_matrix_cpu, tf.transpose(random_matrix_cpu)))
# Session configured with optimizer level L0; the surrounding answer notes the
# timings become implausibly fast without it.
config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))
sess = tf.Session(config=config)

def profile(op, msg):
    """Run ``op`` once in the module-level ``sess`` and print the elapsed time.

    Args:
        op: The op (or other fetch) passed to ``sess.run``.
        msg: Label printed in front of the elapsed wall-clock seconds.
    """
    # Imported locally because nothing visible at file level imports ``time``.
    import time

    start_time = time.time()
    sess.run(op)
    print(msg, time.time() - start_time)

# Time each op twice back to back, CPU first and then GPU, so the first and
# second runs on each device can be compared.
for timed_op, label in ((result_op_cpu, "cpu1"),
                        (result_op_cpu, "cpu2"),
                        (result_op_gpu, "gpu1"),
                        (result_op_gpu, "gpu2")):
    profile(timed_op, label)

I see this:

cpu1 1.716048240661621
cpu2 1.509080171585083
gpu1 4.192790746688843
gpu2 0.13361549377441406