dimensions must equal error but they are equal

194 views Asked by At

I added a print to the "discriminator_loss" function to see what was going on. at first it will tell me the shape of both are 16. later it tells me the shape of "real_loss" is only 15 while the other stays 16. So far I have only tried lowering the batchsize's and increasing them by 1 ect. I have provided the most relevant parts of the code. I can provide the rest of the code if needed. I have no clue why this is happening and it breaks the code.

with strategy.scope():
  BATCH_SIZE = 16
  GLOBAL_BATCH_SIZE = 32#batchsize*# of gpus
  im_size = 256
  latent_size = 512
with strategy.scope():
  cross_entropy = tf.keras.losses.BinaryCrossentropy(
    from_logits=True,\
    reduction = tf.keras.losses.Reduction.NONE)

  #this is used to evaluate discriminators ability to discriminate
  def discriminator_loss(real_output, fake_output):
      real_loss = cross_entropy(tf.ones_like(real_output), real_output)#compares prediction to actual value of 1
      fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)#compares rediction to actual value of 0
      print(real_loss)
      print(fake_loss)
      total_loss = real_loss + fake_loss
      total_loss = total_loss/GLOBAL_BATCH_SIZE
      return total_loss


  #how well was generator able to trick discriminator
  def generator_loss(fake_output):
      gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)#compares predictions to the expected value 1 of a real image
      gen_loss = gen_loss / GLOBAL_BATCH_SIZE
      return gen_loss
with strategy.scope():
  EPOCHS = 80
  noise_dim = 512
  num_examples_to_generate = 32



# We will reuse this seed overtime (so it's easier)
# to visualize progress in the animated GIF)
with strategy.scope():
  def noise(n):
    return tf.random.normal([n, latent_size])

  def noiseImage(n):
    return tf.random.uniform([n, im_size, im_size, 1])
  #seed = tf.random.normal([num_examples_to_generate, noise_dim])



#seed used to generate image>the discriminator than classifies real images from training set and a set of generated images>loss is calculated and gradients are used to update the model
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
with strategy.scope():
  #@tf.function
  def train_step(images):
      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        g_loss = generator_loss(fake_output)#runs generator loss
        d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
            
      G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
      D_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)

      generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
      discriminator_optimizer.apply_gradients(zip(D_grads, discriminator.trainable_variables))

      #run g_optim twice to make sure d_loss doesn't go to zero
      with tf.GradientTape() as gen_tape:
        generated_imgs = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
        fake_output = discriminator(generated_imgs, training=True)
        g_loss = generator_loss(fake_output)

      G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
      generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))

      return g_loss, d_loss


  @tf.function
  def distributed_train_step(dist_dataset):
      per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
      total_g_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
      total_d_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses,axis=0)
      return total_g_loss, total_d_loss


with strategy.scope():
  def train(dist_dataset, epochs):
    for epoch in range(epochs):
      start = time.time()
      for image_batch in dist_dataset:
        total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function


with strategy.scope():
  train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well

error and traceback

Traceback (most recent call last):
  File "C:\image generator\pixiv\#image generator.py", line 507, in <module>
    train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
  File "C:\image generator\pixiv\#image generator.py", line 441, in train
    total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2419, in __call__
    graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2777, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2667, in _create_graph_function
    capture_by_value=self._capture_by_value),
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 981, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 441, in wrapped_fn
    return weak_wrapped_fn().__wrapped__(*args, **kwds)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 968, in wrapper
    raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:

    C:\image generator\pixiv\#image generator.py:419 distributed_train_step  *
        per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
    C:\image generator\pixiv\#image generator.py:393 train_step  *
        d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
    C:\image generator\pixiv\#image generator.py:328 discriminator_loss  *
        total_loss = real_loss + fake_loss
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:984 binary_op_wrapper
        return func(x, y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1276 _add_dispatch
        return gen_math_ops.add_v2(x, y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:483 add_v2
        "AddV2", x=x, y=y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
        compute_device)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
        op_def=op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1817 __init__
        control_input_ops, op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 0 and 2 for '{{node replica_1/add}} = AddV2[T=DT_FLOAT](replica_1/binary_crossentropy_1/weighted_loss/Mul, replica_1/binary_crossentropy_2/weighted_loss/Mul)' with input shapes: [0], [2].
1

There are 1 answers

1
Aaron Keesing On BEST ANSWER

So according to comments the problem lies in unequal batch sizes, due to the final batch being smaller than the specified batch size. I believe this is due to this line:

generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)

where the constant size BATCH_SIZE is used, instead of the actual input shape of the batch, so that generated_images is of a different shape than images.

So one solution as mentioned is simply to use drop_remainder=True in batch(). However it might be better to get the generator to output images of the same shape as the input, so instead of passing BATCH_SIZE as argument to your noise generation functions, you should use the actual size of the input batch. So maybe using tf.shape(images)[0] would help. Alternatively, you could generate a fixed batch of images with BATCH_SIZE, and then simply discard any extra images, like

num_images = tf.shape(images)[0]
generated_images = generated_images[:num_images]