Error when using run_eagerly=False in model.compile for a custom Keras Model in TensorFlow

I am developing a custom model in TensorFlow, trying to implement the Virtual Adversarial Training (VAT) model from https://arxiv.org/abs/1704.03976. The model makes use of both labeled and unlabeled data in its classification task. Therefore, in the train_step of the model, I need to split each batch into labeled observations (label 0 or 1) and unlabeled observations (label -1). Everything seems to work as expected when the model is compiled with run_eagerly=True, but with run_eagerly=False it gives me the following error:

ValueError: Number of mask dimensions must be specified, even if some dimensions are None.  E.g. shape=[None] is ok, but shape=None is not.

which seems to be produced in:

X_l, y_l = tf.boolean_mask(X, tf.logical_not(missing)), tf.boolean_mask(y, tf.logical_not(missing))

I am not sure what is causing the error, but it seems to be a tensor shape issue that only occurs when run_eagerly=False. I need the boolean_mask functionality to distinguish the labeled data from the unlabeled data. I hope someone can help me out. To make the error reproducible, I have added the model definition and a small simulation example below; the simulation produces the error whenever run_eagerly=False is set.
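For reference, here is a minimal sketch that, as far as I can tell, reproduces the same ValueError without the rest of the model (the function name split_labels is just for illustration):

import tensorflow as tf

@tf.function(input_signature=[tf.TensorSpec(shape=[None, 1], dtype=tf.float32)])
def split_labels(y):
    # At trace time every dimension of y could be 1, so tf.squeeze cannot
    # infer the rank of its output: missing gets static shape None
    missing = tf.squeeze(tf.equal(y, -1))
    # tf.boolean_mask requires a mask with a known rank, so tracing this
    # function raises the "Number of mask dimensions must be specified" error
    return tf.boolean_mask(y, missing)

split_labels(tf.constant([[0.0], [-1.0], [1.0]]))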

Thanks in advance.

Model definition:

from tensorflow import keras
import tensorflow as tf


metric_acc = keras.metrics.BinaryAccuracy()
metric_loss = keras.metrics.Mean('loss')


class VAT(keras.Model):

    def __init__(self, units_1=16, units_2=16, dropout=0.3, xi=1e-6, epsilon=2.0, alpha=1.0):
        super(VAT, self).__init__()

        # Set model parameters
        self.units_1 = units_1
        self.units_2 = units_2
        self.dropout = dropout
        self.xi = xi
        self.epsilon = epsilon
        self.alpha = alpha

        # First hidden
        self.dense1 = keras.layers.Dense(self.units_1)
        self.activation1 = keras.layers.Activation(tf.nn.leaky_relu)
        self.dropout1 = keras.layers.Dropout(self.dropout)

        # Second hidden
        self.dense2 = keras.layers.Dense(self.units_2)
        self.activation2 = keras.layers.Activation(tf.nn.leaky_relu)
        self.dropout2 = keras.layers.Dropout(self.dropout)

        # Output layer
        self.dense3 = keras.layers.Dense(1)
        self.activation3 = keras.layers.Activation("sigmoid")

    def call(self, inputs, training=None, mask=None):

        x1 = self.dense1(inputs)
        x2 = self.activation1(x1)
        x3 = self.dropout1(x2, training=True)

        x4 = self.dense2(x3)
        x5 = self.activation2(x4)
        x6 = self.dropout2(x5, training=True)

        x7 = self.dense3(x6)
        x8 = self.activation3(x7)

        return x8

    def generate_perturbation(self, inputs):

        # Generate normal vectors
        d = tf.random.normal(shape=tf.shape(inputs))

        # Normalize vectors
        d = tf.math.l2_normalize(d, axis=1)

        # Calculate r
        r = self.xi * d

        # Make predictions
        p = self(inputs, training=True)

        # Tape gradient
        with tf.GradientTape() as tape:
            tape.watch(r)

            # Perturbed predictions
            p_perturbed = self(inputs + r, training=True)

            # Calculate divergence
            D = keras.losses.KLD(p, p_perturbed) + keras.losses.KLD(1 - p, 1 - p_perturbed)

        # Calculate gradient
        gradient = tape.gradient(D, r)

        # Calculate r_vadv
        r_vadv = tf.math.l2_normalize(gradient, axis=1)

        # Return virtual adversarial perturbation
        return r_vadv

    @tf.function
    def train_step(self, data):

        # Unpack data
        X, y = data

        # Missing label boolean indices
        missing = tf.squeeze(tf.equal(y, -1))

        # Split data into labeled and unlabeled data
        X_l, y_l = tf.boolean_mask(X, tf.logical_not(missing)), tf.boolean_mask(y, tf.logical_not(missing))
        X_u = tf.boolean_mask(X, missing)

        # Calculate virtual perturbations for labeled and unlabeled
        r_l = self.generate_perturbation(X_l)
        r_u = self.generate_perturbation(X_u)

        # Tape gradient
        with tf.GradientTape() as model_tape:
            model_tape.watch(self.trainable_variables)

            # Calculate probabilities real data
            prob_l, prob_u = self(X_l, training=True), self(X_u, training=True)

            # Calculate probabilities perturbed data
            prob_r_l, prob_r_u = self(X_l + self.epsilon * r_l, training=True), self(X_u + self.epsilon * r_u, training=True)

            # Calculate loss
            loss = vat_loss(y_l, prob_l, prob_u, prob_r_l, prob_r_u, self.alpha)

        # Calculate gradient
        model_gradient = model_tape.gradient(loss, self.trainable_variables)

        # Update weights
        self.optimizer.apply_gradients(zip(model_gradient, self.trainable_variables))

        # Compute metrics
        metric_acc.update_state(y_l, prob_l)
        metric_loss.update_state(loss)

        return {'loss': metric_loss.result(), 'accuracy': metric_acc.result()}

    @property
    def metrics(self):
        return [metric_loss, metric_acc]


def vat_loss(y_l, prob_l, prob_u, prob_r_l, prob_r_u, alpha):
    N_l = tf.cast(tf.size(prob_l), dtype=tf.dtypes.float32)
    N_u = tf.cast(tf.size(prob_u), dtype=tf.dtypes.float32)

    if tf.equal(N_l, 0):
        # No labeled examples: get contribution from unlabeled data using perturbations
        R_vadv = tf.reduce_sum(
            keras.losses.KLD(prob_u, prob_r_u)
            + keras.losses.KLD(1 - prob_u, 1 - prob_r_u)
        )

        return alpha * R_vadv / N_u

    elif tf.equal(N_u, 0):

        # No unlabeled examples: get contribution from labeled data
        R = tf.reduce_sum(keras.losses.binary_crossentropy(y_l, prob_l))

        R_vadv = tf.reduce_sum(
            keras.losses.KLD(prob_l, prob_r_l)
            + keras.losses.KLD(1 - prob_l, 1 - prob_r_l)
        )

        return R / N_l + alpha * R_vadv / N_l

    else:
        # Get contribution from labeled data
        R = tf.reduce_sum(keras.losses.binary_crossentropy(y_l, prob_l))

        # Get contribution from labeled and unlabeled data using perturbations
        R_vadv = tf.reduce_sum(
            keras.losses.KLD(prob_l, prob_r_l)
            + keras.losses.KLD(1 - prob_l, 1 - prob_r_l)
        ) + tf.reduce_sum(
            keras.losses.KLD(prob_u, prob_r_u)
            + keras.losses.KLD(1 - prob_u, 1 - prob_r_u)
        )

        return R / N_l + alpha * R_vadv / (N_l + N_u)

Simulation example: To show that the model/code works as desired when using run_eagerly=True, I made a simulation example. In this example, I bias which observations are labeled or unlabeled. The figure below illustrates the labeled observations used by the model (yellow or purple) and the unlabeled observations (blue).

[Figure: Labeled and unlabeled data]

The VAT model produces a test accuracy of around 0.75, whereas the reference model produces an accuracy of around 0.58. These accuracies were obtained without hyperparameter tuning.

from modules.vat import VAT

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

def create_biased_sample(x, proportion_labeled):
    # Observations with x[0] outside [0.0, 1.0] are never labeled;
    # the rest are labeled with probability proportion_labeled
    if x[0] < 0.0 or x[0] > 1.0:
        return False
    return np.random.choice([True, False], p=[proportion_labeled, 1 - proportion_labeled])


# Simulation parameters
N = 2000
proportion_labeled = 0.15

# Model training parameters
BATCH_SIZE = 128
BUFFER_SIZE = 60000
EPOCHS = 100

# Generate a dataset
X, y = datasets.make_moons(n_samples=N, noise=.05, random_state=3)
X, y = X.astype('float32'), y.astype('float32')
y = y.reshape(-1, 1)


# Split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

# Simulate missing labels
sample_biased = lambda x: create_biased_sample(x, proportion_labeled)
labeled = np.array([sample_biased(k) for k in X_train])
y_train[~ labeled] = -1

# Estimate VAT model
vat = VAT(dropout=0.2, units_1=16, units_2=16, epsilon=0.5)
vat.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), run_eagerly=True)
vat.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)

# Estimate a reference model
reference = keras.models.Sequential([
    keras.layers.Input(shape=(2,)),
    keras.layers.Dense(16),
    keras.layers.Activation(tf.nn.leaky_relu),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(16),
    keras.layers.Activation(tf.nn.leaky_relu),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1),
    keras.layers.Activation("sigmoid")
])
reference.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss=keras.losses.binary_crossentropy, run_eagerly=False)
reference.fit(X_train[y_train.flatten() != -1, :], y_train[y_train.flatten() != -1], batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)

# Calculate out-of-sample accuracies
test_acc_vat = tf.reduce_mean(keras.metrics.binary_accuracy(y_test, vat(X_test, training=False)))
test_acc_reference = tf.reduce_mean(keras.metrics.binary_accuracy(y_test, reference(X_test, training=False)))

# Print results
print('Test accuracy of VAT: {}'.format(test_acc_vat))
print('Test accuracy of reference model: {}'.format(test_acc_reference))

# Plot scatter
plt.scatter(X_test[:, 0], X_test[:, 1])
plt.scatter(X_train[y_train.flatten() != -1, 0], X_train[y_train.flatten() != -1, 1], c=y_train.flatten()[y_train.flatten() != -1])


1 Answer

thijsvdp answered:

For anyone who is interested, I solved the issue by adding the following line in the train_step() method:

missing.set_shape([None])

It should go directly after the tensor missing is declared. I found the solution in this thread: Tensorflow boolean_mask with dynamic mask.
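For clarity, a minimal sketch of how the top of train_step() looks with the fix in place (the rest of the method is unchanged):

@tf.function
def train_step(self, data):

    # Unpack data
    X, y = data

    # Missing label boolean indices
    missing = tf.squeeze(tf.equal(y, -1))

    # In graph mode tf.squeeze cannot infer the rank of missing, so its
    # static shape is None and tf.boolean_mask rejects it. Pinning the
    # static shape to a 1-D tensor resolves the ValueError:
    missing.set_shape([None])

    # ... the rest of train_step() is unchanged ...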