Incompatible shapes: [512] vs. [512,1024]

import time

import tensorflow as tf
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout
from tqdm import tqdm


class BaseModel(tf.keras.Model):
    def __init__(self):
        super(BaseModel, self).__init__()

        self.dense1 = Dense(256, input_shape=(152,))
        self.batch_norm1 = BatchNormalization()
        self.activation1 = Activation(tf.keras.layers.LeakyReLU(alpha=0.3))
        self.dropout1 = Dropout(0.2)

        self.dense2 = Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.dropout2 = Dropout(0.2)

        self.dense3 = Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm2 = BatchNormalization()
        self.dropout3 = Dropout(0.2)

        self.dense4 = Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm3 = BatchNormalization()
        self.dropout4 = Dropout(0.3)

        self.dense5 = Dense(2048, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm4 = BatchNormalization()
        self.dropout5 = Dropout(0.3)

        self.dense6 = Dense(1024, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm5 = BatchNormalization()
        self.dropout6 = Dropout(0.3)

        self.dense7 = Dense(1024, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm6 = BatchNormalization()
        self.dropout7 = Dropout(0.3)

        self.dense8 = Dense(18211, activation="linear")

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.batch_norm1(x)
        x = self.activation1(x)
        x = self.dropout1(x)

        x = self.dense2(x)
        x = self.dropout2(x)

        x = self.dense3(x)
        x = self.batch_norm2(x)
        x = self.dropout3(x)

        x = self.dense4(x)
        x = self.batch_norm3(x)
        x = self.dropout4(x)

        x = self.dense5(x)
        x = self.batch_norm4(x)
        x = self.dropout5(x)

        x = self.dense6(x)
        x = self.batch_norm5(x)
        x = self.dropout6(x)

        x = self.dense7(x)
        x = self.batch_norm6(x)
        x = self.dropout7(x)

        return self.dense8(x)

# Instantiate the model
base_model = BaseModel()

class MAMLTrainer:
    def __init__(self, model, loss=loss_function, lr_inner=0.01, log_steps=1000):
        self.model = model
        self.loss_function = loss  # use the loss passed in; loss_function itself is defined elsewhere
        self.lr_inner = lr_inner
        self.log_steps = log_steps
        self.optimizer = tf.keras.optimizers.Adam()
    

    def copy_model(self, model, x):
        copied_model = tf.keras.models.clone_model(self.model)
        copied_model(x)  # Initialize weights
        copied_model.set_weights(model.get_weights())
        return copied_model
    

    def train_maml(self, x_train, y_train, epochs=1, batch_size=1):

        for _ in range(epochs):
            total_loss = 0
            losses = []
            start = time.time()

            # Split x and y into batches
            x_batches = [x_train[i:i + batch_size] for i in range(0, len(x_train), batch_size)]
            y_batches = [y_train[i:i + batch_size] for i in range(0, len(y_train), batch_size)]

            # Use tqdm to create a progress bar
            for x_batch, y_batch in tqdm(zip(x_batches, y_batches), total=len(x_batches), unit="batch"):
                x = tf.convert_to_tensor(x_batch, dtype=tf.float32)
                y = tf.convert_to_tensor(y_batch, dtype=tf.float32)

                with tf.GradientTape() as test_tape:
                    with tf.GradientTape() as train_tape:
                        y_pred = self.model(x)
                        train_loss = self.loss_function(y, y_pred)
                    gradients = train_tape.gradient(train_loss, self.model.trainable_variables)

                    k = 0
                    model_copy = self.copy_model(self.model, x)

                    for layer, gradient in zip(model_copy.layers, gradients):
                        if isinstance(layer, tf.keras.layers.BatchNormalization):
                            continue  # Skip BatchNormalization layers
                        if isinstance(layer, (tf.keras.layers.Activation, tf.keras.layers.LeakyReLU)):
                            continue  # Skip Activation layers
                        if isinstance(layer, tf.keras.layers.Dropout):
                            continue  # Skip Dropout layers

                        # Scale the gradients to match the shape of the layer's weights
                        scaled_gradients = gradient * layer.trainable_weights[0]  # Assuming the first trainable weight is the kernel
                        layer.trainable_weights[0].assign(layer.trainable_weights[0] - self.lr_inner * scaled_gradients)

                    test_loss = self.loss_function(y, model_copy(x))

                gradients = test_tape.gradient(test_loss, self.model.trainable_variables)
                self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

                total_loss += test_loss
                loss = total_loss / len(x_batches)  # Use the batch count instead of (i+1.0)
                losses.append(loss)

                if len(losses) % self.log_steps == 0 and len(losses) > 0:
                    print('Step {}: loss = {}, Time to run {} steps = {}'.format(len(losses), loss, self.log_steps, time.time() - start))
                    start = time.time()

        return self.model

This is the code I'm trying to implement for meta-learning with the MAML algorithm on the base model, but it gives me an incompatible-shapes error. The strange thing is that the model works for the first three layers of the base model (until the 1024-neuron Dense layer is encountered).
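
I drive the trainer roughly like this (the exact call isn't shown above, so treat the arguments as placeholders; loss_function and the training arrays exist elsewhere in my script):

# Placeholder driver code, not the exact call from my script
trainer = MAMLTrainer(base_model, loss=loss_function)
trained_model = trainer.train_maml(x_train, y_train, epochs=1, batch_size=32)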

Please ignore the complexity of the base model; I was just playing around with it. I tried keeping only the first three layers with the same neuron configuration given in the code above, but the output layer has 18211 neurons, which cannot be removed (it is the expected output). When I keep the first three layers plus the output layer, it gives me the error on the last layer, as in the sketch below.
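
The reduced model I tried looks roughly like this (reconstructed as a sketch, not copied verbatim from my script):

class ReducedModel(tf.keras.Model):
    # Sketch: first three Dense blocks from BaseModel plus the mandatory
    # 18211-unit output layer; the same error still appears on the last layer
    def __init__(self):
        super().__init__()
        self.dense1 = Dense(256, input_shape=(152,))
        self.batch_norm1 = BatchNormalization()
        self.activation1 = Activation(tf.keras.layers.LeakyReLU(alpha=0.3))
        self.dropout1 = Dropout(0.2)
        self.dense2 = Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.dropout2 = Dropout(0.2)
        self.dense3 = Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.3))
        self.batch_norm2 = BatchNormalization()
        self.dropout3 = Dropout(0.2)
        self.dense_out = Dense(18211, activation="linear")

    def call(self, inputs):
        x = self.dropout1(self.activation1(self.batch_norm1(self.dense1(inputs))))
        x = self.dropout2(self.dense2(x))
        x = self.dropout3(self.batch_norm2(self.dense3(x)))
        return self.dense_out(x)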

InvalidArgumentError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [512] vs. [512,18211] [Op:Mul]

I don't know why this is happening only on the layers with a large number of neurons. I suspect there is a problem in the meta-learning class. Please suggest a solution.
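
For reference, here is a minimal sketch of the kind of inner-loop update I think first-order MAML expects, pairing each gradient with the matching trainable variable of the copy instead of zipping gradients with layers. It is untested and only meant as a sketch of the idea:

# Sketch only: gradients comes from
# train_tape.gradient(train_loss, self.model.trainable_variables), so it is
# one entry per trainable variable and lines up with the copy's variables
model_copy = self.copy_model(self.model, x)
for var, grad in zip(model_copy.trainable_variables, gradients):
    if grad is None:
        continue  # variable did not influence the loss
    # Plain inner update: theta' = theta - lr_inner * grad
    var.assign(var - self.lr_inner * grad)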

Training data: features (429, 152), labels (429, 18211).
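
The shapes can be stood in for with random placeholder arrays (hypothetical data, just to show the dimensions, not my real features):

import numpy as np

# Hypothetical placeholder data with the same shapes as my training set
x_train = np.random.rand(429, 152).astype("float32")
y_train = np.random.rand(429, 18211).astype("float32")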

Thank you in advance!
