DDPG training in TensorFlow.js


I'm trying to figure out how to implement DDPG in TensorFlow.js using Python examples such as this one from the Keras website. I got stuck on the training code:

        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )
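
(For reference, the first block regresses the critic toward the TD target y = reward_batch + gamma * target_critic(next_state, target_actor(next_state)), and the second block trains the actor to maximize the critic's estimate, hence the negated mean.)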

So far my TypeScript version looks like this:

    const batch = this.memory.getMinibatch(this.config.replayBatchSize);
    const states = this.actorService.getStateTensor(this.actor, ...batch.map(s => s.state));
    const nextStates = this.actorService.getStateTensor(this.actor, ...batch.map(s => s.nextState));
    const rewards = tf.tensor2d(batch.map(s => s.reward), [batch.length, 1], 'float32');
    const actions = this.actorService.getActionTensor(...batch.map(s => s.action));

    const criticLossFunction = () => tf.tidy(() => {
      let targetQs: tf.Tensor;
      if (this.config.discountRate === 0) {
        targetQs = rewards;
      } else {
        const targetActions = this.targetActorModel.predict(nextStates) as tf.Tensor;
        const targetCriticQs = this.targetCriticModel.predict(tf.concat([nextStates, targetActions], 1)) as tf.Tensor;
        targetQs = rewards.add(targetCriticQs.mul(this.config.discountRate));
      }
      const criticQs = this.criticModel.predict(tf.concat([states, actions], 1)) as tf.Tensor;
      const criticLoss = tf.losses.meanSquaredError(targetQs, criticQs);
      return criticLoss.asScalar();
    });
    const criticTrainableVars = this.criticModel.getWeights(true) as tf.Variable<tf.Rank>[];
    const criticGradient = tf.variableGrads(criticLossFunction, criticTrainableVars);
    // HOWTO:  zip(critic_grad, critic_model.trainable_variables)
    this.criticModel.optimizer.applyGradients(criticGradient.grads);
    tf.dispose(criticGradient);

    const actorLossFunction = () => tf.tidy(() => {
      const policyActions = this.actorModel.predict(states) as tf.Tensor;
      const criticQs = this.criticModel.predict(tf.concat([states, policyActions], 1)) as tf.Tensor;
      const actorLoss = tf.mean(criticQs.mul(-1));
      return actorLoss.asScalar();
    });
    const actorTrainableVars = this.actorModel.getWeights(true) as tf.Variable<tf.Rank>[];
    const actorGradient = tf.variableGrads(actorLossFunction, actorTrainableVars);
    // HOWTO: zip(actor_grad, actor_model.trainable_variables)
    this.actorModel.optimizer.applyGradients(actorGradient.grads);
    const actorLoss = actorGradient.value.dataSync()[0];
    tf.dispose(actorGradient);

but my code does not work correctly (the loss stays far too high even on a very simple task), and I suspect it's because I'm missing one major step: zipping the critic grads with the critic trainable variables before passing them to applyGradients.

1 Answer

Oleg Khalidov (best answer):

False alarm: the code works correctly, and there is no need to zip anything. The high loss was because I was logging actorLoss as the loss value instead of criticLoss (criticGradient.value.dataSync()[0]).
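
For anyone who lands here with the same confusion: tf.variableGrads() returns { value, grads }, where grads is a NamedTensorMap keyed by variable name, and Optimizer.applyGradients() accepts that map directly, so the pairing that Python's zip(...) performs explicitly already happens through the names. Below is a minimal standalone sketch of that mechanism with a toy one-variable model (not the DDPG code from the question, just the gradient plumbing):

    import * as tf from '@tensorflow/tfjs';

    // One trainable variable with an explicit name ('w').
    const w = tf.variable(tf.randomNormal([4, 1]), true, 'w');
    const x = tf.randomNormal([8, 4]);
    const yTrue = tf.randomNormal([8, 1]);
    const optimizer = tf.train.adam(0.01);

    // The loss function must return a scalar for tf.variableGrads.
    const lossFn = () => tf.tidy(() =>
      tf.losses.meanSquaredError(yTrue, x.matMul(w)).asScalar());

    // grads is a NamedTensorMap: { w: <gradient tensor> }.
    // applyGradients() matches each gradient to its variable by name,
    // so there is no zip step to translate from the Python version.
    const { value, grads } = tf.variableGrads(lossFn, [w]);
    optimizer.applyGradients(grads);
    console.log('loss:', value.dataSync()[0]);
    tf.dispose(grads);
    value.dispose();

In the question's code the only real fix is bookkeeping: the number to monitor is the critic loss, criticGradient.value.dataSync()[0] (read before tf.dispose(criticGradient)), not the actor loss, which is just the negated mean Q-value.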