My code:
with strategy.scope():
    # Build the model. Under a tf.distribute strategy the model, loss,
    # metrics, and optimizer must all be created inside strategy.scope().
    model = MoAt(
        in_shape=(image_size, image_size, 3),
        out_classes=total_labels,
        definition_name=definition_name,
        window_sides=window_sides,
        input_scaling="inception",
        stochdepth_rate=stochdepth_rate,
    )

    if args.origin:
        # Warm-start: copy weights layer-by-layer from a reference model,
        # skipping the last two layers (presumably the classifier head —
        # TODO confirm against the MoAt definition).
        save_path = "outputs/%s_%s" % (definition_name, date_time)
        origin_model = tf.keras.models.load_model("results/reference")
        copied = 0
        for layer_original, layer_modified in zip(
            origin_model.layers[:-2], model.layers[:-2]
        ):
            if layer_original.get_weights():
                copied += 1
                layer_modified.set_weights(layer_original.get_weights())
        print("load from reference:", copied)
    elif args.checkpoint is not None:
        # Resume from a full SavedModel; this replaces the freshly built
        # MoAt instance above (the earlier construction is discarded).
        save_path = checkpoint_dir
        model = tf.keras.models.load_model(args.checkpoint, compile=False)
    else:
        # Fresh training run.
        save_path = "outputs/%s_%s" % (definition_name, date_time)

    # Micro-averaged F1 at a fixed 0.4 threshold, plus recall at 65% precision.
    f1 = F1Score(total_labels, "micro", 0.4)
    rec_at_p65 = tf.keras.metrics.RecallAtPrecision(0.65, num_thresholds=1024)

    loss = AsymmetricLoss(
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
        gamma_neg=asl_gamma_neg,
        gamma_pos=asl_gamma_pos,
        clip=asl_clip,
    )

    # NOTE(review): `decay` on legacy Adam is *learning-rate* decay, not
    # decoupled weight decay. If true weight decay is intended (the
    # commented-out exclude_from_weight_decay call below suggests it was),
    # tf.keras.optimizers.AdamW is the right optimizer — confirm.
    curr_opt = tf.keras.optimizers.legacy.Adam(
        learning_rate=warmup_learning_rate,
        decay=weight_decay_rate,
    )
    # curr_opt.exclude_from_weight_decay(
    #     var_names=[r".*(gamma|beta|bias|mean|variance|embedding):0$"]
    # )  # only applicable to optimizers with decoupled weight decay

    # Accumulate gradients over accu_steps mini-batches before each update.
    opt = GradientAccumulateOptimizer(optimizer=curr_opt, accum_steps=args.accu_steps)
    model.compile(optimizer=opt, loss=loss, metrics=[f1, rec_at_p65])

    t800 = tf.keras.callbacks.TerminateOnNaN()
    sched = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=True)
    # Checkpoints weights only, keeping the best epoch by val_loss (the
    # ModelCheckpoint default monitor).
    rmc_loss = tf.keras.callbacks.ModelCheckpoint(
        "%s/variables/best_model/best" % save_path,
        save_best_only=True,
        save_freq="epoch",
        save_weights_only=True,
    )
    # Not passed to model.fit — presumably consumed by CustomCallback /
    # metrics_csv_logger via module globals; verify.
    tensorboard_step_writer = tf.summary.create_file_writer(f"{save_path}/tensorboard_step")
    tensorboard_epoch_writer = tf.summary.create_file_writer(f"{save_path}/tensorboard_epoch")

    cb_list = [t800, rmc_loss, sched, CustomCallback(), metrics_csv_logger]

    print("initial_epoch:", initial_epoch)
    model.summary()  # summary() prints itself; print(...) added a stray "None"

    # BUG FIX: math.ceil(x // y) was a no-op because // already floors,
    # silently dropping the final partial batch each epoch. True division
    # lets ceil count it.
    history = model.fit(
        training_dataset,
        validation_data=validation_dataset,
        initial_epoch=initial_epoch,
        epochs=total_epochs,
        steps_per_epoch=math.ceil(train_dataset_len / global_batch_size),
        validation_steps=math.ceil(val_dataset_len / global_batch_size),
        callbacks=cb_list,
    )
My command:
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_moat_tfrecord.py --checkpoint /Projects/SW-CV-outputs/MoAt2_03_08_2024_18h06m07s/variables/saved_model_2
The saved model loads and I can train with it, but these DEBUG INFO messages appear:
[[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_2}}]]
2024-03-14 10:18:40.901993: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall' with dtype float and shape [?,112,112,128]
[[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall}}]]
2024-03-14 10:18:40.902095: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1' with dtype float and shape [?,1,1,1]
The biggest problem is the loss:
why does such an abnormal loss curve appear?
My environment:
tensorflow==2.12.0
tensorflow_addons==0.20.0