I'm currently (roughly) following Gitesh Chawda's YOLOV8 guide from here
Everything goes as expected until the image augmentation stage, at which point the images are correctly augmented but the corresponding labels are effectively destroyed (though the shape of the structure remains).
Edit: I've added a minimal reproducible example here https://github.com/jacky-ct/SampleData
def to_tf_format(img_path, img_class, img_box):
    # Read and decode the image.
    image = tf.io.read_file(img_path)
    image = tf.image.decode_jpeg(image, channels=3)
    print(f"img = {image}, cls = {img_class}, box = {img_box}")
    # Cast the labels to float32 and pack them into the dict KerasCV expects.
    classes = tf.cast(img_class, dtype=tf.float32)
    boxes = tf.cast(img_box, dtype=tf.float32)
    bounding_boxes = {
        "classes": classes,
        "boxes": boxes,
    }
    return {"images": tf.cast(image, tf.float32), "bounding_boxes": bounding_boxes}
augmentor = keras.Sequential(
    layers=[
        keras_cv.layers.RandomFlip(mode="horizontal", bounding_box_format="rel_xyxy"),
        keras_cv.layers.RandomShear(
            x_factor=0.2, y_factor=0.2, bounding_box_format="rel_xyxy"
        ),
        keras_cv.layers.JitteredResize(
            target_size=(640, 640), scale_factor=(0.7, 1.3), bounding_box_format="rel_xyxy"
        ),
    ]
)
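My understanding is that rel_xyxy means [x_min, y_min, x_max, y_max] scaled to [0, 1] by the image size. If my raw labels were actually in some other format, I believe they'd need converting before augmentation, along the lines of this sketch (hypothetically assuming pixel-space xywh as the source; this is not what my code currently does):

# Hypothetical conversion sketch, only relevant if the raw labels
# were NOT already rel_xyxy (here assuming pixel-space xywh):
boxes_rel = keras_cv.bounding_box.convert_format(
    img_box,
    source="xywh",      # assumed source format
    target="rel_xyxy",
    images=image,       # image tensor, needed to compute relative coords
)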
def dict_to_tuple(inputs):
    return inputs["images"], inputs["bounding_boxes"]
train_ds = train_data.map(to_tf_format, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.shuffle(BATCH_SIZE * 4)
train_ds = train_ds.ragged_batch(BATCH_SIZE, drop_remainder=True)
train_ds = train_ds.map(augmentor, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
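While debugging, I've also applied the augmentor eagerly to one batch outside of .map, which makes the boxes easy to inspect before and after augmentation (a sketch of my own, not from the guide), and it shows the same behaviour:

# Debugging sketch: run the augmentor eagerly on a single ragged batch.
raw_ds = train_data.map(to_tf_format).ragged_batch(BATCH_SIZE, drop_remainder=True)
raw_batch = next(iter(raw_ds))
print("before:", raw_batch["bounding_boxes"]["boxes"])
augmented = augmentor(raw_batch)
print("after:", augmented["bounding_boxes"]["boxes"])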
To give an idea of the structure of the data, the output of the print statement in to_tf_format is:
img = Tensor("DecodeJpeg:0", shape=(None, None, 3), dtype=uint8), cls = Tensor("RaggedFromVariant/RaggedTensorFromVariant:0", shape=(None,), dtype=int32), box = Tensor("RaggedFromVariant_1/RaggedTensorFromVariant:0", shape=(None, 4), dtype=float64)
If I don't apply any of the augmentation preprocessing, checking the form of the data after this pipeline gives the expected output: next(iter(train_ds.take(1)))[0]
gives:
<tf.RaggedTensor [[[[74.0, 79.0, 83.0],
[76.0, 81.0, 85.0],
[79.0, 84.0, 88.0],
...,
[59.0, 87.0, 126.0],
[57.0, 85.0, 124.0],
[56.0, 84.0, 123.0]],
[[71.0, 76.0, 80.0],
[73.0, 78.0, 82.0],
[75.0, 80.0, 84.0],
...,
[61.0, 89.0, 126.0],
[60.0, 88.0, 125.0],
[58.0, 86.0, 123.0]],
[[67.0, 75.0, 77.0],
[68.0, 76.0, 78.0],
[69.0, 77.0, 79.0],
...,
[67.0, 96.0, 130.0],
[67.0, 96.0, 130.0],
[65.0, 94.0, 128.0]],
...,
...
[58.0, 55.0, 38.0],
...,
[67.0, 62.0, 43.0],
[71.0, 66.0, 47.0],
[74.0, 69.0, 50.0]]]]>
and next(iter(train_ds.take(1)))[1]
gives:
{'classes': <tf.RaggedTensor [[7.0], [7.0, 7.0], [7.0], [7.0]]>,
'boxes': <tf.RaggedTensor [[[0.4369476, 0.92896473, 0.43951866, 0.9240088]],
[[0.6037169, 0.8419604, 0.6054107, 0.8397577],
[0.60536873, 0.89794916, 0.60703814, 0.8959251]],
[[0.42042324, 0.50033903, 0.43218476, 0.4572454]],
[[0.6108871, 0.14179516, 0.61400294, 0.13711454]]]>}
When I apply ANY kind of preprocessing (each layer individually has this effect, which leads me to believe I'm packing my data incorrectly), the output of next(iter(train_ds.take(1)))[0]
has the same form as before, but the output of next(iter(train_ds.take(1)))[1]
is four empty objects, one for each item in the batch:
{'classes': <tf.RaggedTensor [[], [], [], []]>,
'boxes': <tf.RaggedTensor [[], [], [], []]>}
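One sanity check I've considered (my own idea, not from the guide) is verifying that every raw box really is a valid rel_xyxy box, i.e. x_min < x_max and y_min < y_max, in case degenerate boxes are being discarded somewhere in the augmentation:

# Sanity-check sketch: look for degenerate boxes in the raw data.
for sample in train_data.map(to_tf_format).take(10):
    boxes = sample["bounding_boxes"]["boxes"]   # (num_boxes, 4), rel_xyxy
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    if tf.reduce_any(widths <= 0) or tf.reduce_any(heights <= 0):
        print("degenerate box found:", boxes)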
The expected behaviour is that these tensors are populated with the augmented bounding box values.
Any help is greatly appreciated - I've done a lot of research and can't figure out where I'm going wrong.