Image Augmentation using Albumentations


I'm running into an issue using the Albumentations library in Python to do image augmentation on the fly, i.e. while training the model. I am training an object detection (OD) model and chose Albumentations because it makes dealing with bounding boxes easy. My approach is always the same: whenever I find a resource for training an OD model, I locate the point where the data is fed into the model, add a single step that passes the data through my augmentation function, verify that the function's output has exactly the same format as the data had without it, and then proceed to train the model. I tried this resource and added this function:
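Concretely, the "output matches input" check I do looks roughly like this (a sketch, not the exact code; `filenames`, `parse_tfrecord_fn`, and `augment_image` are the same names used in the code below):

```python
# Sketch of the "output matches input" check: compare the element specs of the
# pipeline with and without the augmentation map (same names as below).
import tensorflow as tf

AUTOTUNE = tf.data.AUTOTUNE
raw = tf.data.TFRecordDataset(filenames).map(parse_tfrecord_fn)
augmented = raw.map(
    lambda image, bboxes, labels: tf.py_function(
        augment_image, inp=[image, bboxes, labels],
        Tout=(tf.float32, tf.float32, tf.float32),
    ),
    num_parallel_calls=AUTOTUNE,
)

print(raw.element_spec)        # shapes/dtypes straight from the TFRecords
print(augmented.element_spec)  # tf.py_function outputs lose static shapes

# spot-check one concrete element from each pipeline
for image, bboxes, labels in augmented.take(1):
    print(image.shape, image.dtype, bboxes.shape, labels.shape)
```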

```python
def get_dataset(filenames, batch_size, architecture, data_type):
    AUTOTUNE = tf.data.AUTOTUNE

    if data_type == "train":
        dataset = (
            tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
            .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
            # wrap the Albumentations-based augmentation in tf.py_function
            .map(
                lambda image, bboxes, labels: tf.py_function(
                    augment_image,
                    inp=[image, bboxes, labels],
                    Tout=(tf.float32, tf.float32, tf.float32),
                ),
                num_parallel_calls=AUTOTUNE,
            )
            .batch(batch_size=batch_size)
            .shuffle(batch_size * 10)
            .map(LabelEncoder(architecture=architecture).encode_batch,
                 num_parallel_calls=AUTOTUNE)
            .prefetch(AUTOTUNE)
        )
    else:
        # note: this branch is currently identical to the training branch
        dataset = (
            tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
            .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
            .map(
                lambda image, bboxes, labels: tf.py_function(
                    augment_image,
                    inp=[image, bboxes, labels],
                    Tout=(tf.float32, tf.float32, tf.float32),
                ),
                num_parallel_calls=AUTOTUNE,
            )
            .batch(batch_size=batch_size)
            .shuffle(batch_size * 10)
            .map(LabelEncoder(architecture=architecture).encode_batch,
                 num_parallel_calls=AUTOTUNE)
            .prefetch(AUTOTUNE)
        )

    return dataset
```
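For completeness, the `tf.py_function` step returns tensors whose static shapes are unknown, so the map can be wrapped to pin them back down. This is a sketch of the idea, with sizes assuming the 224x224 resize in `augment_image` below:

```python
# Sketch: restoring static shapes after tf.py_function (assumes the
# 224x224 resize used in augment_image below).
def augment_with_shapes(image, bboxes, labels):
    image, bboxes, labels = tf.py_function(
        augment_image, inp=[image, bboxes, labels],
        Tout=(tf.float32, tf.float32, tf.float32),
    )
    image.set_shape([224, 224, 3])  # H, W, C after A.Resize
    bboxes.set_shape([None, 4])     # variable number of boxes
    labels.set_shape([None])        # one label per box
    return image, bboxes, labels
```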

where augment_image is:

```python
# augment image
def augment_image(image, bboxes, labels):
    # getting image metadata
    class_labels = labels

    # defining the transformation object
    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.Resize(height=224, width=224, interpolation=cv2.INTER_AREA, p=1),
        A.RandomContrast(limit=0.2, p=0.2),
        A.RandomBrightness(limit=0.2, p=0.2),
    ], bbox_params=A.BboxParams(format='coco', label_fields=['class_labels']))

    # applying the transformation to the image and its boxes
    transformed = transform(
        image=image.numpy(),
        bboxes=bboxes.numpy().reshape(-1, 4),
        class_labels=class_labels.numpy(),
    )
    transformed_bboxes = np.array(transformed['bboxes'])
    transformed_class_labels = transformed['class_labels']

    # from (x_min, y_min, width, height) to (x_center, y_center, width, height)
    transformed_bboxes = np.column_stack(
        (transformed_bboxes[..., :2] + transformed_bboxes[..., 2:] / 2,
         transformed_bboxes[..., 2:])
    )

    # scale the image to [0, 1]
    transformed_image = np.float32(transformed['image']) / 255

    return transformed_image, transformed_bboxes, transformed_class_labels
```
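To make the box-format conversion concrete, a quick check with one box (using numpy as `np`, as in the function above):

```python
# Example of the conversion above: a COCO box (x_min=10, y_min=20, w=100, h=50)
# becomes a center-format box (x_center=60, y_center=45, w=100, h=50).
box = np.array([[10., 20., 100., 50.]])
converted = np.column_stack((box[..., :2] + box[..., 2:] / 2, box[..., 2:]))
print(converted)  # [[ 60.  45. 100.  50.]]
```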

When I used these augmentations, the model got 0 mAP, but it performed well without them.

I also tried DETR from this source, where I changed the CocoDetection class like this:

```python
class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, train_json_path, test_json_path, feature_extractor, train=True):
        self.train = train

        if self.train:
            ann_file = train_json_path
        else:
            ann_file = test_json_path

        super(CocoDetection, self).__init__(img_folder, ann_file)
        self.feature_extractor = feature_extractor

        # define an augmentation pipeline
        self.transform = A.Compose([
            A.Resize(height=800, width=800, interpolation=cv2.INTER_AREA, p=1),
            A.VerticalFlip(p=0.2),
            A.HorizontalFlip(p=0.2),
            A.RandomBrightnessContrast(p=0.2),
        ], bbox_params=A.BboxParams(format='coco', label_fields=['class_labels']))

    def __getitem__(self, idx):
        img, target = super(CocoDetection, self).__getitem__(idx)

        # apply the augmentations
        img, target = self.augment_image(img, target)

        # preprocess image and target (converting the target to DETR format,
        # resizing + normalization of both image and target)
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()
        target = encoding["labels"][0]

        return pixel_values, target

    def augment_image(self, image, target):
        # copy to avoid overwriting the originals
        transformed_target = copy.deepcopy(target)
        new_image = np.array(copy.copy(image))

        # this dataset has exactly one annotation per image
        assert len(transformed_target) == 1, "One annotation per image"
        bboxes = [obj['bbox'] for obj in transformed_target]
        class_labels = [obj['category_id'] for obj in transformed_target]

        # applying the transformation to the image and its box
        transformed = self.transform(image=new_image, bboxes=bboxes, class_labels=class_labels)
        transformed_image = Image.fromarray(transformed['image'])
        transformed_bboxes = np.array(transformed['bboxes'])

        transformed_target[0]['area'] = np.round(transformed_bboxes[0][2] * transformed_bboxes[0][3]).astype(int)
        transformed_target[0]['bbox'] = np.round(transformed_bboxes[0]).astype(int).tolist()

        return transformed_image, transformed_target
```

and I faced the same issue. For reference, this is roughly how the dataset gets consumed (a sketch; the checkpoint name and paths are placeholders):
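```python
# Sketch of how the dataset is consumed; paths and checkpoint are placeholders.
from transformers import DetrFeatureExtractor
from torch.utils.data import DataLoader

feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
train_dataset = CocoDetection("path/to/images", "train.json", "test.json",
                              feature_extractor, train=True)

def collate_fn(batch):
    # pad images in the batch to a common size and build the pixel mask
    pixel_values = [item[0] for item in batch]
    encoding = feature_extractor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
    labels = [item[1] for item in batch]
    return {"pixel_values": encoding["pixel_values"],
            "pixel_mask": encoding["pixel_mask"],
            "labels": labels}

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
```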

I really don't understand what's happening. In both cases I tried simplifying the augmentation as much as possible, all the way down to a plain resize, and still got the same result. Please note that in both cases I am also augmenting the validation set, which to my understanding is not a wrong practice. I am also making sure that my bounding boxes are still correct after the transformations (I check them roughly as in the sketch below).
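The box check is along these lines (a rough sketch; `show_boxes` is a name I'm using here for illustration, and it assumes a uint8 image with COCO-format boxes in pixel coordinates):

```python
# Rough sketch of the bounding-box check: draw each COCO-format box
# (x_min, y_min, width, height) on the image and save it for inspection.
# Assumes a uint8 HxWx3 image; rescale first if it is already in [0, 1].
import cv2
import numpy as np

def show_boxes(image, bboxes, path="augmented_sample.png"):
    canvas = np.ascontiguousarray(image).copy()
    for x, y, w, h in bboxes:
        cv2.rectangle(canvas, (int(x), int(y)), (int(x + w), int(y + h)),
                      color=(0, 255, 0), thickness=2)
    cv2.imwrite(path, canvas)
```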
