The classification module returns a tensor filled with zeros from the second epoch


I'm working through this code: https://github.com/afzalxo/CameraRadarFusionNet-Pytorch

Probably because some time has passed since the repository was written, there were a lot of version problems (nuscenes-devkit, Python), so I got training to run after slightly adjusting the data loader.

After training on the nuScenes dataset, I ran inference, but no detections were produced at all.
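Concretely, the eval-mode forward pass (see the full code below) returns [finalScores, finalAnchorBoxesIndexes, finalAnchorBoxesCoordinates], and all three come back empty for every sample. A minimal sketch of how I call it, where 'batch' stands in for one preprocessed camera+radar tensor:

import torch

model.eval()
with torch.no_grad():
    scores, labels, boxes = model(batch)  # batch: (1, 5, 360, 640) camera+radar input

print(scores.shape, labels.shape, boxes.shape)
# -> torch.Size([0]) torch.Size([0]) torch.Size([0]): nothing passes the
#    0.01 score threshold, so NMS is skipped for every class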

While debugging training, I found that the output of the final sigmoid in the classification submodel ('self.out_sig(features)') is a tensor filled entirely with zeros.

But this happens from the second epoch onward, not during the first epoch.
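Since sigmoid only returns exactly 0 when its input is very strongly negative (roughly -90 and below in float32, where exp(-x) overflows), I also want to log the raw logits feeding into 'self.out_sig'. A minimal sketch of the check, assuming 'model' is the Retinanet instance shown below, so that 'model.classification_ops[-1]' is the final conv of the classification head:

import torch

# Forward hook on the last conv of the classification head: inspect the raw
# logits before the sigmoid. Very large negative values would make sigmoid()
# saturate to (numerically) zero.
def log_logits(module, inputs, output):
    print(f"logits min={output.min().item():.2f} "
          f"max={output.max().item():.2f} mean={output.mean().item():.2f}")

hook = model.classification_ops[-1].register_forward_hook(log_logits)
# ... run one batch through the model ...
hook.remove()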

I attach screenshots of the classification output from the first epoch and from the second epoch:

[classification_out, epoch 1]

[classification_out, from epoch 2]

This is the classification module code.

def run_classification_submodel(self, features, num_classes):
    # features: one pyramid level, shape (B, C, H, W); the last conv outputs
    # num_anchors * num_classes channels.
    for i in range(len(self.classification_ops)):
        features = self.classification_ops[i](features)
    # (B, C, H, W) -> (B, H, W, C); note 'width' below actually holds H and
    # 'height' holds W, but the final flatten makes this harmless
    features = features.permute(0, 2, 3, 1).contiguous()
    batch_size, width, height, channels = features.shape

    features = self.out_sig(features)

    outputs = features.view(batch_size, channels, height, width, -1)
    # flatten to (B, H * W * num_anchors, num_classes)
    outputs = outputs.contiguous().view(batch_size, -1, num_classes)
    return outputs
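As a sanity check on the reshaping (a standalone sketch with made-up sizes and a dummy tensor in place of the conv output), the intermediate view appears to be a no-op and the final flatten groups values as (B, H*W*num_anchors, num_classes):

import torch

B, H, W, num_anchors, num_classes = 2, 5, 7, 9, 8
C = num_anchors * num_classes

# dummy head output, already permuted to (B, H, W, C) as in the submodel
features = torch.rand(B, H, W, C)
batch_size, width, height, channels = features.shape  # same naming as above

out = features.view(batch_size, channels, height, width, -1)  # last dim is 1
out = out.contiguous().view(batch_size, -1, num_classes)

ref = features.reshape(B, -1, num_classes)  # direct flatten for comparison
print(out.shape, torch.equal(out, ref))  # torch.Size([2, 315, 8]) True

So the reshape itself should not be the problem; only the values coming out of the sigmoid are wrong.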

The 'classification_out' shown in the screenshots above is the same value as 'classification_out' in the code below.

And this is the full model code.


class Vggmax(nn.Module):
    # def __init__(self, radar):
    def __init__(self):
        super(Vggmax, self).__init__()
        # self.radar = False
        self.radar = True

        if self.radar:
            self.b1_in_ch = 5
            self.b2_in_ch = 66
            self.b3_in_ch = 130
            self.b4_in_ch = 258
            self.b5_in_ch = 514
        else:
            self.b1_in_ch = 3
            self.b2_in_ch = 64
            self.b3_in_ch = 128
            self.b4_in_ch = 256
            self.b5_in_ch = 512

        self.block1 = nn.Sequential(OrderedDict([
            ('block1_conv1', nn.Conv2d(in_channels=self.b1_in_ch, out_channels=64, kernel_size=3, stride=1, padding=1)),
            ('block1_conv1relu', nn.ReLU(inplace=False)),
            ('block1_conv2', nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)),
            ('block1_conv2relu', nn.ReLU(inplace=False)),
            ('block1_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block2 = nn.Sequential(OrderedDict([
            ('block2_conv1', nn.Conv2d(in_channels=self.b2_in_ch, out_channels=128, kernel_size=3, stride=1, padding=1)),
            ('block2_conv1relu', nn.ReLU(inplace=False)),
            ('block2_conv2', nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)),
            ('block2_conv2relu', nn.ReLU(inplace=False)),
            ('block2_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block3 = nn.Sequential(OrderedDict([
            ('block3_conv1', nn.Conv2d(in_channels=self.b3_in_ch, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv1relu', nn.ReLU(inplace=False)),
            ('block3_conv2', nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv2relu', nn.ReLU(inplace=False)),
            ('block3_conv3', nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv3relu', nn.ReLU(inplace=False)),
            ('block3_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block4 = nn.Sequential(OrderedDict([
            ('block4_conv1', nn.Conv2d(in_channels=self.b4_in_ch, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv1relu', nn.ReLU(inplace=False)),
            ('block4_conv2', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv2relu', nn.ReLU(inplace=False)),
            ('block4_conv3', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv3relu', nn.ReLU(inplace=False)),
            ('block4_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=(1,0)))
        ]))
        self.block5 = nn.Sequential(OrderedDict([
            ('block5_conv1', nn.Conv2d(in_channels=self.b5_in_ch, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv1relu', nn.ReLU(inplace=False)),
            ('block5_conv2', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv2relu', nn.ReLU(inplace=False)),
            ('block5_conv3', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv3relu', nn.ReLU(inplace=False)),
            ('block5_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=(1,0)))
        ]))

        if self.radar:
            self.rad_block1_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block2_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block3_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block4_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=(1,0))
            self.rad_block5_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=(1,0))
            self.rad_block6_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block7_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1,1)) 

    def _feature_sizes(self):
        #return [66, 130, 258, 514, 514]
        return [self.b2_in_ch, self.b3_in_ch, self.b4_in_ch, self.b5_in_ch, self.b5_in_ch]

    def preprocess_image(self, inputs):
        return preprocess_image(inputs, mode='tf')  # alternative: mode='caffe'

    def forward(self, input):
        concat_out = []
        if self.radar:
            radar_out = []
            radar_input = input[:, 3:, :, :]  # radar channels appended after RGB
            x = self.block1(input)
            y = self.rad_block1_pool(radar_input)
            x = torch.cat((x, y), axis=1)
        else:
            x = self.block1(input[:, :3, :, :])  # camera-only path: RGB channels
        x = self.block2(x)
        if self.radar:
            y = self.rad_block2_pool(y)
            x = torch.cat((x, y), axis=1)
        x = self.block3(x)
        if self.radar:
            y = self.rad_block3_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), axis=1)
        concat_out.append(x)
        x = self.block4(x)
        if self.radar:
            y = self.rad_block4_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), axis=1)
        concat_out.append(x)
        x = self.block5(x)
        if self.radar:
            y = self.rad_block5_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), axis=1)
        concat_out.append(x)
        x = self.global_avg_pool(x)  # note: this pooled value is not used below
        if self.radar:
            y = self.rad_block6_pool(y) 
            radar_out.append(y)
            y = self.rad_block7_pool(y)
            radar_out.append(y)
            return concat_out, radar_out
        else:
            return concat_out



class Retinanet(nn.Module):
    # def __init__(self, backbone, pretrained, num_anchors, num_classes, num_values_regression=4, feature_size=254, image_size=(360, 640)):
    def __init__(self, num_anchors, num_classes, num_values_regression=4, feature_size=254, image_size=(360, 640)):
        super(Retinanet, self).__init__()
        
        self.feature_size = feature_size
        self.num_values_regression = num_values_regression
        self.num_anchors = num_anchors
        self.pyramid_feature_size = 256
        self.regression_feature_size = 256
        self.classification_feature_size = 256
        self.num_classes = num_classes

        # self.backbone = backbone#torchmodels.vgg16()#backbone
        self.backbone = Vggmax()

        __feature_size = self.backbone._feature_sizes()
        self.p5_conv1 = nn.Conv2d(in_channels=__feature_size[-1], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p5_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)
        self.p5_upsample = transforms.Resize((int(image_size[0]/16+1), int(image_size[1]/16)), interpolation=InterpolationMode.NEAREST)
        #self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest')

        self.p4_conv1 = nn.Conv2d(in_channels=__feature_size[-2], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p4_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)
        self.p4_upsample = transforms.Resize((int(image_size[0]/8), int(image_size[1]/8)), interpolation=InterpolationMode.NEAREST)
        #self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest')

        self.p3_conv1 = nn.Conv2d(in_channels=__feature_size[-3], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p3_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)

        self.p6_conv = nn.Conv2d(in_channels=__feature_size[-1], out_channels=self.feature_size, kernel_size=3, stride=2, padding=1)
        self.p7_conv = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=2, padding=1)

        ### Regression ops here
        self.regression_ops = nn.ModuleList()
        inp_channels = self.pyramid_feature_size
        for i in range(4):
            self.regression_ops += [nn.Conv2d(in_channels=inp_channels, out_channels=self.regression_feature_size, kernel_size=3, stride=1, padding=1)]  #TODO: Kernel initializer to normal pending
            inp_channels = self.regression_feature_size
            self.regression_ops += [nn.ReLU(inplace=False)]
        self.regression_ops += [nn.Conv2d(in_channels=self.regression_feature_size, out_channels=self.num_anchors*self.num_values_regression, kernel_size=3, stride=1, padding=1)] #TODO: Kernel initializer to normal pending

        ### Classification ops here
        self.classification_ops = nn.ModuleList()
        inp_channels = self.pyramid_feature_size
        for i in range(4):
            self.classification_ops += [nn.Conv2d(in_channels=inp_channels, out_channels=self.classification_feature_size, kernel_size=3, stride=1, padding=1)]
            inp_channels = self.classification_feature_size
            self.classification_ops += [nn.ReLU(inplace=False)]
        self.classification_ops += [nn.Conv2d(in_channels=self.classification_feature_size, out_channels=self.num_classes*self.num_anchors, kernel_size=3, stride=1, padding=1)]

        self.out_sig = nn.Sigmoid()

        self.anchors = Anchors()

        self.focalloss = losses.FocalLoss()
        self.bboxtransform = BBoxTransform()
        self.clipboxes = ClipBoxes()


    def create_pyramid_features(self, concat_features, radar_layers=None):

        p5 = self.p5_conv1(concat_features[-1])
        p5_upsampled = self.p5_upsample(p5)
        p5 = self.p5_conv2(p5)

        p4 = self.p4_conv1(concat_features[-2])

        p4 += p5_upsampled
        p4_upsampled = self.p4_upsample(p4)
        p4 = self.p4_conv2(p4)

        p3 = self.p3_conv1(concat_features[-3])

        p3 += p4_upsampled
        p3 = self.p3_conv2(p3)

        p6 = self.p6_conv(concat_features[-1])

        p7 = nn.ReLU(inplace=False)(p6)
        p7 = self.p7_conv(p7)

        if self.backbone.radar:
            r3 = radar_layers[0]
            r4 = radar_layers[1]
            r5 = radar_layers[2]
            r6 = radar_layers[3]
            r7 = radar_layers[4]

            p3 = torch.cat((p3, r3), axis=1) 
            p4 = torch.cat((p4, r4), axis=1) 
            p5 = torch.cat((p5, r5), axis=1) 
            p6 = torch.cat((p6, r6), axis=1) 
            p7 = torch.cat((p7, r7), axis=1) 

        return [p3, p4, p5, p6, p7]

    def run_regression_submodel(self, features, num_values):
        for i in range(len(self.regression_ops)):
            features = self.regression_ops[i](features)
        # (B, C, H, W) -> (B, H, W, C), then flatten to (B, H*W*num_anchors, num_values)
        features = torch.permute(features, (0, 2, 3, 1))
        outputs = features.contiguous().view(features.shape[0], -1, num_values)

        return outputs

    def run_classification_submodel(self, features, num_classes):
        for i in range(len(self.classification_ops)):
            features = self.classification_ops[i](features)

        # (B, C, H, W) -> (B, H, W, C); 'width' below actually holds H and
        # 'height' holds W, but the final flatten makes this harmless
        features = features.permute(0, 2, 3, 1).contiguous()
        batch_size, width, height, channels = features.shape

        features = self.out_sig(features)

        outputs = features.view(batch_size, channels, height, width, -1)
        # flatten to (B, H * W * num_anchors, num_classes)
        outputs = outputs.contiguous().view(batch_size, -1, num_classes)
        return outputs

    def forward(self, input):

        if self.backbone.radar:
            image_features, radar_features = self.backbone(input)
        else:
            image_features, radar_features = self.backbone(input), None
        pyramid_features = self.create_pyramid_features(concat_features=image_features, radar_layers=radar_features) 

        regression_out = torch.cat([self.run_regression_submodel(feature, 4) for feature in pyramid_features], dim=1)
        classification_out = torch.cat([self.run_classification_submodel(feature, self.num_classes) for feature in pyramid_features], dim=1)

        anchors = self.anchors(input)

        if self.training:
            return classification_out, regression_out, anchors

        else:
            transformed_anchors = self.bboxtransform(anchors.to(regression_out.device), regression_out)
            transformed_anchors = self.clipboxes(transformed_anchors, input)

            ## visualization
            # for b in range(input.shape[0]):
            #     img_rgb = input[b, :, :, :3]
            #     pred_boxes = transformed_anchors[b, :, :]
            #     draw_bboxes_cv2(img_rgb, pred_boxes)

            finalResult = [[], [], []]
            finalScores = torch.Tensor([])
            finalAnchorBoxesIndexes = torch.Tensor([]).long()
            finalAnchorBoxesCoordinates = torch.Tensor([])

            if torch.cuda.is_available():
                finalScores = finalScores.cuda()
                finalAnchorBoxesIndexes = finalAnchorBoxesIndexes.cuda()
                finalAnchorBoxesCoordinates = finalAnchorBoxesCoordinates.cuda()


            for i in range(classification_out.shape[2]):  # loop over classes
                scores = torch.squeeze(classification_out[:, :, i])  # assumes batch size 1
                # scores_over_thresh = (scores > 0.05)
                scores_over_thresh = (scores > 0.01)
                if scores_over_thresh.sum() == 0:
                    # no boxes to NMS, just continue
                    continue

                scores = scores[scores_over_thresh]
                anchorBoxes = torch.squeeze(transformed_anchors)
                anchorBoxes = anchorBoxes[scores_over_thresh]
                anchors_nms_idx = nms(anchorBoxes, scores, 0.5)

                finalResult[0].extend(scores[anchors_nms_idx])
                finalResult[1].extend(torch.tensor([i] * anchors_nms_idx.shape[0]))
                finalResult[2].extend(anchorBoxes[anchors_nms_idx])

                finalScores = torch.cat((finalScores, scores[anchors_nms_idx]))
                finalAnchorBoxesIndexesValue = torch.tensor([i] * anchors_nms_idx.shape[0])
                if torch.cuda.is_available():
                    finalAnchorBoxesIndexesValue = finalAnchorBoxesIndexesValue.cuda()

                finalAnchorBoxesIndexes = torch.cat((finalAnchorBoxesIndexes, finalAnchorBoxesIndexesValue))
                finalAnchorBoxesCoordinates = torch.cat((finalAnchorBoxesCoordinates, anchorBoxes[anchors_nms_idx]))

            return [finalScores, finalAnchorBoxesIndexes, finalAnchorBoxesCoordinates]

I'm not sure whether this problem comes from the loss function or from the model architecture.
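If it helps narrow things down, this is the kind of per-batch check I plan to add to the training loop, right after loss.backward() and before optimizer.step() (a rough sketch; 'model', 'classification_loss', and 'regression_loss' are placeholders for whatever names the actual loop uses):

import torch

# Log the loss terms and the total gradient norm. A sudden spike (or NaN/inf)
# between epoch 1 and epoch 2 would point at the loss/optimization side
# rather than the architecture.
total_norm = torch.norm(torch.stack([
    p.grad.detach().norm() for p in model.parameters() if p.grad is not None
]))
print(f"cls_loss={classification_loss.item():.4f} "
      f"reg_loss={regression_loss.item():.4f} grad_norm={total_norm:.2f}")

# Optionally clip gradients to rule out a single exploding update:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)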

Any help would be appreciated.
