I'm practicing with this code: https://github.com/afzalxo/CameraRadarFusionNet-Pytorch
Probably because of the repository's age, I ran into a number of version problems (nuscenes-devkit, Python), so I trained after slightly adjusting the data loader.
After training on the nuScenes dataset I ran inference, but it produced no detections at all.
While debugging the training process, I found that the output of the sigmoid ('self.out_sig', an nn.Sigmoid) in the classification submodule is a tensor filled with zeros.
This happens from the second epoch onward, not during the first epoch.
I attach pictures of the classification output from the first epoch and from the second epoch.
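For context, this is roughly how I inspected those values (a minimal sketch of a forward pass outside the training loop; 'model' is my Retinanet instance and 'batch' is one preprocessed input batch, both placeholders for my setup):

import torch

def debug_classification_output(model, batch):
    model.eval()
    with torch.no_grad():
        # backbone returns (image_features, radar_features) when radar is on
        image_features, radar_features = model.backbone(batch)
        pyramid = model.create_pyramid_features(concat_features=image_features, radar_layers=radar_features)
        cls_out = torch.cat([model.run_classification_submodel(f, model.num_classes) for f in pyramid], dim=1)
    # After epoch 1 this shows a normal spread of probabilities;
    # from epoch 2 onward min, max, and mean are all 0.
    print('min:', cls_out.min().item(), 'max:', cls_out.max().item(), 'mean:', cls_out.mean().item())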

This is the classification module code.
def run_classification_submodel(self, features, num_classes):
    # Apply the conv/ReLU stack of the classification head.
    for op in self.classification_ops:
        features = op(features)
    # (B, num_anchors*num_classes, H, W) -> (B, H, W, num_anchors*num_classes)
    features = features.permute(0, 2, 3, 1).contiguous()
    batch_size = features.shape[0]
    features = self.out_sig(features)
    # Flatten locations and anchors: (B, H*W*num_anchors, num_classes)
    outputs = features.view(batch_size, -1, num_classes)
    return outputs
The 'classification' shown in the figures above is the same value as 'classification_out' in the full code below.
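To rule out the reshaping itself, I also checked it on a dummy tensor (a standalone sketch; num_anchors=9 is the usual RetinaNet value and num_classes=8 is just an example, and 45x80 matches my P3 level for a 360x640 input):

import torch

num_anchors, num_classes = 9, 8
features = torch.randn(2, num_anchors * num_classes, 45, 80)  # (B, A*K, H, W)
features = features.permute(0, 2, 3, 1).contiguous()          # (B, H, W, A*K)
outputs = torch.sigmoid(features).view(2, -1, num_classes)    # (B, H*W*A, K)
print(outputs.shape)  # torch.Size([2, 32400, 8])

The reshape behaves as expected here, so the zeros seem to come from the values themselves rather than from the tensor handling.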
And here is the full model code.
class Vggmax(nn.Module):
    def __init__(self):
        super(Vggmax, self).__init__()
        self.radar = True  # set to False for the camera-only variant
        if self.radar:
            # 3 camera + 2 radar input channels; each later block takes the
            # previous block's output concatenated with the pooled radar map.
            self.b1_in_ch = 5
            self.b2_in_ch = 66
            self.b3_in_ch = 130
            self.b4_in_ch = 258
            self.b5_in_ch = 514
        else:
            self.b1_in_ch = 3
            self.b2_in_ch = 64
            self.b3_in_ch = 128
            self.b4_in_ch = 256
            self.b5_in_ch = 512
        self.block1 = nn.Sequential(OrderedDict([
            ('block1_conv1', nn.Conv2d(in_channels=self.b1_in_ch, out_channels=64, kernel_size=3, stride=1, padding=1)),
            ('block1_conv1relu', nn.ReLU(inplace=False)),
            ('block1_conv2', nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)),
            ('block1_conv2relu', nn.ReLU(inplace=False)),
            ('block1_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block2 = nn.Sequential(OrderedDict([
            ('block2_conv1', nn.Conv2d(in_channels=self.b2_in_ch, out_channels=128, kernel_size=3, stride=1, padding=1)),
            ('block2_conv1relu', nn.ReLU(inplace=False)),
            ('block2_conv2', nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)),
            ('block2_conv2relu', nn.ReLU(inplace=False)),
            ('block2_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block3 = nn.Sequential(OrderedDict([
            ('block3_conv1', nn.Conv2d(in_channels=self.b3_in_ch, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv1relu', nn.ReLU(inplace=False)),
            ('block3_conv2', nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv2relu', nn.ReLU(inplace=False)),
            ('block3_conv3', nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)),
            ('block3_conv3relu', nn.ReLU(inplace=False)),
            ('block3_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        ]))
        self.block4 = nn.Sequential(OrderedDict([
            ('block4_conv1', nn.Conv2d(in_channels=self.b4_in_ch, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv1relu', nn.ReLU(inplace=False)),
            ('block4_conv2', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv2relu', nn.ReLU(inplace=False)),
            ('block4_conv3', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block4_conv3relu', nn.ReLU(inplace=False)),
            ('block4_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=(1, 0)))
        ]))
        self.block5 = nn.Sequential(OrderedDict([
            ('block5_conv1', nn.Conv2d(in_channels=self.b5_in_ch, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv1relu', nn.ReLU(inplace=False)),
            ('block5_conv2', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv2relu', nn.ReLU(inplace=False)),
            ('block5_conv3', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)),
            ('block5_conv3relu', nn.ReLU(inplace=False)),
            ('block5_mp', nn.MaxPool2d(kernel_size=2, stride=2, padding=(1, 0)))
        ]))
        if self.radar:
            # Pool the raw radar channels down to each level's resolution.
            self.rad_block1_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block2_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block3_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block4_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=(1, 0))
            self.rad_block5_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=(1, 0))
            self.rad_block6_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
            self.rad_block7_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

    def _feature_sizes(self):
        return [self.b2_in_ch, self.b3_in_ch, self.b4_in_ch, self.b5_in_ch, self.b5_in_ch]

    def preprocess_image(self, inputs):
        return preprocess_image(inputs, mode='tf')

    def forward(self, input):
        concat_out = []
        if self.radar:
            radar_out = []
            radar_input = input[:, 3:, :, :]  # the 2 radar channels
            x = self.block1(input)
        else:
            x = self.block1(input[:, :3, :, :])  # camera-only: block1 on RGB
        if self.radar:
            y = self.rad_block1_pool(radar_input)
            x = torch.cat((x, y), dim=1)
        x = self.block2(x)
        if self.radar:
            y = self.rad_block2_pool(y)
            x = torch.cat((x, y), dim=1)
        x = self.block3(x)
        if self.radar:
            y = self.rad_block3_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), dim=1)
        concat_out.append(x)
        x = self.block4(x)
        if self.radar:
            y = self.rad_block4_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), dim=1)
        concat_out.append(x)
        x = self.block5(x)
        if self.radar:
            y = self.rad_block5_pool(y)
            radar_out.append(y)
            x = torch.cat((x, y), dim=1)
        concat_out.append(x)
        x = self.global_avg_pool(x)  # result is not used further
        if self.radar:
            y = self.rad_block6_pool(y)
            radar_out.append(y)
            y = self.rad_block7_pool(y)
            radar_out.append(y)
            return concat_out, radar_out
        else:
            return concat_out
class Retinanet(nn.Module):
    def __init__(self, num_anchors, num_classes, num_values_regression=4, feature_size=254, image_size=(360, 640)):
        super(Retinanet, self).__init__()
        # feature_size is 254 so that, after the 2 radar channels are
        # concatenated, each pyramid level has 256 channels for the heads.
        self.feature_size = feature_size
        self.num_values_regression = num_values_regression
        self.num_anchors = num_anchors
        self.pyramid_feature_size = 256
        self.regression_feature_size = 256
        self.classification_feature_size = 256
        self.num_classes = num_classes
        self.backbone = Vggmax()
        __feature_size = self.backbone._feature_sizes()
        # FPN-style lateral and output convs; upsampling is done with a
        # nearest-neighbor resize to the exact feature-map sizes.
        self.p5_conv1 = nn.Conv2d(in_channels=__feature_size[-1], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p5_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)
        self.p5_upsample = transforms.Resize((int(image_size[0]/16+1), int(image_size[1]/16)), interpolation=InterpolationMode.NEAREST)
        self.p4_conv1 = nn.Conv2d(in_channels=__feature_size[-2], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p4_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)
        self.p4_upsample = transforms.Resize((int(image_size[0]/8), int(image_size[1]/8)), interpolation=InterpolationMode.NEAREST)
        self.p3_conv1 = nn.Conv2d(in_channels=__feature_size[-3], out_channels=self.feature_size, kernel_size=1, stride=1, padding=0)
        self.p3_conv2 = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=1, padding=1)
        self.p6_conv = nn.Conv2d(in_channels=__feature_size[-1], out_channels=self.feature_size, kernel_size=3, stride=2, padding=1)
        self.p7_conv = nn.Conv2d(in_channels=self.feature_size, out_channels=self.feature_size, kernel_size=3, stride=2, padding=1)
        ### Regression head
        self.regression_ops = nn.ModuleList()
        inp_channels = self.pyramid_feature_size
        for i in range(4):
            self.regression_ops += [nn.Conv2d(in_channels=inp_channels, out_channels=self.regression_feature_size, kernel_size=3, stride=1, padding=1)]  # TODO: kernel initializer to normal pending
            inp_channels = self.regression_feature_size
            self.regression_ops += [nn.ReLU(inplace=False)]
        self.regression_ops += [nn.Conv2d(in_channels=self.regression_feature_size, out_channels=self.num_anchors*self.num_values_regression, kernel_size=3, stride=1, padding=1)]  # TODO: kernel initializer to normal pending
        ### Classification head
        self.classification_ops = nn.ModuleList()
        inp_channels = self.pyramid_feature_size
        for i in range(4):
            self.classification_ops += [nn.Conv2d(in_channels=inp_channels, out_channels=self.classification_feature_size, kernel_size=3, stride=1, padding=1)]
            inp_channels = self.classification_feature_size
            self.classification_ops += [nn.ReLU(inplace=False)]
        self.classification_ops += [nn.Conv2d(in_channels=self.classification_feature_size, out_channels=self.num_classes*self.num_anchors, kernel_size=3, stride=1, padding=1)]
        self.out_sig = nn.Sigmoid()
        self.anchors = Anchors()
        self.focalloss = losses.FocalLoss()
        self.bboxtransform = BBoxTransform()
        self.clipboxes = ClipBoxes()
    def create_pyramid_features(self, concat_features, radar_layers=None):
        # Top-down FPN pathway.
        p5 = self.p5_conv1(concat_features[-1])
        p5_upsampled = self.p5_upsample(p5)
        p5 = self.p5_conv2(p5)
        p4 = self.p4_conv1(concat_features[-2])
        p4 += p5_upsampled
        p4_upsampled = self.p4_upsample(p4)
        p4 = self.p4_conv2(p4)
        p3 = self.p3_conv1(concat_features[-3])
        p3 += p4_upsampled
        p3 = self.p3_conv2(p3)
        p6 = self.p6_conv(concat_features[-1])
        p7 = nn.ReLU(inplace=False)(p6)
        p7 = self.p7_conv(p7)
        if self.backbone.radar:
            # Concatenate the pooled radar maps so each level has 256 channels.
            r3, r4, r5, r6, r7 = radar_layers
            p3 = torch.cat((p3, r3), dim=1)
            p4 = torch.cat((p4, r4), dim=1)
            p5 = torch.cat((p5, r5), dim=1)
            p6 = torch.cat((p6, r6), dim=1)
            p7 = torch.cat((p7, r7), dim=1)
        return [p3, p4, p5, p6, p7]
    def run_regression_submodel(self, features, num_values):
        for op in self.regression_ops:
            features = op(features)
        features = features.permute(0, 2, 3, 1)
        outputs = features.contiguous().view(features.shape[0], -1, num_values)
        return outputs
    def run_classification_submodel(self, features, num_classes):
        # Apply the conv/ReLU stack of the classification head.
        for op in self.classification_ops:
            features = op(features)
        # (B, num_anchors*num_classes, H, W) -> (B, H, W, num_anchors*num_classes)
        features = features.permute(0, 2, 3, 1).contiguous()
        batch_size = features.shape[0]
        features = self.out_sig(features)
        # Flatten locations and anchors: (B, H*W*num_anchors, num_classes)
        outputs = features.view(batch_size, -1, num_classes)
        return outputs
    def forward(self, input):
        if self.backbone.radar:
            image_features, radar_features = self.backbone(input)
        else:
            image_features, radar_features = self.backbone(input), None
        pyramid_features = self.create_pyramid_features(concat_features=image_features, radar_layers=radar_features)
        regression_out = torch.cat([self.run_regression_submodel(feature, 4) for feature in pyramid_features], dim=1)
        classification_out = torch.cat([self.run_classification_submodel(feature, self.num_classes) for feature in pyramid_features], dim=1)
        anchors = self.anchors(input)
        if self.training:
            return classification_out, regression_out, anchors
        else:
            transformed_anchors = self.bboxtransform(anchors.to(regression_out.device), regression_out)
            transformed_anchors = self.clipboxes(transformed_anchors, input)
            finalResult = [[], [], []]
            finalScores = torch.Tensor([])
            finalAnchorBoxesIndexes = torch.Tensor([]).long()
            finalAnchorBoxesCoordinates = torch.Tensor([])
            if torch.cuda.is_available():
                finalScores = finalScores.cuda()
                finalAnchorBoxesIndexes = finalAnchorBoxesIndexes.cuda()
                finalAnchorBoxesCoordinates = finalAnchorBoxesCoordinates.cuda()
            # Per-class thresholding and NMS; the squeeze assumes batch size 1.
            for i in range(classification_out.shape[2]):
                scores = torch.squeeze(classification_out[:, :, i])
                scores_over_thresh = (scores > 0.01)  # lowered from 0.05
                if scores_over_thresh.sum() == 0:
                    # no boxes to run NMS on for this class, just continue
                    continue
                scores = scores[scores_over_thresh]
                anchorBoxes = torch.squeeze(transformed_anchors)
                anchorBoxes = anchorBoxes[scores_over_thresh]
                anchors_nms_idx = nms(anchorBoxes, scores, 0.5)
                finalResult[0].extend(scores[anchors_nms_idx])
                finalResult[1].extend(torch.tensor([i] * anchors_nms_idx.shape[0]))
                finalResult[2].extend(anchorBoxes[anchors_nms_idx])
                finalScores = torch.cat((finalScores, scores[anchors_nms_idx]))
                finalAnchorBoxesIndexesValue = torch.tensor([i] * anchors_nms_idx.shape[0])
                if torch.cuda.is_available():
                    finalAnchorBoxesIndexesValue = finalAnchorBoxesIndexesValue.cuda()
                finalAnchorBoxesIndexes = torch.cat((finalAnchorBoxesIndexes, finalAnchorBoxesIndexesValue))
                finalAnchorBoxesCoordinates = torch.cat((finalAnchorBoxesCoordinates, anchorBoxes[anchors_nms_idx]))
            return [finalScores, finalAnchorBoxesIndexes, finalAnchorBoxesCoordinates]
I'm not sure whether this problem comes from the loss function or from the model architecture.
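To narrow it down, I'm planning to watch the raw logits going into the sigmoid with a forward hook (a sketch; 'model' is again my Retinanet instance):

# The last entry of classification_ops is the final conv before the sigmoid.
logits_stats = []

def log_logits(module, inp, out):
    logits_stats.append((out.min().item(), out.max().item(), out.mean().item()))

hook = model.classification_ops[-1].register_forward_hook(log_logits)
# ... run one training epoch, then inspect logits_stats ...
hook.remove()

If the logits drift to large negative values during the second epoch, I'd suspect the loss/optimization side; if they still look reasonable while the sigmoid output is zero, I'd suspect the model code.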
Any help would be appreciated.
