python, pytorch: Why do a Sequential NN and the same nn.Module NN have different results?

import numpy as np
import torch
import torch.nn as nn


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.bn1 = torch.nn.BatchNorm2d(num_features=3)
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.act1 = torch.nn.ReLU()
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr1 = torch.nn.Dropout2d(0.1)

        self.bn2 = torch.nn.BatchNorm2d(num_features=16)
        self.conv2 = torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.act2 = torch.nn.ReLU()
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr2 = torch.nn.Dropout2d(0.1)

        self.bn3 = torch.nn.BatchNorm2d(num_features=32)
        self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.act3 = torch.nn.ReLU()
        self.pool3 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr3 = torch.nn.Dropout2d(0.1)

        self.bn4 = torch.nn.BatchNorm1d(num_features=4 * 4 * 64)
        self.fc4 = torch.nn.Linear(4 * 4 * 64, 256)
        self.act4 = torch.nn.Tanh()
        # self.dr4 = torch.nn.Dropout1d(0.1)

        self.bn5 = torch.nn.BatchNorm1d(num_features=256)
        self.fc5 = torch.nn.Linear(256, 64)
        self.act5 = torch.nn.Tanh()
        # self.dr5 = torch.nn.Dropout1d(0.1)


        self.fc6 = torch.nn.Linear(64, 10)

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)
        x = self.act1(x)
        x = self.pool1(x)
        # x = self.dr1(x)

        x = self.bn2(x)
        x = self.conv2(x)
        x = self.act2(x)
        x = self.pool2(x)
        # x = self.dr2(x)

        x = self.bn3(x)
        x = self.conv3(x)
        x = self.act3(x)
        x = self.pool3(x)
        # x = self.dr3(x)

        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.bn4(x)
        x = self.fc4(x)
        x = self.act4(x)
        # x = self.dr4(x)

        x = self.bn5(x)
        x = self.fc5(x)
        x = self.act5(x)
        # x = self.dr5(x)

        x = self.fc6(x)
        return x
last_model = Net()
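
For reference on the flatten size: with CIFAR10's 32x32 inputs, the three MaxPool2d(kernel_size=2, stride=2) layers halve the spatial resolution three times, so the tensor entering the flatten step has 64 channels of 4x4, which is where 4 * 4 * 64 comes from. A quick shape check (a sketch added for illustration, not part of the original code):

x = torch.randn(2, 3, 32, 32)   # dummy CIFAR10-sized batch
last_model.eval()               # eval mode so BatchNorm uses running statistics
with torch.no_grad():
    print(last_model(x).shape)  # torch.Size([2, 10])
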
#%%
def conv_block(in_f, out_f, activation='relu', *args, **kwargs):
    activations = nn.ModuleDict([
            ['tanh', nn.Tanh()],
            ['relu', nn.ReLU()]
    ])

    return nn.Sequential(
        nn.BatchNorm2d(in_f),
        nn.Conv2d(in_f, out_f, *args, **kwargs),
        activations[activation],
        nn.MaxPool2d(kernel_size=2, stride=2),
        # nn.Dropout2d(0.1)
    )


class MyEncoder(nn.Module):
    def __init__(self, enc_sizes, *args, **kwargs):
        super().__init__()
        self.conv_blocks = nn.Sequential(*[conv_block(in_f,
            out_f, kernel_size=3, padding=1, *args, **kwargs)
            for in_f, out_f in zip(enc_sizes, enc_sizes[1:])])


    def forward(self, x):
        return self.conv_blocks(x)


def dec_block(in_f, out_f):
    return nn.Sequential(
        nn.BatchNorm1d(in_f),
        nn.Linear(in_f, out_f),
        nn.Tanh(),
        # nn.Dropout1d(0.1)
    )

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
            for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)


    def forward(self, x):
        return self.dec_blocks(x)


class MyNET(nn.Module):
    def __init__(self, in_c, enc_sizes, dec_sizes, n_classes, activation='relu'):
        super().__init__()
        self.enc_sizes = [in_c, *enc_sizes]
        l = 32 / (2 ** len(enc_sizes))
        # print(enc_sizes[-1] * l * l)
        self.dec_sizes = [int(enc_sizes[-1] * l * l), *dec_sizes]
        self.encoder = MyEncoder(self.enc_sizes, activation=activation)
        self.decoder = MyDecoder(self.dec_sizes, n_classes)


    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.decoder(x)
        return x
my_model = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')
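
The mismatch that the answer below points out can be seen directly by comparing the two models' output shapes on a dummy batch (a sketch added for illustration, assuming both models are constructed as above):

dummy = torch.randn(2, 3, 32, 32)
last_model.eval()
my_model.eval()
with torch.no_grad():
    print(last_model(dummy).shape)  # torch.Size([2, 10])
    print(my_model(dummy).shape)    # torch.Size([2, 64])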

And the results after 5 epochs on CIFAR10:

tensor(0.6721)
tensor(0.7059)
tensor(0.7359)
tensor(0.7288)
tensor(0.7373)
---------------
tensor(0.4944)
tensor(0.5391)
tensor(0.5898)
tensor(0.6283)
tensor(0.6398)

The training function:

def train(net, X_train, y_train, X_test, y_test):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = net.to(device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3, weight_decay=1e-5)
    batch_size = 100
    test_accuracy_history = []
    test_loss_history = []
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    for epoch in range(5):
        order = np.random.permutation(len(X_train))
        for start_index in range(0, len(X_train), batch_size):
            optimizer.zero_grad()
            net.train()

            batch_indexes = order[start_index:start_index+batch_size]

            X_batch = X_train[batch_indexes].to(device)
            y_batch = y_train[batch_indexes].to(device).view(-1)

            preds = net.forward(X_batch)

            loss_value = loss(preds, y_batch)
            loss_value.backward()

            optimizer.step()

        net.eval()
        test_preds = net.forward(X_test)
        test_loss_history.append(loss(test_preds, y_test.squeeze()).data.cpu())

        accuracy = (test_preds.argmax(dim=1) == y_test).float().mean().data.cpu()
        test_accuracy_history.append(accuracy)

        print(accuracy)
    print('---------------')
    return test_accuracy_history, test_loss_history

I was expecting these to be identical networks that produce the same results. At first I thought the problem was in the training itself, but if I train the second model first and then the first, the results stay the same. In the code I deliberately disabled dropout so that it cannot randomly switch off neurons (and the random seed is the same in both runs). Could the problem be that gradients are somehow computed differently for the Sequential blocks than for the plain nn.Module version?
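
For a strictly like-for-like comparison it also helps to reset every RNG immediately before constructing and training each model, so both runs start from the same initial weights and the same batch order. A sketch (the seed_everything helper is hypothetical, not part of the original code):

def seed_everything(seed=0):
    # hypothetical helper: pin the Python, NumPy and PyTorch RNGs
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(0)
model_a = Net()
seed_everything(0)
model_b = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')

Even with matching seeds, though, the two models here cannot produce identical outputs, because of the issue described in the answer below.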


1 Answer

Answer by Karl:

The forward method of your MyDecoder module skips the final linear layer (self.last).

As written, the first model produces an output of size (bs, 10), while the second model produces an output of size (bs, 64).
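
Based on that, a minimal sketch of the corrected decoder (same definition as in the question, with the final layer actually applied in forward):

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
            for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)

    def forward(self, x):
        x = self.dec_blocks(x)
        return self.last(x)  # apply the final linear layer so the output is (bs, n_classes)

With this change the second model also ends in a (bs, 10) output, matching the first.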