python, pytorch: Why do a Sequential NN and the same nn.Module NN have different results?

import numpy as np
import torch
import torch.nn as nn


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.bn1 = torch.nn.BatchNorm2d(num_features=3)
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.act1 = torch.nn.ReLU()
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr1 = torch.nn.Dropout2d(0.1)

        self.bn2 = torch.nn.BatchNorm2d(num_features=16)
        self.conv2 = torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.act2 = torch.nn.ReLU()
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr2 = torch.nn.Dropout2d(0.1)

        self.bn3 = torch.nn.BatchNorm2d(num_features=32)
        self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.act3 = torch.nn.ReLU()
        self.pool3 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr3 = torch.nn.Dropout2d(0.1)

        self.bn4 = torch.nn.BatchNorm1d(num_features=4 * 4 * 64)
        self.fc4 = torch.nn.Linear(4 * 4 * 64, 256)
        self.act4 = torch.nn.Tanh()
        # self.dr4 = torch.nn.Dropout1d(0.1)

        self.bn5 = torch.nn.BatchNorm1d(num_features=256)
        self.fc5 = torch.nn.Linear(256, 64)
        self.act5 = torch.nn.Tanh()
        # self.dr5 = torch.nn.Dropout1d(0.1)


        self.fc6 = torch.nn.Linear(64, 10)

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)
        x = self.act1(x)
        x = self.pool1(x)
        # x = self.dr1(x)

        x = self.bn2(x)
        x = self.conv2(x)
        x = self.act2(x)
        x = self.pool2(x)
        # x = self.dr2(x)

        x = self.bn3(x)
        x = self.conv3(x)
        x = self.act3(x)
        x = self.pool3(x)
        # x = self.dr3(x)

        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.bn4(x)
        x = self.fc4(x)
        x = self.act4(x)
        # x = self.dr4(x)

        x = self.bn5(x)
        x = self.fc5(x)
        x = self.act5(x)
        # x = self.dr5(x)

        x = self.fc6(x)
        return x
last_model = Net()
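
For reference on the flatten size: with CIFAR10's 32x32 inputs, the three MaxPool2d(kernel_size=2, stride=2) layers halve the spatial resolution three times, so the tensor entering the flatten step has 64 channels of 4x4, which is where 4 * 4 * 64 comes from. A quick shape check (a sketch added for illustration, not part of the original code):

x = torch.randn(2, 3, 32, 32)   # dummy CIFAR10-sized batch
last_model.eval()               # eval mode so BatchNorm uses running statistics
with torch.no_grad():
    print(last_model(x).shape)  # torch.Size([2, 10])
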
#%%
def conv_block(in_f, out_f, activation='relu', *args, **kwargs):
    activations = nn.ModuleDict([
            ['tanh', nn.Tanh()],
            ['relu', nn.ReLU()]
    ])

    return nn.Sequential(
        nn.BatchNorm2d(in_f),
        nn.Conv2d(in_f, out_f, *args, **kwargs),
        activations[activation],
        nn.MaxPool2d(kernel_size=2, stride=2),
        # nn.Dropout2d(0.1)
    )


class MyEncoder(nn.Module):
    def __init__(self, enc_sizes, *args, **kwargs):
        super().__init__()
        self.conv_blocks = nn.Sequential(*[conv_block(in_f,
            out_f, kernel_size=3, padding=1, *args, **kwargs)
            for in_f, out_f in zip(enc_sizes, enc_sizes[1:])])


    def forward(self, x):
        return self.conv_blocks(x)


def dec_block(in_f, out_f):
    return nn.Sequential(
        nn.BatchNorm1d(in_f),
        nn.Linear(in_f, out_f),
        nn.Tanh(),
        # nn.Dropout1d(0.1)
    )

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
            for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)


    def forward(self, x):
        return self.dec_blocks(x)


class MyNET(nn.Module):
    def __init__(self, in_c, enc_sizes, dec_sizes, n_classes, activation='relu'):
        super().__init__()
        self.enc_sizes = [in_c, *enc_sizes]
        l = 32 / (2 ** len(enc_sizes))
        # print(enc_sizes[-1] * l * l)
        self.dec_sizes = [int(enc_sizes[-1] * l * l), *dec_sizes]
        self.encoder = MyEncoder(self.enc_sizes, activation=activation)
        self.decoder = MyDecoder(self.dec_sizes, n_classes)


    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.decoder(x)
        return x
my_model = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')
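
The mismatch that the answer below points out can be seen directly by comparing the two models' output shapes on a dummy batch (a sketch added for illustration, assuming both models are constructed as above):

dummy = torch.randn(2, 3, 32, 32)
last_model.eval()
my_model.eval()
with torch.no_grad():
    print(last_model(dummy).shape)  # torch.Size([2, 10])
    print(my_model(dummy).shape)    # torch.Size([2, 64])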

And the results after 5 epochs on CIFAR10:

tensor(0.6721)
tensor(0.7059)
tensor(0.7359)
tensor(0.7288)
tensor(0.7373)
---------------
tensor(0.4944)
tensor(0.5391)
tensor(0.5898)
tensor(0.6283)
tensor(0.6398)

The training function:

def train(net, X_train, y_train, X_test, y_test):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = net.to(device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3, weight_decay=1e-5)
    batch_size = 100
    test_accuracy_history = []
    test_loss_history = []
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    for epoch in range(5):
        order = np.random.permutation(len(X_train))
        for start_index in range(0, len(X_train), batch_size):
            optimizer.zero_grad()
            net.train()

            batch_indexes = order[start_index:start_index+batch_size]

            X_batch = X_train[batch_indexes].to(device)
            y_batch = y_train[batch_indexes].to(device).view(-1)

            preds = net.forward(X_batch)

            loss_value = loss(preds, y_batch)
            loss_value.backward()

            optimizer.step()

        net.eval()
        test_preds = net.forward(X_test)
        test_loss_history.append(loss(test_preds, y_test.squeeze()).data.cpu())

        accuracy = (test_preds.argmax(dim=1) == y_test).float().mean().data.cpu()
        test_accuracy_history.append(accuracy)

        print(accuracy)
    print('---------------')
    return test_accuracy_history, test_loss_history

I was expecting these to be identical networks that produce the same results. At first I thought the problem was in the training itself, but if I train the second model first and then the first, the results stay the same. In the code I deliberately disabled dropout so that it cannot randomly switch off neurons (and the random seed is the same in both runs). Could the problem be that gradients are somehow computed differently for the Sequential blocks than for the plain nn.Module version?
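
For a strictly like-for-like comparison it also helps to reset every RNG immediately before constructing and training each model, so both runs start from the same initial weights and the same batch order. A sketch (the seed_everything helper is hypothetical, not part of the original code):

def seed_everything(seed=0):
    # hypothetical helper: pin the Python, NumPy and PyTorch RNGs
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(0)
model_a = Net()
seed_everything(0)
model_b = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')

Even with matching seeds, though, the two models here cannot produce identical outputs, because of the issue described in the answer below.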


1 Answer

Answer by Karl:

The forward method of your MyDecoder module skips the final linear layer (self.last).

As written, the first model produces an output of size (bs, 10), while the second model produces an output of size (bs, 64).
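
Based on that, a minimal sketch of the corrected decoder (same definition as in the question, with the final layer actually applied in forward):

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
            for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)

    def forward(self, x):
        x = self.dec_blocks(x)
        return self.last(x)  # apply the final linear layer so the output is (bs, n_classes)

With this change the second model also ends in a (bs, 10) output, matching the first.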