In PyTorch, the loss starts from the same initial value after restoring the model parameters

891 views Asked by At

I am training a DNN (a CRNN) with PyTorch, but the loss value is behaving abnormally.

The program prints avg_loss every 20 batches and saves the model parameters every 100 batches. The initial loss is about 20-30. A problem occurred in my program, so the training process was interrupted. After loading the parameters from the saved model, I continued training but found that the loss still starts from 20-30. By the way, my dataset has about 10 million pictures, and I had already trained on about 3 million of them.

I want to figure out where the problem is: is it caused by how PyTorch works, or by a bug in my program?

Here are more details:

1. CRNN structure:

    CRNN (


(cnn): Sequential (
    (conv0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu0): ReLU (inplace)
    (pooling0): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1): ReLU (inplace)
    (pooling1): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (conv2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
    (relu2): ReLU (inplace)
    (conv3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu3): ReLU (inplace)
    (pooling2): MaxPool2d (size=(2, 2), stride=(2, 1), dilation=(1, 1))
    (conv4): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
    (relu4): ReLU (inplace)
    (conv5): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu5): ReLU (inplace)
    (pooling3): MaxPool2d (size=(2, 2), stride=(2, 1), dilation=(1, 1))
    (conv6): Conv2d(512, 512, kernel_size=(2, 2), stride=(1, 1))
    (batchnorm6): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
    (relu6): ReLU (inplace)
  )
  (rnn): Sequential (
    (0): BidirectionalLSTM (
      (rnn): LSTM(512, 256, bidirectional=True)
      (embedding): Linear (512 -> 256)
    )
    (1): BidirectionalLSTM (
      (rnn): LSTM(256, 256, bidirectional=True)
      (embedding): Linear (512 -> 5530)
    )
  )
)

2. model init and parameters loading.

def crnnSource():
    """Build the CRNN, restore its saved weights, and return it with a label converter.

    Returns:
        A ``(model, converter)`` tuple: the ``crnn.CRNN`` instance with the
        state dict from ``./models/crnn_OCR.pkl`` loaded into it, and a
        ``util.strLabelConverter`` built from ``keys.alphabet``.
    """
    charset = keys.alphabet
    label_converter = util.strLabelConverter(charset)

    # One output class more than the alphabet has symbols
    # (presumably the CTC blank -- TODO confirm).
    num_classes = len(charset) + 1
    # NOTE(review): the original author questioned whether the trailing
    # argument `1` is needed.
    net = crnn.CRNN(32, 1, num_classes, 256, 1)

    # Custom initialisation runs first; the checkpoint load below then
    # overwrites every entry present in the saved state dict.
    net.apply(weights_init)
    saved_state = torch.load('./models/crnn_OCR.pkl')
    net.load_state_dict(saved_state)

    return net, label_converter

3. training code

_MODEL_STATE_PATH = './models/crnn_OCR.pkl'
# The optimizer state lives in its own file so the model checkpoint keeps the
# plain state_dict layout that existing load_state_dict() callers expect.
_OPTIM_STATE_PATH = './models/crnn_OCR_optim.pkl'


def _save_checkpoint(net, optimizer):
    """Write the model weights and the optimizer state to their checkpoint files."""
    torch.save(net.state_dict(), _MODEL_STATE_PATH)
    torch.save(optimizer.state_dict(), _OPTIM_STATE_PATH)


def _restore_optimizer(optimizer):
    """Load a previously saved optimizer state if one exists and still fits."""
    import os

    if not os.path.isfile(_OPTIM_STATE_PATH):
        return
    try:
        optimizer.load_state_dict(torch.load(_OPTIM_STATE_PATH))
    except ValueError as err:
        # load_state_dict raises ValueError when the saved parameter groups do
        # not match the current ones (e.g. the architecture changed); fall
        # back to a fresh optimizer instead of aborting the run.
        print ("Ignoring incompatible optimizer state: %s" % err)


def trainProc(net ,trainset, converter):
    """Train ``net`` on ``trainset`` with CTC loss and RMSprop, checkpointing as it goes.

    The model weights are written to ``./models/crnn_OCR.pkl`` and the
    optimizer state to ``./models/crnn_OCR_optim.pkl``. If an optimizer
    checkpoint already exists it is restored before training starts: a newly
    constructed RMSprop has no running gradient statistics, so resuming with
    only the model weights would restart the optimizer from scratch.

    Args:
        net: the model to train; it is called directly on the CUDA image buffer.
        trainset: iterable yielding ``(img, txt)`` batches.
        converter: object whose ``encode(txt)`` returns the encoded labels
            and their lengths.

    Returns:
        ``net`` after ``TRAIN_EPOCHS`` passes over ``trainset``.
    """
    print ("--------------------------------")
    print ("Start to Train.")

    criterion = CTCLoss().cuda()
    loss_avg = util.averager()
    optimizer = optim.RMSprop(net.parameters(), lr = 0.001)
    _restore_optimizer(optimizer)

    # Reusable input buffers, refilled for every batch via util.loadData.
    # NOTE(review): the image buffer is allocated with 3 channels while the
    # CRNN is built with 1 input channel; presumably util.loadData resizes
    # the buffer to the incoming batch -- confirm.
    image = torch.FloatTensor(BATCH_SIZE, 3, 32, 100) #opt.imgH
    text = torch.IntTensor(BATCH_SIZE * 5)
    length = torch.IntTensor(BATCH_SIZE)
    image = image.cuda()

    image = Variable(image)
    text = Variable(text)
    length = Variable(length)

    for epoch in range(TRAIN_EPOCHS):
        # Batches since the last periodic save; restarts with every epoch.
        sav_inv = 0
        timer = time.time()
        for i,data in enumerate(trainset, 0):
            img, txt = data
            img = ConvtFileToTensor(img)
            batch_size = img.size(0)
            util.loadData(image, img)
            t, l = converter.encode(txt)
            util.loadData(text,t)
            util.loadData(length,l)

            preds  = net(image)
            # Every sample in the batch shares the same prediction length.
            preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            cost = criterion(preds, text, preds_size, length) / batch_size

            net.zero_grad()
            cost.backward()
            optimizer.step()

            loss_avg.add(cost)
            # Report and reset the running loss every 20 batches.
            if i % 20 == 19:
                time2 = time.time()
                print ("[%d, %5d] loss: %.6f  TIME: %.6f" %(epoch+1, i+1, loss_avg.val(),time2 - timer))
                print (cost)
                loss_avg.reset()
                timer = time.time()
            # Periodic checkpoint every SAV_INV batches within an epoch.
            if sav_inv == SAV_INV-1:
                _save_checkpoint(net, optimizer)
                sav_inv = 0
            else:
                sav_inv += 1
    _save_checkpoint(net, optimizer)
    print ("Finished Training.")
    return net
0

There are 0 answers