I am training a DNN (a CRNN) with PyTorch, but the loss value is behaving abnormally.
The program prints the average loss every 20 batches and saves the model parameters every 100 batches. The initial loss is about 20-30. A problem in my program interrupted the training process. After loading the parameters from the saved model, I continued training, but found that the loss again starts from about 20-30. For context, my dataset has about 10 million pictures, and about 3 million of them had already been used for training.
I want to figure out where the problem lies: in how PyTorch works, or in a bug in my program.
Here are more details:
1. CRNN structure:
CRNN (
(cnn): Sequential (
(conv0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu0): ReLU (inplace)
(pooling0): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
(conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu1): ReLU (inplace)
(pooling1): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
(conv2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(batchnorm2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
(relu2): ReLU (inplace)
(conv3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu3): ReLU (inplace)
(pooling2): MaxPool2d (size=(2, 2), stride=(2, 1), dilation=(1, 1))
(conv4): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(batchnorm4): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
(relu4): ReLU (inplace)
(conv5): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(relu5): ReLU (inplace)
(pooling3): MaxPool2d (size=(2, 2), stride=(2, 1), dilation=(1, 1))
(conv6): Conv2d(512, 512, kernel_size=(2, 2), stride=(1, 1))
(batchnorm6): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
(relu6): ReLU (inplace)
)
(rnn): Sequential (
(0): BidirectionalLSTM (
(rnn): LSTM(512, 256, bidirectional=True)
(embedding): Linear (512 -> 256)
)
(1): BidirectionalLSTM (
(rnn): LSTM(256, 256, bidirectional=True)
(embedding): Linear (512 -> 5530)
)
)
)
2. model init and parameters loading.
def crnnSource():
    """Build the CRNN and its label converter, then restore saved weights.

    The network is constructed, passed through ``weights_init``, and then
    populated from the checkpoint at ``./models/crnn_OCR.pkl``.

    Returns:
        A ``(model, converter)`` tuple: the network with the checkpoint
        loaded, and the ``util.strLabelConverter`` built from
        ``keys.alphabet``.
    """
    charset = keys.alphabet
    label_codec = util.strLabelConverter(charset)

    # One output class more than the alphabet has symbols
    # (presumably the CTC blank -- TODO confirm).
    num_classes = len(charset) + 1
    net = crnn.CRNN(32, 1, num_classes, 256, 1)  # need 1?
    net.apply(weights_init)

    # The state dict loaded here is applied after weights_init has run.
    saved_state = torch.load('./models/crnn_OCR.pkl')
    net.load_state_dict(saved_state)
    return net, label_codec
3. training code
def trainProc(net ,trainset, converter):
    """Train ``net`` on ``trainset`` with CTC loss, saving checkpoints as it goes.

    NOTE(review): the pasted source had lost its indentation. The nesting
    below is a reconstruction; the placement of the save-counter block and
    of the trailing ``torch.save`` in particular must be confirmed against
    the real file.

    Args:
        net: the model to train; invoked as ``net(image)``.
        trainset: iterable yielding ``(img, txt)`` pairs, one per batch.
        converter: object whose ``encode(txt)`` returns two values, which
            are loaded into the ``text`` and ``length`` buffers.

    Returns:
        The same ``net`` object that was passed in.
    """
    print ("--------------------------------")
    print ("Start to Train.")
    criterion = CTCLoss().cuda()
    loss_avg = util.averager()
    # A fresh optimizer is created on every call. This function only ever
    # saves net.state_dict(); the optimizer itself is never saved here.
    optimizer = optim.RMSprop(net.parameters(), lr = 0.001)
    # Buffers allocated once and refilled for each batch via util.loadData.
    # NOTE(review): the image buffer is allocated with 3 channels although
    # the printed model's conv0 takes 1 input channel -- presumably
    # util.loadData resizes the buffer to the incoming data; confirm.
    image = torch.FloatTensor(BATCH_SIZE, 3, 32, 100) #opt.imgH
    text = torch.IntTensor(BATCH_SIZE * 5)
    length = torch.IntTensor(BATCH_SIZE)
    # Only the image buffer gets a .cuda() call; none is made on text or
    # length in this function.
    image = image.cuda()
    image = Variable(image)
    text = Variable(text)
    length = Variable(length)
    sav_inv = 0
    for epoch in range(TRAIN_EPOCHS):
        # The save counter is reset at the start of every epoch.
        sav_inv = 0
        timer = time.time()
        for i,data in enumerate(trainset, 0):
            img, txt = data
            img = ConvtFileToTensor(img)
            batch_size = img.size(0)
            util.loadData(image, img)
            t, l = converter.encode(txt)
            util.loadData(text,t)
            util.loadData(length,l)
            preds = net(image)
            # Every sample in the batch is given the same prediction
            # length, preds.size(0).
            preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            # The criterion output is divided by the batch size.
            cost = criterion(preds, text, preds_size, length) / batch_size
            net.zero_grad()
            cost.backward()
            optimizer.step()
            loss_avg.add(cost)
            #running_loss += loss.data[0]
            # On every 20th batch: print the averaged loss and elapsed
            # time, then reset both the averager and the timer.
            if i % 20 == 19:
                time2 = time.time()
                print ("[%d, %5d] loss: %.6f TIME: %.6f" %(epoch+1, i+1, loss_avg.val(),time2 - timer))
                print (cost)
                loss_avg.reset()
                timer = time.time()
            # Overwrite the checkpoint once the counter reaches SAV_INV-1,
            # otherwise advance the counter.
            # NOTE(review): assumed to run once per batch (batch-loop
            # level); it may instead belong inside the reporting block
            # above -- confirm against the original file.
            if sav_inv == SAV_INV-1:
                torch.save(net.state_dict(),'./models/crnn_OCR.pkl')
                sav_inv = 0
            else:
                sav_inv += 1
        # NOTE(review): assumed to run at the end of each epoch; it may
        # instead belong after the epoch loop -- confirm.
        torch.save(net.state_dict(),'./models/crnn_OCR.pkl')
    print ("Finished Training.")
    return net