I am trying to implement an handwriting ocr based on the keras ocr example: link. However I get the following error:

InvalidArgumentError: All labels must be nonnegative integers, batch: 0 labels: 1,0,11,9,45,0,25,17,27,41,39,9,37,0,23,1,39,9,35,0,11,35,29,25,0,1,0,27,9,1,35,3,49,0,43,17,23,23,1,13,9,0,69,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
     [[{{node ctc_6/CTCLoss}}]]
     [[{{node training_5/SGD/gradients/ctc_6/CTCLoss_grad/mul}}]]

Here are the generator, the ctc and the train function:

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
#Generation of data: load the images, resize, gray, normalize them 
class DataGenerator(keras.utils.Sequence):
    def __init__(self, list_Files, labels,downsample_factor, max_string_length=80, batch_size=32, dim=(512,64), shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_Files = list_Files
        self.shuffle = shuffle
        self.max_string_length = max_string_length
        self.downsample_factor = downsample_factor       

    #TODO: Add weight save
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_Files))
        if self.shuffle==True:

    def __data_generation(self, list_Files_temp):
        #*[2,2] --> 2,2 (unpack values)
        X = np.ones([self.batch_size, *self.dim,1]) 
        y = np.ones([self.batch_size, self.max_string_length])*-1 #As in the keras_ocr example why -1?
        X_length = np.zeros([self.batch_size,1])
        y_length = np.zeros([self.batch_size,1])

        #TODO: add mix with blank inputs as it is said to be important for transitional invariance

        for i, file in enumerate(list_Files_temp):
            im = cv2.imread(file)# load the file as numpy array
            im = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) #Transform the file into a Gray image
            im = cv2.resize(im, self.dim[::-1]) #Resize it (cv2 takes width first)
            im = im / 255 #Normalization

            X[i,0:self.dim[0],:,0] = im
            X_length[i] = self.dim[0] // self.downsample_factor -2 #?????

            seq = text_to_labels(self.labels[file])            
            y[i,0:len(seq)] = text_to_labels(self.labels[file]) #Transform the text into a list of integers

            y_length[i] = len(y[i])

        inputs={'the_input': X,
                'the_labels': y,
        outputs = {'ctc': np.zeros([self.batch_size])}


        return (inputs, outputs)

    def __len__(self):
        'Number of batches per epoch'
        return int(np.floor(len(self.list_Files) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_Files_temp = [self.list_Files[k] for k in indexes]
        (inputs, outputs) = self.__data_generation(list_Files_temp)

        return (inputs, outputs)

def train(dim_images,partition,labels):

    #Misc parameters
    absolute_max_string_length = 80 
    output_size = len(alphabet) + 1 #+1 for the CTC blank symbol

    #Network parameters
    img_h = dim_images[0]
    img_w = dim_images[1]
    conv_filters = 16
    kernel_size = (3,3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512
    act = 'relu'
    input_shape = (*DIM_IMAGES,1)
    downsample_factor = pool_size**2

    #Convolutional layer
    input_data = Input(name='the_input', shape=input_shape)
    inner = Conv2D(conv_filters, kernel_size, padding='same', 
                   activation=act, kernel_initializer='he_normal', name='conv1')(input_data)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
    inner = Conv2D(conv_filters, kernel_size, padding='same',
               activation=act, kernel_initializer='he_normal',
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)

    conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
    inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)

    #Recurrent layer
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
    gru1_merged = add([gru_1, gru_1b])
    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)

    # transforms RNN output to character activations:
    inner = Dense(output_size, kernel_initializer='he_normal',
              name='dense2')(concatenate([gru_2, gru_2b]))

    #Prediction (need to be decoded)
    y_pred = Activation('softmax', name='softmax')(inner)

    Model(inputs=input_data, outputs=y_pred).summary()

    labelsI = Input(name='the_labels',
                   shape =[absolute_max_string_length], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(
        ctc_lambda_func, output_shape=(1,),
        name='ctc')([y_pred, labelsI, input_length, label_length])

    training_generator = DataGenerator(partition['train'],labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=True)
    valid_generator = DataGenerator(partition['valid'], labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=False)

    # clipnorm seems to speeds up convergence
    sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

    model = Model(inputs=[input_data, labelsI, input_length, label_length],

    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

    # captures output of softmax so we can decode the output during visualization
    test_func = K.function([input_data], [y_pred])

        steps_per_epoch=(len(partition['train'])-len(partition['valid'])) // BATCH_SIZE,

I guess the '-1' labels come from this line:

y = np.ones([self.batch_size, self.max_string_length])*-1

In the original code, the there was a similar line (line 220) but it runs well:

self.Y_data = np.ones([self.num_words, self.absolute_max_string_len]) * -1

I thought the '-1' were a way of padding the sequence, but this value seems forbidden by the ctc function, is there something I am missing here?

1 Answers

chalulu On

It seems I just mixed up my image length and image width. Plus, the "label_length" should be equal to the real length of the sentence (before paddding with -1). Therefore the line:

y_length[i] = len(y[i])

Should be replaced by:

y_length[i] = len(seq)