I have been struggling to create an automatic speech recognition neural network using TensorFlow, trained on the Hugging Face Mozilla Common Voice 11 dataset. The model seems to train well for around 100 batches before the loss suddenly goes to infinity.
Here is the code for the data preprocessing:
import glob

import datasets
import tensorflow as tf
import tensorflow_io as tfio

dataset = datasets.load_dataset("mozilla-foundation/common_voice_11_0", "en")
dataset = dataset.remove_columns(['client_id', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])

def prepare_dataset(batch):
    wav_file = batch['path']
    # Remove file name
    split = wav_file.split("\\")
    joined = "\\".join(split[:-1]) + "\\"
    # Get the train number
    complete_path = glob.glob(joined + "*")
    # Combine all the parts
    file = complete_path[0] + "\\" + split[-1]
    batch['path'] = file
    return batch

train_dataset = dataset['train'].map(prepare_dataset).shuffle(len(dataset['train']))
val_dataset = dataset['validation'].map(prepare_dataset).shuffle(len(dataset['validation']))
frame_length = 256
frame_step = 160
fft_length = 384
def load_mp3(wav_file):
    # Load the mp3, resample it to 8 kHz, and apply a logarithmic fade in/out
    audio = tfio.audio.AudioIOTensor(wav_file, dtype=tf.float32)
    sample_rate = tf.cast(audio.rate, dtype=tf.int64)
    audio = tf.squeeze(audio.to_tensor())
    audio = tfio.audio.resample(audio, rate_in=sample_rate, rate_out=8000)
    audio = tfio.audio.fade(audio, fade_in=1000, fade_out=2000, mode="logarithmic")
    return audio
def convert_to_spect(audio):
    # STFT magnitude spectrogram, square-root compressed
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # SpecAugment-style frequency and time masking
    spectrogram = tfio.audio.freq_mask(spectrogram, param=25)
    spectrogram = tfio.audio.time_mask(spectrogram, param=25)
    spectrogram = tfio.audio.freq_mask(spectrogram, param=25)
    spectrogram = tfio.audio.time_mask(spectrogram, param=25)
    # Normalise each frame across the frequency bins
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    return spectrogram
def process_text(label):
    # Lowercase, split into characters, and map each character to an integer id
    label = tf.strings.lower(label)
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    label = char_to_num(label)
    return label
def encode_mozilla_sample(wav_file, label):
    audio = load_mp3(wav_file)
    spectrogram = convert_to_spect(audio)
    label = process_text(label)
    return spectrogram, label
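(char_to_num, used in process_text, is a character-to-id vocabulary lookup defined elsewhere in the script and not shown here. The encoded samples are then fed to the model through a padded-batch tf.data pipeline roughly like the sketch below; the vocabulary, batch size, and column names here are illustrative placeholders rather than my exact code.)

# Sketch only: assumed character vocabulary lookup (the real definition is not shown above)
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
char_to_num = tf.keras.layers.StringLookup(vocabulary=characters, oov_token="")

# Illustrative padded-batch pipeline; the batch size is a placeholder
batch_size = 32
train_ds = tf.data.Dataset.from_tensor_slices(
    (list(train_dataset['path']), list(train_dataset['sentence']))
)
train_ds = (
    train_ds.map(encode_mozilla_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
# val_ds is built the same way from val_dataset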
And here is the code for the model:
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
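# Illustrative note, not part of the script: CTCLoss expects dense integer labels of
# shape (batch, max_label_len) and softmax outputs of shape (batch, time_steps, vocab_size + 1).
# With made-up shapes it returns finite per-sample values, e.g.:
#     dummy_pred = tf.nn.softmax(tf.random.normal((2, 50, 32)), axis=-1)
#     dummy_true = tf.random.uniform((2, 10), minval=1, maxval=31, dtype=tf.int32)
#     CTCLoss(dummy_true, dummy_pred)  # -> shape (2, 1), finite losses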
def build_model(input_dim, output_dim, rnn_layers=5, conv_units=128, rnn_units=128, dropout=0.5):
    input_spectrogram = tf.keras.layers.Input((None, input_dim), name="input")
    x = tf.keras.layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Conv layers
    x = tf.keras.layers.Conv2D(
        filters=conv_units,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = tf.keras.layers.BatchNormalization(name="conv_1_bn")(x)
    x = tf.keras.layers.ReLU(name="conv_1_relu")(x)
    x = tf.keras.layers.Conv2D(
        filters=conv_units,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = tf.keras.layers.BatchNormalization(name="conv_2_bn")(x)
    x = tf.keras.layers.ReLU(name="conv_2_relu")(x)
    x = tf.keras.layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = tf.keras.layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = tf.keras.layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        x = tf.keras.layers.BatchNormalization(name=f"rnn_{i}_bn")(x)
        if i < rnn_layers:
            x = tf.keras.layers.Dropout(rate=dropout)(x)
    # Dense layer
    x = tf.keras.layers.Dense(units=rnn_units * 2, activation="gelu", name="dense_1")(x)
    x = tf.keras.layers.Dropout(rate=dropout)(x)
    # Classification layer
    output = tf.keras.layers.Dense(units=output_dim + 1, activation="softmax", name="output_layer")(x)
    # Model
    model = tf.keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model
# Get the model
model = build_model(
    input_dim=fft_length // 2 + 1,  # 384 // 2 + 1 = 193 frequency bins from the STFT
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=32,
    conv_units=32,
    rnn_layers=5,
    dropout=0.5,
)
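The model is then trained with a standard fit call along these lines (simplified sketch; the epoch count is a placeholder, and train_ds / val_ds are the padded-batch pipelines built from encode_mozilla_sample above):

# Simplified sketch of the training call; callbacks and checkpointing omitted
history = model.fit(train_ds, validation_data=val_ds, epochs=50)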
Versions: tensorflow: 2.10.1
python: 3.9.12
gpu: Nvidia GeForce RTX 3080
OS: Windows 11
cuDNN: 8.1
CUDA: 11.2
I have tried increasing the batch size, expecting the model to generalize better, but any batch size of 256 or higher caused the GPU to run out of memory. The infinite loss occurs with any batch size of 128 or less. I have also tried increasing the batch size while using less data, but the result is the same. I thought that reducing the size of the network would help, but no matter what, the loss goes to infinity after reaching a value of around 200. A few other changes I have tried are activation functions (ReLU, LeakyReLU, GELU), optimizers (SGD, Adam, AdamW), and the number of RNN/conv layers.
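For reference, the optimizer experiments were one-line swaps along these lines (the learning rate and weight decay shown are placeholders, not the exact values I used):

# Placeholder examples of the optimizer swaps passed to model.compile
opt = tf.keras.optimizers.SGD(learning_rate=0.01)
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
opt = tf.keras.optimizers.experimental.AdamW(learning_rate=0.01, weight_decay=0.004)  # AdamW lives under the experimental namespace in TF 2.10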
Note: I have considered using a pretrained model, but I have always wanted to successfully build ASR from scratch in TensorFlow. Will it be possible to get even moderately acceptable results with my GPU and data, or will I have to resort to using wav2vec?
Another note: I was first inspired to create this project after watching the video https://www.youtube.com/watch?v=YereI6Gn3bM by "The A.I. Hacker - Michael Phi", which first convinced me that this was possible. I had previously thought that my computer would not be able to handle this task, but after seeing him do it with PyTorch, similar computer specs, and the same data, I thought I would be able to as well.
Update: I have recently tried replacing the 2D conv layers with a single 1D conv layer, making the GRU layers unidirectional, and going back to the AdamW optimizer, but nothing has changed.
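Roughly, that simplified variant replaced the front end and recurrent blocks with something like this (a sketch; the exact filter and kernel sizes are not reproduced here):

# Sketch of the simplified variant: single 1D conv front end and unidirectional GRUs
x = tf.keras.layers.Conv1D(filters=conv_units, kernel_size=11, strides=2,
                           padding="same", use_bias=False, name="conv_1")(input_spectrogram)
x = tf.keras.layers.BatchNormalization(name="conv_1_bn")(x)
x = tf.keras.layers.ReLU(name="conv_1_relu")(x)
for i in range(1, rnn_layers + 1):
    x = tf.keras.layers.GRU(units=rnn_units, return_sequences=True, name=f"gru_{i}")(x)
    x = tf.keras.layers.BatchNormalization(name=f"rnn_{i}_bn")(x)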
Thanks for the solution. I just changed the number of neurons in the second-to-last dense layer to 512 and the model is currently running without error. Now I just have to figure out how to improve the model so I can finally wrap up this project.
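In other words, the only change in build_model was widening dense_1:

# Second-to-last dense layer widened from rnn_units * 2 (= 64 here) to 512
x = tf.keras.layers.Dense(units=512, activation="gelu", name="dense_1")(x)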