I'm trying to add captions to a video. The desired outcome is to show each word in the exact moment is being said.
I have a method that gives me the accurate time start and end per each word:
def get_words_per_time(audio_speech_file):
model = whisper.load_model("base")
transcribe = model.transcribe(
audio=audio_speech_file, fp16=False, word_timestamps=True
)
segments = transcribe["segments"]
words = []
for seg in segments:
for word in seg["words"]:
words.append(
{
"word": word["word"],
"start": word["start"],
"end": word["end"],
"prob": round(word["probability"], 4),
}
)
return words
Then I have a code that uses MoviePy to create TextClip and assing a given start and end time per pair of words (I know there are redundant statements, srry):
def generate_captions(
words,
font="Komika",
fontsize=32,
color="White",
align="center",
stroke_width=3,
stroke_color="black",
):
text_comp = []
for i in track(range(0, len(words), 2), description="Creating captions..."):
word1 = words[i]
if i + 1 < len(words):
word2 = words[i + 1]
text_clip = TextClip(
f"{word1['word']} {word2['word'] if i + 1 < len(words) else ''}",
font=font, # Change Font if not found
fontsize=fontsize,
color=color,
align=align,
method="caption",
size=(660, None),
stroke_width=stroke_width,
stroke_color=stroke_color,
)
text_clip = text_clip.set_start(word1["start"])
text_clip = text_clip.set_end(
word2["end"] if i + 1 < len(words) else word1["end"]
)
text_comp.append(text_clip)
return text_comp
Finally, I concatenate the words into a single video:
vid_clip = CompositeVideoClip(
[vid_clip, concatenate_videoclips(text_comp).set_position(("center", 860))]
)
The output is this, but you can clearly see the words are not flowing with the speech. They somehow move faster as if the start/end time did not matter. Here's the video
The words with their respective start/end time, look like this:
[
{
'word': 'This',
'start': 0.0,
'end': 0.22,
'prob': 0.805
},
{
'word': 'is',
'start': 0.22,
'end': 0.42,
'prob': 0.9991
},
{
'word': 'a',
'start': 0.42,
'end': 0.6,
'prob': 0.999
},
{
'word': 'test,
',
'start': 0.6,
'end': 1.04,
'prob': 0.9939
},
{
'word': 'to',
'start': 1.18,
'end': 1.3,
'prob': 0.9847
},
{
'word': 'show',
'start': 1.3,
'end': 1.54,
'prob': 0.9971
},
{
'word': 'words',
'start': 1.54,
'end': 1.9,
'prob': 0.995
},
{
'word': 'does',
'start': 1.9,
'end': 2.16,
'prob': 0.997
},
{
'word': 'not',
'start': 2.16,
'end': 2.4,
'prob': 0.9978
},
{
'word': 'appear.',
'start': 2.4,
'end': 2.82,
'prob': 0.9984
},
{
'word': 'At',
'start': 3.46,
'end': 3.6,
'prob': 0.9793
},
{
'word': 'their',
'start': 3.6,
'end': 3.8,
'prob': 0.9984
},
{
'word': 'proper',
'start': 3.8,
'end': 4.22,
'prob': 0.9976
},
{
'word': 'time.',
'start': 4.22,
'end': 4.72,
'prob': 0.999
},
{
'word': 'Thanks',
'start': 5.04,
'end': 5.4,
'prob': 0.9662
},
{
'word': 'for,
',
'start': 5.4,
'end': 5.66,
'prob': 0.9941
},
{
'word': 'watching.',
'start': 5.94,
'end': 6.36,
'prob': 0.7701
}
]
What could be causing this?
The quick solution I found was just to give each word a duration of end - start in seconds. Then, for every speech pause, add a TextClip with one letter and fontSize=1 so it is not visible. This way, the captions appear at the time they should. I know it is not the most orthodox solution, but it works.