Adding H.264 frames to an MP4 file

I have raw H.264 video frames:

Stream #0:0: Video: h264 (Main), yuvj420p(pc, bt709, progressive), 1280x720, 25 fps, 25 tbr, 1200k tbn, 50 tbc

and raw audio frames:

Stream #0:0: Audio: pcm_s16le, 16000 Hz, 1 channels, s16, 256 kb/s

I also have a list of timestamps for each frame, in microseconds (shown here in hex):

600 0xd96533 (audio)
601 0xd9e1dd (audio)
602 0xda4f52 (audio)
603 0xda5a63 (video)
604 0xdacc4b (audio)
605 0xdb39a3 (audio)
606 0xdb5ee9 (video)
607 0xdbb6d8 (audio)
608 0xdc23fe (audio)
609 0xdcb255 (audio)
610 0xdd0e69 (audio)
611 0xdd8b96 (audio)
612 0xdd67d0 (video)
613 0xddf8bd (audio)

Note that the timestamp difference between two consecutive audio frames is ~0.032 s or ~0.028 s (averaging roughly 0.03 s),

and the timestamp difference between two consecutive video frames is a multiple of ~0.06666 s (0.0666, 0.1333, 0.2).

This data was captured from a camera that, according to its spec, captures at most 15 fps.

I want to merge them into one mp4 file.
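
(For reference, the way I would naively map a capture timestamp in microseconds into a stream time base is sketched below; I'm not sure this is the right approach, which is part of what I'm asking.)

#include <libavutil/mathematics.h>

/* naive helper: convert a capture timestamp in microseconds into ticks of
   an arbitrary destination time base dst_tb (whatever the output stream
   ends up using); not verified to be the right approach for this case */
static int64_t us_to_ticks(int64_t timestamp_us, AVRational dst_tb)
{
    return av_rescale_q(timestamp_us, (AVRational){1, 1000000}, dst_tb);
}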

Raw video frame info (ffprobe -show_frames output):

[FRAME]
media_type=video
stream_index=0
key_frame=1
pkt_pts=N/A
pkt_pts_time=N/A
-> pkt_dts=N/A
-> pkt_dts_time=N/A
best_effort_timestamp=N/A
best_effort_timestamp_time=N/A
-> pkt_duration=48000
-> pkt_duration_time=0.040000
pkt_pos=1476573
pkt_size=57677
width=1280
height=720
pix_fmt=yuvj420p
sample_aspect_ratio=N/A
pict_type=I
coded_picture_number=189
display_picture_number=0
interlaced_frame=0
top_field_first=0
repeat_pict=0
color_range=pc
color_space=bt709
color_primaries=bt709
color_transfer=bt709
chroma_location=left
[/FRAME]
[FRAME]
media_type=video
stream_index=0
key_frame=0
pkt_pts=N/A
pkt_pts_time=N/A
-> pkt_dts=N/A
-> pkt_dts_time=N/A
best_effort_timestamp=N/A
best_effort_timestamp_time=N/A
-> pkt_duration=48000
-> pkt_duration_time=0.040000
pkt_pos=1534250
pkt_size=3928
width=1280
height=720
pix_fmt=yuvj420p
sample_aspect_ratio=N/A
pict_type=P
coded_picture_number=190
display_picture_number=0
interlaced_frame=0
top_field_first=0
repeat_pict=0
color_range=pc
color_space=bt709
color_primaries=bt709
color_transfer=bt709
chroma_location=left
[/FRAME]

The resulting frames should have values similar to this:

Video frames:

[FRAME]
media_type=video
stream_index=0
key_frame=0
pkt_pts=N/A
pkt_pts_time=N/A
-> pkt_dts=500
-> pkt_dts_time=16.666667
best_effort_timestamp=500
best_effort_timestamp_time=16.666667
-> pkt_duration=1
-> pkt_duration_time=0.033333
pkt_pos=1772182
pkt_size=3070
width=1280
height=720
pix_fmt=yuvj420p
sample_aspect_ratio=N/A
pict_type=P
coded_picture_number=191
display_picture_number=0
interlaced_frame=0
top_field_first=0
repeat_pict=0
color_range=pc
color_space=bt709
color_primaries=bt709
color_transfer=bt709
chroma_location=left
[/FRAME]

pkt_duration_time is always 0.033333; the pkt_dts values of a stream are either all even or all odd (never mixed), and pkt_dts almost always advances by 2, but sometimes by 4.
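
In other words, my reading of the reference timing (the 1/30 time base is inferred from pkt_duration_time = 0.033333, it is not something I know for certain):

/* inferred, not confirmed: the reference stream appears to use a 1/30 time base */
AVRational ref_video_tb = {1, 30};
double dts_time   = 500 * av_q2d(ref_video_tb); /* 16.666667 s, matches the dump above  */
double frame_step = 2   * av_q2d(ref_video_tb); /* 0.066667 s, about one frame at 15 fps */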

Audio frames:

[FRAME]
media_type=audio
stream_index=1
key_frame=1
pkt_pts=0
pkt_pts_time=0.000000
pkt_dts=0
pkt_dts_time=0.000000
best_effort_timestamp=0
best_effort_timestamp_time=0.000000
pkt_duration=480
pkt_duration_time=0.030000
pkt_pos=608
pkt_size=960
sample_fmt=s16
nb_samples=480
channels=1
channel_layout=unknown
[/FRAME]
[FRAME]
media_type=audio
stream_index=1
key_frame=1
pkt_pts=480
pkt_pts_time=0.030000
pkt_dts=480
pkt_dts_time=0.030000
best_effort_timestamp=480
best_effort_timestamp_time=0.030000
pkt_duration=480
pkt_duration_time=0.030000
pkt_pos=1654
pkt_size=960
sample_fmt=s16
nb_samples=480
channels=1
channel_layout=unknown
[/FRAME]
[FRAME]
media_type=audio
stream_index=1
key_frame=1
pkt_pts=960
pkt_pts_time=0.060000
pkt_dts=960
pkt_dts_time=0.060000
best_effort_timestamp=960
best_effort_timestamp_time=0.060000
pkt_duration=480
pkt_duration_time=0.030000
pkt_pos=2726
pkt_size=960
sample_fmt=s16
nb_samples=480
channels=1
channel_layout=unknown
[/FRAME]

These are the sequences I have worked out:

//Audio
frame_len=480
pkt_duration_time=0.030000
pkt_pts=frame_len*frame_index
pkt_pts_time=pkt_duration_time*frame_index
pkt_pos=LAST_FRAME_POS + ~1000 //or timestamp_us/x ?
//Video
pkt_duration_time=0.033333
pkt_dts=(2 or 4)*frame_index
pkt_dts_time=LAST_FRAME_DTS+pkt_duration_time
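
Written out as code, the pattern I think I'm seeing in the reference file is roughly this (the 1/16000 audio and 1/30 video time bases are inferred from the durations above, and first_video_ts_us is a hypothetical start-of-capture anchor):

/* audio: time base 1/16000, one 480-sample packet every 0.030 s */
static int64_t ref_audio_pts(int64_t frame_index)
{
    return 480 * frame_index;                 /* 480 ticks == 0.030 s */
}

/* video: time base 1/30, ticks derived from the capture timestamp, which
   would explain why the dts advances by 2 or sometimes 4 (skipped frames) */
static int64_t ref_video_dts(int64_t timestamp_us, int64_t first_video_ts_us)
{
    return av_rescale(timestamp_us - first_video_ts_us, 30, 1000000);
}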

Here is my current code for adding a video frame:

#include <libavformat/avformat.h>

AVFormatContext *format_context;
AVStream *out_stream;
int64_t last_pts = 0;
int64_t last_timestamp = 0;

void init_out_stream(){
        out_stream->id = 0;
        out_stream->time_base = (AVRational){1, 30}; //<-------------
        out_stream->codec->codec_id   = AV_CODEC_ID_H264;
        out_stream->codec->width      = 1280;
        out_stream->codec->height     = 720;
        out_stream->codec->pix_fmt    = AV_PIX_FMT_YUV420P;
}
int WriteH264VideoSample(unsigned char *sample, unsigned int sample_size, int iskeyframe, unsigned long long int timestamp_us){

        AVPacket packet = { 0 };
        av_init_packet(&packet);

        packet.stream_index = 0;
        packet.data         = sample;
        packet.size         = sample_size;
        packet.pos          = -1;

        int64_t timestamp = timestamp_us / 1000; // to ms
        /* pts = last pts + the difference between the last and current
           timestamps, expressed in time-base units (1/30, i.e. ~33 ms) */
        packet.pts = last_pts + (timestamp - last_timestamp) / 33;
        last_pts = packet.pts;
        packet.dts = packet.pts;
        last_timestamp = timestamp;
        packet.duration = 0;

        av_packet_rescale_ts(&packet, (AVRational){1, 25}, out_stream->time_base); //<-------------

        if (iskeyframe) {
            packet.flags |= AV_PKT_FLAG_KEY;
        }

        if (av_interleaved_write_frame(format_context, &packet) < 0) {
            printf("Fail to write frame\n");
            return 0;
        }

        //file_duration += duration;

        return 1;
}


int main(){
        avformat_alloc_output_context2(&format_context, 0, "avi", 0);
        out_stream = avformat_new_stream(format_context, 0);
        init_out_stream();

        return 0;
}
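
(The usual open/header/trailer calls around this are omitted above; for completeness, the standard libavformat pattern is roughly the following, with "out.mp4" as a placeholder filename.)

/* placeholder filename, error handling omitted */
if (!(format_context->oformat->flags & AVFMT_NOFILE))
    avio_open(&format_context->pb, "out.mp4", AVIO_FLAG_WRITE);
avformat_write_header(format_context, NULL);  /* before the first packet */
/* ... WriteH264VideoSample() and the audio packets go here ... */
av_write_trailer(format_context);             /* after the last packet   */
avio_closep(&format_context->pb);
avformat_free_context(format_context);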

However, the pts I compute doesn't sync correctly: with my code the pts sometimes jumps by 3 and sometimes by 2 each frame, whereas in the synced result it should jump by 2 or 4 (and be all even or all odd per stream).

For the audio I tried:

AVPacket packet = { 0 };
av_init_packet(&packet);

packet.stream_index = 1;
packet.data         = sample;
packet.size         = 960;
packet.pos          = -1;

/* 32000/2 = 16000; 16000/33.33333 = ~480 */
/* 28000/2 = 14000; 14000/33.33333 = ~420 ?? */
int64_t timestamp = timestamp_us / 2;
/* last_audio_pts is a persistent int64_t that starts at 0 */
packet.pts = last_audio_pts + round(timestamp / 33.333333333333);
packet.dts = packet.pts;
last_audio_pts = packet.pts;

packet.duration = 0;

av_packet_rescale_ts(&packet, (AVRational){1, 25}, (AVRational){1, 30});

In this case every frame has the correct info, but pkt_duration comes out as 240 instead of 480 and pkt_pts_time jumps by 0.06 s instead of 0.03 s.
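
For reference, my understanding of what the av_packet_rescale_ts call does (this may be exactly where my numbers go wrong):

/* my understanding: av_packet_rescale_ts rescales the packet's pts, dts and
   duration from the source time base to the destination time base, roughly: */
packet.pts      = av_rescale_q(packet.pts,      (AVRational){1, 25}, (AVRational){1, 30});
packet.dts      = av_rescale_q(packet.dts,      (AVRational){1, 25}, (AVRational){1, 30});
packet.duration = av_rescale_q(packet.duration, (AVRational){1, 25}, (AVRational){1, 30});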

What is wrong with my calculation? Thanks.
