I am attempting to run a modified version of the ffmpeg muxing example which outputs vorbis encoded audio to a webm container.
The code works fine if I specify mp3 as the format, just not when I use vorbis
The code is similar to http://www.ffmpeg.org/doxygen/2.0/doc_2examples_2muxing_8c-example.html but with the video portions stripped out. I tested with video enabled and the example video was encoded properly, but with no audio.
ffmpeg is compiled with libvorbis and libvpx support.
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libavformat/avformat.h>
#include <libavutil/mathematics.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
/* Length of the generated audio stream, in seconds. */
#define STREAM_DURATION 200.0
/* Direct handle to the libvorbis encoder object; unused — the encoder is
 * looked up with avcodec_find_encoder() in add_stream() instead. */
extern AVCodec ff_libvorbis_encoder;
/* NOTE(review): this file-scope frame is never allocated in this file;
 * the audio path allocates its own frame per call — confirm before relying on it. */
static AVFrame *frame;
/*
 * Create a new output stream in the container 'oc' using the encoder
 * identified by 'codec_id'. On success returns the stream and stores the
 * located encoder through 'codec'; on any failure the process exits.
 * Audio streams are pre-configured for 44.1 kHz stereo planar-float at
 * 64 kb/s (what libvorbis expects).
 */
static AVStream *add_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
{
    /* Locate the requested encoder; abort if this build lacks it. */
    //*codec = &ff_libvorbis_encoder;
    *codec = avcodec_find_encoder(codec_id);
    if (*codec == NULL) {
        fprintf(stderr, "Could not find encoder for '%s'\n",
                avcodec_get_name(codec_id));
        exit(1);
    }

    AVStream *stream = avformat_new_stream(oc, *codec);
    if (stream == NULL) {
        fprintf(stderr, "Could not allocate stream\n");
        exit(1);
    }
    stream->id = oc->nb_streams - 1;

    AVCodecContext *ctx = stream->codec;
    if ((*codec)->type == AVMEDIA_TYPE_AUDIO) {
        ctx->sample_fmt  = AV_SAMPLE_FMT_FLTP;
        ctx->bit_rate    = 64000;
        ctx->sample_rate = 44100;
        ctx->channels    = 2;
    }

    /* Some formats want stream headers to be separate. */
    if (oc->oformat->flags & AVFMT_GLOBALHEADER)
        ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;

    return stream;
}
/* Sine-sweep generator state: current phase, per-sample phase increment,
 * and per-sample increment of the increment (frequency ramps upward). */
static float t, tincr, tincr2;
/* Source sample buffer the generator writes into (allocated in open_audio). */
static uint8_t **src_samples_data;
static int src_samples_linesize;
static int src_nb_samples;
/* Destination (encoder-format) sample buffer and its current capacity. */
static int max_dst_nb_samples;
uint8_t **dst_samples_data;
int dst_samples_linesize;
int dst_samples_size;
/* Resampler converting generator samples to the encoder's sample format;
 * stays NULL when no conversion is needed. */
struct SwrContext *swr_ctx = NULL;
/*
 * Open the audio encoder on st->codec, initialize the sine generator, and
 * allocate the source (S16) and destination (encoder-format) sample buffers,
 * plus a resampler between them when the formats differ. Exits on failure.
 * NOTE(review): reconstructed — the pasted source lost every "< 0" error
 * check to HTML escaping; restored from the muxing.c example this is based on.
 */
static void open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st) {
    AVCodecContext *c;
    int ret;
    (void)oc; /* container context not needed here */
    c = st->codec;

    /* open it */
    ret = avcodec_open2(c, codec, NULL);
    if (ret < 0) {
        fprintf(stderr, "Could not open audio codec: %s\n", av_err2str(ret));
        exit(1);
    }

    /* init signal generator: sine starting at 110 Hz */
    t = 0;
    tincr = 2 * M_PI * 110.0 / c->sample_rate;
    /* increment frequency by 110 Hz per second */
    tincr2 = 2 * M_PI * 110.0 / c->sample_rate / c->sample_rate;

    src_nb_samples = c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE ?
        10000 : c->frame_size;

    /* BUG FIX: the generator produces interleaved S16, so the source buffer
     * must be allocated as AV_SAMPLE_FMT_S16 — not as the encoder's format
     * (FLTP); the resampler below performs the conversion. */
    ret = av_samples_alloc_array_and_samples(&src_samples_data, &src_samples_linesize,
                                             c->channels, src_nb_samples,
                                             AV_SAMPLE_FMT_S16, 0);
    if (ret < 0) {
        fprintf(stderr, "Could not allocate source samples\n");
        exit(1);
    }

    /* libvorbis wants planar float; create a resampler whenever the encoder
     * format differs from the generator's S16 */
    if (c->sample_fmt != AV_SAMPLE_FMT_S16) {
        swr_ctx = swr_alloc();
        if (!swr_ctx) {
            fprintf(stderr, "Could not allocate resampler context\n");
            exit(1);
        }
        /* set options */
        av_opt_set_int       (swr_ctx, "in_channel_count",  c->channels,       0);
        av_opt_set_int       (swr_ctx, "in_sample_rate",    c->sample_rate,    0);
        av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt",     AV_SAMPLE_FMT_S16, 0);
        av_opt_set_int       (swr_ctx, "out_channel_count", c->channels,       0);
        av_opt_set_int       (swr_ctx, "out_sample_rate",   c->sample_rate,    0);
        /* generalized from hard-coded FLTP: follow whatever the encoder asked for */
        av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt",    c->sample_fmt,     0);

        /* initialize the resampling context */
        if ((ret = swr_init(swr_ctx)) < 0) {
            fprintf(stderr, "Failed to initialize the resampling context\n");
            exit(1);
        }
    }

    /* destination buffer starts the same size as the source; it is grown
     * on demand in write_audio_frame when the resampler buffers samples */
    max_dst_nb_samples = src_nb_samples;
    ret = av_samples_alloc_array_and_samples(&dst_samples_data, &dst_samples_linesize,
                                             c->channels, max_dst_nb_samples,
                                             c->sample_fmt, 0);
    if (ret < 0) {
        fprintf(stderr, "Could not allocate destination samples\n");
        exit(1);
    }
    dst_samples_size = av_samples_get_buffer_size(NULL, c->channels, max_dst_nb_samples,
                                                  c->sample_fmt, 0);
}
static void get_audio_frame(int16_t *samples, int frame_size, int nb_channels)
{
int j, i, v;
int16_t *q;
q = samples;
for (j = 0; j codec;
get_audio_frame((int16_t *)src_samples_data[0], src_nb_samples, c->channels);
/* convert samples from native format to destination codec format, using the resampler */
if (swr_ctx) {
/* compute destination number of samples */
dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx, c->sample_rate) + src_nb_samples,
c->sample_rate, c->sample_rate, AV_ROUND_UP);
if (dst_nb_samples > max_dst_nb_samples) {
av_free(dst_samples_data[0]);
ret = av_samples_alloc(dst_samples_data, &dst_samples_linesize, c->channels,
dst_nb_samples, c->sample_fmt, 0);
if (ret channels, dst_nb_samples,
c->sample_fmt, 0);
}
/* convert to destination format */
ret = swr_convert(swr_ctx,
dst_samples_data, dst_nb_samples,
(const uint8_t **)src_samples_data, src_nb_samples);
if (ret nb_samples = dst_nb_samples;
avcodec_fill_audio_frame(frame, c->channels, c->sample_fmt,
dst_samples_data[0], dst_samples_size, 0);
ret = avcodec_encode_audio2(c, &pkt, frame, &got_packet);
if (ret index;
/* Write the compressed frame to the media file. */
ret = av_interleaved_write_frame(oc, &pkt);
if (ret != 0) {
fprintf(stderr, "Error while writing audio frame: %s\n",
av_err2str(ret));
exit(1);
}
avcodec_free_frame(&frame);
}
/* Shut down the audio encoder and release both sample buffers. */
static void close_audio(AVFormatContext *oc, AVStream *st)
{
    (void)oc; /* container not needed for teardown */
    avcodec_close(st->codec);
    av_free(dst_samples_data[0]);
    av_free(src_samples_data[0]);
}
int main(int argc, char *argv[]) {
AVOutputFormat *fmt;
AVFormatContext *oc;
AVStream *audio_st;
AVCodec *audio_codec;
double audio_time, video_time;
int ret = 0;
const char *input = argv[1];
const char *output = argv[2];
av_register_all();
avformat_alloc_output_context2(&oc, NULL, NULL, output);
if(!oc) {
printf("Could not alloc the output context");
return 1;
}
fmt = oc->oformat;
audio_st = NULL;
if(fmt->audio_codec != AV_CODEC_ID_NONE) {
audio_st = add_stream(oc, &audio_codec, fmt->audio_codec);
printf("Started audio stream with codec %s\n", audio_codec->name);
}
if(audio_st) {
open_audio(oc, audio_codec, audio_st);
}
av_dump_format(oc, 0, output, 1);
if (!(fmt->flags & AVFMT_NOFILE)) {
ret = avio_open(&oc->pb, output, AVIO_FLAG_WRITE);
if (ret pts = 0;
for (;;) {
audio_time = audio_st ? audio_st->pts.val * av_q2d(audio_st->time_base) : 0.0;
if ((!audio_st || audio_time >= STREAM_DURATION))
break;
write_audio_frame(oc, audio_st);
}
av_write_trailer(oc);
if(audio_st)
close_audio(oc, audio_st);
if(!(fmt->flags & AVFMT_NOFILE))
avio_close(oc->pb);
avformat_free_context(oc);
return 0;
}
compiled with
clang -o converter -lavcodec -lavformat -lavutil -lswresample -lvorbis converter.c
output
~/v/converter> ./converter test.wav test.webm
Started audio stream with codec libvorbis
Output #0, webm, to 'test.webm':
Stream #0:0: Audio: vorbis (libvorbis), 44100 Hz, 2 channels, fltp, 64 kb/s
[libvorbis @ 0x7fdafb800600] 33 frames left in the queue on closing
So it turns out the answer is to properly set up the presentation timestamps (pts) on each audio frame before encoding; without them the muxer drops the queued frames on close.