Getting flite to output audio with PortAudio

2k views Asked by At

I am trying to get the flite speech synthesis library to work on my Mac, but my sound architecture isn't supported within the flite library. To fix that problem, I am using PortAudio to playback the synthesized audio; so I had to do a little bit of hacking within the audio.c file to get flite to use that library. I managed to get everything compiling just fine after mucking around in the GNU AutoTools for a while, but then I run the program and get this output:

$ ./flite -t "test"
frameIndex: 0
maxFrameIndex: 0
numChannels: 1
numSamples: 7225
sampleRate: 8000
=== Now playing back. ===
Waiting for playback to finish.
frameIndex in callback: -2008986336
maxFrameIndex in callback: 32655
numChannels in callback: 152579008
numSamples in callback: 0
sampleRate in callback: 0
Segmentation fault: 11  

$ ./flite -t "test"
frameIndex: 0
maxFrameIndex: 0
numChannels: 1
numSamples: 7225
sampleRate: 8000
=== Now playing back. ===
Waiting for playback to finish.
frameIndex in callback: -71217888
maxFrameIndex in callback: 32712
numChannels in callback: 232979392
numSamples in callback: 0
sampleRate in callback: 0
Segmentation fault: 11

Here is the relevant code from the audio.c file, which is called when I supply the command line argument -t. I marked the area of interest where the segmentation fault occurs in the playCallback() function after a bit of debugging.

static int playCallback( const void *inputBuffer, void *outputBuffer,
                        unsigned long framesPerBuffer,
                        const PaStreamCallbackTimeInfo* timeInfo,
                        PaStreamCallbackFlags statusFlags,
                        void *userData )
{
    cst_wave *data = (cst_wave*)userData;
    short *rptr = &data->samples[data->frameIndex * data->num_channels];
    short *wptr = (short*)outputBuffer;
    unsigned int i;
    int finished;
    unsigned int framesLeft = cst_wave_maxFrameIndex(data) - cst_wave_frameIndex(data);

    (void) inputBuffer; /* Prevent unused variable warnings. */
    (void) timeInfo;
    (void) statusFlags;
    (void) userData;

    printf("frameIndex in callback: %d\n", cst_wave_frameIndex(data));
    printf("maxFrameIndex in callback: %d\n", cst_wave_maxFrameIndex(data));
    printf("numChannels in callback: %d\n", cst_wave_num_channels(data));
    printf("numSamples in callback: %d\n", cst_wave_num_samples(data));
    printf("sampleRate in callback: %d\n\n", cst_wave_sample_rate(data));

    if( framesLeft < framesPerBuffer )
    {
        /* final buffer... */
        for( i=0; i<framesLeft; i++ )
        {
            *wptr++ = *rptr++;  /* left */
            if( cst_wave_num_channels(data) == 2 ) *wptr++ = *rptr++;  /* right */
        }
        for( ; i<framesPerBuffer; i++ )
        {
            *wptr++ = 0;  /* left */
            if( cst_wave_num_channels(data) == 2) *wptr++ = 0;  /* right */
        }
        data->frameIndex += framesLeft;
        finished = paComplete;
    }
    else
    {
        for( i=0; i<framesPerBuffer; i++ )
        {
            *wptr++ = *rptr++;  /* left */
            if( cst_wave_num_channels(data) == 2 ) *wptr++ = *rptr++;  /* right */
        }
        cst_wave_set_frameIndex(data, framesPerBuffer);
        finished = paContinue;
    }
    return finished;
}

int play_wave(cst_wave *w)
{
    PaStream* stream;
    PaStreamParameters outputParameters;
    cst_wave_set_frameIndex(w, 0);
    cst_wave_set_maxFrameIndex(w, (cst_wave_num_samples(w) / cst_wave_sample_rate(w)) * cst_wave_num_channels(w) * sizeof(short));
    int err = 0;
    err = Pa_Initialize();
    outputParameters.device = Pa_GetDefaultOutputDevice();
    if (outputParameters.device == paNoDevice)
    {
        fprintf(stderr,"Error: No default output device.\n");
        return -5;
    }
    printf("frameIndex: %d\n", cst_wave_frameIndex(w));
    printf("maxFrameIndex: %d\n", cst_wave_maxFrameIndex(w));
    printf("numChannels: %d\n", cst_wave_num_channels(w));
    printf("numSamples: %d\n", cst_wave_num_samples(w));
    printf("sampleRate: %d\n", cst_wave_sample_rate(w));

    outputParameters.channelCount = cst_wave_num_channels(w);
    outputParameters.sampleFormat = paInt16;
    outputParameters.suggestedLatency = Pa_GetDeviceInfo( outputParameters.device )->defaultLowOutputLatency;
    outputParameters.hostApiSpecificStreamInfo = NULL;
    puts("=== Now playing back. ===");
    err = Pa_OpenStream(&stream,
                        NULL, /* no input */
                        &outputParameters,
                        cst_wave_sample_rate(w),
                        512,
                        paClipOff,
                        playCallback,
                        &w);
    if( stream )
    {
        err = Pa_StartStream( stream );
        if( err != paNoError ) goto done;

        puts("Waiting for playback to finish.");

        while((err = Pa_IsStreamActive(stream)) == 1) Pa_Sleep(100);
        if( err < 0 ) goto done;

        err = Pa_CloseStream( stream );
        if( err != paNoError ) goto done;

        puts("Done.");
    }
done:
    Pa_Terminate();
    free(cst_wave_samples(w));
}

Because it is relevant, I also slightly modified the cst_wave structure in cst_wave.h so that it contains my data I have to store, as well as adding a few #defines to the ones that were already present:

typedef struct  cst_wave_struct {
    const char *type;
    int frameIndex;
    int maxFrameIndex;
    int sample_rate;
    int num_samples;
    int num_channels;
    short *samples;
} cst_wave;

#define cst_wave_num_samples(w) (w?w->num_samples:0)
#define cst_wave_num_channels(w) (w?w->num_channels:0)
#define cst_wave_sample_rate(w) (w?w->sample_rate:0)
#define cst_wave_samples(w) (w->samples)
#define cst_wave_frameIndex(w) (w->frameIndex)
#define cst_wave_maxFrameIndex(w) (w->maxFrameIndex)

#define cst_wave_set_num_samples(w,s) w->num_samples=s
#define cst_wave_set_num_channels(w,s) w->num_channels=s
#define cst_wave_set_sample_rate(w,s) w->sample_rate=s
#define cst_wave_set_frameIndex(w,s) w->frameIndex=s
#define cst_wave_set_maxFrameIndex(w,s) w->maxFrameIndex=s

Update 1:

Following the advice of @Rohan now gives me this output:

$ ./bin/flite -t "test"
frameIndex: 0
maxFrameIndex: 0
numChannels: 1
numSamples: 7225
sampleRate: 8000
=== Now playing back. ===
Waiting for playback to finish.
frameIndex in callback: 0
maxFrameIndex in callback: 0
numChannels in callback: 1
numSamples in callback: 7225
sampleRate in callback: 8000

Done.
flite(68929,0x7fff71c0d310) malloc: *** error for object 0x7fd6e2809800: pointer being freed was not allocated
*** set a breakpoint in malloc_error_break to debug
Abort trap: 6

To fix that, I removed the free(cst_wave_samples(w));. Now the program executes normally with no visible errors, but there is still no audio output on my Mac. Any suggestions?

3

There are 3 answers

2
Iwillnotexist Idonotexist On BEST ANSWER

You're in luck. I was able to compile both PortAudio and flite on my own Mac, and solve your problem.

You have several issues other than those mentioned before, all of which I have addressed in the code dump below.

  • Minor: You don't use consistently your own API for cst_wave.
  • Minor: I prefer to enclose my while and if blocks with {} always. This has a habit of preventing mysterious bugs.
  • Max Frames was being set to zero. That's because in (cst_wave_num_samples(w) / cst_wave_sample_rate(w)) * cst_wave_num_channels(w) * sizeof(short), you were dividing by the sample rate, which was greater than your number of samples. Given that integer division is left-associative and truncating, yadda yadda yadda zero.
  • Max Frames is still wrong, as a frame includes all channel samples. The number of frames is thus agnostic to both number of channels and the size of the samples themselves. Allowing myself to guess that flite misuses sample to mean frame, your max frame index is just cst_wave_num_samples(w). Else it will be cst_wave_num_samples(w) / cst_wave_num_channels(w).
  • PortAudio's documentation states you should call Pa_StopStream(stream) after the stream becomes inactive, whether or not you were waiting until it became so.
  • I simplified the callback, and corrected it for
    • Minor: Consistent use of your API
    • MAJOR: Ehm... cst_wave_set_frameIndex(data, framesPerBuffer); is definitely wrong. You're pinning yourself at frame index 512 instead of incrementing! That's because you asked for 512 frames per buffer when opening the stream and you're not incrementing the frame index by framesPerBuffer, you're setting the frame index to framesPerBuffer. You hadn't made it that far because your maxFrameIndex was 0 anyways so you were exiting. I fixed it so that the frame index increments - with your API of course.

Here is the code, which I took the freedom of documenting and cleaning until it approached my standards of elegance. Enjoy!

#include <stdio.h>
#include <string.h>

/**
 * Audio play callback.
 * 
 * Follows the PaStreamCallback signature, wherein:
 * 
 * @param input   and
 * @param output  are either arrays of interleaved samples or; if
 *                non-interleaved samples were requested using the
 *                paNonInterleaved sample format flag, an array of buffer
 *                pointers, one non-interleaved buffer for each channel.
 * @param frameCount    The number of sample frames to be processed by the
 *                      stream callback.
 * @param timeInfo      Timestamps indicating the ADC capture time of the first
 *                      sample in the input buffer, the DAC output time of the
 *                      first sample in the output buffer and the time the
 *                      callback was invoked. See PaStreamCallbackTimeInfo and
 *                      Pa_GetStreamTime()
 * @param statusFlags   Flags indicating whether input and/or output buffers
 *                      have been inserted or will be dropped to overcome
 *                      underflow or overflow conditions.
 * @param userData      The value of a user supplied pointer passed to
 *                      Pa_OpenStream() intended for storing synthesis data
 *                      etc.
 */

static int  playCallback(const void*                     inputBuffer,
                         void*                           outputBuffer,
                         unsigned long                   framesPerBuffer,
                         const PaStreamCallbackTimeInfo* timeInfo,
                         PaStreamCallbackFlags           statusFlags,
                         void*                           userData){
    (void) inputBuffer; /* Prevent unused variable warnings. */
    (void) timeInfo;
    (void) statusFlags;
    (void) userData;


    /**
     * Compute current processing state.
     */

    cst_wave*    data;
    short*       rptr;
    short*       wptr;
    unsigned int framesLeft, /* Number of frames of data remaining within the stream ***as a whole*** */
                 frames,     /* Number of frames of data to be written for this buffer. */
                 framesPad,  /* Number of frames of padding required within the final buffer. */
                 samples,    /* Number of samples of data to be written for this buffer. */
                 samplesPad, /* Number of samples of padding required within the final buffer. */
                 numBytes,   /* Number of bytes of data to be written for this buffer. */
                 numBytesPad;/* Number of bytes of padding required within the final buffer. */
    int          finalBuffer;/* Stores whether or not this is the final buffer. */


    data         = (cst_wave*)userData;
    rptr         = &data->samples[cst_wave_frameIndex  (data) *
                                  cst_wave_num_channels(data)];
    wptr         = (short*)outputBuffer;
    framesLeft   = cst_wave_maxFrameIndex(data) - cst_wave_frameIndex(data);
    finalBuffer  = framesLeft      <= framesPerBuffer;
    frames       = finalBuffer     ?  framesLeft     : framesPerBuffer;
    framesPad    = framesPerBuffer -  frames;
    samples      = frames     * cst_wave_num_channels(data);
    samplesPad   = framesPad  * cst_wave_num_channels(data);
    numBytes     = samples    * sizeof(short);
    numBytesPad  = samplesPad * sizeof(short);


    /**
     * Debug code. Comment out in production.
     */

    printf("framesLeft in callback: %u\n", framesLeft);
    printf("framesPerBuffer in callback: %lu\n", framesPerBuffer);
    printf("frames in callback: %u\n", frames);

    printf("frameIndex in callback: %d\n", cst_wave_frameIndex(data));
    printf("maxFrameIndex in callback: %d\n", cst_wave_maxFrameIndex(data));
    printf("numChannels in callback: %d\n", cst_wave_num_channels(data));
    printf("numSamples in callback: %d\n", cst_wave_num_samples(data));
    printf("sampleRate in callback: %d\n\n", cst_wave_sample_rate(data));


    /**
     * Output data. We handle the final buffer specially, padding it with zeros.
     */

    memcpy(wptr, rptr, numBytes);
    wptr += samples;
    rptr += samples;
    cst_wave_set_frameIndex(data, cst_wave_frameIndex(data) + frames);
    memset(wptr, 0, numBytesPad);
    wptr += samplesPad;
    rptr += samplesPad;


    /**
     * Return a completion or continue code depending on whether this was the
     * final buffer or not respectively.
     */

    return finalBuffer ? paComplete : paContinue;
}

/**
 * Play wave function.
 * 
 * Plays the given cst_wave data as audio, blocking until this is done.
 */

int play_wave(cst_wave *w){
    PaStream*          stream;
    PaStreamParameters outputParameters;
    int                err;

    /**
     * Initialize custom fields in cst_wave struct.
     */

    cst_wave_set_frameIndex(w, 0);
    cst_wave_set_maxFrameIndex(w, (cst_wave_num_samples(w)));
    // / cst_wave_sample_rate(w)  * cst_wave_num_channels(w) * sizeof(short)


    /**
     * Initialize Port Audio device and stream parameters.
     */

    err = Pa_Initialize();
    outputParameters.device = Pa_GetDefaultOutputDevice();
    if (outputParameters.device == paNoDevice){
        fprintf(stderr,"Error: No default output device.\n");
        return -5;
    }

    printf("frameIndex: %d\n", cst_wave_frameIndex(w));
    printf("maxFrameIndex: %d\n", cst_wave_maxFrameIndex(w));
    printf("numChannels: %d\n", cst_wave_num_channels(w));
    printf("numSamples: %d\n", cst_wave_num_samples(w));
    printf("sampleRate: %d\n", cst_wave_sample_rate(w));

    outputParameters.channelCount = cst_wave_num_channels(w);
    outputParameters.sampleFormat = paInt16;
    outputParameters.suggestedLatency = Pa_GetDeviceInfo( outputParameters.device )->defaultLowOutputLatency;
    outputParameters.hostApiSpecificStreamInfo = NULL;


    /**
     * Open the stream for playback.
     */

    puts("=== Now playing back. ===");
    err = Pa_OpenStream(&stream,
                        NULL, /* no input */
                        &outputParameters,
                        cst_wave_sample_rate(w),
                        512,
                        paClipOff,
                        playCallback,
                        w);

    if(stream){
        /**
         * Start the stream.
         */

        err = Pa_StartStream(stream);
        if(err != paNoError){
            goto done;
        }

        /**
         * Block while it plays.
         */

        puts("Waiting for playback to finish.");
        while((err = Pa_IsStreamActive(stream)) == 1){
            Pa_Sleep(100);
        }
        if(err < 0){
            goto done;
        }


        /**
         * Stop and close the stream. Both are necessary.
         */

        Pa_StopStream(stream);
        err = Pa_CloseStream(stream);
        if(err != paNoError){
            goto done;
        }
        puts("Done.");
    }

    /**
     * Terminate and leave.
     */
done:
    Pa_Terminate();
    return 0;
}
1
Jerry Coffin On

It looks to me like the problem is probably elsewhere.

The routine where you've added the comment is really pretty trivial when all is said and done. It's basically just copying a buffer full of data from one place to another, and if the data doesn't fill the input buffer, zero-filling the remainder. If I were writing the code, I'd probably do something more along these general lines:

const unsigned frame_size = sizeof(short) * data->num_channels;    

char *source = &data->samples[data->frameIndex * data->num_channels];
char *dest = outputBuffer;

unsigned framesLeft = data->maxFrameIndex - data->frameIndex;
unsigned framesEmpty = framesPerBuffer - framesLeft;

memcpy(source, dest, framesLeft * frame_size);
memset(dest+framesLeft * frame_size, 0, framesEmpty * frame_size);

data->frameIndex += framesPerBuffer;

Although rather clumsily written, the if/else in the question just skips doing the memset part at all if the size that needs to be filled is zero.

So, this copies a buffer full of data from one place to another, and zero-fills any remainder. If you're getting a segfault, whatever's allocating the destination buffer apparently hasn't allocated enough space there. Without doing some looking, it's impossible to guess whether the allocation happens in Pa_Initialize, Pa_OpenStream, Pa_StartStream, or possibly somewhere else--and most likely you care less about the code that actually does the allocation than the code that computes how much space to allocate (which might be in one of the above, or somewhere else completely).

3
Rohan On

In your play_wave function you are calling:

err = Pa_OpenStream(&stream,
                    NULL, /* no input */
                    &outputParameters,
                    cst_wave_sample_rate(w),
                    512,
                    paClipOff,
                    playCallback,
                    &w);

Here as last parameter you are passing &w, so you are passing cst_wave ** as w is defined as cst_wave *w.

But in playCallback() you are using it as

cst_wave *data = (cst_wave*)userData;

So in this function you are incorrectly accessing cst_wave ** as cst_wave *. So at some point you will access the invalid memory while using some member of w.

Also, this is the reason you are getting the incorrect output for other parameters e.g. frameIndex, maxFrameIndex etc. as your output shows.

Solution is just pass w to Pa_OpenStream() function rather than &w.


Your next problem is that you aren't setting your maxFrameIndex correctly. As you stated in the comments, this shouldn't be 0. In order to set it properly, you should have something like this:

cst_wave_set_maxFrameIndex(w, cst_wave_num_samples(w) * cst_wave_num_channels(w));

Lastly, it looks like your callback could be messing things up a bit. Here is a better and more efficient way to write it:

static int playCallback( const void *inputBuffer, void *outputBuffer,
                        unsigned long framesPerBuffer,
                        const PaStreamCallbackTimeInfo* timeInfo,
                        PaStreamCallbackFlags statusFlags,
                        void *userData )
{
    cst_wave *data = (cst_wave*)userData;
    short *rptr = &data->samples[data->frameIndex * data->num_channels];
    short *wptr = (short*)outputBuffer;
    int finished;
    unsigned int framesLeft = data->maxFrameIndex - data->frameIndex;

    (void) inputBuffer; /* Prevent unused variable warnings. */
    (void) timeInfo;
    (void) statusFlags;
    (void) userData;

    if( framesLeft < framesPerBuffer )
    {
        /* final buffer... */
        memcpy(wptr, rptr, sizeof(*wptr) * data->num_channels * framesLeft);
        memset(wptr, sizeof(*wptr) * data->num_channels * framesPerBuffer, 0);
        data->frameIndex += framesLeft;
        finished = paComplete;
    }
    else
    {
        memcpy(wptr, rptr, sizeof(*wptr) * data->num_channels * framesPerBuffer);
        data->frameIndex += framesPerBuffer;
        finished = paContinue;
    }
    return finished;
}