Nodejs Twilio bi-directional streaming: sending an elevenlabs audio stream to a caller

158 views Asked by At

I need a set of new eyes and maybe a few ideas. For its been two days I've been at this.

the idea is simple, I have static text which I send to elevenlabs AI for tts, through their websocket, which returns to me a ulaw_8000 audio stream in batches, which I then want to stream to twilio to be played to a caller, as a response to the caller's words.

I followed: https://www.twilio.com/docs/voice/twiml/stream#attributes-status-callback and https://www.ai-shift.co.jp/techblog/2844 and https://www.twilio.com/docs/voice/twiml/stream#bi-directional-media-streams and many other similar sources.

I feel that I am close to it, but I can't quite place it. Here's my code (excuse my playground code)

import { createServer } from 'http';
import express from 'express';
import { WebSocketServer, WebSocket } from 'ws';
import 'dotenv/config';
import voiced from 'twilio'

const VoiceResponse = voiced.twiml.VoiceResponse;

const voiceId = "somevoiceid";
const model = 'eleven_monolingual_v1';

const ngrokURL = 'https://myngrok-url-here.app'

let streamsID = '';


const app = express();
app.use(express.urlencoded({ extended: true })); // Ensure Express can parse URL-encoded bodies sent by Twilio

const server = createServer(app);


app.get('/', (_, res) => res.type('text').send('Twilio media stream transcriber'));

app.post('/statusCallBack', (req, res) => {
    console.log(req.body)
    streamsID = req.body.StreamSid;
    console.log(streamsID);
});


// Endpoint to handle incoming calls and gather speech input
app.post('/', (req, res) => {
    const response = new VoiceResponse();
    const connect = response.connect();
    connect.stream({
        url: `${ngrokURL}`,
        statusCallback: `${ngrokURL}/statusCallBack`,
        statusCallbackMethod: "POST"
    });

    console.log(response.toString());

    // Use the <Gather> verb to collect speech input and define the action URL to process the input
    const gather = response.gather({
        input: 'speech',
        timeout: 3, // Adjust the timeout as needed
        action: '/process_speech', // Endpoint to process the speech input
        method: 'POST',
    });
    gather.say('Thank you for calling. How may you be assisted?');

    // If the caller doesn't say anything, <Gather> will finish, and you can provide additional instructions
    response.say("I didn't catch that. Please try again.");

    res.type('xml').send(response.toString());
});



// Endpoint to process the speech input and respond during the call
app.post('/process_speech', (req, res) => {
    const speechResult = req.body.SpeechResult;
    const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=${model}&output_format=ulaw_8000`;
    const socket = new WebSocket(wsUrl);
    const response = new VoiceResponse();

    if (speechResult) {
        response.say("Here is a poem to soothe you.")
        socket.onopen = function (event) {
            const bosMessage = {
                "text": " ",
                "voice_settings": {
                    "stability": 0.5,
                    "similarity_boost": 0.8
                },
                "xi_api_key": process.env.ELEVENLABS_API_KEY, // replace with your API key
            };

            socket.send(JSON.stringify(bosMessage));

            // Respond based on the speech input
            console.log(`SpeechResult: ${req.body.SpeechResult}`);

            const textMessage = {
                "text": "roses are red,\n" +
                    "violets are blue\n" +
                    "You're an idiot but I still love you\n"
            };

            socket.send(JSON.stringify(textMessage));


            // 4. Send the EOS message with an empty string
            const eosMessage = {
                "text": ""
            };

            socket.send(JSON.stringify(eosMessage));
        };

        socket.onmessage = function (event) {
            const responded = JSON.parse(event.data);

            if (responded.audio) {
                // decode and handle the audio data (e.g., play it)
                // const audioChunk = atob(responded.audio);  // decode base64

                console.log("Received audio chunk");

                const ulaw_data_buffer = Buffer.from(responded.audio);

                // Encode the Buffer containing μ-law data to a base64 string
                const payload = ulaw_data_buffer.toString('base64');


                console.log("Trying to send...")
                let load = {
                    "event": "media",
                    "streamSid": streamsID,
                    "media": {
                        "payload": payload
                    }
                }

                console.log(load)

                socketTwilio.send(JSON.stringify(load));


            } else {
                console.log("No audio data in the response");
            }

            if (responded.isFinal) {
                // the generation is complete
            }

            if (responded.normalizedAlignment) {
                // use the alignment info if needed
            }


        };

        // Handle errors
        socket.onerror = function (error) {
            console.error(`WebSocket Error: ${error}`);
        };

        // Handle socket closing
        socket.onclose = function (event) {
            if (event.wasClean) {
                console.info(`Connection closed cleanly, code=${event.code}, reason=${event.reason}`);
                const gather = response.gather({
                    input: 'speech',
                    timeout: 3, // Adjust the timeout as needed
                    action: '/follow_up', // Endpoint to process the speech input
                    method: 'POST',
                });
                gather.say('Will there be anything else?');
                res.type('xml').send(response.toString());
            } else {
                console.warn('Connection died');
            }
        };

    } else {
        // Handle the case where no speech was detected
        response.say("Sorry, I didn't get that. Please try speaking again.");
    }

    // res.type('xml').send(response.toString());
});
app.post('/follow_up', (req, res) => {
    const speechResult = req.body.SpeechResult.toLowerCase();
    const response = new VoiceResponse();

    if (speechResult.includes('yes')) {
        // If caller says yes, loop back to initial gather
        response.redirect({ method: 'POST' }, '/');
    } else {
        // End the call if they say no or provide an unclear response
        response.say('Thank you for calling. Goodbye!');
    }

    res.type('xml').send(response.toString());
});

socketTwilio.addEventListener('open', function (event) {
    // Send the audio blob as binary data
    console.log("Twilio Websocket opened");
});


// WebSocket server setup remains unchanged
const wss = new WebSocketServer({ server });
wss.on('connection', (ws) => {
    console.log('Twilio media stream WebSocket connected');
    ws.on('message', (message) => {
        // Process WebSocket messages as before
        console.log("Twilio: ", message)

    });
    ws.on('close', () => {
        console.log('Twilio media stream WebSocket disconnected');
    });
});



console.log('Listening on port 8080');
server.listen(8080);

my output has no errors, Call goes through just fine, response says the words commanded for it to say but twilio doesn't play the audio stream. I get a blank for like a second or two, then twilio moves on to the next gather.say('Will there be anything else?');. I know Elevenlabs returns audio stream just fine. I also get "twilio media streams connected" just fine, and even after streaming through the websocket, I see the response from twilio socket's onMessage callback like so: Twilio: <Buffer 7b ... ... 24531 more bytes>

I have hunches on what may be wrong, but I am not sure for I have tried many things

  1. May be audio format is wrong coz twilio says it only accepts base64 mulaw 8000, so that even though it's been sent, it can't be played
  2. Perhaps my websocket "socketTwilio" is messy and not needed, and should somehow send the stream through twiML response? but I haven't seen how to do that anywhere in Nodejs

Any insights on what could be the issue? (Apologies if question is poor or duplicate)

0

There are 0 answers