Custom tag in AWS Polly speech mark file

47 views Asked by At

I want to use a custom tag for images in my text that's processed by Polly.

If this is the text sent to Polly to be processed: This is a {image1} dinosaur

then I'd like to get a file with these speech mark entries in it:

{"time":0,"type":"sentence","start":71,"end":208,"value":"This is a dinosaur."}
{"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}

But what I get with my code is this:

{"time":0,"type":"sentence","start":71,"end":208,"value":"This is a <\/prosody><mark name='{image1}'\/><prosody rate='110%' volume='x-loud'> dinosaur."}
{"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}

This is my code:

import com.amazonaws.services.lambda.runtime.LambdaLogger;
import com.amazonaws.services.polly.AmazonPolly;
import com.amazonaws.services.polly.AmazonPollyClientBuilder;
import com.amazonaws.services.polly.model.OutputFormat;
import com.amazonaws.services.polly.model.SpeechMarkType;
import com.amazonaws.services.polly.model.StartSpeechSynthesisTaskRequest;
import com.amazonaws.services.polly.model.StartSpeechSynthesisTaskResult;
import com.amazonaws.services.polly.model.SynthesisTask;
import com.amazonaws.services.polly.model.TextLengthExceededException;
import com.amazonaws.services.polly.model.TextType;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.util.IOUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Starts asynchronous Amazon Polly synthesis tasks for SSML text.
 *
 * <p>One entry point requests MP3 audio, the other requests a JSON speech-mark file containing
 * {@code ssml} and {@code sentence} marks. Both convert the incoming plain text to SSML through
 * {@link SsmlTextService} before submitting it.
 */
public class PollyService {
  private final AmazonPolly pollyClient;
  private final AmazonS3 s3Client;
  private final ObjectMapper objectMapper;
  private final SsmlTextService ssmlTextService;

  /** Region the Polly client was built for; resolved once so every method agrees on it. */
  private final String pollyRegion;

  /**
   * Builds the AWS clients. The Polly region is read from the environment and falls back to
   * {@code AWS_REGION} when the dedicated variable is not set.
   */
  public PollyService() {
    // NOTE(review): "ENV_VAR" is also read for the SNS topic ARN below; the name looks like a
    // redacted placeholder -- confirm the real variable names.
    String region = System.getenv("ENV_VAR");
    if (region == null) {
      region = System.getenv("AWS_REGION");
    }
    this.pollyRegion = region;
    this.s3Client = AmazonS3ClientBuilder.standard().build();
    this.objectMapper = new ObjectMapper();
    this.pollyClient = AmazonPollyClientBuilder.standard().withRegion(region).build();
    this.ssmlTextService = new SsmlTextService();
  }

  /**
   * Derives a document id from a file name by removing its last two dot-separated suffixes.
   *
   * <p>For example {@code "customer.doc42.abc123.mp3"} yields {@code "customer.doc42"}. A name
   * that does not contain two dots located after its first character is returned unchanged.
   *
   * @param mp3File file name to strip; must not be {@code null}
   * @return the name without its last two suffixes, or {@code mp3File} itself
   */
  public static String getDocumentId(String mp3File) {
    String documentId = mp3File;

    int pos = mp3File.lastIndexOf('.');
    if (pos > 0) {
      pos = mp3File.lastIndexOf('.', pos - 1);
      if (pos > 0) {
        documentId = mp3File.substring(0, pos);
      }
    }
    return documentId;
  }

  /**
   * Starts a synthesis task that writes MP3 audio to {@code outbucket} under the key prefix
   * {@code documentId}.
   *
   * @param fullText plain text, possibly containing image placeholders handled by
   *     {@link SsmlTextService}
   * @param outbucket destination S3 bucket name
   * @param documentId S3 key prefix for the output
   * @param params voice, engine, language, sample rate, domain and speaking rate to use
   * @param logger receives the error message when Polly rejects the text as too long
   * @return a successful result carrying the task id and request character count, or a failed
   *     result when Polly throws {@link TextLengthExceededException}
   */
  public PollyResult synthesizeLongSpeech(
          String fullText, String outbucket, String documentId, PollyParams params, LambdaLogger logger) {
    String text = ssmlTextService.getSsmlText(params.getDomain(), fullText, params.getSpeekingRate(), logger);
    StartSpeechSynthesisTaskRequest request = newSsmlRequest(text, params)
        .withOutputS3BucketName(outbucket)
        .withOutputS3KeyPrefix(documentId)
        .withOutputFormat(OutputFormat.Mp3);
    return startTask(request, logger);
  }

  /**
   * Starts a synthesis task that writes a JSON speech-mark file ({@code ssml} and
   * {@code sentence} marks) under {@code polly_speechmarks/<customerId>.<documentId>}.
   *
   * <p>When the Polly region differs from {@code AWS_REGION}, the bucket name is suffixed with
   * {@code "." + region}.
   *
   * @param fullText plain text, possibly containing image placeholders handled by
   *     {@link SsmlTextService}
   * @param outbucket base name of the destination S3 bucket
   * @param customerId first part of the output key
   * @param documentId second part of the output key
   * @param params voice, engine, language, sample rate, domain and speaking rate to use
   * @param logger receives the error message when Polly rejects the text as too long
   * @return a successful result carrying the task id and request character count, or a failed
   *     result when Polly throws {@link TextLengthExceededException}
   */
  public PollyResult synthesizeLongSpeechmarks(
          String fullText, String outbucket, String customerId, String documentId, PollyParams params, LambdaLogger logger) {
    String text = ssmlTextService.getSsmlText(params.getDomain(), fullText, params.getSpeekingRate(), logger);

    // Use the region resolved in the constructor (including its AWS_REGION fallback) so an unset
    // variable can no longer produce a "<bucket>.null" name, and compare null-safely.
    String destinationBucket = outbucket;
    if (pollyRegion != null && !pollyRegion.equals(System.getenv("AWS_REGION"))) {
      destinationBucket = outbucket + "." + pollyRegion;
    }

    StartSpeechSynthesisTaskRequest request = newSsmlRequest(text, params)
        .withOutputS3BucketName(destinationBucket)
        .withOutputS3KeyPrefix("polly_speechmarks/" + customerId + "." + documentId)
        .withOutputFormat(OutputFormat.Json)
        .withSpeechMarkTypes(SpeechMarkType.Ssml, SpeechMarkType.Sentence);
    return startTask(request, logger);
  }

  /** Builds the request settings shared by the audio and the speech-mark tasks. */
  private StartSpeechSynthesisTaskRequest newSsmlRequest(String ssml, PollyParams params) {
    return new StartSpeechSynthesisTaskRequest()
        .withVoiceId(params.getVoiceId())
        .withTextType(TextType.Ssml)
        .withSampleRate(params.getSampleRate())
        .withSnsTopicArn(System.getenv("ENV_VAR"))
        .withEngine(params.getEngine())
        .withLanguageCode(params.getLanguageCode())
        .withText(ssml);
  }

  /** Submits the request and maps a "text too long" rejection to a failed result. */
  private PollyResult startTask(StartSpeechSynthesisTaskRequest request, LambdaLogger logger) {
    try {
      StartSpeechSynthesisTaskResult result = pollyClient.startSpeechSynthesisTask(request);
      SynthesisTask task = result.getSynthesisTask();
      return new PollyResult(true, task.getTaskId(), task.getRequestCharacters());
    } catch (TextLengthExceededException e) {
      // Both entry points now report the rejection instead of one of them dropping it silently.
      if (logger != null) {
        logger.log(e.getMessage());
      }
      return new PollyResult(false, null, null);
    }
  }

}

and also

import com.amazonaws.services.lambda.runtime.LambdaLogger;
import org.apache.commons.text.StringEscapeUtils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts plain text into an SSML document for Amazon Polly, turning image placeholders such as
 * {@code {image1}} into {@code <mark>} elements.
 */
public class SsmlTextService {
  /** Matches an image placeholder: {@code {image} followed by one or more digits and {@code }}. */
  private static final Pattern IMAGE_PLACEHOLDER = Pattern.compile("\\{image\\d+\\}");

  /**
   * Wraps {@code text} in {@code <speak>}, optionally inside an {@code <amazon:domain>} element.
   *
   * <p>Each run of text between placeholders is XML-escaped and enclosed in its own
   * {@code <prosody>} element; each placeholder becomes a {@code <mark>} whose name is the
   * placeholder itself, emitted between two prosody elements.
   *
   * @param domain value for the {@code amazon:domain} name attribute; the element is omitted when
   *     this is {@code null} or empty
   * @param text plain text to convert; must not be {@code null}
   * @param speakingRate prosody rate in percent; values outside 20..200 are replaced by 100
   * @param logger currently unused
   * @return the complete SSML document
   */
  public String getSsmlText(String domain, String text, int speakingRate, LambdaLogger logger) {
    int rate = (speakingRate < 20 || speakingRate > 200) ? 100 : speakingRate;
    boolean hasDomain = domain != null && domain.length() > 0;

    StringBuilder ssml = new StringBuilder();
    ssml.append("<speak>");
    if (hasDomain) {
      // Escape the attribute value so a quote or angle bracket cannot break the document.
      ssml.append("<amazon:domain name='")
          .append(StringEscapeUtils.escapeXml10(domain))
          .append("'>");
    }

    Matcher matcher = IMAGE_PLACEHOLDER.matcher(text);
    int segmentStart = 0;
    while (matcher.find()) {
      appendProsody(ssml, text.substring(segmentStart, matcher.start()), rate);
      ssml.append("<mark name='").append(matcher.group()).append("'/>");
      segmentStart = matcher.end();
    }
    // Text after the last placeholder (or the whole text when there is none).
    appendProsody(ssml, text.substring(segmentStart), rate);

    if (hasDomain) {
      ssml.append("</amazon:domain>");
    }
    ssml.append("</speak>");

    return ssml.toString();
  }

  /** Appends one XML-escaped text segment enclosed in a prosody element at the given rate. */
  private static void appendProsody(StringBuilder out, String segment, int rate) {
    out.append("<prosody rate='").append(rate).append("%' volume='x-loud'>");
    out.append(StringEscapeUtils.escapeXml10(segment));
    out.append("</prosody>");
  }

  /**
   * Escapes {@code text} and replaces every placeholder with a sequence that closes a prosody
   * element, emits the {@code <mark>}, and reopens a prosody element at a fixed 100% rate.
   *
   * <p>The result contains no outer prosody element of its own. Not called from within this
   * class.
   *
   * @param text plain text to convert; must not be {@code null}
   * @return the escaped text with placeholders replaced
   */
  private String processTextForSsml(String text) {
    StringBuilder processed = new StringBuilder();
    Matcher matcher = IMAGE_PLACEHOLDER.matcher(text);

    int segmentStart = 0;
    while (matcher.find()) {
      processed.append(StringEscapeUtils.escapeXml10(text.substring(segmentStart, matcher.start())));

      // The mark is placed between a closing and a reopening prosody tag.
      processed.append("</prosody>")
              .append("<mark name='")
              .append(matcher.group())
              .append("'/>")
              .append("<prosody rate='100%' volume='x-loud'>");

      segmentStart = matcher.end();
    }

    processed.append(StringEscapeUtils.escapeXml10(text.substring(segmentStart)));

    return processed.toString();
  }
}

Any advice or recommendation, much appreciated. Thank you

Edit: Something else that might be worth bringing up is that initially I had difficulty getting the ssml-type speech mark entry to generate, so I couldn't get: ''' {"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}

but I managed to get that by adding 
'''
            .withSpeechMarkTypes(SpeechMarkType.Ssml, SpeechMarkType.Sentence)
'''

Before I had just
'''
            .withSpeechMarkType(SpeechMarkType.Sentence)
0

There are 0 answers