I want to use a custom tag for images in my text that's processed by Polly.
If this is the text sent to Polly to be processed: This is a {image1} dinosaur
then I'd like to get a file with these speech mark entries in it:
{"time":0,"type":"sentence","start":71,"end":208,"value":"This is a dinosaur."}
{"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}
But what I get with my code is this:
{"time":0,"type":"sentence","start":71,"end":208,"value":"This is a <\/prosody><mark name='{image1}'\/><prosody rate='110%' volume='x-loud'> dinosaur."}
{"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}
This is my code:
import com.amazonaws.services.lambda.runtime.LambdaLogger;
import com.amazonaws.services.polly.AmazonPolly;
import com.amazonaws.services.polly.AmazonPollyClientBuilder;
import com.amazonaws.services.polly.model.OutputFormat;
import com.amazonaws.services.polly.model.SpeechMarkType;
import com.amazonaws.services.polly.model.StartSpeechSynthesisTaskRequest;
import com.amazonaws.services.polly.model.StartSpeechSynthesisTaskResult;
import com.amazonaws.services.polly.model.SynthesisTask;
import com.amazonaws.services.polly.model.TextLengthExceededException;
import com.amazonaws.services.polly.model.TextType;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.util.IOUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Starts asynchronous Amazon Polly synthesis tasks (audio and speech marks) for a document.
 *
 * <p>The input text is converted to SSML by {@link SsmlTextService} before it is submitted.
 *
 * <p>NOTE(review): the environment variable name {@code "ENV_VAR"} is read both for the Polly
 * region and for the SNS topic ARN — confirm these are meant to be distinct variables.
 */
public class PollyService {
    private final AmazonPolly pollyClient;
    private final AmazonS3 s3Client;
    private final ObjectMapper objectMapper;
    private final SsmlTextService ssmlTextService;

    /**
     * Region the Polly client was built for. Resolved once so that every method sees the same
     * value, including the {@code AWS_REGION} fallback.
     */
    private final String pollyRegion;

    /**
     * Builds the S3 and Polly clients. The Polly region is taken from the {@code ENV_VAR}
     * environment variable, falling back to {@code AWS_REGION} when it is not set.
     */
    public PollyService() {
        this.pollyRegion = resolvePollyRegion();
        this.s3Client = AmazonS3ClientBuilder.standard().build();
        this.objectMapper = new ObjectMapper();
        this.pollyClient = AmazonPollyClientBuilder.standard().withRegion(pollyRegion).build();
        this.ssmlTextService = new SsmlTextService();
    }

    /** Returns the configured Polly region, or {@code AWS_REGION} when none is configured. */
    private static String resolvePollyRegion() {
        String region = System.getenv("ENV_VAR");
        return region != null ? region : System.getenv("AWS_REGION");
    }

    /**
     * Derives a document id from a file name by cutting it at the second-to-last dot.
     *
     * <p>For example {@code "doc.task.mp3"} yields {@code "doc"}. A name with fewer than two
     * dots, or whose second-to-last dot is the first character, is returned unchanged.
     *
     * @param mp3File the file name; must not be {@code null}
     * @return the part of {@code mp3File} before the second-to-last dot, or {@code mp3File} itself
     */
    public static String getDocumentId(String mp3File) {
        int lastDot = mp3File.lastIndexOf('.');
        if (lastDot <= 0) {
            return mp3File;
        }
        int previousDot = mp3File.lastIndexOf('.', lastDot - 1);
        return previousDot > 0 ? mp3File.substring(0, previousDot) : mp3File;
    }

    /**
     * Starts an MP3 synthesis task for {@code fullText}.
     *
     * @param fullText   the raw text; converted to SSML via {@link SsmlTextService#getSsmlText}
     * @param outbucket  the S3 bucket that receives the output
     * @param documentId used as the S3 key prefix of the output
     * @param params     voice, engine, language, sample rate, domain and speaking rate
     * @param logger     receives the error message when the text is too long
     * @return a successful result carrying the task id and request character count, or a failed
     *         result (with {@code null} details) when Polly rejects the text as too long
     */
    public PollyResult synthesizeLongSpeech(
            String fullText, String outbucket, String documentId, PollyParams params, LambdaLogger logger) {
        String text = ssmlTextService.getSsmlText(
                params.getDomain(), fullText, params.getSpeekingRate(), logger);
        StartSpeechSynthesisTaskRequest request = new StartSpeechSynthesisTaskRequest()
                .withOutputS3BucketName(outbucket)
                .withOutputS3KeyPrefix(documentId)
                .withOutputFormat(OutputFormat.Mp3)
                .withVoiceId(params.getVoiceId())
                .withTextType(TextType.Ssml)
                .withSampleRate(params.getSampleRate())
                .withSnsTopicArn(System.getenv("ENV_VAR"))
                .withEngine(params.getEngine())
                .withLanguageCode(params.getLanguageCode())
                .withText(text);
        return startTask(request, logger);
    }

    /**
     * Starts a speech-mark (JSON) synthesis task for {@code fullText}, requesting both
     * {@code ssml} and {@code sentence} marks.
     *
     * <p>When the Polly region differs from {@code AWS_REGION}, the output bucket name is
     * suffixed with {@code "." + pollyRegion}.
     *
     * @param fullText   the raw text; converted to SSML via {@link SsmlTextService#getSsmlText}
     * @param outbucket  the base name of the S3 bucket that receives the output
     * @param customerId combined with {@code documentId} to form the S3 key prefix
     * @param documentId combined with {@code customerId} to form the S3 key prefix
     * @param params     voice, engine, language, sample rate, domain and speaking rate
     * @param logger     receives the error message when the text is too long
     * @return a successful result carrying the task id and request character count, or a failed
     *         result (with {@code null} details) when Polly rejects the text as too long
     */
    public PollyResult synthesizeLongSpeechmarks(
            String fullText, String outbucket, String customerId, String documentId, PollyParams params,
            LambdaLogger logger) {
        String text = ssmlTextService.getSsmlText(
                params.getDomain(), fullText, params.getSpeekingRate(), logger);

        // Use the region resolved in the constructor: re-reading the environment here would skip
        // the AWS_REGION fallback and could produce a bucket name ending in ".null".
        String destinationBucket = outbucket;
        String lambdaRegion = System.getenv("AWS_REGION");
        if (pollyRegion != null && !pollyRegion.equals(lambdaRegion)) {
            destinationBucket = outbucket + "." + pollyRegion;
        }

        StartSpeechSynthesisTaskRequest request = new StartSpeechSynthesisTaskRequest()
                .withOutputS3BucketName(destinationBucket)
                .withOutputS3KeyPrefix("polly_speechmarks/" + customerId + "." + documentId)
                .withOutputFormat(OutputFormat.Json)
                .withSpeechMarkTypes(SpeechMarkType.Ssml, SpeechMarkType.Sentence)
                .withVoiceId(params.getVoiceId())
                .withTextType(TextType.Ssml)
                .withSampleRate(params.getSampleRate())
                .withSnsTopicArn(System.getenv("ENV_VAR"))
                .withEngine(params.getEngine())
                .withLanguageCode(params.getLanguageCode())
                .withText(text);
        return startTask(request, logger);
    }

    /** Submits the request to Polly and maps the outcome (or a too-long-text error) to a result. */
    private PollyResult startTask(StartSpeechSynthesisTaskRequest request, LambdaLogger logger) {
        try {
            StartSpeechSynthesisTaskResult result = pollyClient.startSpeechSynthesisTask(request);
            SynthesisTask task = result.getSynthesisTask();
            return new PollyResult(true, task.getTaskId(), task.getRequestCharacters());
        } catch (TextLengthExceededException e) {
            logger.log(e.getMessage());
            return new PollyResult(false, null, null);
        }
    }
}
and also
import com.amazonaws.services.lambda.runtime.LambdaLogger;
import org.apache.commons.text.StringEscapeUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts plain text containing {@code {imageN}} placeholders into an SSML document in which
 * each placeholder becomes a {@code <mark>} element and the surrounding text is wrapped in
 * {@code <prosody>} elements.
 */
public class SsmlTextService {
    /** Matches image placeholders such as {@code {image1}} or {@code {image42}}. */
    private static final Pattern IMAGE_PLACEHOLDER = Pattern.compile("\\{image\\d+\\}");

    /**
     * Builds the SSML document for {@code text}.
     *
     * <p>Every run of text before, between and after placeholders is XML-escaped and wrapped in
     * its own {@code <prosody rate='R%' volume='x-loud'>} element; each placeholder is emitted
     * between those elements as {@code <mark name='{imageN}'/>}. When {@code domain} is
     * non-empty the whole content is additionally wrapped in {@code <amazon:domain>}.
     *
     * <p>NOTE(review): because prosody is closed and reopened around every mark, that markup
     * sits in the middle of the sentence text. Whether nesting the mark inside a single prosody
     * element changes the reported sentence speech marks should be confirmed against the Polly
     * SSML documentation.
     *
     * @param domain       value for the {@code amazon:domain} name attribute; {@code null} or
     *                     empty to omit the element
     * @param text         the raw text to convert; must not be {@code null}
     * @param speakingRate prosody rate in percent; values outside 20..200 are replaced by 100
     * @param logger       not used by this method
     * @return the complete {@code <speak>} document
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public String getSsmlText(String domain, String text, int speakingRate, LambdaLogger logger) {
        int rate = (speakingRate < 20 || speakingRate > 200) ? 100 : speakingRate;
        boolean hasDomain = domain != null && !domain.isEmpty();

        StringBuilder ssml = new StringBuilder("<speak>");
        if (hasDomain) {
            // Escape the attribute value so a quote, '<' or '&' cannot break the document.
            ssml.append("<amazon:domain name='")
                    .append(StringEscapeUtils.escapeXml10(domain))
                    .append("'>");
        }

        Matcher matcher = IMAGE_PLACEHOLDER.matcher(text);
        int lastMatchEnd = 0;
        while (matcher.find()) {
            appendProsody(ssml, rate, text.substring(lastMatchEnd, matcher.start()));
            appendMark(ssml, matcher.group());
            lastMatchEnd = matcher.end();
        }
        // Remaining text after the last placeholder (the whole text when there is none).
        appendProsody(ssml, rate, text.substring(lastMatchEnd));

        if (hasDomain) {
            ssml.append("</amazon:domain>");
        }
        return ssml.append("</speak>").toString();
    }

    /** Appends {@code rawText}, XML-escaped, wrapped in a prosody element with the given rate. */
    private static void appendProsody(StringBuilder out, int rate, String rawText) {
        out.append("<prosody rate='").append(rate).append("%' volume='x-loud'>")
                .append(StringEscapeUtils.escapeXml10(rawText))
                .append("</prosody>");
    }

    /** Appends a self-closing mark element whose name is the placeholder itself. */
    private static void appendMark(StringBuilder out, String placeholder) {
        out.append("<mark name='").append(placeholder).append("'/>");
    }

    /**
     * Escapes {@code text} and replaces each placeholder with a mark that closes the current
     * prosody element and reopens one at a fixed {@code 100%} rate.
     *
     * <p>The result is a fragment: it assumes the caller has already opened a prosody element
     * and will close the last one. This method is not called from within this class.
     *
     * @param text the raw text; must not be {@code null}
     * @return the escaped fragment with marks inserted
     */
    private String processTextForSsml(String text) {
        StringBuilder processedText = new StringBuilder();
        Matcher matcher = IMAGE_PLACEHOLDER.matcher(text);
        int lastEnd = 0;
        while (matcher.find()) {
            processedText.append(StringEscapeUtils.escapeXml10(text.substring(lastEnd, matcher.start())));
            // The mark goes outside prosody: close it, emit the mark, reopen it.
            processedText.append("</prosody>");
            appendMark(processedText, matcher.group());
            processedText.append("<prosody rate='100%' volume='x-loud'>");
            lastEnd = matcher.end();
        }
        processedText.append(StringEscapeUtils.escapeXml10(text.substring(lastEnd)));
        return processedText.toString();
    }
}
Any advice or recommendations would be much appreciated. Thank you.
Edit: Something else that might be worth bringing up is that initially I had difficulty getting the ssml-type speech mark entry to generate, so I couldn't get: ''' {"time":1900,"type":"ssml","start":131,"end":154,"value":"{image1}"}
but I managed to get that by adding
'''
.withSpeechMarkTypes(SpeechMarkType.Ssml, SpeechMarkType.Sentence)
'''
Before I had just
'''
.withSpeechMarkType(SpeechMarkType.Sentence)