Skip to content

Commit a49f35b

Browse files
committed
BREAKING CHANGE: Allow specifying voice name for Microsoft TTS
Reference voices by name.
1 parent 78e2cdb commit a49f35b

File tree

4 files changed

+90
-20
lines changed

4 files changed

+90
-20
lines changed

README.md

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,7 @@ Example:
305305
"voicerss": "Your api key for TTS with voicerss",
306306
"microsoft": {
307307
"key": "Your api for Bing speech API",
308-
"gender": "Female",
309-
"name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
310-
"language": "en-US"
308+
"name": "ZiraRUS"
311309
},
312310
"port": 5005,
313311
"securePort": 5006,
@@ -440,25 +438,41 @@ The following configuration is available (the entered values except key are defa
440438
{
441439
"microsoft": {
442440
"key": "Your api for Bing speech API",
443-
"gender": "Female",
444-
"name": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
445-
"language": "en-US"
441+
"name": "ZiraRUS"
446442
}
447443
}
448444
```
449445

450-
If you change language, you need to change the name matching the gender for that language, according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales. This one doesn't support providing language directly in the request for this reason.
446+
To change the language, specify a voice name that corresponds to the desired language.
447+
Name should be specified according to this list: https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales
448+
where name is the rightmost part of the voice font name (without the optional Apollo suffix). Examples:
449+
450+
`Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)` name should be specified as `Hoda`
451+
452+
`Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)` name should be specified as `Stefan`
453+
454+
`Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)` name should be specified as `BenjaminRUS`
451455

452456
Action is:
453457

454-
/[Room name]/say/[phrase][/[announce volume]]
455-
/sayall/[phrase][/[announce volume]]
458+
/[Room name]/say/[phrase][/[name]][/[announce volume]]
459+
/sayall/[phrase][/[name]][/[announce volume]]
456460

457461
Example:
458462

459463
/Office/say/Hello, dinner is ready
464+
/Office/say/Hello, dinner is ready/BenjaminRUS
465+
/Office/say/Guten morgen/Stefan
460466
/sayall/Hello, dinner is ready
461467
/Office/say/Hello, dinner is ready/90
468+
/Office/say/Guten morgen/Stefan/90
469+
470+
Supported voices are:
471+
472+
Hoda, Hedda, Stefan, Catherine, Linda, Susan, George, Ravi, ZiraRUS, BenjaminRUS, Laura, Pablo, Raul, Caroline, Julie, Paul, Cosimo, Ayumi, Ichiro, Daniel, Irina, Pavel, HuihuiRUS, Yaoyao, Kangkang, Tracy, Danny, Yating, Zhiwei
473+
474+
See https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/API-Reference-REST/BingVoiceOutput#SupLocales to identify
475+
which language and gender each voice maps to.
462476

463477
#### AWS Polly
464478

@@ -496,10 +510,11 @@ Action is:
496510
Example:
497511

498512
/Office/say/Hello, dinner is ready
499-
/Office/say/Hej, maten är klar/Joanna
513+
/Office/say/Hello, dinner is ready/Nicole
514+
/Office/say/Hej, maten är klar/Astrid
500515
/sayall/Hello, dinner is ready
501516
/Office/say/Hello, dinner is ready/90
502-
/Office/say/Hej, maten är klar/Russell/90
517+
/Office/say/Hej, maten är klar/Astrid/90
503518

504519
This is the current list of voice names and their corresponding language and accent (as of Dec 2016).
505520
To get a current list of voices, you would need to use the AWS CLI and invoke the describe-voices command.

lib/actions/say.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,15 @@ let port;
1111
let system;
1212

1313
function say(player, values) {
14-
const text = decodeURIComponent(values[0]);
14+
let text;
15+
try {
16+
text = decodeURIComponent(values[0]);
17+
} catch (err) {
18+
if (err instanceof URIError) {
19+
err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
20+
}
21+
return Promise.reject(err);
22+
}
1523
let announceVolume;
1624
let language;
1725

lib/actions/sayall.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,15 @@ let port;
77
let system;
88

99
function sayAll(player, values) {
10-
const text = decodeURIComponent(values[0]);
10+
let text;
11+
try {
12+
text = decodeURIComponent(values[0]);
13+
} catch (err) {
14+
if (err instanceof URIError) {
15+
err.message = `The encoded phrase ${values[0]} could not be URI decoded. Make sure your url encoded values (%xx) are within valid ranges. xx should be hexadecimal representations`;
16+
}
17+
return Promise.reject(err);
18+
}
1119
let announceVolume;
1220
let language;
1321

lib/tts-providers/microsoft.js

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@ const APP_ID = '9aa44d9e6ec14da99231a9166fd50b0f';
1010
const INSTANCE_ID = crypto.randomBytes(16).toString('hex');
1111
const TOKEN_EXPIRATION = 590000; // 9:50 minutes in ms
1212
const DEFAULT_SETTINGS = {
13-
language: 'en-US',
14-
gender: 'Female',
15-
name: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)'
13+
name: 'ZiraRUS'
1614
};
1715

1816
let bearerToken;
@@ -39,15 +37,19 @@ function format(lang, gender, name, text) {
3937
return `<speak version='1.0' xml:lang='en-us'><voice xml:lang='${lang}' xml:gender='${gender}' name='${name}'>${text}</voice></speak>`;
4038
}
4139

42-
function microsoft(phrase, language) {
40+
function microsoft(phrase, voiceName) {
4341
if (!globalSettings.microsoft || !globalSettings.microsoft.key) {
4442
return Promise.resolve();
4543
}
4644

4745
const settings = Object.assign({}, DEFAULT_SETTINGS, globalSettings.microsoft);
4846

47+
if (voiceName) {
48+
settings.name = voiceName;
49+
}
50+
4951
const phraseHash = crypto.createHash('sha1').update(phrase).digest('hex');
50-
const filename = `microsoft-${phraseHash}-${settings.language}-${settings.gender}.wav`;
52+
const filename = `microsoft-${phraseHash}-${settings.name}.wav`;
5153
const filepath = path.resolve(globalSettings.webroot, 'tts', filename);
5254

5355
const expectedUri = `/tts/${filename}`;
@@ -65,7 +67,12 @@ function microsoft(phrase, language) {
6567
}
6668

6769
return promise.then(() => {
68-
const ssml = format(settings.language, settings.gender, settings.name, phrase);
70+
const voice = VOICE[settings.name];
71+
if (!voice) {
72+
throw new Error(`Voice name ${settings.name} could not be located in the list of valid voice names`);
73+
}
74+
75+
const ssml = format(voice.language, voice.gender, voice.font, phrase);
6976
return request({
7077
uri: 'https://speech.platform.bing.com/synthesize',
7178
method: 'POST',
@@ -99,4 +106,36 @@ function microsoft(phrase, language) {
99106
});
100107
}
101108

102-
module.exports = microsoft;
109+
const VOICE = {
110+
Hoda: { language: 'ar-EG', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ar-EG, Hoda)' },
111+
Hedda: { language: 'de-DE', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Hedda)' },
112+
Stefan: { language: 'de-DE', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)' },
113+
Catherine: { language: 'en-AU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-AU, Catherine)' },
114+
Linda: { language: 'en-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-CA, Linda)' },
115+
Susan: { language: 'en-GB', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, Susan, Apollo)' },
116+
George: { language: 'en-GB', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-GB, George, Apollo)' },
117+
Ravi: { language: 'en-IN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-IN, Ravi, Apollo)' },
118+
ZiraRUS: { language: 'en-US', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)' },
119+
BenjaminRUS: { language: 'en-US', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (en-US, BenjaminRUS)' },
120+
Laura: { language: 'es-ES', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Laura, Apollo)' },
121+
Pablo: { language: 'es-ES', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-ES, Pablo, Apollo)' },
122+
Raul: { language: 'es-MX', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (es-MX, Raul, Apollo)' },
123+
Caroline: { language: 'fr-CA', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-CA, Caroline)' },
124+
Julie: { language: 'fr-FR', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Julie, Apollo)' },
125+
Paul: { language: 'fr-FR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (fr-FR, Paul, Apollo)' },
126+
Cosimo: { language: 'it-IT', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (it-IT, Cosimo, Apollo)' },
127+
Ayumi: { language: 'ja-JP', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ayumi, Apollo)' },
128+
Ichiro: { language: 'ja-JP', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ja-JP, Ichiro, Apollo)' },
129+
Daniel: { language: 'pt-BR', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (pt-BR, Daniel, Apollo)' },
130+
Irina: { language: 'ru-RU', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Irina, Apollo)' },
131+
Pavel: { language: 'ru-RU', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (ru-RU, Pavel, Apollo)' },
132+
HuihuiRUS: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, HuihuiRUS)' },
133+
Yaoyao: { language: 'zh-CN', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Yaoyao, Apollo)' },
134+
Kangkang: { language: 'zh-CN', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-CN, Kangkang, Apollo)' },
135+
Tracy: { language: 'zh-HK', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Tracy, Apollo)' },
136+
Danny: { language: 'zh-HK', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-HK, Danny, Apollo)' },
137+
Yating: { language: 'zh-TW', gender: 'Female', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Yating, Apollo)' },
138+
Zhiwei: { language: 'zh-TW', gender: 'Male', font: 'Microsoft Server Speech Text to Speech Voice (zh-TW, Zhiwei, Apollo)' }
139+
};
140+
141+
module.exports = microsoft;

0 commit comments

Comments
 (0)