Skip to content

Commit be680bf

Browse files
committed
Add logic to create the preloaded audio samples for the website
1 parent 182ef63 commit be680bf

2 files changed

Lines changed: 102 additions & 5 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
.venv*/
2+
out/
23

34
__pycache__/
45
build/

tests/test_basic.py

Lines changed: 101 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
11
import unittest
2+
import json
3+
from pathlib import Path
24

35
try:
46
import wfloat
@@ -52,6 +54,31 @@ def test_version_matches_expected(self) -> None:
5254

5355
import wfloat
5456

57+
print(wfloat.version)
58+
59+
SPEAKER_IDS = {
60+
"skilled_hero_man": 0,
61+
"skilled_hero_woman": 1,
62+
"fun_hero_man": 2,
63+
"fun_hero_woman": 3,
64+
"strong_hero_man": 4,
65+
"strong_hero_woman": 5,
66+
"mad_scientist_man": 6,
67+
"mad_scientist_woman": 7,
68+
"clever_villain_man": 8,
69+
"clever_villain_woman": 9,
70+
"narrator_man": 10,
71+
"narrator_woman": 11,
72+
"wise_elder_man": 12,
73+
"wise_elder_woman": 13,
74+
"outgoing_anime_man": 14,
75+
"outgoing_anime_woman": 15,
76+
"scary_villain_man": 16,
77+
"scary_villain_woman": 17,
78+
"news_reporter_man": 18,
79+
"news_reporter_woman": 19,
80+
}
81+
5582
model = wfloat.OfflineTtsWfloatModelConfig(
5683
model="../wfloat-web/assets/models/wfloat-model/1.0.0/wfloat-model-1.0.0.onnx",
5784
tokens="../wfloat-web/assets/models/wfloat-model/1.0.0/wfloat-model-1.0.0_tokens.txt",
@@ -71,9 +98,78 @@ def test_version_matches_expected(self) -> None:
7198
)
7299

73100
tts = wfloat.OfflineTts(config)
74-
audio = tts.generate("Hello world.", sid=0, speed=1.0)
75-
ok = wfloat.write_wave("out.wav", audio.samples, audio.sample_rate)
76101

77-
print("sample_rate:", audio.sample_rate)
78-
print("num_samples:", len(audio.samples))
79-
print("write_wave:", ok)
102+
voices_path = "../../web/assets/js/voices.js" # path to the JS module; its "export const VOICES = " prefix is stripped below and the remainder is parsed as JSON
103+
with open(voices_path, "r", encoding="utf-8") as f:
104+
voices_text = f.read()
105+
prefix = "export const VOICES = "
106+
voices_text = voices_text[len(prefix) :]
107+
108+
voices = json.loads(voices_text)
109+
out_dir = Path("out")
110+
out_dir.mkdir(exist_ok=True)
111+
progress_by_voice = {}
112+
113+
for v in voices:
114+
sid = SPEAKER_IDS[v["voiceId"]]
115+
silence_padding_sec = v["padding"]
116+
speed = v["speed"]
117+
final_samples = []
118+
sample_rate = None
119+
raw_text_cursor = 0
120+
current_time_sec = 0.0
121+
progress_events = []
122+
123+
prepared = tts.prepare_wfloat_text(
124+
v["text"],
125+
emotion=v["emotion"],
126+
intensity=v["intensity"],
127+
)
128+
129+
for i in range(len(prepared.text)):
130+
audio = tts.generate(prepared.text_clean[i], sid=sid, speed=speed)
131+
if sample_rate is None:
132+
sample_rate = audio.sample_rate
133+
elif sample_rate != audio.sample_rate:
134+
raise ValueError(
135+
f"Sample rate changed for {v['voiceId']}: "
136+
f"{sample_rate} != {audio.sample_rate}"
137+
)
138+
139+
raw_chunk_text = prepared.text[i] or ""
140+
highlight_start = raw_text_cursor
141+
highlight_end = raw_text_cursor + len(raw_chunk_text)
142+
raw_text_cursor = highlight_end
143+
144+
chunk_duration_sec = len(audio.samples) / sample_rate
145+
padding_sec = silence_padding_sec if i < len(prepared.text) - 1 else 0.0
146+
start_time_sec = current_time_sec
147+
end_time_sec = start_time_sec + chunk_duration_sec + padding_sec
148+
149+
progress_events.append(
150+
{
151+
"text": raw_chunk_text,
152+
"progress": (i + 1) / len(prepared.text),
153+
"textHighlightStart": highlight_start,
154+
"textHighlightEnd": highlight_end,
155+
"startTimeSec": start_time_sec,
156+
"endTimeSec": end_time_sec,
157+
}
158+
)
159+
160+
final_samples.extend(audio.samples)
161+
current_time_sec += chunk_duration_sec
162+
163+
if i < len(prepared.text) - 1:
164+
silence_samples = int(sample_rate * silence_padding_sec)
165+
final_samples.extend([0] * silence_samples)
166+
current_time_sec += silence_padding_sec
167+
168+
output_path = out_dir / f"{v['voiceId']}.wav"
169+
ok = wfloat.write_wave(str(output_path), final_samples, sample_rate)
170+
progress_by_voice[v["voiceId"]] = progress_events
171+
print(f"{output_path}: sample_rate={sample_rate} num_samples={len(final_samples)} write_wave={ok}")
172+
173+
progress_path = out_dir / "progress.json"
174+
with open(progress_path, "w", encoding="utf-8") as f:
175+
json.dump(progress_by_voice, f, ensure_ascii=False, indent=2)

0 commit comments

Comments
 (0)