hexgrad committed on
Commit
b7cfba0
·
verified ·
1 Parent(s): 1badd57

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +169 -9
  2. fr.txt +0 -0
  3. ko.txt +0 -0
  4. zh.txt +0 -0
app.py CHANGED
@@ -34,15 +34,12 @@ PARAM_COUNT = sum(p.numel() for value in models['cpu'].values() for p in value.p
34
  assert PARAM_COUNT < 82_000_000, PARAM_COUNT
35
 
36
  random_texts = {}
37
- for lang in ['en', 'ja']:
38
  with open(f'{lang}.txt', 'r') as r:
39
  random_texts[lang] = [line.strip() for line in r]
40
 
41
  def get_random_text(voice):
42
- if voice[0] == 'j':
43
- lang = 'ja'
44
- else:
45
- lang = 'en'
46
  return random.choice(random_texts[lang])
47
 
48
  sents = set()
@@ -326,6 +323,98 @@ def generate(text, voice='af', ps=None, speed=1, trim=0.5, use_gpu='auto', sk=No
326
  def toggle_autoplay(autoplay):
327
  return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  USE_GPU_CHOICES = [('Auto 🔀', 'auto'), ('CPU 💬', False), ('ZeroGPU 📄', True)]
330
  USE_GPU_INFOS = {
331
  'auto': 'Use CPU or GPU, whichever is faster',
@@ -335,10 +424,48 @@ USE_GPU_INFOS = {
335
  def change_use_gpu(value):
336
  return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  with gr.Blocks() as basic_tts:
339
  with gr.Row():
340
  with gr.Column():
341
- text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
342
  with gr.Row():
343
  voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
344
  use_gpu = gr.Dropdown(
@@ -565,7 +692,7 @@ with gr.Blocks() as lf_tts:
565
 
566
  with gr.Blocks() as about:
567
  gr.Markdown('''
568
- Kokoro is a frontier TTS model for its size. It has [80 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L34) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
569
 
570
  ### FAQ
571
  **Will this be open sourced?**<br/>
@@ -616,6 +743,11 @@ This Space and the underlying Kokoro model are both under development and subjec
616
  '''
617
  with gr.Blocks() as changelog:
618
  gr.Markdown('''
 
 
 
 
 
619
  **30 Nov 2024**<br/>
620
  ✂️ Better trimming with `librosa.effects.trim`<br/>
621
  🏆 https://hf.co/spaces/Pendrokar/TTS-Spaces-Arena
@@ -649,10 +781,38 @@ with gr.Blocks() as changelog:
649
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
650
  ''')
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  with gr.Blocks() as app:
653
  gr.TabbedInterface(
654
- [basic_tts, lf_tts, about, changelog],
655
- ['🔥 Basic TTS', '📖 Long Form', 'ℹ️ About', '📝 Changelog'],
656
  )
657
 
658
  if __name__ == '__main__':
 
34
  assert PARAM_COUNT < 82_000_000, PARAM_COUNT
35
 
36
  random_texts = {}
37
# Fill `random_texts` with one list of sample sentences per language, read
# from `<lang>.txt` (one sentence per line, surrounding whitespace stripped).
# The files contain non-ASCII text (French, Japanese, Korean, Chinese), so the
# encoding is stated explicitly instead of depending on the platform default.
# NOTE(review): assumes the .txt files are stored as UTF-8 - confirm.
for lang in ['en', 'fr', 'ja', 'ko', 'zh']:
    with open(f'{lang}.txt', 'r', encoding='utf-8') as r:
        random_texts[lang] = [line.strip() for line in r]
40
 
41
  def get_random_text(voice):
42
+ lang = dict(a='en', b='en', f='fr', j='ja', k='ko', z='zh')[voice[0]]
 
 
 
43
  return random.choice(random_texts[lang])
44
 
45
  sents = set()
 
323
  def toggle_autoplay(autoplay):
324
  return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
325
 
326
# Language labels shown in the preview tab, each mapped to the one-letter
# language code that keys PREVIEW_CHOICES.
PREVIEW_LANGUAGES = dict([
    ('🇺🇸 en-US', 'a'),
    ('🇬🇧 en-GB', 'b'),
    ('🇫🇷 fr-FR', 'f'),
    ('🇯🇵 ja-JP', 'j'),
    ('🇰🇷 ko-KR', 'k'),
    ('🇨🇳 zh-CN', 'z'),
])
334
+
335
# Voice menu per one-letter language code: display label -> voice id.
# Insertion order is the order the entries are listed in the dropdown.
PREVIEW_CHOICES = {
    'a': {
        '🇺🇸 🚺 American Female ⭐': 'af',
        '🇺🇸 🚺 Bella ⭐': 'af_bella',
        '🇺🇸 🚺 Nicole ⭐': 'af_nicole',
        '🇺🇸 🚺 Sarah ⭐': 'af_sarah',
        '🇺🇸 🚺 Alloy': 'af_alloy',
        '🇺🇸 🚺 Jessica': 'af_jessica',
        '🇺🇸 🚺 Matilda': 'af_matilda',
        '🇺🇸 🚺 Nova': 'af_nova',
        '🇺🇸 🚺 River': 'af_river',
        '🇺🇸 🚺 Sky': 'af_sky',
        '🇺🇸 🚹 Adam ⭐': 'am_adam',
        '🇺🇸 🚹 Michael ⭐': 'am_michael',
        '🇺🇸 🚹 Echo': 'am_echo',
        '🇺🇸 🚹 Eric': 'am_eric',
        '🇺🇸 🚹 Liam': 'am_liam',
        '🇺🇸 🚹 Onyx': 'am_onyx',
        '🇺🇸 🚹 Will 🧪': 'am_will',
    },
    'b': {
        '🇬🇧 🚺 Alice': 'bf_alice',
        '🇬🇧 🚺 Lily': 'bf_lily',
        '🇬🇧 🚹 Lewis ⭐': 'bm_lewis',
        '🇬🇧 🚹 Daniel': 'bm_daniel',
        '🇬🇧 🚹 Fable': 'bm_fable',
        '🇬🇧 🚹 George': 'bm_george',
    },
    'f': {
        '🇫🇷 🚺 French Alpha': 'fr_alpha',
    },
    'j': {
        '🇯🇵 🚺 Japanese Alpha': 'jf_alpha',
        '🇯🇵 🚺 Japanese Beta': 'jf_theta',
        '🇯🇵 🚺 Japanese Gamma': 'jf_iota',
        '🇯🇵 🚺 Japanese Delta': 'jf_kappa',
        '🇯🇵 🚺 Japanese Epsilon': 'jf_beta_0',
        '🇯🇵 🚺 Japanese Zeta': 'jf_gamma_0',
        '🇯🇵 🚺 Japanese Eta': 'jf_delta_0',
        '🇯🇵 🚺 Japanese Theta': 'jf_epsilon',
        '🇯🇵 🚺 Japanese Iota': 'jf_zeta',
        '🇯🇵 🚺 Japanese Kappa': 'jf_eta',
        '🇯🇵 🚹 Japanese Omega': 'jm_omega',
    },
    'k': {
        '🇰🇷 🚺 Korean Alpha': 'kf_alpha',
        '🇰🇷 🚺 Korean Beta': 'kf_beta',
        '🇰🇷 🚺 Korean Gamma': 'kf_gamma',
        '🇰🇷 🚺 Korean Delta': 'kf_delta',
        '🇰🇷 🚺 Korean Epsilon': 'kf_epsilon',
        '🇰🇷 🚺 Korean Zeta': 'kf_zeta',
        '🇰🇷 🚺 Korean Eta': 'kf_eta',
        '🇰🇷 🚺 Korean Theta': 'kf_theta',
        '🇰🇷 🚺 Korean Iota': 'kf_iota',
        '🇰🇷 🚺 Korean Kappa': 'kf_kappa',
        '🇰🇷 🚺 Korean Lambda': 'kf_lambda',
        '🇰🇷 🚺 Korean Mu': 'kf_mu',
        '🇰🇷 🚺 Korean Nu': 'kf_nu',
        '🇰🇷 🚺 Korean Xi': 'kf_xi',
        '🇰🇷 🚺 Korean Omicron': 'kf_omicron',
        '🇰🇷 🚹 Korean Pi': 'km_pi',
        '🇰🇷 🚹 Korean Rho': 'km_rho',
        '🇰🇷 🚹 Korean Sigma': 'km_sigma',
        '🇰🇷 🚹 Korean Tau': 'km_tau',
        '🇰🇷 🚹 Korean Upsilon': 'km_upsilon',
        '🇰🇷 🚹 Korean Phi': 'km_phi',
        '🇰🇷 🚹 Korean Chi': 'km_chi',
        '🇰🇷 🚹 Korean Psi': 'km_psi',
        '🇰🇷 🚹 Korean Omega': 'km_omega',
    },
    'z': {
        '🇨🇳 🚺 Mandarin Alpha': 'zf_beta',
        '🇨🇳 🚺 Mandarin Beta': 'zf_gamma',
        '🇨🇳 🚺 Mandarin Gamma': 'zf_delta',
        '🇨🇳 🚺 Mandarin Delta': 'zf_epsilon',
        '🇨🇳 🚺 Mandarin Epsilon 🧪': 'zf_alpha',
        '🇨🇳 🚹 Mandarin Phi': 'zm_phi',
        '🇨🇳 🚹 Mandarin Chi': 'zm_chi',
        '🇨🇳 🚹 Mandarin Psi': 'zm_psi',
        '🇨🇳 🚹 Mandarin Omega': 'zm_omega',
    },
}
415
def change_language(value):
    """Rebuild the voice dropdown for the selected language code.

    `value` is a key of PREVIEW_CHOICES ('a', 'b', 'f', 'j', 'k' or 'z').
    The dropdown is pre-selected to the first voice listed for that language,
    so the selected value is always one of the offered choices (for 'a' this
    is still 'af').
    """
    choices = list(PREVIEW_CHOICES[value].items())
    # Each choice is a (label, voice_id) pair; default to the first voice id.
    default_voice = choices[0][1]
    return gr.Dropdown(choices, value=default_voice, label='Voice', info='⭐ voices are stable, 🧪 are unstable')
417
+
418
  USE_GPU_CHOICES = [('Auto 🔀', 'auto'), ('CPU 💬', False), ('ZeroGPU 📄', True)]
419
  USE_GPU_INFOS = {
420
  'auto': 'Use CPU or GPU, whichever is faster',
 
424
  def change_use_gpu(value):
425
  return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
426
 
427
+ from gradio_client import Client
428
# Client for the Space that serves the preview model. The access token is read
# from the `SRC` environment variable; `os.environ` is a mapping, so it must
# be subscripted (calling it raises TypeError).
client = Client('hexgrad/kokoro-src', hf_token=os.environ['SRC'])
429
def preview(text, voice, speed, trim, use_gpu, sk):
    """Send one request to the remote `/generate` endpoint.

    All arguments are forwarded unchanged as keyword arguments; only the
    first element of the endpoint's result is returned.
    """
    result = client.predict(
        text=text,
        voice=voice,
        speed=speed,
        trim=trim,
        use_gpu=use_gpu,
        sk=sk,
        api_name='/generate',
    )
    return result[0]
431
+
432
+ with gr.Blocks() as preview_tts:
433
+ with gr.Row():
434
+ with gr.Column():
435
+ text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text, up to ~500 characters')
436
+ lang = gr.Radio(choices=PREVIEW_LANGUAGES.items(), value='a')
437
+ with gr.Row():
438
+ voice = gr.Dropdown(list(PREVIEW_CHOICES['a'].items()), value='af', label='Voice', info='⭐ voices are stable, 🧪 are unstable')
439
+ lang.change(fn=change_language, inputs=[lang], outputs=[voice])
440
+ use_gpu = gr.Dropdown(
441
+ USE_GPU_CHOICES,
442
+ value='auto' if CUDA_AVAILABLE else False,
443
+ label='Hardware',
444
+ info=USE_GPU_INFOS['auto' if CUDA_AVAILABLE else False],
445
+ interactive=CUDA_AVAILABLE
446
+ )
447
+ use_gpu.change(fn=change_use_gpu, inputs=[use_gpu], outputs=[use_gpu])
448
+ with gr.Row():
449
+ random_btn = gr.Button('Random Text', variant='secondary')
450
+ generate_btn = gr.Button('Generate', variant='primary')
451
+ random_btn.click(get_random_text, inputs=[lang], outputs=[text])
452
+ with gr.Column():
453
+ audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
454
+ with gr.Accordion('Audio Settings', open=False):
455
+ autoplay = gr.Checkbox(value=True, label='Autoplay')
456
+ autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
457
+ speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Speed', info='Adjust the speaking speed')
458
+ trim = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label='✂️ Trim', info='How much to cut from both ends')
459
+ with gr.Row():
460
+ sk = gr.Textbox(visible=False)
461
+ text.change(lambda: os.environ['SK'], outputs=[sk])
462
+ text.submit(preview, inputs=[text, voice, speed, trim, use_gpu, sk], outputs=[audio])
463
+ generate_btn.click(preview, inputs=[text, voice, speed, trim, use_gpu, sk], outputs=[audio])
464
+
465
  with gr.Blocks() as basic_tts:
466
  with gr.Row():
467
  with gr.Column():
468
+ text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 82 million parameters')
469
  with gr.Row():
470
  voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
471
  use_gpu = gr.Dropdown(
 
692
 
693
  with gr.Blocks() as about:
694
  gr.Markdown('''
695
+ Kokoro is a frontier TTS model for its size. It has [82 million](https://hf.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L34) parameters, uses a lean [StyleTTS 2](https://github.com/yl4579/StyleTTS2) architecture, and was trained on high-quality data. The weights are currently private, but a free public demo is hosted here, at `https://hf.co/spaces/hexgrad/Kokoro-TTS`. The Community tab is open for feature requests, bug reports, etc. For other inquiries, contact `@rzvzn` on Discord.
696
 
697
  ### FAQ
698
  **Will this be open sourced?**<br/>
 
743
  '''
744
  with gr.Blocks() as changelog:
745
  gr.Markdown('''
746
+ **8 Dec 2024**<br/>
747
+ 🚀 Model Preview v0.22<br/>
748
+ 🗣️ 68 total voices spanning 5 languages: English, Chinese, Japanese, Korean, French<br/>
749
+ 📁 Added data card
750
+
751
  **30 Nov 2024**<br/>
752
  ✂️ Better trimming with `librosa.effects.trim`<br/>
753
  🏆 https://hf.co/spaces/Pendrokar/TTS-Spaces-Arena
 
781
  🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
782
  ''')
783
 
784
+ with gr.Blocks() as data_card:
785
+ gr.Markdown('''
786
+ This data card was last updated on **8 Dec 2024**.
787
+
788
+ Kokoro was trained exclusively on **permissive/non-copyrighted audio data** and IPA phoneme labels. Examples of permissive/non-copyrighted audio include:
789
+ - Public domain audio
790
+ - Audio licensed under Apache, MIT, etc
791
+ - Synthetic audio<sup>[1]</sup> generated by closed<sup>[2]</sup> TTS models from large providers
792
+ - CC BY audio (see below for attribution table)
793
+
794
+ [1] https://copyright.gov/ai/ai_policy_guidance.pdf<br/>
795
+ [2] No open TTS models used
796
+
797
+ ### Creative Commons Attribution
798
+ The following CC BY audio was part of the dataset used to train Kokoro.
799
+
800
+ | Audio Data | Duration Used | License | Added to Training Set After |
801
+ | ---------- | ------------- | ------- | --------------------------- |
802
+ | [Koniwa](https://github.com/koniwa/koniwa) `tnc` | <1h | [CC BY 3.0](https://creativecommons.org/licenses/by/3.0/deed.ja) | v0.19 / 22 Nov 2024 |
803
+ | [SIWIS](https://datashare.ed.ac.uk/handle/10283/2353) | <11h | [CC BY 4.0](https://datashare.ed.ac.uk/bitstream/handle/10283/2353/license_text) | v0.19 / 22 Nov 2024 |
804
+
805
+ ### Notable Datasets Not Used
806
+ These datasets were **not** used to train Kokoro. They may be of interest to academics:
807
+ - Emilia, `cc-by-nc-4.0`: https://huggingface.co/datasets/amphion/Emilia-Dataset
808
+ - Expresso, `cc-by-nc-4.0`: https://huggingface.co/datasets/ylacombe/expresso
809
+ - JVS corpus, NC clause: https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus
810
+ ''')
811
+
812
  with gr.Blocks() as app:
813
  gr.TabbedInterface(
814
+ [preview_tts, basic_tts, lf_tts, about, changelog, data_card],
815
+ ['🧪 Preview v0.22', '🔥 Basic TTS v0.19', '📖 Long Form v0.19', 'ℹ️ About', '📝 Changelog', '📁 Data'],
816
  )
817
 
818
  if __name__ == '__main__':
fr.txt ADDED
The diff for this file is too large to render. See raw diff
 
ko.txt ADDED
The diff for this file is too large to render. See raw diff
 
zh.txt ADDED
The diff for this file is too large to render. See raw diff