kokoro-tts.py

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = "~=3.11.0" # something is wrong with thinc when this is set to 3.12
# dependencies = [
#     "click",
#     "kokoro",
#     "pip",  # spacy.cli.download installs the sentence-segmentation model via pip at runtime
#     "soundfile",
#     "tqdm",
# ]
# ///
from pathlib import Path
import sys
import warnings

import click
from tqdm import tqdm

SAMPLE_RATE = 24000  # Kokoro-82M generates 24 kHz mono audio
SPEED = 1.0

# Silence FutureWarning/UserWarning noise from the imported libraries.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
# Voice IDs grouped by Kokoro language code: "a" = American English, "b" = British English.
VOICES = {
    "a": [
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_heart",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
    ],
    "b": [
        "bf_alice",
        "bf_emma",
        "bf_isabella",
        "bf_lily",
        "bm_daniel",
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ],
}
ALL_VOICES = VOICES["a"] + VOICES["b"]

def text_to_speech(text: str, output_path: Path, voice: str) -> Path:
    print("Initializing...")
    # Heavy imports are deferred until synthesis actually runs.
    from kokoro import KPipeline
    import numpy
    import soundfile
    import spacy

    # The multilingual spaCy model is used purely for sentence segmentation.
    if not spacy.util.is_package("xx_ent_wiki_sm"):
        spacy.cli.download("xx_ent_wiki_sm")

    def gen_audio_segments(pipeline, text: str, voice: str):
        nlp = spacy.load("xx_ent_wiki_sm")
        nlp.add_pipe("sentencizer")
        doc = nlp(text)
        for sentence in doc.sents:
            audio_segments = []
            # Each pipeline chunk unpacks to (graphemes, phonemes, audio); only the audio is kept.
            for gs, ps, audio in pipeline(sentence.text, voice=voice, speed=SPEED, split_pattern=r"\n\n\n"):
                audio_segments.append(audio)
            yield audio_segments, sentence.text

    pipeline = KPipeline(lang_code=voice[0])  # the Kokoro language code is the first character of the voice ID
    print("Initialized. Processing...")
    progress_bar = tqdm(total=len(text), unit=" chars")
    # Stream every sentence's frames into one sound file; writing a complete file per
    # sentence to an append-mode handle would produce a WAV with multiple headers.
    with soundfile.SoundFile(str(output_path), mode="w", samplerate=SAMPLE_RATE, channels=1) as output:
        for audio_segments, sentence in gen_audio_segments(pipeline, text, voice):
            progress_bar.update(len(sentence))
            output.write(numpy.concatenate(audio_segments))
    progress_bar.close()
    return output_path
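# Illustrative example (not part of the CLI flow): text_to_speech() can also be called
# directly from Python; the text, path, and voice below are placeholders.
#
#     text_to_speech("Hello from Kokoro.", Path("hello.wav"), voice="af_heart")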

@click.command()
@click.option("-o", "--output", type=click.Path(exists=False, file_okay=True, dir_okay=False, writable=True, path_type=Path), required=True, help="Path to save the generated audio")
@click.option("-v", "--voice", type=click.Choice(ALL_VOICES, case_sensitive=True), default=VOICES["a"][0], help="The Kokoro voice to use")
def main(output: Path, voice: str) -> None:
    """
    A thin wrapper around the Kokoro-82M text-to-speech model: https://huggingface.co/hexgrad/Kokoro-82M

    Text is piped in on stdin and synthesized to the given output path with the selected voice.
    """
    text_to_speech("\n".join(line.rstrip() for line in sys.stdin.readlines()), output, voice)

if __name__ == "__main__":
    main()
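
# Example usage (assuming the file is saved as kokoro-tts.py and marked executable,
# so the `uv run --script` shebang resolves the inline dependencies automatically):
#
#     cat chapter.txt | ./kokoro-tts.py -o chapter.wav -v af_heart
#     echo "Hello there." | uv run kokoro-tts.py --output hello.wav --voice bm_george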