#!/usr/bin/env -S uv run --script
# /// script
# requires-python = "~=3.11.0" # something is wrong with thinc when this is set to 3.12
# dependencies = [
#   "click",
#   "kokoro",
#   "pip",
#   "soundfile",
#   "tqdm",
# ]
# ///
"""Read text from stdin and synthesize it to an audio file with Kokoro."""

from contextlib import ExitStack
from pathlib import Path
import sys
import warnings

import click
from tqdm import tqdm

# Sample rate (Hz) passed to soundfile when writing the generated audio.
SAMPLE_RATE = 24000
# Speed value forwarded unchanged to the Kokoro pipeline.
SPEED = 1.0
# spaCy package used only to split the input text into sentences.
SPACY_MODEL = "xx_ent_wiki_sm"

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

# Voices grouped by the first character of their name; that character is
# also what gets passed to KPipeline as ``lang_code``.
VOICES = {
    "a": [
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_heart",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
    ],
    "b": [
        "bf_alice",
        "bf_emma",
        "bf_isabella",
        "bf_lily",
        "bm_daniel",
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ],
}
ALL_VOICES = VOICES["a"] + VOICES["b"]


def text_to_speech(text: str, output_path: Path, voice: str) -> Path:
    """Synthesize *text* sentence by sentence and write it to *output_path*.

    The text is split into sentences with spaCy's ``sentencizer``; each
    sentence is run through a Kokoro ``KPipeline`` and the resulting audio is
    streamed into one sound file, so only one sentence of audio is held in
    memory at a time.

    Args:
        text: The text to speak.
        output_path: Destination file. soundfile picks the container format
            from the file extension. An existing file is overwritten.
        voice: Voice name; its first character selects the pipeline language.

    Returns:
        ``output_path``, unchanged. No file is created when the pipeline
        produces no audio at all (for example, for empty input).
    """
    print("Initializing...")
    # Imported here rather than at module level, so these packages are only
    # loaded when synthesis actually runs.
    from kokoro import KPipeline
    import numpy
    import soundfile
    import spacy

    # Fetch the sentence-splitting model on first use.
    if not spacy.util.is_package(SPACY_MODEL):
        spacy.cli.download(SPACY_MODEL)

    def gen_audio_segments(pipeline, text: str, voice: str):
        """Yield ``(audio_chunks, sentence_text)`` for each sentence of *text*."""
        nlp = spacy.load(SPACY_MODEL)
        nlp.add_pipe("sentencizer")
        for sentence in nlp(text).sents:
            chunks = [
                audio
                for _graphemes, _phonemes, audio in pipeline(
                    sentence.text,
                    voice=voice,
                    speed=SPEED,
                    split_pattern=r"\n\n\n",
                )
            ]
            yield chunks, sentence.text

    pipeline = KPipeline(lang_code=voice[0])  # language code is just the first character
    print("Initialized. Processing...")

    with tqdm(total=len(text), unit=" chars") as progress_bar, ExitStack() as stack:
        writer = None
        for chunks, sentence_text in gen_audio_segments(pipeline, text, voice):
            progress_bar.update(len(sentence_text))
            if not chunks:
                # numpy.concatenate raises ValueError on an empty list.
                continue
            samples = numpy.concatenate(chunks)
            if writer is None:
                # Open a single writer lazily so the channel count can be
                # taken from the first block of samples. One writer means one
                # header for the whole file, rather than one complete audio
                # file per sentence appended back to back.
                channels = 1 if samples.ndim == 1 else samples.shape[1]
                writer = stack.enter_context(
                    soundfile.SoundFile(
                        str(output_path),
                        mode="w",
                        samplerate=SAMPLE_RATE,
                        channels=channels,
                    )
                )
            writer.write(samples)
    return output_path


@click.command()
@click.option(
    "-o",
    "--output",
    type=click.Path(
        exists=False,
        file_okay=True,
        dir_okay=False,
        writable=True,
        path_type=Path,
    ),
    help="Path to save the generated audio",
    required=True,
)
@click.option(
    "-v",
    "--voice",
    type=click.Choice(ALL_VOICES, case_sensitive=True),
    default=VOICES["a"][0],
    help="A voice to use from the list",
)
def main(output, voice) -> None:
    """
    This is a simple wrapper script around the kokoro-82m text-to-speech generation model:
    https://huggingface.co/hexgrad/Kokoro-82M

    It is designed to have text piped in, and audio to be generated to a file path with a selected voice.
    """
    # Strip trailing whitespace (including the newline) from every input line
    # and rejoin with single newlines before synthesis.
    text = "\n".join(line.rstrip() for line in sys.stdin)
    text_to_speech(text, output, voice)


if __name__ == "__main__":
    main()