1#!/usr/bin/env -S uv run --script
2# /// script
# requires-python = "~=3.11.0" # pinned below 3.12: the thinc transitive dependency appears broken on Python 3.12
4# dependencies = [
5# "click",
6# "kokoro",
7# "pip",
8# "soundfile",
9# "tqdm",
10# ]
11# ///
12from pathlib import Path
13import sys
14import warnings
15
16import click
17from tqdm import tqdm
18
# Output sample rate in Hz passed to soundfile.write below.
SAMPLE_RATE = 24000
# Speech speed multiplier forwarded to the Kokoro pipeline (1.0 = normal).
SPEED = 1.0

# Silence noisy warnings, presumably emitted by the ML dependency stack
# (kokoro/thinc/torch) during import — TODO confirm which packages trigger them.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
24
# Available voice names grouped by Kokoro language code — the code is the
# first character of each voice name (see KPipeline(lang_code=voice[0]) in
# text_to_speech). The "a"/"b" keys presumably mean American/British English;
# confirm against the Kokoro-82M voice list on Hugging Face.
VOICES = {
    "a": [
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_heart",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
    ],
    "b": [
        "bf_alice",
        "bf_emma",
        "bf_isabella",
        "bf_lily",
        "bm_daniel",
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ]
}
# Flat list of every supported voice, used to validate the CLI --voice choice.
ALL_VOICES = VOICES['a'] + VOICES['b']
60
def text_to_speech(text: str, output_path: Path, voice: str) -> Path:
    """Synthesize *text* with Kokoro-82M and append the audio to *output_path*.

    Args:
        text: The text to speak; it is split into sentences before synthesis.
        output_path: File the audio is appended to (opened in "ab" mode, so
            repeated calls accumulate into the same file).
        voice: One of the names in ALL_VOICES; its first character selects the
            pipeline language.

    Returns:
        The same ``output_path``, for caller convenience.
    """
    print("Initializing...")
    # Heavy ML imports are deferred so that importing this module (and
    # `--help`) stays fast.
    from kokoro import KPipeline
    import numpy
    import soundfile
    import spacy

    # The multilingual spacy model provides sentence segmentation; download it
    # on first use.
    if not spacy.util.is_package("xx_ent_wiki_sm"):
        spacy.cli.download("xx_ent_wiki_sm")

    def gen_audio_segments(pipeline, text: str, voice: str):
        """Yield (audio_segments, sentence_text) for each sentence of *text*."""
        nlp = spacy.load("xx_ent_wiki_sm")
        nlp.add_pipe("sentencizer")
        for sentence in nlp(text).sents:
            # Collect every audio chunk the pipeline emits for this sentence;
            # the graphemes/phonemes it also yields are unused here.
            segments = [
                audio
                for _gs, _ps, audio in pipeline(
                    sentence.text, voice=voice, speed=SPEED, split_pattern=r"\n\n\n"
                )
            ]
            yield segments, sentence.text

    pipeline = KPipeline(lang_code=voice[0])  # language code is just the first character
    print("Initialized. Processing...")
    # Context managers guarantee both the file and the progress bar are closed
    # even if synthesis raises (the original leaked the tqdm bar).
    with output_path.open("ab") as output, tqdm(total=len(text), unit=" chars") as progress_bar:
        for audio_segments, sentence in gen_audio_segments(pipeline, text, voice):
            progress_bar.update(len(sentence))
            # Guard: numpy.concatenate raises ValueError on an empty list,
            # which can happen if a sentence produces no audio.
            if audio_segments:
                soundfile.write(output, numpy.concatenate(audio_segments), SAMPLE_RATE)
    return output_path
90
@click.command()
@click.option('-o', '--output', type=click.Path(exists=False, file_okay=True, dir_okay=False, writable=True, path_type=Path), help='Path to save the generated audio', required=True)
@click.option("-v", "--voice", type=click.Choice(ALL_VOICES, case_sensitive=True), default=VOICES["a"][0], help='A voice to use from the list')
def main(output, voice) -> None:
    """
    This is a simple wrapper script around the kokoro-82m text-to-speech generation model: https://huggingface.co/hexgrad/Kokoro-82M
    It is designed to have text piped in, and audio to be generated to a file path with a selected voice.
    """
    # Stream stdin line by line instead of materializing readlines();
    # rstrip() drops trailing whitespace/newlines before re-joining, exactly
    # as before. (Also dropped a spurious f-prefix on the --voice help string.)
    text = "\n".join(line.rstrip() for line in sys.stdin)
    text_to_speech(text, output, voice)
100
# Run the click CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()