#!/usr/bin/env -S uv run --script
# /// script
# requires-python = "~=3.11.0" # something is wrong with thinc when this is set to 3.12
# dependencies = [
#     "click",
#     "kokoro",
#     "pip",
#     "soundfile",
#     "tqdm",
# ]
# ///
from pathlib import Path
import sys
import warnings

import click
from tqdm import tqdm

# Sample rate (in Hz) handed to soundfile when the generated audio is written.
SAMPLE_RATE = 24000
# Speed factor forwarded to the Kokoro pipeline for every sentence.
SPEED = 1.0

# Silence FutureWarning and UserWarning for the whole run so they do not
# clutter the progress output.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action="ignore", category=_ignored_category)

# Voice names selectable through --voice, grouped by their first character.
# text_to_speech uses that first character as the Kokoro language code.
# NOTE(review): "a"/"b" presumably map to American/British English and the
# second letter to female/male voices -- confirm against the Kokoro voice list.
VOICES = {
    "a": [
        "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica",
        "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
        "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam",
        "am_michael", "am_onyx", "am_puck", "am_santa",
    ],
    "b": [
        "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
        "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
    ],
}
# Flat list of every voice ("a" group first, then "b"); used as the CLI choices.
ALL_VOICES = [name for group in ("a", "b") for name in VOICES[group]]

def text_to_speech(text: str, output_path: Path, voice: str) -> Path:
    """Synthesize ``text`` with Kokoro and write the audio to ``output_path``.

    The text is split into sentences with spaCy's rule-based sentencizer, each
    sentence is run through a Kokoro ``KPipeline``, and the audio is appended
    block by block to one sound file, so only a single sentence of audio is
    held in memory at a time. A tqdm progress bar reports how many characters
    of ``text`` have been processed.

    Args:
        text: The text to read aloud.
        output_path: Destination sound file. An existing file is overwritten;
            soundfile picks the container format from the file extension.
        voice: Kokoro voice name; its first character is used as the
            pipeline's language code.

    Returns:
        ``output_path``, unchanged.
    """
    print("Initializing...")
    # Imported here rather than at module level -- presumably so that CLI
    # start-up (e.g. ``--help``) does not pay for loading the model stack.
    from kokoro import KPipeline
    import numpy
    import soundfile
    import spacy

    # Fetch the spaCy model on first use.
    if not spacy.util.is_package("xx_ent_wiki_sm"):
        spacy.cli.download("xx_ent_wiki_sm")

    def gen_audio_segments(pipeline, text: str, voice: str):
        """Yield ``(audio_segments, consumed_chars)`` for each sentence of ``text``."""
        nlp = spacy.load("xx_ent_wiki_sm")
        nlp.add_pipe("sentencizer")
        for sentence in nlp(text).sents:
            audio_segments = [
                audio
                for _, _, audio in pipeline(
                    sentence.text, voice=voice, speed=SPEED, split_pattern=r"\n\n\n"
                )
            ]
            # Count trailing whitespace too, so the per-sentence counts add up
            # to len(text) and the progress bar can actually reach 100%.
            yield audio_segments, len(sentence.text_with_ws)

    pipeline = KPipeline(lang_code=voice[0])  # language code is just the first character
    print("Initialized. Processing...")
    with tqdm(total=len(text), unit=" chars") as progress_bar:
        # Open ONE sound file and append frames to it. Calling soundfile.write()
        # once per sentence on an append-mode handle emits a complete file
        # (header included) for every sentence, which does not produce a single
        # valid audio file.
        # NOTE(review): channels=1 assumes Kokoro yields mono audio -- confirm.
        with soundfile.SoundFile(
            str(output_path), mode="w", samplerate=SAMPLE_RATE, channels=1
        ) as output:
            for audio_segments, consumed_chars in gen_audio_segments(pipeline, text, voice):
                # A sentence may produce no audio at all (e.g. whitespace only);
                # numpy.concatenate() raises ValueError on an empty list.
                if audio_segments:
                    output.write(numpy.concatenate(audio_segments))
                progress_bar.update(consumed_chars)
    return output_path

@click.command()
@click.option(
    "-o",
    "--output",
    type=click.Path(
        exists=False,
        file_okay=True,
        dir_okay=False,
        writable=True,
        path_type=Path,
    ),
    help="Path to save the generated audio",
    required=True,
)
@click.option(
    "-v",
    "--voice",
    type=click.Choice(ALL_VOICES, case_sensitive=True),
    default=VOICES["a"][0],
    help="A voice to use from the list",
)
def main(output, voice) -> None:
    """
    This is a simple wrapper script around the kokoro-82m text-to-speech generation model: https://huggingface.co/hexgrad/Kokoro-82M
    It is designed to have text piped in, and audio to be generated to a file path with a selected voice.
    """
    # The docstring above doubles as the command's --help text.
    # Read all of stdin, drop trailing whitespace from every line, and rejoin
    # the lines with single newlines before handing the text to the synthesizer.
    stripped_lines = (raw_line.rstrip() for raw_line in sys.stdin)
    text_to_speech("\n".join(stripped_lines), output, voice)

# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()