From be759f3fea57891b0f8f82586adbccb55187f4fd Mon Sep 17 00:00:00 2001 From: matatonic Date: Thu, 27 Jun 2024 01:43:43 -0400 Subject: [PATCH] 0.15.0 --- Dockerfile | 4 +++- Dockerfile.min | 2 +- README.md | 7 ++++++- requirements-min.txt | 3 +-- requirements-rocm.txt | 19 +++++-------------- requirements.txt | 20 ++++++-------------- speech.py | 3 ++- 7 files changed, 24 insertions(+), 34 deletions(-) diff --git a/Dockerfile b/Dockerfile index 09c453a..9526de9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ FROM python:3.11-slim +RUN --mount=type=cache,target=/root/.cache/pip pip install -U pip + ARG TARGETPLATFORM -RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg +RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg libaio-dev RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi ENV PATH="/root/.cargo/bin:${PATH}" diff --git a/Dockerfile.min b/Dockerfile.min index cc1db1f..b11ae06 100644 --- a/Dockerfile.min +++ b/Dockerfile.min @@ -12,7 +12,7 @@ RUN mkdir -p voices config COPY requirements*.txt /app/ RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt -COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/ +COPY *.py *.sh *.default.yaml README.md LICENSE /app/ ENV TTS_HOME=voices ENV HF_HOME=voices diff --git a/README.md b/README.md index 84d59b0..0720740 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s ## Recent Changes +Version 0.15.0, 2024-06-26 + +* Switch to [coqui-tts](https://github.com/idiap/coqui-ai-TTS) (updated fork), updated simpler dependencies, torch 2.3, etc. +* Resolve cuda threading issues + Version 0.14.1, 2024-06-26 * Make deepspeed possible (`--use-deepspeed`), but not enabled in pre-built docker images (too large). Requires the cuda-toolkit installed, see the Dockerfile comment for details @@ -127,7 +132,7 @@ source .venv/bin/activate # Install the Python requirements # - use requirements-rocm.txt for AMD GPU (ROCm support) # - use requirements-min.txt for piper only (CPU only) -pip install -r requirements.txt +pip install -U -r requirements.txt # run the server bash startup.sh ``` diff --git a/requirements-min.txt b/requirements-min.txt index 744da39..e79f80c 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,6 +1,5 @@ -pyyaml fastapi uvicorn loguru numpy<2 -piper-tts==1.2.0 +piper-tts \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt index bc7ce39..c6a4248 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1,17 +1,8 @@ fastapi uvicorn loguru -# piper-tts -piper-tts==1.2.0 -# xtts -TTS==0.22.0 -# https://github.com/huggingface/transformers/issues/31040 -transformers<4.41.0 -deepspeed<0.14.0 -# XXX, 3.8+ has some issue for now -spacy==3.7.4 - -# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9 -# Re: https://github.com/pytorch/pytorch/issues/121834 -torch==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux" -torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux" +piper-tts +coqui-tts +deepspeed +torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux" +torchaudio; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9c7c8b6..ec719f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,14 @@ fastapi uvicorn loguru -# piper-tts -piper-tts==1.2.0 -# xtts -TTS==0.22.0 -# https://github.com/huggingface/transformers/issues/31040 -transformers<4.41.0 -deepspeed<0.14.0 -# XXX, 3.8+ has some issue for now -spacy==3.7.4 +piper-tts +coqui-tts[languages] +deepspeed -# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9 -# Re: https://github.com/pytorch/pytorch/issues/121834 -torch==2.2.2; sys_platform != "darwin" +torch; sys_platform != "darwin" torchaudio; sys_platform != "darwin" # for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio -torch==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin" -torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin" +torch; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin" +torchaudio; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin" # ROCM (Linux only) - use requirements.amd.txt \ No newline at end of file diff --git a/speech.py b/speech.py index 97b8fb2..3ecd3c4 100755 --- a/speech.py +++ b/speech.py @@ -92,7 +92,8 @@ def tts(self, text, language, speaker_wav, **hf_generate_kwargs): self.not_idle() try: with torch.no_grad(): - gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav + with self.lock: # this doesn't seem threadsafe, but it's quick enough + gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav for wav in self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs): yield wav.cpu().numpy().tobytes() # assumes wav data is f32le