Merge pull request #8 from rmcpantoja/piper

Piper implementation
metalalchemist · Jun 26, 2023 · 1c6ecde · 1c6ecde
2 parents 49f6632 + 1c9144f
commit 1c6ecde
Show file tree

Hide file tree

Showing 13 changed files with 532 additions and 36 deletions.
diff --git a/.github/workflows/betube32.yml b/.github/workflows/betube32.yml
@@ -0,0 +1,69 @@
+# Builds the 32-bit Windows distribution of VeTube and, for tag pushes,
+# attaches the resulting zip to a GitHub release.
+name: VeTube-x86
+
+on:
+  push:
+    tags: ["*"]
+    branches: [ master , piper ]
+  pull_request:
+    branches: [ master , piper ]
+  workflow_dispatch:
+
+jobs:
+  # Freezes VeTube.py with PyInstaller, adds the runtime resources and
+  # publishes the zipped bundle as a workflow artifact.
+  build:
+    runs-on: windows-latest
+
+    steps:
+      - name: Source checkout
+        uses: actions/checkout@v3
+
+      - name: Configure Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.10.11
+          architecture: x86
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip wheel setuptools
+          pip install -r requirements.txt
+          pip install pyinstaller gdown
+          pip install --upgrade pyzmq httpx httpcore future
+          git clone https://github.com/mush42/espeak-phonemizer-windows
+
+      - name: Compiling
+        run: |
+          pyinstaller VeTube.py
+          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
+          cp -R doc dist/VeTube/
+          cp -R locales dist/VeTube/
+          cp -R readme dist/VeTube/
+          cp -R sounds dist/VeTube/
+          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/
+
+      - name: Create zip
+        run: |
+          cd dist
+          7z a ../VeTube-x86.zip VeTube/
+          cd ..
+
+      - name: Upload zip
+        uses: actions/upload-artifact@v3
+        with:
+          name: VeTube-x86
+          # The archive is written to the workspace root (one level above
+          # dist), so upload that file; uploading "dist" would leave the zip
+          # out of the artifact and the release job would have nothing to attach.
+          path: VeTube-x86.zip
+          if-no-files-found: error
+
+  # Publishes the zip produced by the build job; runs only for tag pushes.
+  vetube_release:
+    runs-on: windows-latest
+    if: ${{ startsWith(github.ref, 'refs/tags/') }}
+    needs: ["build"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: download
+      uses: actions/download-artifact@v3
+      with:
+        # Downloading by name extracts the artifact's files directly into the
+        # workspace; without a name they land in a per-artifact subfolder and
+        # the "files" pattern below would not match.
+        name: VeTube-x86
+    - name: Release
+      uses: softprops/action-gh-release@v1
+      with:
+        files: VeTube-x86.zip
+        fail_on_unmatched_files: true
+        prerelease: ${{ contains(github.ref, '-') }}
diff --git a/.github/workflows/betube64.yml b/.github/workflows/betube64.yml
@@ -0,0 +1,69 @@
+# Builds the 64-bit Windows distribution of VeTube and, for tag pushes,
+# attaches the resulting zip to a GitHub release.
+name: VeTube-x64
+
+on:
+  push:
+    tags: ["*"]
+    branches: [ master , piper ]
+  pull_request:
+    branches: [ master , piper ]
+  workflow_dispatch:
+
+jobs:
+  # Freezes VeTube.py with PyInstaller, adds the runtime resources and
+  # publishes the zipped bundle as a workflow artifact.
+  build:
+    runs-on: windows-latest
+
+    steps:
+      - name: Source checkout
+        uses: actions/checkout@v3
+
+      - name: Configure Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.10.11
+          architecture: x64
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip wheel setuptools
+          pip install -r requirements.txt
+          pip install pyinstaller gdown
+          pip install --upgrade pyzmq httpx httpcore future
+          git clone https://github.com/mush42/espeak-phonemizer-windows
+
+      - name: Compiling
+        run: |
+          pyinstaller VeTube.py
+          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
+          cp -R doc dist/VeTube/
+          cp -R locales dist/VeTube/
+          cp -R readme dist/VeTube/
+          cp -R sounds dist/VeTube/
+          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/
+
+      - name: Create zip
+        run: |
+          cd dist
+          7z a ../VeTube-x64.zip VeTube/
+          cd ..
+
+      - name: Upload zip
+        uses: actions/upload-artifact@v3
+        with:
+          name: VeTube-x64
+          # The archive is written to the workspace root (one level above
+          # dist), so upload that file; uploading "dist" would leave the zip
+          # out of the artifact and the release job would have nothing to attach.
+          path: VeTube-x64.zip
+          if-no-files-found: error
+
+  # Publishes the zip produced by the build job; runs only for tag pushes.
+  vetube_release:
+    runs-on: windows-latest
+    if: ${{ startsWith(github.ref, 'refs/tags/') }}
+    needs: ["build"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: download
+      uses: actions/download-artifact@v3
+      with:
+        # Downloading by name extracts the artifact's files directly into the
+        # workspace; without a name they land in a per-artifact subfolder and
+        # the "files" pattern below would not match.
+        name: VeTube-x64
+    - name: Release
+      uses: softprops/action-gh-release@v1
+      with:
+        files: VeTube-x64.zip
+        fail_on_unmatched_files: true
+        prerelease: ${{ contains(github.ref, '-') }}
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,9 @@
 __pycache__/
-*.pyc
+*.pyc
+*.onnx
+*.onnx.json
+piper/voices/*
+piper/voices/*/*.onnx
+piper/voices/*/*.onnx.json
+data.json
+keys.txt
diff --git a/.gitmodules b/.gitmodules
diff --git a/TTS/Piper/__init__.py b/TTS/Piper/__init__.py
@@ -0,0 +1,137 @@
+import io
+import json
+import logging
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Mapping, Optional, Sequence, Union
+
+import numpy as np
+import onnxruntime
+from espeak_phonemizer import Phonemizer
+
+# Module-level logger named after this package.
+_LOGGER = logging.getLogger(__name__)
+
+# Special symbols looked up in the voice's phoneme_id_map by Piper.synthesize:
+# _BOS is prepended to the phoneme list, the ids of _PAD are appended after
+# every known phoneme, and the ids of _EOS close the sequence.
+_BOS = "^"
+_EOS = "$"
+_PAD = "_"
+
+
+@dataclass
+class PiperConfig:
+    num_symbols: int
+    num_speakers: int
+    sample_rate: int
+    espeak_voice: str
+    length_scale: float
+    noise_scale: float
+    noise_w: float
+    phoneme_id_map: Mapping[str, Sequence[int]]
+
+
+class Piper:
+    def __init__(
+        self,
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ):
+        if config_path is None:
+            config_path = f"{model_path}.json"
+
+        self.config = load_config(config_path)
+        self.phonemizer = Phonemizer(self.config.espeak_voice)
+        self.model = onnxruntime.InferenceSession(
+            str(model_path),
+            sess_options=onnxruntime.SessionOptions(),
+            providers=["CPUExecutionProvider"]
+            if not use_cuda
+            else ["CUDAExecutionProvider"],
+        )
+
+    def synthesize(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize WAV audio from text."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+
+        if noise_w is None:
+            noise_w = self.config.noise_w
+
+        phonemes_str = self.phonemizer.phonemize(text)
+        phonemes = [_BOS] + list(phonemes_str)
+        phoneme_ids: List[int] = []
+
+        for phoneme in phonemes:
+            if phoneme in self.config.phoneme_id_map:
+                phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
+                phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
+            else:
+                _LOGGER.warning("No id for phoneme: %s", phoneme)
+
+        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
+
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],
+            dtype=np.float32,
+        )
+
+        if (self.config.num_speakers > 1) and (speaker_id is not None):
+            # Default speaker
+            speaker_id = 0
+
+        sid = None
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+
+        # Synthesize through Onnx
+        audio = self.model.run(
+            None,
+            {
+                "input": phoneme_ids_array,
+                "input_lengths": phoneme_ids_lengths,
+                "scales": scales,
+                "sid": sid,
+            },
+        )[0].squeeze((0, 1))
+        audio = audio_float_to_int16(audio.squeeze())
+        return audio, self.config.sample_rate
+
+
+def load_config(config_path: Union[str, Path]) -> PiperConfig:
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        config_dict = json.load(config_file)
+        inference = config_dict.get("inference", {})
+
+        return PiperConfig(
+            num_symbols=config_dict["num_symbols"],
+            num_speakers=config_dict["num_speakers"],
+            sample_rate=config_dict["audio"]["sample_rate"],
+            espeak_voice=config_dict["espeak"]["voice"],
+            noise_scale=inference.get("noise_scale", 0.667),
+            length_scale=inference.get("length_scale", 1.0),
+            noise_w=inference.get("noise_w", 0.8),
+            phoneme_id_map=config_dict["phoneme_id_map"],
+        )
+
+
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Normalize audio and convert to int16 range"""
+    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    audio_norm = audio_norm.astype("int16")
+    return audio_norm
diff --git a/TTS/Piper/speaker.py b/TTS/Piper/speaker.py
@@ -0,0 +1,48 @@
+import logging
+from functools import partial
+from pathlib import Path
+import sounddevice as sd
+from . import Piper
+
+class piperSpeak:
+	def __init__(self, model_path):
+		self.model_path = model_path
+		self.speaker_id = None
+		self.length_scale = 1
+		self.noise_scale = 0.667
+		self.noise_w = 0.8
+		self.synthesize = None
+		self.voice = None
+
+	def load_model(self):
+		if self.voice:
+			return self.voice
+		self.voice = Piper(self.model_path)
+
+	def set_rate(self, new_scale):
+		self.length_scale = new_scale
+
+	def set_speaker(self, sid):
+		self.speaker_id = sid
+
+	def is_multispeaker(self):
+		return self.voice.config.num_speakers > 1
+
+	def list_speakers(self):
+		if self.is_multispeaker():
+			return self.voice.config.speaker_id_map
+		else:
+			raise Exception("This is not a multispeaker model!")
+
+	def speak(self, text):
+		self.synthesize = self.load_model()
+		if self.speaker_id is None and self.is_multispeaker():
+			self.set_speaker(0)
+		audio_norm, sample_rate = self.voice.synthesize(
+			text,
+			self.speaker_id,
+			self.length_scale,
+			self.noise_scale,
+			self.noise_w
+		)
+		sd.play(audio_norm, sample_rate)
diff --git a/TTS/lector.py b/TTS/lector.py
@@ -0,0 +1,27 @@
+# lector:
+from accessible_output2.outputs import auto, sapi5
+from .Piper import Piper, speaker
+import glob
+"""
+This is a TTS manager. It handles the different text-to-speech engines in use, such as:
+1. accessible_output2
+2. Piper
+"""
+def configurar_tts(lector):
+	if lector == "auto":
+		return auto.Auto()
+	elif lector == "sapi5":
+		return sapi5.SAPI5()
+	elif lector == "piper":
+		return speaker
+	else:
+		raise Exception("Lector no soportado.")
+
+def detect_onnx_models(path):
+    onnx_models = glob.glob(path + '/*/*.onnx')
+    if len(onnx_models) > 1:
+        return onnx_models
+    elif len(onnx_models) == 1:
+        return onnx_models[0]
+    else:
+        return None