Skip to content

Commit

Permalink
Merge pull request #8 from rmcpantoja/piper
Browse files Browse the repository at this point in the history
Piper implementation
  • Loading branch information
metalalchemist authored Jun 26, 2023
2 parents 49f6632 + 1c9144f commit 1c6ecde
Show file tree
Hide file tree
Showing 13 changed files with 532 additions and 36 deletions.
69 changes: 69 additions & 0 deletions .github/workflows/betube32.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Builds the 32-bit Windows distribution of VeTube with PyInstaller and, when
# the run was triggered by a tag, publishes the zipped build as a release.
name: VeTube-x86

on:
  push:
    tags: ["*"]
    branches: [ master , piper ]
  pull_request:
    branches: [ master , piper ]
  workflow_dispatch:

jobs:
  build:
    runs-on: windows-latest

    steps:
      - name: Source checkout
        uses: actions/checkout@v3

      - name: Configure Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string.
          python-version: "3.10.11"
          architecture: x86

      - name: Install dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -r requirements.txt
          pip install pyinstaller gdown
          pip install --upgrade pyzmq httpx httpcore future
          git clone https://github.com/mush42/espeak-phonemizer-windows

      - name: Compiling
        run: |
          pyinstaller VeTube.py
          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
          cp -R doc dist/VeTube/
          cp -R locales dist/VeTube/
          cp -R readme dist/VeTube/
          cp -R sounds dist/VeTube/
          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/

      - name: Create zip
        run: |
          cd dist
          7z a ../VeTube-x86.zip VeTube/
          cd ..

      - name: Upload zip
        uses: actions/upload-artifact@v3
        with:
          name: VeTube-x86
          # The archive is created in the workspace root (one level above
          # dist), so upload that file; uploading "dist" would leave the zip
          # out of the artifact and the release job could never find it.
          path: VeTube-x86.zip
          if-no-files-found: error

  vetube_release:
    runs-on: windows-latest
    # Only publish a release for tag pushes.
    if: ${{ startsWith(github.ref, 'refs/tags/') }}
    needs: ["build"]
    steps:
      - uses: actions/checkout@v3
      - name: download
        uses: actions/download-artifact@v3
        with:
          # Downloading by name extracts the artifact directly into the
          # workspace; without a name each artifact is placed in its own
          # sub-directory and "VeTube-x86.zip" would not match below.
          name: VeTube-x86
      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          files: VeTube-x86.zip
          fail_on_unmatched_files: true
          # Tags containing "-" (e.g. v1.0-beta) are marked as pre-releases.
          prerelease: ${{ contains(github.ref, '-') }}
69 changes: 69 additions & 0 deletions .github/workflows/betube64.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Builds the 64-bit Windows distribution of VeTube with PyInstaller and, when
# the run was triggered by a tag, publishes the zipped build as a release.
name: VeTube-x64

on:
  push:
    tags: ["*"]
    branches: [ master , piper ]
  pull_request:
    branches: [ master , piper ]
  workflow_dispatch:

jobs:
  build:
    runs-on: windows-latest

    steps:
      - name: Source checkout
        uses: actions/checkout@v3

      - name: Configure Python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always read as a string.
          python-version: "3.10.11"
          architecture: x64

      - name: Install dependencies
        run: |
          pip install --upgrade pip wheel setuptools
          pip install -r requirements.txt
          pip install pyinstaller gdown
          pip install --upgrade pyzmq httpx httpcore future
          git clone https://github.com/mush42/espeak-phonemizer-windows

      - name: Compiling
        run: |
          pyinstaller VeTube.py
          gdown 1ZtF6zus0A7kC9Lwr_kTUbw0MiOoZq29H -O dist/VeTube/bootstrap.exe
          cp -R doc dist/VeTube/
          cp -R locales dist/VeTube/
          cp -R readme dist/VeTube/
          cp -R sounds dist/VeTube/
          cp -R espeak-phonemizer-windows/espeak_phonemizer dist/VeTube/

      - name: Create zip
        run: |
          cd dist
          7z a ../VeTube-x64.zip VeTube/
          cd ..

      - name: Upload zip
        uses: actions/upload-artifact@v3
        with:
          name: VeTube-x64
          # The archive is created in the workspace root (one level above
          # dist), so upload that file; uploading "dist" would leave the zip
          # out of the artifact and the release job could never find it.
          path: VeTube-x64.zip
          if-no-files-found: error

  vetube_release:
    runs-on: windows-latest
    # Only publish a release for tag pushes.
    if: ${{ startsWith(github.ref, 'refs/tags/') }}
    needs: ["build"]
    steps:
      - uses: actions/checkout@v3
      - name: download
        uses: actions/download-artifact@v3
        with:
          # Downloading by name extracts the artifact directly into the
          # workspace; without a name each artifact is placed in its own
          # sub-directory and "VeTube-x64.zip" would not match below.
          name: VeTube-x64
      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          files: VeTube-x64.zip
          fail_on_unmatched_files: true
          # Tags containing "-" (e.g. v1.0-beta) are marked as pre-releases.
          prerelease: ${{ contains(github.ref, '-') }}
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
# Python bytecode caches and compiled files
__pycache__/
*.pyc
*.pyc
# ONNX model files and their accompanying JSON configuration files
*.onnx
*.onnx.json
# Everything placed under the Piper voices directory
piper/voices/*
piper/voices/*/*.onnx
piper/voices/*/*.onnx.json
# Individual files kept out of version control
# NOTE(review): presumably local runtime data and credentials - confirm
data.json
keys.txt
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

137 changes: 137 additions & 0 deletions TTS/Piper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import io
import json
import logging
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Tuple, Union

import numpy as np
import onnxruntime
from espeak_phonemizer import Phonemizer

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# Special symbols looked up in the voice's phoneme_id_map during synthesis.
_BOS = "^"  # placed at the start of every phoneme sequence
_EOS = "$"  # its ids are appended after all phonemes
_PAD = "_"  # its ids are inserted after each phoneme that has an id


@dataclass
class PiperConfig:
    """Settings of a Piper voice, as read from its JSON configuration file.

    ``speaker_id_map`` is optional and defaults to ``None`` so that existing
    code constructing a ``PiperConfig`` with the original eight fields keeps
    working; it exists because callers listing the speakers of a
    multi-speaker voice read ``config.speaker_id_map``.
    """

    # Value of the "num_symbols" entry of the configuration.
    num_symbols: int
    # Number of speakers in the model; values above 1 mean multi-speaker.
    num_speakers: int
    # Sample rate of the synthesized audio, returned alongside the samples.
    sample_rate: int
    # Voice name handed to the eSpeak phonemizer.
    espeak_voice: str
    # Defaults used by synthesis when the caller does not override them.
    length_scale: float
    noise_scale: float
    noise_w: float
    # Maps each phoneme character to the model input ids that represent it.
    phoneme_id_map: Mapping[str, Sequence[int]]
    # Maps speaker names to speaker ids; None when not provided.
    speaker_id_map: Optional[Mapping[str, int]] = None


class Piper:
    """Text-to-speech voice backed by an ONNX model and an eSpeak phonemizer."""

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice configuration, the phonemizer and the ONNX session.

        Args:
            model_path: Path to the ``.onnx`` model file.
            config_path: Path to the JSON configuration. Defaults to
                ``model_path`` with ``.json`` appended.
            use_cuda: Run inference with ``CUDAExecutionProvider`` instead of
                ``CPUExecutionProvider``.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)

        providers = (
            ["CUDAExecutionProvider"] if use_cuda else ["CPUExecutionProvider"]
        )
        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=providers,
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> Tuple[np.ndarray, int]:
        """Synthesize audio samples from text.

        Args:
            text: Text to speak.
            speaker_id: Speaker to use. When omitted on a multi-speaker
                model, speaker 0 is used.
            length_scale: Overrides the configured ``length_scale``.
            noise_scale: Overrides the configured ``noise_scale``.
            noise_w: Overrides the configured ``noise_w``.

        Returns:
            A ``(audio, sample_rate)`` tuple where ``audio`` is an int16
            array of samples and ``sample_rate`` comes from the voice config.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phonemes_str = self.phonemizer.phonemize(text)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []
        id_map = self.config.phoneme_id_map

        for phoneme in phonemes:
            if phoneme in id_map:
                # Every known phoneme is followed by the padding ids.
                phoneme_ids.extend(id_map[phoneme])
                phoneme_ids.extend(id_map[_PAD])
            else:
                _LOGGER.warning("No id for phoneme: %s", phoneme)

        phoneme_ids.extend(id_map[_EOS])

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        # Fall back to the default speaker only when the caller did not pick
        # one; an explicitly requested speaker must be left untouched.
        if (self.config.num_speakers > 1) and (speaker_id is None):
            speaker_id = 0

        inputs = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }
        # Only feed "sid" when there is a speaker id to send, instead of
        # passing None for it.
        if speaker_id is not None:
            inputs["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.model.run(None, inputs)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio, self.config.sample_rate


def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Read a voice's JSON configuration file into a ``PiperConfig``.

    Args:
        config_path: Path to the UTF-8 encoded JSON configuration file.

    Returns:
        The parsed configuration. Missing ``inference`` values fall back to
        ``noise_scale=0.667``, ``length_scale=1.0`` and ``noise_w=0.8``. The
        result also carries a ``speaker_id_map`` attribute (an empty dict
        when the file has none) for callers that list a voice's speakers.

    Raises:
        KeyError: If a required entry (``num_symbols``, ``num_speakers``,
            ``audio.sample_rate``, ``espeak.voice`` or ``phoneme_id_map``)
            is missing.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        config_dict = json.load(config_file)

    inference = config_dict.get("inference", {})

    config = PiperConfig(
        num_symbols=config_dict["num_symbols"],
        num_speakers=config_dict["num_speakers"],
        sample_rate=config_dict["audio"]["sample_rate"],
        espeak_voice=config_dict["espeak"]["voice"],
        noise_scale=inference.get("noise_scale", 0.667),
        length_scale=inference.get("length_scale", 1.0),
        noise_w=inference.get("noise_w", 0.8),
        phoneme_id_map=config_dict["phoneme_id_map"],
    )
    # Set as an attribute rather than a constructor argument so this works
    # whether or not PiperConfig declares a speaker_id_map field.
    config.speaker_id_map = config_dict.get("speaker_id_map") or {}
    return config


def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert it to the int16 range.

    The signal is scaled so that its peak absolute value maps to
    ``max_wav_value``. The peak is floored at 0.01 so that near-silent input
    is not amplified without bound.

    Args:
        audio: Floating-point samples (any array-like is accepted).
        max_wav_value: Largest magnitude of the output samples.

    Returns:
        The samples as an ``int16`` array; an empty array for empty input.
    """
    audio = np.asarray(audio)
    # np.max raises on zero-size arrays, so handle "no samples" explicitly.
    if audio.size == 0:
        return audio.astype("int16")

    peak = max(0.01, np.max(np.abs(audio)))
    scaled = audio * (max_wav_value / peak)
    return np.clip(scaled, -max_wav_value, max_wav_value).astype("int16")
48 changes: 48 additions & 0 deletions TTS/Piper/speaker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import logging
from functools import partial
from pathlib import Path
import sounddevice as sd
from . import Piper

class piperSpeak:
    """Speaks text through a lazily loaded Piper voice using sounddevice."""

    def __init__(self, model_path):
        """Store the model path and the default synthesis settings.

        The model itself is not loaded until it is first needed.
        """
        self.model_path = model_path
        self.speaker_id = None
        self.length_scale = 1
        self.noise_scale = 0.667
        self.noise_w = 0.8
        self.synthesize = None
        self.voice = None

    def load_model(self):
        """Return the Piper voice, creating it on first use."""
        if self.voice is None:
            self.voice = Piper(self.model_path)
        # Always return the voice, including right after the first load.
        return self.voice

    def set_rate(self, new_scale):
        """Set the length scale passed to the synthesizer."""
        self.length_scale = new_scale

    def set_speaker(self, sid):
        """Select the speaker id passed to the synthesizer."""
        self.speaker_id = sid

    def is_multispeaker(self):
        """Return True when the voice has more than one speaker.

        Loads the model first if it has not been loaded yet.
        """
        return self.load_model().config.num_speakers > 1

    def list_speakers(self):
        """Return the mapping of speaker names to speaker ids.

        When the voice configuration carries no speaker map, a mapping from
        the stringified index to the index is returned instead.

        Raises:
            Exception: If the voice is not a multi-speaker model.
        """
        if not self.is_multispeaker():
            raise Exception("This is not a multispeaker model!")

        config = self.voice.config
        # The configuration may lack the attribute or hold an empty map.
        speaker_map = getattr(config, "speaker_id_map", None)
        if not speaker_map:
            speaker_map = {str(i): i for i in range(config.num_speakers)}
        return speaker_map

    def speak(self, text):
        """Synthesize ``text`` and start playing it on the default device."""
        voice = self.load_model()
        # Kept for backward compatibility with code reading this attribute.
        self.synthesize = voice
        if self.speaker_id is None and self.is_multispeaker():
            self.set_speaker(0)
        audio_norm, sample_rate = voice.synthesize(
            text,
            self.speaker_id,
            self.length_scale,
            self.noise_scale,
            self.noise_w,
        )
        sd.play(audio_norm, sample_rate)
27 changes: 27 additions & 0 deletions TTS/lector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# lector:
from accessible_output2.outputs import auto, sapi5
from .Piper import Piper, speaker
import glob
"""
This is a TTS manager. It handles the use of different text-to-speech engines, such as:
1. accessible_output2
2. Piper
"""
def configurar_tts(lector):
    """Return the text-to-speech backend selected by ``lector``.

    ``"auto"`` gives a new ``auto.Auto()`` output, ``"sapi5"`` a new
    ``sapi5.SAPI5()`` output and ``"piper"`` the Piper ``speaker`` module.

    Raises:
        Exception: If ``lector`` is none of the supported names.
    """
    # Reject unknown engines up front, then pick among the known ones.
    if lector not in ("auto", "sapi5", "piper"):
        raise Exception("Lector no soportado.")
    if lector == "piper":
        return speaker
    return auto.Auto() if lector == "auto" else sapi5.SAPI5()

def detect_onnx_models(path):
    """Find ``.onnx`` files located one directory level below ``path``.

    Returns:
        ``None`` when nothing is found, the single matching path as a string
        when exactly one file matches, or the list of matching paths when
        there are several.
    """
    found = glob.glob(path + '/*/*.onnx')
    count = len(found)
    if count == 0:
        return None
    if count == 1:
        return found[0]
    return found
Loading

0 comments on commit 1c6ecde

Please sign in to comment.