From efe34eacaeb0a69b829e21edb98f130f143f6592 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Thu, 29 Feb 2024 19:59:26 +0100 Subject: [PATCH 01/25] new requirements, api and intial setup --- .gitignore | 164 ++++++++++++++++++ README.md | 31 +++- Utils/PLBERT/__init__.py | 0 __init__.py | 0 api.py | 360 +++++++++++++++++++++++++++++++++++++++ requirements.txt | 9 +- setup.py | 15 ++ 7 files changed, 574 insertions(+), 5 deletions(-) create mode 100644 .gitignore create mode 100644 Utils/PLBERT/__init__.py create mode 100644 __init__.py create mode 100644 api.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5a2345f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +test_tmp/ +StyleTTS2-LibriTTS/ +models_weight/ \ No newline at end of file diff --git a/README.md b/README.md index 1033fd06..7e0dbc27 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,36 @@ Online demo: [Hugging Face](https://huggingface.co/spaces/styletts2/styletts2) ( - [x] Test training code for multi-speaker models (VCTK and LibriTTS) - [x] Finish demo code for multispeaker model and upload pre-trained models - [x] Add a finetuning script for new speakers with base pre-trained multispeaker models +- [x] Installable styletts2 and easier interface +- [ ] Add multilingual support +- [ ] Add simple interface to train new languages and voice tuning - [ ] Fix DDP (accelerator) for `train_second.py` **(I have tried everything I could to fix this but had no success, so if you are willing to help, please see [#7](https://github.com/yl4579/StyleTTS2/issues/7))** ## Pre-requisites + +### Install as package + +1. Conda new env and lfs (recommended) +```bash +conda create -n styletts python=3.9 +conda activate styletts +git lfs install +``` +2. Install espeak +*Linux* +```bash +sudo apt update +sudo apt install espeak-ng +``` +*MacOS* build [espeak-ng](https://github.com/espeak-ng/espeak-ng/blob/master/docs/building.md) +*windows* install [espeak-ng](https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md#windows) + +3. install requirements +```bash +pip install -r requirements.txt +``` + +### If you want to edit the code while development 1. Python >= 3.7 2. Clone this repository: ```bash @@ -30,10 +57,6 @@ cd StyleTTS2 ```bash pip install -r requirements.txt ``` -On Windows add: -```bash -pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -U -``` Also install phonemizer and espeak if you want to run the demo: ```bash pip install phonemizer diff --git a/Utils/PLBERT/__init__.py b/Utils/PLBERT/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/api.py b/api.py new file mode 100644 index 00000000..10f4a03f --- /dev/null +++ b/api.py @@ -0,0 +1,360 @@ +import torch +torch.manual_seed(0) +torch.backends.cudnn.benchmark = False +torch.backends.cudnn.deterministic = True + +import random +random.seed(0) + +import numpy as np +np.random.seed(0) + +# load packages +import time +import random +import yaml +from munch import Munch +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F +import torchaudio +import librosa +from nltk.tokenize import word_tokenize + +from models import * +from utils import * +from text_utils import TextCleaner +import phonemizer +from Utils.PLBERT.util import load_plbert +from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule +from typing import Tuple, Type +from numpy.typing import NDArray +import os + + +def load_phonemizer_configs_asr_f0_bert(language:str="en-us")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: + global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True, with_stress=True) + + config = yaml.safe_load(open("/content/StyleTTS2/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml")) + + + # load pretrained ASR model + ASR_config = config.get('ASR_config', False) + ASR_path = config.get('ASR_path', False) + text_aligner = load_ASR_models(ASR_path, ASR_config) + + # load pretrained F0 model + F0_path = config.get('F0_path', False) + pitch_extractor = load_F0_models(F0_path) + + # load BERT model + BERT_path = config.get('PLBERT_dir', False) + plbert = load_plbert(BERT_path) + + return global_phonemizer, config, text_aligner, pitch_extractor, plbert + +def load_model(weight_path:str, config:dict, + text_aligner:torch.nn.Module, pitch_extractor:torch.nn.Module, + plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]: + model_params = recursive_munch(config['model_params']) + model = build_model(model_params, text_aligner, pitch_extractor, plbert) + _ = [model[key].eval() for key in model] + _ = [model[key].to(device) for key in model] + + params_whole = torch.load(weight_path, map_location='cpu') + params = params_whole['net'] + + + for key in model: + if key in params: + print('%s loaded' % key) + try: + model[key].load_state_dict(params[key]) + except: + from collections import OrderedDict + state_dict = params[key] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove `module.` + new_state_dict[name] = v + # load params + model[key].load_state_dict(new_state_dict, strict=False) + # except: + # _load(params[key], model[key]) + _ = [model[key].eval() for key in model] + + + return model, model_params + +def load_sampler(model:torch.nn.Module)->torch.nn.Module: + sampler = DiffusionSampler(model.diffusion.diffusion, + sampler=ADPM2Sampler(), + sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters + clamp=False) + + return sampler + + +class StyleTTS: + def __init__(self, + config_path:str="./Configs/config.yml", + model_path:str="./models_weight", + language:str="en-us", + device:str='cpu', + load_from_HF:bool=True, + model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"): + + if load_from_HF is True: + if model_path is None: + cwd = os.getcwd() + model_path = os.path.join(cwd,"models_weight") + os.makedirs(model_path, exist_ok=True) + os.system(f"git clone {model_remote_path} {model_path}") + + self.model_remote_path = model_remote_path + self.config_path = config_path + self.model_path = model_path + self.language = language + self._device = device + + (self.global_phonemizer, + self.config, + self.text_aligner, + self.pitch_extractor, + self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language) + + + self.model, self.model_params = load_model(weight_path=model_path, + config=self.config, + text_aligner=self.text_aligner, + pitch_extractor=self.pitch_extractor, + plbert=self.plbert, + device=device) + + self.sampler = load_sampler(model=self.model) + + self.textclenaer = TextCleaner() + + self.to_mel = torchaudio.transforms.MelSpectrogram( + n_mels=80, n_fft=2048, win_length=1200, hop_length=300) + self.mean, self.std = -4, 4 + + # def __call__(self, text:str, ref_s:NDArray, alpha:float=0.3, + # beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + # return self.predict(text=text, + # ref_s=ref_s, + # alpha=alpha, + # beta=beta, + # diffusion_steps=diffusion_steps, + # embedding_scale=embedding_scale) + + def predict(self, text:str, ref_s:NDArray, alpha:float=0.3, + beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + text = text.strip() + ps = self.global_phonemizer.phonemize([text]) + ps = word_tokenize(ps[0]) + ps = ' '.join(ps) + tokens = self.textclenaer(ps) + tokens.insert(0, 0) + tokens = torch.LongTensor(tokens).to(self._device).unsqueeze(0) + + with torch.no_grad(): + input_lengths = torch.LongTensor([tokens.shape[-1]]).to(self._device) + text_mask = self.length_to_mask(input_lengths).to(self._device) + + t_en = self.model.text_encoder(tokens, input_lengths, text_mask) + bert_dur = self.model.bert(tokens, attention_mask=(~text_mask).int()) + d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2) + + s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device), + embedding=bert_dur, + embedding_scale=embedding_scale, + features=ref_s, # reference from the same speaker as the embedding + num_steps=diffusion_steps).squeeze(1) + + + s = s_pred[:, 128:] + ref = s_pred[:, :128] + + ref = alpha * ref + (1 - alpha) * ref_s[:, :128] + s = beta * s + (1 - beta) * ref_s[:, 128:] + + d = self.model.predictor.text_encoder(d_en, s, input_lengths, text_mask) + + x, _ = self.model.predictor.lstm(d) + duration = self.model.predictor.duration_proj(x) + + duration = torch.sigmoid(duration).sum(axis=-1) + pred_dur = torch.round(duration.squeeze()).clamp(min=1) + + + pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data)) + c_frame = 0 + for i in range(pred_aln_trg.size(0)): + pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1 + c_frame += int(pred_dur[i].data) + + # encode prosody + en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(self._device)) + if self.model_params.decoder.type == "hifigan": + asr_new = torch.zeros_like(en) + asr_new[:, :, 0] = en[:, :, 0] + asr_new[:, :, 1:] = en[:, :, 0:-1] + en = asr_new + + F0_pred, N_pred = self.model.predictor.F0Ntrain(en, s) + + asr = (t_en @ pred_aln_trg.unsqueeze(0).to(self._device)) + if self.model_params.decoder.type == "hifigan": + asr_new = torch.zeros_like(asr) + asr_new[:, :, 0] = asr[:, :, 0] + asr_new[:, :, 1:] = asr[:, :, 0:-1] + asr = asr_new + + out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0)) + + + return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later + + def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tensor: + if path is not None: + wave, sr = librosa.load(path, sr=24000) + audio, index = librosa.effects.trim(wave, top_db=30) + if sr != 24000: + audio = librosa.resample(audio, sr, 24000) + mel_tensor = self.preprocess(audio).to(device) + + with torch.no_grad(): + ref_s = self.model.style_encoder(mel_tensor.unsqueeze(1)) + ref_p = self.model.predictor_encoder(mel_tensor.unsqueeze(1)) + + return torch.cat([ref_s, ref_p], dim=1) + + def length_to_mask(self, lengths:NDArray)->torch.Tensor: + mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + mask = torch.gt(mask+1, lengths.unsqueeze(1)) + return mask + + def preprocess(self, wave:NDArray)->torch.Tensor: + wave_tensor = torch.from_numpy(wave).float() + mel_tensor = self.to_mel(wave_tensor) + mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std + return mel_tensor + + def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray, + alpha:float=0.3, beta:float=0.7, t:float=0.7, + diffusion_steps:int=5, embedding_scale:int=1)->NDArray: + text = text.strip() + ps = self.global_phonemizer.phonemize([text]) + ps = word_tokenize(ps[0]) + ps = ' '.join(ps) + ps = ps.replace('``', '"') + ps = ps.replace("''", '"') + + tokens = self.textclenaer(ps) + tokens.insert(0, 0) + tokens = torch.LongTensor(tokens).to(self._device).unsqueeze(0) + + with torch.no_grad(): + input_lengths = torch.LongTensor([tokens.shape[-1]]).to(self._device) + text_mask = self.length_to_mask(input_lengths).to(self._device) + + t_en = self.model.text_encoder(tokens, input_lengths, text_mask) + bert_dur = self.model.bert(tokens, attention_mask=(~text_mask).int()) + d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2) + + s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device), + embedding=bert_dur, + embedding_scale=embedding_scale, + features=ref_s, # reference from the same speaker as the embedding + num_steps=diffusion_steps).squeeze(1) + + if s_prev is not None: + # convex combination of previous and current style + s_pred = t * s_prev + (1 - t) * s_pred + + s = s_pred[:, 128:] + ref = s_pred[:, :128] + + ref = alpha * ref + (1 - alpha) * ref_s[:, :128] + s = beta * s + (1 - beta) * ref_s[:, 128:] + + s_pred = torch.cat([ref, s], dim=-1) + + d = self.model.predictor.text_encoder(d_en, + s, input_lengths, text_mask) + + x, _ = self.model.predictor.lstm(d) + duration = self.model.predictor.duration_proj(x) + + duration = torch.sigmoid(duration).sum(axis=-1) + pred_dur = torch.round(duration.squeeze()).clamp(min=1) + + + pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data)) + c_frame = 0 + for i in range(pred_aln_trg.size(0)): + pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1 + c_frame += int(pred_dur[i].data) + + # encode prosody + en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(self._device)) + if self.model_params.decoder.type == "hifigan": + asr_new = torch.zeros_like(en) + asr_new[:, :, 0] = en[:, :, 0] + asr_new[:, :, 1:] = en[:, :, 0:-1] + en = asr_new + + F0_pred, N_pred = self.model.predictor.F0Ntrain(en, s) + + asr = (t_en @ pred_aln_trg.unsqueeze(0).to(self._device)) + if self.model_params.decoder.type == "hifigan": + asr_new = torch.zeros_like(asr) + asr_new[:, :, 0] = asr[:, :, 0] + asr_new[:, :, 1:] = asr[:, :, 0:-1] + asr = asr_new + + out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0)) + + + return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later + + def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3, + beta:float=0.7, diffusion_steps:float=5, + embedding_scale:float=1, t:float=.7) -> NDArray: + sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later) + wavs = [] + s_prev = None + for text in sentences: + if text.strip() == "": continue + text += '.' # add it back + + wav, s_prev = self._predict_long_step(text, + s_prev, + ref_s, + alpha=alpha, + beta=beta, # make it more suitable for the text + t=t, + diffusion_steps=diffusion_steps, + embedding_scale=embedding_scale) + wavs.append(wav) + + @property + def device(self): + return self._device + + @device.setter + def device(self, device:str): + self._device = device + + def to(self, device:str): + self.device = device + + +if __name__ == "__main__": + print(StyleTTS) + stts = StyleTTS() + + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8b8d1122..3da0762d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ SoundFile torchaudio +torchvision munch torch pydub @@ -14,4 +15,10 @@ einops-exts tqdm typing typing-extensions -git+https://github.com/resemble-ai/monotonic_align.git \ No newline at end of file +git+https://github.com/resemble-ai/monotonic_align.git +phonemizer + + +torch --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' +torchvision --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' +torchaudio --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..783be4d8 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +import os +from setuptools import setup, find_packages + + +setup( + name='StyleTTS2', + version='2.0', + description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.', + license='MIT', + package_dir={'styletts2':'./'}, + long_description=open('README.md').read(), + packages=find_packages(), + install_requires=find_packages(), + url="https://github.com/yl4579/StyleTTS2.git", +) \ No newline at end of file From a41adc8d6b53d17e8a8315ade2d0fbb69ab067ac Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 11:16:52 +0000 Subject: [PATCH 02/25] adding better api --- api.py | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/api.py b/api.py index 10f4a03f..82210daf 100644 --- a/api.py +++ b/api.py @@ -28,15 +28,15 @@ import phonemizer from Utils.PLBERT.util import load_plbert from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule -from typing import Tuple, Type +from typing import Tuple, Type, Union from numpy.typing import NDArray import os -def load_phonemizer_configs_asr_f0_bert(language:str="en-us")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: +def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True, with_stress=True) - config = yaml.safe_load(open("/content/StyleTTS2/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml")) + config = yaml.safe_load(open(config_path)) # load pretrained ASR model @@ -111,6 +111,8 @@ def __init__(self, model_path = os.path.join(cwd,"models_weight") os.makedirs(model_path, exist_ok=True) os.system(f"git clone {model_remote_path} {model_path}") + config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml") + model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth") self.model_remote_path = model_remote_path self.config_path = config_path @@ -122,7 +124,7 @@ def __init__(self, self.config, self.text_aligner, self.pitch_extractor, - self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language) + self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, config_path=self.config_path) self.model, self.model_params = load_model(weight_path=model_path, @@ -140,17 +142,20 @@ def __init__(self, n_mels=80, n_fft=2048, win_length=1200, hop_length=300) self.mean, self.std = -4, 4 - # def __call__(self, text:str, ref_s:NDArray, alpha:float=0.3, - # beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: - # return self.predict(text=text, - # ref_s=ref_s, - # alpha=alpha, - # beta=beta, - # diffusion_steps=diffusion_steps, - # embedding_scale=embedding_scale) + def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3, + beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + return self.predict(text=text, + ref_s=ref_s, + alpha=alpha, + beta=beta, + diffusion_steps=diffusion_steps, + embedding_scale=embedding_scale) - def predict(self, text:str, ref_s:NDArray, alpha:float=0.3, + def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + + if ref_s is None: ref_s = self.load_random_ref_s() + text = text.strip() ps = self.global_phonemizer.phonemize([text]) ps = word_tokenize(ps[0]) @@ -242,9 +247,10 @@ def preprocess(self, wave:NDArray)->torch.Tensor: mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std return mel_tensor - def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray, + def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, t:float=0.7, diffusion_steps:int=5, embedding_scale:int=1)->NDArray: + if ref_s is None: ref_s = self.load_random_ref_s() text = text.strip() ps = self.global_phonemizer.phonemize([text]) ps = word_tokenize(ps[0]) @@ -320,9 +326,10 @@ def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray, return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later - def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3, + def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1, t:float=.7) -> NDArray: + if ref_s is None: ref_s = self.load_random_ref_s() sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later) wavs = [] s_prev = None @@ -340,6 +347,9 @@ def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3, embedding_scale=embedding_scale) wavs.append(wav) + def load_random_ref_s(self): + return torch.randn(1, 256).to(self._device) + @property def device(self): return self._device @@ -355,6 +365,11 @@ def to(self, device:str): if __name__ == "__main__": print(StyleTTS) stts = StyleTTS() - + sr = 24000 + wave = np.random.randn(sr*10) + # print(wave.shape) + # print(stts.compute_style(wave=wave, sr=sr, path=None, device='cpu').shape) + # print(stts.load_random_ref_s().shape) + assert stts("read this in a random voice") is not None \ No newline at end of file From fb358f94d48fdbd64b7b60c0017d1d9151448198 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 11:18:09 +0000 Subject: [PATCH 03/25] NLTK auto install and cli scripts --- requirements.txt | 11 ++++++----- scripts/cli | 14 ++++++++++++++ scripts/install | 4 ++++ scripts/nltk_download | 4 ++++ 4 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 scripts/cli create mode 100644 scripts/install create mode 100644 scripts/nltk_download diff --git a/requirements.txt b/requirements.txt index 3da0762d..cdaad136 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -SoundFile +soundfile torchaudio torchvision munch @@ -15,10 +15,11 @@ einops-exts tqdm typing typing-extensions -git+https://github.com/resemble-ai/monotonic_align.git +monotonic_align @ git+https://github.com/resemble-ai/monotonic_align.git phonemizer -torch --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' -torchvision --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' -torchaudio --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' \ No newline at end of file + +torch @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' +torchvision @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' +torchaudio @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32' \ No newline at end of file diff --git a/scripts/cli b/scripts/cli new file mode 100644 index 00000000..ed4a4aa6 --- /dev/null +++ b/scripts/cli @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +git lfs install + +if [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then + sudo apt update + sudo apt install espeak-ng +# elif [ "$(uname)" == "Darwin" ]; then +# # Do something under Mac OS X platform +# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then +# # Do something under 32 bits Windows NT platform +# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then +# # Do something under 64 bits Windows NT platform +fi \ No newline at end of file diff --git a/scripts/install b/scripts/install new file mode 100644 index 00000000..ceb69c11 --- /dev/null +++ b/scripts/install @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +pip install -r ../requirements.txt + +mkdir del \ No newline at end of file diff --git a/scripts/nltk_download b/scripts/nltk_download new file mode 100644 index 00000000..b27d67a0 --- /dev/null +++ b/scripts/nltk_download @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +import nltk +nltk.download('punkt') \ No newline at end of file From c3b6e58a0980dee5b778e5b4ab279d6b1e246ea3 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 11:27:25 +0000 Subject: [PATCH 04/25] installable --- setup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 783be4d8..f990694d 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,21 @@ import os from setuptools import setup, find_packages +with open('requirements.txt') as f: + required = f.read().splitlines() + setup( name='StyleTTS2', version='2.0', description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.', license='MIT', - package_dir={'styletts2':'./'}, + package_dir={'styletts2':'src'}, long_description=open('README.md').read(), - packages=find_packages(), - install_requires=find_packages(), + install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", + scripts=[ + './scripts/nltk_download', + './scripts/cli', + ], ) \ No newline at end of file From de3a4c780194a97127414068b9fc6984ae9a085c Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 13:37:52 +0000 Subject: [PATCH 05/25] working api --- api.py | 27 ++++++++++++++++----------- scripts/cli | 14 -------------- scripts/install | 4 ---- scripts/nltk_download | 4 ---- setup.py | 9 +++------ 5 files changed, 19 insertions(+), 39 deletions(-) delete mode 100644 scripts/cli delete mode 100644 scripts/install delete mode 100644 scripts/nltk_download diff --git a/api.py b/api.py index 82210daf..492a8268 100644 --- a/api.py +++ b/api.py @@ -31,6 +31,8 @@ from typing import Tuple, Type, Union from numpy.typing import NDArray import os +import nltk +nltk.download('punkt') def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: @@ -68,7 +70,7 @@ def load_model(weight_path:str, config:dict, for key in model: if key in params: - print('%s loaded' % key) + # print('%s loaded' % key) try: model[key].load_state_dict(params[key]) except: @@ -99,7 +101,7 @@ def load_sampler(model:torch.nn.Module)->torch.nn.Module: class StyleTTS: def __init__(self, config_path:str="./Configs/config.yml", - model_path:str="./models_weight", + model_path:str=None, language:str="en-us", device:str='cpu', load_from_HF:bool=True, @@ -109,10 +111,11 @@ def __init__(self, if model_path is None: cwd = os.getcwd() model_path = os.path.join(cwd,"models_weight") - os.makedirs(model_path, exist_ok=True) - os.system(f"git clone {model_remote_path} {model_path}") - config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml") - model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth") + if not os.path.exists(model_path): + os.makedirs(model_path, exist_ok=True) + os.system(f"git clone {model_remote_path} {model_path}") + config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml") + model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth") self.model_remote_path = model_remote_path self.config_path = config_path @@ -347,6 +350,8 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, embedding_scale=embedding_scale) wavs.append(wav) + return np.concatenate(wavs, axis=0) + def load_random_ref_s(self): return torch.randn(1, 256).to(self._device) @@ -363,13 +368,13 @@ def to(self, device:str): if __name__ == "__main__": - print(StyleTTS) stts = StyleTTS() sr = 24000 wave = np.random.randn(sr*10) - # print(wave.shape) - # print(stts.compute_style(wave=wave, sr=sr, path=None, device='cpu').shape) - # print(stts.load_random_ref_s().shape) - assert stts("read this in a random voice") is not None + + print(stts("read this in a random voice").shape) + print(stts.predict("read this in a random voice").shape) + print(stts.predict_long("simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later)").shape) + \ No newline at end of file diff --git a/scripts/cli b/scripts/cli deleted file mode 100644 index ed4a4aa6..00000000 --- a/scripts/cli +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -git lfs install - -if [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then - sudo apt update - sudo apt install espeak-ng -# elif [ "$(uname)" == "Darwin" ]; then -# # Do something under Mac OS X platform -# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then -# # Do something under 32 bits Windows NT platform -# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then -# # Do something under 64 bits Windows NT platform -fi \ No newline at end of file diff --git a/scripts/install b/scripts/install deleted file mode 100644 index ceb69c11..00000000 --- a/scripts/install +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -pip install -r ../requirements.txt - -mkdir del \ No newline at end of file diff --git a/scripts/nltk_download b/scripts/nltk_download deleted file mode 100644 index b27d67a0..00000000 --- a/scripts/nltk_download +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python - -import nltk -nltk.download('punkt') \ No newline at end of file diff --git a/setup.py b/setup.py index f990694d..57766c04 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,18 @@ import os +from sys import platform from setuptools import setup, find_packages with open('requirements.txt') as f: required = f.read().splitlines() - setup( name='StyleTTS2', version='2.0', description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.', license='MIT', - package_dir={'styletts2':'src'}, + package_dir={'styletts2':'./'}, long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", - scripts=[ - './scripts/nltk_download', - './scripts/cli', - ], + ) \ No newline at end of file From de90d59ba2c65b6a43bba962e7a28a248a80f230 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 13:39:54 +0000 Subject: [PATCH 06/25] fix install --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e0dbc27..f7263739 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ sudo apt install espeak-ng 3. install requirements ```bash -pip install -r requirements.txt +pip install git+https://github.com/yl4579/StyleTTS2.git ``` ### If you want to edit the code while development From d2baf0a440496a13ea1be7b448a17a347c995808 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 19:44:16 +0100 Subject: [PATCH 07/25] fix imports --- api.py | 10 +++++----- models.py | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/api.py b/api.py index 492a8268..c1f3cd9a 100644 --- a/api.py +++ b/api.py @@ -22,12 +22,12 @@ import librosa from nltk.tokenize import word_tokenize -from models import * -from utils import * -from text_utils import TextCleaner +from .models import * +from .utils import * +from .text_utils import TextCleaner import phonemizer -from Utils.PLBERT.util import load_plbert -from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule +from .Utils.PLBERT.util import load_plbert +from .Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule from typing import Tuple, Type, Union from numpy.typing import NDArray import os diff --git a/models.py b/models.py index 84bbb03d..5b5fa487 100644 --- a/models.py +++ b/models.py @@ -12,14 +12,24 @@ import torch.nn.functional as F from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from Utils.ASR.models import ASRCNN -from Utils.JDC.model import JDCNet +try: + from .Utils.ASR.models import ASRCNN + from .Utils.JDC.model import JDCNet -from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution -from Modules.diffusion.modules import Transformer1d, StyleTransformer1d -from Modules.diffusion.diffusion import AudioDiffusionConditional + from .Modules.diffusion.sampler import KDiffusion, LogNormalDistribution + from .Modules.diffusion.modules import Transformer1d, StyleTransformer1d + from .Modules.diffusion.diffusion import AudioDiffusionConditional -from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator + from .Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator +except: + from Utils.ASR.models import ASRCNN + from Utils.JDC.model import JDCNet + + from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution + from Modules.diffusion.modules import Transformer1d, StyleTransformer1d + from Modules.diffusion.diffusion import AudioDiffusionConditional + + from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator from munch import Munch import yaml From 5cbfe800d0304cecc4d547df5be4be52ed393315 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 19:58:53 +0100 Subject: [PATCH 08/25] add_cwd --- api.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/api.py b/api.py index c1f3cd9a..463e136b 100644 --- a/api.py +++ b/api.py @@ -35,23 +35,33 @@ nltk.download('punkt') -def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: +def load_phonemizer_configs_asr_f0_bert(language:str="en-us", + config_path:str="./Configs/config.yml", + add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True, with_stress=True) - + + if add_cwd is True: + config_path = os.path.join(os.getcwd(), config_path) config = yaml.safe_load(open(config_path)) # load pretrained ASR model ASR_config = config.get('ASR_config', False) ASR_path = config.get('ASR_path', False) + if add_cwd is True: + ASR_path = os.path.join(os.getcwd(), ASR_path) text_aligner = load_ASR_models(ASR_path, ASR_config) # load pretrained F0 model F0_path = config.get('F0_path', False) + if add_cwd is True: + F0_path = os.path.join(os.getcwd(), F0_path) pitch_extractor = load_F0_models(F0_path) # load BERT model BERT_path = config.get('PLBERT_dir', False) + if add_cwd is True: + BERT_path = os.path.join(os.getcwd(), BERT_path) plbert = load_plbert(BERT_path) return global_phonemizer, config, text_aligner, pitch_extractor, plbert @@ -100,13 +110,16 @@ def load_sampler(model:torch.nn.Module)->torch.nn.Module: class StyleTTS: def __init__(self, - config_path:str="./Configs/config.yml", + config_path:str=None, model_path:str=None, language:str="en-us", device:str='cpu', load_from_HF:bool=True, model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"): + add_cwd = False + if config_path is None: add_cwd = True + if load_from_HF is True: if model_path is None: cwd = os.getcwd() @@ -114,7 +127,7 @@ def __init__(self, if not os.path.exists(model_path): os.makedirs(model_path, exist_ok=True) os.system(f"git clone {model_remote_path} {model_path}") - config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml") + config_path = os.path.join("Models", "LibriTTS", "config.yml") model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth") self.model_remote_path = model_remote_path @@ -127,7 +140,9 @@ def __init__(self, self.config, self.text_aligner, self.pitch_extractor, - self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, config_path=self.config_path) + self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, + config_path=self.config_path, + add_cwd=add_cwd) self.model, self.model_params = load_model(weight_path=model_path, From 3fa25c011eb23d7a5174d85410de2acf456c13fe Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 20:06:09 +0100 Subject: [PATCH 09/25] fix path --- api.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/api.py b/api.py index 463e136b..bfc4a86d 100644 --- a/api.py +++ b/api.py @@ -31,17 +31,23 @@ from typing import Tuple, Type, Union from numpy.typing import NDArray import os + +# runs first time after installation only import nltk nltk.download('punkt') +import pathlib +ROOT = pathlib.Path(__file__).parent.resolve() + + def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml", add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True, with_stress=True) if add_cwd is True: - config_path = os.path.join(os.getcwd(), config_path) + config_path = os.path.join(ROOT, config_path) config = yaml.safe_load(open(config_path)) @@ -49,19 +55,19 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us", ASR_config = config.get('ASR_config', False) ASR_path = config.get('ASR_path', False) if add_cwd is True: - ASR_path = os.path.join(os.getcwd(), ASR_path) + ASR_path = os.path.join(ROOT, ASR_path) text_aligner = load_ASR_models(ASR_path, ASR_config) # load pretrained F0 model F0_path = config.get('F0_path', False) if add_cwd is True: - F0_path = os.path.join(os.getcwd(), F0_path) + F0_path = os.path.join(ROOT, F0_path) pitch_extractor = load_F0_models(F0_path) # load BERT model BERT_path = config.get('PLBERT_dir', False) if add_cwd is True: - BERT_path = os.path.join(os.getcwd(), BERT_path) + BERT_path = os.path.join(ROOT, BERT_path) plbert = load_plbert(BERT_path) return global_phonemizer, config, text_aligner, pitch_extractor, plbert @@ -122,8 +128,8 @@ def __init__(self, if load_from_HF is True: if model_path is None: - cwd = os.getcwd() - model_path = os.path.join(cwd,"models_weight") + + model_path = os.path.join(ROOT,"models_weight") if not os.path.exists(model_path): os.makedirs(model_path, exist_ok=True) os.system(f"git clone {model_remote_path} {model_path}") From e229f416258ab5f12e62e7932e6f1a1215c6feff Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 20:12:52 +0100 Subject: [PATCH 10/25] fix config_path --- api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api.py b/api.py index bfc4a86d..98f0bc30 100644 --- a/api.py +++ b/api.py @@ -133,7 +133,7 @@ def __init__(self, if not os.path.exists(model_path): os.makedirs(model_path, exist_ok=True) os.system(f"git clone {model_remote_path} {model_path}") - config_path = os.path.join("Models", "LibriTTS", "config.yml") + config_path = os.path.join("models_weight", "Models", "LibriTTS", "config.yml") model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth") self.model_remote_path = model_remote_path From d804321ca6923e510b45674e72bbbdbf4e8fb962 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 20:33:51 +0100 Subject: [PATCH 11/25] fix ASR_config --- api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api.py b/api.py index 98f0bc30..fb9605c3 100644 --- a/api.py +++ b/api.py @@ -56,6 +56,7 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us", ASR_path = config.get('ASR_path', False) if add_cwd is True: ASR_path = os.path.join(ROOT, ASR_path) + ASR_config = os.path.join(ROOT, ASR_config) text_aligner = load_ASR_models(ASR_path, ASR_config) # load pretrained F0 model From fcd39e7bc832751aa0786ad0caeb013cb7a33822 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 20:48:27 +0100 Subject: [PATCH 12/25] added include_package_data --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 57766c04..634fb3c6 100644 --- a/setup.py +++ b/setup.py @@ -14,5 +14,6 @@ long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", + include_package_data=True, ) \ No newline at end of file From b157d83cdc7176a99ce198d5389d82e3b5ca52e4 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:01:32 +0100 Subject: [PATCH 13/25] try to include configs --- MANIFEST.in | 6 ++++++ setup.py | 5 +++++ 2 files changed, 11 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..ffa0621b --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include **/*.yml +include **/*.yaml +include **/*.json +include **/*.pth +include **/*.t7 +include **/*.txt \ No newline at end of file diff --git a/setup.py b/setup.py index 634fb3c6..52a4c10f 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,15 @@ version='2.0', description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.', license='MIT', + packages=find_packages(), package_dir={'styletts2':'./'}, long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", include_package_data=True, + # package_data = { + # 'static': ['*'], + # 'Potato': ['*.txt'] + # } ) \ No newline at end of file From 758d8ba6da04b4d8664ceaea71aac8b5ec33c091 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:16:54 +0100 Subject: [PATCH 14/25] try to fix configs --- MANIFEST.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index ffa0621b..4266a761 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ -include **/*.yml -include **/*.yaml -include **/*.json -include **/*.pth -include **/*.t7 -include **/*.txt \ No newline at end of file +global-include *.yml +global-include *.yaml +global-include *.json +global-include *.pth +global-include *.t7 +global-include *.txt \ No newline at end of file From 9cb050905f09630535eb8efce09a26dac3d9187e Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:21:41 +0100 Subject: [PATCH 15/25] test --- MANIFEST.in => MANIFEST.del_temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename MANIFEST.in => MANIFEST.del_temp (100%) diff --git a/MANIFEST.in b/MANIFEST.del_temp similarity index 100% rename from MANIFEST.in rename to MANIFEST.del_temp From c5822a32214c6f79860d4ccc1b2770abccee3573 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:27:16 +0100 Subject: [PATCH 16/25] test --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 52a4c10f..48f86628 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", - include_package_data=True, +# include_package_data=True, # package_data = { # 'static': ['*'], # 'Potato': ['*.txt'] From addc30032c37251511c3dde071247c2f9680c743 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:30:22 +0100 Subject: [PATCH 17/25] test --- setup.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 48f86628..34686430 100644 --- a/setup.py +++ b/setup.py @@ -15,10 +15,6 @@ long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", -# include_package_data=True, - # package_data = { - # 'static': ['*'], - # 'Potato': ['*.txt'] - # } + ) \ No newline at end of file From a778061a210783895d552c40d6b197ce6f263d9d Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:37:57 +0100 Subject: [PATCH 18/25] test --- MANIFEST.del_temp | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 MANIFEST.del_temp diff --git a/MANIFEST.del_temp b/MANIFEST.del_temp deleted file mode 100644 index 4266a761..00000000 --- a/MANIFEST.del_temp +++ /dev/null @@ -1,6 +0,0 @@ -global-include *.yml -global-include *.yaml -global-include *.json -global-include *.pth -global-include *.t7 -global-include *.txt \ No newline at end of file From ddf0a2d009117dfa413e113ca4ee0c55486eda92 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:42:13 +0100 Subject: [PATCH 19/25] test --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 34686430..45928d4b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,6 @@ version='2.0', description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.', license='MIT', - packages=find_packages(), package_dir={'styletts2':'./'}, long_description=open('README.md').read(), install_requires=required, From 989a3c8235e7c3f8362173535215531b66846cc8 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:49:45 +0100 Subject: [PATCH 20/25] test --- MANIFEST.in | 6 ++++++ setup.py | 1 + 2 files changed, 7 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..4266a761 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +global-include *.yml +global-include *.yaml +global-include *.json +global-include *.pth +global-include *.t7 +global-include *.txt \ No newline at end of file diff --git a/setup.py b/setup.py index 45928d4b..f237984d 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", + include_package_data=True, ) \ No newline at end of file From 8c224c26c1347f15bf08f0ed0008356a9ba557c5 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 21:55:47 +0100 Subject: [PATCH 21/25] test --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f237984d..7b36cae7 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,10 @@ long_description=open('README.md').read(), install_requires=required, url="https://github.com/yl4579/StyleTTS2.git", - include_package_data=True, +# include_package_data=True, + package_data={ + 'styletts2': ['**/*.txt', '**/*.t7', '**/*.pth', '**/*.json', '**/*.yaml', '**/*.yml'] + } ) \ No newline at end of file From 2ecae58f1756be6d5f15856692e43a58511483a4 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 22:00:36 +0100 Subject: [PATCH 22/25] fix imports --- models.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/models.py b/models.py index 5b5fa487..c7c49ece 100644 --- a/models.py +++ b/models.py @@ -625,7 +625,11 @@ def build_model(args, text_aligner, pitch_extractor, bert): assert args.decoder.type in ['istftnet', 'hifigan'], 'Decoder type unknown' if args.decoder.type == "istftnet": - from Modules.istftnet import Decoder + try: + from .Modules.istftnet import Decoder + except: + from Modules.istftnet import Decoder + decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels, resblock_kernel_sizes = args.decoder.resblock_kernel_sizes, upsample_rates = args.decoder.upsample_rates, @@ -634,7 +638,10 @@ def build_model(args, text_aligner, pitch_extractor, bert): upsample_kernel_sizes=args.decoder.upsample_kernel_sizes, gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size) else: - from Modules.hifigan import Decoder + try: + from .Modules.hifigan import Decoder + except: + from Modules.hifigan import Decoder decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels, resblock_kernel_sizes = args.decoder.resblock_kernel_sizes, upsample_rates = args.decoder.upsample_rates, From 0453a023bcc5b6216f16f6af540c9e8410816a67 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 23:32:59 +0100 Subject: [PATCH 23/25] adding docs --- api.py | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 157 insertions(+), 19 deletions(-) diff --git a/api.py b/api.py index fb9605c3..e2e13b06 100644 --- a/api.py +++ b/api.py @@ -1,3 +1,6 @@ +""" +StyleTTS2 API module. +""" import torch torch.manual_seed(0) torch.backends.cudnn.benchmark = False @@ -44,6 +47,18 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml", add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: + """ + Load the necessary configurations and models for phonemizer, ASR, F0, and BERT. + + Args: + language (str, optional): The language for the phonemizer backend. Defaults to "en-us". + config_path (str, optional): The path to the configuration file. Defaults to "./Configs/config.yml". + add_cwd (bool, optional): Whether to add the current working directory to the paths. This is used to load default models only. Defaults to True. + + Returns: + Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: A tuple containing the global phonemizer, + the configuration dictionary, the text aligner model, the pitch extractor model, and the BERT model. + """ global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True, with_stress=True) if add_cwd is True: @@ -75,7 +90,21 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us", def load_model(weight_path:str, config:dict, text_aligner:torch.nn.Module, pitch_extractor:torch.nn.Module, - plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]: + plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]: + """ + Loads a pre-trained model with the specified weight path and configuration. + + Args: + weight_path (str): The path to the pre-trained model weights. + config (dict): The configuration dictionary for building the model. + text_aligner (torch.nn.Module): The text aligner module. Returned by load_phonemizer_configs_asr_f0_bert. + pitch_extractor (torch.nn.Module): The pitch extractor module. Returned by load_phonemizer_configs_asr_f0_bert. + plbert (torch.nn.Module): The plbert module. Returned by load_phonemizer_configs_asr_f0_bert. + device (str, optional): The device to load the model on. Defaults to 'cpu'. + + Returns: + Tuple[torch.nn.Module, any]: A tuple containing the loaded model and its parameters. + """ model_params = recursive_munch(config['model_params']) model = build_model(model_params, text_aligner, pitch_extractor, plbert) _ = [model[key].eval() for key in model] @@ -106,7 +135,16 @@ def load_model(weight_path:str, config:dict, return model, model_params -def load_sampler(model:torch.nn.Module)->torch.nn.Module: +def load_sampler(model: torch.nn.Module) -> torch.nn.Module: + """ + Loads a diffusion sampler for the given model. + + Args: + model (torch.nn.Module): The model to load the sampler for. Returned by load_model. + + Returns: + torch.nn.Module: The loaded diffusion sampler. + """ sampler = DiffusionSampler(model.diffusion.diffusion, sampler=ADPM2Sampler(), sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters @@ -123,7 +161,17 @@ def __init__(self, device:str='cpu', load_from_HF:bool=True, model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"): - + """ + Initializes the API object for StyleTTS2. + + Args: + config_path (str, optional): Path to the configuration file. Defaults to None. + model_path (str, optional): Path to the model file. Defaults to None. If None, will use LJ Speech model. + language (str, optional): Language code. Defaults to "en-us". More languages will be added in the future with multi language plbert. + device (str, optional): Device to run the model on. Defaults to 'cpu'. + load_from_HF (bool, optional): Whether to load the model from Hugging Face. Defaults to True. + model_remote_path (str, optional): Remote path to the model. Defaults to "https://huggingface.co/yl4579/StyleTTS2-LibriTTS". + """ add_cwd = False if config_path is None: add_cwd = True @@ -150,7 +198,7 @@ def __init__(self, self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, config_path=self.config_path, add_cwd=add_cwd) - + self.model, self.model_params = load_model(weight_path=model_path, config=self.config, @@ -168,16 +216,44 @@ def __init__(self, self.mean, self.std = -4, 4 def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3, - beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: - return self.predict(text=text, - ref_s=ref_s, - alpha=alpha, - beta=beta, - diffusion_steps=diffusion_steps, - embedding_scale=embedding_scale) + beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + """ + Call the model to generate speech with the given input text and optional reference style. wrapper for predict. + + Args: + text (str): The input text for speech generation. + ref_s (NDArray, optional): The reference style for speech generation. Defaults to None. + alpha (float, optional): The weight of the reference style in the generated speech. Defaults to 0.3. + beta (float, optional): The weight of the input text in the generated speech. Defaults to 0.7. + diffusion_steps (float, optional): The number of diffusion steps for speech generation. Defaults to 5. + embedding_scale (float, optional): The scale factor for the input text embedding. Defaults to 1. + + Returns: + NDArray: The generated speech waveform. + """ + return self.predict(text=text, + ref_s=ref_s, + alpha=alpha, + beta=beta, + diffusion_steps=diffusion_steps, + embedding_scale=embedding_scale) def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, - beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray: + """ + Generates speech waveform for the given input text. + + Args: + text (str): The input text to be synthesized. + ref_s (NDArray, optional): Reference speaker embedding. Returned by compute_style. Defaults to None. + alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre). + beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). + diffusion_steps (float, optional): Number of diffusion steps for sampling the speech. Defaults to 5. + embedding_scale (float, optional): Scaling factor for the speaker embedding. Defaults to 1. + + Returns: + NDArray: The generated speech waveform. + """ if ref_s is None: ref_s = self.load_random_ref_s() @@ -198,10 +274,10 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2) s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device), - embedding=bert_dur, - embedding_scale=embedding_scale, - features=ref_s, # reference from the same speaker as the embedding - num_steps=diffusion_steps).squeeze(1) + embedding=bert_dur, + embedding_scale=embedding_scale, + features=ref_s, # reference from the same speaker as the embedding + num_steps=diffusion_steps).squeeze(1) s = s_pred[:, 128:] @@ -245,9 +321,22 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0)) - return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later + return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tensor: + """ + Compute the style representation for the given audio. If path is provided, it will load the audio from the path. + Otherwise, it will use the wave and sr arguments. + + Args: + wave (np.ndarray, optional): Audio waveform. Defaults to None. + sr (int, optional): Sample rate of the audio. Defaults to None. + path (str, optional): Path to the audio file. Defaults to None. + device (str, optional): Device to use for computation. Defaults to 'cpu'. + + Returns: + torch.Tensor: Style representation tensor. + """ if path is not None: wave, sr = librosa.load(path, sr=24000) audio, index = librosa.effects.trim(wave, top_db=30) @@ -267,15 +356,43 @@ def length_to_mask(self, lengths:NDArray)->torch.Tensor: return mask def preprocess(self, wave:NDArray)->torch.Tensor: + """ + Preprocesses the input waveform by converting it to a mel spectrogram tensor. + + Args: + wave (numpy.ndarray): The input waveform. + + Returns: + torch.Tensor: The preprocessed mel spectrogram tensor. + """ wave_tensor = torch.from_numpy(wave).float() mel_tensor = self.to_mel(wave_tensor) mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std return mel_tensor - def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, + def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, t:float=0.7, diffusion_steps:int=5, embedding_scale:int=1)->NDArray: + """ + Predicts the output audio waveform for a given input text and style. + + Args: + text (str): The input text to be synthesized. + s_prev (NDArray): The previous style embedding. + ref_s (NDArray, optional): The reference style embedding. If not provided, a random reference style is loaded. Defaults to None. + alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre). + beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). + t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7. + diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5. + embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1. + + Returns: + NDArray: The output audio waveform. + NDArray: The predicted style embedding. + """ + if ref_s is None: ref_s = self.load_random_ref_s() + text = text.strip() ps = self.global_phonemizer.phonemize([text]) ps = word_tokenize(ps[0]) @@ -354,6 +471,21 @@ def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1, t:float=.7) -> NDArray: + """ + Generates a long audio prediction based on the given text. + + Args: + text (str): The input text to be synthesized. + ref_s (NDArray, optional): The reference style embedding. If not provided, a random reference style is loaded. Defaults to None. + alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre). + beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). + t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7. + diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5. + embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1. + + Returns: + NDArray: The generated audio waveform as a numpy array. + """ if ref_s is None: ref_s = self.load_random_ref_s() sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later) wavs = [] @@ -362,7 +494,7 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, if text.strip() == "": continue text += '.' # add it back - wav, s_prev = self._predict_long_step(text, + wav, s_prev = self.predict_long_step(text, s_prev, ref_s, alpha=alpha, @@ -375,6 +507,12 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, return np.concatenate(wavs, axis=0) def load_random_ref_s(self): + """ + returns a random style embedding. This ruins the result. Use it only for testing. + + Returns: + torch.Tensor: A random style embedding tensor. + """ return torch.randn(1, 256).to(self._device) @property From e07dba583c851f84dde04b810275e65138f508ed Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Fri, 1 Mar 2024 23:39:37 +0100 Subject: [PATCH 24/25] add embedding_scale docs --- api.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/api.py b/api.py index e2e13b06..c48c3660 100644 --- a/api.py +++ b/api.py @@ -226,7 +226,9 @@ def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3, alpha (float, optional): The weight of the reference style in the generated speech. Defaults to 0.3. beta (float, optional): The weight of the input text in the generated speech. Defaults to 0.7. diffusion_steps (float, optional): The number of diffusion steps for speech generation. Defaults to 5. - embedding_scale (float, optional): The scale factor for the input text embedding. Defaults to 1. + embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale. + The higher the scale, the more conditional the style is to the input text and hence more emotional. + Defaults to 1. Returns: NDArray: The generated speech waveform. @@ -249,7 +251,9 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre). beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). diffusion_steps (float, optional): Number of diffusion steps for sampling the speech. Defaults to 5. - embedding_scale (float, optional): Scaling factor for the speaker embedding. Defaults to 1. + embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale. + The higher the scale, the more conditional the style is to the input text and hence more emotional. + Defaults to 1. Returns: NDArray: The generated speech waveform. @@ -372,7 +376,7 @@ def preprocess(self, wave:NDArray)->torch.Tensor: def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, alpha:float=0.3, beta:float=0.7, t:float=0.7, - diffusion_steps:int=5, embedding_scale:int=1)->NDArray: + diffusion_steps:int=5, embedding_scale:float=1)->NDArray: """ Predicts the output audio waveform for a given input text and style. @@ -384,7 +388,9 @@ def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7. diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5. - embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1. + embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale. + The higher the scale, the more conditional the style is to the input text and hence more emotional. + Defaults to 1. Returns: NDArray: The output audio waveform. @@ -481,7 +487,9 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody). t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7. diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5. - embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1. + embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale. + The higher the scale, the more conditional the style is to the input text and hence more emotional. + Defaults to 1. Returns: NDArray: The generated audio waveform as a numpy array. From cc09ac49fbcf54ec8fb355c4b87a6453e48b2b91 Mon Sep 17 00:00:00 2001 From: Mohannad Ehab Barakat Date: Sat, 2 Mar 2024 11:26:50 +0100 Subject: [PATCH 25/25] fix bug in compute_style with wave input --- api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api.py b/api.py index c48c3660..d527de69 100644 --- a/api.py +++ b/api.py @@ -345,7 +345,7 @@ def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tens wave, sr = librosa.load(path, sr=24000) audio, index = librosa.effects.trim(wave, top_db=30) if sr != 24000: - audio = librosa.resample(audio, sr, 24000) + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) mel_tensor = self.preprocess(audio).to(device) with torch.no_grad():