From efe34eacaeb0a69b829e21edb98f130f143f6592 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Thu, 29 Feb 2024 19:59:26 +0100
Subject: [PATCH 01/25] new requirements, api and initial setup

---
 .gitignore               | 164 ++++++++++++++++++
 README.md                |  31 +++-
 Utils/PLBERT/__init__.py |   0
 __init__.py              |   0
 api.py                   | 360 +++++++++++++++++++++++++++++++++++++++
 requirements.txt         |   9 +-
 setup.py                 |  15 ++
 7 files changed, 574 insertions(+), 5 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 Utils/PLBERT/__init__.py
 create mode 100644 __init__.py
 create mode 100644 api.py
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..5a2345f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+test_tmp/
+StyleTTS2-LibriTTS/
+models_weight/
\ No newline at end of file
diff --git a/README.md b/README.md
index 1033fd06..7e0dbc27 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,36 @@ Online demo: [Hugging Face](https://huggingface.co/spaces/styletts2/styletts2) (
 - [x] Test training code for multi-speaker models (VCTK and LibriTTS)
 - [x] Finish demo code for multispeaker model and upload pre-trained models
 - [x] Add a finetuning script for new speakers with base pre-trained multispeaker models
+- [x] Installable styletts2 and easier interface
+- [ ] Add multilingual support
+- [ ] Add simple interface to train new languages and voice tuning
 - [ ] Fix DDP (accelerator) for `train_second.py` **(I have tried everything I could to fix this but had no success, so if you are willing to help, please see [#7](https://github.com/yl4579/StyleTTS2/issues/7))**
 
 ## Pre-requisites
+
+### Install as package
+
+1. Conda new env and lfs (recommended)
+```bash
+conda create -n styletts python=3.9
+conda activate styletts
+git lfs install
+```
+2. Install espeak 
+*Linux*
+```bash
+sudo apt update
+sudo apt install espeak-ng
+```
+*MacOS* build [espeak-ng](https://github.com/espeak-ng/espeak-ng/blob/master/docs/building.md)
+*windows* install [espeak-ng](https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md#windows)
+
+3. install requirements
+```bash
+pip install -r requirements.txt
+```
+
+### If you want to edit the code while development
 1. Python >= 3.7
 2. Clone this repository:
 ```bash
@@ -30,10 +57,6 @@ cd StyleTTS2
 ```bash
 pip install -r requirements.txt
 ```
-On Windows add:
-```bash
-pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -U
-```
 Also install phonemizer and espeak if you want to run the demo:
 ```bash
 pip install phonemizer
diff --git a/Utils/PLBERT/__init__.py b/Utils/PLBERT/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api.py b/api.py
new file mode 100644
index 00000000..10f4a03f
--- /dev/null
+++ b/api.py
@@ -0,0 +1,360 @@
+import torch
+torch.manual_seed(0)
+torch.backends.cudnn.benchmark = False
+torch.backends.cudnn.deterministic = True
+
+import random
+random.seed(0)
+
+import numpy as np
+np.random.seed(0)
+
+# load packages
+import time
+import random
+import yaml
+from munch import Munch
+import numpy as np
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torchaudio
+import librosa
+from nltk.tokenize import word_tokenize
+
+from models import *
+from utils import *
+from text_utils import TextCleaner
+import phonemizer
+from Utils.PLBERT.util import load_plbert
+from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
+from typing import Tuple, Type
+from numpy.typing import NDArray
+import os
+
+
+def load_phonemizer_configs_asr_f0_bert(language:str="en-us")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
+    global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
+
+    config = yaml.safe_load(open("/content/StyleTTS2/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml"))
+
+
+    # load pretrained ASR model
+    ASR_config = config.get('ASR_config', False)
+    ASR_path = config.get('ASR_path', False)
+    text_aligner = load_ASR_models(ASR_path, ASR_config)
+
+    # load pretrained F0 model
+    F0_path = config.get('F0_path', False)
+    pitch_extractor = load_F0_models(F0_path)
+
+    # load BERT model
+    BERT_path = config.get('PLBERT_dir', False)
+    plbert = load_plbert(BERT_path)
+
+    return global_phonemizer, config, text_aligner, pitch_extractor, plbert
+
+def load_model(weight_path:str, config:dict, 
+               text_aligner:torch.nn.Module, pitch_extractor:torch.nn.Module,
+                 plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]:
+    model_params = recursive_munch(config['model_params'])
+    model = build_model(model_params, text_aligner, pitch_extractor, plbert)
+    _ = [model[key].eval() for key in model]
+    _ = [model[key].to(device) for key in model]
+
+    params_whole = torch.load(weight_path, map_location='cpu')
+    params = params_whole['net']
+    
+
+    for key in model:
+        if key in params:
+            print('%s loaded' % key)
+            try:
+                model[key].load_state_dict(params[key])
+            except:
+                from collections import OrderedDict
+                state_dict = params[key]
+                new_state_dict = OrderedDict()
+                for k, v in state_dict.items():
+                    name = k[7:] # remove `module.`
+                    new_state_dict[name] = v
+                # load params
+                model[key].load_state_dict(new_state_dict, strict=False)
+    #             except:
+    #                 _load(params[key], model[key])
+    _ = [model[key].eval() for key in model]
+
+
+    return model, model_params
+
+def load_sampler(model:torch.nn.Module)->torch.nn.Module:
+    sampler = DiffusionSampler(model.diffusion.diffusion,
+                               sampler=ADPM2Sampler(),
+                               sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
+                               clamp=False)
+    
+    return sampler
+
+
+class StyleTTS:
+    def __init__(self, 
+                 config_path:str="./Configs/config.yml", 
+                 model_path:str="./models_weight", 
+                 language:str="en-us", 
+                 device:str='cpu',
+                 load_from_HF:bool=True, 
+                 model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"):
+        
+        if load_from_HF is True:
+            if model_path is None: 
+                cwd = os.getcwd()
+                model_path = os.path.join(cwd,"models_weight")
+                os.makedirs(model_path, exist_ok=True)
+            os.system(f"git clone {model_remote_path} {model_path}")
+
+        self.model_remote_path = model_remote_path
+        self.config_path = config_path
+        self.model_path = model_path
+        self.language = language
+        self._device = device
+
+        (self.global_phonemizer, 
+         self.config, 
+         self.text_aligner, 
+         self.pitch_extractor, 
+         self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language)
+        
+
+        self.model, self.model_params = load_model(weight_path=model_path, 
+                                                   config=self.config, 
+                                                   text_aligner=self.text_aligner, 
+                                                   pitch_extractor=self.pitch_extractor,
+                                                   plbert=self.plbert,
+                                                   device=device)
+        
+        self.sampler = load_sampler(model=self.model)
+
+        self.textclenaer = TextCleaner()
+
+        self.to_mel = torchaudio.transforms.MelSpectrogram(
+            n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
+        self.mean, self.std = -4, 4
+
+    # def __call__(self, text:str, ref_s:NDArray, alpha:float=0.3, 
+    #              beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+    #     return self.predict(text=text,
+    #                         ref_s=ref_s, 
+    #                         alpha=alpha, 
+    #                         beta=beta, 
+    #                         diffusion_steps=diffusion_steps, 
+    #                         embedding_scale=embedding_scale)
+
+    def predict(self, text:str, ref_s:NDArray, alpha:float=0.3, 
+                beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+        text = text.strip()
+        ps = self.global_phonemizer.phonemize([text])
+        ps = word_tokenize(ps[0])
+        ps = ' '.join(ps)
+        tokens = self.textclenaer(ps)
+        tokens.insert(0, 0)
+        tokens = torch.LongTensor(tokens).to(self._device).unsqueeze(0)
+
+        with torch.no_grad():
+            input_lengths = torch.LongTensor([tokens.shape[-1]]).to(self._device)
+            text_mask = self.length_to_mask(input_lengths).to(self._device)
+
+            t_en = self.model.text_encoder(tokens, input_lengths, text_mask)
+            bert_dur = self.model.bert(tokens, attention_mask=(~text_mask).int())
+            d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
+
+            s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device),
+                                  embedding=bert_dur,
+                                  embedding_scale=embedding_scale,
+                                  features=ref_s, # reference from the same speaker as the embedding
+                                  num_steps=diffusion_steps).squeeze(1)
+
+
+            s = s_pred[:, 128:]
+            ref = s_pred[:, :128]
+
+            ref = alpha * ref + (1 - alpha)  * ref_s[:, :128]
+            s = beta * s + (1 - beta)  * ref_s[:, 128:]
+
+            d = self.model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+
+            x, _ = self.model.predictor.lstm(d)
+            duration = self.model.predictor.duration_proj(x)
+
+            duration = torch.sigmoid(duration).sum(axis=-1)
+            pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+
+
+            pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
+            c_frame = 0
+            for i in range(pred_aln_trg.size(0)):
+                pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
+                c_frame += int(pred_dur[i].data)
+
+            # encode prosody
+            en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(self._device))
+            if self.model_params.decoder.type == "hifigan":
+                asr_new = torch.zeros_like(en)
+                asr_new[:, :, 0] = en[:, :, 0]
+                asr_new[:, :, 1:] = en[:, :, 0:-1]
+                en = asr_new
+
+            F0_pred, N_pred = self.model.predictor.F0Ntrain(en, s)
+
+            asr = (t_en @ pred_aln_trg.unsqueeze(0).to(self._device))
+            if self.model_params.decoder.type == "hifigan":
+                asr_new = torch.zeros_like(asr)
+                asr_new[:, :, 0] = asr[:, :, 0]
+                asr_new[:, :, 1:] = asr[:, :, 0:-1]
+                asr = asr_new
+
+            out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+
+
+        return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later 
+
+    def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tensor:
+        if path is not None:
+            wave, sr = librosa.load(path, sr=24000)
+        audio, index = librosa.effects.trim(wave, top_db=30)
+        if sr != 24000:
+            audio = librosa.resample(audio, sr, 24000)
+        mel_tensor = self.preprocess(audio).to(device)
+
+        with torch.no_grad():
+            ref_s = self.model.style_encoder(mel_tensor.unsqueeze(1))
+            ref_p = self.model.predictor_encoder(mel_tensor.unsqueeze(1))
+
+        return torch.cat([ref_s, ref_p], dim=1)
+    
+    def length_to_mask(self, lengths:NDArray)->torch.Tensor:
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+    def preprocess(self, wave:NDArray)->torch.Tensor:
+        wave_tensor = torch.from_numpy(wave).float()
+        mel_tensor = self.to_mel(wave_tensor)
+        mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std
+        return mel_tensor
+
+    def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray, 
+                           alpha:float=0.3, beta:float=0.7, t:float=0.7, 
+                           diffusion_steps:int=5, embedding_scale:int=1)->NDArray:
+        text = text.strip()
+        ps = self.global_phonemizer.phonemize([text])
+        ps = word_tokenize(ps[0])
+        ps = ' '.join(ps)
+        ps = ps.replace('``', '"')
+        ps = ps.replace("''", '"')
+
+        tokens = self.textclenaer(ps)
+        tokens.insert(0, 0)
+        tokens = torch.LongTensor(tokens).to(self._device).unsqueeze(0)
+
+        with torch.no_grad():
+            input_lengths = torch.LongTensor([tokens.shape[-1]]).to(self._device)
+            text_mask = self.length_to_mask(input_lengths).to(self._device)
+
+            t_en = self.model.text_encoder(tokens, input_lengths, text_mask)
+            bert_dur = self.model.bert(tokens, attention_mask=(~text_mask).int())
+            d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
+
+            s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device),
+                                            embedding=bert_dur,
+                                            embedding_scale=embedding_scale,
+                                                features=ref_s, # reference from the same speaker as the embedding
+                                                num_steps=diffusion_steps).squeeze(1)
+
+            if s_prev is not None:
+                # convex combination of previous and current style
+                s_pred = t * s_prev + (1 - t) * s_pred
+
+            s = s_pred[:, 128:]
+            ref = s_pred[:, :128]
+
+            ref = alpha * ref + (1 - alpha)  * ref_s[:, :128]
+            s = beta * s + (1 - beta)  * ref_s[:, 128:]
+
+            s_pred = torch.cat([ref, s], dim=-1)
+
+            d = self.model.predictor.text_encoder(d_en,
+                                            s, input_lengths, text_mask)
+
+            x, _ = self.model.predictor.lstm(d)
+            duration = self.model.predictor.duration_proj(x)
+
+            duration = torch.sigmoid(duration).sum(axis=-1)
+            pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+
+
+            pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
+            c_frame = 0
+            for i in range(pred_aln_trg.size(0)):
+                pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
+                c_frame += int(pred_dur[i].data)
+
+            # encode prosody
+            en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(self._device))
+            if self.model_params.decoder.type == "hifigan":
+                asr_new = torch.zeros_like(en)
+                asr_new[:, :, 0] = en[:, :, 0]
+                asr_new[:, :, 1:] = en[:, :, 0:-1]
+                en = asr_new
+
+            F0_pred, N_pred = self.model.predictor.F0Ntrain(en, s)
+
+            asr = (t_en @ pred_aln_trg.unsqueeze(0).to(self._device))
+            if self.model_params.decoder.type == "hifigan":
+                asr_new = torch.zeros_like(asr)
+                asr_new[:, :, 0] = asr[:, :, 0]
+                asr_new[:, :, 1:] = asr[:, :, 0:-1]
+                asr = asr_new
+
+            out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+
+
+        return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later
+    
+    def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3, 
+                     beta:float=0.7, diffusion_steps:float=5, 
+                     embedding_scale:float=1, t:float=.7) -> NDArray:
+        sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later)
+        wavs = []
+        s_prev = None
+        for text in sentences:
+            if text.strip() == "": continue
+            text += '.' # add it back
+
+            wav, s_prev = self._predict_long_step(text,
+                                                  s_prev,
+                                                  ref_s,
+                                                  alpha=alpha,
+                                                  beta=beta,  # make it more suitable for the text
+                                                  t=t,
+                                                  diffusion_steps=diffusion_steps, 
+                                                  embedding_scale=embedding_scale)
+            wavs.append(wav)
+
+    @property
+    def device(self):
+        return self._device
+    
+    @device.setter
+    def device(self, device:str):
+        self._device = device
+
+    def to(self, device:str):
+        self.device = device
+
+
+if __name__ == "__main__":
+    print(StyleTTS)
+    stts = StyleTTS()
+
+
+    
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8b8d1122..3da0762d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 SoundFile
 torchaudio
+torchvision
 munch
 torch
 pydub
@@ -14,4 +15,10 @@ einops-exts
 tqdm
 typing
 typing-extensions
-git+https://github.com/resemble-ai/monotonic_align.git
\ No newline at end of file
+git+https://github.com/resemble-ai/monotonic_align.git
+phonemizer
+
+
+torch --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
+torchvision --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
+torchaudio --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..783be4d8
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,15 @@
+import os
+from setuptools import setup, find_packages
+
+
+setup(
+   name='StyleTTS2',
+   version='2.0',
+   description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.',
+   license='MIT',
+   package_dir={'styletts2':'./'},
+   long_description=open('README.md').read(),
+   packages=find_packages(),
+   install_requires=find_packages(),
+   url="https://github.com/yl4579/StyleTTS2.git",
+)
\ No newline at end of file

From a41adc8d6b53d17e8a8315ade2d0fbb69ab067ac Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 11:16:52 +0000
Subject: [PATCH 02/25] adding better api

---
 api.py | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/api.py b/api.py
index 10f4a03f..82210daf 100644
--- a/api.py
+++ b/api.py
@@ -28,15 +28,15 @@
 import phonemizer
 from Utils.PLBERT.util import load_plbert
 from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
-from typing import Tuple, Type
+from typing import Tuple, Type, Union
 from numpy.typing import NDArray
 import os
 
 
-def load_phonemizer_configs_asr_f0_bert(language:str="en-us")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
+def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
     global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
 
-    config = yaml.safe_load(open("/content/StyleTTS2/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml"))
+    config = yaml.safe_load(open(config_path))
 
 
     # load pretrained ASR model
@@ -111,6 +111,8 @@ def __init__(self,
                 model_path = os.path.join(cwd,"models_weight")
                 os.makedirs(model_path, exist_ok=True)
             os.system(f"git clone {model_remote_path} {model_path}")
+            config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml")
+            model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth")
 
         self.model_remote_path = model_remote_path
         self.config_path = config_path
@@ -122,7 +124,7 @@ def __init__(self,
          self.config, 
          self.text_aligner, 
          self.pitch_extractor, 
-         self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language)
+         self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, config_path=self.config_path)
         
 
         self.model, self.model_params = load_model(weight_path=model_path, 
@@ -140,17 +142,20 @@ def __init__(self,
             n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
         self.mean, self.std = -4, 4
 
-    # def __call__(self, text:str, ref_s:NDArray, alpha:float=0.3, 
-    #              beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
-    #     return self.predict(text=text,
-    #                         ref_s=ref_s, 
-    #                         alpha=alpha, 
-    #                         beta=beta, 
-    #                         diffusion_steps=diffusion_steps, 
-    #                         embedding_scale=embedding_scale)
+    def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
+                 beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+        return self.predict(text=text, 
+                            ref_s=ref_s, 
+                            alpha=alpha, 
+                            beta=beta, 
+                            diffusion_steps=diffusion_steps, 
+                            embedding_scale=embedding_scale)
 
-    def predict(self, text:str, ref_s:NDArray, alpha:float=0.3, 
+    def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
                 beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+        
+        if ref_s is None: ref_s = self.load_random_ref_s()
+
         text = text.strip()
         ps = self.global_phonemizer.phonemize([text])
         ps = word_tokenize(ps[0])
@@ -242,9 +247,10 @@ def preprocess(self, wave:NDArray)->torch.Tensor:
         mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std
         return mel_tensor
 
-    def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray, 
+    def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, 
                            alpha:float=0.3, beta:float=0.7, t:float=0.7, 
                            diffusion_steps:int=5, embedding_scale:int=1)->NDArray:
+        if ref_s is None: ref_s = self.load_random_ref_s()
         text = text.strip()
         ps = self.global_phonemizer.phonemize([text])
         ps = word_tokenize(ps[0])
@@ -320,9 +326,10 @@ def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray,
 
         return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later
     
-    def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3, 
+    def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
                      beta:float=0.7, diffusion_steps:float=5, 
                      embedding_scale:float=1, t:float=.7) -> NDArray:
+        if ref_s is None: ref_s = self.load_random_ref_s()
         sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later)
         wavs = []
         s_prev = None
@@ -340,6 +347,9 @@ def predict_long(self, text:str, ref_s:NDArray, alpha:float=0.3,
                                                   embedding_scale=embedding_scale)
             wavs.append(wav)
 
+    def load_random_ref_s(self):
+        return torch.randn(1, 256).to(self._device)
+    
     @property
     def device(self):
         return self._device
@@ -355,6 +365,11 @@ def to(self, device:str):
 if __name__ == "__main__":
     print(StyleTTS)
     stts = StyleTTS()
-
+    sr = 24000
+    wave = np.random.randn(sr*10)
+    # print(wave.shape)
+    # print(stts.compute_style(wave=wave, sr=sr, path=None, device='cpu').shape)
+    # print(stts.load_random_ref_s().shape)
+    assert stts("read this in a random voice") is not None
 
     
\ No newline at end of file

From fb358f94d48fdbd64b7b60c0017d1d9151448198 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 11:18:09 +0000
Subject: [PATCH 03/25] NLTK auto install and cli scripts

---
 requirements.txt      | 11 ++++++-----
 scripts/cli           | 14 ++++++++++++++
 scripts/install       |  4 ++++
 scripts/nltk_download |  4 ++++
 4 files changed, 28 insertions(+), 5 deletions(-)
 create mode 100644 scripts/cli
 create mode 100644 scripts/install
 create mode 100644 scripts/nltk_download

diff --git a/requirements.txt b/requirements.txt
index 3da0762d..cdaad136 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-SoundFile
+soundfile
 torchaudio
 torchvision
 munch
@@ -15,10 +15,11 @@ einops-exts
 tqdm
 typing
 typing-extensions
-git+https://github.com/resemble-ai/monotonic_align.git
+monotonic_align @ git+https://github.com/resemble-ai/monotonic_align.git
 phonemizer
 
 
-torch --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
-torchvision --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
-torchaudio --index-url https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
\ No newline at end of file
+
+torch @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
+torchvision @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
+torchaudio @ https://download.pytorch.org/whl/cu118 ; sys_platform == 'win32'
\ No newline at end of file
diff --git a/scripts/cli b/scripts/cli
new file mode 100644
index 00000000..ed4a4aa6
--- /dev/null
+++ b/scripts/cli
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+git lfs install
+
+if [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
+    sudo apt update
+    sudo apt install espeak-ng
+# elif [ "$(uname)" == "Darwin" ]; then
+#     # Do something under Mac OS X platform        
+# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then
+#     # Do something under 32 bits Windows NT platform
+# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then
+#     # Do something under 64 bits Windows NT platform
+fi
\ No newline at end of file
diff --git a/scripts/install b/scripts/install
new file mode 100644
index 00000000..ceb69c11
--- /dev/null
+++ b/scripts/install
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+pip install -r ../requirements.txt
+
+mkdir del
\ No newline at end of file
diff --git a/scripts/nltk_download b/scripts/nltk_download
new file mode 100644
index 00000000..b27d67a0
--- /dev/null
+++ b/scripts/nltk_download
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+
+import nltk
+nltk.download('punkt')
\ No newline at end of file

From c3b6e58a0980dee5b778e5b4ab279d6b1e246ea3 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 11:27:25 +0000
Subject: [PATCH 04/25] installable

---
 setup.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 783be4d8..f990694d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,21 @@
 import os
 from setuptools import setup, find_packages
 
+with open('requirements.txt') as f:
+    required = f.read().splitlines()
+
 
 setup(
    name='StyleTTS2',
    version='2.0',
    description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.',
    license='MIT',
-   package_dir={'styletts2':'./'},
+   package_dir={'styletts2':'src'},
    long_description=open('README.md').read(),
-   packages=find_packages(),
-   install_requires=find_packages(),
+   install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
+   scripts=[
+            './scripts/nltk_download',
+            './scripts/cli',
+            ],
 )
\ No newline at end of file

From de3a4c780194a97127414068b9fc6984ae9a085c Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 13:37:52 +0000
Subject: [PATCH 05/25] working api

---
 api.py                | 27 ++++++++++++++++-----------
 scripts/cli           | 14 --------------
 scripts/install       |  4 ----
 scripts/nltk_download |  4 ----
 setup.py              |  9 +++------
 5 files changed, 19 insertions(+), 39 deletions(-)
 delete mode 100644 scripts/cli
 delete mode 100644 scripts/install
 delete mode 100644 scripts/nltk_download

diff --git a/api.py b/api.py
index 82210daf..492a8268 100644
--- a/api.py
+++ b/api.py
@@ -31,6 +31,8 @@
 from typing import Tuple, Type, Union
 from numpy.typing import NDArray
 import os
+import nltk
+nltk.download('punkt')
 
 
 def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
@@ -68,7 +70,7 @@ def load_model(weight_path:str, config:dict,
 
     for key in model:
         if key in params:
-            print('%s loaded' % key)
+            # print('%s loaded' % key)
             try:
                 model[key].load_state_dict(params[key])
             except:
@@ -99,7 +101,7 @@ def load_sampler(model:torch.nn.Module)->torch.nn.Module:
 class StyleTTS:
     def __init__(self, 
                  config_path:str="./Configs/config.yml", 
-                 model_path:str="./models_weight", 
+                 model_path:str=None, 
                  language:str="en-us", 
                  device:str='cpu',
                  load_from_HF:bool=True, 
@@ -109,10 +111,11 @@ def __init__(self,
             if model_path is None: 
                 cwd = os.getcwd()
                 model_path = os.path.join(cwd,"models_weight")
-                os.makedirs(model_path, exist_ok=True)
-            os.system(f"git clone {model_remote_path} {model_path}")
-            config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml")
-            model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth")
+                if not os.path.exists(model_path):
+                    os.makedirs(model_path, exist_ok=True)
+                    os.system(f"git clone {model_remote_path} {model_path}")
+                config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml")
+                model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth")
 
         self.model_remote_path = model_remote_path
         self.config_path = config_path
@@ -347,6 +350,8 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
                                                   embedding_scale=embedding_scale)
             wavs.append(wav)
 
+        return np.concatenate(wavs, axis=0)
+
     def load_random_ref_s(self):
         return torch.randn(1, 256).to(self._device)
     
@@ -363,13 +368,13 @@ def to(self, device:str):
 
 
 if __name__ == "__main__":
-    print(StyleTTS)
     stts = StyleTTS()
     sr = 24000
     wave = np.random.randn(sr*10)
-    # print(wave.shape)
-    # print(stts.compute_style(wave=wave, sr=sr, path=None, device='cpu').shape)
-    # print(stts.load_random_ref_s().shape)
-    assert stts("read this in a random voice") is not None
+    
+    print(stts("read this in a random voice").shape)
+    print(stts.predict("read this in a random voice").shape)
+    print(stts.predict_long("simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later)").shape)
+
 
     
\ No newline at end of file
diff --git a/scripts/cli b/scripts/cli
deleted file mode 100644
index ed4a4aa6..00000000
--- a/scripts/cli
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-git lfs install
-
-if [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
-    sudo apt update
-    sudo apt install espeak-ng
-# elif [ "$(uname)" == "Darwin" ]; then
-#     # Do something under Mac OS X platform        
-# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then
-#     # Do something under 32 bits Windows NT platform
-# elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then
-#     # Do something under 64 bits Windows NT platform
-fi
\ No newline at end of file
diff --git a/scripts/install b/scripts/install
deleted file mode 100644
index ceb69c11..00000000
--- a/scripts/install
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-pip install -r ../requirements.txt
-
-mkdir del
\ No newline at end of file
diff --git a/scripts/nltk_download b/scripts/nltk_download
deleted file mode 100644
index b27d67a0..00000000
--- a/scripts/nltk_download
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env python
-
-import nltk
-nltk.download('punkt')
\ No newline at end of file
diff --git a/setup.py b/setup.py
index f990694d..57766c04 100644
--- a/setup.py
+++ b/setup.py
@@ -1,21 +1,18 @@
 import os
+from sys import platform
 from setuptools import setup, find_packages
 
 with open('requirements.txt') as f:
     required = f.read().splitlines()
 
-
 setup(
    name='StyleTTS2',
    version='2.0',
    description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.',
    license='MIT',
-   package_dir={'styletts2':'src'},
+   package_dir={'styletts2':'./'},
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
-   scripts=[
-            './scripts/nltk_download',
-            './scripts/cli',
-            ],
+   
 )
\ No newline at end of file

From de90d59ba2c65b6a43bba962e7a28a248a80f230 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 13:39:54 +0000
Subject: [PATCH 06/25] fix install

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7e0dbc27..f7263739 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ sudo apt install espeak-ng
 
 3. install requirements
 ```bash
-pip install -r requirements.txt
+pip install git+https://github.com/yl4579/StyleTTS2.git
 ```
 
 ### If you want to edit the code while development

From d2baf0a440496a13ea1be7b448a17a347c995808 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 19:44:16 +0100
Subject: [PATCH 07/25] fix imports

---
 api.py    | 10 +++++-----
 models.py | 22 ++++++++++++++++------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/api.py b/api.py
index 492a8268..c1f3cd9a 100644
--- a/api.py
+++ b/api.py
@@ -22,12 +22,12 @@
 import librosa
 from nltk.tokenize import word_tokenize
 
-from models import *
-from utils import *
-from text_utils import TextCleaner
+from .models import *
+from .utils import *
+from .text_utils import TextCleaner
 import phonemizer
-from Utils.PLBERT.util import load_plbert
-from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
+from .Utils.PLBERT.util import load_plbert
+from .Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
 from typing import Tuple, Type, Union
 from numpy.typing import NDArray
 import os
diff --git a/models.py b/models.py
index 84bbb03d..5b5fa487 100644
--- a/models.py
+++ b/models.py
@@ -12,14 +12,24 @@
 import torch.nn.functional as F
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
-from Utils.ASR.models import ASRCNN
-from Utils.JDC.model import JDCNet
+try:
+    from .Utils.ASR.models import ASRCNN
+    from .Utils.JDC.model import JDCNet
 
-from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
-from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
-from Modules.diffusion.diffusion import AudioDiffusionConditional
+    from .Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
+    from .Modules.diffusion.modules import Transformer1d, StyleTransformer1d
+    from .Modules.diffusion.diffusion import AudioDiffusionConditional
 
-from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator
+    from .Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator
+except:
+    from Utils.ASR.models import ASRCNN
+    from Utils.JDC.model import JDCNet
+
+    from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
+    from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
+    from Modules.diffusion.diffusion import AudioDiffusionConditional
+
+    from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator
 
 from munch import Munch
 import yaml

From 5cbfe800d0304cecc4d547df5be4be52ed393315 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 19:58:53 +0100
Subject: [PATCH 08/25] add_cwd

---
 api.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/api.py b/api.py
index c1f3cd9a..463e136b 100644
--- a/api.py
+++ b/api.py
@@ -35,23 +35,33 @@
 nltk.download('punkt')
 
 
-def load_phonemizer_configs_asr_f0_bert(language:str="en-us", config_path:str="./Configs/config.yml")->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
+def load_phonemizer_configs_asr_f0_bert(language:str="en-us", 
+                                        config_path:str="./Configs/config.yml",
+                                        add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
     global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
-
+    
+    if add_cwd is True:
+        config_path = os.path.join(os.getcwd(), config_path)
     config = yaml.safe_load(open(config_path))
 
 
     # load pretrained ASR model
     ASR_config = config.get('ASR_config', False)
     ASR_path = config.get('ASR_path', False)
+    if add_cwd is True:
+        ASR_path = os.path.join(os.getcwd(), ASR_path)
     text_aligner = load_ASR_models(ASR_path, ASR_config)
 
     # load pretrained F0 model
     F0_path = config.get('F0_path', False)
+    if add_cwd is True:
+        F0_path = os.path.join(os.getcwd(), F0_path)
     pitch_extractor = load_F0_models(F0_path)
 
     # load BERT model
     BERT_path = config.get('PLBERT_dir', False)
+    if add_cwd is True:
+        BERT_path = os.path.join(os.getcwd(), BERT_path)
     plbert = load_plbert(BERT_path)
 
     return global_phonemizer, config, text_aligner, pitch_extractor, plbert
@@ -100,13 +110,16 @@ def load_sampler(model:torch.nn.Module)->torch.nn.Module:
 
 class StyleTTS:
     def __init__(self, 
-                 config_path:str="./Configs/config.yml", 
+                 config_path:str=None, 
                  model_path:str=None, 
                  language:str="en-us", 
                  device:str='cpu',
                  load_from_HF:bool=True, 
                  model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"):
         
+        add_cwd = False
+        if config_path is None: add_cwd = True
+
         if load_from_HF is True:
             if model_path is None: 
                 cwd = os.getcwd()
@@ -114,7 +127,7 @@ def __init__(self,
                 if not os.path.exists(model_path):
                     os.makedirs(model_path, exist_ok=True)
                     os.system(f"git clone {model_remote_path} {model_path}")
-                config_path = os.path.join(model_path, "Models", "LibriTTS", "config.yml")
+                config_path = os.path.join("Models", "LibriTTS", "config.yml")
                 model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth")
 
         self.model_remote_path = model_remote_path
@@ -127,7 +140,9 @@ def __init__(self,
          self.config, 
          self.text_aligner, 
          self.pitch_extractor, 
-         self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, config_path=self.config_path)
+         self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, 
+                                                            config_path=self.config_path, 
+                                                            add_cwd=add_cwd)
         
 
         self.model, self.model_params = load_model(weight_path=model_path, 

From 3fa25c011eb23d7a5174d85410de2acf456c13fe Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 20:06:09 +0100
Subject: [PATCH 09/25] fix path

---
 api.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/api.py b/api.py
index 463e136b..bfc4a86d 100644
--- a/api.py
+++ b/api.py
@@ -31,17 +31,23 @@
 from typing import Tuple, Type, Union
 from numpy.typing import NDArray
 import os
+
+# runs first time after installation only
 import nltk
 nltk.download('punkt')
 
 
+import pathlib
+ROOT = pathlib.Path(__file__).parent.resolve()
+
+
 def load_phonemizer_configs_asr_f0_bert(language:str="en-us", 
                                         config_path:str="./Configs/config.yml",
                                         add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
     global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
     
     if add_cwd is True:
-        config_path = os.path.join(os.getcwd(), config_path)
+        config_path = os.path.join(ROOT, config_path)
     config = yaml.safe_load(open(config_path))
 
 
@@ -49,19 +55,19 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us",
     ASR_config = config.get('ASR_config', False)
     ASR_path = config.get('ASR_path', False)
     if add_cwd is True:
-        ASR_path = os.path.join(os.getcwd(), ASR_path)
+        ASR_path = os.path.join(ROOT, ASR_path)
     text_aligner = load_ASR_models(ASR_path, ASR_config)
 
     # load pretrained F0 model
     F0_path = config.get('F0_path', False)
     if add_cwd is True:
-        F0_path = os.path.join(os.getcwd(), F0_path)
+        F0_path = os.path.join(ROOT, F0_path)
     pitch_extractor = load_F0_models(F0_path)
 
     # load BERT model
     BERT_path = config.get('PLBERT_dir', False)
     if add_cwd is True:
-        BERT_path = os.path.join(os.getcwd(), BERT_path)
+        BERT_path = os.path.join(ROOT, BERT_path)
     plbert = load_plbert(BERT_path)
 
     return global_phonemizer, config, text_aligner, pitch_extractor, plbert
@@ -122,8 +128,8 @@ def __init__(self,
 
         if load_from_HF is True:
             if model_path is None: 
-                cwd = os.getcwd()
-                model_path = os.path.join(cwd,"models_weight")
+                
+                model_path = os.path.join(ROOT,"models_weight")
                 if not os.path.exists(model_path):
                     os.makedirs(model_path, exist_ok=True)
                     os.system(f"git clone {model_remote_path} {model_path}")

From e229f416258ab5f12e62e7932e6f1a1215c6feff Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 20:12:52 +0100
Subject: [PATCH 10/25] fix config_path

---
 api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api.py b/api.py
index bfc4a86d..98f0bc30 100644
--- a/api.py
+++ b/api.py
@@ -133,7 +133,7 @@ def __init__(self,
                 if not os.path.exists(model_path):
                     os.makedirs(model_path, exist_ok=True)
                     os.system(f"git clone {model_remote_path} {model_path}")
-                config_path = os.path.join("Models", "LibriTTS", "config.yml")
+                config_path = os.path.join("models_weight", "Models", "LibriTTS", "config.yml")
                 model_path = os.path.join(model_path, "Models", "LibriTTS", "epochs_2nd_00020.pth")
 
         self.model_remote_path = model_remote_path

From d804321ca6923e510b45674e72bbbdbf4e8fb962 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 20:33:51 +0100
Subject: [PATCH 11/25] fix ASR_config

---
 api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/api.py b/api.py
index 98f0bc30..fb9605c3 100644
--- a/api.py
+++ b/api.py
@@ -56,6 +56,7 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us",
     ASR_path = config.get('ASR_path', False)
     if add_cwd is True:
         ASR_path = os.path.join(ROOT, ASR_path)
+        ASR_config = os.path.join(ROOT, ASR_config)
     text_aligner = load_ASR_models(ASR_path, ASR_config)
 
     # load pretrained F0 model

From fcd39e7bc832751aa0786ad0caeb013cb7a33822 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 20:48:27 +0100
Subject: [PATCH 12/25] added include_package_data

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 57766c04..634fb3c6 100644
--- a/setup.py
+++ b/setup.py
@@ -14,5 +14,6 @@
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
+   include_package_data=True,
    
 )
\ No newline at end of file

From b157d83cdc7176a99ce198d5389d82e3b5ca52e4 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:01:32 +0100
Subject: [PATCH 13/25] try to include configs

---
 MANIFEST.in | 6 ++++++
 setup.py    | 5 +++++
 2 files changed, 11 insertions(+)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..ffa0621b
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+include **/*.yml
+include **/*.yaml
+include **/*.json
+include **/*.pth
+include **/*.t7
+include **/*.txt
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 634fb3c6..52a4c10f 100644
--- a/setup.py
+++ b/setup.py
@@ -10,10 +10,15 @@
    version='2.0',
    description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.',
    license='MIT',
+   packages=find_packages(),
    package_dir={'styletts2':'./'},
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
    include_package_data=True,
+    # package_data = {
+    #     'static': ['*'],
+    #     'Potato': ['*.txt']
+    # }
    
 )
\ No newline at end of file

From 758d8ba6da04b4d8664ceaea71aac8b5ec33c091 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:16:54 +0100
Subject: [PATCH 14/25] try to fix configs

---
 MANIFEST.in | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index ffa0621b..4266a761 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,6 @@
-include **/*.yml
-include **/*.yaml
-include **/*.json
-include **/*.pth
-include **/*.t7
-include **/*.txt
\ No newline at end of file
+global-include *.yml
+global-include *.yaml
+global-include *.json
+global-include *.pth
+global-include *.t7
+global-include *.txt
\ No newline at end of file

From 9cb050905f09630535eb8efce09a26dac3d9187e Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:21:41 +0100
Subject: [PATCH 15/25] test

---
 MANIFEST.in => MANIFEST.del_temp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename MANIFEST.in => MANIFEST.del_temp (100%)

diff --git a/MANIFEST.in b/MANIFEST.del_temp
similarity index 100%
rename from MANIFEST.in
rename to MANIFEST.del_temp

From c5822a32214c6f79860d4ccc1b2770abccee3573 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:27:16 +0100
Subject: [PATCH 16/25] test

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 52a4c10f..48f86628 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
-   include_package_data=True,
+#    include_package_data=True,
     # package_data = {
     #     'static': ['*'],
     #     'Potato': ['*.txt']

From addc30032c37251511c3dde071247c2f9680c743 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:30:22 +0100
Subject: [PATCH 17/25] test

---
 setup.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 48f86628..34686430 100644
--- a/setup.py
+++ b/setup.py
@@ -15,10 +15,6 @@
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
-#    include_package_data=True,
-    # package_data = {
-    #     'static': ['*'],
-    #     'Potato': ['*.txt']
-    # }
+
    
 )
\ No newline at end of file

From a778061a210783895d552c40d6b197ce6f263d9d Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:37:57 +0100
Subject: [PATCH 18/25] test

---
 MANIFEST.del_temp | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 MANIFEST.del_temp

diff --git a/MANIFEST.del_temp b/MANIFEST.del_temp
deleted file mode 100644
index 4266a761..00000000
--- a/MANIFEST.del_temp
+++ /dev/null
@@ -1,6 +0,0 @@
-global-include *.yml
-global-include *.yaml
-global-include *.json
-global-include *.pth
-global-include *.t7
-global-include *.txt
\ No newline at end of file

From ddf0a2d009117dfa413e113ca4ee0c55486eda92 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:42:13 +0100
Subject: [PATCH 19/25] test

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 34686430..45928d4b 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,6 @@
    version='2.0',
    description='A text-to-speech (TTS) model that leverages style diffusion and adversarial training with large speech language models (SLMs) to achieve human-level TTS synthesis.',
    license='MIT',
-   packages=find_packages(),
    package_dir={'styletts2':'./'},
    long_description=open('README.md').read(),
    install_requires=required,

From 989a3c8235e7c3f8362173535215531b66846cc8 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:49:45 +0100
Subject: [PATCH 20/25] test

---
 MANIFEST.in | 6 ++++++
 setup.py    | 1 +
 2 files changed, 7 insertions(+)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..4266a761
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+global-include *.yml
+global-include *.yaml
+global-include *.json
+global-include *.pth
+global-include *.t7
+global-include *.txt
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 45928d4b..f237984d 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
+   include_package_data=True,
 
    
 )
\ No newline at end of file

From 8c224c26c1347f15bf08f0ed0008356a9ba557c5 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 21:55:47 +0100
Subject: [PATCH 21/25] test

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f237984d..7b36cae7 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,10 @@
    long_description=open('README.md').read(),
    install_requires=required,
    url="https://github.com/yl4579/StyleTTS2.git",
-   include_package_data=True,
+#    include_package_data=True,
+    package_data={
+        'styletts2': ['**/*.txt', '**/*.t7', '**/*.pth', '**/*.json', '**/*.yaml', '**/*.yml']
+    }
 
    
 )
\ No newline at end of file

From 2ecae58f1756be6d5f15856692e43a58511483a4 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 22:00:36 +0100
Subject: [PATCH 22/25] fix imports

---
 models.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/models.py b/models.py
index 5b5fa487..c7c49ece 100644
--- a/models.py
+++ b/models.py
@@ -625,7 +625,11 @@ def build_model(args, text_aligner, pitch_extractor, bert):
     assert args.decoder.type in ['istftnet', 'hifigan'], 'Decoder type unknown'
     
     if args.decoder.type == "istftnet":
-        from Modules.istftnet import Decoder
+        try:
+            from .Modules.istftnet import Decoder
+        except:
+            from Modules.istftnet import Decoder
+
         decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
                 resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
                 upsample_rates = args.decoder.upsample_rates,
@@ -634,7 +638,10 @@ def build_model(args, text_aligner, pitch_extractor, bert):
                 upsample_kernel_sizes=args.decoder.upsample_kernel_sizes, 
                 gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size) 
     else:
-        from Modules.hifigan import Decoder
+        try:
+            from .Modules.hifigan import Decoder
+        except:
+            from Modules.hifigan import Decoder
         decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
                 resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
                 upsample_rates = args.decoder.upsample_rates,

From 0453a023bcc5b6216f16f6af540c9e8410816a67 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 23:32:59 +0100
Subject: [PATCH 23/25] adding docs

---
 api.py | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 157 insertions(+), 19 deletions(-)

diff --git a/api.py b/api.py
index fb9605c3..e2e13b06 100644
--- a/api.py
+++ b/api.py
@@ -1,3 +1,6 @@
+"""
+StyleTTS2 API module.
+"""
 import torch
 torch.manual_seed(0)
 torch.backends.cudnn.benchmark = False
@@ -44,6 +47,18 @@
 def load_phonemizer_configs_asr_f0_bert(language:str="en-us", 
                                         config_path:str="./Configs/config.yml",
                                         add_cwd:bool=True)->Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]:
+    """
+    Load the necessary configurations and models for phonemizer, ASR, F0, and BERT.
+
+    Args:
+        language (str, optional): The language for the phonemizer backend. Defaults to "en-us".
+        config_path (str, optional): The path to the configuration file. Defaults to "./Configs/config.yml".
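+        add_cwd (bool, optional): Whether to prefix the config, ASR, F0 and PLBERT paths with the package root directory (the directory containing this module; despite the name, not the current working directory). This is used to load default models only. Defaults to True.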
+
+    Returns:
+        Tuple[any, dict, torch.nn.Module, torch.nn.Module, torch.nn.Module]: A tuple containing the global phonemizer,
+        the configuration dictionary, the text aligner model, the pitch extractor model, and the BERT model.
+    """
     global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
     
     if add_cwd is True:
@@ -75,7 +90,21 @@ def load_phonemizer_configs_asr_f0_bert(language:str="en-us",
 
 def load_model(weight_path:str, config:dict, 
                text_aligner:torch.nn.Module, pitch_extractor:torch.nn.Module,
-                 plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]:
+               plbert:torch.nn.Module, device:str='cpu')->Tuple[torch.nn.Module, any]:
+    """
+    Loads a pre-trained model with the specified weight path and configuration.
+
+    Args:
+        weight_path (str): The path to the pre-trained model weights.
+        config (dict): The configuration dictionary for building the model.
+        text_aligner (torch.nn.Module): The text aligner module. Returned by load_phonemizer_configs_asr_f0_bert.
+        pitch_extractor (torch.nn.Module): The pitch extractor module. Returned by load_phonemizer_configs_asr_f0_bert.
+        plbert (torch.nn.Module): The plbert module. Returned by load_phonemizer_configs_asr_f0_bert.
+        device (str, optional): The device to load the model on. Defaults to 'cpu'.
+
+    Returns:
+        Tuple[torch.nn.Module, any]: A tuple containing the loaded model and its parameters.
+    """
     model_params = recursive_munch(config['model_params'])
     model = build_model(model_params, text_aligner, pitch_extractor, plbert)
     _ = [model[key].eval() for key in model]
@@ -106,7 +135,16 @@ def load_model(weight_path:str, config:dict,
 
     return model, model_params
 
-def load_sampler(model:torch.nn.Module)->torch.nn.Module:
+def load_sampler(model: torch.nn.Module) -> torch.nn.Module:
+    """
+    Loads a diffusion sampler for the given model.
+
+    Args:
+        model (torch.nn.Module): The model to load the sampler for. Returned by load_model.
+
+    Returns:
+        torch.nn.Module: The loaded diffusion sampler.
+    """
     sampler = DiffusionSampler(model.diffusion.diffusion,
                                sampler=ADPM2Sampler(),
                                sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
@@ -123,7 +161,17 @@ def __init__(self,
                  device:str='cpu',
                  load_from_HF:bool=True, 
                  model_remote_path:str="https://huggingface.co/yl4579/StyleTTS2-LibriTTS"):
-        
+        """
+        Initializes the API object for StyleTTS2.
+
+        Args:
+            config_path (str, optional): Path to the configuration file. Defaults to None.
+            model_path (str, optional): Path to the model file. Defaults to None. If None and load_from_HF is True, the LibriTTS checkpoint is cloned from model_remote_path and used.
+            language (str, optional): Language code. Defaults to "en-us". More languages will be added in the future with multi language plbert.
+            device (str, optional): Device to run the model on. Defaults to 'cpu'.
+            load_from_HF (bool, optional): Whether to load the model from Hugging Face. Defaults to True.
+            model_remote_path (str, optional): Remote path to the model. Defaults to "https://huggingface.co/yl4579/StyleTTS2-LibriTTS".
+        """
         add_cwd = False
         if config_path is None: add_cwd = True
 
@@ -150,7 +198,7 @@ def __init__(self,
          self.plbert) = load_phonemizer_configs_asr_f0_bert(language=language, 
                                                             config_path=self.config_path, 
                                                             add_cwd=add_cwd)
-        
+
 
         self.model, self.model_params = load_model(weight_path=model_path, 
                                                    config=self.config, 
@@ -168,16 +216,44 @@ def __init__(self,
         self.mean, self.std = -4, 4
 
     def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
-                 beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
-        return self.predict(text=text, 
-                            ref_s=ref_s, 
-                            alpha=alpha, 
-                            beta=beta, 
-                            diffusion_steps=diffusion_steps, 
-                            embedding_scale=embedding_scale)
+                     beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+            """
+            Generate speech for the given input text and optional reference style. This is a thin wrapper around predict.
+
+            Args:
+                text (str): The input text for speech generation.
+                ref_s (NDArray, optional): The reference style for speech generation. Defaults to None.
+                alpha (float, optional): The weight of the reference style in the generated speech. Defaults to 0.3.
+                beta (float, optional): The weight of the input text in the generated speech. Defaults to 0.7.
+                diffusion_steps (float, optional): The number of diffusion steps for speech generation. Defaults to 5.
+                embedding_scale (float, optional): The scale factor for the input text embedding. Defaults to 1.
+
+            Returns:
+                NDArray: The generated speech waveform.
+            """
+            return self.predict(text=text, 
+                                ref_s=ref_s, 
+                                alpha=alpha, 
+                                beta=beta, 
+                                diffusion_steps=diffusion_steps, 
+                                embedding_scale=embedding_scale)
 
     def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
-                beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+                    beta:float=0.7, diffusion_steps:float=5, embedding_scale:float=1) -> NDArray:
+        """
+        Generates speech waveform for the given input text.
+
+        Args:
+            text (str): The input text to be synthesized.
+            ref_s (NDArray, optional): Reference speaker style embedding, as returned by compute_style. Defaults to None, in which case a random style embedding is used.
+            alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre).
+            beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
+            diffusion_steps (float, optional): Number of diffusion steps for sampling the speech. Defaults to 5.
+            embedding_scale (float, optional): Scaling factor for the speaker embedding. Defaults to 1.
+
+        Returns:
+            NDArray: The generated speech waveform.
+        """
         
         if ref_s is None: ref_s = self.load_random_ref_s()
 
@@ -198,10 +274,10 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
             d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
 
             s_pred = self.sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(self._device),
-                                  embedding=bert_dur,
-                                  embedding_scale=embedding_scale,
-                                  features=ref_s, # reference from the same speaker as the embedding
-                                  num_steps=diffusion_steps).squeeze(1)
+                                    embedding=bert_dur,
+                                    embedding_scale=embedding_scale,
+                                    features=ref_s, # reference from the same speaker as the embedding
+                                    num_steps=diffusion_steps).squeeze(1)
 
 
             s = s_pred[:, 128:]
@@ -245,9 +321,22 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
             out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
 
 
-        return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later 
+        return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
 
     def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tensor:
+        """
+        Compute the style representation for the given audio. If path is provided, it will load the audio from the path.
+        Otherwise, it will use the wave and sr arguments.
+
+        Args:
+            wave (np.ndarray, optional): Audio waveform. Defaults to None.
+            sr (int, optional): Sample rate of the audio. Defaults to None.
+            path (str, optional): Path to the audio file. Defaults to None.
+            device (str, optional): Device to use for computation. Defaults to 'cpu'.
+
+        Returns:
+            torch.Tensor: Style representation tensor.
+        """
         if path is not None:
             wave, sr = librosa.load(path, sr=24000)
         audio, index = librosa.effects.trim(wave, top_db=30)
@@ -267,15 +356,43 @@ def length_to_mask(self, lengths:NDArray)->torch.Tensor:
         return mask
 
     def preprocess(self, wave:NDArray)->torch.Tensor:
+        """
+        Preprocesses the input waveform by converting it to a mel spectrogram tensor.
+
+        Args:
+            wave (numpy.ndarray): The input waveform.
+
+        Returns:
+            torch.Tensor: The preprocessed mel spectrogram tensor.
+        """
         wave_tensor = torch.from_numpy(wave).float()
         mel_tensor = self.to_mel(wave_tensor)
         mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std
         return mel_tensor
 
-    def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, 
+    def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, 
                            alpha:float=0.3, beta:float=0.7, t:float=0.7, 
                            diffusion_steps:int=5, embedding_scale:int=1)->NDArray:
+        """
+            Predicts the output audio waveform for a given input text and style.
+
+            Args:
+                text (str): The input text to be synthesized.
+                s_prev (NDArray): The previous style embedding.
+                ref_s (NDArray, optional): The reference style embedding. If not provided, a random reference style is loaded. Defaults to None.
+                alpha (float, optional): Alpha value for controlling timbre. Defaults to 0.3 (70% of the reference timbre).
+                beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
+                t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7.
+                diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5.
+                embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1.
+
+            Returns:
+                NDArray: The output audio waveform.
+                NDArray: The predicted style embedding.
+        """
+        
         if ref_s is None: ref_s = self.load_random_ref_s()
+        
         text = text.strip()
         ps = self.global_phonemizer.phonemize([text])
         ps = word_tokenize(ps[0])
@@ -354,6 +471,21 @@ def _predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None,
     def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3, 
                      beta:float=0.7, diffusion_steps:float=5, 
                      embedding_scale:float=1, t:float=.7) -> NDArray:
+        """
+        Generates a long audio prediction based on the given text.
+
+        Args:
+            text (str): The input text to be synthesized.
+            ref_s (NDArray, optional): The reference style embedding. If not provided, a random reference style is loaded. Defaults to None.
+            alpha (float, optional): Alpha value for controlling timbre. Defaults to 0.3 (70% of the reference timbre).
+            beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
+            t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7.
+            diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5.
+            embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1.
+
+        Returns:
+            NDArray: The generated audio waveform as a numpy array.
+        """
         if ref_s is None: ref_s = self.load_random_ref_s()
         sentences = text.split('.') # simple split by dot (what about split_and_recombine_text tortoise. I'll check it out later)
         wavs = []
@@ -362,7 +494,7 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
             if text.strip() == "": continue
             text += '.' # add it back
 
-            wav, s_prev = self._predict_long_step(text,
+            wav, s_prev = self.predict_long_step(text,
                                                   s_prev,
                                                   ref_s,
                                                   alpha=alpha,
@@ -375,6 +507,12 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
         return np.concatenate(wavs, axis=0)
 
     def load_random_ref_s(self):
+        """
+        Return a random style embedding. A random style ruins the result, so use this only for testing.
+
+        Returns:
+            torch.Tensor: A random style embedding tensor.
+        """
         return torch.randn(1, 256).to(self._device)
     
     @property

From e07dba583c851f84dde04b810275e65138f508ed Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Fri, 1 Mar 2024 23:39:37 +0100
Subject: [PATCH 24/25] add embedding_scale docs

---
 api.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/api.py b/api.py
index e2e13b06..c48c3660 100644
--- a/api.py
+++ b/api.py
@@ -226,7 +226,9 @@ def __call__(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
                 alpha (float, optional): The weight of the reference style in the generated speech. Defaults to 0.3.
                 beta (float, optional): The weight of the input text in the generated speech. Defaults to 0.7.
                 diffusion_steps (float, optional): The number of diffusion steps for speech generation. Defaults to 5.
-                embedding_scale (float, optional): The scale factor for the input text embedding. Defaults to 1.
+                embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale.
+                                                    The higher the scale, the more strongly the style is conditioned on the input text, and hence the more emotional the speech.
+                                                    Defaults to 1.
 
             Returns:
                 NDArray: The generated speech waveform.
@@ -249,7 +251,9 @@ def predict(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
             alpha (float, optional): Alpha value for controlling timbr. Defaults to 0.3 (70% of the reference timbre).
             beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
             diffusion_steps (float, optional): Number of diffusion steps for sampling the speech. Defaults to 5.
-            embedding_scale (float, optional): Scaling factor for the speaker embedding. Defaults to 1.
+            embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale.
+                                                    The higher the scale, the more strongly the style is conditioned on the input text, and hence the more emotional the speech.
+                                                    Defaults to 1.
 
         Returns:
             NDArray: The generated speech waveform.
@@ -372,7 +376,7 @@ def preprocess(self, wave:NDArray)->torch.Tensor:
 
     def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None, 
                            alpha:float=0.3, beta:float=0.7, t:float=0.7, 
-                           diffusion_steps:int=5, embedding_scale:int=1)->NDArray:
+                           diffusion_steps:int=5, embedding_scale:float=1)->NDArray:
         """
             Predicts the output audio waveform for a given input text and style.
 
@@ -384,7 +388,9 @@ def predict_long_step(self, text:str, s_prev:NDArray, ref_s:NDArray=None,
                 beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
                 t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7.
                 diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5.
-                embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1.
+                embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale.
+                                                    The higher the scale, the more strongly the style is conditioned on the input text, and hence the more emotional the speech.
+                                                    Defaults to 1.
 
             Returns:
                 NDArray: The output audio waveform.
@@ -481,7 +487,9 @@ def predict_long(self, text:str, ref_s:NDArray=None, alpha:float=0.3,
             beta (float, optional): Beta value for controlling the prosody. Defaults to 0.7 (30% of the reference prosody).
             t (float, optional): The convex combination factor between the previous and current style. Defaults to 0.7.
             diffusion_steps (int, optional): The number of diffusion steps. Defaults to 5.
-            embedding_scale (int, optional): The scale factor for the style embedding. Defaults to 1.
+            embedding_scale (float, optional): The scale factor for the input text embedding. This is the classifier-free guidance scale.
+                                                    The higher the scale, the more strongly the style is conditioned on the input text, and hence the more emotional the speech.
+                                                    Defaults to 1.
 
         Returns:
             NDArray: The generated audio waveform as a numpy array.

From cc09ac49fbcf54ec8fb355c4b87a6453e48b2b91 Mon Sep 17 00:00:00 2001
From: Mohannad Ehab Barakat <hannod98@yahoo.com>
Date: Sat, 2 Mar 2024 11:26:50 +0100
Subject: [PATCH 25/25] fix bug in compute_style with wave input

---
 api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api.py b/api.py
index c48c3660..d527de69 100644
--- a/api.py
+++ b/api.py
@@ -345,7 +345,7 @@ def compute_style(self, wave=None, sr=None, path=None, device='cpu')->torch.Tens
             wave, sr = librosa.load(path, sr=24000)
         audio, index = librosa.effects.trim(wave, top_db=30)
         if sr != 24000:
-            audio = librosa.resample(audio, sr, 24000)
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
         mel_tensor = self.preprocess(audio).to(device)
 
         with torch.no_grad():