diff --git a/README.md b/README.md
index 1fe7bce8..711a6ac3 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ pip install so-vits-svc-fork
 
 ## Features not available in the original repo
 
+- **Realtime voice conversion**
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
@@ -50,6 +51,12 @@ pip install so-vits-svc-fork
 
 ## Usage
 
+### Realtime voice conversion
+
+```shell
+svc vc --model-path
+```
+
 ### Training
 
 Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
@@ -69,6 +76,34 @@ svc train
 svc --model-path source.wav
 ```
 
+For more details, run `svc -h` or `svc <command> -h`.
+
+```shell
+svc -h
+Usage: svc [OPTIONS] COMMAND [ARGS]...
+
+  so-vits-svc allows any folder structure for training data. However, it is
+  recommended to place the training data in the following structure:
+
+      dataset_raw/{speaker_name}/{wav_name}.wav
+
+  To train a model, run pre-resample, pre-config, pre-hubert, train. To infer
+  a model, run infer.
+
+Options:
+  -h, --help  Show this message and exit.
+
+Commands:
+  clean         Clean up files, only useful if you are using the default...
+  infer         Inference
+  onnx          Export model to onnx
+  pre-config    Preprocessing part 2: config
+  pre-hubert    Preprocessing part 3: hubert
+  pre-resample  Preprocessing part 1: resample
+  train         Train model
+  vc            Realtime inference from microphone
+```
+
 ## Contributors ✨
 
 Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
diff --git a/poetry.lock b/poetry.lock
index 2952b2ca..c2ace889 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -582,6 +582,21 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
+[[package]]
+name = "cm-time"
+version = "0.1.2"
+description = "A simple context manager that measures time using perf_counter()"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "cm_time-0.1.2-py3-none-any.whl", hash = "sha256:8dfd172916a7b23f508fbca046157067ab9374fc38791d408278ce7288077cfa"},
+    {file = "cm_time-0.1.2.tar.gz", hash = "sha256:e2848efc5868884d0a7795408ec9b2c21d2d3e2cf399241e8e4531a29128b638"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.4.0,<5.0.0", markers = "python_version < \"3.10\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -2556,7 +2571,6 @@ files = [
 numpy = [
     {version = ">=1.20.3", markers = "python_version < \"3.10\""},
     {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
-    {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
 ]
 python-dateutil = ">=2.8.1"
 pytz = ">=2020.1"
@@ -4655,5 +4669,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 
 [metadata]
 lock-version = "2.0"
-python-versions = "^3.8"
-content-hash = "cfa6c5f171ac51aac49572a3ce0e664a51b47571349189816e66c9a17a63894a"
+python-versions = ">=3.8,<3.11"
+content-hash = "bd0a5148f6634dc9b2df2d30a8752d0de8dc72d509827ea6b4245e12bfb34060"
diff --git a/pyproject.toml b/pyproject.toml
index 65f198b8..7cb597e6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,7 @@ rich = "*"
 tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
+cm-time = "^0.1.2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index fd26b155..4f4b57ed 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -36,10 +36,15 @@
 @click.group()
 def cli():
     """so-vits-svc allows any folder structure for training data.
-    However, it is recommended to place the training data in the following structure:
+    However, the following folder structure is recommended:
 
-        dataset_raw/{speaker_name}/{wav_name}.wav
+    For training: dataset_raw/{speaker_name}/{wav_name}.wav
+
+    For inference: configs/44k/config.json, logs/44k/G_XXXX.pth
+
+    If this folder structure is followed, you DO NOT need to specify the model path, config path, etc.
+    (The latest model is loaded automatically.)
 
     To train a model, run pre-resample, pre-config, pre-hubert, train.
     To infer a model, run infer.
     """
@@ -62,7 +67,8 @@ def cli():
     default=Path("./logs/44k"),
 )
 def train(config_path: Path, model_path: Path):
-    """Train model"""
+    """Train model.
+    If D_0.pth or G_0.pth are not found, they are downloaded from the hub automatically."""
     from .train import main
 
     config_path = Path(config_path)
@@ -87,7 +93,7 @@ def train(config_path: Path, model_path: Path):
     "-m",
     "--model_path",
     type=click.Path(exists=True),
-    default=Path("./logs/44k/G_800.pth"),
+    default=Path("./logs/44k/"),
     help="path to model",
 )
 @click.option(
@@ -107,7 +113,7 @@ def train(config_path: Path, model_path: Path):
 @click.option("-t", "--transpose", type=int, default=0, help="transpose")
 @click.option("-d", "--db_thresh", type=int, default=-40, help="db thresh")
 @click.option(
-    "-a", "--auto_predict_f0", type=bool, default=False, help="auto predict f0"
+    "-a", "--auto_predict_f0", type=bool, default=True, help="auto predict f0"
 )
 @click.option(
     "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
 )
@@ -139,11 +145,20 @@ def infer(
     """Inference"""
     from .inference_main import infer
 
+    if not auto_predict_f0:
+        LOG.warning(
+            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, adjust the transpose value. "
+            "Transpose = 0 generally does not work because your voice pitch and the target voice pitch are different."
+ ) + input_path = Path(input_path) if output_path is None: output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}" output_path = Path(output_path) model_path = Path(model_path) + if model_path.is_dir(): + model_path = list(sorted(model_path.glob("*.pth")))[-1] + LOG.info(f"Since model_path is a directory, use {model_path}") config_path = Path(config_path) if cluster_model_path is not None: cluster_model_path = Path(cluster_model_path) @@ -164,6 +179,114 @@ def infer( ) +@cli.command() +@click.option( + "-m", + "--model_path", + type=click.Path(exists=True), + default=Path("./logs/44k/"), + help="path to model", +) +@click.option( + "-c", + "--config_path", + type=click.Path(exists=True), + default=Path("./configs/44k/config.json"), + help="path to config", +) +@click.option( + "-k", + "--cluster_model_path", + type=click.Path(exists=True), + default=None, + help="path to cluster model", +) +@click.option("-t", "--transpose", type=int, default=12, help="transpose") +@click.option( + "-a", + "--auto_predict_f0", + type=bool, + default=False, + help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)", +) +@click.option( + "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio" +) +@click.option("-n", "--noise_scale", type=float, default=0.4, help="noise scale") +@click.option("-d", "--db_thresh", type=int, default=-20, help="db thresh") +@click.option("-p", "--pad_seconds", type=float, default=0.02, help="pad seconds") +@click.option( + "-c", + "--crossfade_seconds", + type=float, + default=0.01, + help="crossfade seconds", +) +@click.option("-b", "--block_seconds", type=float, default=1, help="block seconds") +@click.option( + "-d", + "--device", + type=str, + default="cuda" if torch.cuda.is_available() else "cpu", + help="device", +) +@click.option("-s", "--speaker", type=str, default=None, help="speaker name") +def vc( + # paths + model_path: Path, + config_path: Path, + # svc config + speaker: str, + cluster_model_path: Path | None, + transpose: int, + auto_predict_f0: bool, + cluster_infer_ratio: float, + noise_scale: float, + # slice config + db_thresh: int, + pad_seconds: float, + # realtime config + crossfade_seconds: float, + block_seconds: float, + device: Literal["cpu", "cuda"], +) -> None: + """Realtime inference from microphone""" + from .inference_main import realtime + + if auto_predict_f0: + LOG.warning( + "auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution" + ) + else: + LOG.warning( + f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please change the transpose value." + "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different." 
+ ) + model_path = Path(model_path) + config_path = Path(config_path) + if cluster_model_path is not None: + cluster_model_path = Path(cluster_model_path) + if model_path.is_dir(): + model_path = list(sorted(model_path.glob("*.pth")))[-1] + LOG.info(f"Since model_path is a directory, use {model_path}") + + realtime( + model_path=model_path, + config_path=config_path, + speaker=speaker, + cluster_model_path=cluster_model_path, + transpose=transpose, + auto_predict_f0=auto_predict_f0, + cluster_infer_ratio=cluster_infer_ratio, + noise_scale=noise_scale, + crossfade_seconds=crossfade_seconds, + block_seconds=block_seconds, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + device=device, + ) + + @click.help_option("--help", "-h") @cli.command() @click.option( @@ -250,7 +373,8 @@ def pre_config( default=Path("./configs/44k/config.json"), ) def pre_hubert(input_dir: Path, config_path: Path) -> None: - """Preprocessing part 3: hubert""" + """Preprocessing part 3: hubert + If the HuBERT model is not found, it will be downloaded automatically.""" from .preprocess_hubert_f0 import preprocess_hubert_f0 input_dir = Path(input_dir) diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py index e8f9c263..0458af2e 100644 --- a/src/so_vits_svc_fork/inference/infer_tool.py +++ b/src/so_vits_svc_fork/inference/infer_tool.py @@ -1,18 +1,13 @@ -import hashlib -import io -import json +from __future__ import annotations + import os -import time from logging import getLogger -from pathlib import Path +from typing import Any import librosa import numpy as np - -# import onnxruntime -import soundfile import torch -import torchaudio +from cm_time import timer from so_vits_svc_fork import cluster, utils from so_vits_svc_fork.inference import slicer @@ -23,85 +18,16 @@ LOG = getLogger(__name__) -def read_temp(file_name): - if not os.path.exists(file_name): - with open(file_name, "w") as f: - f.write(json.dumps({"info": "temp_dict"})) - return {} - else: - try: - with open(file_name) as f: - data = f.read() - data_dict = json.loads(data) - if os.path.getsize(file_name) > 50 * 1024 * 1024: - f_name = file_name.replace("\\", "/").split("/")[-1] - LOG.info(f"clean {f_name}") - for wav_hash in list(data_dict.keys()): - if ( - int(time.time()) - int(data_dict[wav_hash]["time"]) - > 14 * 24 * 3600 - ): - del data_dict[wav_hash] - except Exception as e: - LOG.exception(e) - LOG.info(f"{file_name} error,auto rebuild file") - data_dict = {"info": "temp_dict"} - return data_dict - - -def write_temp(file_name, data): - with open(file_name, "w") as f: - f.write(json.dumps(data)) - - -def timeit(func): - def run(*args, **kwargs): - t = time.time() - res = func(*args, **kwargs) - LOG.info(f"executing '{func.__name__}' costed {time.time() - t:.3f}s") - return res - - return run - - -def format_wav(audio_path): - if Path(audio_path).suffix == ".wav": - return - raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None) - soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate) - - -def get_end_file(dir_path, end): - file_lists = [] - for root, dirs, files in os.walk(dir_path): - files = [f for f in files if f[0] != "."] - dirs[:] = [d for d in dirs if d[0] != "."] - for f_file in files: - if f_file.endswith(end): - file_lists.append(os.path.join(root, f_file).replace("\\", "/")) - return file_lists - - -def get_md5(content): - return hashlib.new("md5", content).hexdigest() - - -def fill_a_to_b(a, b): - if len(a) < len(b): - for _ in range(0, 
len(b) - len(a)): - a.append(a[0]) - - -def mkdir(paths: list): - for path in paths: - if not os.path.exists(path): - os.mkdir(path) - - def pad_array(arr, target_length): current_length = arr.shape[0] if current_length >= target_length: - return arr + return arr[ + (current_length - target_length) + // 2 : (current_length - target_length) + // 2 + + target_length, + ..., + ] else: pad_width = target_length - current_length pad_left = pad_width // 2 @@ -115,10 +41,11 @@ def pad_array(arr, target_length): class Svc: def __init__( self, - net_g_path, - config_path, - device=None, - cluster_model_path="logs/44k/kmeans_10000.pt", + *, + net_g_path: str, + config_path: str, + device: torch.device | str | None = None, + cluster_model_path: str | None = None, ): self.net_g_path = net_g_path if device is None: @@ -130,14 +57,12 @@ def __init__( self.target_sample = self.hps_ms.data.sampling_rate self.hop_size = self.hps_ms.data.hop_length self.spk2id = self.hps_ms.spk - # 加载hubert self.hubert_model = utils.get_hubert_model().to(self.dev) self.load_model() - if os.path.exists(cluster_model_path): + if cluster_model_path is not None and os.path.exists(cluster_model_path): self.cluster_model = cluster.get_cluster_model(cluster_model_path) def load_model(self): - # 获取模型配置 self.net_g_ms = SynthesizerTrn( self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, @@ -149,11 +74,15 @@ def load_model(self): else: _ = self.net_g_ms.eval().to(self.dev) - def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): - wav, sr = librosa.load(in_path, sr=self.target_sample) - + def get_unit_f0( + self, + audio: np.ndarray[Any, np.dtype[np.float64]], + tran: int, + cluster_infer_ratio: float, + speaker: int | str, + ): f0 = utils.compute_f0_parselmouth( - wav, sampling_rate=self.target_sample, hop_length=self.hop_size + audio, sampling_rate=self.target_sample, hop_length=self.hop_size ) f0, uv = utils.interpolate_f0(f0) f0 = torch.FloatTensor(f0) @@ -163,7 +92,7 @@ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): uv = uv.unsqueeze(0).to(self.dev) wav16k = librosa.resample( - wav, orig_sr=self.target_sample, target_sr=HUBERT_SAMPLING_RATE + audio, orig_sr=self.target_sample, target_sr=HUBERT_SAMPLING_RATE ) wav16k = torch.from_numpy(wav16k).to(self.dev) c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k) @@ -181,113 +110,191 @@ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): def infer( self, - speaker, - tran, - raw_path, - cluster_infer_ratio=0, - auto_predict_f0=False, - noice_scale=0.4, + speaker: int | str, + transpose: int, + audio: np.ndarray[Any, np.dtype[np.float32]], + cluster_infer_ratio: float = 0, + auto_predict_f0: bool = False, + noise_scale: float = 0.4, ): + audio = audio.astype(np.float32) + # get speaker id speaker_id = self.spk2id.__dict__.get(speaker) - if not speaker_id and type(speaker) is int: + if not speaker_id and isinstance(speaker, int): if len(self.spk2id.__dict__) >= speaker: speaker_id = speaker + else: + LOG.warning(f"Speaker {speaker} is not found. 
Use speaker 0 instead.") + speaker_id = 0 sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) - c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker) + + # get unit f0 + c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker) if "half" in self.net_g_path and torch.cuda.is_available(): c = c.half() + + # inference with torch.no_grad(): - start = time.time() - audio = self.net_g_ms.infer( - c, - f0=f0, - g=sid, - uv=uv, - predict_f0=auto_predict_f0, - noice_scale=noice_scale, - )[0, 0].data.float() - use_time = time.time() - start - LOG.info(f"vits use time:{use_time}") + with timer() as t: + audio = self.net_g_ms.infer( + c, + f0=f0, + g=sid, + uv=uv, + predict_f0=auto_predict_f0, + noice_scale=noise_scale, + )[0, 0].data.float() + realtime_coef = len(audio) / (t.elapsed * self.target_sample) + LOG.info( + f"Inferece time: {t.elapsed:.2f}s, Realtime coef: {realtime_coef:.2f} " + f"Input shape: {audio.shape}, Output shape: {audio.shape}" + ) return audio, audio.shape[-1] def clear_empty(self): - # 清理显存 torch.cuda.empty_cache() - def slice_inference( + def infer_silence( self, - raw_audio_path, - spk, - tran, - slice_db, - cluster_infer_ratio, - auto_predict_f0, - noice_scale, - pad_seconds=0.5, - ): - wav_path = raw_audio_path - chunks = slicer.cut(wav_path, db_thresh=slice_db) - audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks) - - audio = [] - for slice_tag, data in audio_data: - LOG.info(f"#=====segment start, {round(len(data) / audio_sr, 3)}s======") - # pad - pad_len = int(audio_sr * pad_seconds) - data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) - length = int(np.ceil(len(data) / audio_sr * self.target_sample)) - raw_path = io.BytesIO() - soundfile.write(raw_path, data, audio_sr, format="wav") - raw_path.seek(0) + audio: np.ndarray[Any, np.dtype[np.float32]], + *, + # svc config + speaker: int | str, + transpose: int = 0, + auto_predict_f0: bool = False, + cluster_infer_ratio: float = 0, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + fade_seconds: float = 0.04, + ) -> np.ndarray[Any, np.dtype[np.float32]]: + chunks = slicer.cut(audio, self.target_sample, db_thresh=db_thresh) + LOG.info(f"Cut audio into chunks {chunks}") + sr = self.target_sample + + result_audio = np.array([]) + for slice_tag, data in slicer.chunks2audio(audio, chunks): + # segment length + length = int(np.ceil(len(data) / sr * self.target_sample)) if slice_tag: - LOG.info("jump empty segment") + LOG.info("Skip silence") _audio = np.zeros(length) else: + # pad + pad_len = int(sr * pad_seconds) + data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) out_audio, out_sr = self.infer( - spk, - tran, - raw_path, + speaker, + transpose, + audio, cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, - noice_scale=noice_scale, + noise_scale=noise_scale, ) _audio = out_audio.cpu().numpy() + pad_len = int(self.target_sample * pad_seconds) + _audio = _audio[pad_len:-pad_len] - pad_len = int(self.target_sample * pad_seconds) - _audio = _audio[pad_len:-pad_len] - audio.extend(list(_audio)) - return np.array(audio) + # add fade + fade_len = int(self.target_sample * fade_seconds) + _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len) + _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len) + result_audio = np.concatenate([result_audio, pad_array(_audio, length)]) + result_audio = result_audio[: audio.shape[0]] + return 
result_audio -class RealTimeVC: - def __init__(self): - self.last_chunk = None - self.last_o = None - self.chunk_len = HUBERT_SAMPLING_RATE # 区块长度 - self.pre_len = 3840 # 交叉淡化长度,640的倍数 +import maad - """输入输出都是1维numpy 音频波形数组""" - def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path): - import maad +class RealTimeVCBase: + def __init__( + self, + *, + svc_model: Svc, + crossfade_len: int = 3840, + use_slicer: bool = True, + ): + self.svc_model = svc_model + self.crossfade_len = crossfade_len + self.last_input = np.zeros(crossfade_len * 2, dtype=np.float32) + self.last_infered = np.zeros(crossfade_len * 2, dtype=np.float32) + self.use_slicer = use_slicer + + """The input and output are 1-dimensional numpy audio waveform arrays""" - audio, sr = torchaudio.load(input_wav_path) - audio = audio.cpu().numpy()[0] - temp_wav = io.BytesIO() - if self.last_chunk is None: - input_wav_path.seek(0) - audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path) - audio = audio.cpu().numpy() - self.last_chunk = audio[-self.pre_len :] - self.last_o = audio - return audio[-self.chunk_len :] + def process( + self, + input_audio: np.ndarray[Any, np.dtype[np.float32]], + *, + # svc config + speaker: int | str, + transpose: int, + cluster_infer_ratio: float = 0, + auto_predict_f0: bool = False, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + ): + if input_audio.ndim != 1: + raise ValueError("Input audio must be 1-dimensional.") + if input_audio.shape[0] < self.crossfade_len: + raise ValueError( + f"Input audio length ({len(input_audio)}) should be at least crossfade length ({self.crossfade_len})." + ) + input_audio = input_audio.astype(np.float32) + input_audio = np.nan_to_num(input_audio) + + # create input audio + input_audio_c = np.concatenate([self.last_input, input_audio])[ + -(input_audio.shape[0] + self.crossfade_len) : + ] + self.last_input = input_audio.copy() + LOG.info( + f"Input shape: {input_audio.shape}, Concatenated shape: {input_audio_c.shape}, Crossfade length: {self.crossfade_len}" + ) + assert input_audio_c.shape[0] == input_audio.shape[0] + self.crossfade_len + + # infer + if self.use_slicer: + infered_audio_c = self.svc_model.infer_silence( + audio=input_audio_c, + speaker=speaker, + transpose=transpose, + cluster_infer_ratio=cluster_infer_ratio, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + ) else: - audio = np.concatenate([self.last_chunk, audio]) - soundfile.write(temp_wav, audio, sr, format="wav") - temp_wav.seek(0) - audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav) - audio = audio.cpu().numpy() - ret = maad.util.crossfade(self.last_o, audio, self.pre_len) - self.last_chunk = audio[-self.pre_len :] - self.last_o = audio - return ret[self.chunk_len : 2 * self.chunk_len] + rms = np.sqrt(np.mean(input_audio**2)) + min_rms = 10 ** (db_thresh / 20) + if rms < min_rms: + LOG.info(f"Skip silence: RMS={rms:.2f} < {min_rms:.2f}") + infered_audio_c = input_audio_c.copy() + else: + LOG.info(f"Start inference: RMS={rms:.2f} >= {min_rms:.2f}") + infered_audio_c, _ = self.svc_model.infer( + speaker=speaker, + transpose=transpose, + audio=input_audio_c, + cluster_infer_ratio=cluster_infer_ratio, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + ) + infered_audio_c = infered_audio_c.cpu().numpy() + infered_audio_c = infered_audio_c + LOG.info(f"Concentrated Inferred shape: {infered_audio_c.shape}") + assert 
infered_audio_c.shape[0] == input_audio_c.shape[0] + + # crossfade + result = maad.util.crossfade( + self.last_infered, infered_audio_c, 1, self.crossfade_len + )[: input_audio.shape[0]] + LOG.info(f"Result shape: {result.shape}") + assert result.shape[0] == input_audio.shape[0] + self.last_infered = infered_audio_c + return result diff --git a/src/so_vits_svc_fork/inference/slicer.py b/src/so_vits_svc_fork/inference/slicer.py index 21309ad5..05e728da 100644 --- a/src/so_vits_svc_fork/inference/slicer.py +++ b/src/so_vits_svc_fork/inference/slicer.py @@ -1,6 +1,4 @@ import librosa -import torch -import torchaudio class Slicer: @@ -172,22 +170,31 @@ def slice(self, waveform): return chunk_dict -def cut(audio_path, db_thresh=-30, min_len=5000): - audio, sr = librosa.load(audio_path, sr=None) +from typing import Any, Iterable + +from numpy import dtype, float32, ndarray + + +def cut( + audio: "ndarray[Any, dtype[float32]]", + sr: int, + db_thresh: int = -30, + min_len: int = 5000, +) -> "dict[str, dict[str, bool | str]]": slicer = Slicer(sr=sr, threshold=db_thresh, min_length=min_len) chunks = slicer.slice(audio) return chunks -def chunks2audio(audio_path, chunks): +def chunks2audio( + audio: "ndarray[Any, dtype[float32]]", chunks: "dict[str, dict[str, bool | str]]" +) -> Iterable[tuple[bool, "ndarray[Any, dtype[float32]]"]]: chunks = dict(chunks) - audio, sr = torchaudio.load(audio_path) - if len(audio.shape) == 2 and audio.shape[1] >= 2: - audio = torch.mean(audio, dim=0).unsqueeze(0) - audio = audio.cpu().numpy()[0] + if audio.ndim == 2: + audio = audio.mean(axis=1) result = [] for k, v in chunks.items(): tag = v["split_time"].split(",") if tag[0] != tag[1]: result.append((v["slice"], audio[int(tag[0]) : int(tag[1])])) - return result, sr + return result diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py index f6fc1afb..e1e4b85d 100644 --- a/src/so_vits_svc_fork/inference_main.py +++ b/src/so_vits_svc_fork/inference_main.py @@ -1,71 +1,123 @@ from __future__ import annotations -import io from logging import getLogger from pathlib import Path from typing import Literal +import librosa import numpy as np import soundfile import torch -from tqdm import tqdm -from .inference import infer_tool, slicer -from .inference.infer_tool import Svc +from .inference.infer_tool import RealTimeVCBase, Svc LOG = getLogger(__name__) def infer( + *, + # paths input_path: Path, output_path: Path, - speaker: str, model_path: Path, config_path: Path, + # svc config + speaker: str, cluster_model_path: Path | None = None, transpose: int = 0, - db_thresh: int = -40, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, pad_seconds: float = 0.5, device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu", ): svc_model = Svc( - model_path.as_posix(), config_path.as_posix(), cluster_model_path, device + net_g_path=model_path.as_posix(), + config_path=config_path.as_posix(), + cluster_model_path=cluster_model_path.as_posix() + if cluster_model_path + else None, + device=device, ) - infer_tool.format_wav(input_path) - input_path = Path(input_path).with_suffix(".wav") - chunks = slicer.cut(input_path, db_thresh=db_thresh) - audio_data, audio_sr = slicer.chunks2audio(input_path, chunks) + wav, sr = librosa.load(input_path, sr=svc_model.target_sample) + audio = svc_model.infer_silence( + wav, + speaker=speaker, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + transpose=transpose, + 
auto_predict_f0=auto_predict_f0, + cluster_infer_ratio=cluster_infer_ratio, + noise_scale=noise_scale, + ) - audio = [] - for slice_tag, data in tqdm(audio_data): - # segment length - length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample)) - if slice_tag: - LOG.info("skip non-speaking segment") - _audio = np.zeros(length) - else: - # pad - pad_len = int(audio_sr * pad_seconds) - data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) - raw_path = io.BytesIO() - soundfile.write(raw_path, data, audio_sr, format="wav") - raw_path.seek(0) - out_audio, out_sr = svc_model.infer( - speaker, - transpose, - raw_path, - cluster_infer_ratio=cluster_infer_ratio, - auto_predict_f0=auto_predict_f0, - noice_scale=noise_scale, - ) - _audio = out_audio.cpu().numpy() - pad_len = int(svc_model.target_sample * pad_seconds) - _audio = _audio[pad_len:-pad_len] + soundfile.write(output_path, audio, svc_model.target_sample) - audio.extend(list(infer_tool.pad_array(_audio, length))) - soundfile.write(output_path, audio, svc_model.target_sample) +import sounddevice as sd + + +def realtime( + *, + # paths + model_path: Path, + config_path: Path, + # svc config + speaker: str, + cluster_model_path: Path | None = None, + transpose: int = 0, + auto_predict_f0: bool = False, + cluster_infer_ratio: float = 0, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + # realtime config + crossfade_seconds: float = 0.05, + block_seconds: float = 0.5, + device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu", +): + svc_model = Svc( + net_g_path=model_path.as_posix(), + config_path=config_path.as_posix(), + cluster_model_path=cluster_model_path.as_posix() + if cluster_model_path + else None, + device=device, + ) + model = RealTimeVCBase( + svc_model=svc_model, + crossfade_len=int(crossfade_seconds * svc_model.target_sample), + ) + + def callback( + indata: np.ndarray, + outdata: np.ndarray, + frames: int, + time: int, + status: sd.CallbackFlags, + ) -> None: + LOG.info(f"Frames: {frames}, Status: {status}, Shape: {indata.shape}") + + outdata[:] = model.process( + input_audio=indata.mean(axis=1), + speaker=speaker, + transpose=transpose, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + cluster_infer_ratio=cluster_infer_ratio, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + ).reshape(-1, 1) + + with sd.Stream( + channels=1, + callback=callback, + samplerate=svc_model.target_sample, + blocksize=int(block_seconds * svc_model.target_sample), + ): + while True: + sd.sleep(1)
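To make the buffering in `RealTimeVCBase.process` easier to follow, here is a minimal, self-contained NumPy sketch of the same idea: keep the tail of the previous input block as left context, run inference on the padded block, and linearly crossfade the overlapping region against the previous output so block boundaries do not click. This is only an illustration under stated assumptions: `fake_infer`, `CrossfadeBuffer`, and the demo at the bottom are stand-ins invented for this sketch, not part of the library (the real code calls `Svc.infer` and `maad.util.crossfade`); the 3840-sample fade length mirrors the class above.

```python
# Minimal sketch of the block-buffering + crossfade idea behind RealTimeVCBase.
# Assumption: `fake_infer` stands in for Svc.infer; the crossfade is a plain
# linear fade instead of maad.util.crossfade.
import numpy as np


def fake_infer(audio: np.ndarray) -> np.ndarray:
    """Stand-in for Svc.infer: any function mapping N input samples to N output samples."""
    return audio.copy()


class CrossfadeBuffer:
    """Keeps enough left context between calls to hide the seams between blocks."""

    def __init__(self, crossfade_len: int = 3840) -> None:
        self.crossfade_len = crossfade_len
        self.prev_in_tail = np.zeros(crossfade_len, dtype=np.float32)   # last input samples
        self.prev_out_tail = np.zeros(crossfade_len, dtype=np.float32)  # last inferred samples

    def process(self, block: np.ndarray) -> np.ndarray:
        if block.shape[0] < self.crossfade_len:
            raise ValueError("block must be at least crossfade_len samples long")
        # 1. Prepend the previous input tail so inference sees left context.
        padded = np.concatenate([self.prev_in_tail, block])
        self.prev_in_tail = block[-self.crossfade_len :].copy()
        # 2. Run (stand-in) inference on the padded block.
        out = fake_infer(padded)
        # 3. The first crossfade_len output samples cover the same input span
        #    as the tail of the previous call's output; blend them linearly
        #    to avoid clicks at the seam.
        fade = np.linspace(0.0, 1.0, self.crossfade_len, dtype=np.float32)
        head = self.prev_out_tail * (1.0 - fade) + out[: self.crossfade_len] * fade
        self.prev_out_tail = out[-self.crossfade_len :].copy()
        # 4. Emit exactly one block worth of samples (the output lags the input
        #    by crossfade_len samples, about 87 ms at 44.1 kHz with the defaults).
        return np.concatenate([head, out[self.crossfade_len : block.shape[0]]])


if __name__ == "__main__":
    # Feed two half-second blocks of noise through the buffer and check lengths.
    buf = CrossfadeBuffer()
    stream = np.random.default_rng(0).standard_normal(44100).astype(np.float32)
    blocks = [stream[i : i + 22050] for i in range(0, stream.shape[0], 22050)]
    out = np.concatenate([buf.process(b) for b in blocks])
    assert out.shape == stream.shape
```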