diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py index 96f4db10..5b701b24 100644 --- a/src/so_vits_svc_fork/__main__.py +++ b/src/so_vits_svc_fork/__main__.py @@ -202,6 +202,13 @@ def train( default=False, help="absolute thresh", ) +@click.option( + "-mc", + "--max-chunk-seconds", + type=float, + default=40, + help="maximum allowed single chunk length, set lower if you get out of memory (0 to disable)", +) def infer( # paths input_path: Path, @@ -221,6 +228,7 @@ def infer( pad_seconds: float = 0.5, chunk_seconds: float = 0.5, absolute_thresh: bool = False, + max_chunk_seconds: float = 40, device: str | torch.device = get_optimal_device(), ): """Inference""" @@ -264,6 +272,7 @@ def infer( pad_seconds=pad_seconds, chunk_seconds=chunk_seconds, absolute_thresh=absolute_thresh, + max_chunk_seconds=max_chunk_seconds, device=device, ) diff --git a/src/so_vits_svc_fork/default_gui_presets.json b/src/so_vits_svc_fork/default_gui_presets.json index bb6df99f..b651042a 100644 --- a/src/so_vits_svc_fork/default_gui_presets.json +++ b/src/so_vits_svc_fork/default_gui_presets.json @@ -9,6 +9,7 @@ "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, + "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 0.35, "additional_infer_before_seconds": 0.15, @@ -27,6 +28,7 @@ "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, + "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 1.5, "additional_infer_before_seconds": 0.01, @@ -45,6 +47,7 @@ "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, + "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 2.5, "additional_infer_before_seconds": 0.01, @@ -63,6 +66,7 @@ "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, + "max_chunk_seconds": 40, "crossfade_seconds": 0.04, "block_seconds": 0.15, "additional_infer_before_seconds": 0.05, @@ -81,6 +85,7 @@ "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, + "max_chunk_seconds": 40, "auto_play": true, "passthrough_original": false } diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py index 113840ee..9195bd05 100644 --- a/src/so_vits_svc_fork/gui.py +++ b/src/so_vits_svc_fork/gui.py @@ -290,6 +290,16 @@ def main(): resolution=0.01, ), ], + [ + sg.Text("Max chunk seconds (set lower if Out Of Memory, 0 to disable)"), + sg.Push(), + sg.Slider( + range=(0.0, 240.0), + orientation="h", + key="max_chunk_seconds", + resolution=1.0, + ), + ], [ sg.Checkbox( key="absolute_thresh", @@ -644,6 +654,7 @@ def apply_preset(name: str) -> None: pad_seconds=values["pad_seconds"], chunk_seconds=values["chunk_seconds"], absolute_thresh=values["absolute_thresh"], + max_chunk_seconds=values["max_chunk_seconds"], device="cpu" if not values["use_gpu"] else get_optimal_device(), diff --git a/src/so_vits_svc_fork/inference/core.py b/src/so_vits_svc_fork/inference/core.py index 7c9ba4dc..8a6d110a 100644 --- a/src/so_vits_svc_fork/inference/core.py +++ b/src/so_vits_svc_fork/inference/core.py @@ -64,6 +64,7 @@ def split_silence( frame_length: int = 2048, hop_length: int = 512, aggregate: Callable[[ndarray[Any, dtype[float32]]], float] = np.mean, + max_chunk_length: int = 0, ) -> Iterable[Chunk]: non_silence_indices = librosa.effects.split( audio, @@ -79,7 +80,16 @@ def split_silence( yield Chunk( is_speech=False, audio=audio[last_end:start], start=last_end, end=start ) - yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end) + while max_chunk_length > 0 and end - start > max_chunk_length: + yield Chunk( + is_speech=True, + audio=audio[start : start + max_chunk_length], + start=start, + end=start + max_chunk_length, + ) + start += max_chunk_length + if end - start > 0: + yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end) last_end = end if last_end != len(audio): yield Chunk( @@ -248,6 +258,7 @@ def infer_silence( pad_seconds: float = 0.5, chunk_seconds: float = 0.5, absolute_thresh: bool = False, + max_chunk_seconds: float = 40, # fade_seconds: float = 0.0, ) -> np.ndarray[Any, np.dtype[np.float32]]: sr = self.target_sample @@ -267,6 +278,7 @@ def infer_silence( frame_length=chunk_length_min * 2, hop_length=chunk_length_min, ref=1 if absolute_thresh else np.max, + max_chunk_length=int(max_chunk_seconds * sr), ): LOG.info(f"Chunk: {chunk}") if not chunk.is_speech: diff --git a/src/so_vits_svc_fork/inference/main.py b/src/so_vits_svc_fork/inference/main.py index ff726a94..b58ff0bb 100644 --- a/src/so_vits_svc_fork/inference/main.py +++ b/src/so_vits_svc_fork/inference/main.py @@ -36,6 +36,7 @@ def infer( pad_seconds: float = 0.5, chunk_seconds: float = 0.5, absolute_thresh: bool = False, + max_chunk_seconds: float = 40, device: str | torch.device = get_optimal_device(), ): model_path = Path(model_path) @@ -65,6 +66,7 @@ def infer( pad_seconds=pad_seconds, chunk_seconds=chunk_seconds, absolute_thresh=absolute_thresh, + max_chunk_seconds=max_chunk_seconds, ) soundfile.write(output_path, audio, svc_model.target_sample)