feat: enhance RealtimeVC
34j committed Mar 21, 2023
1 parent 1c8305a commit 8303e57
Showing 4 changed files with 182 additions and 36 deletions.
20 changes: 19 additions & 1 deletion src/so_vits_svc_fork/__main__.py
@@ -284,7 +284,21 @@ def infer(
default=0.01,
help="crossfade seconds",
)
@click.option("-b", "--block-seconds", type=float, default=1, help="block seconds")
@click.option(
"-ab",
"--additional-infer-before-seconds",
type=float,
default=0.2,
help="additional infer before seconds",
)
@click.option(
"-aa",
"--additional-infer-after-seconds",
type=float,
default=0.1,
help="additional infer after seconds",
)
@click.option("-b", "--block-seconds", type=float, default=0.5, help="block seconds")
@click.option(
"-d",
"--device",
@@ -314,6 +328,8 @@ def vc(
chunk_seconds: float,
# realtime config
crossfade_seconds: float,
additional_infer_before_seconds: float,
additional_infer_after_seconds: float,
block_seconds: float,
version: int,
input_device: int | str | None,
Expand Down Expand Up @@ -358,6 +374,8 @@ def vc(
chunk_seconds=chunk_seconds,
# realtime config
crossfade_seconds=crossfade_seconds,
additional_infer_before_seconds=additional_infer_before_seconds,
additional_infer_after_seconds=additional_infer_after_seconds,
block_seconds=block_seconds,
version=version,
input_device=input_device,
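
Note on units: like --crossfade-seconds, the new -ab/--additional-infer-before-seconds and -aa/--additional-infer-after-seconds options take values in seconds; inference_main.py (further down in this diff) converts them to sample counts with the model's target sample rate. A rough illustration of that conversion, assuming a 44.1 kHz model and the defaults shown here (the numbers are illustrative, not part of the commit):

# Illustrative only: in the real code, target_sample comes from the loaded Svc model.
target_sample = 44100  # assumed sample rate

crossfade_len = int(0.05 * target_sample)                # crossfade_seconds       -> 2205 samples
additional_infer_before_len = int(0.2 * target_sample)   # additional_infer_before -> 8820 samples
additional_infer_after_len = int(0.1 * target_sample)    # additional_infer_after  -> 4410 samples
block_len = int(0.5 * target_sample)                     # block_seconds           -> 22050 samples
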
56 changes: 48 additions & 8 deletions src/so_vits_svc_fork/gui.py
@@ -45,6 +45,7 @@ def main():
default_text=model_candidates[-1].absolute().as_posix()
if model_candidates
else "",
enable_events=True,
),
sg.FileBrowse(
initial_folder=Path("./logs/44k/").absolute
@@ -77,7 +78,7 @@ def main():
[
sg.Text("Cluster model path"),
sg.Push(),
sg.InputText(key="cluster_model_path"),
sg.InputText(key="cluster_model_path", enable_events=True),
sg.FileBrowse(
initial_folder="./logs/44k/"
if Path("./logs/44k/").exists()
@@ -213,26 +214,48 @@ def main():
range=(0, 0.6),
orientation="h",
key="crossfade_seconds",
default_value=0.1,
default_value=0.08,
resolution=0.001,
),
],
[
sg.Text("Block seconds"),
sg.Push(),
sg.Slider(
range=(0, 3.0),
range=(0, 1.0),
orientation="h",
key="block_seconds",
default_value=1,
resolution=0.01,
default_value=0.35,
resolution=0.001,
),
],
[
sg.Text("Additional Infer seconds (before)"),
sg.Push(),
sg.Slider(
range=(0, 1.0),
orientation="h",
key="additional_infer_before_seconds",
default_value=0.2,
resolution=0.001,
),
],
[
sg.Text("Additional Infer seconds (after)"),
sg.Push(),
sg.Slider(
range=(0, 1.0),
orientation="h",
key="additional_infer_after_seconds",
default_value=0.08,
resolution=0.001,
),
],
[
sg.Text("Realtime algorithm"),
sg.Combo(
["2 (Divide by speech)", "1 (Divide constantly)"],
default_value="2 (Divide by speech)",
default_value="1 (Divide constantly)",
key="realtime_algorithm",
),
],
@@ -294,15 +317,26 @@ def update_combo() -> None:
if values["speaker"] == "":
update_combo()
if event.endswith("_path"):
browser = window[f"{event}_browse"]
for name in window.AllKeysDict:
if name.endswith("_browse"):
browser = window[name]
if isinstance(browser, sg.Button):
LOG.info(
f"Updating browser {browser} to {Path(values[event]).parent}"
)
browser.InitialFolder = Path(values[event]).parent
browser.update()
else:
LOG.warning(f"Browser {browser} is not a FileBrowse")
"""browser = window[f"{event}_browse"]
if isinstance(browser, sg.Button):
LOG.info(
f"Updating browser {browser} to {Path(values[event]).parent}"
)
browser.InitialFolder = Path(values[event]).parent
browser.update()
else:
LOG.warning(f"Browser {browser} is not a FileBrowse")
LOG.warning(f"Browser {browser} is not a FileBrowse")"""
if event == "config_path":
update_combo()
elif event == "infer":
@@ -360,6 +394,12 @@ def update_combo() -> None:
noise_scale=values["noise_scale"],
f0_method=values["f0_method"],
crossfade_seconds=values["crossfade_seconds"],
additional_infer_before_seconds=values[
"additional_infer_before_seconds"
],
additional_infer_after_seconds=values[
"additional_infer_after_seconds"
],
db_thresh=values["silence_threshold"],
pad_seconds=values["pad_seconds"],
chunk_seconds=values["chunk_seconds"],
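
The reworked *_path handler above walks window.AllKeysDict and repoints every FileBrowse button at the parent folder of the edited path (enable_events=True on the inputs is what makes those events fire). A condensed, stand-alone sketch of the same pattern follows; the window layout and key names are assumptions for illustration, not part of this commit:

from pathlib import Path

import PySimpleGUI as sg

# Minimal window following the same "*_path" / "*_browse" key convention.
layout = [
    [
        sg.InputText(key="model_path", enable_events=True),
        sg.FileBrowse(key="model_path_browse"),
    ]
]
window = sg.Window("demo", layout)

while True:
    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break
    if event.endswith("_path"):
        new_folder = Path(values[event]).parent
        # Point every browse button at the folder the user is currently working in.
        for name in window.AllKeysDict:
            if name.endswith("_browse"):
                browser = window[name]
                if isinstance(browser, sg.Button):
                    browser.InitialFolder = new_folder
                    browser.update()

window.close()
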
118 changes: 96 additions & 22 deletions src/so_vits_svc_fork/inference/infer_tool.py
@@ -52,6 +52,9 @@ def duration(self) -> float32:
# return self.end - self.start
return float32(self.audio.shape[0])

def __repr__(self) -> str:
return f"Chunk(Speech: {self.is_speech}, {self.duration})"


def split_silence(
audio: ndarray[Any, dtype[float32]],
@@ -286,10 +289,44 @@ def infer_silence(
return result_audio


def linear_crossfade(
first: ndarray[Any, dtype[float32]],
second: ndarray[Any, dtype[float32]],
length: int,
) -> ndarray[Any, dtype[float32]]:
return np.concatenate(
[
first[:-length],
first[-length:] * np.linspace(1, 0, length)
+ second[:length] * np.linspace(0, 1, length),
second[length:],
]
)


class Crossfader:
def __init__(self, *, crossfade_len: int) -> None:
def __init__(
self,
*,
additional_infer_before_len: int,
additional_infer_after_len: int,
crossfade_len: int,
) -> None:
if additional_infer_before_len < 0:
    raise ValueError("additional_infer_before_len must be >= 0")
if additional_infer_after_len < 0:
    raise ValueError("additional_infer_after_len must be >= 0")
if crossfade_len < 0:
    raise ValueError("crossfade_len must be >= 0")
self.additional_infer_before_len = additional_infer_before_len
self.additional_infer_after_len = additional_infer_after_len
self.crossfade_len = crossfade_len
self.last_input_left = np.zeros(crossfade_len, dtype=np.float32)
self.last_input_left = np.zeros(
crossfade_len + additional_infer_before_len + additional_infer_after_len,
dtype=np.float32,
)
self.last_infered_left = np.zeros(crossfade_len, dtype=np.float32)

def process(
@@ -304,33 +341,64 @@ def process(
crossfade :▲■■■■■
▲□□□□□
"""
# check input
if input_audio.ndim != 1:
raise ValueError("Input audio must be 1-dimensional.")
if input_audio.shape[0] < self.crossfade_len:
if (
input_audio.shape[0] + self.additional_infer_before_len
<= self.crossfade_len
):
raise ValueError(
f"Input audio length ({len(input_audio)}) should be at least crossfade length ({self.crossfade_len})."
f"Input audio length ({input_audio.shape[0]}) + additional_infer_len ({self.additional_infer_before_len}) must be greater than crossfade_len ({self.crossfade_len})."
)
input_audio = input_audio.astype(np.float32)
input_audio_ = np.concatenate([self.last_input_left, input_audio])
infer_audio_ = self.infer(input_audio_, *args, **kwargs)
if len(infer_audio_) != len(input_audio_):
input_audio_len = len(input_audio)

# concat last input and infer
input_audio_concat = np.concatenate([self.last_input_left, input_audio])
del input_audio
pad_len = 0
if pad_len:
infer_audio_concat = self.infer(
np.pad(input_audio_concat, (pad_len, pad_len), mode="reflect"),
*args,
**kwargs,
)[pad_len:-pad_len]
else:
infer_audio_concat = self.infer(input_audio_concat, *args, **kwargs)
if len(infer_audio_concat) != len(input_audio_concat):
raise ValueError(
f"Inferred audio length ({len(infer_audio_)}) should be equal to input audio length ({len(input_audio_)})."
f"Inferred audio length ({len(infer_audio_concat)}) should be equal to input audio length ({len(input_audio_concat)})."
)
result_audio = np.concatenate(
[
(
self.last_infered_left * np.linspace(1, 0, self.crossfade_len)
+ infer_audio_[: self.crossfade_len]
* np.linspace(0, 1, self.crossfade_len)
)
/ 2,
infer_audio_[self.crossfade_len : -self.crossfade_len],
]

infer_audio_to_use = infer_audio_concat[
-(
self.crossfade_len + input_audio_len + self.additional_infer_after_len
) : -(self.crossfade_len + self.additional_infer_after_len)
]
assert (
len(infer_audio_to_use) == input_audio_len
), f"{len(infer_audio_to_use)} != {input_audio_len}"
result_audio = linear_crossfade(
self.last_infered_left, infer_audio_to_use, self.crossfade_len
)
self.last_input_left = input_audio[-self.crossfade_len :]
self.last_infered_left = infer_audio_[-self.crossfade_len :]
assert len(result_audio) == len(input_audio)
assert (
len(result_audio) == input_audio_len
), f"{len(result_audio)} != {input_audio_len}"

# update last input and inferred
self.last_input_left = input_audio_concat[
-(
self.crossfade_len
+ self.additional_infer_before_len
+ self.additional_infer_after_len
) :
]
self.last_infered_left = infer_audio_concat[
-(
self.crossfade_len + self.additional_infer_after_len
) : -self.additional_infer_after_len
]
return result_audio

def infer(
@@ -345,11 +413,17 @@ def __init__(
*,
svc_model: Svc,
crossfade_len: int = 3840,
additional_infer_before_len: int = 7680,
additional_infer_after_len: int = 7680,
split: bool = True,
) -> None:
self.svc_model = svc_model
self.split = split
super().__init__(crossfade_len=crossfade_len)
super().__init__(
crossfade_len=crossfade_len,
additional_infer_before_len=additional_infer_before_len,
additional_infer_after_len=additional_infer_after_len,
)

def process(
self,
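
To make the slicing in Crossfader.process easier to follow, here is a rough sketch (not part of the commit) of the buffer arithmetic with made-up lengths and an identity function standing in for infer(). The rolling last_input_left buffer gives the model extra context on both sides of each block, and the emitted slice is crossfaded against the previous block's inferred tail:

import numpy as np


def linear_crossfade(first, second, length):
    # Same blend as above: fade `first` out while fading `second` in over `length` samples.
    return np.concatenate(
        [
            first[:-length],
            first[-length:] * np.linspace(1, 0, length)
            + second[:length] * np.linspace(0, 1, length),
            second[length:],
        ]
    )


# Made-up sample counts, kept small for readability.
crossfade_len, before_len, after_len, block_len = 4, 8, 8, 32

last_input_left = np.zeros(crossfade_len + before_len + after_len, dtype=np.float32)
last_infered_left = np.zeros(crossfade_len, dtype=np.float32)

block = np.random.randn(block_len).astype(np.float32)  # one realtime input block

concat = np.concatenate([last_input_left, block])  # stored context + new block
inferred = concat.copy()  # stand-in for self.infer(concat)

# Emit the slice that ends crossfade_len + after_len samples before the end, so the
# model always sees after_len samples of "future" audio beyond what is emitted.
to_use = inferred[-(crossfade_len + block_len + after_len) : -(crossfade_len + after_len)]
assert len(to_use) == block_len

out = linear_crossfade(last_infered_left, to_use, crossfade_len)
assert len(out) == block_len

The output therefore trails the raw input by roughly crossfade_len + additional_infer_after_len samples, which is the extra latency paid for the added right-hand context.
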
24 changes: 19 additions & 5 deletions src/so_vits_svc_fork/inference_main.py
@@ -8,6 +8,7 @@
import numpy as np
import soundfile
import torch
from cm_time import timer

from .inference.infer_tool import RealtimeVC, RealtimeVC2, Svc

@@ -87,6 +88,8 @@ def realtime(
chunk_seconds: float = 0.5,
# realtime config
crossfade_seconds: float = 0.05,
additional_infer_before_seconds: float = 0.2,
additional_infer_after_seconds: float = 0.1,
block_seconds: float = 0.5,
version: int = 2,
input_device: int | str | None = None,
@@ -106,10 +109,18 @@ def realtime(
else None,
device=device,
)

LOG.info("Creating realtime model...")
if version == 1:
model = RealtimeVC(
svc_model=svc_model,
crossfade_len=int(crossfade_seconds * svc_model.target_sample),
additional_infer_before_len=int(
additional_infer_before_seconds * svc_model.target_sample
),
additional_infer_after_len=int(
additional_infer_after_seconds * svc_model.target_sample
),
)
else:
model = RealtimeVC2(
@@ -187,16 +198,19 @@ def callback(
)
if version == 1:
kwargs["pad_seconds"] = pad_seconds
outdata[:] = model.process(
**kwargs,
).reshape(-1, 1)
with timer() as t:
outdata[:] = model.process(
**kwargs,
).reshape(-1, 1)
LOG.info(f"True Realtime coef: {block_seconds / t.elapsed:.2f}")

with sd.Stream(
device=(input_device, output_device),
channels=1,
callback=callback,
samplerate=svc_model.target_sample,
blocksize=int(block_seconds * svc_model.target_sample),
):
) as stream:
while True:
sd.sleep(1)
LOG.info(f"Latency: {stream.latency}")
sd.sleep(1000)
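
The "True Realtime coef" logged in the callback above is simply block duration divided by wall-clock processing time, measured with cm_time; values below 1.0 mean inference is falling behind the audio stream. A minimal sketch of the measurement, with a sleep standing in for model.process:

import time

from cm_time import timer

block_seconds = 0.5

with timer() as t:
    time.sleep(0.1)  # stand-in for model.process(**kwargs)

# Around 5.0 here; anything below 1.0 means audible dropouts in real use.
print(f"True Realtime coef: {block_seconds / t.elapsed:.2f}")
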
