Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: change default f0 method from crepe to dio #100

Merged
merged 8 commits into from
Mar 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,14 @@ Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (s
```shell
svc pre-resample
svc pre-config
svc pre-hubert
svc pre-hubert -fm dio
svc train
```

#### Notes

- Each audio file in the dataset should be roughly 10 seconds or shorter; longer files may cause VRAM to run out.
- To change the f0 inference method to CREPE, replace `svc pre-hubert -fm dio` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
- It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.

### Further help
Expand Down
3 changes: 2 additions & 1 deletion notebooks/so-vits-svc-fork-4.0.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@
"metadata": {},
"outputs": [],
"source": [
"!svc pre-hubert"
"F0_METHOD = \"dio\" #@param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n",
"!svc pre-hubert -fm {F0_METHOD}"
]
},
{
Expand Down
5 changes: 3 additions & 2 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def infer(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
default="crepe",
default="dio",
help="f0 prediction method",
)
@click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds")
Expand Down Expand Up @@ -490,6 +490,7 @@ def pre_config(
@click.option(
"-n",
"--n_jobs",
"--n-jobs",
type=int,
default=4,
help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
Expand All @@ -505,7 +506,7 @@ def pre_config(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
default="crepe",
default="dio",
)
def pre_hubert(
input_dir: Path,
Expand Down
2 changes: 1 addition & 1 deletion src/so_vits_svc_fork/default_gui_presets.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"silence_threshold": -35.0,
"transpose": 0.0,
"auto_predict_f0": true,
"f0_method": "crepe",
"f0_method": "dio",
"cluster_infer_ratio": 0.0,
"noise_scale": 0.4,
"pad_seconds": 0.1,
Expand Down
3 changes: 2 additions & 1 deletion src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def load_presets() -> dict:
json.loads(GUI_PRESETS_PATH.read_text()) if GUI_PRESETS_PATH.exists() else {}
)
# priority: defaults > users
return {**defaults, **users}
# order: defaults -> users
return {**defaults, **users, **defaults}


def add_preset(name: str, preset: dict) -> dict:
Expand Down
10 changes: 5 additions & 5 deletions src/so_vits_svc_fork/inference/infer_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def get_unit_f0(
speaker: int | str,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
] = "dio",
):
f0 = utils.compute_f0(
audio,
Expand Down Expand Up @@ -172,7 +172,7 @@ def infer(
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
] = "dio",
) -> tuple[torch.Tensor, int]:
audio = audio.astype(np.float32)
# get speaker id
Expand Down Expand Up @@ -240,7 +240,7 @@ def infer_silence(
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
] = "dio",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down Expand Up @@ -459,7 +459,7 @@ def infer(
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
] = "dio",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down Expand Up @@ -519,7 +519,7 @@ def process(
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
] = "dio",
# slice config
db_thresh: int = -40,
chunk_seconds: float = 0.5,
Expand Down
8 changes: 2 additions & 6 deletions src/so_vits_svc_fork/inference_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ def infer(
auto_predict_f0: bool = False,
cluster_infer_ratio: float = 0,
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down Expand Up @@ -83,9 +81,7 @@ def realtime(
auto_predict_f0: bool = False,
cluster_infer_ratio: float = 0,
noise_scale: float = 0.4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down
12 changes: 3 additions & 9 deletions src/so_vits_svc_fork/preprocess_hubert_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ def _process_one(
sampling_rate: int,
hop_length: int,
device: Literal["cuda", "cpu"] = "cuda",
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
):
wav, sr = librosa.load(filepath, sr=sampling_rate)
Expand Down Expand Up @@ -59,9 +57,7 @@ def _process_batch(
sampling_rate: int,
hop_length: int,
pbar_position: int,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
):
device = "cuda" if torch.cuda.is_available() else "cpu"
Expand All @@ -83,9 +79,7 @@ def preprocess_hubert_f0(
input_dir: Path | str,
config_path: Path | str,
n_jobs: int = 4,
f0_method: Literal[
"crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
] = "crepe",
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
force_rebuild: bool = False,
):
input_dir = Path(input_dir)
Expand Down
6 changes: 4 additions & 2 deletions src/so_vits_svc_fork/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def compute_f0(
p_len: None | int = None,
sampling_rate: int = 44100,
hop_length: int = 512,
method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "crepe",
method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
**kwargs,
):
with timer() as t:
Expand All @@ -260,7 +260,9 @@ def compute_f0(
elif method == "parselmouth":
f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
else:
raise ValueError("type must be dio, crepe, harvest or parselmouth")
raise ValueError(
"type must be dio, crepe, crepe-tiny, harvest or parselmouth"
)
rtf = t.elapsed / (len(wav_numpy) / sampling_rate)
LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}")
return f0
Expand Down