diff --git a/README.md b/README.md
index 1fe7bce8..711a6ac3 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ pip install so-vits-svc-fork
 
 ## Features not available in the original repo
 
+- **Realtime voice conversion**
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
@@ -50,6 +51,12 @@ pip install so-vits-svc-fork
 
 ## Usage
 
+### Realtime voice conversion
+
+```shell
+svc vc --model-path
+```
+
 ### Training
 
 Colab notebook: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
@@ -69,6 +76,34 @@ svc train
 svc --model-path source.wav
 ```
 
+For more details, run `svc -h` or `svc <command> -h`.
+
+```shell
+svc -h
+Usage: svc [OPTIONS] COMMAND [ARGS]...
+
+  so-vits-svc allows any folder structure for training data. However, it is
+  recommended to place the training data in the following structure:
+
+      dataset_raw/{speaker_name}/{wav_name}.wav
+
+  To train a model, run pre-resample, pre-config, pre-hubert, train. To infer
+  a model, run infer.
+
+Options:
+  -h, --help  Show this message and exit.
+
+Commands:
+  clean         Clean up files, only useful if you are using the default...
+  infer         Inference
+  onnx          Export model to onnx
+  pre-config    Preprocessing part 2: config
+  pre-hubert    Preprocessing part 3: hubert
+  pre-resample  Preprocessing part 1: resample
+  train         Train model
+  vc            Realtime inference from microphone
+```
+
 ## Contributors ✨
 
 Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
diff --git a/poetry.lock b/poetry.lock
index 2952b2ca..c2ace889 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -582,6 +582,21 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
+[[package]]
+name = "cm-time"
+version = "0.1.2"
+description = "A simple context manager that measures time using perf_counter()"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "cm_time-0.1.2-py3-none-any.whl", hash = "sha256:8dfd172916a7b23f508fbca046157067ab9374fc38791d408278ce7288077cfa"},
+    {file = "cm_time-0.1.2.tar.gz", hash = "sha256:e2848efc5868884d0a7795408ec9b2c21d2d3e2cf399241e8e4531a29128b638"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.4.0,<5.0.0", markers = "python_version < \"3.10\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -2556,7 +2571,6 @@ files = [
 numpy = [
     {version = ">=1.20.3", markers = "python_version < \"3.10\""},
     {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
-    {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
 ]
 python-dateutil = ">=2.8.1"
 pytz = ">=2020.1"
@@ -4655,5 +4669,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 
 [metadata]
 lock-version = "2.0"
-python-versions = "^3.8"
-content-hash = "cfa6c5f171ac51aac49572a3ce0e664a51b47571349189816e66c9a17a63894a"
+python-versions = ">=3.8,<3.11"
+content-hash = "bd0a5148f6634dc9b2df2d30a8752d0de8dc72d509827ea6b4245e12bfb34060"
diff --git a/pyproject.toml b/pyproject.toml
index 65f198b8..7cb597e6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,7 @@ rich = "*"
 tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
+cm-time = "^0.1.2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index fd26b155..4f4b57ed 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -36,10 +36,15 @@
 @click.group()
 def cli():
     """so-vits-svc allows any folder structure for training data.
-    However, it is recommended to place the training data in the following structure:
+    However, the following folder structure is recommended:
 
-        dataset_raw/{speaker_name}/{wav_name}.wav
+    For training: dataset_raw/{speaker_name}/{wav_name}.wav
+
+    For inference: configs/44k/config.json, logs/44k/G_XXXX.pth
+
+    If this folder structure is followed, you DO NOT need to specify the model path, config path, etc.
+    (The latest model is loaded automatically.)
 
     To train a model, run pre-resample, pre-config, pre-hubert, train.
     To infer a model, run infer.
     """
@@ -62,7 +67,8 @@ def cli():
     default=Path("./logs/44k"),
 )
 def train(config_path: Path, model_path: Path):
-    """Train model"""
+    """Train model.
+    If D_0.pth or G_0.pth are not found, they are downloaded from the hub automatically."""
     from .train import main
 
     config_path = Path(config_path)
@@ -87,7 +93,7 @@ def train(config_path: Path, model_path: Path):
     "-m",
     "--model_path",
     type=click.Path(exists=True),
-    default=Path("./logs/44k/G_800.pth"),
+    default=Path("./logs/44k/"),
     help="path to model",
 )
 @click.option(
@@ -107,7 +113,7 @@ def train(config_path: Path, model_path: Path):
 @click.option("-t", "--transpose", type=int, default=0, help="transpose")
 @click.option("-d", "--db_thresh", type=int, default=-40, help="db thresh")
 @click.option(
-    "-a", "--auto_predict_f0", type=bool, default=False, help="auto predict f0"
+    "-a", "--auto_predict_f0", type=bool, default=True, help="auto predict f0"
 )
 @click.option(
     "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio"
 )
@@ -139,11 +145,20 @@ def infer(
     """Inference"""
     from .inference_main import infer
 
+    if not auto_predict_f0:
+        LOG.warning(
+            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, adjust the transpose value. "
+            "Transpose = 0 generally does not work because your voice pitch and the target voice pitch are different."
+ ) + input_path = Path(input_path) if output_path is None: output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}" output_path = Path(output_path) model_path = Path(model_path) + if model_path.is_dir(): + model_path = list(sorted(model_path.glob("*.pth")))[-1] + LOG.info(f"Since model_path is a directory, use {model_path}") config_path = Path(config_path) if cluster_model_path is not None: cluster_model_path = Path(cluster_model_path) @@ -164,6 +179,114 @@ def infer( ) +@cli.command() +@click.option( + "-m", + "--model_path", + type=click.Path(exists=True), + default=Path("./logs/44k/"), + help="path to model", +) +@click.option( + "-c", + "--config_path", + type=click.Path(exists=True), + default=Path("./configs/44k/config.json"), + help="path to config", +) +@click.option( + "-k", + "--cluster_model_path", + type=click.Path(exists=True), + default=None, + help="path to cluster model", +) +@click.option("-t", "--transpose", type=int, default=12, help="transpose") +@click.option( + "-a", + "--auto_predict_f0", + type=bool, + default=False, + help="auto predict f0 (not recommended for realtime since voice pitch will not be stable)", +) +@click.option( + "-r", "--cluster_infer_ratio", type=float, default=0, help="cluster infer ratio" +) +@click.option("-n", "--noise_scale", type=float, default=0.4, help="noise scale") +@click.option("-d", "--db_thresh", type=int, default=-20, help="db thresh") +@click.option("-p", "--pad_seconds", type=float, default=0.02, help="pad seconds") +@click.option( + "-c", + "--crossfade_seconds", + type=float, + default=0.01, + help="crossfade seconds", +) +@click.option("-b", "--block_seconds", type=float, default=1, help="block seconds") +@click.option( + "-d", + "--device", + type=str, + default="cuda" if torch.cuda.is_available() else "cpu", + help="device", +) +@click.option("-s", "--speaker", type=str, default=None, help="speaker name") +def vc( + # paths + model_path: Path, + config_path: Path, + # svc config + speaker: str, + cluster_model_path: Path | None, + transpose: int, + auto_predict_f0: bool, + cluster_infer_ratio: float, + noise_scale: float, + # slice config + db_thresh: int, + pad_seconds: float, + # realtime config + crossfade_seconds: float, + block_seconds: float, + device: Literal["cpu", "cuda"], +) -> None: + """Realtime inference from microphone""" + from .inference_main import realtime + + if auto_predict_f0: + LOG.warning( + "auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution" + ) + else: + LOG.warning( + f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please change the transpose value." + "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different." 
+ ) + model_path = Path(model_path) + config_path = Path(config_path) + if cluster_model_path is not None: + cluster_model_path = Path(cluster_model_path) + if model_path.is_dir(): + model_path = list(sorted(model_path.glob("*.pth")))[-1] + LOG.info(f"Since model_path is a directory, use {model_path}") + + realtime( + model_path=model_path, + config_path=config_path, + speaker=speaker, + cluster_model_path=cluster_model_path, + transpose=transpose, + auto_predict_f0=auto_predict_f0, + cluster_infer_ratio=cluster_infer_ratio, + noise_scale=noise_scale, + crossfade_seconds=crossfade_seconds, + block_seconds=block_seconds, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + device=device, + ) + + @click.help_option("--help", "-h") @cli.command() @click.option( @@ -250,7 +373,8 @@ def pre_config( default=Path("./configs/44k/config.json"), ) def pre_hubert(input_dir: Path, config_path: Path) -> None: - """Preprocessing part 3: hubert""" + """Preprocessing part 3: hubert + If the HuBERT model is not found, it will be downloaded automatically.""" from .preprocess_hubert_f0 import preprocess_hubert_f0 input_dir = Path(input_dir) diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py index e8f9c263..0458af2e 100644 --- a/src/so_vits_svc_fork/inference/infer_tool.py +++ b/src/so_vits_svc_fork/inference/infer_tool.py @@ -1,18 +1,13 @@ -import hashlib -import io -import json +from __future__ import annotations + import os -import time from logging import getLogger -from pathlib import Path +from typing import Any import librosa import numpy as np - -# import onnxruntime -import soundfile import torch -import torchaudio +from cm_time import timer from so_vits_svc_fork import cluster, utils from so_vits_svc_fork.inference import slicer @@ -23,85 +18,16 @@ LOG = getLogger(__name__) -def read_temp(file_name): - if not os.path.exists(file_name): - with open(file_name, "w") as f: - f.write(json.dumps({"info": "temp_dict"})) - return {} - else: - try: - with open(file_name) as f: - data = f.read() - data_dict = json.loads(data) - if os.path.getsize(file_name) > 50 * 1024 * 1024: - f_name = file_name.replace("\\", "/").split("/")[-1] - LOG.info(f"clean {f_name}") - for wav_hash in list(data_dict.keys()): - if ( - int(time.time()) - int(data_dict[wav_hash]["time"]) - > 14 * 24 * 3600 - ): - del data_dict[wav_hash] - except Exception as e: - LOG.exception(e) - LOG.info(f"{file_name} error,auto rebuild file") - data_dict = {"info": "temp_dict"} - return data_dict - - -def write_temp(file_name, data): - with open(file_name, "w") as f: - f.write(json.dumps(data)) - - -def timeit(func): - def run(*args, **kwargs): - t = time.time() - res = func(*args, **kwargs) - LOG.info(f"executing '{func.__name__}' costed {time.time() - t:.3f}s") - return res - - return run - - -def format_wav(audio_path): - if Path(audio_path).suffix == ".wav": - return - raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None) - soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate) - - -def get_end_file(dir_path, end): - file_lists = [] - for root, dirs, files in os.walk(dir_path): - files = [f for f in files if f[0] != "."] - dirs[:] = [d for d in dirs if d[0] != "."] - for f_file in files: - if f_file.endswith(end): - file_lists.append(os.path.join(root, f_file).replace("\\", "/")) - return file_lists - - -def get_md5(content): - return hashlib.new("md5", content).hexdigest() - - -def fill_a_to_b(a, b): - if len(a) < len(b): - for _ in range(0, 
len(b) - len(a)): - a.append(a[0]) - - -def mkdir(paths: list): - for path in paths: - if not os.path.exists(path): - os.mkdir(path) - - def pad_array(arr, target_length): current_length = arr.shape[0] if current_length >= target_length: - return arr + return arr[ + (current_length - target_length) + // 2 : (current_length - target_length) + // 2 + + target_length, + ..., + ] else: pad_width = target_length - current_length pad_left = pad_width // 2 @@ -115,10 +41,11 @@ def pad_array(arr, target_length): class Svc: def __init__( self, - net_g_path, - config_path, - device=None, - cluster_model_path="logs/44k/kmeans_10000.pt", + *, + net_g_path: str, + config_path: str, + device: torch.device | str | None = None, + cluster_model_path: str | None = None, ): self.net_g_path = net_g_path if device is None: @@ -130,14 +57,12 @@ def __init__( self.target_sample = self.hps_ms.data.sampling_rate self.hop_size = self.hps_ms.data.hop_length self.spk2id = self.hps_ms.spk - # 加载hubert self.hubert_model = utils.get_hubert_model().to(self.dev) self.load_model() - if os.path.exists(cluster_model_path): + if cluster_model_path is not None and os.path.exists(cluster_model_path): self.cluster_model = cluster.get_cluster_model(cluster_model_path) def load_model(self): - # 获取模型配置 self.net_g_ms = SynthesizerTrn( self.hps_ms.data.filter_length // 2 + 1, self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, @@ -149,11 +74,15 @@ def load_model(self): else: _ = self.net_g_ms.eval().to(self.dev) - def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): - wav, sr = librosa.load(in_path, sr=self.target_sample) - + def get_unit_f0( + self, + audio: np.ndarray[Any, np.dtype[np.float64]], + tran: int, + cluster_infer_ratio: float, + speaker: int | str, + ): f0 = utils.compute_f0_parselmouth( - wav, sampling_rate=self.target_sample, hop_length=self.hop_size + audio, sampling_rate=self.target_sample, hop_length=self.hop_size ) f0, uv = utils.interpolate_f0(f0) f0 = torch.FloatTensor(f0) @@ -163,7 +92,7 @@ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): uv = uv.unsqueeze(0).to(self.dev) wav16k = librosa.resample( - wav, orig_sr=self.target_sample, target_sr=HUBERT_SAMPLING_RATE + audio, orig_sr=self.target_sample, target_sr=HUBERT_SAMPLING_RATE ) wav16k = torch.from_numpy(wav16k).to(self.dev) c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k) @@ -181,113 +110,191 @@ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker): def infer( self, - speaker, - tran, - raw_path, - cluster_infer_ratio=0, - auto_predict_f0=False, - noice_scale=0.4, + speaker: int | str, + transpose: int, + audio: np.ndarray[Any, np.dtype[np.float32]], + cluster_infer_ratio: float = 0, + auto_predict_f0: bool = False, + noise_scale: float = 0.4, ): + audio = audio.astype(np.float32) + # get speaker id speaker_id = self.spk2id.__dict__.get(speaker) - if not speaker_id and type(speaker) is int: + if not speaker_id and isinstance(speaker, int): if len(self.spk2id.__dict__) >= speaker: speaker_id = speaker + else: + LOG.warning(f"Speaker {speaker} is not found. 
Use speaker 0 instead.") + speaker_id = 0 sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) - c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker) + + # get unit f0 + c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker) if "half" in self.net_g_path and torch.cuda.is_available(): c = c.half() + + # inference with torch.no_grad(): - start = time.time() - audio = self.net_g_ms.infer( - c, - f0=f0, - g=sid, - uv=uv, - predict_f0=auto_predict_f0, - noice_scale=noice_scale, - )[0, 0].data.float() - use_time = time.time() - start - LOG.info(f"vits use time:{use_time}") + with timer() as t: + audio = self.net_g_ms.infer( + c, + f0=f0, + g=sid, + uv=uv, + predict_f0=auto_predict_f0, + noice_scale=noise_scale, + )[0, 0].data.float() + realtime_coef = len(audio) / (t.elapsed * self.target_sample) + LOG.info( + f"Inferece time: {t.elapsed:.2f}s, Realtime coef: {realtime_coef:.2f} " + f"Input shape: {audio.shape}, Output shape: {audio.shape}" + ) return audio, audio.shape[-1] def clear_empty(self): - # 清理显存 torch.cuda.empty_cache() - def slice_inference( + def infer_silence( self, - raw_audio_path, - spk, - tran, - slice_db, - cluster_infer_ratio, - auto_predict_f0, - noice_scale, - pad_seconds=0.5, - ): - wav_path = raw_audio_path - chunks = slicer.cut(wav_path, db_thresh=slice_db) - audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks) - - audio = [] - for slice_tag, data in audio_data: - LOG.info(f"#=====segment start, {round(len(data) / audio_sr, 3)}s======") - # pad - pad_len = int(audio_sr * pad_seconds) - data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) - length = int(np.ceil(len(data) / audio_sr * self.target_sample)) - raw_path = io.BytesIO() - soundfile.write(raw_path, data, audio_sr, format="wav") - raw_path.seek(0) + audio: np.ndarray[Any, np.dtype[np.float32]], + *, + # svc config + speaker: int | str, + transpose: int = 0, + auto_predict_f0: bool = False, + cluster_infer_ratio: float = 0, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + fade_seconds: float = 0.04, + ) -> np.ndarray[Any, np.dtype[np.float32]]: + chunks = slicer.cut(audio, self.target_sample, db_thresh=db_thresh) + LOG.info(f"Cut audio into chunks {chunks}") + sr = self.target_sample + + result_audio = np.array([]) + for slice_tag, data in slicer.chunks2audio(audio, chunks): + # segment length + length = int(np.ceil(len(data) / sr * self.target_sample)) if slice_tag: - LOG.info("jump empty segment") + LOG.info("Skip silence") _audio = np.zeros(length) else: + # pad + pad_len = int(sr * pad_seconds) + data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) out_audio, out_sr = self.infer( - spk, - tran, - raw_path, + speaker, + transpose, + audio, cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, - noice_scale=noice_scale, + noise_scale=noise_scale, ) _audio = out_audio.cpu().numpy() + pad_len = int(self.target_sample * pad_seconds) + _audio = _audio[pad_len:-pad_len] - pad_len = int(self.target_sample * pad_seconds) - _audio = _audio[pad_len:-pad_len] - audio.extend(list(_audio)) - return np.array(audio) + # add fade + fade_len = int(self.target_sample * fade_seconds) + _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len) + _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len) + result_audio = np.concatenate([result_audio, pad_array(_audio, length)]) + result_audio = result_audio[: audio.shape[0]] + return 
result_audio -class RealTimeVC: - def __init__(self): - self.last_chunk = None - self.last_o = None - self.chunk_len = HUBERT_SAMPLING_RATE # 区块长度 - self.pre_len = 3840 # 交叉淡化长度,640的倍数 +import maad - """输入输出都是1维numpy 音频波形数组""" - def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path): - import maad +class RealTimeVCBase: + def __init__( + self, + *, + svc_model: Svc, + crossfade_len: int = 3840, + use_slicer: bool = True, + ): + self.svc_model = svc_model + self.crossfade_len = crossfade_len + self.last_input = np.zeros(crossfade_len * 2, dtype=np.float32) + self.last_infered = np.zeros(crossfade_len * 2, dtype=np.float32) + self.use_slicer = use_slicer + + """The input and output are 1-dimensional numpy audio waveform arrays""" - audio, sr = torchaudio.load(input_wav_path) - audio = audio.cpu().numpy()[0] - temp_wav = io.BytesIO() - if self.last_chunk is None: - input_wav_path.seek(0) - audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path) - audio = audio.cpu().numpy() - self.last_chunk = audio[-self.pre_len :] - self.last_o = audio - return audio[-self.chunk_len :] + def process( + self, + input_audio: np.ndarray[Any, np.dtype[np.float32]], + *, + # svc config + speaker: int | str, + transpose: int, + cluster_infer_ratio: float = 0, + auto_predict_f0: bool = False, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + ): + if input_audio.ndim != 1: + raise ValueError("Input audio must be 1-dimensional.") + if input_audio.shape[0] < self.crossfade_len: + raise ValueError( + f"Input audio length ({len(input_audio)}) should be at least crossfade length ({self.crossfade_len})." + ) + input_audio = input_audio.astype(np.float32) + input_audio = np.nan_to_num(input_audio) + + # create input audio + input_audio_c = np.concatenate([self.last_input, input_audio])[ + -(input_audio.shape[0] + self.crossfade_len) : + ] + self.last_input = input_audio.copy() + LOG.info( + f"Input shape: {input_audio.shape}, Concatenated shape: {input_audio_c.shape}, Crossfade length: {self.crossfade_len}" + ) + assert input_audio_c.shape[0] == input_audio.shape[0] + self.crossfade_len + + # infer + if self.use_slicer: + infered_audio_c = self.svc_model.infer_silence( + audio=input_audio_c, + speaker=speaker, + transpose=transpose, + cluster_infer_ratio=cluster_infer_ratio, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + ) else: - audio = np.concatenate([self.last_chunk, audio]) - soundfile.write(temp_wav, audio, sr, format="wav") - temp_wav.seek(0) - audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav) - audio = audio.cpu().numpy() - ret = maad.util.crossfade(self.last_o, audio, self.pre_len) - self.last_chunk = audio[-self.pre_len :] - self.last_o = audio - return ret[self.chunk_len : 2 * self.chunk_len] + rms = np.sqrt(np.mean(input_audio**2)) + min_rms = 10 ** (db_thresh / 20) + if rms < min_rms: + LOG.info(f"Skip silence: RMS={rms:.2f} < {min_rms:.2f}") + infered_audio_c = input_audio_c.copy() + else: + LOG.info(f"Start inference: RMS={rms:.2f} >= {min_rms:.2f}") + infered_audio_c, _ = self.svc_model.infer( + speaker=speaker, + transpose=transpose, + audio=input_audio_c, + cluster_infer_ratio=cluster_infer_ratio, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + ) + infered_audio_c = infered_audio_c.cpu().numpy() + infered_audio_c = infered_audio_c + LOG.info(f"Concentrated Inferred shape: {infered_audio_c.shape}") + assert 
infered_audio_c.shape[0] == input_audio_c.shape[0] + + # crossfade + result = maad.util.crossfade( + self.last_infered, infered_audio_c, 1, self.crossfade_len + )[: input_audio.shape[0]] + LOG.info(f"Result shape: {result.shape}") + assert result.shape[0] == input_audio.shape[0] + self.last_infered = infered_audio_c + return result diff --git a/src/so_vits_svc_fork/inference/slicer.py b/src/so_vits_svc_fork/inference/slicer.py index 21309ad5..05e728da 100644 --- a/src/so_vits_svc_fork/inference/slicer.py +++ b/src/so_vits_svc_fork/inference/slicer.py @@ -1,6 +1,4 @@ import librosa -import torch -import torchaudio class Slicer: @@ -172,22 +170,31 @@ def slice(self, waveform): return chunk_dict -def cut(audio_path, db_thresh=-30, min_len=5000): - audio, sr = librosa.load(audio_path, sr=None) +from typing import Any, Iterable + +from numpy import dtype, float32, ndarray + + +def cut( + audio: "ndarray[Any, dtype[float32]]", + sr: int, + db_thresh: int = -30, + min_len: int = 5000, +) -> "dict[str, dict[str, bool | str]]": slicer = Slicer(sr=sr, threshold=db_thresh, min_length=min_len) chunks = slicer.slice(audio) return chunks -def chunks2audio(audio_path, chunks): +def chunks2audio( + audio: "ndarray[Any, dtype[float32]]", chunks: "dict[str, dict[str, bool | str]]" +) -> Iterable[tuple[bool, "ndarray[Any, dtype[float32]]"]]: chunks = dict(chunks) - audio, sr = torchaudio.load(audio_path) - if len(audio.shape) == 2 and audio.shape[1] >= 2: - audio = torch.mean(audio, dim=0).unsqueeze(0) - audio = audio.cpu().numpy()[0] + if audio.ndim == 2: + audio = audio.mean(axis=1) result = [] for k, v in chunks.items(): tag = v["split_time"].split(",") if tag[0] != tag[1]: result.append((v["slice"], audio[int(tag[0]) : int(tag[1])])) - return result, sr + return result diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py index f6fc1afb..e1e4b85d 100644 --- a/src/so_vits_svc_fork/inference_main.py +++ b/src/so_vits_svc_fork/inference_main.py @@ -1,71 +1,123 @@ from __future__ import annotations -import io from logging import getLogger from pathlib import Path from typing import Literal +import librosa import numpy as np import soundfile import torch -from tqdm import tqdm -from .inference import infer_tool, slicer -from .inference.infer_tool import Svc +from .inference.infer_tool import RealTimeVCBase, Svc LOG = getLogger(__name__) def infer( + *, + # paths input_path: Path, output_path: Path, - speaker: str, model_path: Path, config_path: Path, + # svc config + speaker: str, cluster_model_path: Path | None = None, transpose: int = 0, - db_thresh: int = -40, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, pad_seconds: float = 0.5, device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu", ): svc_model = Svc( - model_path.as_posix(), config_path.as_posix(), cluster_model_path, device + net_g_path=model_path.as_posix(), + config_path=config_path.as_posix(), + cluster_model_path=cluster_model_path.as_posix() + if cluster_model_path + else None, + device=device, ) - infer_tool.format_wav(input_path) - input_path = Path(input_path).with_suffix(".wav") - chunks = slicer.cut(input_path, db_thresh=db_thresh) - audio_data, audio_sr = slicer.chunks2audio(input_path, chunks) + wav, sr = librosa.load(input_path, sr=svc_model.target_sample) + audio = svc_model.infer_silence( + wav, + speaker=speaker, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + transpose=transpose, + 
auto_predict_f0=auto_predict_f0, + cluster_infer_ratio=cluster_infer_ratio, + noise_scale=noise_scale, + ) - audio = [] - for slice_tag, data in tqdm(audio_data): - # segment length - length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample)) - if slice_tag: - LOG.info("skip non-speaking segment") - _audio = np.zeros(length) - else: - # pad - pad_len = int(audio_sr * pad_seconds) - data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])]) - raw_path = io.BytesIO() - soundfile.write(raw_path, data, audio_sr, format="wav") - raw_path.seek(0) - out_audio, out_sr = svc_model.infer( - speaker, - transpose, - raw_path, - cluster_infer_ratio=cluster_infer_ratio, - auto_predict_f0=auto_predict_f0, - noice_scale=noise_scale, - ) - _audio = out_audio.cpu().numpy() - pad_len = int(svc_model.target_sample * pad_seconds) - _audio = _audio[pad_len:-pad_len] + soundfile.write(output_path, audio, svc_model.target_sample) - audio.extend(list(infer_tool.pad_array(_audio, length))) - soundfile.write(output_path, audio, svc_model.target_sample) +import sounddevice as sd + + +def realtime( + *, + # paths + model_path: Path, + config_path: Path, + # svc config + speaker: str, + cluster_model_path: Path | None = None, + transpose: int = 0, + auto_predict_f0: bool = False, + cluster_infer_ratio: float = 0, + noise_scale: float = 0.4, + # slice config + db_thresh: int = -40, + pad_seconds: float = 0.5, + # realtime config + crossfade_seconds: float = 0.05, + block_seconds: float = 0.5, + device: Literal["cpu", "cuda"] = "cuda" if torch.cuda.is_available() else "cpu", +): + svc_model = Svc( + net_g_path=model_path.as_posix(), + config_path=config_path.as_posix(), + cluster_model_path=cluster_model_path.as_posix() + if cluster_model_path + else None, + device=device, + ) + model = RealTimeVCBase( + svc_model=svc_model, + crossfade_len=int(crossfade_seconds * svc_model.target_sample), + ) + + def callback( + indata: np.ndarray, + outdata: np.ndarray, + frames: int, + time: int, + status: sd.CallbackFlags, + ) -> None: + LOG.info(f"Frames: {frames}, Status: {status}, Shape: {indata.shape}") + + outdata[:] = model.process( + input_audio=indata.mean(axis=1), + speaker=speaker, + transpose=transpose, + auto_predict_f0=auto_predict_f0, + noise_scale=noise_scale, + cluster_infer_ratio=cluster_infer_ratio, + db_thresh=db_thresh, + pad_seconds=pad_seconds, + ).reshape(-1, 1) + + with sd.Stream( + channels=1, + callback=callback, + samplerate=svc_model.target_sample, + blocksize=int(block_seconds * svc_model.target_sample), + ): + while True: + sd.sleep(1)
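To make the buffering in `RealTimeVCBase.process` easier to follow, here is a minimal, self-contained NumPy sketch of the same idea: keep the tail of the previous input block as left context, run inference on the padded block, and linearly crossfade the overlapping region against the previous output so block boundaries do not click. This is only an illustration under stated assumptions: `fake_infer`, `CrossfadeBuffer`, and the demo at the bottom are stand-ins invented for this sketch, not part of the library (the real code calls `Svc.infer` and `maad.util.crossfade`); the 3840-sample fade length mirrors the class above.

```python
# Minimal sketch of the block-buffering + crossfade idea behind RealTimeVCBase.
# Assumption: `fake_infer` stands in for Svc.infer; the crossfade is a plain
# linear fade instead of maad.util.crossfade.
import numpy as np


def fake_infer(audio: np.ndarray) -> np.ndarray:
    """Stand-in for Svc.infer: any function mapping N input samples to N output samples."""
    return audio.copy()


class CrossfadeBuffer:
    """Keeps enough left context between calls to hide the seams between blocks."""

    def __init__(self, crossfade_len: int = 3840) -> None:
        self.crossfade_len = crossfade_len
        self.prev_in_tail = np.zeros(crossfade_len, dtype=np.float32)   # last input samples
        self.prev_out_tail = np.zeros(crossfade_len, dtype=np.float32)  # last inferred samples

    def process(self, block: np.ndarray) -> np.ndarray:
        if block.shape[0] < self.crossfade_len:
            raise ValueError("block must be at least crossfade_len samples long")
        # 1. Prepend the previous input tail so inference sees left context.
        padded = np.concatenate([self.prev_in_tail, block])
        self.prev_in_tail = block[-self.crossfade_len :].copy()
        # 2. Run (stand-in) inference on the padded block.
        out = fake_infer(padded)
        # 3. The first crossfade_len output samples cover the same input span
        #    as the tail of the previous call's output; blend them linearly
        #    to avoid clicks at the seam.
        fade = np.linspace(0.0, 1.0, self.crossfade_len, dtype=np.float32)
        head = self.prev_out_tail * (1.0 - fade) + out[: self.crossfade_len] * fade
        self.prev_out_tail = out[-self.crossfade_len :].copy()
        # 4. Emit exactly one block worth of samples (the output lags the input
        #    by crossfade_len samples, about 87 ms at 44.1 kHz with the defaults).
        return np.concatenate([head, out[self.crossfade_len : block.shape[0]]])


if __name__ == "__main__":
    # Feed two half-second blocks of noise through the buffer and check lengths.
    buf = CrossfadeBuffer()
    stream = np.random.default_rng(0).standard_normal(44100).astype(np.float32)
    blocks = [stream[i : i + 22050] for i in range(0, stream.shape[0], 22050)]
    out = np.concatenate([buf.process(b) for b in blocks])
    assert out.shape == stream.shape
```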