deepvk · maks00170 · Jan 16, 2024 · Oct 9, 2023 · Oct 9, 2023 · Oct 9, 2023
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -0,0 +1,18 @@
+name: Main
+
+# Formatting gate: runs `black --check` on every push and pull request.
+on: [push, pull_request]
+
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          # Quoted so YAML can never reinterpret the version as a number.
+          python-version: "3.10.12"
+          cache: "pip"
+          # Key the pip cache on the file this job actually installs from.
+          cache-dependency-path: requirements-dev.txt
+      - name: "installation"
+        run: pip install -r requirements-dev.txt
+      - name: "black"
+        run: black . --check --diff --color
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
+
+# cuDNN 8 package pinned to the build published for CUDA 11.8.
+ENV NV_CUDNN_VERSION=8.6.0.163 \
+    NV_CUDNN_PACKAGE_NAME=libcudnn8
+ENV NV_CUDNN_PACKAGE="${NV_CUDNN_PACKAGE_NAME}=${NV_CUDNN_VERSION}-1+cuda11.8"
+# Set image-wide: a bare assignment appended to ~/.bashrc is not exported,
+# so processes started from the shell would never see it.
+ENV NCCL_SOCKET_IFNAME=lo
+# Build-time only, so the setting does not leak into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# All OS packages are installed in one layer and the apt lists are removed in
+# that same layer. No blanket `apt-get upgrade`: bump the base image tag instead.
+# python3-pip is installed with its recommended packages, as before, because
+# source builds of some Python requirements may rely on them.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ffmpeg \
+        ${NV_CUDNN_PACKAGE} \
+        unzip \
+    && apt-get install -y python3-pip \
+    && apt-mark hold ${NV_CUDNN_PACKAGE_NAME} \
+    && rm -rf /var/lib/apt/lists/*
+# Convenience alias for the interactive shell started by ENTRYPOINT.
+RUN echo 'alias python=python3' >> ~/.bashrc
+
+WORKDIR /app
+# Only the manifest is copied, so the dependency layer is rebuilt only when it changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+ENTRYPOINT [ "bash" ]
diff --git a/README.md b/README.md
@@ -0,0 +1,73 @@
+# PM-Unet: phase and magnitude aware model for music source separation
+ [![githubio](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://d-a-yakovlev.github.io/test/)
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OXlCZgd5KidMDZDUItOIT9ZA4IUJHXsZ?usp=sharing)
+
+## Navigation
+1. [Structure](#structure)
+2. [Docker](#docker)
+3. [Training](#training)
+4. [Inference](#inference)
+
+## Structure
+- [`separator`](./separator) ‒ main source code with model and dataset implementations and code to train model.
+- [`streaming`](./streaming/demo) ‒ source code for running inference with the TF-Lite version of the model.
+
+## Docker
+#### To set up environment with Docker
+
+If you don't have Docker installed, please follow the links to find installation instructions for [Ubuntu](https://docs.docker.com/desktop/install/linux-install/), [Mac](https://docs.docker.com/desktop/install/mac-install/) or [Windows](https://docs.docker.com/desktop/install/windows-install/).
+
+Build docker image:
+
+    docker build -t pmunet .
+
+Run docker image:
+
+    bash run_docker.sh
+
+## Data
+This project uses the [MUSDB18-HQ](https://sigsep.github.io/datasets/musdb.html#musdb18-hq-uncompressed-wav) dataset.
+
+[![Download dataset](https://img.shields.io/badge/Download%20dataset-65c73b)](https://zenodo.org/record/3338373/files/musdb18hq.zip?download=1)
+
+The dataset consists of
+150 full-length stereo tracks sampled at 44.1 kHz, providing a
+complete audio mix and four main elements ("vocals", "bass",
+"drums" and "other") for each sample, which can be considered as targets in the context of source separation. The dataset
+is split into 100 training compositions and 50 validation
+compositions.
+
+## Training
+1. Configure arguments in `separator/config/config.py`.
+2. `cd separator`.
+3. Run `python3 separator/pl_model.py`.
+
+## Inference
+
+### Auto local
+1. Configure arguments in `separator/config/config.py`.
+2. `cd separator`.
+3. `python3 inference.py [-IO]`
+    - `-I` path to the input mixture;
+    - `-O` path to the output directory. Both options are optional.
+
+By default, the script downloads the `.pt` file with weights and `sample.wav` from Google Drive.
+
+#### For example
+``` 
+python3 inference.py -I path/to/mix -O out_dir
+```
+After a successful run, four audio files (`vocals.wav`, `drums.wav`, `bass.wav` and `other.wav`) are written to `out_dir`, which defaults to `separator/inference/output`.
+
+**You can download weights manually**
+
+Download one of the `.pt` files below:
+ * [LSTM-bottleneck version](https://drive.google.com/file/d/18jT2TYffdRD1fL7wecAiM5nJPM_OKpNB/view?usp=drive_link)
+ * [Without LSTM-bottleneck version](https://drive.google.com/file/d/1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7/view?usp=drive_link)
+
+ ### Streaming
+ The streaming section contains scripts that convert the model to `tflite` format and run the `tflite` model in `"stream mode"`.
+
+1. Configure arguments in `streaming/config/config.py`.
+2. `cd streaming`.
+3. `python3 runner.py`
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,3 @@
+black
+mypy
+pytest
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,139 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+antlr4-python3-runtime==4.9.3
+appdirs==1.4.4
+asttokens
+async-timeout==4.0.2
+attrs==23.1.0
+audioread==3.0.0
+backcall
+certifi==2023.5.7
+cffi==1.15.1
+charset-normalizer==3.1.0
+cmake==3.26.4
+comm
+contourpy
+cycler
+Cython==0.29.35
+debugpy
+decorator
+diffq==0.2.4
+einops==0.6.1
+executing
+fast-bss-eval==0.1.4
+ffmpeg-python==0.2.0
+filelock==3.12.0
+fonttools==4.25.0
+frozenlist==1.3.3
+fsspec==2023.6.0
+future==0.18.3
+gdown
+idna==3.4
+ipykernel
+ipython
+jedi
+Jinja2==3.1.2
+joblib==1.3.1
+jsonschema==4.19.0
+jsonschema-specifications==2023.7.1
+julius==0.2.7
+jupyter_client
+jupyter_core
+kiwisolver
+lameenc==1.4.2
+lazy_loader==0.3
+librosa==0.10.0.post2
+lightning-utilities==0.8.0
+lit==16.0.5.post0
+llvmlite==0.40.1
+lpips==0.1.4
+MarkupSafe==2.1.3
+matplotlib
+matplotlib-inline
+mir-eval==0.7
+mkl-fft==1.3.6
+mkl-random
+mkl-service==2.4.0
+mpmath==1.3.0
+msgpack==1.0.5
+multidict==6.0.4
+munkres==1.1.4
+musdb==0.4.0
+museval==0.4.1
+nest-asyncio
+networkx==3.1
+numba==0.57.1
+numpy #==1.24.4
+nobuco
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+omegaconf==2.3.0
+openunmix==1.2.1
+packaging
+pandas==2.1.0
+parso
+pexpect
+pickleshare
+Pillow==9.5.0
+platformdirs
+ply==3.11
+pooch==1.6.0
+primePy==1.3
+prompt-toolkit
+psutil
+ptyprocess
+pure-eval
+pyaml==23.5.9
+pycparser==2.21
+pyee==10.0.1
+Pygments
+pyparsing
+PyQt5-sip==12.11.0
+PySoundFile==0.9.0.post1
+python-dateutil
+python-ffmpeg==2.0.4
+pytorch-lightning==2.0.3
+pytz==2023.3
+PyYAML==6.0
+pyzmq
+referencing==0.30.2
+requests==2.31.0
+rpds-py==0.10.0
+scikit-learn==1.3.0
+scipy==1.10.1
+simplejson==3.19.1
+sip
+six
+soundfile==0.12.1
+sox==1.4.1
+soxr==0.3.5
+stack-data
+stempeg==0.2.3
+sympy==1.12
+tensorflow>=2.13.0 #.*
+threadpoolctl==3.1.0
+toml
+torch==2.0.1
+torch-audiomentations==0.11.0
+torch-pitch-shift==1.2.4
+torchaudio==2.0.2
+torchmetrics==0.11.4
+torchvision==0.15.2
+tornado
+tqdm==4.65.0
+traitlets
+triton==2.0.0
+typing_extensions>=4.6.1
+tzdata==2023.3
+urllib3==2.0.3
+wcwidth
+yarl==1.9.2
diff --git a/run_docker.sh b/run_docker.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Start an interactive, GPU-enabled `pmunet` container with the current
+# directory bind-mounted at /app. The container is removed on exit (--rm).
+
+app=$PWD
+
+# The volume argument is quoted so a working directory containing spaces
+# is passed to docker as a single argument.
+docker run --name pmunet -it --rm \
+    --net=host --ipc=host \
+    --gpus "all" \
+    -v "${app}:/app" \
+    pmunet
diff --git a/separator/config/config.py b/separator/config/config.py
@@ -0,0 +1,97 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+
+@dataclass
+class TrainConfig:
+    """Default hyperparameters and paths for a training run.
+
+    Fields are grouped by the component they configure. How each value is
+    consumed is decided by the training code, which lives outside this file.
+    """
+
+    device: str = "cuda"
+
+    # datasets
+    musdb_path: str = "musdb18hq"
+    metadata_train_path: str = "metadata"
+    metadata_test_path: str = "metadata1"
+    segment: int = 5
+
+    # dataloaders
+    batch_size: int = 6
+    shuffle_train: bool = True
+    shuffle_valid: bool = False
+    drop_last: bool = True
+    num_workers: int = 2
+
+    # checkpoint_callback
+    metric_monitor_mode: str = "min"
+    save_top_k_model_weights: int = 1
+
+    # PM_Unet model
+    model_source: tuple = ("drums", "bass", "other", "vocals")
+    model_depth: int = 4
+    model_channel: int = 28
+    is_mono: bool = False
+    mask_mode: bool = False
+    skip_mode: str = "concat"
+    nfft: int = 4096
+    # (sic) spelling kept as-is so existing references to this field keep working
+    bottlneck_lstm: bool = True
+    layers: int = 2
+    stft_flag: bool = True
+
+    # augments
+    shift: int = 8192
+    pitchshift_proba: float = 0.2
+    vocals_min_semitones: int = -5
+    vocals_max_semitones: int = 5
+    other_min_semitones: int = -2
+    other_max_semitones: int = 2
+    pitchshift_flag_other: bool = False
+    time_change_proba: float = 0.2
+    time_change_factors: tuple = (0.8, 0.85, 0.9, 0.95, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3)
+    remix_proba: float = 1
+    # Evaluated once while the class body runs, so this default is the value of
+    # batch_size above (6); overriding batch_size on an instance does not change it.
+    remix_group_size: int = batch_size
+    scale_proba: float = 1
+    scale_min: float = 0.25
+    scale_max: float = 1.25
+    fade_mask_proba: float = 0.1
+    double_proba: float = 0.1
+    reverse_proba: float = 0.2
+    mushap_proba: float = 0.0
+    mushap_depth: int = 2
+
+    # loss: if there are artifacts while listening, increase these params
+    factor: int = 1
+    c_factor: int = 1
+    loss_nfft: tuple = (4096,)
+    gamma: float = 0.3
+
+    # lr
+    lr: float = 0.5 * 3e-3
+    T_0: int = 40
+
+    # lightning
+    max_epochs: int = 100
+    # Annotated as int-or-str: the default is the integer 16, while the
+    # alternative noted alongside it ("bf16-mixed") is a string.
+    precision: Union[int, str] = 16  # "bf16-mixed"
+    grad_clip: float = 0.5
+
+
+@dataclass
+class InferenceConfig:
+    """Default settings for inference: device, weights, segmenting and I/O paths."""
+
+    # Un-annotated, so this is a plain class attribute rather than a dataclass
+    # field; the Google Drive URLs below are built from it.
+    GDRIVE_PREFIX = "https://drive.google.com/uc?id="
+
+    device: str = "cpu"
+
+    # weights: local directory and Google Drive download URLs
+    weights_dir: Path = Path("/app/separator/inference/weights")
+    gdrive_weights_LSTM: str = GDRIVE_PREFIX + "18jT2TYffdRD1fL7wecAiM5nJPM_OKpNB"
+    gdrive_weights_conv: str = GDRIVE_PREFIX + "1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7"
+
+    # inference instance
+    segment: int = 7
+    overlap: float = 0.2
+    offset: Union[int, None] = None
+    duration: Union[int, None] = None
+
+    # inference: audio format and default input/output directories
+    sample_rate: int = 44100
+    num_channels: int = 2
+    default_result_dir: str = "/app/separator/inference/output"
+    default_input_dir: str = "/app/separator/inference/input"
+
+    # sample mixture ("adele") on Google Drive
+    gdrive_mix: str = GDRIVE_PREFIX + "1zJpyW1fYxHKXDcDH9s5DiBCYiRpraDB3"