PaddlePaddle · LeoMax-Xiong · Mar 24, 2022 · Feb 24, 2022 · Feb 25, 2022 · Feb 25, 2022
diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
@@ -189,4 +189,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
@@ -6,3 +6,23 @@ sv0 - speaker verfication with softmax backend etc, all python code
 
 sv1 - dependence on kaldi, speaker verfication with plda/sc backend, 
       more info refer to the sv1/readme.txt
+
+
+## VoxCeleb2 preparation
+
+VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted to wav files before feeding them into PaddleSpeech.
+Please, follow these steps to prepare the dataset correctly:
+
+1. Download Voxceleb2.
+You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
+
+2. Convert .m4a to wav
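+VoxCeleb2 stores files in the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files. In the command below, the first `%s` is the input m4a file and the second `%s` is the output wav file.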
+
+``` shell
+ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
+```
+
+You can do the conversion using ffmpeg (https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be done only once.
+
+3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g., `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -0,0 +1,52 @@
+###########################################
+#                Data                 #
+###########################################
+# we should explicitly specify the wav path of vox2 audio data converted from m4a
+vox2_base_path: 
+augment: True
+batch_size: 16
+num_workers: 2
+num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+shuffle: True
+random_chunk: True
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+# currently, we only support fbank
+sample_rate: 16000
+n_mels: 80
+window_size: 400     #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 
+hop_length: 160     #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
+# if we want to use another model, please choose another configuration yaml file
+model:
+  input_size: 80
+  # "channels": [512, 512, 512, 512, 1536],
+  channels: [1024, 1024, 1024, 1024, 3072]
+  kernel_sizes: [5, 3, 3, 3, 1]
+  dilations: [1, 2, 3, 4, 1]
+  attention_channels: 128
+  lin_neurons: 192
+
+###########################################
+#                Training                 #
+###########################################
+seed: 1986 # following the speechbrain configuration
+epochs: 10
+save_interval: 1
+log_interval: 1
+learning_rate: 1e-8
+
+
+###########################################
+#                Testing                  #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+dir=$1
+conf_path=$2
+mkdir -p ${dir}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+    # we should use the local/convert.sh convert m4a to wav
+    python3 local/data_prepare.py \
+                        --data-dir ${dir} \
+                        --config ${conf_path}
+fi 
diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+from yacs.config import CfgNode
+
+from paddleaudio.datasets.voxceleb import VoxCeleb
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+
+def main(args, config):
+
+    # stage0: set the cpu device, all data prepare process will be done in cpu mode
+    paddle.set_device("cpu")
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    # stage 1: generate the voxceleb csv file
+    # Note: this may occurs c++ execption, but the program will execute fine
+    # so we ignore the execption 
+    # we explicitly pass the vox2 base path to data prepare and generate the audio info
+    logger.info("start to generate the voxceleb dataset info")
+    train_dataset = VoxCeleb(
+        'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
+
+    # stage 2: generate the augment noise csv file
+    if config.augment:
+        logger.info("start to generate the augment dataset info")
+        augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+
+
+if __name__ == "__main__":
+    # yapf: disable
+    parser = argparse.ArgumentParser(__doc__)
+    parser.add_argument("--data-dir",
+                        default="./data/",
+                        type=str,
+                        help="data directory")
+    parser.add_argument("--config",
+                        default=None,
+                        type=str,
+                        help="configuration file")
+    args = parser.parse_args()
+    # yapf: enable
+
+    # https://yaml.org/type/float.html
+    config = CfgNode(new_allowed=True)
+    if args.config:
+        config.merge_from_file(args.config)
+
+    config.freeze()
+    print(config)
+
+    main(args, config)
diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+. ./path.sh
+
+exp_dir=exp/ecapa-tdnn-vox12-big//epoch_10/            # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+audio_path="demo/voxceleb/00001.wav"
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# extract the audio embedding
+python3 ${BIN_DIR}/extract_emb.py --device "gpu" \
+          --config ${conf_path} \
+          --audio-path ${audio_path} --load-checkpoint ${exp_dir}
diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh
@@ -0,0 +1,8 @@
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+python3 ${BIN_DIR}/test.py \
+        --config ${conf_path} \
+        --data-dir ${dir} \
+        --load-checkpoint ${exp_dir}
diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+# train the speaker identification task with voxceleb data
+# Note: we will store the log file in exp/log directory
+python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
+    ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
+    --data-dir ${dir} --config ${conf_path}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# repository root: three levels above the directory this file is sourced from
+export MAIN_ROOT=$(realpath "${PWD}/../../../")
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# only append the old value when it is set, to avoid a dangling ':' entry
+export PYTHONPATH=${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}
+
+# an empty entry in LD_LIBRARY_PATH stands for the current working directory,
+# so the separator is only added when the variable already has a value
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/
+
+# BIN_DIR points to the experiment scripts of the selected model
+MODEL=ecapa_tdnn
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. ./path.sh
+set -e
+
+#######################################################################
+# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
+#          voxceleb2 data is in m4a format, so users need to convert the m4a files to wav themselves as described in README.md with the script local/convert.sh
+# stage 1: train the speaker identification model
+# stage 2: test speaker identification 
+# stage 3: extract the training embedding to train the LDA and PLDA
+######################################################################
+
+# we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 dataset 
+# by default the dataset will be stored in ~/.paddleaudio/
+# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav ourselves
+# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
+# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
+# export PPAUDIO_HOME=
+stage=0
+stop_stage=50
+
+# data directory
+# if we set the variable ${dir}, we will store the wav info to this directory
+# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+# vox2 wav path, we must convert the m4a format to wav format 
+# dir=data-demo/                          # data info directory    
+dir=demo/                          # data info directory   
+
+exp_dir=exp/ecapa-tdnn-vox12-big//            # experiment directory
+conf_path=conf/ecapa_tdnn.yaml          
+gpus=0,1,2,3
+
+source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1;
+
+# the paths are quoted below so that values containing spaces are passed intact
+mkdir -p "${exp_dir}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
+     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+     # and we should specify the vox2 wav path (vox2_base_path) in ${conf_path}
+     bash ./local/data.sh "${dir}" "${conf_path}" || exit 1;
+fi 
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+     # stage 1: train the speaker identification model
+     CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh "${dir}" "${exp_dir}" "${conf_path}"
+fi
+
+# stage 2 honours stop_stage like the other stages, so that e.g.
+# --stop-stage 1 does not run the test
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # stage 2: get the speaker verification scores with cosine function
+     #          now we only support use cosine to get the scores
+     CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh "${dir}" "${exp_dir}" "${conf_path}"
+fi
+
+# if [ $stage -le 3 ]; then
+#      # stage 3: extract the training embedding to train the LDA and PLDA
+#      # todo: extract the training embedding
+# fi 
diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils
@@ -0,0 +1 @@
+../../../utils/
diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py
@@ -15,3 +15,5 @@
 from .gtzan import GTZAN
 from .tess import TESS
 from .urban_sound import UrbanSound8K
+from .voxceleb import VoxCeleb
+from .rirs_noises import OpenRIRNoise