first commit

yuyq96 · Aug 8, 2020 · 0944f75 · 0944f75
commit 0944f75
Show file tree

Hide file tree

Showing 9 changed files with 488 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,53 @@
+# Densely Connected Time Delay Neural Network
+
+PyTorch implementation of Densely Connected Time Delay Neural Network (D-TDNN) in our paper ["Densely Connected Time Delay Neural Network for Speaker Verification"](https://www.isca-speech.org/archive/Interspeech_2020/abstracts/1275.html) (INTERSPEECH 2020).
+
+We provide the [pretrained models](https://github.com/yuyq96/D-TDNN/releases) which can be used in many tasks such as:
+
+- Speaker Verification
+- Speaker Adaptation for Speech Recognition
+- Speaker-Dependent Speech Separation
+- Multi-Speaker Text-to-Speech
+
+![D-TDNN & D-TDNN-SS](figure/D_TDNN.png)
+
+## Usage
+
+Data preparation
+* Install [Kaldi](https://github.com/kaldi-asr/kaldi) toolkit.
+* Download [VoxCeleb1 test set](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) and unzip it.
+* Place `prepare_voxceleb1_test.sh` under `$kaldi_root/egs/voxceleb/v2` and change the `$datadir` and `$voxceleb1_root` in it.
+* Run `chmod +x prepare_voxceleb1_test.sh && ./prepare_voxceleb1_test.sh` to generate acoustic features ([30-Dim MFCCs](https://github.com/kaldi-asr/kaldi/blob/master/egs/voxceleb/v2/conf/mfcc.conf)).
+* Replace the `trials` under `$datadir/test_no_sil` with the [clean version](https://github.com/yuyq96/D-TDNN/releases).
+
+Test
+```
+python main.py --root $datadir/test_no_sil --model D-TDNN --checkpoint model_zoo/dtdnn.pth --device cuda
+```
+
+## Evaluation
+
+VoxCeleb1-O
+
+| Model | Emb. | Params (M) | Loss | Backend | EER (%) | DCF_0.01 | DCF_0.001 |
+| :---- | :--: | :--------: | :--: | :-----: | :-----: | :------: | :-------: |
+| [TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 4.2 | Softmax | PLDA | 2.34 | 0.28 | 0.38 |
+| E-TDNN | 512 | 6.1 | Softmax | PLDA | 2.08 | 0.26 | 0.41 |
+| F-TDNN | 512 | 12.4 | Softmax | PLDA | 1.89 | 0.21 | 0.29 |
+| [D-TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 2.8 | Softmax | Cosine | 1.81 | 0.20 | 0.28 |
+| D-TDNN-SS (0) | 512 | 3.0 | Softmax | Cosine | 1.55 | 0.20 | 0.30 |
+| D-TDNN-SS | 512 | 3.5 | Softmax | Cosine | 1.41 | 0.19 | 0.24 |
+| D-TDNN-SS | 128 | 3.1 | AAM-Softmax | Cosine | 1.22 | 0.13 | 0.20 |
+
+## Citation
+
+If you find that D-TDNN helps your research, please cite:
+```
+@inproceedings{DBLP:conf/interspeech/YuL20,
+  author    = {Ya-Qi Yu and
+               Wu-Jun Li},
+  title     = {Densely Connected Time Delay Neural Network for Speaker Verification},
+  booktitle = {Annual Conference of the International Speech Communication Association (INTERSPEECH)},
+  year      = {2020}
+}
+```
diff --git a/data.py b/data.py
@@ -0,0 +1,32 @@
+import os
+
+import kaldiio
+from torch.utils.data import Dataset
+
+
+class KaldiFeatDataset(Dataset):
+    """Dataset over the entries listed in a Kaldi-style ``feats.scp`` file.
+
+    Each item is a ``(feats, utt)`` pair: the matrix returned by
+    ``kaldiio.load_mat`` (after the optional transform) and the utterance id.
+
+    Args:
+        root: Directory that contains ``feats.scp``.
+        transform: Optional callable applied to every loaded matrix.
+    """
+
+    def __init__(self, root, transform=None):
+        super(KaldiFeatDataset, self).__init__()
+        self.transform = transform
+        # List of (feature specifier, utterance id), in file order.
+        self.feats = []
+        with open(os.path.join(root, 'feats.scp'), 'r') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # Tolerate blank lines instead of failing on unpacking.
+                    continue
+                # Split on the first run of whitespace only, so tabs or
+                # repeated spaces work and the specifier is kept intact.
+                utt, feats = line.split(None, 1)
+                self.feats.append((feats, utt))
+
+    def __len__(self):
+        """Return the number of entries read from ``feats.scp``."""
+        return len(self.feats)
+
+    def __getitem__(self, index):
+        """Load the matrix for entry ``index`` and return ``(feats, utt)``."""
+        feats, utt = self.feats[index]
+        feats = kaldiio.load_mat(feats)
+        if self.transform is not None:
+            feats = self.transform(feats)
+        return feats, utt
+
+
+class Transpose2D(object):
+    """Callable transform that swaps the two axes of its input."""
+
+    def __call__(self, a):
+        # Axis order (1, 0): the second axis becomes the first.
+        swapped = a.transpose((1, 0))
+        return swapped
diff --git a/figure/D_TDNN.png b/figure/D_TDNN.png
diff --git a/main.py b/main.py
@@ -0,0 +1,72 @@
+import argparse
+import os
+
+import numpy as np
+import torch
+from numpy import linalg
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from data import KaldiFeatDataset, Transpose2D
+from metric import compute_fnr_fpr, compute_eer, compute_c_norm
+from model.tdnn import TDNN
+from model.dtdnn import DTDNN
+
+def _str2bool(value):
+    """Parse a command-line boolean such as ``True``/``False`` or ``1``/``0``."""
+    if isinstance(value, bool):
+        return value
+    lowered = value.strip().lower()
+    if lowered in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if lowered in ('false', 'f', 'no', 'n', '0', ''):
+        return False
+    raise argparse.ArgumentTypeError("expected a boolean value, got '{}'".format(value))
+
+
+# Command-line options for the evaluation script.
+parser = argparse.ArgumentParser(description='Speaker Verification')
+parser.add_argument('--root', default='data', type=str)
+parser.add_argument('--model', default='D-TDNN', choices=['TDNN', 'D-TDNN'])
+parser.add_argument('--checkpoint', default=None, type=str)
+parser.add_argument('--device', default="cpu", choices=['cpu', 'cuda'])
+# ``type=bool`` would map every non-empty string (including "False") to True,
+# so the value is parsed explicitly.
+parser.add_argument('--pin-memory', default=True, type=_str2bool)
+
+
+def load_model():
+    """Build the model selected by ``args.model`` and load its checkpoint.
+
+    Reads the module-level ``args`` and ``device`` that are bound in the
+    ``__main__`` block.
+
+    Returns:
+        The model, moved to ``device``, with the checkpoint's ``'state_dict'``
+        entry loaded into it.
+    """
+    # Check for None first so a missing --checkpoint yields the message below
+    # rather than a TypeError from os.path.isfile(None).
+    assert args.checkpoint is not None and os.path.isfile(args.checkpoint), \
+        "No checkpoint found at '{}'".format(args.checkpoint)
+    print('Loading checkpoint {}'.format(args.checkpoint))
+    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
+    # map_location remaps the stored tensors onto the requested device, so a
+    # checkpoint saved on GPU can still be loaded on a CPU-only machine.
+    state_dict = torch.load(args.checkpoint, map_location=device)['state_dict']
+    if args.model == 'TDNN':
+        model = TDNN()
+    else:
+        model = DTDNN()
+    model.to(device)
+    model.load_state_dict(state_dict)
+    return model
+
+
+def test():
+    """Extract one embedding per utterance, score the trials and print metrics.
+
+    Reads the module-level ``args`` and ``device``.  Features come from
+    ``feats.scp`` under ``args.root``; trials come from the ``trials`` file in
+    the same directory, one ``utt1 utt2 label`` triple per line, where the
+    label ``target`` marks a positive trial.  Each trial is scored with the
+    cosine similarity of the two embeddings.
+    """
+    model = load_model()
+    model.eval()
+
+    transform = Transpose2D()
+    dataset = KaldiFeatDataset(root=args.root, transform=transform)
+    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=args.pin_memory)
+
+    utt2emb = {}
+    # Inference only: a single no_grad context covers the whole loop.
+    with torch.no_grad():
+        for data, utt in tqdm(loader):
+            data = data.to(device)
+            emb = model(data)
+            # batch_size is 1, so index 0 is the only item of the batch.
+            utt2emb[utt[0]] = emb[0].cpu().numpy()
+
+    scores = []
+    labels = []
+    with open(os.path.join(args.root, 'trials'), 'r') as f:
+        for line in f:
+            # Whitespace-agnostic split; blank lines are skipped instead of
+            # failing on unpacking.
+            fields = line.split()
+            if not fields:
+                continue
+            utt1, utt2, label = fields
+            emb1, emb2 = utt2emb[utt1], utt2emb[utt2]
+            score = emb1.dot(emb2) / (linalg.norm(emb1) * linalg.norm(emb2))
+            scores.append(score)
+            labels.append(1 if label == 'target' else 0)
+
+    # The trials file is closed before the metrics are computed.
+    scores = np.array(scores)
+    labels = np.array(labels)
+    fnr, fpr = compute_fnr_fpr(scores, labels)
+    eer, th = compute_eer(fnr, fpr, True, scores)
+    print('Equal error rate is {:6f}%, at threshold {:6f}'.format(eer * 100, th))
+    print('Minimum detection cost (0.01) is {:6f}'.format(compute_c_norm(fnr, fpr, 0.01)))
+    print('Minimum detection cost (0.001) is {:6f}'.format(compute_c_norm(fnr, fpr, 0.001)))
+
+
+if __name__ == '__main__':
+    # `args` and `device` are bound at module level here; load_model() and
+    # test() read them as globals rather than receiving them as parameters.
+    args = parser.parse_args()
+    device = torch.device(args.device)
+    test()
diff --git a/metric.py b/metric.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+
+def compute_fnr_fpr(scores, labels):
+    """ computes the false negative rate (FNR) and false positive rate (FPR)
+    curves for a set of scored trials.
+
+    Trials are ordered by ascending score.  Entry ``i`` of the FNR is the
+    fraction of all target trials (label 1) found among the ``i + 1``
+    lowest-scoring trials; entry ``i`` of the FPR is one minus the same
+    fraction for nontarget trials (label 0).
+    """
+
+    order = np.argsort(scores)
+    ranked = labels[order]
+
+    is_target = (ranked == 1).astype('f8')
+    is_nontarget = (ranked == 0).astype('f8')
+
+    miss_rate = is_target.cumsum() / is_target.sum()
+    false_alarm_rate = 1 - is_nontarget.cumsum() / is_nontarget.sum()
+    return miss_rate, false_alarm_rate
+
+
+def compute_eer(fnr, fpr, requires_threshold=False, scores=None):
+    """ computes the equal error rate (EER) given FNR and FPR values calculated
+        for a range of operating points on the DET curve
+        *kaldi style*
+
+        Args:
+            fnr: false negative rates, one per operating point.
+            fpr: false positive rates, aligned with ``fnr``.
+            requires_threshold: when true, also return the score at the
+                crossing point.
+            scores: trial scores; required when ``requires_threshold`` is true.
+
+        Returns:
+            ``eer``, or ``(eer, th)`` when ``requires_threshold`` is true.
+    """
+
+    diff_miss_fa = fnr - fpr
+    # First operating point at which FNR is no longer below FPR.
+    x = np.flatnonzero(diff_miss_fa >= 0)[0]
+    # The EER is read one point before the crossing.  Clamp at 0 so that a
+    # crossing at the very first point does not wrap around to fnr[-1].
+    eer = fnr[max(x - 1, 0)]
+    if requires_threshold:
+        assert scores is not None
+        # Sorted ascending so that index x lines up with the FNR/FPR arrays.
+        scores = np.sort(scores)
+        th = scores[x]
+        return eer, th
+    return eer
+
+
+def compute_c_norm(fnr, fpr, p_target, c_miss=1, c_fa=1):
+    """ computes the normalized minimum detection cost function (DCF)
+
+        The cost at each operating point is the miss cost weighted by the
+        target prior plus the false-alarm cost weighted by the nontarget
+        prior.  The smallest such cost is divided by the smaller of the two
+        prior-weighted costs.
+    """
+
+    p_nontarget = 1 - p_target
+    cost_curve = c_miss * fnr * p_target + c_fa * fpr * p_nontarget
+    best_cost = np.min(cost_curve)
+    default_cost = min(c_miss * p_target, c_fa * p_nontarget)
+    return best_cost / default_cost
diff --git a/model/dtdnn.py b/model/dtdnn.py
@@ -0,0 +1,40 @@
+from collections import OrderedDict
+
+from torch import nn
+
+from .layers import TDNNLayer, DenseTDNNBlock, TransitLayer, DenseLayer, StatsPool
+
+
+class DTDNN(nn.Module):
+    """Sequential stack of TDNN, dense blocks, pooling and an embedding layer.
+
+    The ``xvector`` container holds, in order: an initial ``TDNNLayer``
+    (``tdnn``), two ``DenseTDNNBlock`` modules (``block1``, ``block2``), each
+    followed by a ``TransitLayer`` that halves the channel count
+    (``transit1``, ``transit2``), a ``StatsPool`` (``stats``) and a final
+    ``DenseLayer`` (``dense``) producing ``embedding_size`` outputs.
+    """
+
+    def __init__(self, feat_dim=30, embedding_size=512,
+                 growth_rate=64, bn_size=2, init_channels=128,
+                 config_str='batchnorm-relu'):
+        super(DTDNN, self).__init__()
+
+        # One (num_layers, kernel_size, dilation) triple per dense block.
+        block_specs = ((6, 3, 1), (12, 3, 3))
+
+        frontend = TDNNLayer(feat_dim, init_channels, 5, dilation=1, padding=-1,
+                             config_str=config_str)
+        self.xvector = nn.Sequential(OrderedDict([('tdnn', frontend)]))
+
+        width = init_channels
+        for idx, (depth, ksize, dil) in enumerate(block_specs, start=1):
+            dense_block = DenseTDNNBlock(
+                num_layers=depth,
+                in_channels=width,
+                out_channels=growth_rate,
+                bn_channels=bn_size * growth_rate,
+                kernel_size=ksize,
+                dilation=dil,
+                config_str=config_str
+            )
+            self.xvector.add_module('block%d' % idx, dense_block)
+            # Channel count after the block: input plus one growth step per layer.
+            width += depth * growth_rate
+            halved = width // 2
+            self.xvector.add_module(
+                'transit%d' % idx,
+                TransitLayer(width, halved, bias=False, config_str=config_str))
+            width = halved
+
+        self.xvector.add_module('stats', StatsPool())
+        # The dense layer is declared with twice the channel count as input.
+        self.xvector.add_module('dense', DenseLayer(width * 2, embedding_size, config_str='batchnorm_'))
+
+    def forward(self, x):
+        """Run ``x`` through the ``xvector`` stack and return its output."""
+        return self.xvector(x)