Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyq96 committed Aug 8, 2020
0 parents commit 0944f75
Show file tree
Hide file tree
Showing 9 changed files with 488 additions and 0 deletions.
53 changes: 53 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Densely Connected Time Delay Neural Network

PyTorch implementation of Densely Connected Time Delay Neural Network (D-TDNN) in our paper ["Densely Connected Time Delay Neural Network for Speaker Verification"](https://www.isca-speech.org/archive/Interspeech_2020/abstracts/1275.html) (INTERSPEECH 2020).

We provide the [pretrained models](https://github.com/yuyq96/D-TDNN/releases) which can be used in many tasks such as:

- Speaker Verification
- Speaker Adaptation for Speech Recognition
- Speaker-Dependent Speech Separation
- Multi-Speaker Text-to-Speech

![D-TDNN & D-TDNN-SS](figure/D_TDNN.png)

## Usage

Data preparation
* Install [Kaldi](https://github.com/kaldi-asr/kaldi) toolkit.
* Download [VoxCeleb1 test set](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) and unzip it.
* Place `prepare_voxceleb1_test.sh` under `$kaldi_root/egs/voxceleb/v2` and change the `$datadir` and `$voxceleb1_root` in it.
* Run `chmod +x prepare_voxceleb1_test.sh && ./prepare_voxceleb1_test.sh` to generate acoustic features ([30-Dim MFCCs](https://github.com/kaldi-asr/kaldi/blob/master/egs/voxceleb/v2/conf/mfcc.conf)).
* Replace the `trials` under `$datadir/test_no_sil` with the [clean version](https://github.com/yuyq96/D-TDNN/releases).

Test
```
python main.py --root $datadir/test_no_sil --model D-TDNN --checkpoint model_zoo/dtdnn.pth --device cuda
```

## Evaluation

VoxCeleb1-O

| Model | Emb. | Params (M) | Loss | Backend | EER (%) | DCF_0.01 | DCF_0.001 |
| :---- | :--: | :--------: | :--: | :-----: | :-----: | :------: | :-------: |
| [TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 4.2 | Softmax | PLDA | 2.34 | 0.28 | 0.38 |
| E-TDNN | 512 | 6.1 | Softmax | PLDA | 2.08 | 0.26 | 0.41 |
| F-TDNN | 512 | 12.4 | Softmax | PLDA | 1.89 | 0.21 | 0.29 |
| [D-TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 2.8 | Softmax | Cosine | 1.81 | 0.20 | 0.28 |
| D-TDNN-SS (0) | 512 | 3.0 | Softmax | Cosine | 1.55 | 0.20 | 0.30 |
| D-TDNN-SS | 512 | 3.5 | Softmax | Cosine | 1.41 | 0.19 | 0.24 |
| D-TDNN-SS | 128 | 3.1 | AAM-Softmax | Cosine | 1.22 | 0.13 | 0.20 |

## Citation

If you find that D-TDNN helps your research, please cite:
```
@inproceedings{DBLP:conf/interspeech/YuL20,
author = {Ya-Qi Yu and
Wu-Jun Li},
title = {Densely Connected Time Delay Neural Network for Speaker Verification},
booktitle = {Annual Conference of the International Speech Communication Association (INTERSPEECH)},
year = {2020}
}
```
32 changes: 32 additions & 0 deletions data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os

import kaldiio
from torch.utils.data import Dataset


class KaldiFeatDataset(Dataset):
    """Dataset over the entries listed in a Kaldi ``feats.scp`` file.

    Each item is a ``(feats, utt)`` pair: ``feats`` is the value returned by
    ``kaldiio.load_mat`` for the entry (passed through ``transform`` when one
    is given) and ``utt`` is the utterance id taken from the first column.
    """

    def __init__(self, root, transform=None):
        """Index ``<root>/feats.scp`` without loading any features.

        Args:
            root: Directory that contains ``feats.scp``.
            transform: Optional callable applied to every loaded matrix.

        Raises:
            ValueError: If a non-blank line does not have two fields.
        """
        super(KaldiFeatDataset, self).__init__()
        self.transform = transform
        self.feats = []
        with open(os.path.join(root, 'feats.scp'), 'r') as f:
            for line in f:
                # Split on the first whitespace run only: the specifier in
                # the second column may itself contain spaces, and the
                # surrounding strip() drops the trailing newline.
                fields = line.strip().split(None, 1)
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        "Malformed feats.scp line: '{}'".format(line.rstrip('\n')))
                utt, feats = fields
                self.feats.append((feats, utt))

    def __len__(self):
        """Return the number of indexed utterances."""
        return len(self.feats)

    def __getitem__(self, index):
        """Load the features of entry ``index`` and return ``(feats, utt)``."""
        feats, utt = self.feats[index]
        feats = kaldiio.load_mat(feats)
        if self.transform is not None:
            feats = self.transform(feats)
        return feats, utt


class Transpose2D(object):
    """Callable transform that swaps the two axes of its argument."""

    def __call__(self, a):
        """Return ``a`` with its axes reordered to ``(1, 0)``."""
        axes = (1, 0)
        return a.transpose(axes)
Binary file added figure/D_TDNN.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
72 changes: 72 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import argparse
import os

import numpy as np
import torch
from numpy import linalg
from torch.utils.data import DataLoader
from tqdm import tqdm

from data import KaldiFeatDataset, Transpose2D
from metric import compute_fnr_fpr, compute_eer, compute_c_norm
from model.tdnn import TDNN
from model.dtdnn import DTDNN

def _str2bool(value):
    """Convert a command-line string into a bool.

    ``type=bool`` is a trap with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy. This converter understands the
    usual spellings instead. The empty string still maps to ``False``, which
    was the only way to disable the flag before.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got '{}'".format(value))


# Command-line interface of the evaluation script.
parser = argparse.ArgumentParser(description='Speaker Verification')
parser.add_argument('--root', default='data', type=str)
parser.add_argument('--model', default='D-TDNN', choices=['TDNN', 'D-TDNN'])
parser.add_argument('--checkpoint', default=None, type=str)
parser.add_argument('--device', default="cpu", choices=['cpu', 'cuda'])
parser.add_argument('--pin-memory', default=True, type=_str2bool)


def load_model():
    """Build the network selected by ``args.model`` and load its weights.

    Reads the module-level ``args`` and ``device`` globals.

    Returns:
        The model, moved to ``device``, with the checkpoint's ``state_dict``
        entry loaded into it.

    Raises:
        AssertionError: If ``args.checkpoint`` is unset or is not a file.
    """
    checkpoint = args.checkpoint
    # The explicit None check keeps a missing --checkpoint from surfacing as
    # an opaque TypeError inside os.path.isfile.
    assert checkpoint is not None and os.path.isfile(checkpoint), \
        "No checkpoint found at '{}'".format(checkpoint)
    print('Loading checkpoint {}'.format(checkpoint))
    # map_location remaps tensors that were saved on another device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    state_dict = torch.load(checkpoint, map_location=device)['state_dict']
    if args.model == 'TDNN':
        model = TDNN()
    else:
        model = DTDNN()
    model.to(device)
    model.load_state_dict(state_dict)
    return model


def test():
    """Score the trial list under ``args.root`` and print EER and min-DCF.

    Embeddings are extracted one utterance at a time from the features listed
    in ``<root>/feats.scp``. Every pair in ``<root>/trials`` is then scored
    with the cosine similarity of its two embeddings. A trial is counted as a
    target trial when its third field is exactly ``target``.

    Reads the module-level ``args`` and ``device`` globals.
    """
    model = load_model()
    model.eval()

    transform = Transpose2D()
    dataset = KaldiFeatDataset(root=args.root, transform=transform)
    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=args.pin_memory)

    utt2emb = {}
    # Inference only: disable autograd once for the whole extraction loop.
    with torch.no_grad():
        for data, utt in tqdm(loader):
            data = data.to(device)
            emb = model(data)
            utt2emb[utt[0]] = emb[0].cpu().numpy()

    scores = []
    labels = []
    with open(os.path.join(args.root, 'trials'), 'r') as f:
        for line in f:
            # split() without arguments tolerates repeated whitespace and the
            # trailing newline; blank lines are skipped rather than crashing.
            fields = line.split()
            if not fields:
                continue
            utt1, utt2, label = fields
            emb1, emb2 = utt2emb[utt1], utt2emb[utt2]
            score = emb1.dot(emb2) / (linalg.norm(emb1) * linalg.norm(emb2))
            scores.append(score)
            labels.append(1 if label == 'target' else 0)

    scores = np.array(scores)
    labels = np.array(labels)
    fnr, fpr = compute_fnr_fpr(scores, labels)
    eer, th = compute_eer(fnr, fpr, True, scores)
    print('Equal error rate is {:6f}%, at threshold {:6f}'.format(eer * 100, th))
    print('Minimum detection cost (0.01) is {:6f}'.format(compute_c_norm(fnr, fpr, 0.01)))
    print('Minimum detection cost (0.001) is {:6f}'.format(compute_c_norm(fnr, fpr, 0.001)))


if __name__ == '__main__':
    # load_model() and test() read `args` and `device` as module-level
    # globals, so both names must be bound here before test() is called.
    args = parser.parse_args()
    device = torch.device(args.device)
    test()
46 changes: 46 additions & 0 deletions metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import numpy as np


def compute_fnr_fpr(scores, labels):
    """Compute the false negative and false positive rate curves.

    Trials are ordered by ascending score. Entry ``i`` of each returned array
    is built from the cumulative counts over the ``i + 1`` lowest-scoring
    trials: the fraction of target trials among them (FNR), and one minus the
    fraction of non-target trials among them (FPR).

    Args:
        scores: Array of trial scores.
        labels: Array of trial labels, 1 for target and 0 for non-target.

    Returns:
        Tuple ``(fnr, fpr)`` of arrays with one entry per trial.
    """
    order = np.argsort(scores)
    ranked = labels[order]

    is_target = (ranked == 1).astype('f8')
    is_nontarget = (ranked == 0).astype('f8')

    miss_rate = np.cumsum(is_target) / np.sum(is_target)
    false_alarm_rate = 1 - np.cumsum(is_nontarget) / np.sum(is_nontarget)
    return miss_rate, false_alarm_rate


def compute_eer(fnr, fpr, requires_threshold=False, scores=None):
    """Compute the equal error rate (EER) from FNR and FPR curves.

    *kaldi style*: the EER is read from the FNR curve one step before the
    first operating point at which FNR is no longer below FPR.

    Args:
        fnr: Array of false negative rates, ordered by ascending score.
        fpr: Array of false positive rates, aligned with ``fnr``.
        requires_threshold: When true, also return the score at the crossing.
        scores: Trial scores; required when ``requires_threshold`` is true.

    Returns:
        ``eer``, or ``(eer, threshold)`` when ``requires_threshold`` is true.
    """
    diff_miss_fa = fnr - fpr
    x = np.flatnonzero(diff_miss_fa >= 0)[0]
    # Clamp at 0: when the curves already cross at the first operating point,
    # x - 1 would be -1 and wrap around to the last FNR value.
    eer = fnr[max(x - 1, 0)]
    if requires_threshold:
        assert scores is not None
        scores = np.sort(scores)
        th = scores[x]
        return eer, th
    return eer


def compute_c_norm(fnr, fpr, p_target, c_miss=1, c_fa=1):
    """Compute the normalized minimum detection cost function (DCF).

    Args:
        fnr: Array of false negative rates over the operating points.
        fpr: Array of false positive rates, aligned with ``fnr``.
        p_target: A priori probability of a target trial.
        c_miss: Cost of a miss (false reject).
        c_fa: Cost of a false alarm (false accept).

    Returns:
        The smallest detection cost over all operating points, divided by the
        smaller of ``c_miss * p_target`` and ``c_fa * (1 - p_target)``.
    """
    miss_term = c_miss * fnr * p_target
    false_alarm_term = c_fa * fpr * (1 - p_target)
    best_cost = np.min(miss_term + false_alarm_term)

    default_cost = min(c_miss * p_target, c_fa * (1 - p_target))
    return best_cost/default_cost
40 changes: 40 additions & 0 deletions model/dtdnn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from collections import OrderedDict

from torch import nn

from .layers import TDNNLayer, DenseTDNNBlock, TransitLayer, DenseLayer, StatsPool


class DTDNN(nn.Module):
    """Densely connected TDNN assembled as one ``nn.Sequential`` (``xvector``).

    The stack is: an input ``TDNNLayer``, two ``DenseTDNNBlock`` stages each
    followed by a ``TransitLayer`` that halves the channel count, a
    ``StatsPool`` layer, and a final ``DenseLayer`` producing the embedding.
    """

    def __init__(self, feat_dim=30, embedding_size=512,
                 growth_rate=64, bn_size=2, init_channels=128,
                 config_str='batchnorm-relu'):
        """Build the layer stack.

        Args:
            feat_dim: Input channel count of the first ``TDNNLayer``.
            embedding_size: Output size of the final ``DenseLayer``.
            growth_rate: ``out_channels`` of every dense block; each block
                adds ``num_layers * growth_rate`` channels.
            bn_size: Multiplier applied to ``growth_rate`` to obtain the
                blocks' ``bn_channels``.
            init_channels: Output channel count of the first ``TDNNLayer``.
            config_str: Configuration string forwarded to the layers.
        """
        super(DTDNN, self).__init__()

        # Submodule names are part of the checkpoint's state_dict keys, so
        # they are kept exactly: tdnn, block1, transit1, block2, transit2,
        # stats, dense.
        self.xvector = nn.Sequential()
        self.xvector.add_module(
            'tdnn',
            TDNNLayer(feat_dim, init_channels, 5, dilation=1, padding=-1,
                      config_str=config_str))

        # One (num_layers, kernel_size, dilation) triple per dense block.
        stage_specs = ((6, 3, 1), (12, 3, 3))
        channels = init_channels
        for stage, (num_layers, kernel_size, dilation) in enumerate(stage_specs, 1):
            dense_block = DenseTDNNBlock(
                num_layers=num_layers,
                in_channels=channels,
                out_channels=growth_rate,
                bn_channels=bn_size * growth_rate,
                kernel_size=kernel_size,
                dilation=dilation,
                config_str=config_str
            )
            self.xvector.add_module('block%d' % stage, dense_block)

            # The transit layer halves the block's widened output.
            widened = channels + num_layers * growth_rate
            halved = widened // 2
            transit = TransitLayer(widened, halved, bias=False,
                                   config_str=config_str)
            self.xvector.add_module('transit%d' % stage, transit)
            channels = halved

        self.xvector.add_module('stats', StatsPool())
        # The dense layer takes twice the channel count left after pooling.
        self.xvector.add_module(
            'dense', DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.xvector(x)
Loading

0 comments on commit 0944f75

Please sign in to comment.