-
Notifications
You must be signed in to change notification settings - Fork 14
/
mix.py
101 lines (67 loc) · 3.04 KB
/
mix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pandas as pd
import random as rnd
import torch
class MIX:
    """Synthesizes reverberant multi-source stereo mixtures for augmentation.

    Each sample picks one room impulse response (RIR), a random number of
    speech sources, and per-source SNR / time-shift parameters, then
    convolves, power-normalizes, mixes, and rescales the result to a random
    peak volume.

    Parameters:
        rir            Room impulse response dataset (e.g. BIRD). Indexing must
                       return (hs, meta) where hs is a tensor stacking two
                       microphone impulse responses per source (2 rows each).
        speech         Speech dataset (e.g. LibriSpeech). Indexing must return
                       a tuple whose first element is a waveform tensor.
        snr            SNR range in dB: (min, max).
        volume         Peak-volume range (linear): (min, max).
        count          Active-source count range: (min, max), inclusive.
        duration       Segment duration (in samples).
        samples_count  Number of samples in the dataset.
    """

    # NOTE: defaults are tuples (not lists) to avoid the shared-mutable-default
    # pitfall; the values themselves are unchanged.
    def __init__(self, rir, speech, snr=(-5, +5), volume=(0.01, 0.99), count=(1, 4), duration=16000, samples_count=10000):
        self.rir = rir
        self.speech = speech
        # Maximum simultaneous sources. Previously hard-coded to 4, which
        # silently broke any count[1] > 4; with the default count the random
        # streams below are identical to the original implementation.
        max_sources = count[1]
        # Fixed seed: the augmented dataset is fully reproducible.
        np.random.seed(0)
        self.rirs = np.random.randint(0, len(rir), samples_count)
        self.speeches = np.random.randint(0, len(speech), (samples_count, max_sources))
        self.counts = np.random.randint(count[0], count[1] + 1, samples_count)
        self.snrs = np.random.uniform(snr[0], snr[1], (samples_count, max_sources))
        self.shifts = np.random.uniform(0.0, 1.0, (samples_count, max_sources))
        self.volumes = np.random.uniform(volume[0], volume[1], samples_count)
        self.duration = duration

    def __len__(self):
        """Return the number of samples."""
        return len(self.rirs)

    @staticmethod
    def _fit(y, duration):
        """Truncate (or zero-pad) a 1-D signal to exactly `duration` samples."""
        y = y[0:duration]
        if y.shape[0] < duration:
            # Previously a too-short convolution result crashed the later
            # fixed-size assignment; pad with zeros instead.
            y = np.pad(y, (0, duration - y.shape[0]))
        return y

    def __getitem__(self, idx):
        """Return the mixed segment for sample `idx`.

        Returns:
            ys     Mixed stereo segment, shape (duration, 2).
            meta   Metadata forwarded unchanged from the RIR dataset.
            count  Number of active sources in this sample.
            xs     Per-source stereo segments, shape (count, duration, 2).
        """
        hs, meta = self.rir[self.rirs[idx]]
        count = self.counts[idx]
        ys = np.zeros((self.duration, 2), dtype=np.float32)
        xs = np.zeros((count, self.duration, 2), dtype=np.float32)
        # For each active source
        for i in range(count):
            # Two-microphone impulse response pair for source i.
            h1 = hs[i * 2 + 0, :].numpy()
            h2 = hs[i * 2 + 1, :].numpy()
            x = np.squeeze(self.speech[self.speeches[idx, i]][0].numpy())
            # Reverberate: convolve the dry source with each microphone's RIR.
            y1 = np.convolve(x, h1)
            y2 = np.convolve(x, h2)
            # Circularly shift by a random fraction of the signal length,
            # then fit to the requested duration (zero-padded if too short).
            y1 = self._fit(np.roll(y1, int(self.shifts[idx, i] * y1.shape[0])), self.duration)
            y2 = self._fit(np.roll(y2, int(self.shifts[idx, i] * y2.shape[0])), self.duration)
            # Normalize so the average power across the two channels is 1.
            E = 0.5 * (np.sum(y1 ** 2) + np.sum(y2 ** 2))
            y1 /= (E ** 0.5 + 1E-10)
            y2 /= (E ** 0.5 + 1E-10)
            # Per-source gain from the sampled SNR.
            # NOTE(review): 10**(snr/10) applied to *amplitude* yields twice
            # the nominal dB (the textbook amplitude gain is 10**(snr/20));
            # kept as-is to preserve the original behavior — confirm intent.
            g = 10 ** (self.snrs[idx, i] / 10.0)
            xs[i, :, 0] = g * y1
            xs[i, :, 1] = g * y2
            ys[:, 0] += g * y1
            ys[:, 1] += g * y2
        # Rescale so the mixture's peak amplitude equals the sampled volume.
        v = self.volumes[idx] / (np.max(np.abs(ys)) + 1e-10)
        xs *= v
        ys *= v
        return ys, meta, count, xs