Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get DeepFormants working again #9

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ Download the code. The code is based on signal processing package in Python call
Dependencies:
Run these lines in a terminal to install everything necessary for feature extraction.
```
sudo apt-get install python-numpy python-scipy python-nose python-pip

sudo pip install scikits.talkbox
sudo apt-get install python3-numpy python3-scipy python3-nose
```
Next for the installation of Torch for loading the models run this.
```
Expand All @@ -37,30 +35,31 @@ cd ~/torch; bash install-deps;
./install.sh
```
```
luarocks install rnn
git clone https://github.com/Element-Research/rnn.git old-rnn
cd old-rnn; luarocks make rocks/rnn-scm-1.rockspec
```
The Estimation model can be downloaded here and because of size constraints the Tracking model can be abtained by download from this link
[tracking_model.mat] (https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc)
The Estimation model can be downloaded here, and because of size constraints the Tracking model can be obtained by downloading it from this link:
[tracking_model.mat](https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc)

## How to use:

For vowel formant estimation, call the main script in a terminal with the following inputs: wav file, formant output filename, and the vowel begin and end times:

```
python formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
python3 formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3
```

or the vowel begin and end times can be taken from a TextGrid file (here the name of the TextGrid is Example.TextGrid and the vowel is taken from a tier called "VOWEL"):

```
python formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
python3 formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \
--textgrid_tier VOWEL
```

For formant tracking, just call the script with the wav file and output filename:

```
python formants.py data/Example.wav data/ExamplePredictions.csv
python3 formants.py data/Example.wav data/ExamplePredictions.csv
```


Expand Down
32 changes: 17 additions & 15 deletions extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import math
from scipy.fftpack.realtransforms import dct
from scipy.signal import lfilter, hamming
from copy import deepcopy
from scipy.fftpack import fft, ifft
from scikits.talkbox.linpred import lpc
#from scikits.talkbox.linpred import lpc # obsolete
from helpers.conch_lpc import lpc
import shutil
from helpers.utilities import *

Expand Down Expand Up @@ -88,9 +88,9 @@ def periodogram(x, nfft=None, fs=1):

pxx = np.abs(fft(x, nfft)) ** 2
if nfft % 2 == 0:
pn = nfft / 2 + 1
pn = nfft // 2 + 1
else:
pn = (nfft + 1 )/ 2
pn = (nfft + 1) // 2

fgrid = np.linspace(0, fs * 0.5, pn)
return pxx[:pn] / (n * fs), fgrid
Expand Down Expand Up @@ -137,9 +137,9 @@ def arspec(x, order, nfft=None, fs=1):

# This is not enough to deal correctly with even/odd size
if nfft % 2 == 0:
pn = nfft / 2 + 1
pn = nfft // 2 + 1
else:
pn = (nfft + 1 )/ 2
pn = (nfft + 1) // 2

px = 1 / np.fft.fft(a, nfft)[:pn]
pxx = np.real(np.conj(px) * px)
Expand Down Expand Up @@ -200,7 +200,6 @@ def preemp(input, p):


def arspecs(input_wav,order,Atal=False):
epsilon = 0.0000000001
data = input_wav
if Atal:
ar = atal(data, order, 30)
Expand All @@ -211,8 +210,10 @@ def arspecs(input_wav,order,Atal=False):
for k, l in zip(ars[0], ars[1]):
ar.append(math.log(math.sqrt((k**2)+(l**2))))
for val in range(0,len(ar)):
if ar[val] == 0.0:
ar[val] = deepcopy(epsilon)
if ar[val] < 0.0:
ar[val] = np.nan
elif ar[val] == 0.0:
ar[val] = epsilon
mspec1 = np.log10(ar)
# Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
ar = dct(mspec1, type=2, norm='ortho', axis=-1)
Expand All @@ -221,10 +222,10 @@ def arspecs(input_wav,order,Atal=False):

def specPS(input_wav,pitch):
N = len(input_wav)
samps = N/pitch
samps = N // pitch
if samps == 0:
samps = 1
frames = N/samps
frames = N // samps
data = input_wav[0:frames]
specs = periodogram(data,nfft=4096)
for i in range(1,int(samps)):
Expand All @@ -236,10 +237,11 @@ def specPS(input_wav,pitch):
specs[0][s] /= float(samps)
peri = []
for k, l in zip(specs[0], specs[1]):
if k == 0 and l == 0:
peri.append(epsilon)
else:
peri.append(math.log(math.sqrt((k ** 2) + (l ** 2))))
m = math.sqrt((k ** 2) + (l ** 2))
if m > 0: m = math.log(m)
if m == 0: m = epsilon
elif m < 0: m = np.nan
peri.append(m)
# Filter the spectrum through the triangle filterbank
mspec = np.log10(peri)
# Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
Expand Down
8 changes: 4 additions & 4 deletions formants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@

def predict_from_times(wav_filename, preds_filename, begin, end):
tmp_features_filename = tempfile._get_default_tempdir() + "/" + next(tempfile._get_candidate_names()) + ".txt"
print tmp_features_filename
print(tmp_features_filename)

if begin > 0.0 or end > 0.0:
features.create_features(wav_filename, tmp_features_filename, begin, end)
easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename)
easy_call("luajit load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename)
else:
features.create_features(wav_filename, tmp_features_filename)
easy_call("th load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename)
easy_call("luajit load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename)


def predict_from_textgrid(wav_filename, preds_filename, textgrid_filename, textgrid_tier):

print wav_filename
print(wav_filename)

if os.path.exists(preds_filename):
os.remove(preds_filename)
Expand Down
4 changes: 2 additions & 2 deletions formants.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ if [ $# -eq 2 ]
then
tempfile=`mktemp -t txt`
python extract_features.py $1 $tempfile
th load_estimation_model.lua $tempfile $2
luajit load_estimation_model.lua $tempfile $2
elif [ $# -eq 4 ]
then
tempfile=`mktemp -t txt`
python extract_features.py $1 $tempfile --begin $3 --end $4
th load_estimation_model.lua $tempfile $2
luajit load_estimation_model.lua $tempfile $2
else
echo "$0 wav_filename pred_csv_filename [begin_time end_time]"
fi
Loading