diff --git a/pliers/converters/video.py b/pliers/converters/video.py
index cf3fa878..e4ab8cdf 100644
--- a/pliers/converters/video.py
+++ b/pliers/converters/video.py
@@ -14,4 +14,7 @@ class VideoToAudioConverter(Converter):
     VERSION = '1.0'
 
     def _convert(self, video):
-        return AudioStim(clip=video.clip.audio, onset=video.onset)
+        fps = AudioStim.get_sampling_rate(video.filename)
+        return AudioStim(sampling_rate=fps,
+                         clip=video.clip.audio,
+                         onset=video.onset)
diff --git a/pliers/stimuli/audio.py b/pliers/stimuli/audio.py
index 9aa8ee6b..89f7d650 100644
--- a/pliers/stimuli/audio.py
+++ b/pliers/stimuli/audio.py
@@ -3,6 +3,10 @@
 from .base import Stim
 from moviepy.audio.io.AudioFileClip import AudioFileClip
 
+import os
+import re
+import subprocess
+
 
 class AudioStim(Stim):
 
@@ -18,14 +22,17 @@ class AudioStim(Stim):
 
     _default_file_extension = '.wav'
 
-    def __init__(self, filename=None, onset=None, sampling_rate=44100, url=None, clip=None):
+    def __init__(self, filename=None, onset=None, sampling_rate=None, url=None, clip=None):
         if url is not None:
             filename = url
         self.filename = filename
+
         self.sampling_rate = sampling_rate
-        self.clip = clip
+        if not self.sampling_rate:
+            self.sampling_rate = self.get_sampling_rate(self.filename)
 
-        if self.clip is None:
+        self.clip = clip
+        if not self.clip:
             self._load_clip()
 
         # Small default buffer isn't ideal, but moviepy has persistent issues
@@ -41,6 +48,58 @@ def __init__(self, filename=None, onset=None, sampling_rate=44100, url=None, cli
         super(AudioStim, self).__init__(
             filename, onset=onset, duration=duration)
 
+    @staticmethod
+    def get_sampling_rate(filename):
+        ''' Use FFMPEG to get the sampling rate of a file. Most of this code
+        was adapted from the moviepy codebase.
+
+        Args:
+            filename (str): Path of the audio or video file to probe.
+
+        Returns:
+            int: The rate (in Hz) on the first ' Audio: ' line that FFMPEG
+                prints, or 44100 if filename is None or no rate is found.
+
+        Raises:
+            IOError: If FFMPEG reports that the file does not exist.
+        '''
+        # A stim built from a clip alone has no file to probe
+        if filename is None:
+            return 44100
+
+        cmd = ['ffmpeg', '-i', filename]
+
+        with open(os.devnull, 'rb') as devnull:
+            # 0x08000000 is CREATE_NO_WINDOW: no console window on Windows
+            creationflags = 0x08000000 if os.name == 'nt' else 0
+            p = subprocess.Popen(cmd,
+                                 stdin=devnull,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE,
+                                 creationflags=creationflags)
+
+        # The stream information is read from FFMPEG's stderr
+        _, p_err = p.communicate()
+
+        # Undecodable metadata bytes must not abort the probe
+        lines = p_err.decode('utf8', 'replace').splitlines()
+        # stderr may be empty, so guard the look at its last line
+        if lines and 'No such file or directory' in lines[-1]:
+            raise IOError(('Error: the file %s could not be found.\n'
+                           'Please check that you entered the correct '
+                           'path.') % filename)
+
+        # Only the first audio stream is considered
+        lines_audio = [l for l in lines if ' Audio: ' in l]
+        if lines_audio:
+            # Require at least one digit so that int() cannot fail
+            match = re.search(r' ([0-9]+) Hz', lines_audio[0])
+            if match:
+                return int(match.group(1))
+
+        # Return a sensible default
+        return 44100
+
     def _load_clip(self):
         self.clip = AudioFileClip(self.filename, fps=self.sampling_rate)
 
@@ -54,4 +113,4 @@ def __setstate__(self, d):
         self._load_clip()
 
     def save(self, path):
-        self.clip.write_audiofile(path)
+        self.clip.write_audiofile(path, fps=self.sampling_rate)
diff --git a/pliers/stimuli/video.py b/pliers/stimuli/video.py
index dbeee39e..591dde97 100644
--- a/pliers/stimuli/video.py
+++ b/pliers/stimuli/video.py
@@ -4,6 +4,7 @@
 from math import ceil
 from moviepy.video.io.VideoFileClip import VideoFileClip
 from .base import Stim
+from .audio import AudioStim
 from .image import ImageStim
 
 
@@ -68,7 +69,8 @@ def __init__(self, filename=None, frame_index=None, onset=None, url=None):
                                         duration=duration)
 
     def _load_clip(self):
-        self.clip = VideoFileClip(self.filename)
+        audio_fps = AudioStim.get_sampling_rate(self.filename)
+        self.clip = VideoFileClip(self.filename, audio_fps=audio_fps)
 
     def __iter__(self):
         """ Frame iteration. """
diff --git a/pliers/tests/converters/test_video_converters.py b/pliers/tests/converters/test_video_converters.py
index 16df32e4..5745f6a4 100644
--- a/pliers/tests/converters/test_video_converters.py
+++ b/pliers/tests/converters/test_video_converters.py
@@ -15,4 +15,14 @@ def test_video_to_audio_converter():
     assert audio.history.source_class == 'VideoStim'
     assert audio.history.source_file == filename
     assert audio.onset == 4.2
+    assert audio.sampling_rate == 48000
+    assert np.isclose(video.duration, audio.duration, 1e-2)
+
+    filename = join(VIDEO_DIR, 'obama_speech.mp4')
+    video = VideoStim(filename, onset=1.0)
+    audio = conv.transform(video)
+    assert audio.history.source_class == 'VideoStim'
+    assert audio.history.source_file == filename
+    assert audio.onset == 1.0
+    assert audio.sampling_rate == 24000
     assert np.isclose(video.duration, audio.duration, 1e-2)
diff --git a/pliers/tests/data/video/obama_speech.wav b/pliers/tests/data/video/obama_speech.wav
deleted file mode 100644
index 64d30454..00000000
Binary files a/pliers/tests/data/video/obama_speech.wav and /dev/null differ
diff --git a/pliers/tests/data/video/small.wav b/pliers/tests/data/video/small.wav
deleted file mode 100644
index f64b7293..00000000
Binary files a/pliers/tests/data/video/small.wav and /dev/null differ
diff --git a/pliers/tests/extractors/test_audio_extractors.py b/pliers/tests/extractors/test_audio_extractors.py
index d4ac271d..f1d80623 100644
--- a/pliers/tests/extractors/test_audio_extractors.py
+++ b/pliers/tests/extractors/test_audio_extractors.py
@@ -47,47 +47,47 @@ def test_spectral_extractors():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = SpectralCentroidExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 3)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['spectral_centroid'][0], 817.53095)
+    assert df.shape == (1221, 3)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['spectral_centroid'][0], 1144.98145)
 
     ext2 = SpectralCentroidExtractor(n_fft=1024, hop_length=256)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (9763, 3)
-    assert np.isclose(df['onset'][1], 0.005805)
-    assert np.isclose(df['duration'][0], 0.005805)
-    assert np.isclose(df['spectral_centroid'][0], 1492.00515)
+    assert df.shape == (2441, 3)
+    assert np.isclose(df['onset'][1], 0.02322)
+    assert np.isclose(df['duration'][0], 0.02322)
+    assert np.isclose(df['spectral_centroid'][0], 866.20176)
 
     ext = SpectralBandwidthExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 3)
-    assert np.isclose(df['spectral_bandwidth'][0], 1056.66227)
+    assert df.shape == (1221, 3)
+    assert np.isclose(df['spectral_bandwidth'][0], 1172.96090)
 
-    ext = SpectralContrastExtractor()
+    ext = SpectralContrastExtractor(fmin=100.0)
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 9)
-    assert np.isclose(df['spectral_contrast_band_4'][0], 25.09001)
+    assert df.shape == (1221, 9)
+    assert np.isclose(df['spectral_contrast_band_4'][0], 25.637166)
 
     ext = SpectralRolloffExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 3)
-    assert np.isclose(df['spectral_rolloff'][0], 1550.39063)
+    assert df.shape == (1221, 3)
+    assert np.isclose(df['spectral_rolloff'][0], 2492.46826)
 
 
 def test_polyfeatures_extractor():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = PolyFeaturesExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 4)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['coefficient_0'][0], -7.795e-5)
+    assert df.shape == (1221, 4)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['coefficient_0'][0], -0.00172077)
 
     ext2 = PolyFeaturesExtractor(order=3)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (4882, 6)
-    assert np.isclose(df['coefficient_3'][2], 20.77778)
+    assert df.shape == (1221, 6)
+    assert np.isclose(df['coefficient_3'][2], 12.32108)
 
 
 def test_rmse_extractor():
@@ -95,17 +95,17 @@ def test_rmse_extractor():
                       onset=1.0)
     ext = RMSEExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 3)
-    assert np.isclose(df['onset'][1], 1.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['rmse'][0], 0.226572)
+    assert df.shape == (1221, 3)
+    assert np.isclose(df['onset'][1], 1.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['rmse'][0], 0.25663)
 
     ext2 = RMSEExtractor(frame_length=1024, hop_length=256, center=False)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (9759, 3)
-    assert np.isclose(df['onset'][1], 1.005805)
-    assert np.isclose(df['duration'][0], 0.005805)
-    assert np.isclose(df['rmse'][0], 0.22648)
+    assert df.shape == (2437, 3)
+    assert np.isclose(df['onset'][1], 1.02322)
+    assert np.isclose(df['duration'][0], 0.02322)
+    assert np.isclose(df['rmse'][0], 0.25649)
 
 
 def test_zcr_extractor():
@@ -113,97 +113,97 @@ def test_zcr_extractor():
                       onset=2.0)
     ext = ZeroCrossingRateExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 3)
-    assert np.isclose(df['onset'][1], 2.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['zero_crossing_rate'][0], 0.0234375)
+    assert df.shape == (1221, 3)
+    assert np.isclose(df['onset'][1], 2.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['zero_crossing_rate'][0], 0.069824)
 
     ext2 = ZeroCrossingRateExtractor(frame_length=1024, hop_length=256,
                                      center=False, pad=True)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (9759, 3)
-    assert np.isclose(df['onset'][1], 2.005805)
-    assert np.isclose(df['duration'][0], 0.005805)
-    assert np.isclose(df['zero_crossing_rate'][0], 0.047852)
+    assert df.shape == (2437, 3)
+    assert np.isclose(df['onset'][1], 2.02322)
+    assert np.isclose(df['duration'][0], 0.02322)
+    assert np.isclose(df['zero_crossing_rate'][0], 0.140625)
 
 
 def test_chroma_extractors():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = ChromaSTFTExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 14)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['chroma_2'][0], 0.417595)
+    assert df.shape == (1221, 14)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['chroma_2'][0], 0.53129)
 
     ext2 = ChromaSTFTExtractor(n_chroma=6, n_fft=1024, hop_length=256)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (9763, 8)
-    assert np.isclose(df['onset'][1], 0.005805)
-    assert np.isclose(df['duration'][0], 0.005805)
-    assert np.isclose(df['chroma_5'][0], 0.732480)
+    assert df.shape == (2441, 8)
+    assert np.isclose(df['onset'][1], 0.02322)
+    assert np.isclose(df['duration'][0], 0.02322)
+    assert np.isclose(df['chroma_5'][0], 0.86870)
 
     ext = ChromaCQTExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 14)
-    assert np.isclose(df['chroma_cqt_2'][0], 0.286443)
+    assert df.shape == (1221, 14)
+    assert np.isclose(df['chroma_cqt_2'][0], 0.355324)
 
     ext = ChromaCENSExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 14)
-    assert np.isclose(df['chroma_cens_2'][0], 0.217814)
+    assert df.shape == (1221, 14)
+    assert np.isclose(df['chroma_cens_2'][0], 0.137765)
 
 
 def test_melspectrogram_extractor():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = MelspectrogramExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 130)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['mel_3'][0], 0.553125)
+    assert df.shape == (1221, 130)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['mel_3'][0], 0.82194)
 
     ext2 = MelspectrogramExtractor(n_mels=15)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (4882, 17)
-    assert np.isclose(df['mel_4'][2], 3.24429)
+    assert df.shape == (1221, 17)
+    assert np.isclose(df['mel_4'][2], 7.40387)
 
 
 def test_mfcc_extractor():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = MFCCExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 22)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['mfcc_3'][0], 5.98247)
+    assert df.shape == (1221, 22)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['mfcc_3'][0], 20.84870)
 
     ext2 = MFCCExtractor(n_mfcc=15)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (4882, 17)
-    assert np.isclose(df['mfcc_14'][2], -7.41533)
+    assert df.shape == (1221, 17)
+    assert np.isclose(df['mfcc_14'][2], -22.39406)
 
 
 def test_tonnetz_extractor():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = TonnetzExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 8)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['tonal_centroid_0'][0], -0.0264436)
+    assert df.shape == (1221, 8)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['tonal_centroid_0'][0], -0.031784)
 
 
 def test_tempogram_extractor():
     audio = AudioStim(join(AUDIO_DIR, "barber.wav"))
     ext = TempogramExtractor()
     df = ext.transform(audio).to_df()
-    assert df.shape == (4882, 386)
-    assert np.isclose(df['onset'][1], 0.01161)
-    assert np.isclose(df['duration'][0], 0.01161)
-    assert np.isclose(df['tempo_1'][0], 0.773760)
+    assert df.shape == (1221, 386)
+    assert np.isclose(df['onset'][1], 0.04644)
+    assert np.isclose(df['duration'][0], 0.04644)
+    assert np.isclose(df['tempo_1'][0], 0.75708)
 
     ext2 = TempogramExtractor(win_length=300)
     df = ext2.transform(audio).to_df()
-    assert df.shape == (4882, 302)
-    assert np.isclose(df['tempo_1'][2], 0.756967)
+    assert df.shape == (1221, 302)
+    assert np.isclose(df['tempo_1'][2], 0.74917)
diff --git a/pliers/tests/test_stims.py b/pliers/tests/test_stims.py
index 86d9cb48..027cdaa8 100644
--- a/pliers/tests/test_stims.py
+++ b/pliers/tests/test_stims.py
@@ -97,12 +97,16 @@ def test_video_stim():
     assert f3.data.shape == (240, 320, 3)
 
 
-def test_audio_stim(dummy_iter_extractor):
+def test_audio_stim():
     audio_dir = join(get_test_data_path(), 'audio')
-    stim = AudioStim(join(audio_dir, 'barber.wav'), sampling_rate=11025)
+    stim = AudioStim(join(audio_dir, 'barber.wav'))
     assert round(stim.duration) == 57
     assert stim.sampling_rate == 11025
 
+    stim = AudioStim(join(audio_dir, 'homer.wav'))
+    assert round(stim.duration) == 3
+    assert stim.sampling_rate == 11025
+
 
 def test_audio_formats():
     audio_dir = join(get_test_data_path(), 'audio')