From 9b4a911c93aef82508278fb1eeea6d53efc6d629 Mon Sep 17 00:00:00 2001 From: john culnan Date: Wed, 6 Jan 2021 11:29:37 -0700 Subject: [PATCH 1/3] started code to batch-transcribe current datasets with pocketsphinx --- .../transcribe_datasets/__init__.py | 0 .../transcribe_datasets_sphinx.py | 41 +++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 train_and_test_models/transcribe_datasets/__init__.py create mode 100644 train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py diff --git a/train_and_test_models/transcribe_datasets/__init__.py b/train_and_test_models/transcribe_datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py new file mode 100644 index 00000000..a65b6802 --- /dev/null +++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py @@ -0,0 +1,41 @@ +# transcribe all datasets using sphinx with speech_recognition/sphinx_sr.py +# assumes data files are WAV formatted and organized in directories + +from speech_recognition import sphinx_sr +import os + + +class DatasetTranscriber: + """ + Transcribes wav files for datasets using pocketsphinx + currently written for MELD, MUStARD, and ChaLearn + param dataset should be one of these three + """ + def __init__(self, dataset, location, extensions=None): + self.dataset = dataset.lower() # options: 'meld', 'mustard', 'chalearn' + self.location = location + # get list of extensions + if type(self.extensions) is not str: + self.extensions = extensions + else: + self.extensions = [] + self.extensions.append(extensions) + + def read_in_current_files(self, current_file_location): + pass + + def transcribe(self, save_location): + """ + transcribe all available files in the specified location + """ + if self.extensions is not None: + for ext in self.extensions: + # get the location of each dir with files + location = f"{self.location}/{ext}" + # find wav files + for wavfile in os.listdir(location): + if wavfile.endswith(".wav"): + # transcribe wav files + full_path = os.path.join(location, wavfile) + transcription = sphinx_sr.transcribe_file(full_path) + # save transcribed wav files to new files \ No newline at end of file From 3be6b917926d1c73a558b9a2353d3133a33d42d1 Mon Sep 17 00:00:00 2001 From: john culnan Date: Fri, 8 Jan 2021 09:15:49 -0700 Subject: [PATCH 2/3] updated code to transcribe MELD, MUStARD, ChaLearn with PocketSphinx --- .../README.md | 0 speech_recognizers/__init__.py | 0 .../google_sr.py | 0 .../sphinx_sr.py | 8 +- .../transcribe_datasets_sphinx.py | 240 +++++++++++++++++- 5 files changed, 237 insertions(+), 11 deletions(-) rename {speech_recognition => speech_recognizers}/README.md (100%) create mode 100644 speech_recognizers/__init__.py rename {speech_recognition => speech_recognizers}/google_sr.py (100%) rename {speech_recognition => speech_recognizers}/sphinx_sr.py (70%) diff --git a/speech_recognition/README.md b/speech_recognizers/README.md similarity index 100% rename from speech_recognition/README.md rename to speech_recognizers/README.md diff --git a/speech_recognizers/__init__.py b/speech_recognizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/speech_recognition/google_sr.py b/speech_recognizers/google_sr.py similarity index 100% rename from speech_recognition/google_sr.py rename to speech_recognizers/google_sr.py diff --git a/speech_recognition/sphinx_sr.py b/speech_recognizers/sphinx_sr.py similarity index 70% rename from speech_recognition/sphinx_sr.py rename to speech_recognizers/sphinx_sr.py index 26746dda..b696c88b 100644 --- a/speech_recognition/sphinx_sr.py +++ b/speech_recognizers/sphinx_sr.py @@ -1,8 +1,6 @@ import speech_recognition as sr import sys -input_file = sys.argv[1] - def transcribe_file(input_file): @@ -11,11 +9,15 @@ def transcribe_file(input_file): with stim as source: audio = r.record(source) - transcription = r.recognize_sphinx(audio) + try: + transcription = r.recognize_sphinx(audio) + except: + transcription = None return transcription if __name__ == "__main__": + input_file = sys.argv[1] result = transcribe_file(input_file) print(result) \ No newline at end of file diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py index a65b6802..5ff2727e 100644 --- a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py +++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py @@ -1,41 +1,265 @@ -# transcribe all datasets using sphinx with speech_recognition/sphinx_sr.py +# transcribe all datasets using sphinx with speech_recognizers/sphinx_sr.py # assumes data files are WAV formatted and organized in directories -from speech_recognition import sphinx_sr +from speech_recognizers import sphinx_sr + import os +import pandas as pd +import warnings +import sys class DatasetTranscriber: """ Transcribes wav files for datasets using pocketsphinx currently written for MELD, MUStARD, and ChaLearn - param dataset should be one of these three + dataset : a string of name of dataset + location : full path to dataset directory + extensions : directory extensions needed to access wav files """ def __init__(self, dataset, location, extensions=None): self.dataset = dataset.lower() # options: 'meld', 'mustard', 'chalearn' self.location = location # get list of extensions - if type(self.extensions) is not str: + if type(extensions) is not str: self.extensions = extensions else: self.extensions = [] self.extensions.append(extensions) + self.save_location = f"{location}/{self.dataset}_transcribed" + + def convert_and_save_transcriptions(self, data_dict, transcript_dict): + """ + takes a dictionary of current gold files with text + and a dictionary of new transcriptions + and replaces gold with new transcriptions + saves updated files in new location + """ + # find dataset type + if self.dataset == "meld": + pass + elif self.dataset == "mustard": + + pass + elif self.dataset == "chalearn": + pass def read_in_current_files(self, current_file_location): - pass + """ + read in files containing gold transcriptions + files are in different formats depending upon the dataset + """ + # create dict for label : utt/other-info pairs + data_dict = {} + if self.dataset == "meld": + # all utterances in a single csv file + all_utts = pd.read_csv(current_file_location) + elif self.dataset == "mustard" or self.dataset == "chalearn": + # all utterances are in a single tsv file + all_utts = pd.read_csv(current_file_location, sep='\t') + + return all_utts + # + # all_utts.to_dict('records', into=data_dict) + # + # return data_dict - def transcribe(self, save_location): + # def save_transcriptions(self, transcriptions, save_name): + # """ + # Saves the transcriptions as they come out + # Saved as a tsv file with name save_name + # To location self.location + # Does not consider dataset structure + # """ + # sname = "" + # if save_name.endswith(".tsv"): + # sname = save_name + # elif save_name.endswith(".csv"): + # sname = save_name.split(".csv")[0] + # else: + # sname = f"{save_name}.tsv" + # # convert transcriptions if in dict + # if type(transcriptions) == dict: + # transcriptions = [(k, v) for k, v in transcriptions.items()] + # # save transcriptions + # if type(transcriptions) == pd.DataFrame: + # transcriptions.to_csv(f"{self.location}/{sname}", index=False, + # sep="\t") + # elif type(transcriptions) == list: + # with open(f"{self.location}/{sname}", 'w') as wfile: + # wfile.write("id\tutterance\n") + # for item in transcriptions: + # wfile.write(f"{item[0]}\t{item[1]}\n") + + def save_transcriptions(self, transcriptions_dict, current_files, save_name): + """ + Save transcriptions alongside other info currently + required for each dataset + Saves to location self.location + """ + sname = "" + if save_name.endswith(".tsv"): + sname = save_name + elif save_name.endswith(".csv"): + sname = save_name.split(".csv")[0] + else: + sname = f"{save_name}.tsv" + + # convert transcriptions dict to pandas df + transcriptions_df = pd.DataFrame.from_dict(transcriptions_dict) + # delete utterance from current_files + current_files = current_files.loc[:, ~(current_files.columns.str.lower() == 'utterance')] + + # merge dfs on id + if self.dataset == "meld": + transcriptions_df.rename(columns={'id': 'DiaID_UttID'}, inplace=True) + transcriptions_df = transcriptions_df.merge(current_files, on='DiaID_UttID') + elif self.dataset == "mustard": + transcriptions_df.rename(columns={'id': 'clip_id'}, inplace=True) + print(transcriptions_df.columns.values.tolist()) + print(current_files.columns.values.tolist()) + transcriptions_df = transcriptions_df.merge(current_files, on='clip_id') + elif self.dataset == "chalearn": + transcriptions_df.rename(columns={'id': 'file'}, inplace=True) + transcriptions_df = transcriptions_df.merge(current_files, on='file') + + transcriptions_df.to_csv(f"{self.location}/{sname}", index=False, + sep="\t") + + def transcribe(self): """ transcribe all available files in the specified location """ + # save dict of [name -> [list]] + transcript_dict = {'id': [], 'utterance': []} + if self.extensions is not None: for ext in self.extensions: # get the location of each dir with files location = f"{self.location}/{ext}" # find wav files for wavfile in os.listdir(location): - if wavfile.endswith(".wav"): + ending = ".wav" + if self.dataset == "meld": + ending = "_2.wav" + if wavfile.endswith(ending): + wavname = wavfile.split(ending)[0] + if self.dataset == "chalearn": + wavname = wavname + ".mp4" + print(f"Now transcribing {wavname}") # transcribe wav files full_path = os.path.join(location, wavfile) transcription = sphinx_sr.transcribe_file(full_path) - # save transcribed wav files to new files \ No newline at end of file + print(transcription) + # add wavname, transcription pairs to transcript_dict + transcript_dict['id'].append(wavname) + transcript_dict['utterance'].append(transcription) + # return completed dict + return transcript_dict + + # def transcribe(self): + # """ + # transcribe all available files in the specified location + # """ + # # save wavname -> transcription dict + # transcript_dict = {} + # + # if self.extensions is not None: + # for ext in self.extensions: + # # get the location of each dir with files + # location = f"{self.location}/{ext}" + # # find wav files + # for wavfile in os.listdir(location): + # if wavfile.endswith(".wav"): + # wavname = wavfile.split('.wav')[0] + # print(f"Now transcribing {wavname}") + # # transcribe wav files + # full_path = os.path.join(location, wavfile) + # transcription = sphinx_sr.transcribe_file(full_path) + # print(transcription) + # # add wavname, transcription pairs to transcript_dict + # if wavname not in transcript_dict: + # transcript_dict[wavname] = transcription + # else: + # warnings.warn(f"{wavname} already in transcript_dict. Not replacing transcription") + # + # # return completed dict + # return transcript_dict + + +if __name__ == "__main__": + if sys.argv[1] == "mustard": + # assumes that datasets are in the untracked 'data' directory + mustard_location = "../../data/multimodal_datasets/MUStARD" + mustard_extensions = "utterances_final" + current_file_path = f"{mustard_location}/mustard_utts.tsv" + + mustard_transcriber = DatasetTranscriber("MUStARD", mustard_location, mustard_extensions) + + # get current label file + current_file = mustard_transcriber.read_in_current_files(current_file_path) + print("Current file read") + print(current_file.head(5)) + + # transcribe data + transcripts = mustard_transcriber.transcribe() + + # save transcriptions + mustard_transcriber.save_transcriptions(transcripts, current_file, "mustard_sphinx.tsv") + + elif sys.argv[1] == "meld": + # assumes that datasets are in the untracked 'data' directory + meld_location = "../../data/multimodal_datasets/MELD_formatted" + + meld_train_extensions = "train/train_audio" + meld_dev_extensions = "dev/dev_audio" + meld_test_extensions = "test/test_audio" + + current_train_path = f"{meld_location}/train/train_sent_emo.csv" + current_dev_path = f"{meld_location}/dev/dev_sent_emo.csv" + current_test_path = f"{meld_location}/test/test_sent_emo.csv" + + meld_train_transcriber = DatasetTranscriber("MELD", meld_location, meld_train_extensions) + meld_dev_transcriber = DatasetTranscriber("MELD", meld_location, meld_dev_extensions) + meld_test_transcriber = DatasetTranscriber("MELD", meld_location, meld_test_extensions) + + # get paths + current_train_file = meld_train_transcriber.read_in_current_files(current_train_path) + current_dev_file = meld_dev_transcriber.read_in_current_files(current_dev_path) + current_test_file = meld_test_transcriber.read_in_current_files(current_test_path) + + # transcribe data + train_transcripts = meld_train_transcriber.transcribe() + dev_transcripts = meld_dev_transcriber.transcribe() + test_transcripts = meld_test_transcriber.transcribe() + + # save transcriptions + meld_train_transcriber.save_transcriptions(train_transcripts, current_train_file, "train/meld_sphinx.tsv") + meld_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "dev/meld_sphinx.tsv") + meld_test_transcriber.save_transcriptions(test_transcripts, current_test_file, "test/meld_sphinx.tsv") + + elif sys.argv[1] == "chalearn": + # assumes that datasets are in the untracked 'data' directory + chalearn_location = "../../data/multimodal_datasets/Chalearn" + + chalearn_train_extension = "train/mp4" + chalearn_dev_extension = "val/mp4" + # chalearn_test_extension = "test/mp4 + + current_train_path = f"{chalearn_location}/train/gold_and_utts.tsv" + current_dev_path = f"{chalearn_location}/val/gold_and_utts.tsv" + # current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv" + + chalearn_train_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_train_extension) + current_train_file = chalearn_train_transcriber.read_in_current_files(current_train_path) + train_transcripts = chalearn_train_transcriber.transcribe() + + # save transcriptions + chalearn_train_transcriber.save_transcriptions(train_transcripts, current_train_file, "train/chalearn_sphinx.tsv") + + chalearn_dev_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_dev_extension) + current_dev_file = chalearn_dev_transcriber.read_in_current_files(current_dev_path) + dev_transcripts = chalearn_dev_transcriber.transcribe() + + # save transcriptions + chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv") \ No newline at end of file From 1f549f73edc10e7e78ab948fc2cc689539311734 Mon Sep 17 00:00:00 2001 From: john culnan Date: Fri, 15 Jan 2021 10:37:00 -0700 Subject: [PATCH 3/3] updates to run datasets through sphinx and google speech recognizers --- speech_recognizers/google_sr.py | 29 +++- .../transcribe_datasets_google.py | 131 ++++++++++++++++++ .../transcribe_datasets_sphinx.py | 75 ++-------- 3 files changed, 164 insertions(+), 71 deletions(-) create mode 100644 train_and_test_models/transcribe_datasets/transcribe_datasets_google.py diff --git a/speech_recognizers/google_sr.py b/speech_recognizers/google_sr.py index e0d5ebad..b3ee8db2 100644 --- a/speech_recognizers/google_sr.py +++ b/speech_recognizers/google_sr.py @@ -1,7 +1,6 @@ import sys - -sample_rate = sys.argv[1] -input_file = sys.argv[2] +import os +import json def transcribe_file(speech_file, sample_rate): @@ -19,6 +18,8 @@ def transcribe_file(speech_file, sample_rate): encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=sample_rate, language_code="en-US", + enable_word_time_offsets=True, + # enable_word_confidence=True ) response = client.recognize(config=config, audio=audio) @@ -26,15 +27,29 @@ def transcribe_file(speech_file, sample_rate): # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. + all_results = "" + all_text_confs = [] + for result in response.results: # The first alternative is the most likely one for this portion. - text_conf = "%s\t%s" % (result.alternatives[0].transcript, result.alternatives[0].confidence) + all_results += str(result) + '\n' + all_text_confs.append("%s\t%s" % (result.alternatives[0].transcript, + result.alternatives[0].confidence)) + # print(response) # print(response.results) - return text_conf + return all_results, all_text_confs + # return text_conf if __name__ == "__main__": - result = transcribe_file(input_file, sample_rate) - print(result) + # replace this with your credentials + credentials = "your_credentials_here.json" + os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials + + sample_rate = int(sys.argv[1]) + input_file = sys.argv[2] + + unformatted_results, utt_result = transcribe_file(input_file, sample_rate) + print(utt_result[0]) diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py new file mode 100644 index 00000000..efa7dcb8 --- /dev/null +++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py @@ -0,0 +1,131 @@ +# transcribe datasets using google speech-to-text + +import os +import sys +import wave +from speech_recognizers.google_sr import transcribe_file + + +def transcribe_and_save(dataset, base_location, wav_location, save_unformatted_location, + save_name="google_transcriptions.txt"): + formatted_results = [] + skipped_audio = [] + + for item in os.listdir(wav_location): + if item.endswith('.wav'): + filesize = os.path.getsize(f"{wav_location}/{item}") + if filesize >= 10000000: + skipped_audio.append(item) + else: + with wave.open(f"{wav_location}/{item}", "rb") as wave_file: + frame_rate = wave_file.getframerate() + if dataset.lower() == "mustard": + item_name = item.split(".wav")[0] + elif dataset.lower() == "meld": + item_name = item.split("_2.wav")[0] + elif dataset.lower() == "chalearn": + item_name = item.split(".wav")[0] + ".mp4" + unformatted_results, utt_results = transcribe_file(f"{wav_location}/{item}", frame_rate) + with open(f"{save_unformatted_location}/{item_name}_unformatted.txt", 'w') as unformatted_file: + unformatted_file.write(unformatted_results) + for utt in utt_results: + line = f"{item}\t{utt}" + formatted_results.append(line) + + with open(f"{base_location}/{save_name}", 'w') as gfile: + gfile.write("\n".join(formatted_results)) + + with open(f"{base_location}/skipped_files.txt", 'w') as skipped: + skipped.write("\n".join(skipped_audio)) + + +if __name__ == "__main__": + # replace with your credentials + credentials = "your_credentials_here.json" + os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials + + # sample_rate = 44100 + + if sys.argv[1] == "mustard": + # access the data + mustard_base_location = "/Users/jculnan/datasets/multimodal_datasets/MUStARD" + mustard_wav_location = os.path.join(mustard_base_location, "wav") + # save unformatted objsects + mustard_save_unformatted_location = os.path.join(mustard_base_location, "google-unformatted") + # create this directory if it doesn't exist + os.system(f'if [ ! -d "{mustard_save_unformatted_location}" ]; then mkdir -p {mustard_save_unformatted_location}; fi') + + formatted_results = [] + + for item in os.listdir(mustard_wav_location): + if item.endswith('.wav'): + with wave.open(f"{mustard_wav_location}/{item}", "rb") as wave_file: + frame_rate = wave_file.getframerate() + item_name = item.split(".wav")[0] + unformatted_results, utt_results = transcribe_file(f"{mustard_wav_location}/{item}", frame_rate) + with open(f"{mustard_save_unformatted_location}/{item_name}_unformatted.txt", 'w') as unformatted_file: + unformatted_file.write(unformatted_results) + for utt in utt_results: + line = f"{item}\t{utt}" + formatted_results.append(line) + + with open(f"{mustard_base_location}/google_transcriptions.txt", 'w') as gfile: + gfile.write("\n".join(formatted_results)) + + elif sys.argv[1] == "meld": + # access the data + meld_train_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/train" + meld_dev_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/dev" + meld_test_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/test" + # todo: change this once you've tested it + meld_train_wav_location = os.path.join(meld_train_base_location, "train_audio_mono") + meld_dev_wav_location = os.path.join(meld_dev_base_location, "dev_audio_mono") + meld_test_wav_location = os.path.join(meld_test_base_location, "test_audio_mono") + # save unformatted objects + meld_train_save_unformatted_location = os.path.join(meld_train_base_location, "google-unformatted-test") + meld_dev_save_unformatted_location = os.path.join(meld_dev_base_location, "google-unformatted") + meld_test_save_unformatted_location = os.path.join(meld_test_base_location, "google-unformatted") + # create this directory if it doesn't exist + os.system(f'if [ ! -d "{meld_train_save_unformatted_location}" ]; then mkdir -p {meld_train_save_unformatted_location}; fi') + os.system( + f'if [ ! -d "{meld_dev_save_unformatted_location}" ]; then mkdir -p {meld_dev_save_unformatted_location}; fi') + os.system( + f'if [ ! -d "{meld_test_save_unformatted_location}" ]; then mkdir -p {meld_test_save_unformatted_location}; fi') + + transcribe_and_save("meld", meld_train_base_location, meld_train_wav_location, + meld_train_save_unformatted_location, save_name="dia644_utt4_retest.txt") + + transcribe_and_save("meld", meld_dev_base_location, meld_dev_wav_location, + meld_dev_save_unformatted_location) + + transcribe_and_save("meld", meld_test_base_location, meld_test_wav_location, + meld_test_save_unformatted_location) + + elif sys.argv[1] == "chalearn": + # access the data + chalearn_train_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/train" + chalearn_dev_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/val" + chalearn_test_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/test" + # todo: change this once you've tested it + chalearn_train_wav_location = os.path.join(chalearn_train_base_location, "wav") + chalearn_dev_wav_location = os.path.join(chalearn_dev_base_location, "wav") + chalearn_test_wav_location = os.path.join(chalearn_test_base_location, "wav") + # save unformatted objsects + chalearn_train_save_unformatted_location = os.path.join(chalearn_train_base_location, "google-unformatted") + chalearn_dev_save_unformatted_location = os.path.join(chalearn_dev_base_location, "google-unformatted") + chalearn_test_save_unformatted_location = os.path.join(chalearn_test_base_location, "google-unformatted") + # create this directory if it doesn't exist + os.system(f'if [ ! -d "{chalearn_train_save_unformatted_location}" ]; then mkdir -p {chalearn_train_save_unformatted_location}; fi') + os.system( + f'if [ ! -d "{chalearn_dev_save_unformatted_location}" ]; then mkdir -p {chalearn_dev_save_unformatted_location}; fi') + os.system( + f'if [ ! -d "{chalearn_test_save_unformatted_location}" ]; then mkdir -p {chalearn_test_save_unformatted_location}; fi') + + transcribe_and_save("chalearn", chalearn_train_base_location, chalearn_train_wav_location, + chalearn_train_save_unformatted_location) + + transcribe_and_save("chalearn", chalearn_dev_base_location, chalearn_dev_wav_location, + chalearn_dev_save_unformatted_location) + + transcribe_and_save("chalearn", chalearn_test_base_location, chalearn_test_wav_location, + chalearn_test_save_unformatted_location) diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py index 5ff2727e..7a2e34e7 100644 --- a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py +++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py @@ -59,37 +59,6 @@ def read_in_current_files(self, current_file_location): all_utts = pd.read_csv(current_file_location, sep='\t') return all_utts - # - # all_utts.to_dict('records', into=data_dict) - # - # return data_dict - - # def save_transcriptions(self, transcriptions, save_name): - # """ - # Saves the transcriptions as they come out - # Saved as a tsv file with name save_name - # To location self.location - # Does not consider dataset structure - # """ - # sname = "" - # if save_name.endswith(".tsv"): - # sname = save_name - # elif save_name.endswith(".csv"): - # sname = save_name.split(".csv")[0] - # else: - # sname = f"{save_name}.tsv" - # # convert transcriptions if in dict - # if type(transcriptions) == dict: - # transcriptions = [(k, v) for k, v in transcriptions.items()] - # # save transcriptions - # if type(transcriptions) == pd.DataFrame: - # transcriptions.to_csv(f"{self.location}/{sname}", index=False, - # sep="\t") - # elif type(transcriptions) == list: - # with open(f"{self.location}/{sname}", 'w') as wfile: - # wfile.write("id\tutterance\n") - # for item in transcriptions: - # wfile.write(f"{item[0]}\t{item[1]}\n") def save_transcriptions(self, transcriptions_dict, current_files, save_name): """ @@ -157,35 +126,6 @@ def transcribe(self): # return completed dict return transcript_dict - # def transcribe(self): - # """ - # transcribe all available files in the specified location - # """ - # # save wavname -> transcription dict - # transcript_dict = {} - # - # if self.extensions is not None: - # for ext in self.extensions: - # # get the location of each dir with files - # location = f"{self.location}/{ext}" - # # find wav files - # for wavfile in os.listdir(location): - # if wavfile.endswith(".wav"): - # wavname = wavfile.split('.wav')[0] - # print(f"Now transcribing {wavname}") - # # transcribe wav files - # full_path = os.path.join(location, wavfile) - # transcription = sphinx_sr.transcribe_file(full_path) - # print(transcription) - # # add wavname, transcription pairs to transcript_dict - # if wavname not in transcript_dict: - # transcript_dict[wavname] = transcription - # else: - # warnings.warn(f"{wavname} already in transcript_dict. Not replacing transcription") - # - # # return completed dict - # return transcript_dict - if __name__ == "__main__": if sys.argv[1] == "mustard": @@ -240,15 +180,15 @@ def transcribe(self): elif sys.argv[1] == "chalearn": # assumes that datasets are in the untracked 'data' directory - chalearn_location = "../../data/multimodal_datasets/Chalearn" + chalearn_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn" chalearn_train_extension = "train/mp4" chalearn_dev_extension = "val/mp4" - # chalearn_test_extension = "test/mp4 + chalearn_test_extension = "test/wav" current_train_path = f"{chalearn_location}/train/gold_and_utts.tsv" current_dev_path = f"{chalearn_location}/val/gold_and_utts.tsv" - # current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv" + current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv" chalearn_train_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_train_extension) current_train_file = chalearn_train_transcriber.read_in_current_files(current_train_path) @@ -262,4 +202,11 @@ def transcribe(self): dev_transcripts = chalearn_dev_transcriber.transcribe() # save transcriptions - chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv") \ No newline at end of file + chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv") + + chalearn_test_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_test_extension) + current_test_file = chalearn_test_transcriber.read_in_current_files(current_test_path) + test_transcripts = chalearn_test_transcriber.transcribe() + + # save transcriptions + chalearn_test_transcriber.save_transcriptions(test_transcripts, current_test_file, "test/chalearn_sphinx.tsv") \ No newline at end of file