From 9b4a911c93aef82508278fb1eeea6d53efc6d629 Mon Sep 17 00:00:00 2001
From: john culnan <jmculnan@email.arizona.edu>
Date: Wed, 6 Jan 2021 11:29:37 -0700
Subject: [PATCH 1/3] started code to batch-transcribe current datasets with
 pocketsphinx

---
 .../transcribe_datasets/__init__.py           |  0
 .../transcribe_datasets_sphinx.py             | 41 +++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 train_and_test_models/transcribe_datasets/__init__.py
 create mode 100644 train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py

diff --git a/train_and_test_models/transcribe_datasets/__init__.py b/train_and_test_models/transcribe_datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
new file mode 100644
index 00000000..a65b6802
--- /dev/null
+++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
@@ -0,0 +1,41 @@
+# transcribe all datasets using sphinx with speech_recognition/sphinx_sr.py
+# assumes data files are WAV formatted and organized in directories
+
+from speech_recognition import sphinx_sr
+import os
+
+
+class DatasetTranscriber:
+    """
+    Transcribes wav files for datasets using pocketsphinx
+    currently written for MELD, MUStARD, and ChaLearn
+    param dataset should be one of these three
+    """
+    def __init__(self, dataset, location, extensions=None):
+        self.dataset = dataset.lower()  # options: 'meld', 'mustard', 'chalearn'
+        self.location = location
+        # get list of extensions
+        if type(self.extensions) is not str:
+            self.extensions = extensions
+        else:
+            self.extensions = []
+            self.extensions.append(extensions)
+
+    def read_in_current_files(self, current_file_location):
+        pass
+
+    def transcribe(self, save_location):
+        """
+        transcribe all available files in the specified location
+        """
+        if self.extensions is not None:
+            for ext in self.extensions:
+                # get the location of each dir with files
+                location = f"{self.location}/{ext}"
+                # find wav files
+                for wavfile in os.listdir(location):
+                    if wavfile.endswith(".wav"):
+                        # transcribe wav files
+                        full_path = os.path.join(location, wavfile)
+                        transcription = sphinx_sr.transcribe_file(full_path)
+                        # save transcribed wav files to new files
\ No newline at end of file

From 3be6b917926d1c73a558b9a2353d3133a33d42d1 Mon Sep 17 00:00:00 2001
From: john culnan <jmculnan@email.arizona.edu>
Date: Fri, 8 Jan 2021 09:15:49 -0700
Subject: [PATCH 2/3] updated code to transcribe MELD, MUStARD, ChaLearn with
 PocketSphinx

---
 .../README.md                                 |   0
 speech_recognizers/__init__.py                |   0
 .../google_sr.py                              |   0
 .../sphinx_sr.py                              |   8 +-
 .../transcribe_datasets_sphinx.py             | 240 +++++++++++++++++-
 5 files changed, 237 insertions(+), 11 deletions(-)
 rename {speech_recognition => speech_recognizers}/README.md (100%)
 create mode 100644 speech_recognizers/__init__.py
 rename {speech_recognition => speech_recognizers}/google_sr.py (100%)
 rename {speech_recognition => speech_recognizers}/sphinx_sr.py (70%)

diff --git a/speech_recognition/README.md b/speech_recognizers/README.md
similarity index 100%
rename from speech_recognition/README.md
rename to speech_recognizers/README.md
diff --git a/speech_recognizers/__init__.py b/speech_recognizers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/speech_recognition/google_sr.py b/speech_recognizers/google_sr.py
similarity index 100%
rename from speech_recognition/google_sr.py
rename to speech_recognizers/google_sr.py
diff --git a/speech_recognition/sphinx_sr.py b/speech_recognizers/sphinx_sr.py
similarity index 70%
rename from speech_recognition/sphinx_sr.py
rename to speech_recognizers/sphinx_sr.py
index 26746dda..b696c88b 100644
--- a/speech_recognition/sphinx_sr.py
+++ b/speech_recognizers/sphinx_sr.py
@@ -1,8 +1,6 @@
 import speech_recognition as sr
 import sys
 
-input_file = sys.argv[1]
-
 
 def transcribe_file(input_file):
 
@@ -11,11 +9,15 @@ def transcribe_file(input_file):
     with stim as source:
         audio = r.record(source)
 
-    transcription = r.recognize_sphinx(audio)
+    try:
+        transcription = r.recognize_sphinx(audio)
+    except:
+        transcription = None
 
     return transcription
 
 
 if __name__ == "__main__":
+    input_file = sys.argv[1]
     result = transcribe_file(input_file)
     print(result)
\ No newline at end of file
diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
index a65b6802..5ff2727e 100644
--- a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
+++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
@@ -1,41 +1,265 @@
-# transcribe all datasets using sphinx with speech_recognition/sphinx_sr.py
+# transcribe all datasets using sphinx with speech_recognizers/sphinx_sr.py
 # assumes data files are WAV formatted and organized in directories
 
-from speech_recognition import sphinx_sr
+from speech_recognizers import sphinx_sr
+
 import os
+import pandas as pd
+import warnings
+import sys
 
 
 class DatasetTranscriber:
     """
     Transcribes wav files for datasets using pocketsphinx
     currently written for MELD, MUStARD, and ChaLearn
-    param dataset should be one of these three
+    dataset : a string of name of dataset
+    location : full path to dataset directory
+    extensions : directory extensions needed to access wav files
     """
     def __init__(self, dataset, location, extensions=None):
         self.dataset = dataset.lower()  # options: 'meld', 'mustard', 'chalearn'
         self.location = location
         # get list of extensions
-        if type(self.extensions) is not str:
+        if type(extensions) is not str:
             self.extensions = extensions
         else:
             self.extensions = []
             self.extensions.append(extensions)
+        self.save_location = f"{location}/{self.dataset}_transcribed"
+
+    def convert_and_save_transcriptions(self, data_dict, transcript_dict):
+        """
+        takes a dictionary of current gold files with text
+        and a dictionary of new transcriptions
+        and replaces gold with new transcriptions
+        saves updated files in new location
+        """
+        # find dataset type
+        if self.dataset == "meld":
+            pass
+        elif self.dataset == "mustard":
+
+            pass
+        elif self.dataset == "chalearn":
+            pass
 
     def read_in_current_files(self, current_file_location):
-        pass
+        """
+        read in files containing gold transcriptions
+        files are in different formats depending upon the dataset
+        """
+        # create dict for label : utt/other-info pairs
+        data_dict = {}
+        if self.dataset == "meld":
+            # all utterances in a single csv file
+            all_utts = pd.read_csv(current_file_location)
+        elif self.dataset == "mustard" or self.dataset == "chalearn":
+            # all utterances are in a single tsv file
+            all_utts = pd.read_csv(current_file_location, sep='\t')
+
+        return all_utts
+        #
+        # all_utts.to_dict('records', into=data_dict)
+        #
+        # return data_dict
 
-    def transcribe(self, save_location):
+    # def save_transcriptions(self, transcriptions, save_name):
+    #     """
+    #     Saves the transcriptions as they come out
+    #     Saved as a tsv file with name save_name
+    #     To location self.location
+    #     Does not consider dataset structure
+    #     """
+    #     sname = ""
+    #     if save_name.endswith(".tsv"):
+    #         sname = save_name
+    #     elif save_name.endswith(".csv"):
+    #         sname = save_name.split(".csv")[0]
+    #     else:
+    #         sname = f"{save_name}.tsv"
+    #     # convert transcriptions if in dict
+    #     if type(transcriptions) == dict:
+    #         transcriptions = [(k, v) for k, v in transcriptions.items()]
+    #     # save transcriptions
+    #     if type(transcriptions) == pd.DataFrame:
+    #         transcriptions.to_csv(f"{self.location}/{sname}", index=False,
+    #                               sep="\t")
+    #     elif type(transcriptions) == list:
+    #         with open(f"{self.location}/{sname}", 'w') as wfile:
+    #             wfile.write("id\tutterance\n")
+    #             for item in transcriptions:
+    #                 wfile.write(f"{item[0]}\t{item[1]}\n")
+
+    def save_transcriptions(self, transcriptions_dict, current_files, save_name):
+        """
+        Save transcriptions alongside other info currently
+        required for each dataset
+        Saves to location self.location
+        """
+        sname = ""
+        if save_name.endswith(".tsv"):
+            sname = save_name
+        elif save_name.endswith(".csv"):
+            sname = save_name.split(".csv")[0]
+        else:
+            sname = f"{save_name}.tsv"
+
+        # convert transcriptions dict to pandas df
+        transcriptions_df = pd.DataFrame.from_dict(transcriptions_dict)
+        # delete utterance from current_files
+        current_files = current_files.loc[:, ~(current_files.columns.str.lower() == 'utterance')]
+
+        # merge dfs on id
+        if self.dataset == "meld":
+            transcriptions_df.rename(columns={'id': 'DiaID_UttID'}, inplace=True)
+            transcriptions_df = transcriptions_df.merge(current_files, on='DiaID_UttID')
+        elif self.dataset == "mustard":
+            transcriptions_df.rename(columns={'id': 'clip_id'}, inplace=True)
+            print(transcriptions_df.columns.values.tolist())
+            print(current_files.columns.values.tolist())
+            transcriptions_df = transcriptions_df.merge(current_files, on='clip_id')
+        elif self.dataset == "chalearn":
+            transcriptions_df.rename(columns={'id': 'file'}, inplace=True)
+            transcriptions_df = transcriptions_df.merge(current_files, on='file')
+
+        transcriptions_df.to_csv(f"{self.location}/{sname}", index=False,
+                                 sep="\t")
+
+    def transcribe(self):
         """
         transcribe all available files in the specified location
         """
+        # save dict of [name -> [list]]
+        transcript_dict = {'id': [], 'utterance': []}
+
         if self.extensions is not None:
             for ext in self.extensions:
                 # get the location of each dir with files
                 location = f"{self.location}/{ext}"
                 # find wav files
                 for wavfile in os.listdir(location):
-                    if wavfile.endswith(".wav"):
+                    ending = ".wav"
+                    if self.dataset == "meld":
+                        ending = "_2.wav"
+                    if wavfile.endswith(ending):
+                        wavname = wavfile.split(ending)[0]
+                        if self.dataset == "chalearn":
+                            wavname = wavname + ".mp4"
+                        print(f"Now transcribing {wavname}")
                         # transcribe wav files
                         full_path = os.path.join(location, wavfile)
                         transcription = sphinx_sr.transcribe_file(full_path)
-                        # save transcribed wav files to new files
\ No newline at end of file
+                        print(transcription)
+                        # add wavname, transcription pairs to transcript_dict
+                        transcript_dict['id'].append(wavname)
+                        transcript_dict['utterance'].append(transcription)
+        # return completed dict
+        return transcript_dict
+
+    # def transcribe(self):
+    #     """
+    #     transcribe all available files in the specified location
+    #     """
+    #     # save wavname -> transcription dict
+    #     transcript_dict = {}
+    #
+    #     if self.extensions is not None:
+    #         for ext in self.extensions:
+    #             # get the location of each dir with files
+    #             location = f"{self.location}/{ext}"
+    #             # find wav files
+    #             for wavfile in os.listdir(location):
+    #                 if wavfile.endswith(".wav"):
+    #                     wavname = wavfile.split('.wav')[0]
+    #                     print(f"Now transcribing {wavname}")
+    #                     # transcribe wav files
+    #                     full_path = os.path.join(location, wavfile)
+    #                     transcription = sphinx_sr.transcribe_file(full_path)
+    #                     print(transcription)
+    #                     # add wavname, transcription pairs to transcript_dict
+    #                     if wavname not in transcript_dict:
+    #                         transcript_dict[wavname] = transcription
+    #                     else:
+    #                         warnings.warn(f"{wavname} already in transcript_dict. Not replacing transcription")
+    #
+    #     # return completed dict
+    #     return transcript_dict
+
+
+if __name__ == "__main__":
+    if sys.argv[1] == "mustard":
+        # assumes that datasets are in the untracked 'data' directory
+        mustard_location = "../../data/multimodal_datasets/MUStARD"
+        mustard_extensions = "utterances_final"
+        current_file_path = f"{mustard_location}/mustard_utts.tsv"
+
+        mustard_transcriber = DatasetTranscriber("MUStARD", mustard_location, mustard_extensions)
+
+        # get current label file
+        current_file = mustard_transcriber.read_in_current_files(current_file_path)
+        print("Current file read")
+        print(current_file.head(5))
+
+        # transcribe data
+        transcripts = mustard_transcriber.transcribe()
+
+        # save transcriptions
+        mustard_transcriber.save_transcriptions(transcripts, current_file, "mustard_sphinx.tsv")
+
+    elif sys.argv[1] == "meld":
+        # assumes that datasets are in the untracked 'data' directory
+        meld_location = "../../data/multimodal_datasets/MELD_formatted"
+
+        meld_train_extensions = "train/train_audio"
+        meld_dev_extensions = "dev/dev_audio"
+        meld_test_extensions = "test/test_audio"
+
+        current_train_path = f"{meld_location}/train/train_sent_emo.csv"
+        current_dev_path = f"{meld_location}/dev/dev_sent_emo.csv"
+        current_test_path = f"{meld_location}/test/test_sent_emo.csv"
+
+        meld_train_transcriber = DatasetTranscriber("MELD", meld_location, meld_train_extensions)
+        meld_dev_transcriber = DatasetTranscriber("MELD", meld_location, meld_dev_extensions)
+        meld_test_transcriber = DatasetTranscriber("MELD", meld_location, meld_test_extensions)
+
+        # read in the current files containing gold transcriptions
+        current_train_file = meld_train_transcriber.read_in_current_files(current_train_path)
+        current_dev_file = meld_dev_transcriber.read_in_current_files(current_dev_path)
+        current_test_file = meld_test_transcriber.read_in_current_files(current_test_path)
+
+        # transcribe data
+        train_transcripts = meld_train_transcriber.transcribe()
+        dev_transcripts = meld_dev_transcriber.transcribe()
+        test_transcripts = meld_test_transcriber.transcribe()
+
+        # save transcriptions
+        meld_train_transcriber.save_transcriptions(train_transcripts, current_train_file, "train/meld_sphinx.tsv")
+        meld_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "dev/meld_sphinx.tsv")
+        meld_test_transcriber.save_transcriptions(test_transcripts, current_test_file, "test/meld_sphinx.tsv")
+
+    elif sys.argv[1] == "chalearn":
+        # assumes that datasets are in the untracked 'data' directory
+        chalearn_location = "../../data/multimodal_datasets/Chalearn"
+
+        chalearn_train_extension = "train/mp4"
+        chalearn_dev_extension = "val/mp4"
+        # chalearn_test_extension = "test/mp4
+
+        current_train_path = f"{chalearn_location}/train/gold_and_utts.tsv"
+        current_dev_path = f"{chalearn_location}/val/gold_and_utts.tsv"
+        # current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv"
+
+        chalearn_train_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_train_extension)
+        current_train_file = chalearn_train_transcriber.read_in_current_files(current_train_path)
+        train_transcripts = chalearn_train_transcriber.transcribe()
+
+        # save transcriptions
+        chalearn_train_transcriber.save_transcriptions(train_transcripts, current_train_file, "train/chalearn_sphinx.tsv")
+
+        chalearn_dev_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_dev_extension)
+        current_dev_file = chalearn_dev_transcriber.read_in_current_files(current_dev_path)
+        dev_transcripts = chalearn_dev_transcriber.transcribe()
+
+        # save transcriptions
+        chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv")
\ No newline at end of file

From 1f549f73edc10e7e78ab948fc2cc689539311734 Mon Sep 17 00:00:00 2001
From: john culnan <jmculnan@email.arizona.edu>
Date: Fri, 15 Jan 2021 10:37:00 -0700
Subject: [PATCH 3/3] updates to run datasets through sphinx and google speech
 recognizers

---
 speech_recognizers/google_sr.py               |  29 +++-
 .../transcribe_datasets_google.py             | 131 ++++++++++++++++++
 .../transcribe_datasets_sphinx.py             |  75 ++--------
 3 files changed, 164 insertions(+), 71 deletions(-)
 create mode 100644 train_and_test_models/transcribe_datasets/transcribe_datasets_google.py

diff --git a/speech_recognizers/google_sr.py b/speech_recognizers/google_sr.py
index e0d5ebad..b3ee8db2 100644
--- a/speech_recognizers/google_sr.py
+++ b/speech_recognizers/google_sr.py
@@ -1,7 +1,6 @@
 import sys
-
-sample_rate = sys.argv[1]
-input_file = sys.argv[2]
+import os
+import json
 
 
 def transcribe_file(speech_file, sample_rate):
@@ -19,6 +18,8 @@ def transcribe_file(speech_file, sample_rate):
         encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=sample_rate,
         language_code="en-US",
+        enable_word_time_offsets=True,
+        # enable_word_confidence=True
     )
 
     response = client.recognize(config=config, audio=audio)
@@ -26,15 +27,29 @@ def transcribe_file(speech_file, sample_rate):
     # Each result is for a consecutive portion of the audio. Iterate through
     # them to get the transcripts for the entire audio file.
 
+    all_results = ""
+    all_text_confs = []
+
     for result in response.results:
         # The first alternative is the most likely one for this portion.
-        text_conf = "%s\t%s" % (result.alternatives[0].transcript, result.alternatives[0].confidence)
+        all_results += str(result) + '\n'
+        all_text_confs.append("%s\t%s" % (result.alternatives[0].transcript,
+                                          result.alternatives[0].confidence))
+
     # print(response)
     # print(response.results)
 
-    return text_conf
+    return all_results, all_text_confs
+    # return text_conf
 
 
 if __name__ == "__main__":
-    result = transcribe_file(input_file, sample_rate)
-    print(result)
+    # replace this with your credentials
+    credentials = "your_credentials_here.json"
+    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials
+
+    sample_rate = int(sys.argv[1])
+    input_file = sys.argv[2]
+
+    unformatted_results, utt_result = transcribe_file(input_file, sample_rate)
+    print(utt_result[0])
diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py
new file mode 100644
index 00000000..efa7dcb8
--- /dev/null
+++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py
@@ -0,0 +1,131 @@
+# transcribe datasets using google speech-to-text
+
+import os
+import sys
+import wave
+from speech_recognizers.google_sr import transcribe_file
+
+
+def transcribe_and_save(dataset, base_location, wav_location, save_unformatted_location,
+                        save_name="google_transcriptions.txt"):
+    formatted_results = []
+    skipped_audio = []
+
+    for item in os.listdir(wav_location):
+        if item.endswith('.wav'):
+            filesize = os.path.getsize(f"{wav_location}/{item}")
+            if filesize >= 10000000:
+                skipped_audio.append(item)
+            else:
+                with wave.open(f"{wav_location}/{item}", "rb") as wave_file:
+                    frame_rate = wave_file.getframerate()
+                if dataset.lower() == "mustard":
+                    item_name = item.split(".wav")[0]
+                elif dataset.lower() == "meld":
+                    item_name = item.split("_2.wav")[0]
+                elif dataset.lower() == "chalearn":
+                    item_name = item.split(".wav")[0] + ".mp4"
+                unformatted_results, utt_results = transcribe_file(f"{wav_location}/{item}", frame_rate)
+                with open(f"{save_unformatted_location}/{item_name}_unformatted.txt", 'w') as unformatted_file:
+                    unformatted_file.write(unformatted_results)
+                for utt in utt_results:
+                    line = f"{item}\t{utt}"
+                    formatted_results.append(line)
+
+    with open(f"{base_location}/{save_name}", 'w') as gfile:
+        gfile.write("\n".join(formatted_results))
+
+    with open(f"{base_location}/skipped_files.txt", 'w') as skipped:
+        skipped.write("\n".join(skipped_audio))
+
+
+if __name__ == "__main__":
+    # replace with your credentials
+    credentials = "your_credentials_here.json"
+    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials
+
+    # sample_rate = 44100
+
+    if sys.argv[1] == "mustard":
+        # access the data
+        mustard_base_location = "/Users/jculnan/datasets/multimodal_datasets/MUStARD"
+        mustard_wav_location = os.path.join(mustard_base_location, "wav")
+        # save unformatted objects
+        mustard_save_unformatted_location = os.path.join(mustard_base_location, "google-unformatted")
+        # create this directory if it doesn't exist
+        os.system(f'if [ ! -d "{mustard_save_unformatted_location}" ]; then mkdir -p {mustard_save_unformatted_location}; fi')
+
+        formatted_results = []
+
+        for item in os.listdir(mustard_wav_location):
+            if item.endswith('.wav'):
+                with wave.open(f"{mustard_wav_location}/{item}", "rb") as wave_file:
+                    frame_rate = wave_file.getframerate()
+                item_name = item.split(".wav")[0]
+                unformatted_results, utt_results = transcribe_file(f"{mustard_wav_location}/{item}", frame_rate)
+                with open(f"{mustard_save_unformatted_location}/{item_name}_unformatted.txt", 'w') as unformatted_file:
+                    unformatted_file.write(unformatted_results)
+                for utt in utt_results:
+                    line = f"{item}\t{utt}"
+                    formatted_results.append(line)
+
+        with open(f"{mustard_base_location}/google_transcriptions.txt", 'w') as gfile:
+            gfile.write("\n".join(formatted_results))
+
+    elif sys.argv[1] == "meld":
+        # access the data
+        meld_train_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/train"
+        meld_dev_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/dev"
+        meld_test_base_location = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted/test"
+        # todo: change this once you've tested it
+        meld_train_wav_location = os.path.join(meld_train_base_location, "train_audio_mono")
+        meld_dev_wav_location = os.path.join(meld_dev_base_location, "dev_audio_mono")
+        meld_test_wav_location = os.path.join(meld_test_base_location, "test_audio_mono")
+        # save unformatted objects
+        meld_train_save_unformatted_location = os.path.join(meld_train_base_location, "google-unformatted-test")
+        meld_dev_save_unformatted_location = os.path.join(meld_dev_base_location, "google-unformatted")
+        meld_test_save_unformatted_location = os.path.join(meld_test_base_location, "google-unformatted")
+        # create this directory if it doesn't exist
+        os.system(f'if [ ! -d "{meld_train_save_unformatted_location}" ]; then mkdir -p {meld_train_save_unformatted_location}; fi')
+        os.system(
+            f'if [ ! -d "{meld_dev_save_unformatted_location}" ]; then mkdir -p {meld_dev_save_unformatted_location}; fi')
+        os.system(
+            f'if [ ! -d "{meld_test_save_unformatted_location}" ]; then mkdir -p {meld_test_save_unformatted_location}; fi')
+
+        transcribe_and_save("meld", meld_train_base_location, meld_train_wav_location,
+                            meld_train_save_unformatted_location, save_name="dia644_utt4_retest.txt")
+
+        transcribe_and_save("meld", meld_dev_base_location, meld_dev_wav_location,
+                            meld_dev_save_unformatted_location)
+
+        transcribe_and_save("meld", meld_test_base_location, meld_test_wav_location,
+                            meld_test_save_unformatted_location)
+
+    elif sys.argv[1] == "chalearn":
+        # access the data
+        chalearn_train_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/train"
+        chalearn_dev_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/val"
+        chalearn_test_base_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn/test"
+        # todo: change this once you've tested it
+        chalearn_train_wav_location = os.path.join(chalearn_train_base_location, "wav")
+        chalearn_dev_wav_location = os.path.join(chalearn_dev_base_location, "wav")
+        chalearn_test_wav_location = os.path.join(chalearn_test_base_location, "wav")
+        # save unformatted objects
+        chalearn_train_save_unformatted_location = os.path.join(chalearn_train_base_location, "google-unformatted")
+        chalearn_dev_save_unformatted_location = os.path.join(chalearn_dev_base_location, "google-unformatted")
+        chalearn_test_save_unformatted_location = os.path.join(chalearn_test_base_location, "google-unformatted")
+        # create this directory if it doesn't exist
+        os.system(f'if [ ! -d "{chalearn_train_save_unformatted_location}" ]; then mkdir -p {chalearn_train_save_unformatted_location}; fi')
+        os.system(
+            f'if [ ! -d "{chalearn_dev_save_unformatted_location}" ]; then mkdir -p {chalearn_dev_save_unformatted_location}; fi')
+        os.system(
+            f'if [ ! -d "{chalearn_test_save_unformatted_location}" ]; then mkdir -p {chalearn_test_save_unformatted_location}; fi')
+
+        transcribe_and_save("chalearn", chalearn_train_base_location, chalearn_train_wav_location,
+                            chalearn_train_save_unformatted_location)
+
+        transcribe_and_save("chalearn", chalearn_dev_base_location, chalearn_dev_wav_location,
+                            chalearn_dev_save_unformatted_location)
+
+        transcribe_and_save("chalearn", chalearn_test_base_location, chalearn_test_wav_location,
+                            chalearn_test_save_unformatted_location)
diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
index 5ff2727e..7a2e34e7 100644
--- a/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
+++ b/train_and_test_models/transcribe_datasets/transcribe_datasets_sphinx.py
@@ -59,37 +59,6 @@ def read_in_current_files(self, current_file_location):
             all_utts = pd.read_csv(current_file_location, sep='\t')
 
         return all_utts
-        #
-        # all_utts.to_dict('records', into=data_dict)
-        #
-        # return data_dict
-
-    # def save_transcriptions(self, transcriptions, save_name):
-    #     """
-    #     Saves the transcriptions as they come out
-    #     Saved as a tsv file with name save_name
-    #     To location self.location
-    #     Does not consider dataset structure
-    #     """
-    #     sname = ""
-    #     if save_name.endswith(".tsv"):
-    #         sname = save_name
-    #     elif save_name.endswith(".csv"):
-    #         sname = save_name.split(".csv")[0]
-    #     else:
-    #         sname = f"{save_name}.tsv"
-    #     # convert transcriptions if in dict
-    #     if type(transcriptions) == dict:
-    #         transcriptions = [(k, v) for k, v in transcriptions.items()]
-    #     # save transcriptions
-    #     if type(transcriptions) == pd.DataFrame:
-    #         transcriptions.to_csv(f"{self.location}/{sname}", index=False,
-    #                               sep="\t")
-    #     elif type(transcriptions) == list:
-    #         with open(f"{self.location}/{sname}", 'w') as wfile:
-    #             wfile.write("id\tutterance\n")
-    #             for item in transcriptions:
-    #                 wfile.write(f"{item[0]}\t{item[1]}\n")
 
     def save_transcriptions(self, transcriptions_dict, current_files, save_name):
         """
@@ -157,35 +126,6 @@ def transcribe(self):
         # return completed dict
         return transcript_dict
 
-    # def transcribe(self):
-    #     """
-    #     transcribe all available files in the specified location
-    #     """
-    #     # save wavname -> transcription dict
-    #     transcript_dict = {}
-    #
-    #     if self.extensions is not None:
-    #         for ext in self.extensions:
-    #             # get the location of each dir with files
-    #             location = f"{self.location}/{ext}"
-    #             # find wav files
-    #             for wavfile in os.listdir(location):
-    #                 if wavfile.endswith(".wav"):
-    #                     wavname = wavfile.split('.wav')[0]
-    #                     print(f"Now transcribing {wavname}")
-    #                     # transcribe wav files
-    #                     full_path = os.path.join(location, wavfile)
-    #                     transcription = sphinx_sr.transcribe_file(full_path)
-    #                     print(transcription)
-    #                     # add wavname, transcription pairs to transcript_dict
-    #                     if wavname not in transcript_dict:
-    #                         transcript_dict[wavname] = transcription
-    #                     else:
-    #                         warnings.warn(f"{wavname} already in transcript_dict. Not replacing transcription")
-    #
-    #     # return completed dict
-    #     return transcript_dict
-
 
 if __name__ == "__main__":
     if sys.argv[1] == "mustard":
@@ -240,15 +180,15 @@ def transcribe(self):
 
     elif sys.argv[1] == "chalearn":
         # assumes that datasets are in the untracked 'data' directory
-        chalearn_location = "../../data/multimodal_datasets/Chalearn"
+        chalearn_location = "/Users/jculnan/datasets/multimodal_datasets/Chalearn"
 
         chalearn_train_extension = "train/mp4"
         chalearn_dev_extension = "val/mp4"
-        # chalearn_test_extension = "test/mp4
+        chalearn_test_extension = "test/wav"
 
         current_train_path = f"{chalearn_location}/train/gold_and_utts.tsv"
         current_dev_path = f"{chalearn_location}/val/gold_and_utts.tsv"
-        # current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv"
+        current_test_path = f"{chalearn_location}/test/gold_and_utts.tsv"
 
         chalearn_train_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_train_extension)
         current_train_file = chalearn_train_transcriber.read_in_current_files(current_train_path)
@@ -262,4 +202,11 @@ def transcribe(self):
         dev_transcripts = chalearn_dev_transcriber.transcribe()
 
         # save transcriptions
-        chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv")
\ No newline at end of file
+        chalearn_dev_transcriber.save_transcriptions(dev_transcripts, current_dev_file, "val/chalearn_sphinx.tsv")
+
+        chalearn_test_transcriber = DatasetTranscriber("Chalearn", chalearn_location, chalearn_test_extension)
+        current_test_file = chalearn_test_transcriber.read_in_current_files(current_test_path)
+        test_transcripts = chalearn_test_transcriber.transcribe()
+
+        # save transcriptions
+        chalearn_test_transcriber.save_transcriptions(test_transcripts, current_test_file, "test/chalearn_sphinx.tsv")
\ No newline at end of file