clulab · jmculnan · Jan 6, 2021 · Jan 8, 2021 · Jan 15, 2021
diff --git a/speech_recognition/README.md → speech_recognizers/README.md b/speech_recognition/README.md → speech_recognizers/README.md
diff --git a/speech_recognizers/__init__.py b/speech_recognizers/__init__.py
diff --git a/speech_recognition/google_sr.py → speech_recognizers/google_sr.py b/speech_recognition/google_sr.py → speech_recognizers/google_sr.py
@@ -1,7 +1,6 @@
 import sys
-
-sample_rate = sys.argv[1]
-input_file = sys.argv[2]
+import os
+import json
 
 
 def transcribe_file(speech_file, sample_rate):
@@ -19,22 +18,38 @@ def transcribe_file(speech_file, sample_rate):
         encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=sample_rate,
         language_code="en-US",
+        enable_word_time_offsets=True,
+        # enable_word_confidence=True
     )
 
     response = client.recognize(config=config, audio=audio)
 
     # Each result is for a consecutive portion of the audio. Iterate through
     # them to get the transcripts for the entire audio file.
 
+    all_results = ""
+    all_text_confs = []
+
     for result in response.results:
         # The first alternative is the most likely one for this portion.
-        text_conf = "%s\t%s" % (result.alternatives[0].transcript, result.alternatives[0].confidence)
+        all_results += str(result) + '\n'
+        all_text_confs.append("%s\t%s" % (result.alternatives[0].transcript,
+                                          result.alternatives[0].confidence))
+
     # print(response)
     # print(response.results)
 
-    return text_conf
+    return all_results, all_text_confs
+    # return text_conf
 
 
 if __name__ == "__main__":
-    result = transcribe_file(input_file, sample_rate)
-    print(result)
+    # command-line entry point: transcribe one audio file and print the
+    # "transcript<TAB>confidence" line of its first recognized portion
+
+    # replace this with your credentials
+    credentials = "your_credentials_here.json"
+    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials
+
+    if len(sys.argv) < 3:
+        sys.exit("usage: google_sr.py SAMPLE_RATE INPUT_FILE")
+
+    # convert the command-line string to an int before it is passed on as
+    # sample_rate_hertz
+    sample_rate = int(sys.argv[1])
+    input_file = sys.argv[2]
+
+    unformatted_results, utt_result = transcribe_file(input_file, sample_rate)
+    # utt_result holds one entry per result in the response and is empty when
+    # the response contains no results, so guard the index
+    if utt_result:
+        print(utt_result[0])
+    else:
+        print("no transcription results returned", file=sys.stderr)
diff --git a/speech_recognition/sphinx_sr.py → speech_recognizers/sphinx_sr.py b/speech_recognition/sphinx_sr.py → speech_recognizers/sphinx_sr.py
@@ -1,8 +1,6 @@
 import speech_recognition as sr
 import sys
 
-input_file = sys.argv[1]
-
 
 def transcribe_file(input_file):
 
@@ -11,11 +9,15 @@ def transcribe_file(input_file):
     with stim as source:
         audio = r.record(source)
 
-    transcription = r.recognize_sphinx(audio)
+    try:
+        transcription = r.recognize_sphinx(audio)
+    except:
+        transcription = None
 
     return transcription
 
 
 if __name__ == "__main__":
+    # the path of the audio file to transcribe is the first command-line argument
+    input_file = sys.argv[1]
     result = transcribe_file(input_file)
+    # result is None when recognize_sphinx raised inside transcribe_file
     print(result)
diff --git a/train_and_test_models/transcribe_datasets/__init__.py b/train_and_test_models/transcribe_datasets/__init__.py
diff --git a/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py b/train_and_test_models/transcribe_datasets/transcribe_datasets_google.py
@@ -0,0 +1,131 @@
+# transcribe datasets using google speech-to-text
+
+import os
+import sys
+import wave
+from speech_recognizers.google_sr import transcribe_file
+
+
+_SUPPORTED_DATASETS = ("mustard", "meld", "chalearn")
+
+
+def _get_item_name(dataset, item):
+    """Return the name used for the unformatted results file of a wav file.
+
+    :param dataset: lowercased dataset name, one of _SUPPORTED_DATASETS
+    :param item: file name of the wav file
+    :raises ValueError: if dataset is not a supported dataset name
+    """
+    if dataset == "mustard":
+        return item.split(".wav")[0]
+    if dataset == "meld":
+        # names that do not contain "_2.wav" are returned unchanged
+        return item.split("_2.wav")[0]
+    if dataset == "chalearn":
+        return item.split(".wav")[0] + ".mp4"
+    raise ValueError(f"unsupported dataset: {dataset}")
+
+
+def transcribe_and_save(dataset, base_location, wav_location, save_unformatted_location,
+                        save_name="google_transcriptions.txt", max_filesize=10000000):
+    """Transcribe every wav file in a directory and save the results.
+
+    For each transcribed file, the unformatted results are written to
+    <save_unformatted_location>/<item name>_unformatted.txt. One tab-separated
+    line "<wav file name> <utterance result>" per utterance result is written
+    to <base_location>/<save_name>. The names of the files that were skipped
+    because of their size are written to <base_location>/skipped_files.txt.
+
+    :param dataset: "mustard", "meld" or "chalearn" (case-insensitive);
+        determines how the name of the unformatted results file is derived
+    :param base_location: directory that receives the two summary files
+    :param wav_location: directory that is searched for files ending in .wav
+    :param save_unformatted_location: existing directory that receives one
+        unformatted results file per transcribed wav file
+    :param save_name: file name of the formatted transcriptions file
+    :param max_filesize: wav files of this many bytes or more are skipped
+        NOTE(review): the default presumably keeps requests under the size
+        limit of the synchronous speech-to-text API -- confirm
+    :raises ValueError: if dataset is not a supported dataset name
+    """
+    # fail before any request is made instead of hitting an unbound
+    # item_name on the first wav file
+    dataset = dataset.lower()
+    if dataset not in _SUPPORTED_DATASETS:
+        raise ValueError(
+            f"unsupported dataset: {dataset}; expected one of {', '.join(_SUPPORTED_DATASETS)}")
+
+    formatted_results = []
+    skipped_audio = []
+
+    # os.listdir returns names in arbitrary order; sort so that the output
+    # files are reproducible between runs
+    for item in sorted(os.listdir(wav_location)):
+        if not item.endswith('.wav'):
+            continue
+
+        wav_path = os.path.join(wav_location, item)
+        if os.path.getsize(wav_path) >= max_filesize:
+            skipped_audio.append(item)
+            continue
+
+        # the sample rate is read from the wav header and passed to the recognizer
+        with wave.open(wav_path, "rb") as wave_file:
+            frame_rate = wave_file.getframerate()
+
+        item_name = _get_item_name(dataset, item)
+        unformatted_results, utt_results = transcribe_file(wav_path, frame_rate)
+
+        unformatted_path = os.path.join(save_unformatted_location,
+                                        f"{item_name}_unformatted.txt")
+        with open(unformatted_path, 'w', encoding="utf-8") as unformatted_file:
+            unformatted_file.write(unformatted_results)
+
+        formatted_results.extend(f"{item}\t{utt}" for utt in utt_results)
+
+    with open(os.path.join(base_location, save_name), 'w', encoding="utf-8") as gfile:
+        gfile.write("\n".join(formatted_results))
+
+    with open(os.path.join(base_location, "skipped_files.txt"), 'w', encoding="utf-8") as skipped:
+        skipped.write("\n".join(skipped_audio))
+
+
+if __name__ == "__main__":
+    # command-line entry point: transcribe all splits of the dataset named by
+    # the first argument ("mustard", "meld" or "chalearn")
+
+    # replace with your credentials
+    credentials = "your_credentials_here.json"
+    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials
+
+    if len(sys.argv) < 2:
+        sys.exit("usage: transcribe_datasets_google.py {mustard|meld|chalearn}")
+
+    dataset_name = sys.argv[1]
+
+    if dataset_name == "mustard":
+        # access the data
+        mustard_base_location = "/Users/jculnan/datasets/multimodal_datasets/MUStARD"
+        mustard_wav_location = os.path.join(mustard_base_location, "wav")
+        # save unformatted objects
+        mustard_save_unformatted_location = os.path.join(mustard_base_location, "google-unformatted")
+        # create this directory if it doesn't exist
+        os.makedirs(mustard_save_unformatted_location, exist_ok=True)
+
+        # transcribe_and_save already handles "mustard"; using it here also
+        # skips oversized wav files and records them in skipped_files.txt
+        transcribe_and_save("mustard", mustard_base_location, mustard_wav_location,
+                            mustard_save_unformatted_location)
+
+    elif dataset_name == "meld":
+        meld_root = "/Users/jculnan/datasets/multimodal_datasets/MELD_formatted"
+        # (split directory, directory for unformatted objects, transcriptions file name)
+        # todo: change the train entry once you've tested it
+        # NOTE(review): the train split still uses "google-unformatted-test" and
+        # "dia644_utt4_retest.txt", which look like leftover test values -- confirm
+        meld_splits = [
+            ("train", "google-unformatted-test", "dia644_utt4_retest.txt"),
+            ("dev", "google-unformatted", "google_transcriptions.txt"),
+            ("test", "google-unformatted", "google_transcriptions.txt"),
+        ]
+
+        for split, unformatted_dir, save_name in meld_splits:
+            # access the data
+            base_location = os.path.join(meld_root, split)
+            wav_location = os.path.join(base_location, f"{split}_audio_mono")
+            # save unformatted objects
+            save_unformatted_location = os.path.join(base_location, unformatted_dir)
+            # create this directory if it doesn't exist
+            os.makedirs(save_unformatted_location, exist_ok=True)
+
+            transcribe_and_save("meld", base_location, wav_location,
+                                save_unformatted_location, save_name=save_name)
+
+    elif dataset_name == "chalearn":
+        chalearn_root = "/Users/jculnan/datasets/multimodal_datasets/Chalearn"
+
+        # the dev data lives in the "val" directory
+        for split in ("train", "val", "test"):
+            # access the data
+            base_location = os.path.join(chalearn_root, split)
+            wav_location = os.path.join(base_location, "wav")
+            # save unformatted objects
+            save_unformatted_location = os.path.join(base_location, "google-unformatted")
+            # create this directory if it doesn't exist
+            os.makedirs(save_unformatted_location, exist_ok=True)
+
+            transcribe_and_save("chalearn", base_location, wav_location,
+                                save_unformatted_location)
+
+    else:
+        # report an unknown dataset name instead of silently doing nothing
+        sys.exit(f"unknown dataset: {dataset_name}; expected mustard, meld or chalearn")