From eaed07c85cc0f7b14a71f9dfc74d76408135fc84 Mon Sep 17 00:00:00 2001
From: Zhongqiang Huang <zhuang@fixie.ai>
Date: Thu, 15 Aug 2024 10:57:27 -0700
Subject: [PATCH 1/5] Update default config to ultravox_v0.3

---
 mcloud.yaml                                   |  2 +-
 .../training/configs/llama3_whisper_kd.yaml   | 22 ++++++++++++-------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/mcloud.yaml b/mcloud.yaml
index 280e258f..e2fe999c 100644
--- a/mcloud.yaml
+++ b/mcloud.yaml
@@ -13,5 +13,5 @@ command: >-
   cd ultravox && poetry install --no-dev && poetry run torchrun --nproc_per_node=8 -m ultravox.training.train $TRAIN_ARGS
 env_variables:
   MLFLOW_TRACKING_URI: databricks
-  UV_BRANCH: main
+  UV_BRANCH: update_default_config_to_ultravox_v0.3
   TRAIN_ARGS: --config_path ultravox/training/configs/llama3_whisper_kd.yaml
diff --git a/ultravox/training/configs/llama3_whisper_kd.yaml b/ultravox/training/configs/llama3_whisper_kd.yaml
index d951f02d..63c9ce95 100644
--- a/ultravox/training/configs/llama3_whisper_kd.yaml
+++ b/ultravox/training/configs/llama3_whisper_kd.yaml
@@ -13,27 +13,33 @@ loss_config:
 # Temporarily remove heysquad_human from val_sets as it causes the training to fail.
 val_sets: ["anyinstruct", "soda", "peoplespeech"]
 
-batch_size: 4
-max_steps: 1000
+batch_size: 24
+max_steps: 7200 # x8x24 = 1,382,400 samples
 
 data_sets: []
 data_dicts:
   - path: "fixie-ai/librispeech_asr"
     name: "clean"
     splits:
-      - "train.100"
-      - "train.360"
+      - "train.100" # 28_539 samples
+      - "train.360" # 104_014 samples
     user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text }}"
-    weight: 2
-    num_samples: 100_000
+    weight: 1
   - path: "fixie-ai/librispeech_asr"
     name: "other"
     splits:
-      - "train.500"
+      - "train.500" # 148_688 samples
     user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text }}"
     weight: 1
-    num_samples: 100_000
+  - path: "fixie-ai/common_voice_17_0"
+    name: "en"
+    splits:
+      - "train" # 1_101_170 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
+    weight: 8
\ No newline at end of file

From bee674de4b7fcf856e3d6f9154468f520c8fefef Mon Sep 17 00:00:00 2001
From: Zhongqiang Huang <zhuang@fixie.ai>
Date: Thu, 15 Aug 2024 11:33:05 -0700
Subject: [PATCH 2/5] Use uppercase LAST_EXHAUSTED stop_strategy in meta_config

---
 ultravox/training/configs/meta_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ultravox/training/configs/meta_config.yaml b/ultravox/training/configs/meta_config.yaml
index 62f622d3..9ccb4348 100644
--- a/ultravox/training/configs/meta_config.yaml
+++ b/ultravox/training/configs/meta_config.yaml
@@ -3,7 +3,7 @@ audio_model: "facebook/wav2vec2-base-960h"
 
 data_sets: ["gigaspeech"]
 val_sets: ["heysquad_human", "anyinstruct", "soda", "peoplespeech"]
-stop_strategy: "last_exhausted"
+stop_strategy: "LAST_EXHAUSTED"
 
 train_on_inputs: False
 shuffle_data: True

From 7859ac035b932e137208ed9cb9d855b05edcac91 Mon Sep 17 00:00:00 2001
From: Zhongqiang Huang <zhuang@fixie.ai>
Date: Thu, 15 Aug 2024 13:11:22 -0700
Subject: [PATCH 3/5] Add release_config.yaml and use it as the default

---
 mcloud.yaml                                   |  2 +-
 .../training/configs/llama3_whisper_kd.yaml   | 22 ++++-----
 ultravox/training/configs/release_config.yaml | 45 +++++++++++++++++++
 3 files changed, 54 insertions(+), 15 deletions(-)
 create mode 100644 ultravox/training/configs/release_config.yaml

diff --git a/mcloud.yaml b/mcloud.yaml
index e2fe999c..f44593de 100644
--- a/mcloud.yaml
+++ b/mcloud.yaml
@@ -14,4 +14,4 @@ command: >-
 env_variables:
   MLFLOW_TRACKING_URI: databricks
   UV_BRANCH: update_default_config_to_ultravox_v0.3
-  TRAIN_ARGS: --config_path ultravox/training/configs/llama3_whisper_kd.yaml
+  TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml
\ No newline at end of file
diff --git a/ultravox/training/configs/llama3_whisper_kd.yaml b/ultravox/training/configs/llama3_whisper_kd.yaml
index 63c9ce95..d951f02d 100644
--- a/ultravox/training/configs/llama3_whisper_kd.yaml
+++ b/ultravox/training/configs/llama3_whisper_kd.yaml
@@ -13,33 +13,27 @@ loss_config:
 # Temporarily remove heysquad_human from val_sets as it causes the training to fail.
 val_sets: ["anyinstruct", "soda", "peoplespeech"]
 
-batch_size: 24
-max_steps: 7200 # x8x24 = 1,382,400 samples
+batch_size: 4
+max_steps: 1000
 
 data_sets: []
 data_dicts:
   - path: "fixie-ai/librispeech_asr"
     name: "clean"
     splits:
-      - "train.100" # 28_539 samples
-      - "train.360" # 104_014 samples
+      - "train.100"
+      - "train.360"
     user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text }}"
-    weight: 1
+    weight: 2
+    num_samples: 100_000
   - path: "fixie-ai/librispeech_asr"
     name: "other"
     splits:
-      - "train.500" # 148_688 samples
+      - "train.500"
     user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text }}"
     weight: 1
-  - path: "fixie-ai/common_voice_17_0"
-    name: "en"
-    splits:
-      - "train" # 1_101_170 samples
-    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
-    assistant_template: "{{ continuation }}"
-    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
-    weight: 8
\ No newline at end of file
+    num_samples: 100_000
diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml
new file mode 100644
index 00000000..2f662f07
--- /dev/null
+++ b/ultravox/training/configs/release_config.yaml
@@ -0,0 +1,45 @@
+# SLM with ultravox & llama3, trained wtih knowledge distillation.
+exp_name: "ultravox-v0_3"
+
+# Make sure to accept the license agreement on huggingface hub
+text_model: "meta-llama/Meta-Llama-3-8B-Instruct"
+audio_model: "openai/whisper-small"
+
+
+loss_config:
+  # Choose from ["KL_Divergence", "CrossEntropy"], default is "KL_Divergence"
+  loss_function: "KL_Divergence"
+
+# Temporarily remove heysquad_human from val_sets as it causes the training to fail.
+val_sets: ["anyinstruct", "soda", "peoplespeech"]
+
+batch_size: 24
+max_steps: 7200 # x8x24 = 1,382,400 samples
+
+data_sets: []
+data_dicts:
+  - path: "fixie-ai/librispeech_asr"
+    name: "clean"
+    splits:
+      - "train.100" # 28_539 samples
+      - "train.360" # 104_014 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/librispeech_asr"
+    name: "other"
+    splits:
+      - "train.500" # 148_688 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text }}"
+    weight: 1
+  - path: "fixie-ai/common_voice_17_0"
+    name: "en"
+    splits:
+      - "train" # 1_101_170 samples
+    user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
+    assistant_template: "{{ continuation }}"
+    transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
+    weight: 8
\ No newline at end of file

From 206a79c3324b60444fc7ec7878b590a3759c068b Mon Sep 17 00:00:00 2001
From: Zhongqiang Huang <zhuang@fixie.ai>
Date: Thu, 15 Aug 2024 13:12:27 -0700
Subject: [PATCH 4/5] Switch release_config text_model to Llama 3.1

---
 ultravox/training/configs/release_config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml
index 2f662f07..02e4df5c 100644
--- a/ultravox/training/configs/release_config.yaml
+++ b/ultravox/training/configs/release_config.yaml
@@ -1,8 +1,8 @@
-# SLM with ultravox & llama3, trained wtih knowledge distillation.
+# SLM with ultravox & llama3.1, trained with knowledge distillation.
 exp_name: "ultravox-v0_3"
 
 # Make sure to accept the license agreement on huggingface hub
-text_model: "meta-llama/Meta-Llama-3-8B-Instruct"
+text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
 audio_model: "openai/whisper-small"
 
 

From f3441d62f0b6614b68d04e8f8675e7244f02bff6 Mon Sep 17 00:00:00 2001
From: Zhongqiang Huang <zhuang@fixie.ai>
Date: Thu, 15 Aug 2024 17:44:54 -0700
Subject: [PATCH 5/5] Reset UV_BRANCH to main; add EOF newline to release_config

---
 mcloud.yaml                                   | 2 +-
 ultravox/training/configs/release_config.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mcloud.yaml b/mcloud.yaml
index f44593de..99788954 100644
--- a/mcloud.yaml
+++ b/mcloud.yaml
@@ -13,5 +13,5 @@ command: >-
   cd ultravox && poetry install --no-dev && poetry run torchrun --nproc_per_node=8 -m ultravox.training.train $TRAIN_ARGS
 env_variables:
   MLFLOW_TRACKING_URI: databricks
-  UV_BRANCH: update_default_config_to_ultravox_v0.3
+  UV_BRANCH: main
   TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml
\ No newline at end of file
diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml
index 02e4df5c..973656a7 100644
--- a/ultravox/training/configs/release_config.yaml
+++ b/ultravox/training/configs/release_config.yaml
@@ -42,4 +42,4 @@ data_dicts:
     user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
     assistant_template: "{{ continuation }}"
     transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
-    weight: 8
\ No newline at end of file
+    weight: 8