Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speech Translation Evals #54

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 196 additions & 12 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ openai = "~1.33.0"
jiwer = "~3.0.4"
tensorboardx = "~2.6.2.2"
wandb = "~0.17.1"
sacrebleu = "^2.4.2"

[tool.poetry.group.dev.dependencies]
black = "~24.4.2"
Expand Down
80 changes: 80 additions & 0 deletions ultravox/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,85 @@ def _get_sample(self, row) -> VoiceSample:
return self._get_transcribe_sample(row, tcol="sentence")


class CoVoST2Dataset(VoiceDataset):
    """
    CoVoST 2 is a large-scale multilingual speech translation corpus covering translations from 21 languages into English
    and from English into 15 languages. The dataset is created using Mozilla's open-source Common Voice 4 database of
    crowdsourced voice recordings. There are 2,900 hours of speech represented in the corpus.

    The original Hugging Face dataset link: https://huggingface.co/datasets/facebook/covost2
    Since this dataset requires audio files to be downloaded separately, a new dataset is created with the audio files:
    https://huggingface.co/datasets/fixie-ai/covost2

    Due to the scale of the dataset and the audio files being repeated, only a portion of the dataset was converted.
    See [this issue](https://github.com/fixie-ai/ultravox/issues/50) for more information.

    Supported subsets (En -> X):
        'en_de', 'en_tr', 'en_fa', 'en_sv-SE', 'en_mn', 'en_zh-CN', 'en_cy',
        'en_ca', 'en_sl', 'en_et', 'en_id', 'en_ar', 'en_ta', 'en_lv', 'en_ja'
    Supported subsets (X -> En):
        'fr_en', 'zh-CN_en', 'es_en'
    """

    # Maps a CoVoST 2 language code (as used in subset names) to the English
    # language name used when formatting the translation prompts below.
    CODE_TO_LANG = {
        "en": "English",
        "de": "German",
        "tr": "Turkish",
        "fa": "Persian",
        "sv-SE": "Swedish",
        "mn": "Mongolian",
        "zh-CN": "Chinese",
        "cy": "Welsh",
        "ca": "Catalan",
        "sl": "Slovenian",
        "et": "Estonian",
        "id": "Indonesian",
        "ar": "Arabic",
        "ta": "Tamil",
        "lv": "Latvian",
        "ja": "Japanese",
        "fr": "French",
        "es": "Spanish",
    }

    # We currently don't use this dataset for training, so mainly the first prompt is ever used.
    TRANSLATE_PROMPTS = [
        "Translate the following into {target} language: <|audio|>",
        "Translate the following into {target}: <|audio|>",
        "Please convert the following into {target}.\n<|audio|>",
        "Could you translate this to {target} language?\n<|audio|>",
        "Translate the text below to {target}. <|audio|>",
        "Translate the subsequent text into {target} language. <|audio|>",
        "Can you translate this into the {target} language?\n<|audio|>",
        "Transform the following to {target}: <|audio|>",
    ]

    def __init__(self, args: VoiceDatasetArgs, subset: str) -> None:
        """
        Args:
            args: Common dataset options (split, include_audio, etc.).
            subset: A "<source>_<target>" language-pair name, e.g. "en_de" or
                "zh-CN_en". See the class docstring for the supported subsets.
        """
        super().__init__(args)
        # Validate the subset before loading: the HF dataset load is expensive,
        # so a malformed subset should fail fast with a clear message instead of
        # a late KeyError.
        langs = subset.split("_")
        assert len(langs) == 2, f"Invalid subset: {subset}"
        for code in langs:
            assert (
                code in self.CODE_TO_LANG
            ), f"Unknown language code {code!r} in subset: {subset}"
        self.source_lang = self.CODE_TO_LANG[langs[0]]
        self.target_lang = self.CODE_TO_LANG[langs[1]]
        dataset = self._load_audio_dataset(
            "fixie-ai/covost2", subset, split=args.split.value
        )
        self._init_dataset(dataset)

    def _get_sample(self, row) -> VoiceSample:
        """Build a translation sample: prompt asks for the target language; the
        expected answer is the row's reference translation."""
        prompt = self._choice(self.TRANSLATE_PROMPTS).format(target=self.target_lang)

        transcript = row["sentence"]
        translation = row["translation"]
        # In text-only mode the source transcript is inlined in place of the
        # audio placeholder token.
        if not self._args.include_audio:
            prompt = prompt.replace("<|audio|>", transcript)

        return self._make_sample(
            _get_messages(prompt, translation),
            self._get_audio(row),
            audio_transcript=transcript,
        )


class PeopleSpeechDataset(VoiceDataset):
"""
The People's Speech Dataset is among the world's largest English speech
Expand Down Expand Up @@ -882,6 +961,7 @@ def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
"librispeech": LibriSpeechDataset,
"voxpopuli": VoxPopuliDataset,
"commonvoice": CommonVoiceDataset,
"covost2": CoVoST2Dataset,
"peoplespeech": PeopleSpeechDataset,
"soda": SodaDataset,
"dummy": LibriSpeechDummyDataset,
Expand Down
21 changes: 11 additions & 10 deletions ultravox/evaluation/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@
from ultravox.evaluation import string_based
from ultravox.evaluation import wer

# Maps a metric name (as referenced by eval configs/scenarios) to the function
# that evaluates a single sample for that metric. Each evaluator takes an
# eval_types.Sample and returns an eval_types.Result.
METRIC_REGISTRY = {
    "asr": wer.evaluate_answer_asr,
    "boolq": gpt_eval_boolq.evaluate_answer_boolq,
    "instruct": gpt_eval_instruct.evaluate_answer_instruct,
    "conversation": gpt_eval_conv.evaluate_conversation_response,
    "exact_match_last_word": string_based.match_last_word,
    "bleu": string_based.bleu,
}


def evaluate_answer(sample: eval_types.Sample, metric: str) -> eval_types.Result:
    """Evaluate a single sample using the evaluator registered under `metric`.

    Raises:
        ValueError: if no evaluator is registered for `metric`.
    """
    evaluator = METRIC_REGISTRY.get(metric)
    if evaluator is None:
        raise ValueError(f"Unknown metric: {metric}")
    return evaluator(sample)
10 changes: 10 additions & 0 deletions ultravox/evaluation/eval_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,14 @@ class ExactMatchResult:
reason: str


@dataclasses.dataclass
class BleuResult:
    """
    Result of a BLEU evaluation for a single generated answer.

    Note: BLEU is supposed to be computed on a corpus level, not on a single sample.
    """

    # BLEU score for the generated answer.
    score: float


# Union of every metric result type. BleuResult must be included here so that
# evaluators returning it (string_based.bleu) conform to this declared type.
Result = Union[InstructResult, WerResult, ExactMatchResult, BleuResult]
16 changes: 16 additions & 0 deletions ultravox/evaluation/string_based.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re

import sacrebleu

from ultravox.evaluation import eval_types


Expand All @@ -21,3 +23,17 @@ def match_last_word(sample: eval_types.Sample) -> eval_types.ExactMatchResult:
return eval_types.ExactMatchResult(
score=last_word == expected_tf, reason="exact_match check"
)


def bleu(sample: eval_types.Sample) -> eval_types.BleuResult:
    """
    Compute BLEU score for a single sample.

    Note: BLEU is supposed to be computed on a corpus level, not on a single sample.
    As such, reported values here might not be easily comparable to other metrics.
    """
    result = sacrebleu.sentence_bleu(
        hypothesis=sample.generated_answer,
        references=[sample.expected_answer],
    )
    return eval_types.BleuResult(score=result.score)
24 changes: 21 additions & 3 deletions ultravox/training/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,29 @@ class EvalScenario:


EVAL_SCENARIOS = [
# automatic speech recognition scenarios
EvalScenario("boolq__wer", "boolq_in", "asr"),
# automatic speech translation scenarios
EvalScenario("covost2_en_de__bleu", "covost2:en_de", "bleu"),
EvalScenario("covost2_en_zh-CN__bleu", "covost2:en_zh-CN", "bleu"),
EvalScenario("covost2_es_en__bleu", "covost2:es_en", "bleu"),
EvalScenario(
"covost2_en_de__bleu__text_only", "covost2:en_de", "bleu", include_audio=False
),
EvalScenario(
"covost2_en_zh-CN__bleu__text_only",
"covost2:en_zh-CN",
"bleu",
include_audio=False,
),
EvalScenario(
"covost2_es_en__bleu__text_only", "covost2:es_en", "bleu", include_audio=False
),
# SQA scenarios
EvalScenario("anyinstruct__instruct_follow", "anyinstruct", "instruct"),
EvalScenario(
"boolq__binary", "boolq_extended", "exact_match_last_word", new_tokens=128
),
EvalScenario("boolq__wer", "boolq_in", "asr"),
EvalScenario("soda__sensible_generation", "soda", "conversation", new_tokens=64),
# Text-only scenarios: tests for catastrophic forgetting.
EvalScenario(
"anyinstruct__instruct_follow__text_only",
"anyinstruct",
Expand All @@ -78,6 +94,8 @@ class EvalScenario:
new_tokens=128,
include_audio=False,
),
# Conversation dialogue scenarios
EvalScenario("soda__sensible_generation", "soda", "conversation", new_tokens=64),
EvalScenario(
"soda__sensible_generation__text_only",
"soda",
Expand Down
Loading