Multilingual Hellaswag tasks (#332)
* add multilingual dynamic generative metrics

* draft

* finish multichoice config

* update tokenizers + install nltk reqs

* use punkt_tab

* Update src/lighteval/utils/imports.py

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>

* Update src/lighteval/metrics/normalizations.py

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>

* fix imports

* remove unused import

* finish implementation of templates + move stuff around

* resolve nits

* when in Rome, do as Romans do (handle error messages the same way)

* fix utils

* nicer tests + fix them

* nicer todo

* add nice docstrings 📃

* add even more docstrings

* nit

* fix test

* add multilingual to dev group

* merge nli, add languages to literals

* translation literals

* add nli

* add copa tasks + fix translation literals

* add hellaswag tasks

* remove custom telugu hellaswag

* remove hindi hellaswag

* add rcb + chinese nli

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* add two new tasks + docs

* add nice docs

* update hellaswag with docs

* move hellaswag to lighteval suite

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* enable returning None from templates + better typing

* change unofficial hellaswag names to use the community_ prefix + unify hellaswag preprocessing

* let strip be optional in hellaswag

---------

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Hynek Kydlicek <kydliceh.hynek@gmail.com>
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
4 people authored Oct 1, 2024
1 parent 01e7240 commit 994fe73
Showing 11 changed files with 522 additions and 30 deletions.
30 changes: 19 additions & 11 deletions src/lighteval/tasks/default_prompts.py
@@ -755,21 +755,29 @@ def headqa(line, task_name: str = None):
     )
 
 
-def hellaswag_harness(line, task_name: str = None):
-    def preprocess(text):
-        """Comes from AiHarness"""
-        # text = text.strip()
-        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-        text = text.replace(" [title]", ". ")
-        text = re.sub("\\[.*?\\]", "", text)
-        text = text.replace("  ", " ")
-        return text
-
+def hellaswag_preprocess(
+    text: str, wikihow_artifacts: list[str] = [" [title]"], truncate_dots: bool = False, strip_text: bool = False
+):
+    """Comes from AiHarness"""
+    # text = text.strip()
+    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+    for dot_repl in wikihow_artifacts:
+        text = text.replace(dot_repl, ". ")
+    text = re.sub("\\[.*?\\]", "", text)
+    text = text.replace("  ", " ")
+    if truncate_dots:
+        text = re.sub(r"\.+", ".", text)  # regex sub; str.replace would look for the literal "\.+"
+    if strip_text:
+        text = text.strip()
+    return text
+
+
+def hellaswag_harness(line, task_name: str = None):
     ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
     return Doc(
         task_name=task_name,
-        query=preprocess(line["activity_label"] + ": " + ctx),
-        choices=[preprocess(ending) for ending in line["endings"]],
+        query=hellaswag_preprocess(line["activity_label"] + ": " + ctx),
+        choices=[hellaswag_preprocess(ending) for ending in line["endings"]],
         gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
         # "metric": "choices_loglikelihood",
     )
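For orientation, a quick sketch (not part of the diff) of how the refactored helper behaves, using the default artifact list and the extended Turkish list that appears in tasks.py further down:

# Illustrative sketch only; hellaswag_preprocess is the helper defined above.
from lighteval.tasks.default_prompts import hellaswag_preprocess

# Default artifacts: " [title]" becomes ". ", other bracketed WikiHow markers
# are dropped, and double spaces are collapsed.
print(hellaswag_preprocess("A man [title] He grabs a guitar [step] and plays."))
# A man. He grabs a guitar and plays.

# The Turkish tasks below extend the artifact list with localized markers.
print(
    hellaswag_preprocess(
        "Nasıl yapılır [başlık] Malzemeleri hazırlayın.",
        wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
    )
)
# Nasıl yapılır. Malzemeleri hazırlayın.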
2 changes: 1 addition & 1 deletion src/lighteval/tasks/lighteval_task.py
@@ -89,7 +89,7 @@ class LightevalTaskConfig:
     """
 
     name: str
-    prompt_function: Callable[[dict, str], Doc]
+    prompt_function: Callable[[dict, str], Doc | None]
     hf_repo: str
     hf_subset: str
     metric: ListLike[Metric | Metrics]
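The widened return type is what lets templates skip rows. A minimal sketch of a prompt function using it, assuming Doc is importable from lighteval.tasks.requests (the actual filtering of None docs happens in the task machinery, not shown in this excerpt):

# Minimal sketch, not from the commit: prompt functions may now return None
# for rows that cannot produce a Doc (e.g. unlabeled test examples).
from lighteval.tasks.requests import Doc


def my_prompt_fn(line: dict, task_name: str = None) -> Doc | None:
    if line["label"] == "":  # no gold label, nothing to score
        return None
    return Doc(
        task_name=task_name,
        query=line["ctx_a"],
        choices=line["endings"],
        gold_index=int(line["label"]),
    )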
142 changes: 142 additions & 0 deletions src/lighteval/tasks/multilingual/tasks.py
@@ -27,6 +27,7 @@
 from lighteval.metrics.normalizations import LogProbTokenNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
 from lighteval.tasks.templates.nli import get_nli_prompt_function
 from lighteval.tasks.templates.utils.formulation import (
     CFFormulation,
@@ -386,6 +387,9 @@
         ),
         hf_repo="ai4bharat/IndicCOPA",
         hf_subset=f"translation-{standardize_tag(language.value)}",
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188",
         evaluation_splits=["test"],
         metric=[
             loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
@@ -443,6 +447,141 @@
 ]
 
 
+# ------------------------------- Hellaswag Tasks ------------------------------- #
+# Hellaswag is a commonsense reasoning task that requires models to complete a given scenario
+# with the most plausible ending. It tests the model's ability to understand and reason about
+# everyday situations and human behavior.
+
+# MLMM-Hellaswag: Multilingual adaptation of Hellaswag
+# Paper: https://arxiv.org/abs/2306.07610
+# This is a multilingual version of Hellaswag, part of the MLMM (Massive Language Model Meta-Evaluation) benchmark.
+# It evaluates commonsense reasoning abilities across multiple languages.
+mlmm_hellaswag_tasks = [
+    LightevalTaskConfig(
+        name=f"hellaswag_{lang.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=lang,
+            adapter=lambda line: {
+                # We don't use activity_label, as it is not available here
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="jon-tow/okapi_hellaswag",
+        hf_subset=standardize_tag(lang.value),
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+        trust_dataset=True,
+    )
+    for lang in [
+        Language.ARABIC,
+        Language.BENGALI,
+        Language.CATALAN,
+        Language.DANISH,
+        Language.GERMAN,
+        Language.SPANISH,
+        Language.BASQUE,
+        Language.FRENCH,
+        Language.GUJARATI,
+        Language.HINDI,
+        Language.CROATIAN,
+        Language.HUNGARIAN,
+        Language.ARMENIAN,
+        Language.INDONESIAN,
+        Language.ICELANDIC,
+        Language.ITALIAN,
+        Language.KANNADA,
+        Language.MALAYALAM,
+        Language.MARATHI,
+        Language.NORWEGIAN,
+        Language.NEPALI,
+        Language.DUTCH,
+        Language.PORTUGUESE,
+        Language.ROMANIAN,
+        Language.RUSSIAN,
+        Language.SLOVAK,
+        Language.SERBIAN,
+        Language.SWEDISH,
+        Language.TAMIL,
+        Language.TELUGU,
+        Language.UKRAINIAN,
+        Language.VIETNAMESE,
+        Language.CHINESE,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Turkish
+# This is a Turkish adaptation of the Hellaswag task.
+# While there's no specific paper for this version, it has been found to work well for evaluating
+# Turkish language models on commonsense reasoning tasks.
+
+# We don't handle these in a single task, as there are quite a few differences
+# (dataset/subset, dot replacement, etc.) that would make the code hard to read.
+hellaswag_tur_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.TURKISH,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+            # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py
+            wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
+        ),
+        hf_repo="malhajar/hellaswag_tr-v0.2",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Thai
+# This is a Thai adaptation of the Hellaswag task.
+# Similar to the Turkish version, there's no specific paper, but it has been found to be effective
+# for evaluating Thai language models on commonsense reasoning tasks.
+hellaswag_tha_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.THAI,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="HuggingFaceFW-Dev/hellaswag_thai",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
 TASKS_TABLE = [
     *xnli_tasks,
     *xnli2_tasks,
@@ -454,4 +593,7 @@
     *xcopa_tasks,
     *copa_indic_tasks,
     *parus_tasks,
+    *mlmm_hellaswag_tasks,
+    *hellaswag_tur_tasks,
+    *hellaswag_tha_tasks,
 ]
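Each of the three blocks above expands to one task config per (language, formulation) pair. A small sketch of the resulting naming scheme, assuming the formulation classes carry the name values "MCF", "CF" and "Hybrid" (they are defined outside this diff):

# Sketch: how the Turkish list comprehension above derives its task names.
from lighteval.tasks.templates.utils.formulation import (
    CFFormulation,
    HybridFormulation,
    MCFFormulation,
)

for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]:
    print(f"community_hellaswag_tr_{formulation.name.lower()}")
# community_hellaswag_tr_mcf
# community_hellaswag_tr_cf
# community_hellaswag_tr_hybrid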
18 changes: 14 additions & 4 deletions src/lighteval/tasks/templates/continuation.py
@@ -84,7 +84,7 @@ class ContinuationDictAdapter(TypedDict):
 
 def get_continuation_prompt_function(
     language: Language,
-    adapter: Callable[[dict], ContinuationInput] | ContinuationDictAdapter,
+    adapter: Callable[[dict], ContinuationInput | None] | ContinuationDictAdapter,
     formulation: Formulation = MCFFormulation(),
 ):
     """
@@ -121,11 +121,13 @@ def get_continuation_prompt_function(
     Returns:
         Callable: A function that generates Continuation prompt based on the given parameters.
     """
-    adapter_fn: Callable[[dict], ContinuationInput] = create_adapter_from_dict(adapter)  # type: ignore
+    adapter_fn = create_adapter_from_dict(adapter)
     translation_literals = TRANSLATION_LITERALS[language]
 
     def prepare_prompt(line: dict):
         cont_input = adapter_fn(line)
+        if cont_input is None:
+            return None
 
         instruction_val = cont_input.get("instruction")
         instruction = f"{instruction_val}\n" if instruction_val else ""
@@ -140,7 +142,11 @@ def prepare_prompt(line: dict):
         return cont_input, instruction, context, continuations
 
     def prompt_fn_cf(line, task_name: str):
-        cont_input, instruction, context, continuations = prepare_prompt(line)
+        prepared_prompt = prepare_prompt(line)
+        if prepared_prompt is None:
+            return None
+
+        cont_input, instruction, context, continuations = prepared_prompt
 
         context_follows_sentence_space = punctuation_ends_sentence(context, translation_literals)
         answers = build_answers(continuations, formulation, translation_literals, context_follows_sentence_space)
@@ -160,7 +166,11 @@ def prompt_fn_cf(line, task_name: str):
         )
 
     def prompt_fn_mcf(line, task_name: str):
-        cont_input, instruction, context, continuations = prepare_prompt(line)
+        prepared_prompt = prepare_prompt(line)
+        if prepared_prompt is None:
+            return None
+
+        cont_input, instruction, context, continuations = prepared_prompt
 
         options = build_choices(continuations, formulation, translation_literals)
         options = f"{options}\n" if options else ""
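As the copa.py hunk below shows, adapter accepts either a plain field-mapping dict (wrapped by create_adapter_from_dict) or a callable, and a callable may now return None to drop a row. A hedged sketch of both forms, with made-up dataset columns:

# Sketch with illustrative column names ("ctx", "endings", "label"); the
# Language import path is assumed from the lighteval layout.
from lighteval.tasks.templates.continuation import get_continuation_prompt_function
from lighteval.utils.language import Language

# 1) Dict form: template field -> dataset column, wrapped internally by
#    create_adapter_from_dict.
prompt_fn = get_continuation_prompt_function(
    Language.ENGLISH,
    {"context": "ctx", "continuations": "endings", "gold_idx": "label"},
)

# 2) Callable form: full control, and it may return None to skip a row.
prompt_fn = get_continuation_prompt_function(
    Language.ENGLISH,
    lambda line: {
        "context": line["ctx"],
        "continuations": line["endings"],
        "gold_idx": int(line["label"]),
    }
    if line["label"] != ""
    else None,
)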
9 changes: 7 additions & 2 deletions src/lighteval/tasks/templates/copa.py
@@ -74,7 +74,9 @@ class COPAAdapter(TypedDict):
 
 
 def get_copa_prompt_function(
-    language: Language, adapter: Callable[[dict], COPAInput] | COPAAdapter, formulation: Formulation = MCFFormulation()
+    language: Language,
+    adapter: Callable[[dict], COPAInput | None] | COPAAdapter,
+    formulation: Formulation = MCFFormulation(),
 ):
     """
     Create a templated prompt function for a COPA task.
@@ -109,7 +111,7 @@ def get_copa_prompt_function(
     Returns:
         Callable: A function that generates COPA prompts based on the given parameters.
     """
-    adapter_fn: Callable[[dict], COPAInput] = create_adapter_from_dict(adapter)  # type: ignore
+    adapter_fn = create_adapter_from_dict(adapter)
     continuation_prompt_fn = get_continuation_prompt_function(
         language, {"context": "context", "continuations": "continuations", "gold_idx": "gold_idx"}, formulation
     )
@@ -120,6 +122,9 @@ def copa_prompt(
         task_name: str,
     ):
         input_data = adapter_fn(line)
+        if input_data is None:
+            return None
+
         context = capitalize(input_data["context"].rstrip(PUNCT))
         cause_or_effect_trans = (
             translation_literals.cause_word
(Diffs for the remaining 6 changed files are not shown here.)
