Multilingual Hellaswag tasks (#332)
* add multilingual dynamic generative metrics

* draft

* finish multichoice config

* update tokenizers + install nltk reqs

* use punkt_tab

* Update src/lighteval/utils/imports.py

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>

* Update src/lighteval/metrics/normalizations.py

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>

* fix imports

* remove unused import

* finish implementation of templates + move stuff around

* resolve nits

* when in Rome, do as Romans do (handle error messages the same way)

* fix utils

* nicer tests + fix them

* nicer todo

* add nice docstrings 📃

* add even more docstrings

* nit

* fix test

* add multilingual to dev group

* merge nli, add languages to literals

* translation literals

* add nli

* add copa tasks + fix translation literals

* add hellaswag tasks

* remove custom telugu hellaswag

* remove hindi hellaswag

* add rcb + chinese nli

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* add two new tasks + docs

* add nice docs

* update hellaswag with docs

* move hellaswag to lighteval suite

* Update src/lighteval/tasks/multilingual/tasks.py

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>

* enable returning None from templates + better typing

* change unofficial hellaswag names to use the community_ prefix + unify hellaswag preprocessing

* let strip be optional in hellaswag

---------

Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Co-authored-by: Hynek Kydlicek <kydliceh.hynek@gmail.com>
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
4 people authored Oct 1, 2024
1 parent 01e7240 commit 994fe73
Showing 11 changed files with 522 additions and 30 deletions.
30 changes: 19 additions & 11 deletions src/lighteval/tasks/default_prompts.py
@@ -755,21 +755,29 @@ def headqa(line, task_name: str = None):
     )
 
 
-def hellaswag_harness(line, task_name: str = None):
-    def preprocess(text):
-        """Comes from AiHarness"""
-        # text = text.strip()
-        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-        text = text.replace(" [title]", ". ")
-        text = re.sub("\\[.*?\\]", "", text)
-        text = text.replace("  ", " ")
-        return text
-
+def hellaswag_preprocess(
+    text: str, wikihow_artifacts: list[str] = [" [title]"], truncate_dots: bool = False, strip_text: bool = False
+):
+    """Comes from AiHarness"""
+    # text = text.strip()
+    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+    for dot_repl in wikihow_artifacts:
+        text = text.replace(dot_repl, ". ")
+    text = re.sub("\\[.*?\\]", "", text)
+    text = text.replace("  ", " ")
+    if truncate_dots:
+        text = re.sub(r"\.+", ".", text)  # regex sub; str.replace would look for the literal "\.+"
+    if strip_text:
+        text = text.strip()
+    return text
+
+
+def hellaswag_harness(line, task_name: str = None):
     ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
     return Doc(
         task_name=task_name,
-        query=preprocess(line["activity_label"] + ": " + ctx),
-        choices=[preprocess(ending) for ending in line["endings"]],
+        query=hellaswag_preprocess(line["activity_label"] + ": " + ctx),
+        choices=[hellaswag_preprocess(ending) for ending in line["endings"]],
         gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
         # "metric": "choices_loglikelihood",
     )
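For orientation, a quick sketch (not part of the diff) of how the refactored helper behaves, using the default artifact list and the extended Turkish list that appears in tasks.py further down:

# Illustrative sketch only; hellaswag_preprocess is the helper defined above.
from lighteval.tasks.default_prompts import hellaswag_preprocess

# Default artifacts: " [title]" becomes ". ", other bracketed WikiHow markers
# are dropped, and double spaces are collapsed.
print(hellaswag_preprocess("A man [title] He grabs a guitar [step] and plays."))
# A man. He grabs a guitar and plays.

# The Turkish tasks below extend the artifact list with localized markers.
print(
    hellaswag_preprocess(
        "Nasıl yapılır [başlık] Malzemeleri hazırlayın.",
        wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
    )
)
# Nasıl yapılır. Malzemeleri hazırlayın.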
2 changes: 1 addition & 1 deletion src/lighteval/tasks/lighteval_task.py
@@ -89,7 +89,7 @@ class LightevalTaskConfig:
     """
 
     name: str
-    prompt_function: Callable[[dict, str], Doc]
+    prompt_function: Callable[[dict, str], Doc | None]
     hf_repo: str
     hf_subset: str
     metric: ListLike[Metric | Metrics]
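The widened return type is what lets templates skip rows. A minimal sketch of a prompt function using it, assuming Doc is importable from lighteval.tasks.requests (the actual filtering of None docs happens in the task machinery, not shown in this excerpt):

# Minimal sketch, not from the commit: prompt functions may now return None
# for rows that cannot produce a Doc (e.g. unlabeled test examples).
from lighteval.tasks.requests import Doc


def my_prompt_fn(line: dict, task_name: str = None) -> Doc | None:
    if line["label"] == "":  # no gold label, nothing to score
        return None
    return Doc(
        task_name=task_name,
        query=line["ctx_a"],
        choices=line["endings"],
        gold_index=int(line["label"]),
    )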
142 changes: 142 additions & 0 deletions src/lighteval/tasks/multilingual/tasks.py
@@ -27,6 +27,7 @@
 from lighteval.metrics.normalizations import LogProbTokenNorm
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
 from lighteval.tasks.templates.nli import get_nli_prompt_function
 from lighteval.tasks.templates.utils.formulation import (
     CFFormulation,
@@ -386,6 +387,9 @@
         ),
         hf_repo="ai4bharat/IndicCOPA",
         hf_subset=f"translation-{standardize_tag(language.value)}",
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188",
         evaluation_splits=["test"],
         metric=[
             loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
@@ -443,6 +447,141 @@
 ]
 
 
+# ------------------------------- Hellaswag Tasks ------------------------------- #
+# Hellaswag is a commonsense reasoning task that requires models to complete a given scenario
+# with the most plausible ending. It tests the model's ability to understand and reason about
+# everyday situations and human behavior.
+
+# MLMM-Hellaswag: Multilingual adaptation of Hellaswag
+# Paper: https://arxiv.org/abs/2306.07610
+# This is a multilingual version of Hellaswag, part of the MLMM (Massive Language Model Meta-Evaluation) benchmark.
+# It evaluates commonsense reasoning abilities across multiple languages.
+mlmm_hellaswag_tasks = [
+    LightevalTaskConfig(
+        name=f"hellaswag_{lang.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=lang,
+            adapter=lambda line: {
+                # We don't use activity_label, as it is not available here
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="jon-tow/okapi_hellaswag",
+        hf_subset=standardize_tag(lang.value),
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+        trust_dataset=True,
+    )
+    for lang in [
+        Language.ARABIC,
+        Language.BENGALI,
+        Language.CATALAN,
+        Language.DANISH,
+        Language.GERMAN,
+        Language.SPANISH,
+        Language.BASQUE,
+        Language.FRENCH,
+        Language.GUJARATI,
+        Language.HINDI,
+        Language.CROATIAN,
+        Language.HUNGARIAN,
+        Language.ARMENIAN,
+        Language.INDONESIAN,
+        Language.ICELANDIC,
+        Language.ITALIAN,
+        Language.KANNADA,
+        Language.MALAYALAM,
+        Language.MARATHI,
+        Language.NORWEGIAN,
+        Language.NEPALI,
+        Language.DUTCH,
+        Language.PORTUGUESE,
+        Language.ROMANIAN,
+        Language.RUSSIAN,
+        Language.SLOVAK,
+        Language.SERBIAN,
+        Language.SWEDISH,
+        Language.TAMIL,
+        Language.TELUGU,
+        Language.UKRAINIAN,
+        Language.VIETNAMESE,
+        Language.CHINESE,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Turkish
+# This is a Turkish adaptation of the Hellaswag task.
+# While there's no specific paper for this version, it has been found to work well for evaluating
+# Turkish language models on commonsense reasoning tasks.
+
+# We don't handle these in a single task, as there are quite a few differences
+# (dataset/subset, dot replacement, etc.) that would make the code hard to read.
+hellaswag_tur_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.TURKISH,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+            # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py
+            wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
+        ),
+        hf_repo="malhajar/hellaswag_tr-v0.2",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Thai
+# This is a Thai adaptation of the Hellaswag task.
+# Similar to the Turkish version, there's no specific paper, but it has been found to be effective
+# for evaluating Thai language models on commonsense reasoning tasks.
+hellaswag_tha_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.THAI,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="HuggingFaceFW-Dev/hellaswag_thai",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
 TASKS_TABLE = [
     *xnli_tasks,
     *xnli2_tasks,
@@ -454,4 +593,7 @@
     *xcopa_tasks,
     *copa_indic_tasks,
     *parus_tasks,
+    *mlmm_hellaswag_tasks,
+    *hellaswag_tur_tasks,
+    *hellaswag_tha_tasks,
 ]
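Each of the three blocks above expands to one task config per (language, formulation) pair. A small sketch of the resulting naming scheme, assuming the formulation classes carry the name values "MCF", "CF" and "Hybrid" (they are defined outside this diff):

# Sketch: how the Turkish list comprehension above derives its task names.
from lighteval.tasks.templates.utils.formulation import (
    CFFormulation,
    HybridFormulation,
    MCFFormulation,
)

for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]:
    print(f"community_hellaswag_tr_{formulation.name.lower()}")
# community_hellaswag_tr_mcf
# community_hellaswag_tr_cf
# community_hellaswag_tr_hybrid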
18 changes: 14 additions & 4 deletions src/lighteval/tasks/templates/continuation.py
@@ -84,7 +84,7 @@ class ContinuationDictAdapter(TypedDict):
 
 def get_continuation_prompt_function(
     language: Language,
-    adapter: Callable[[dict], ContinuationInput] | ContinuationDictAdapter,
+    adapter: Callable[[dict], ContinuationInput | None] | ContinuationDictAdapter,
     formulation: Formulation = MCFFormulation(),
 ):
     """
@@ -121,11 +121,13 @@ def get_continuation_prompt_function(
     Returns:
         Callable: A function that generates Continuation prompt based on the given parameters.
     """
-    adapter_fn: Callable[[dict], ContinuationInput] = create_adapter_from_dict(adapter)  # type: ignore
+    adapter_fn = create_adapter_from_dict(adapter)
     translation_literals = TRANSLATION_LITERALS[language]
 
     def prepare_prompt(line: dict):
         cont_input = adapter_fn(line)
+        if cont_input is None:
+            return None
 
         instruction_val = cont_input.get("instruction")
         instruction = f"{instruction_val}\n" if instruction_val else ""
@@ -140,7 +142,11 @@ def prepare_prompt(line: dict):
         return cont_input, instruction, context, continuations
 
     def prompt_fn_cf(line, task_name: str):
-        cont_input, instruction, context, continuations = prepare_prompt(line)
+        prepared_prompt = prepare_prompt(line)
+        if prepared_prompt is None:
+            return None
+
+        cont_input, instruction, context, continuations = prepared_prompt
 
         context_follows_sentence_space = punctuation_ends_sentence(context, translation_literals)
         answers = build_answers(continuations, formulation, translation_literals, context_follows_sentence_space)
@@ -160,7 +166,11 @@ def prompt_fn_cf(line, task_name: str):
         )
 
     def prompt_fn_mcf(line, task_name: str):
-        cont_input, instruction, context, continuations = prepare_prompt(line)
+        prepared_prompt = prepare_prompt(line)
+        if prepared_prompt is None:
+            return None
+
+        cont_input, instruction, context, continuations = prepared_prompt
 
         options = build_choices(continuations, formulation, translation_literals)
         options = f"{options}\n" if options else ""
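As the copa.py hunk below shows, adapter accepts either a plain field-mapping dict (wrapped by create_adapter_from_dict) or a callable, and a callable may now return None to drop a row. A hedged sketch of both forms, with made-up dataset columns:

# Sketch with illustrative column names ("ctx", "endings", "label"); the
# Language import path is assumed from the lighteval layout.
from lighteval.tasks.templates.continuation import get_continuation_prompt_function
from lighteval.utils.language import Language

# 1) Dict form: template field -> dataset column, wrapped internally by
#    create_adapter_from_dict.
prompt_fn = get_continuation_prompt_function(
    Language.ENGLISH,
    {"context": "ctx", "continuations": "endings", "gold_idx": "label"},
)

# 2) Callable form: full control, and it may return None to skip a row.
prompt_fn = get_continuation_prompt_function(
    Language.ENGLISH,
    lambda line: {
        "context": line["ctx"],
        "continuations": line["endings"],
        "gold_idx": int(line["label"]),
    }
    if line["label"] != ""
    else None,
)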
9 changes: 7 additions & 2 deletions src/lighteval/tasks/templates/copa.py
@@ -74,7 +74,9 @@ class COPAAdapter(TypedDict):
 
 
 def get_copa_prompt_function(
-    language: Language, adapter: Callable[[dict], COPAInput] | COPAAdapter, formulation: Formulation = MCFFormulation()
+    language: Language,
+    adapter: Callable[[dict], COPAInput | None] | COPAAdapter,
+    formulation: Formulation = MCFFormulation(),
 ):
     """
     Create a templated prompt function for a COPA task.
@@ -109,7 +111,7 @@ def get_copa_prompt_function(
     Returns:
         Callable: A function that generates COPA prompts based on the given parameters.
     """
-    adapter_fn: Callable[[dict], COPAInput] = create_adapter_from_dict(adapter)  # type: ignore
+    adapter_fn = create_adapter_from_dict(adapter)
     continuation_prompt_fn = get_continuation_prompt_function(
         language, {"context": "context", "continuations": "continuations", "gold_idx": "gold_idx"}, formulation
     )
@@ -120,6 +122,9 @@ def copa_prompt(
         task_name: str,
     ):
         input_data = adapter_fn(line)
+        if input_data is None:
+            return None
+
         context = capitalize(input_data["context"].rstrip(PUNCT))
         cause_or_effect_trans = (
             translation_literals.cause_word
(Diffs for the remaining 6 changed files are not shown here.)
