refactor: add metadata basemodel #260

Merged

23 commits:
1903f89  refactor: rename description to metadata dict (MartinBernstorff, Mar 19, 2024)
df7672d  refactor: add TaskMetadata and first example (MartinBernstorff, Mar 19, 2024)
76d496d  update 9 files (MartinBernstorff, Mar 19, 2024)
e19c80a  update TaskMetadata.py (MartinBernstorff, Mar 19, 2024)
13cdf1c  update TaskMetadata.py (MartinBernstorff, Mar 19, 2024)
91ced73  update TaskMetadata.py (MartinBernstorff, Mar 19, 2024)
5e6c991  update LICENSE, TaskMetadata.py and requirements.dev.txt (MartinBernstorff, Mar 20, 2024)
906bd8e  update 151 files (MartinBernstorff, Mar 20, 2024)
6993cba  update 150 files (MartinBernstorff, Mar 20, 2024)
b128576  update 43 files and delete 1 file (MartinBernstorff, Mar 20, 2024)
af1528d  update 106 files (MartinBernstorff, Mar 20, 2024)
37ce529  update 45 files (MartinBernstorff, Mar 20, 2024)
1737780  update 6 files (MartinBernstorff, Mar 20, 2024)
c8fde5b  update 14 files (MartinBernstorff, Mar 20, 2024)
a0b9dce  Added model results to repo and updated CLI to create consistent fold… (KennethEnevoldsen, Mar 19, 2024)
7c2ed71  Restructing the readme (#262) (KennethEnevoldsen, Mar 20, 2024)
bfa025e  Merge remote-tracking branch 'upstream/main' into mbern_add_metadata_… (MartinBernstorff, Mar 20, 2024)
d252ee1  build(deps): update TaskMetadata.py and pyproject.toml (MartinBernstorff, Mar 21, 2024)
ad81a24  Merge branch 'main' into mbern_add_metadata_basemodel (MartinBernstorff, Mar 21, 2024)
ec24f9f  update 221 files (MartinBernstorff, Mar 21, 2024)
2db1e19  build(deps): update pyproject.toml (MartinBernstorff, Mar 21, 2024)
409cb2a  build(deps): update pyproject.toml (MartinBernstorff, Mar 21, 2024)
63c7b1d  build(deps): update pyproject.toml (MartinBernstorff, Mar 21, 2024)
10 changes: 7 additions & 3 deletions mteb/abstasks/AbsTask.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import random
 from abc import ABC, abstractmethod
 
@@ -24,17 +26,19 @@ def load_data(self, **kwargs):
         """
         Load dataset from HuggingFace hub
         """
-        if self.data_loaded: return
+        if self.data_loaded:
+            return
 
         # TODO: add split argument
         self.dataset = datasets.load_dataset(
-            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+            self.metadata_dict["hf_hub_name"],
+            revision=self.metadata_dict.get("revision", None),
         )
         self.data_loaded = True
 
     @property
     @abstractmethod
-    def description(self):
+    def metadata_dict(self) -> dict[str, str]:
         """
         Returns a description of the task. Should contain the following fields:
         name: Name of the task (usually equal to the class name. Should be a valid name for a path on disc)
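For orientation, a hedged sketch of what a concrete task could supply for the renamed abstract property. The class and dataset id are hypothetical, and the real tasks in this PR derive this dict from the new TaskMetadata model, whose exact schema is not shown in this diff:

```python
# Hypothetical subclass, for illustration only; not part of this diff.
from mteb.abstasks.AbsTask import AbsTask


class ToyTask(AbsTask):
    # evaluate() and the other abstract members are omitted, so this class
    # only illustrates the metadata_dict contract documented above.
    @property
    def metadata_dict(self) -> dict[str, str]:
        # Keys mirror the old `description` dict; in practice some values
        # (e.g. eval_splits) are lists despite the dict[str, str] annotation.
        return {
            "name": "ToyTask",
            "hf_hub_name": "user/toy-dataset",  # hypothetical HF dataset id
            "revision": "main",
            "main_score": "accuracy",
        }
```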
12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTaskBitextMining.py

@@ -11,7 +11,7 @@ class AbsTaskBitextMining(AbsTask):
     Abstract class for BitextMining tasks
     The similarity is computed between pairs and the results are ranked.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         id: str
         sentence1: str
         sentence2: str
@@ -28,13 +28,13 @@ def evaluate(self, model, split, **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
                 )
                 data_split = self.dataset[lang][split]
                 scores[lang] = self._evaluate_split(model, data_split, **kwargs)
         else:
             logger.info(
                 f"\nTask: {self.description['name']}, split: {split}. Running..."
-                f"\nTask: {self.description['name']}, split: {split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
             )
             data_split = self.dataset[split]
             scores = self._evaluate_split(model, data_split, **kwargs)
@@ -72,9 +72,9 @@ def _evaluate_split(self, model, data_split, **kwargs):
         return metrics
 
     def _add_main_score(self, scores):
-        if self.description["main_score"] in scores:
-            scores["main_score"] = scores[self.description["main_score"]]
+        if self.metadata_dict["main_score"] in scores:
+            scores["main_score"] = scores[self.metadata_dict["main_score"]]
         else:
             logger.warn(
-                f"main score {self.description['main_score']} not found in scores {scores.keys()}"
+                f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
             )
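The _add_main_score helper, repeated verbatim across the task classes in this diff, just copies the metric named in the task metadata under a fixed key. A standalone sketch of the same logic (using the non-deprecated logger.warning in place of logger.warn):

```python
import logging

logger = logging.getLogger(__name__)


def add_main_score(scores: dict, main_score: str) -> None:
    """Copy the metric named by the task metadata under the fixed 'main_score' key."""
    if main_score in scores:
        scores["main_score"] = scores[main_score]
    else:
        logger.warning(
            "main score %s not found in scores %s", main_score, list(scores)
        )


scores = {"accuracy": 0.88, "f1": 0.91}
add_main_score(scores, "f1")
assert scores["main_score"] == 0.91
```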
23 changes: 15 additions & 8 deletions mteb/abstasks/AbsTaskClassification.py

@@ -18,7 +18,7 @@ class AbsTaskClassification(AbsTask):
     Abstract class for kNN classification tasks
     The similarity is computed between pairs and the results are ranked.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         text: str
         label: int
     """
@@ -40,23 +40,30 @@ def __init__(
         self.n_experiments = (
             n_experiments
             if n_experiments is not None
-            else self.description.get("n_experiments", 10)
+            else self.metadata_dict.get("n_experiments", 10)
         )
         self.samples_per_label = (
             samples_per_label
             if samples_per_label is not None
-            else self.description.get("samples_per_label", 8)
+            else self.metadata_dict.get("samples_per_label", 8)
         )
 
         # kNN parameters
         self.k = k
 
+        # Run metadata validation by instantiating addressing the attribute
+        # This is quite hacky. Ideally, this would be done in the constructor of
+        # each concrete task, but then we have to duplicate the __init__ method's
+        # interface.
+        if hasattr(self, "metadata"):
+            self.metadata
+
     def _add_main_score(self, scores):
-        if self.description["main_score"] in scores:
-            scores["main_score"] = scores[self.description["main_score"]]
+        if self.metadata_dict["main_score"] in scores:
+            scores["main_score"] = scores[self.metadata_dict["main_score"]]
         else:
             logger.warn(
-                f"main score {self.description['main_score']} not found in scores {scores.keys()}"
+                f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
             )
 
     def evaluate(self, model, eval_split="test", train_split="train", **kwargs):
@@ -67,15 +74,15 @@ def evaluate(self, model, eval_split="test", train_split="train", **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {eval_split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {eval_split}, language: {lang}. Running..."
                 )
                 scores[lang] = self._evaluate_monolingual(
                     model, self.dataset[lang], eval_split, train_split, **kwargs
                 )
                 self._add_main_score(scores[lang])
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {eval_split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {eval_split}. Running..."
             )
             scores = self._evaluate_monolingual(
                 model, self.dataset, eval_split, train_split, **kwargs
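The "hacky" validation block added above relies on Pydantic running validation when the TaskMetadata model is constructed, so merely touching self.metadata in __init__ surfaces schema errors at task-creation time rather than mid-evaluation. A standalone sketch of that mechanism, with an assumed minimal schema (the real TaskMetadata lives in TaskMetadata.py and is not shown in this diff):

```python
from pydantic import BaseModel, ValidationError


class TaskMetadata(BaseModel):
    # Assumed minimal schema, for illustration only.
    name: str
    main_score: str


class Task:
    def __init__(self):
        # hasattr already evaluates the property, so an invalid metadata
        # definition raises here, at construction time.
        if hasattr(self, "metadata"):
            self.metadata

    @property
    def metadata(self) -> TaskMetadata:
        # Constructing the BaseModel is what triggers validation.
        return TaskMetadata(name="ToyTask", main_score=None)  # invalid on purpose


try:
    Task()
except ValidationError as err:
    print(err)  # reports that main_score must be a string
```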
12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTaskClustering.py

@@ -14,7 +14,7 @@ class AbsTaskClustering(AbsTask):
     Abstract class for Clustering tasks
     The similarity is computed between pairs and the results are ranked.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         sentences: list of str
         labels: list of str
     """
@@ -23,11 +23,11 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
     def _add_main_score(self, scores):
-        if self.description["main_score"] in scores:
-            scores["main_score"] = scores[self.description["main_score"]]
+        if self.metadata_dict["main_score"] in scores:
+            scores["main_score"] = scores[self.metadata_dict["main_score"]]
         else:
             logger.warn(
-                f"main score {self.description['main_score']} not found in scores {scores.keys()}"
+                f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
             )
 
     def evaluate(self, model, split="test", **kwargs):
@@ -38,15 +38,15 @@ def evaluate(self, model, split="test", **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
                 )
                 scores[lang] = self._evaluate_monolingual(
                     model, self.dataset[lang], split, **kwargs
                 )
                 self._add_main_score(scores[lang])
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
             )
             scores = self._evaluate_monolingual(model, self.dataset, split, **kwargs)
             self._add_main_score(scores)
6 changes: 3 additions & 3 deletions mteb/abstasks/AbsTaskPairClassification.py

@@ -13,7 +13,7 @@ class AbsTaskPairClassification(AbsTask):
     The similarity is computed between pairs and the results are ranked. Average precision
     is computed to measure how well the methods can be used for pairwise pair classification.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         sent1: list[str]
         sent2: list[str]
         labels: list[int]
@@ -53,15 +53,15 @@ def evaluate(self, model, split="test", **kwargs):
             print("loaded langs:", self.dataset.keys())
             for lang, monolingual_dataset in self.dataset.items():
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
                 )
                 scores[lang] = self._evaluate_monolingual(
                     model, monolingual_dataset, split=split, **kwargs
                 )
             return scores
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
             )
             return self._evaluate_monolingual(
                 model, self.dataset, split=split, **kwargs
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskReranking.py

@@ -6,7 +6,7 @@ class AbsTaskReranking(AbsTask):
     """
     Abstract class for re-ranking experiments.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         query: str
         positive: list[str]
         negative: list[str]
12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTaskRetrieval.py

@@ -217,13 +217,13 @@ def load_data(self, **kwargs):
             return
         self.corpus, self.queries, self.relevant_docs = {}, {}, {}
         hf_repo_qrels = (
-            self.description["hf_hub_name"] + "-qrels"
-            if "clarin-knext" in self.description["hf_hub_name"]
+            self.metadata_dict["hf_hub_name"] + "-qrels"
+            if "clarin-knext" in self.metadata_dict["hf_hub_name"]
             else None
         )
-        for split in kwargs.get("eval_splits", self.description["eval_splits"]):
+        for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
             corpus, queries, qrels = HFDataLoader(
-                hf_repo=self.description["hf_hub_name"],
+                hf_repo=self.metadata_dict["hf_hub_name"],
                 hf_repo_qrels=hf_repo_qrels,
                 streaming=False,
                 keep_in_memory=False,
@@ -295,11 +295,11 @@ def _evaluate_monolingual(
             }
             if lang is None:
                 qrels_save_path = (
-                    f"{output_folder}/{self.description['name']}_qrels.json"
+                    f"{output_folder}/{self.metadata_dict['name']}_qrels.json"
                 )
             else:
                 qrels_save_path = (
-                    f"{output_folder}/{self.description['name']}_{lang}_qrels.json"
+                    f"{output_folder}/{self.metadata_dict['name']}_{lang}_qrels.json"
                )
 
             with open(qrels_save_path, "w") as f:
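One detail worth calling out in load_data above: for clarin-knext datasets the relevance judgments live in a sibling repository named by appending -qrels to the dataset id, while for other datasets hf_repo_qrels stays None and the loader falls back to the main repo. That naming rule in isolation, with hypothetical dataset ids:

```python
from __future__ import annotations


def qrels_repo_for(hf_hub_name: str) -> str | None:
    """clarin-knext datasets keep their qrels in a sibling '<dataset>-qrels'
    repository; for all others, return None so the loader uses the main repo."""
    if "clarin-knext" in hf_hub_name:
        return hf_hub_name + "-qrels"
    return None


# Dataset ids below are hypothetical, for illustration only.
assert qrels_repo_for("clarin-knext/scifact-pl") == "clarin-knext/scifact-pl-qrels"
assert qrels_repo_for("mteb/scifact") is None
```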
10 changes: 5 additions & 5 deletions mteb/abstasks/AbsTaskSTS.py

@@ -10,7 +10,7 @@ class AbsTaskSTS(AbsTask):
     """
    Abstract class for STS experiments.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         sentence1: str
         sentence2: str
         score: float
@@ -21,11 +21,11 @@ def __init__(self, **kwargs):
 
     @property
     def min_score(self):
-        return self.description["min_score"]
+        return self.metadata_dict["min_score"]
 
     @property
     def max_score(self):
-        return self.description["max_score"]
+        return self.metadata_dict["max_score"]
 
     def evaluate(self, model, split, **kwargs):
         if not self.data_loaded:
@@ -35,13 +35,13 @@ def evaluate(self, model, split, **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"Task: {self.description['name']}, split: {split}, language: {lang}. Running..."
+                    f"Task: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
                 )
                 data_split = self.dataset[lang][split]
                 scores[lang] = self._evaluate_split(model, data_split, **kwargs)
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
             )
             data_split = self.dataset[split]
             scores = self._evaluate_split(model, data_split, **kwargs)
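min_score and max_score now come from the metadata dict; the STS evaluator presumably uses them to rescale gold similarity scores onto a unit interval before computing correlations. The usual rescaling, as a sketch (the evaluator code itself is outside this diff, so treat this as an assumption about its behavior):

```python
def normalize_gold_score(score: float, min_score: float, max_score: float) -> float:
    """Map a gold similarity score from [min_score, max_score] onto [0, 1]."""
    return (score - min_score) / (max_score - min_score)


# STS gold scores are typically on a 0-5 scale:
assert normalize_gold_score(3.0, min_score=0.0, max_score=5.0) == 0.6
```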
10 changes: 5 additions & 5 deletions mteb/abstasks/AbsTaskSummarization.py

@@ -12,7 +12,7 @@ class AbsTaskSummarization(AbsTask):
     """
     Abstract class for summarization experiments.
 
-    self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
+    self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
         text: str
         human_summaries: list[str]
         machine_summaries: list[str]
@@ -24,11 +24,11 @@ def __init__(self, **kwargs):
 
     @property
     def min_score(self):
-        return self.description["min_score"]
+        return self.metadata_dict["min_score"]
 
     @property
     def max_score(self):
-        return self.description["max_score"]
+        return self.metadata_dict["max_score"]
 
     def evaluate(self, model, split, **kwargs):
         if not self.data_loaded:
@@ -38,13 +38,13 @@ def evaluate(self, model, split, **kwargs):
             scores = {}
             for lang in self.dataset:
                 logger.info(
-                    f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..."
+                    f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
                 )
                 data_split = self.dataset[lang][split]
                 scores[lang] = self._evaluate_split(model, data_split, **kwargs)
         else:
             logger.info(
-                f"\nTask: {self.description['name']}, split: {split}. Running..."
+                f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
             )
             data_split = self.dataset[split]
             scores = self._evaluate_split(model, data_split, **kwargs)
8 changes: 4 additions & 4 deletions mteb/abstasks/CrosslingualTask.py

@@ -7,11 +7,11 @@ class CrosslingualTask(AbsTask):
     def __init__(self, langs=None, **kwargs):
         super().__init__(**kwargs)
         if type(langs) is list:
-            langs = [lang for lang in langs if lang in self.description["eval_langs"]]
+            langs = [lang for lang in langs if lang in self.metadata_dict["eval_langs"]]
         if langs is not None and len(langs) > 0:
             self.langs = langs
         else:
-            self.langs = self.description["eval_langs"]
+            self.langs = self.metadata_dict["eval_langs"]
         self.is_crosslingual = True
 
     def load_data(self, **kwargs):
@@ -23,8 +23,8 @@ def load_data(self, **kwargs):
         self.dataset = {}
         for lang in self.langs:
             self.dataset[lang] = datasets.load_dataset(
-                self.description["hf_hub_name"],
+                self.metadata_dict["hf_hub_name"],
                 lang,
-                revision=self.description.get("revision", None),
+                revision=self.metadata_dict.get("revision", None),
             )
         self.data_loaded = True
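The language handling in __init__ above (shared with MultilingualTask below) reduces to a small pure function: keep the requested languages the task actually supports, otherwise fall back to all of them. A sketch of that logic, written with isinstance rather than the type(...) is list check used in the diff:

```python
from __future__ import annotations


def resolve_langs(requested: list[str] | None, eval_langs: list[str]) -> list[str]:
    """Keep only requested languages the task supports; fall back to all
    supported languages when nothing (valid) was requested."""
    if isinstance(requested, list):
        requested = [lang for lang in requested if lang in eval_langs]
    return requested if requested else eval_langs


assert resolve_langs(["en", "tlh"], ["en", "de", "fr"]) == ["en"]
assert resolve_langs(None, ["en", "de", "fr"]) == ["en", "de", "fr"]
```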
12 changes: 7 additions & 5 deletions mteb/abstasks/MultilingualTask.py

@@ -7,11 +7,13 @@ class MultilingualTask(AbsTask):
     def __init__(self, langs=None, **kwargs):
         super().__init__(**kwargs)
         if type(langs) is list:
-            langs = [lang for lang in langs if lang in self.description["eval_langs"]]
+            langs = [lang for lang in langs if lang in self.metadata_dict["eval_langs"]]
         if langs is not None and len(langs) > 0:
-            self.langs = langs  # TODO: case where user provides langs not in the dataset
+            self.langs = (
+                langs  # TODO: case where user provides langs not in the dataset
+            )
         else:
-            self.langs = self.description["eval_langs"]
+            self.langs = self.metadata_dict["eval_langs"]
         self.is_multilingual = True
 
     def load_data(self, **kwargs):
@@ -23,8 +25,8 @@ def load_data(self, **kwargs):
         self.dataset = {}
         for lang in self.langs:
             self.dataset[lang] = datasets.load_dataset(
-                self.description["hf_hub_name"],
+                self.metadata_dict["hf_hub_name"],
                 lang,
-                revision=self.description.get("revision", None),
+                revision=self.metadata_dict.get("revision", None),
             )
         self.data_loaded = True