Skip to content

Commit

Permalink
fix: Multiple dataset fixes (#328)
Browse files Browse the repository at this point in the history
* fix: remove time of run (as it does not relate to the model itself). Time of run should be on the dataset results

* fix: fixes the PawsX datasets

* docs: Updated points

* fix: flores clustering

* fix: multiple dataset fixes

* docs: updated points

* fix: added missing dataset_transform to multitask task

* style: ran formatter

* fix: correctly fix pawsX
  • Loading branch information
KennethEnevoldsen authored Apr 8, 2024
1 parent 5bd11fb commit 84408f7
Show file tree
Hide file tree
Showing 31 changed files with 1,230 additions and 65 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,4 @@ error_logs.txt

# tests
tests/results
tmp.py
2 changes: 1 addition & 1 deletion docs/mmteb/points.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

| GitHub | Total points | New dataset | New task | Dataset annotations | (Bug)fixes | Running Models | Review PR | Paper Writing | Ideation | Coordination |
|-------------------| ------------ |-------------| -------- | ------------------- | ---------- | -------------- | -------- | -------------- | -------- | ------------- |
| KennethEnevoldsen | | 38+16 | | 1 | | 9 | | | | |
| KennethEnevoldsen | | 38+16 | | 1 | 9+1+2 | | | | | |
| x-tabdeveloping | | 2+16 | | | | | | | | |
| imenelydiaker | | 88 | | | | | 7 | | | |
| wissam-sib | | 88 | | | | | 1 | | | |
Expand Down
1 change: 1 addition & 0 deletions mteb/abstasks/MultilingualTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ def load_data(self, **kwargs):
name=lang,
**self.metadata_dict.get("dataset", None),
)
self.dataset_transform()
self.data_loaded = True
2 changes: 0 additions & 2 deletions mteb/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from __future__ import annotations

import argparse
import datetime
import json
import logging
from pathlib import Path
Expand All @@ -35,7 +34,6 @@ def _save_model_metadata(

model_meta = {
"model_name": model_name,
"time_of_run": str(datetime.datetime.today()),
"versions": model._model_config.get("__version__", None),
}

Expand Down
4 changes: 2 additions & 2 deletions mteb/tasks/Clustering/es/FloresClusteringS2S.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class FloresClusteringS2S(AbsTaskClustering):
description="Clustering of sentences from various web articles, 32 topics in total.",
reference="https://huggingface.co/datasets/facebook/flores",
dataset={
"path": "facebook/flores",
"revision": "2db78afdeaccaedc3b33a95442a4e55766887e17",
"path": "jinaai/flores_clustering",
"revision": "97faaf98d7ef21869d176115e669e2a4286513bf",
},
type="Clustering",
category="s2s",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Clustering/fr/MLSUMClusteringP2P.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class MLSUMClusteringP2P(AbsTaskClustering):
"path": "mlsum",
"revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"name": "fr",
"split": "test",
"trust_remote_code": True,
},
type="Clustering",
category="p2p",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Clustering/fr/MLSUMClusteringS2S.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class MLSUMClusteringS2S(AbsTaskClustering):
"path": "mlsum",
"revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"name": "fr",
"split": "test",
"trust_remote_code": True,
},
type="Clustering",
category="s2s",
Expand Down
38 changes: 10 additions & 28 deletions mteb/tasks/PairClassification/multilingual/PawsX.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

import datasets

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import MultilingualTask
Expand Down Expand Up @@ -36,34 +34,18 @@ class PawsX(MultilingualTask, AbsTaskPairClassification):
avg_character_length=None,
)

def load_data(self, **kwargs):
if self.data_loaded:
return

self.dataset = dict()
def dataset_transform(self):
_dataset = {}
for lang in self.langs:
hf_dataset = datasets.load_dataset(
name=lang,
**self.metadata_dict["dataset"],
)

sent1 = []
sent2 = []
labels = []
_dataset[lang] = {}
for split in self.metadata.eval_splits:
hf_dataset = self.dataset[lang][split]

for line in hf_dataset["test"]:
sent1.append(line["sentence1"])
sent2.append(line["sentence2"])
labels.append(line["label"])

self.dataset[lang] = {
"test": [
_dataset[lang][split] = [
{
"sent1": sent1,
"sent2": sent2,
"labels": labels,
"sent1": hf_dataset["sentence1"],
"sent2": hf_dataset["sentence2"],
"labels": hf_dataset["label"],
}
]
}

self.data_loaded = True
self.dataset = _dataset
14 changes: 7 additions & 7 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from .da.t2nord_retrieval import *
from .da.twitterhjerne import *
from .de.GerDaLIRRetrieval import *
from .de.GerDaLIRSmallRetrieval import *
from .de.GermanDPRRetrieval import *
from .de.GermanQuADRetrieval import *
from .de.GerDaLIRSmallRetrieval import *
from .de.LegalQuADRetrieval import *
from .en.AILACasedocsRetrieval import *
from .en.AILAStatutesRetrieval import *
from .en.ArguAnaRetrieval import *
from .en.ClimateFEVERRetrieval import *
from .en.CQADupstackAndroidRetrieval import *
Expand All @@ -27,6 +29,9 @@
from .en.FiQA2018Retrieval import *
from .en.HagridRetrieval import *
from .en.HotpotQARetrieval import *
from .en.LegalBenchConsumerContractsQARetrieval import *
from .en.LegalBenchCorporateLobbyingRetrieval import *
from .en.LegalSummarizationRetrieval import *
from .en.MSMARCORetrieval import *
from .en.MSMARCOv2Retrieval import *
from .en.NarrativeQARetrieval import *
Expand All @@ -37,11 +42,6 @@
from .en.SciFactRetrieval import *
from .en.Touche2020Retrieval import *
from .en.TRECCOVIDRetrieval import *
from .en.AILACasedocsRetrieval import *
from .en.AILAStatutesRetrieval import *
from .en.LegalBenchConsumerContractsQARetrieval import *
from .en.LegalBenchCorporateLobbyingRetrieval import *
from .en.LegalSummarizationRetrieval import *
from .es.SpanishPassageRetrievalS2P import *
from .es.SpanishPassageRetrievalS2S import *
from .fr.AlloprofRetrieval import *
Expand Down Expand Up @@ -71,4 +71,4 @@
from .sv.swedn_retrieval import *
from .sv.swefaq_retrieval import *
from .zh.CMTEBRetrieval import *
from .zh.LeCaRDv2Retrieval import *
from .zh.LeCaRDv2Retrieval import *
1 change: 1 addition & 0 deletions mteb/tasks/Retrieval/de/GerDaLIRRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def load_data(self, **kwargs):
query_rows = datasets.load_dataset(
name="queries",
split=self._EVAL_SPLIT,
**self.metadata_dict["dataset"],
)
corpus_rows = datasets.load_dataset(
name="corpus",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/de/GerDaLIRSmallRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class GerDaLIRSmall(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/de/LegalQuADRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class LegalQuAD(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/en/AILACasedocsRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AILACasedocs(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/en/AILAStatutesRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AILAStatutes(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class LegalBenchConsumerContractsQA(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class LegalBenchCorporateLobbying(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/en/LegalSummarizationRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class LegalSummarization(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
5 changes: 2 additions & 3 deletions mteb/tasks/Retrieval/en/NarrativeQARetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ def load_data(self, **kwargs):
return

data = datasets.load_dataset(
split=self._EVAL_SPLIT
# BUGFIX: the revision is now used
** self.metadata_dict["dataset"],
split=self._EVAL_SPLIT,
**self.metadata_dict["dataset"],
)
self.queries = {
self._EVAL_SPLIT: {
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2P.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def load_data(self, **kwargs):
if self.data_loaded:
return

# BUGFIX: the revision is now used
query_rows = datasets.load_dataset(
name="queries",
split="test",
trust_remote_code=True,
**self.metadata_dict["dataset"],
)
corpus_rows = datasets.load_dataset(
name="corpus.documents",
Expand Down
4 changes: 1 addition & 3 deletions mteb/tasks/Retrieval/es/SpanishPassageRetrievalS2S.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class SpanishPassageRetrievalS2S(AbsTaskRetrieval):
dataset={
"path": "jinaai/spanish_passage_retrieval",
"revision": "9cddf2ce5209ade52c2115ccfa00eb22c6d3a837",
"trust_remote_code": True,
},
type="Retrieval",
category="s2s",
Expand Down Expand Up @@ -42,19 +43,16 @@ def load_data(self, **kwargs):
query_rows = datasets.load_dataset(
name="queries",
split="test",
trust_remote_code=True,
**self.metadata_dict["dataset"],
)
corpus_rows = datasets.load_dataset(
name="corpus.sentences",
split="test",
trust_remote_code=True,
**self.metadata_dict["dataset"],
)
qrels_rows = datasets.load_dataset(
name="qrels.s2s",
split="test",
trust_remote_code=True,
**self.metadata_dict["dataset"],
)

Expand Down
1 change: 0 additions & 1 deletion mteb/tasks/Retrieval/fr/AlloprofRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def load_data(self, **kwargs):
if self.data_loaded:
return
# fetch both subsets of the dataset
# BUGFIX: the revision is now used
corpus_raw = datasets.load_dataset(
name="documents",
**self.metadata_dict["dataset"],
Expand Down
1 change: 0 additions & 1 deletion mteb/tasks/Retrieval/fr/SyntecRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def load_data(self, **kwargs):
if self.data_loaded:
return
# fetch both subsets of the dataset
# BUGFIX: the revision is now used
corpus_raw = datasets.load_dataset(
name="documents",
**self.metadata_dict["dataset"],
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/pl/QuoraPLRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class QuoraPLRetrieval(AbsTaskRetrieval):
},
type="Retrieval",
category="s2s",
eval_splits=["validation", "test"], # validation for new DataLoader
eval_splits=["validation", "test"],
eval_langs=["pl"],
main_score="ndcg_at_10",
date=None,
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/zh/LeCaRDv2Retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class LeCaRDv2(AbsTaskRetrieval):
annotations_creators="derived",
dialect=None,
text_creation="found",
bibtex_citation= None,
bibtex_citation=None,
n_samples=None,
avg_character_length=None,
)
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"precision": 0.7637658654697831,
"recall": 0.8107167710508003
},
"evaluation_time": 30.59,
"evaluation_time": 24.84,
"fr-en": {
"accuracy": 0.8107167710508003,
"f1": 0.7772438859983376,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"dataset_revision": "97faaf98d7ef21869d176115e669e2a4286513bf",
"mteb_dataset_name": "FloresClusteringS2S",
"mteb_version": "1.2.1.dev0",
"test": {
"evaluation_time": 9.9,
"main_score": 0.3660971441690291,
"v_measure": 0.3660971441690291,
"v_measure_std": 0.0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"dataset_revision": "0bb47f1d73827e96964edb84dfe552f62f4fd5eb",
"mteb_dataset_name": "GerDaLIR",
"mteb_version": "1.2.1.dev0",
"test": {
"evaluation_time": 812.69,
"map_at_1": 0.00203,
"map_at_10": 0.00299,
"map_at_100": 0.00328,
"map_at_1000": 0.00334,
"map_at_3": 0.00251,
"map_at_5": 0.00282,
"mrr_at_1": 0.00228,
"mrr_at_10": 0.00328,
"mrr_at_100": 0.00362,
"mrr_at_1000": 0.00368,
"mrr_at_3": 0.00276,
"mrr_at_5": 0.00311,
"ndcg_at_1": 0.00228,
"ndcg_at_10": 0.00368,
"ndcg_at_100": 0.00547,
"ndcg_at_1000": 0.00758,
"ndcg_at_3": 0.00271,
"ndcg_at_5": 0.00327,
"precision_at_1": 0.00228,
"precision_at_10": 0.00063,
"precision_at_100": 0.00017,
"precision_at_1000": 4e-05,
"precision_at_3": 0.00114,
"precision_at_5": 0.00098,
"recall_at_1": 0.00203,
"recall_at_10": 0.00558,
"recall_at_100": 0.01448,
"recall_at_1000": 0.03166,
"recall_at_3": 0.00305,
"recall_at_5": 0.00436
}
}
Loading

0 comments on commit 84408f7

Please sign in to comment.