fix: Added tasks from SEB (#287)
* Added tasks from SEB

* docs: fix link

* fix: ran linting

* fix typing for 3.8

* fixed annotation for v3.8
KennethEnevoldsen authored Mar 29, 2024
1 parent 76056b5 commit 39cff49
Showing 59 changed files with 2,034 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -18,4 +18,4 @@ if you are not
- [ ] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks)
- [ ] Run tests locally to make sure nothing is broken using `make test`.
- [ ] Run the formatter to format the code using `make lint`.
- - [ ] I have added points for my submission to the [POINTS.md](https://github.com/embeddings-benchmark/mteb/tree/main/docs/mmteb/POINTS.md) file.
+ - [ ] I have added points for my submission to the [POINTS.md](https://github.com/embeddings-benchmark/mteb/blob/main/docs/mmteb/points.md) file.
3 changes: 2 additions & 1 deletion mteb/abstasks/TaskMetadata.py
@@ -17,11 +17,12 @@
"Discourse coherence",
"Language identification",
"Linguistic acceptability",
"Political",
"Political classification",
"Question answering",
"Sentiment/Hate speech",
"Thematic clustering",
"Scientific Reranking",
"Claim verification",
]

TASK_DOMAIN = Literal[
2 changes: 1 addition & 1 deletion mteb/cmd.py
@@ -36,7 +36,7 @@ def _save_model_metadata(
model_meta = {
"model_name": model_name,
"time_of_run": str(datetime.datetime.today()),
"versions": model._model_config["__version__"],
"versions": model._model_config.get("__version__", None),
}

with save_path.open("w") as f:
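The change above swaps direct indexing for dict.get so that metadata saving no longer fails for models whose _model_config lacks a "__version__" entry. A minimal illustrative sketch (not part of the commit; the empty dict merely stands in for such a config):

model_config: dict = {}  # stands in for model._model_config without a "__version__" key

versions = model_config.get("__version__", None)  # returns None instead of raising KeyError
print(versions)  # None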
1 change: 1 addition & 0 deletions mteb/tasks/BitextMining/__init__.py
@@ -6,3 +6,4 @@
from .multilingual.FloresBitextMining import *
from .multilingual.NorwegianCourtsBitextMining import *
from .multilingual.TatoebaBitextMining import *
+ from .nb.norwegian_courts_bitext_mining import *
Empty file.
52 changes: 52 additions & 0 deletions mteb/tasks/BitextMining/nb/norwegian_courts_bitext_mining.py
@@ -0,0 +1,52 @@
from typing import Any

import datasets

from mteb.abstasks import AbsTaskBitextMining
from mteb.abstasks.TaskMetadata import TaskMetadata


class NorwegianCourtsBitextMining(AbsTaskBitextMining):
metadata = TaskMetadata(
name="NorwegianCourtsBitextMining",
hf_hub_name="kaedrodrur/norwegian-courts",
description="Nynorsk and Bokmål parallel corpus from Norwegian courts. ",
reference="https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php",
type="BitextMining",
category="s2s",
eval_splits=["test"],
eval_langs=["nb", "nn"],
main_score="accuracy",
revision="d79af07e969a6678fcbbe819956840425816468f",
date=("2000-01-01", "2020-12-31"), # approximate guess
form=["spoken"],
domains=["Spoken"],
task_subtypes=["Political classification"],
license="openUnder-PSI",
socioeconomic_status="high",
annotations_creators="derived", # best guess
dialect=[],
text_creation="found",
bibtex_citation=None,
n_samples={"test": 456},
avg_character_length={"test": 82.11},
)

def load_data(self, **kwargs: Any) -> None: # noqa: ARG002
"""
Load dataset from HuggingFace hub and convert it to the standard format.
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision", None),
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self) -> None:
# Convert to standard format
self.dataset = self.dataset.rename_column("nb", "sentence1")
self.dataset = self.dataset.rename_column("nn", "sentence2")
3 changes: 3 additions & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -26,5 +26,8 @@
from .fr.MLSUMClusteringS2S import *
from .multilingual.MasakhaNEWSClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringS2S import *
+ from .nb.snl_clustering import *
+ from .nb.vg_clustering import *
from .pl.PolishClustering import *
+ from .sv.swedn_clustering import *
from .zh.CMTEBClustering import *
Empty file.
113 changes: 113 additions & 0 deletions mteb/tasks/Clustering/nb/snl_clustering.py
@@ -0,0 +1,113 @@
from __future__ import annotations

import random
from itertools import islice
from typing import Iterable, TypeVar

import datasets

from mteb.abstasks import AbsTaskClustering, TaskMetadata

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


class SNLClustering(AbsTaskClustering):
metadata = TaskMetadata(
name="SNLClustering",
hf_hub_name="navjordj/SNL_summarization",
description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.",
reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nb"],
main_score="v_measure",
revision="3d3d27aa7af8941408cefc3991ada5d12a4273d1",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["Encyclopaedic", "Non-fiction"],
license=None,
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": 2048},
avg_character_length={"test": 1101.30},
)

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision"),
) # type: ignore

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
splits = self.metadata_dict["eval_splits"]

documents: list = []
labels: list = []
label_col = "category"

ds = {}
for split in splits:
ds_split = self.dataset[split]

_label = self.normalize_labels(ds_split[label_col])
documents.extend(ds_split["ingress"])
labels.extend(_label)

documents.extend(ds_split["article"])
labels.extend(_label)

assert len(documents) == len(labels)

rng = random.Random(42) # local only seed
pairs = list(zip(documents, labels))
rng.shuffle(pairs)
documents, labels = [list(collection) for collection in zip(*pairs)]

# reduce size of dataset to not have too large datasets in the clustering task
documents_batched = list(batched(documents, 512))[:4]
labels_batched = list(batched(labels, 512))[:4]

ds[split] = datasets.Dataset.from_dict(
{
"sentences": documents_batched,
"labels": labels_batched,
}
)

self.dataset = datasets.DatasetDict(ds)

@staticmethod
def normalize_labels(labels: list[str]) -> list[str]:
# example label:
# Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter
# When using 2 levels there are 17 unique labels
# When using 3 levels there are 121 unique labels
return [",".join(tuple(label.split(",")[:3])) for label in labels]
117 changes: 117 additions & 0 deletions mteb/tasks/Clustering/nb/vg_clustering.py
@@ -0,0 +1,117 @@
from __future__ import annotations

import random
from itertools import islice
from typing import Iterable, TypeVar

import datasets

from mteb.abstasks import AbsTaskClustering, TaskMetadata

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


class VGClustering(AbsTaskClustering):
metadata = TaskMetadata(
name="VGClustering",
hf_hub_name="navjordj/VG_summarization",
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nb"],
main_score="v_measure",
revision="d4c5a8ba10ae71224752c727094ac4c46947fa29",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license=None,
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": 2048},
avg_character_length={"test": 1009.65},
)

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision"),
) # type: ignore

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
splits = self.metadata_dict["eval_splits"]

documents: list = []
labels: list = []
label_col = "classes"

ds = {}
for split in splits:
ds_split = self.dataset[split]

_label = self.normalize_labels(ds_split[label_col])
documents.extend(ds_split["title"])
labels.extend(_label)

documents.extend(ds_split["ingress"])
labels.extend(_label)

documents.extend(ds_split["article"])
labels.extend(_label)

assert len(documents) == len(labels)

rng = random.Random(1111) # local only seed
# resampling changes scores from 12.68, 11.30, 12.65 (sample model)
pairs = list(zip(documents, labels))
rng.shuffle(pairs)
documents, labels = [list(collection) for collection in zip(*pairs)]

# reduce size of dataset to not have too large datasets in the clustering task
documents_batched = list(batched(documents, 512))[:4]
labels_batched = list(batched(labels, 512))[:4]
# See:
# https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/pull/96
# for a discussion on sizes

ds[split] = datasets.Dataset.from_dict(
{
"sentences": documents_batched,
"labels": labels_batched,
}
)

self.dataset = datasets.DatasetDict(ds)

@staticmethod
def normalize_labels(labels: list[str]) -> list[str]:
# Agreed on and debated in: https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/issues/83
return [label.split(",")[0] for label in labels]
Empty file.