Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Scandinavian Languages #124

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 91 additions & 76 deletions README.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,11 @@ def _evaluate_split(self, model, data_split, **kwargs):

evaluator = BitextMiningEvaluator(sentence1, sentence2, gold, **kwargs)
metrics = evaluator(model)
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores):
    """Expose the task's main metric under the standard "main_score" key.

    Looks up the metric named by ``self.description["main_score"]`` in
    *scores* and copies it to ``scores["main_score"]`` in place. If the
    metric is absent, a warning is printed and *scores* is left untouched.
    """
    main_metric = self.description["main_score"]
    try:
        scores["main_score"] = scores[main_metric]
    except KeyError:
        print(f"WARNING: main score {main_metric} not found in scores {scores.keys()}")
40 changes: 40 additions & 0 deletions mteb/tasks/BitextMining/BornholmskBitextMining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import datasets

from mteb.abstasks import AbsTaskBitextMining


class BornholmBitextMining(AbsTaskBitextMining):
    """Bitext mining between standard Danish and the Bornholmsk dialect."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "BornholmBitextMining",
            "hf_hub_name": "strombergnlp/bornholmsk_parallel",
            "description": "Danish Bornholmsk Parallel Corpus. "
            + "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. "
            + "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
            "reference": "https://aclanthology.org/W19-6138/",
            "type": "BitextMining",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da", "da-bornholm"],
            "main_score": "f1",
            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub and convert it to the standard format.
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None)
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard bitext-mining (sentence1, sentence2) schema."""
        # BUG FIX: rename_column returns a NEW dataset rather than mutating in
        # place, so the result must be re-assigned — the original code discarded
        # it and the sentence1/sentence2 columns never existed.
        self.dataset = self.dataset.rename_column("da", "sentence1")
        self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/AngryTweetsClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class AngryTweetsClassification(AbsTaskClassification):
    """Sentiment classification task over Danish tweets (AngryTweets)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="AngryTweetsClassification",
            hf_hub_name="DDSC/angry-tweets",
            description="A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
            reference="https://aclanthology.org/2021.nodalida-main.53/",
            eval_splits=["test"],
            eval_langs=["da"],
            type="Classification",
            category="s2s",
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="20b0e6081892e78179356fada741b7afa381443d",
        )
41 changes: 41 additions & 0 deletions mteb/tasks/Classification/DKHateClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import datasets

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class DKHateClassification(AbsTaskClassification):
    """Offensive-language (hate speech) classification of Danish tweets (DKHate)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "DKHateClassification",
            "hf_hub_name": "DDSC/dkhate",
            "description": "Danish Tweets annotated for Hate Speech either being Offensive or not",
            "reference": "https://aclanthology.org/2020.lrec-1.430/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "59d12749a3c91a186063c7d729ec392fda94681c",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None)
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Map the string class labels to integer ids across all splits."""
        # BUG FIX: the original enumerated an unsorted set(), so the
        # label -> id mapping changed between runs (hash randomization).
        # Sorting the label vocabulary makes the mapping deterministic.
        labels = self.dataset["train"]["label"]  # type: ignore
        lab2idx = {lab: idx for idx, lab in enumerate(sorted(set(labels)))}
        # NOTE(review): the vocabulary is built from the train split only —
        # assumes test labels are a subset of train labels; confirm on the hub.
        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
59 changes: 59 additions & 0 deletions mteb/tasks/Classification/DalajClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# SuperLIM tasks
import datasets

from mteb.abstasks import AbsTaskClassification


class DalajClassification(AbsTaskClassification):
    """Swedish linguistic-acceptability classification (DaLAJ, part of SuperLim)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "DalajClassification",
            "hf_hub_name": "AI-Sweden/SuperLim",
            # typo fixed: "accebtablity" -> "acceptability"
            "description": "A Swedish dataset for linguistic acceptability. Available as a part of Superlim.",
            "reference": "https://spraakbanken.gu.se/en/resources/superlim",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"],
            "dalaj",  # chose the relevant subset
            revision=self.description.get("revision"),
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """
        This dataset consists of two columns of relevance, "original_sentence" and "corrected_sentence".
        We use the original sentence as the "wrong" sample (label 1) and the
        corrected sentence as the "correct" sample (label 0).
        """

        # Single leading underscore: a double-underscore name here would be
        # name-mangled by the enclosing class body, which is confusing for no gain.
        def _convert_sample_to_classification(sample):
            # Batched map: each value is a list, so "+" concatenates all
            # originals followed by all corrections in one output batch.
            text = sample["original_sentence"] + sample["corrected_sentence"]
            label = [1] * len(sample["original_sentence"]) + [0] * len(sample["corrected_sentence"])
            return {"text": text, "label": label}

        columns_to_keep = ["original_sentence", "corrected_sentence"]
        for split in self.dataset:
            columns_names = self.dataset[split].column_names  # type: ignore
            columns_to_remove = [col for col in columns_names if col not in columns_to_keep]
            self.dataset[split] = self.dataset[split].remove_columns(columns_to_remove)  # type: ignore

        self.dataset = self.dataset.map(
            _convert_sample_to_classification, batched=True, remove_columns=columns_to_keep
        )
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/DanishPoliticalCommentsClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class DanishPoliticalCommentsClassification(AbsTaskClassification):
    """Sentiment classification of Danish political comments."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="DanishPoliticalCommentsClassification",
            hf_hub_name="danish_political_comments",
            description="A dataset of Danish political comments rated for sentiment",
            reference="NA",
            type="Classification",
            category="s2s",
            eval_splits=["train"],
            eval_langs=["da"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="edbb03726c04a0efab14fc8c3b8b79e4d420e5a1",
        )

    def load_data(self, **kwargs):
        """Load the dataset from the HuggingFace hub (no-op if already loaded)."""
        if self.data_loaded:
            return

        meta = self.description
        self.dataset = datasets.load_dataset(meta["hf_hub_name"], revision=meta.get("revision"))
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard (text, label) classification schema."""
        for old_name, new_name in (("sentence", "text"), ("target", "label")):
            self.dataset = self.dataset.rename_column(old_name, new_name)
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/LccClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class LccClassification(AbsTaskClassification):
    """Sentiment classification over the Danish Leipzig corpora collection."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="LccClassification",
            hf_hub_name="DDSC/lcc",
            description="The leipzig corpora collection, annotated for sentiment",
            reference="https://github.com/fnielsen/lcc-sentiment",
            type="Classification",
            category="s2s",
            eval_splits=["test"],
            eval_langs=["da"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6",
        )
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NoRecClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NoRecClassification(AbsTaskClassification):
    """Norwegian sentiment classification on reviews (NoReC)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        # The "mini" variant is used to keep results ~comparable to the
        # ScandEval benchmark.
        return dict(
            name="NoRecClassification",
            hf_hub_name="ScandEval/norec-mini",
            description="A Norwegian dataset for sentiment classification on review",
            reference="https://aclanthology.org/L18-1661/",
            type="Classification",
            category="s2s",
            eval_splits=["test"],
            eval_langs=["no"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
        )
42 changes: 42 additions & 0 deletions mteb/tasks/Classification/NordicLangClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class NordicLangClassification(AbsTaskClassification):
    """Language identification across six written Nordic languages."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "NordicLangClassification",
            "hf_hub_name": "strombergnlp/nordic_langid",
            "description": "A dataset for Nordic language identification.",
            "reference": "https://aclanthology.org/2021.vardial-1.8/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            # BUG FIX: the original list contained "no" twice. The nordic_langid
            # dataset distinguishes six classes — Danish, Swedish, Bokmål,
            # Nynorsk, Icelandic and Faroese — so the duplicate is replaced by
            # Nynorsk ("nn"); verify against the dataset card.
            "eval_langs": ["da", "sv", "nb", "nn", "is", "fo"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"],
            "10k",  # select relevant subset
            revision=self.description.get("revision"),
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard (text, label) classification schema."""
        self.dataset = self.dataset.rename_column("sentence", "text")
        self.dataset = self.dataset.rename_column("language", "label")

20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NorwegianParlaimentClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NorwegianParliamentClassification(AbsTaskClassification):
    """Sentiment classification of Norwegian parliament speeches."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="NorwegianParliament",
            hf_hub_name="NbAiLab/norwegian_parliament",
            description="Norwegian parliament speeches annotated for sentiment",
            reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
            type="Classification",
            category="s2s",
            eval_splits=["test", "validation"],
            eval_langs=["no"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="f7393532774c66312378d30b197610b43d751972",
        )
Loading