Added support for Scandinavian Languages (#124)

* Make sure that main score is added to bitext mining tasks
* Added Scandinavian languages: da, no, sv
* Updated readme with Scandinavian tasks
* Changed n samples for the Nordic lang CLF
* Added Scandinavian models to init
* Added error logs to gitignore
* Fix import error
* Fix dataset columns
* Rename dataset columns
* Remove swefaq
* fix: Added functionality to raise error
* fix: Updated names
* fix: Removed no as a language
* Added missing data transformation
* Fix spelling error
embeddings-benchmark · Jul 29, 2023 · acb0f59 · acb0f59
1 parent c50b8ab
commit acb0f59
Show file tree

Hide file tree

Showing 17 changed files with 605 additions and 79 deletions.
diff --git a/.gitignore b/.gitignore
@@ -138,3 +138,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# error logs
+error_logs.txt
diff --git a/README.md b/README.md
diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py
@@ -50,4 +50,11 @@ def _evaluate_split(self, model, data_split, **kwargs):
 
         evaluator = BitextMiningEvaluator(sentence1, sentence2, gold, **kwargs)
         metrics = evaluator(model)
+        self._add_main_score(metrics)
         return metrics
+
+    def _add_main_score(self, scores):
+        """Expose the task's main metric under the ``"main_score"`` key.
+
+        ``scores`` is modified in place. When the metric named by
+        ``self.description["main_score"]`` is missing from ``scores``, a warning
+        is printed and ``scores`` is left untouched.
+        """
+        main_metric = self.description["main_score"]
+        if main_metric not in scores:
+            print(f"WARNING: main score {self.description['main_score']} not found in scores {scores.keys()}")
+            return
+        scores["main_score"] = scores[main_metric]
diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py
@@ -79,9 +79,9 @@ def _extend_lang_pairs(self):
         # add all possible language pairs
         langs = set(self._task_langs)
         for x in langs:
-            if '-' not in x:
+            if "-" not in x:
                 for y in langs:
-                    if '-' not in y:
+                    if "-" not in y:
                         pair = f"{x}-{y}"
                         if pair not in langs:
                             self._task_langs.append(pair)
@@ -183,7 +183,14 @@ def load_tasks_data(self):
             task.load_data()
 
     def run(
-        self, model, verbosity=1, output_folder="results/result", eval_splits=None, overwrite_results=False, **kwargs
+        self,
+        model,
+        verbosity=1,
+        output_folder="results/result",
+        eval_splits=None,
+        overwrite_results=False,
+        raise_error: bool = True,
+        **kwargs
     ):
         """
         Run the evaluation pipeline on the selected tasks.
@@ -199,6 +206,8 @@ def run(
             2: print everything (including datasets loading)
         output_folder: str
             Folder where the results will be saved
+        raise_error: bool
+            Whether to raise an error if an exception occurs during evaluation.
         :return: Returns a dictionary of task names and corresponding metrics results.
         """
         # Set logging
@@ -259,6 +268,8 @@ def run(
             except Exception as e:
                 logger.error(f"Error while evaluating {task.description['name']}: {e}")
                 logger.error(f"Please check all the error logs at: {self.err_logs_path}")
+                if raise_error:
+                    raise e
                 with open(self.err_logs_path, "a") as f_out:
                     f_out.write(f"{datetime.now()} >>> {task.description['name']}\n")
                     f_out.write(traceback.format_exc())

diff --git a/mteb/tasks/BitextMining/BornholmskBitextMining.py b/mteb/tasks/BitextMining/BornholmskBitextMining.py
@@ -0,0 +1,41 @@
+import datasets
+
+from mteb.abstasks import AbsTaskBitextMining
+
+
+class BornholmBitextMining(AbsTaskBitextMining):
+    """Bitext mining task over the Danish / Bornholmsk parallel corpus."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        summary = (
+            "Danish Bornholmsk Parallel Corpus. "
+            "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. "
+            "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden."
+        )
+        return {
+            "name": "BornholmBitextMining",
+            "hf_hub_name": "strombergnlp/bornholmsk_parallel",
+            "description": summary,
+            "reference": "https://aclanthology.org/W19-6138/",
+            "type": "BitextMining",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["da", "da-bornholm"],
+            "main_score": "f1",
+            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
+        }
+
+    def load_data(self, **kwargs):
+        """
+        Fetch the dataset from the HuggingFace hub at the revision given in the
+        task description and convert it to the standard format.
+
+        Does nothing when the data has already been loaded.
+        """
+        if self.data_loaded:
+            return
+
+        meta = self.description
+        self.dataset = datasets.load_dataset(meta["hf_hub_name"], revision=meta.get("revision"))
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self):
+        """Rename the two language columns to ``sentence1`` / ``sentence2``."""
+        column_map = (("da", "sentence1"), ("da_bornholm", "sentence2"))
+        for old_name, new_name in column_map:
+            self.dataset = self.dataset.rename_column(old_name, new_name)
diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py
@@ -1,2 +1,3 @@
 from .BUCCBitextMining import *
 from .TatoebaBitextMining import *
+from .BornholmskBitextMining import *
diff --git a/mteb/tasks/Classification/AngryTweetsClassification.py b/mteb/tasks/Classification/AngryTweetsClassification.py
@@ -0,0 +1,20 @@
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+
+
+class AngryTweetsClassification(AbsTaskClassification):
+    """Sentiment classification of Danish tweets (AngryTweets)."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        task_metadata = {
+            # Identity and provenance
+            "name": "AngryTweetsClassification",
+            "hf_hub_name": "DDSC/angry-tweets",
+            "description": "A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
+            "reference": "https://aclanthology.org/2021.nodalida-main.53/",
+            # Evaluation scope
+            "eval_splits": ["test"],
+            "eval_langs": ["da"],
+            "type": "Classification",
+            "category": "s2s",
+            # Scoring and sampling settings
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            # Pinned dataset revision
+            "revision": "20b0e6081892e78179356fada741b7afa381443d",
+        }
+        return task_metadata
diff --git a/mteb/tasks/Classification/DKHateClassification.py b/mteb/tasks/Classification/DKHateClassification.py
@@ -0,0 +1,41 @@
+import datasets
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+
+
+class DKHateClassification(AbsTaskClassification):
+    """Classification of Danish tweets annotated for offensive language (DKHate)."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        return {
+            "name": "DKHateClassification",
+            "hf_hub_name": "DDSC/dkhate",
+            "description": "Danish Tweets annotated for Hate Speech either being Offensive or not",
+            "reference": "https://aclanthology.org/2020.lrec-1.430/",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["da"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            "revision": "59d12749a3c91a186063c7d729ec392fda94681c",
+        }
+
+    def load_data(self, **kwargs):
+        """
+        Load the dataset from the HuggingFace hub at the revision given in the
+        task description, then convert its labels to integer indices.
+
+        Does nothing when the data has already been loaded.
+        """
+        if self.data_loaded:
+            return
+
+        self.dataset = datasets.load_dataset(
+            self.description["hf_hub_name"], revision=self.description.get("revision", None)
+        )
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self):
+        """
+        Replace the raw labels with integer class indices.
+
+        The mapping is derived from the distinct labels of the train split and
+        applied to every split.
+        """
+        labels = self.dataset["train"]["label"]  # type: ignore
+        # Enumerate the distinct labels in sorted order so that the label -> index
+        # assignment is identical on every run. Iterating a bare set of strings
+        # yields an order that changes with hash randomization, which made the
+        # class indices differ between interpreter sessions.
+        # NOTE(review): assumes the labels are mutually orderable (e.g. all strings).
+        lab2idx = {lab: idx for idx, lab in enumerate(sorted(set(labels)))}
+        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
diff --git a/mteb/tasks/Classification/DalajClassification.py b/mteb/tasks/Classification/DalajClassification.py
@@ -0,0 +1,59 @@
+# SuperLIM tasks
+import datasets
+
+from mteb.abstasks import AbsTaskClassification
+
+
+class DalajClassification(AbsTaskClassification):
+    """Linguistic acceptability classification on the DaLAJ subset of SuperLim."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        return {
+            "name": "DalajClassification",
+            "hf_hub_name": "AI-Sweden/SuperLim",
+            "description": "A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.",
+            "reference": "https://spraakbanken.gu.se/en/resources/superlim",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["sv"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
+        }
+
+    def load_data(self, **kwargs):
+        """
+        Load the "dalaj" subset from the HuggingFace hub at the revision given in
+        the task description, then reshape it into text/label form.
+
+        Does nothing when the data has already been loaded.
+        """
+        if self.data_loaded:
+            return
+
+        meta = self.description
+        # "dalaj" selects the relevant subset of the SuperLim collection.
+        self.dataset = datasets.load_dataset(meta["hf_hub_name"], "dalaj", revision=meta.get("revision"))
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self):
+        """
+        Turn sentence pairs into a flat text/label classification dataset.
+
+        The two relevant columns are "original_sentence" and "corrected_sentence".
+        Each original sentence becomes a sample labelled 1 (the "wrong" sentence)
+        and each corrected sentence becomes a sample labelled 0 (the "correct"
+        sentence). All other columns are dropped.
+        """
+        sentence_columns = ["original_sentence", "corrected_sentence"]
+
+        def _to_classification_batch(batch):
+            # Batched mapping: emit all originals first, then all corrections.
+            originals = batch["original_sentence"]
+            corrections = batch["corrected_sentence"]
+            return {
+                "text": originals + corrections,
+                "label": [1] * len(originals) + [0] * len(corrections),
+            }
+
+        # Strip every column except the two sentence columns from each split.
+        for split in self.dataset:
+            split_data = self.dataset[split]
+            extra_columns = [
+                name for name in split_data.column_names if name not in sentence_columns  # type: ignore
+            ]
+            self.dataset[split] = split_data.remove_columns(extra_columns)  # type: ignore
+
+        self.dataset = self.dataset.map(
+            _to_classification_batch, batched=True, remove_columns=sentence_columns
+        )
diff --git a/mteb/tasks/Classification/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/DanishPoliticalCommentsClassification.py
@@ -0,0 +1,42 @@
+import datasets
+
+from mteb.abstasks import AbsTaskClassification
+
+
+class DanishPoliticalCommentsClassification(AbsTaskClassification):
+    """Sentiment classification of Danish political comments."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        # NOTE(review): eval_splits is ["train"] even though dataset_transform
+        # creates a held-out "test" split. If the classifier is also fitted on
+        # "train", this evaluates on training data; confirm this is intended.
+        task_metadata = {
+            "name": "DanishPoliticalCommentsClassification",
+            "hf_hub_name": "danish_political_comments",
+            "description": "A dataset of Danish political comments rated for sentiment",
+            "reference": "NA",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["train"],
+            "eval_langs": ["da"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            "revision": "edbb03726c04a0efab14fc8c3b8b79e4d420e5a1",
+        }
+        return task_metadata
+
+    def load_data(self, **kwargs):
+        """
+        Load the dataset from the HuggingFace hub at the revision given in the
+        task description, then rename its columns and split it.
+
+        Does nothing when the data has already been loaded.
+        """
+        if self.data_loaded:
+            return
+
+        meta = self.description
+        self.dataset = datasets.load_dataset(meta["hf_hub_name"], revision=meta.get("revision"))
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self):
+        """
+        Rename "sentence"/"target" to "text"/"label" and split the "train" data
+        into train and test parts, seeded with ``self.seed``.
+        """
+        renamed = self.dataset.rename_column("sentence", "text").rename_column("target", "label")
+
+        # The hub dataset's "train" data is divided here to create train and test splits.
+        self.dataset = renamed["train"].train_test_split(0.2, seed=self.seed)
diff --git a/mteb/tasks/Classification/LccSentimentClassification.py b/mteb/tasks/Classification/LccSentimentClassification.py
@@ -0,0 +1,20 @@
+from mteb.abstasks import AbsTaskClassification
+
+
+class LccSentimentClassification(AbsTaskClassification):
+    """Sentiment classification on the Leipzig Corpora Collection (LCC) data."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        task_metadata = {
+            # Identity and provenance
+            "name": "LccSentimentClassification",
+            "hf_hub_name": "DDSC/lcc",
+            "description": "The leipzig corpora collection, annotated for sentiment",
+            "reference": "https://github.com/fnielsen/lcc-sentiment",
+            # Task kind and evaluation scope
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["da"],
+            # Scoring and sampling settings
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            # Pinned dataset revision
+            "revision": "de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6",
+        }
+        return task_metadata
diff --git a/mteb/tasks/Classification/NoRecClassification.py b/mteb/tasks/Classification/NoRecClassification.py
@@ -0,0 +1,20 @@
+from mteb.abstasks import AbsTaskClassification
+
+
+class NoRecClassification(AbsTaskClassification):
+    """Sentiment classification of Norwegian reviews (NoReC)."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        task_metadata = {
+            "name": "NoRecClassification",
+            # Using the mini version to keep results ~comparable to the
+            # ScandEval benchmark.
+            "hf_hub_name": "ScandEval/norec-mini",
+            "description": "A Norwegian dataset for sentiment classification on review",
+            "reference": "https://aclanthology.org/L18-1661/",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["nb"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            "revision": "07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
+        }
+        return task_metadata
diff --git a/mteb/tasks/Classification/NordicLangClassification.py b/mteb/tasks/Classification/NordicLangClassification.py
@@ -0,0 +1,39 @@
+import datasets
+
+from mteb.abstasks import AbsTaskClassification
+
+
+class NordicLangClassification(AbsTaskClassification):
+    """Language identification across Nordic languages."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        task_metadata = {
+            "name": "NordicLangClassification",
+            "hf_hub_name": "strombergnlp/nordic_langid",
+            "description": "A dataset for Nordic language identification.",
+            "reference": "https://aclanthology.org/2021.vardial-1.8/",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test"],
+            "eval_langs": ["da", "sv", "nb", "nn", "is", "fo"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 32,
+            "revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
+        }
+        return task_metadata
+
+    def load_data(self, **kwargs):
+        """
+        Load the "10k" subset from the HuggingFace hub at the revision given in
+        the task description, then rename its columns.
+
+        Does nothing when the data has already been loaded.
+        """
+        if self.data_loaded:
+            return
+
+        meta = self.description
+        # "10k" selects the relevant subset of the dataset.
+        self.dataset = datasets.load_dataset(meta["hf_hub_name"], "10k", revision=meta.get("revision"))
+        self.dataset_transform()
+        self.data_loaded = True
+
+    def dataset_transform(self):
+        """Rename "sentence"/"language" to "text"/"label"."""
+        column_map = (("sentence", "text"), ("language", "label"))
+        for old_name, new_name in column_map:
+            self.dataset = self.dataset.rename_column(old_name, new_name)
diff --git a/mteb/tasks/Classification/NorwegianParliamentClassification.py b/mteb/tasks/Classification/NorwegianParliamentClassification.py
@@ -0,0 +1,20 @@
+from mteb.abstasks import AbsTaskClassification
+
+
+class NorwegianParliamentClassification(AbsTaskClassification):
+    """Sentiment classification of Norwegian parliament speeches."""
+
+    @property
+    def description(self):
+        """Return the metadata dictionary describing this task."""
+        task_metadata = {
+            "name": "NorwegianParliament",
+            "hf_hub_name": "NbAiLab/norwegian_parliament",
+            "description": "Norwegian parliament speeches annotated for sentiment",
+            "reference": "https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
+            "type": "Classification",
+            "category": "s2s",
+            "eval_splits": ["test", "validation"],
+            # The language is assumed to be Bokmål.
+            "eval_langs": ["nb"],
+            "main_score": "accuracy",
+            "n_experiments": 10,
+            "samples_per_label": 16,
+            "revision": "f7393532774c66312378d30b197610b43d751972",
+        }
+        return task_metadata