Added support for Scandinavian Languages (#124)
* Make sure that main score is added to bitext mining tasks

* Added Scandinavian languages: da, no, sv

* Updated the README with the Scandinavian tasks

* Changed the number of samples for the Nordic language classification task

* Added Scandinavian models to __init__

* Added error logs to .gitignore

* Fix import error

* Fix dataset columns

* Rename dataset columns

* Remove SweFAQ

* fix: Added functionality to raise error

* fix: Updated names

* fix: Removed "no" as a language code

* Added missing data transformation

* Fix spelling error
KennethEnevoldsen authored Jul 29, 2023
1 parent c50b8ab commit acb0f59
Showing 17 changed files with 605 additions and 79 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -138,3 +138,6 @@ dmypy.json

# Pyre type checker
.pyre/

# error logs
error_logs.txt
167 changes: 91 additions & 76 deletions README.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -50,4 +50,11 @@ def _evaluate_split(self, model, data_split, **kwargs):

evaluator = BitextMiningEvaluator(sentence1, sentence2, gold, **kwargs)
metrics = evaluator(model)
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores):
if self.description["main_score"] in scores:
scores["main_score"] = scores[self.description["main_score"]]
else:
print(f"WARNING: main score {self.description['main_score']} not found in scores {scores.keys()}")
17 changes: 14 additions & 3 deletions mteb/evaluation/MTEB.py
@@ -79,9 +79,9 @@ def _extend_lang_pairs(self):
# add all possible language pairs
langs = set(self._task_langs)
for x in langs:
if '-' not in x:
if "-" not in x:
for y in langs:
if '-' not in y:
if "-" not in y:
pair = f"{x}-{y}"
if pair not in langs:
self._task_langs.append(pair)
@@ -183,7 +183,14 @@ def load_tasks_data(self):
task.load_data()

def run(
self, model, verbosity=1, output_folder="results/result", eval_splits=None, overwrite_results=False, **kwargs
self,
model,
verbosity=1,
output_folder="results/result",
eval_splits=None,
overwrite_results=False,
raise_error: bool = True,
**kwargs
):
"""
Run the evaluation pipeline on the selected tasks.
@@ -199,6 +206,8 @@ def run(
2: print everything (including datasets loading)
output_folder: str
Folder where the results will be saved
raise_error: bool
Whether to raise an error if an exception occurs during evaluation.
:return: Returns a dictionary of task names and corresponding metrics results.
"""
# Set logging
@@ -259,6 +268,8 @@ def run(
except Exception as e:
logger.error(f"Error while evaluating {task.description['name']}: {e}")
logger.error(f"Please check all the error logs at: {self.err_logs_path}")
if raise_error:
raise e
with open(self.err_logs_path, "a") as f_out:
f_out.write(f"{datetime.now()} >>> {task.description['name']}\n")
f_out.write(traceback.format_exc())
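For illustration, the new flag can be used like this (a minimal sketch; the model name is a placeholder and any SentenceTransformer-style encoder would do):

# Illustrative usage of the new raise_error flag.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
evaluation = MTEB(tasks=["BornholmBitextMining"])
# With raise_error=False, failures are written to the error log and the run continues;
# with the default raise_error=True, the first exception aborts the run.
evaluation.run(model, output_folder="results/example", raise_error=False)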
41 changes: 41 additions & 0 deletions mteb/tasks/BitextMining/BornholmskBitextMining.py
@@ -0,0 +1,41 @@
import datasets

from mteb.abstasks import AbsTaskBitextMining


class BornholmBitextMining(AbsTaskBitextMining):
@property
def description(self):
return {
"name": "BornholmBitextMining",
"hf_hub_name": "strombergnlp/bornholmsk_parallel",
"description": "Danish Bornholmsk Parallel Corpus. "
+ "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. "
+ "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
"reference": "https://aclanthology.org/W19-6138/",
"type": "BitextMining",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["da", "da-bornholm"],
"main_score": "f1",
"revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
}

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub and convert it to the standard format.
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.description["hf_hub_name"],
revision=self.description.get("revision", None),
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
# Convert to standard format
self.dataset = self.dataset.rename_column("da", "sentence1")
self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")
1 change: 1 addition & 0 deletions mteb/tasks/BitextMining/__init__.py
@@ -1,2 +1,3 @@
from .BUCCBitextMining import *
from .TatoebaBitextMining import *
from .BornholmskBitextMining import *
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/AngryTweetsClassification.py
@@ -0,0 +1,20 @@
from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class AngryTweetsClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "AngryTweetsClassification",
"hf_hub_name": "DDSC/angry-tweets",
"description": "A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
"reference": "https://aclanthology.org/2021.nodalida-main.53/",
"eval_splits": ["test"],
"eval_langs": ["da"],
"type": "Classification",
"category": "s2s",
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "20b0e6081892e78179356fada741b7afa381443d",
}
41 changes: 41 additions & 0 deletions mteb/tasks/Classification/DKHateClassification.py
@@ -0,0 +1,41 @@
import datasets

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class DKHateClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "DKHateClassification",
"hf_hub_name": "DDSC/dkhate",
"description": "Danish Tweets annotated for Hate Speech either being Offensive or not",
"reference": "https://aclanthology.org/2020.lrec-1.430/",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["da"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "59d12749a3c91a186063c7d729ec392fda94681c",
}

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.description["hf_hub_name"], revision=self.description.get("revision", None)
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
# convert label to a 0/1 label
labels = self.dataset["train"]["label"] # type: ignore
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
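The transform replaces the original string labels with integer ids. A tiny sketch of the mapping (the label names are hypothetical; note that building lab2idx from set() leaves the 0/1 assignment up to set iteration order):

# Hypothetical label column with offensive/not-offensive tags.
labels = ["NOT", "OFF", "NOT", "OFF"]
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
# e.g. {"NOT": 0, "OFF": 1} or {"OFF": 0, "NOT": 1}, depending on set order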
59 changes: 59 additions & 0 deletions mteb/tasks/Classification/DalajClassification.py
@@ -0,0 +1,59 @@
# SuperLIM tasks
import datasets

from mteb.abstasks import AbsTaskClassification


class DalajClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "DalajClassification",
"hf_hub_name": "AI-Sweden/SuperLim",
"description": "A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.",
"reference": "https://spraakbanken.gu.se/en/resources/superlim",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["sv"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
}

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.description["hf_hub_name"],
"dalaj", # chose the relevant subset
revision=self.description.get("revision"),
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
"""
This dataset consists of two relevant columns, "original_sentence" and "corrected_sentence".
We use the original sentence as the "wrong" sentence and the corrected sentence as the "correct" sentence.
"""

def __convert_sample_to_classification(sample):
text = sample["original_sentence"] + sample["corrected_sentence"]
label = [1] * len(sample["original_sentence"]) + [0] * len(sample["corrected_sentence"])
return {"text": text, "label": label}

columns_to_keep = ["original_sentence", "corrected_sentence"]
for split in self.dataset:
columns_names = self.dataset[split].column_names # type: ignore
columns_to_remove = [col for col in columns_names if col not in columns_to_keep]
self.dataset[split] = self.dataset[split].remove_columns(columns_to_remove) # type: ignore

self.dataset = self.dataset.map(
__convert_sample_to_classification, batched=True, remove_columns=columns_to_keep
)
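Because the map runs with batched=True, sample["original_sentence"] and sample["corrected_sentence"] are lists, so the transform doubles the number of rows: the original (uncorrected) sentences get label 1 and the corrected sentences get label 0. A tiny worked example with made-up sentences:

# Illustrative only: __convert_sample_to_classification on a batch of two rows.
sample = {
    "original_sentence": ["Jag gick till skola", "Han har två hund"],
    "corrected_sentence": ["Jag gick till skolan", "Han har två hundar"],
}
text = sample["original_sentence"] + sample["corrected_sentence"]  # 4 texts
label = [1] * len(sample["original_sentence"]) + [0] * len(sample["corrected_sentence"])
# label == [1, 1, 0, 0]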
42 changes: 42 additions & 0 deletions mteb/tasks/Classification/DanishPoliticalCommentsClassification.py
@@ -0,0 +1,42 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class DanishPoliticalCommentsClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "DanishPoliticalCommentsClassification",
"hf_hub_name": "danish_political_comments",
"description": "A dataset of Danish political comments rated for sentiment",
"reference": "NA",
"type": "Classification",
"category": "s2s",
"eval_splits": ["train"],
"eval_langs": ["da"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "edbb03726c04a0efab14fc8c3b8b79e4d420e5a1",
}

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.description["hf_hub_name"], revision=self.description.get("revision")
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
self.dataset = self.dataset.rename_column("sentence", "text")
self.dataset = self.dataset.rename_column("target", "label")

# create train and test splits
self.dataset = self.dataset["train"].train_test_split(0.2, seed=self.seed)
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/LccSentimentClassification.py
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class LccSentimentClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "LccSentimentClassification",
"hf_hub_name": "DDSC/lcc",
"description": "The leipzig corpora collection, annotated for sentiment",
"reference": "https://github.com/fnielsen/lcc-sentiment",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["da"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6",
}
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NoRecClassification.py
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NoRecClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "NoRecClassification",
"hf_hub_name": "ScandEval/norec-mini", # Using the mini version to keep results ~comparable to the ScandEval benchmark
"description": "A Norwegian dataset for sentiment classification on review",
"reference": "https://aclanthology.org/L18-1661/",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["nb"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
}
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/NordicLangClassification.py
@@ -0,0 +1,39 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class NordicLangClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "NordicLangClassification",
"hf_hub_name": "strombergnlp/nordic_langid",
"description": "A dataset for Nordic language identification.",
"reference": "https://aclanthology.org/2021.vardial-1.8/",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["da", "sv", "nb", "nn", "is", "fo"],
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 32,
"revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
}

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.description["hf_hub_name"], "10k", revision=self.description.get("revision") # select relevant subset
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
self.dataset = self.dataset.rename_column("sentence", "text")
self.dataset = self.dataset.rename_column("language", "label")
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NorwegianParliamentClassification.py
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NorwegianParliamentClassification(AbsTaskClassification):
@property
def description(self):
return {
"name": "NorwegianParliament",
"hf_hub_name": "NbAiLab/norwegian_parliament",
"description": "Norwegian parliament speeches annotated for sentiment",
"reference": "https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
"type": "Classification",
"category": "s2s",
"eval_splits": ["test", "validation"],
"eval_langs": ["nb"], # assumed to be bokmål
"main_score": "accuracy",
"n_experiments": 10,
"samples_per_label": 16,
"revision": "f7393532774c66312378d30b197610b43d751972",
}