Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Scandinavian Languages #124

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 91 additions & 76 deletions README.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,11 @@ def _evaluate_split(self, model, data_split, **kwargs):

evaluator = BitextMiningEvaluator(sentence1, sentence2, gold, **kwargs)
metrics = evaluator(model)
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores):
    """Expose the task's main metric under the standard "main_score" key.

    Looks up the metric named by ``self.description["main_score"]`` in
    *scores* and copies it to ``scores["main_score"]`` in place. If the
    metric is absent, a warning is printed and *scores* is left untouched.
    """
    main_metric = self.description["main_score"]
    try:
        scores["main_score"] = scores[main_metric]
    except KeyError:
        print(f"WARNING: main score {main_metric} not found in scores {scores.keys()}")
40 changes: 40 additions & 0 deletions mteb/tasks/BitextMining/BornholmskBitextMining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import datasets

from mteb.abstasks import AbsTaskBitextMining


class BornholmBitextMining(AbsTaskBitextMining):
    """Bitext mining between standard Danish and the Bornholmsk dialect."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "BornholmBitextMining",
            "hf_hub_name": "strombergnlp/bornholmsk_parallel",
            "description": "Danish Bornholmsk Parallel Corpus. "
            + "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. "
            + "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
            "reference": "https://aclanthology.org/W19-6138/",
            "type": "BitextMining",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da", "da-bornholm"],
            "main_score": "f1",
            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub and convert it to the standard format.
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None)
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard bitext-mining (sentence1, sentence2) schema."""
        # BUG FIX: rename_column returns a NEW dataset rather than mutating in
        # place, so the result must be re-assigned — the original code discarded
        # it and the sentence1/sentence2 columns never existed.
        self.dataset = self.dataset.rename_column("da", "sentence1")
        self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/AngryTweetsClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class AngryTweetsClassification(AbsTaskClassification):
    """Sentiment classification task over Danish tweets (AngryTweets)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="AngryTweetsClassification",
            hf_hub_name="DDSC/angry-tweets",
            description="A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
            reference="https://aclanthology.org/2021.nodalida-main.53/",
            eval_splits=["test"],
            eval_langs=["da"],
            type="Classification",
            category="s2s",
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="20b0e6081892e78179356fada741b7afa381443d",
        )
41 changes: 41 additions & 0 deletions mteb/tasks/Classification/DKHateClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import datasets

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class DKHateClassification(AbsTaskClassification):
    """Offensive-language (hate speech) classification of Danish tweets (DKHate)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "DKHateClassification",
            "hf_hub_name": "DDSC/dkhate",
            "description": "Danish Tweets annotated for Hate Speech either being Offensive or not",
            "reference": "https://aclanthology.org/2020.lrec-1.430/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "59d12749a3c91a186063c7d729ec392fda94681c",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None)
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Map the string class labels to integer ids across all splits."""
        # BUG FIX: the original enumerated an unsorted set(), so the
        # label -> id mapping changed between runs (hash randomization).
        # Sorting the label vocabulary makes the mapping deterministic.
        labels = self.dataset["train"]["label"]  # type: ignore
        lab2idx = {lab: idx for idx, lab in enumerate(sorted(set(labels)))}
        # NOTE(review): the vocabulary is built from the train split only —
        # assumes test labels are a subset of train labels; confirm on the hub.
        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
59 changes: 59 additions & 0 deletions mteb/tasks/Classification/DalajClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# SuperLIM tasks
import datasets

from mteb.abstasks import AbsTaskClassification


class DalajClassification(AbsTaskClassification):
    """Swedish linguistic-acceptability classification (DaLAJ, part of SuperLim)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "DalajClassification",
            "hf_hub_name": "AI-Sweden/SuperLim",
            # typo fixed: "accebtablity" -> "acceptability"
            "description": "A Swedish dataset for linguistic acceptability. Available as a part of Superlim.",
            "reference": "https://spraakbanken.gu.se/en/resources/superlim",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"],
            "dalaj",  # chose the relevant subset
            revision=self.description.get("revision"),
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """
        This dataset consists of two columns of relevance, "original_sentence" and "corrected_sentence".
        We use the original sentence as the "wrong" sample (label 1) and the
        corrected sentence as the "correct" sample (label 0).
        """

        # Single leading underscore: a double-underscore name here would be
        # name-mangled by the enclosing class body, which is confusing for no gain.
        def _convert_sample_to_classification(sample):
            # Batched map: each value is a list, so "+" concatenates all
            # originals followed by all corrections in one output batch.
            text = sample["original_sentence"] + sample["corrected_sentence"]
            label = [1] * len(sample["original_sentence"]) + [0] * len(sample["corrected_sentence"])
            return {"text": text, "label": label}

        columns_to_keep = ["original_sentence", "corrected_sentence"]
        for split in self.dataset:
            columns_names = self.dataset[split].column_names  # type: ignore
            columns_to_remove = [col for col in columns_names if col not in columns_to_keep]
            self.dataset[split] = self.dataset[split].remove_columns(columns_to_remove)  # type: ignore

        self.dataset = self.dataset.map(
            _convert_sample_to_classification, batched=True, remove_columns=columns_to_keep
        )
39 changes: 39 additions & 0 deletions mteb/tasks/Classification/DanishPoliticalCommentsClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class DanishPoliticalCommentsClassification(AbsTaskClassification):
    """Sentiment classification of Danish political comments."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="DanishPoliticalCommentsClassification",
            hf_hub_name="danish_political_comments",
            description="A dataset of Danish political comments rated for sentiment",
            reference="NA",
            type="Classification",
            category="s2s",
            eval_splits=["train"],
            eval_langs=["da"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="edbb03726c04a0efab14fc8c3b8b79e4d420e5a1",
        )

    def load_data(self, **kwargs):
        """Load the dataset from the HuggingFace hub (no-op if already loaded)."""
        if self.data_loaded:
            return

        meta = self.description
        self.dataset = datasets.load_dataset(meta["hf_hub_name"], revision=meta.get("revision"))
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard (text, label) classification schema."""
        for old_name, new_name in (("sentence", "text"), ("target", "label")):
            self.dataset = self.dataset.rename_column(old_name, new_name)
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/LccClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class LccClassification(AbsTaskClassification):
    """Sentiment classification over the Danish Leipzig corpora collection."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="LccClassification",
            hf_hub_name="DDSC/lcc",
            description="The leipzig corpora collection, annotated for sentiment",
            reference="https://github.com/fnielsen/lcc-sentiment",
            type="Classification",
            category="s2s",
            eval_splits=["test"],
            eval_langs=["da"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6",
        )
20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NoRecClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NoRecClassification(AbsTaskClassification):
    """Norwegian sentiment classification on reviews (NoReC)."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        # The "mini" variant is used to keep results ~comparable to the
        # ScandEval benchmark.
        return dict(
            name="NoRecClassification",
            hf_hub_name="ScandEval/norec-mini",
            description="A Norwegian dataset for sentiment classification on review",
            reference="https://aclanthology.org/L18-1661/",
            type="Classification",
            category="s2s",
            eval_splits=["test"],
            eval_langs=["no"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
        )
42 changes: 42 additions & 0 deletions mteb/tasks/Classification/NordicLangClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import datasets

from mteb.abstasks import AbsTaskClassification


class NordicLangClassification(AbsTaskClassification):
    """Language identification across six written Nordic languages."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return {
            "name": "NordicLangClassification",
            "hf_hub_name": "strombergnlp/nordic_langid",
            "description": "A dataset for Nordic language identification.",
            "reference": "https://aclanthology.org/2021.vardial-1.8/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            # BUG FIX: the original list contained "no" twice. The nordic_langid
            # dataset distinguishes six classes — Danish, Swedish, Bokmål,
            # Nynorsk, Icelandic and Faroese — so the duplicate is replaced by
            # Nynorsk ("nn"); verify against the dataset card.
            "eval_langs": ["da", "sv", "nb", "nn", "is", "fo"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"],
            "10k",  # select relevant subset
            revision=self.description.get("revision"),
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to the standard (text, label) classification schema."""
        self.dataset = self.dataset.rename_column("sentence", "text")
        self.dataset = self.dataset.rename_column("language", "label")

20 changes: 20 additions & 0 deletions mteb/tasks/Classification/NorwegianParlaimentClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mteb.abstasks import AbsTaskClassification


class NorwegianParliamentClassification(AbsTaskClassification):
    """Sentiment classification of Norwegian parliament speeches."""

    @property
    def description(self):
        """Task metadata in the standard MTEB description format."""
        return dict(
            name="NorwegianParliament",
            hf_hub_name="NbAiLab/norwegian_parliament",
            description="Norwegian parliament speeches annotated for sentiment",
            reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
            type="Classification",
            category="s2s",
            eval_splits=["test", "validation"],
            eval_langs=["no"],
            main_score="accuracy",
            n_experiments=10,
            samples_per_label=16,
            revision="f7393532774c66312378d30b197610b43d751972",
        )
Loading