-
Notifications
You must be signed in to change notification settings - Fork 290
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added support for Scandinavian Languages (#124)
* Make sure that main score is added to bitext mining tasks * Added scandinavian languages: da, no, sv * Updated readme with scandinavian tasks * Changes n samples for the nordic lang CLF * Added scandinavian models to init * Added error logs to gitignore * fix import error * fix dataset columns * rename dataset columns * remove swefaq * fix: Added functionality to raise error * fix: Updated names * fix: Removed no as a language * Added missing data transformation * Fix spelling error
- Loading branch information
1 parent
c50b8ab
commit acb0f59
Showing
17 changed files
with
605 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,3 +138,6 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# error logs | ||
error_logs.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskBitextMining | ||
|
||
|
||
class BornholmBitextMining(AbsTaskBitextMining):
    """Bitext mining task built on the Danish / Bornholmsk parallel corpus."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        summary = (
            "Danish Bornholmsk Parallel Corpus. "
            "Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. "
            "Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden."
        )
        return {
            "name": "BornholmBitextMining",
            "hf_hub_name": "strombergnlp/bornholmsk_parallel",
            "description": summary,
            "reference": "https://aclanthology.org/W19-6138/",
            "type": "BitextMining",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da", "da-bornholm"],
            "main_score": "f1",
            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
        }

    def load_data(self, **kwargs):
        """
        Fetch the corpus from the HuggingFace hub (once) and bring it into the
        column layout produced by ``dataset_transform``.
        """
        if not self.data_loaded:
            meta = self.description
            self.dataset = datasets.load_dataset(
                meta["hf_hub_name"],
                revision=meta.get("revision", None),
            )
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self):
        """Rename the language columns to ``sentence1`` / ``sentence2``."""
        column_renames = (
            ("da", "sentence1"),
            ("da_bornholm", "sentence2"),
        )
        for old_name, new_name in column_renames:
            self.dataset = self.dataset.rename_column(old_name, new_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .BUCCBitextMining import * | ||
from .TatoebaBitextMining import * | ||
from .BornholmskBitextMining import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mteb.abstasks.AbsTaskClassification import AbsTaskClassification | ||
|
||
|
||
class AngryTweetsClassification(AbsTaskClassification):
    """Three-class sentiment classification of Danish tweets."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        metadata = {
            "name": "AngryTweetsClassification",
            "hf_hub_name": "DDSC/angry-tweets",
            "description": "A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets",
            "reference": "https://aclanthology.org/2021.nodalida-main.53/",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "type": "Classification",
            "category": "s2s",
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "20b0e6081892e78179356fada741b7afa381443d",
        }
        return metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import datasets | ||
|
||
from mteb.abstasks.AbsTaskClassification import AbsTaskClassification | ||
|
||
|
||
class DKHateClassification(AbsTaskClassification):
    """Offensive / not-offensive classification of Danish tweets (DKHate)."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        return {
            "name": "DKHateClassification",
            "hf_hub_name": "DDSC/dkhate",
            "description": "Danish Tweets annotated for Hate Speech either being Offensive or not",
            "reference": "https://aclanthology.org/2020.lrec-1.430/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "59d12749a3c91a186063c7d729ec392fda94681c",
        }

    def load_data(self, **kwargs):
        """
        Load dataset from HuggingFace hub and encode its labels as integers.

        Does nothing when the data has already been loaded.
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None)
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Replace the raw ``label`` values with integer ids in every split.

        The label vocabulary is taken from the ``train`` split. Ids are
        assigned in sorted label order so the mapping is identical on every
        run; iterating a bare ``set`` of strings would yield an order that
        depends on per-process hash randomization.
        """
        labels = self.dataset["train"]["label"]  # type: ignore
        # NOTE(review): assumes all labels are mutually comparable (e.g. all
        # strings) so that sorted() is well defined - confirm against the data.
        lab2idx = {lab: idx for idx, lab in enumerate(sorted(set(labels)))}
        self.dataset = self.dataset.map(lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# SuperLIM tasks | ||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class DalajClassification(AbsTaskClassification):
    """Swedish linguistic-acceptability classification (DaLAJ subset of SuperLim)."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        # NOTE(review): "accebtablity" below is a typo for "acceptability";
        # it is left untouched here because the string is runtime metadata.
        metadata = {
            "name": "DalajClassification",
            "hf_hub_name": "AI-Sweden/SuperLim",
            "description": "A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.",
            "reference": "https://spraakbanken.gu.se/en/resources/superlim",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
        }
        return metadata

    def load_data(self, **kwargs):
        """
        Fetch the "dalaj" subset from the HuggingFace hub (once) and convert
        it to text/label form via ``dataset_transform``.
        """
        if not self.data_loaded:
            meta = self.description
            self.dataset = datasets.load_dataset(
                meta["hf_hub_name"],
                "dalaj",  # choose the relevant subset
                revision=meta.get("revision"),
            )
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self):
        """
        Turn sentence pairs into single-sentence classification examples.

        Only the "original_sentence" and "corrected_sentence" columns are used.
        Each original sentence becomes an example labelled 1 (the "wrong"
        sentence) and each corrected sentence an example labelled 0 (the
        "correct" sentence).
        """
        pair_columns = ["original_sentence", "corrected_sentence"]

        def to_classification_batch(batch):
            originals = batch["original_sentence"]
            corrections = batch["corrected_sentence"]
            return {
                "text": originals + corrections,
                "label": [1] * len(originals) + [0] * len(corrections),
            }

        # Drop every other column first, split by split.
        for split in self.dataset:
            current = self.dataset[split]  # type: ignore
            surplus = [name for name in current.column_names if name not in pair_columns]
            self.dataset[split] = current.remove_columns(surplus)  # type: ignore

        self.dataset = self.dataset.map(
            to_classification_batch,
            batched=True,
            remove_columns=pair_columns,
        )
42 changes: 42 additions & 0 deletions
42
mteb/tasks/Classification/DanishPoliticalCommentsClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class DanishPoliticalCommentsClassification(AbsTaskClassification):
    """Sentiment classification of Danish political comments."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        # NOTE(review): "eval_splits" names "train" even though
        # dataset_transform() creates a separate "test" split - confirm that
        # evaluating on "train" is intended.
        metadata = {
            "name": "DanishPoliticalCommentsClassification",
            "hf_hub_name": "danish_political_comments",
            "description": "A dataset of Danish political comments rated for sentiment",
            "reference": "NA",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["train"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "edbb03726c04a0efab14fc8c3b8b79e4d420e5a1",
        }
        return metadata

    def load_data(self, **kwargs):
        """
        Fetch the dataset from the HuggingFace hub (once) and reshape it via
        ``dataset_transform``.
        """
        if not self.data_loaded:
            meta = self.description
            self.dataset = datasets.load_dataset(meta["hf_hub_name"], revision=meta.get("revision"))
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self):
        """Rename columns to ``text`` / ``label`` and carve out a test split."""
        renamed = self.dataset.rename_column("sentence", "text")
        renamed = renamed.rename_column("target", "label")

        # The hub dataset is read through its "train" split only; hold out
        # 20% of it as "test", seeded with self.seed.
        self.dataset = renamed["train"].train_test_split(test_size=0.2, seed=self.seed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class LccSentimentClassification(AbsTaskClassification):
    """Sentiment classification on the Leipzig Corpora Collection (Danish)."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        metadata = {
            "name": "LccSentimentClassification",
            "hf_hub_name": "DDSC/lcc",
            "description": "The leipzig corpora collection, annotated for sentiment",
            "reference": "https://github.com/fnielsen/lcc-sentiment",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "de7ba3406ee55ea2cc52a0a41408fa6aede6d3c6",
        }
        return metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class NoRecClassification(AbsTaskClassification):
    """Sentiment classification of Norwegian reviews (NoReC, mini version)."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        # Using the mini version to keep results ~comparable to the ScandEval benchmark
        hub_name = "ScandEval/norec-mini"
        metadata = {
            "name": "NoRecClassification",
            "hf_hub_name": hub_name,
            "description": "A Norwegian dataset for sentiment classification on review",
            "reference": "https://aclanthology.org/L18-1661/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["nb"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "07b99ab3363c2e7f8f87015b01c21f4d9b917ce3",
        }
        return metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class NordicLangClassification(AbsTaskClassification):
    """Language identification across Nordic languages."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        metadata = {
            "name": "NordicLangClassification",
            "hf_hub_name": "strombergnlp/nordic_langid",
            "description": "A dataset for Nordic language identification.",
            "reference": "https://aclanthology.org/2021.vardial-1.8/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da", "sv", "nb", "nn", "is", "fo"],
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 32,
            "revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
        }
        return metadata

    def load_data(self, **kwargs):
        """
        Fetch the "10k" subset from the HuggingFace hub (once) and rename its
        columns via ``dataset_transform``.
        """
        if not self.data_loaded:
            meta = self.description
            self.dataset = datasets.load_dataset(
                meta["hf_hub_name"],
                "10k",  # select relevant subset
                revision=meta.get("revision"),
            )
            self.dataset_transform()
            self.data_loaded = True

    def dataset_transform(self):
        """Rename ``sentence`` to ``text`` and ``language`` to ``label``."""
        column_renames = (
            ("sentence", "text"),
            ("language", "label"),
        )
        for old_name, new_name in column_renames:
            self.dataset = self.dataset.rename_column(old_name, new_name)
20 changes: 20 additions & 0 deletions
20
mteb/tasks/Classification/NorwegianParliamentClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mteb.abstasks import AbsTaskClassification | ||
|
||
|
||
class NorwegianParliamentClassification(AbsTaskClassification):
    """Classification task over Norwegian parliament speeches."""

    @property
    def description(self):
        """Return the metadata dictionary describing this task."""
        # NOTE(review): the task name omits the "Classification" suffix that
        # the class name carries - confirm this is intentional.
        metadata = {
            "name": "NorwegianParliament",
            "hf_hub_name": "NbAiLab/norwegian_parliament",
            "description": "Norwegian parliament speeches annotated for sentiment",
            "reference": "https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test", "validation"],
            "eval_langs": ["nb"],  # assumed to be bokmål
            "main_score": "accuracy",
            "n_experiments": 10,
            "samples_per_label": 16,
            "revision": "f7393532774c66312378d30b197610b43d751972",
        }
        return metadata
Oops, something went wrong.