fix: Added tasks from SEB (#287)
* Added tasks from SEB

* docs: fix link

* fix: ran linting

* fix typing for 3.8

* fixed annotation for v3.8
KennethEnevoldsen authored Mar 29, 2024
1 parent 76056b5 commit 39cff49
Showing 59 changed files with 2,034 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/pull_request_template.md
@@ -18,4 +18,4 @@ if you are not
- [ ] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks)
- [ ] Run tests locally to make sure nothing is broken using `make test`.
- [ ] Run the formatter to format the code using `make lint`.
- - [ ] I have added points for my submission to the [POINTS.md](https://github.com/embeddings-benchmark/mteb/tree/main/docs/mmteb/POINTS.md) file.
+ - [ ] I have added points for my submission to the [POINTS.md](https://github.com/embeddings-benchmark/mteb/blob/main/docs/mmteb/points.md) file.
3 changes: 2 additions & 1 deletion mteb/abstasks/TaskMetadata.py
@@ -17,11 +17,12 @@
"Discourse coherence",
"Language identification",
"Linguistic acceptability",
"Political",
"Political classification",
"Question answering",
"Sentiment/Hate speech",
"Thematic clustering",
"Scientific Reranking",
"Claim verification",
]

TASK_DOMAIN = Literal[
2 changes: 1 addition & 1 deletion mteb/cmd.py
@@ -36,7 +36,7 @@ def _save_model_metadata(
model_meta = {
"model_name": model_name,
"time_of_run": str(datetime.datetime.today()),
"versions": model._model_config["__version__"],
"versions": model._model_config.get("__version__", None),
}

with save_path.open("w") as f:
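The change above swaps direct indexing for dict.get so that metadata saving no longer fails for models whose _model_config lacks a "__version__" entry. A minimal illustrative sketch (not part of the commit; the empty dict merely stands in for such a config):

model_config: dict = {}  # stands in for model._model_config without a "__version__" key

versions = model_config.get("__version__", None)  # returns None instead of raising KeyError
print(versions)  # None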
1 change: 1 addition & 0 deletions mteb/tasks/BitextMining/__init__.py
@@ -6,3 +6,4 @@
from .multilingual.FloresBitextMining import *
from .multilingual.NorwegianCourtsBitextMining import *
from .multilingual.TatoebaBitextMining import *
+ from .nb.norwegian_courts_bitext_mining import *
Empty file.
52 changes: 52 additions & 0 deletions mteb/tasks/BitextMining/nb/norwegian_courts_bitext_mining.py
@@ -0,0 +1,52 @@
from typing import Any

import datasets

from mteb.abstasks import AbsTaskBitextMining
from mteb.abstasks.TaskMetadata import TaskMetadata


class NorwegianCourtsBitextMining(AbsTaskBitextMining):
metadata = TaskMetadata(
name="NorwegianCourtsBitextMining",
hf_hub_name="kaedrodrur/norwegian-courts",
description="Nynorsk and Bokmål parallel corpus from Norwegian courts. ",
reference="https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php",
type="BitextMining",
category="s2s",
eval_splits=["test"],
eval_langs=["nb", "nn"],
main_score="accuracy",
revision="d79af07e969a6678fcbbe819956840425816468f",
date=("2000-01-01", "2020-12-31"), # approximate guess
form=["spoken"],
domains=["Spoken"],
task_subtypes=["Political classification"],
license="openUnder-PSI",
socioeconomic_status="high",
annotations_creators="derived", # best guess
dialect=[],
text_creation="found",
bibtex_citation=None,
n_samples={"test": 456},
avg_character_length={"test": 82.11},
)

def load_data(self, **kwargs: Any) -> None: # noqa: ARG002
"""
Load dataset from HuggingFace hub and convert it to the standard format.
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision", None),
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self) -> None:
# Convert to standard format
self.dataset = self.dataset.rename_column("nb", "sentence1")
self.dataset = self.dataset.rename_column("nn", "sentence2")
3 changes: 3 additions & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -26,5 +26,8 @@
from .fr.MLSUMClusteringS2S import *
from .multilingual.MasakhaNEWSClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringS2S import *
+ from .nb.snl_clustering import *
+ from .nb.vg_clustering import *
from .pl.PolishClustering import *
+ from .sv.swedn_clustering import *
from .zh.CMTEBClustering import *
Empty file.
113 changes: 113 additions & 0 deletions mteb/tasks/Clustering/nb/snl_clustering.py
@@ -0,0 +1,113 @@
from __future__ import annotations

import random
from itertools import islice
from typing import Iterable, TypeVar

import datasets

from mteb.abstasks import AbsTaskClustering, TaskMetadata

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


class SNLClustering(AbsTaskClustering):
metadata = TaskMetadata(
name="SNLClustering",
hf_hub_name="navjordj/SNL_summarization",
description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.",
reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nb"],
main_score="v_measure",
revision="3d3d27aa7af8941408cefc3991ada5d12a4273d1",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["Encyclopaedic", "Non-fiction"],
license=None,
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": 2048},
avg_character_length={"test": 1101.30},
)

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision"),
) # type: ignore

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
splits = self.metadata_dict["eval_splits"]

documents: list = []
labels: list = []
label_col = "category"

ds = {}
for split in splits:
ds_split = self.dataset[split]

_label = self.normalize_labels(ds_split[label_col])
documents.extend(ds_split["ingress"])
labels.extend(_label)

documents.extend(ds_split["article"])
labels.extend(_label)

assert len(documents) == len(labels)

rng = random.Random(42) # local only seed
pairs = list(zip(documents, labels))
rng.shuffle(pairs)
documents, labels = [list(collection) for collection in zip(*pairs)]

# reduce size of dataset to not have too large datasets in the clustering task
documents_batched = list(batched(documents, 512))[:4]
labels_batched = list(batched(labels, 512))[:4]

ds[split] = datasets.Dataset.from_dict(
{
"sentences": documents_batched,
"labels": labels_batched,
}
)

self.dataset = datasets.DatasetDict(ds)

@staticmethod
def normalize_labels(labels: list[str]) -> list[str]:
# example label:
# Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter
# When using 2 levels there are 17 unique labels
# When using 3 levels there are 121 unique labels
return [",".join(tuple(label.split(",")[:3])) for label in labels]
117 changes: 117 additions & 0 deletions mteb/tasks/Clustering/nb/vg_clustering.py
@@ -0,0 +1,117 @@
from __future__ import annotations

import random
from itertools import islice
from typing import Iterable, TypeVar

import datasets

from mteb.abstasks import AbsTaskClustering, TaskMetadata

T = TypeVar("T")


def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


class VGClustering(AbsTaskClustering):
metadata = TaskMetadata(
name="VGClustering",
hf_hub_name="navjordj/VG_summarization",
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nb"],
main_score="v_measure",
revision="d4c5a8ba10ae71224752c727094ac4c46947fa29",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license=None,
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": 2048},
avg_character_length={"test": 1009.65},
)

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision"),
) # type: ignore

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
splits = self.metadata_dict["eval_splits"]

documents: list = []
labels: list = []
label_col = "classes"

ds = {}
for split in splits:
ds_split = self.dataset[split]

_label = self.normalize_labels(ds_split[label_col])
documents.extend(ds_split["title"])
labels.extend(_label)

documents.extend(ds_split["ingress"])
labels.extend(_label)

documents.extend(ds_split["article"])
labels.extend(_label)

assert len(documents) == len(labels)

rng = random.Random(1111) # local only seed
# resampling changes scores from 12.68, 11.30, 12.65 (sample model)
pairs = list(zip(documents, labels))
rng.shuffle(pairs)
documents, labels = [list(collection) for collection in zip(*pairs)]

# reduce size of dataset to not have too large datasets in the clustering task
documents_batched = list(batched(documents, 512))[:4]
labels_batched = list(batched(labels, 512))[:4]
# See:
# https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/pull/96
# for a discussion on sizes

ds[split] = datasets.Dataset.from_dict(
{
"sentences": documents_batched,
"labels": labels_batched,
}
)

self.dataset = datasets.DatasetDict(ds)

@staticmethod
def normalize_labels(labels: list[str]) -> list[str]:
# Agreed on and debated in: https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/issues/83
return [label.split(",")[0] for label in labels]
Empty file.