Skip to content

Commit

Permalink
Add Korean Text Search Tasks to MTEB (#210)
Browse files Browse the repository at this point in the history
* add Ko-miracl, Ko-StrategyQA, Ko-mrtydi tasks

* Update mteb/abstasks/AbsTaskRetrieval.py

Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>

* Update AbsTaskRetrieval.py

* Update mteb/abstasks/AbsTaskRetrieval.py

Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>

* Update scripts/run_mteb_korean.py

Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>

---------

Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>
  • Loading branch information
taeminlee and Muennighoff authored Feb 6, 2024
1 parent 2f65179 commit dadf2da
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 0 deletions.
42 changes: 42 additions & 0 deletions mteb/abstasks/BeIRKOTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
from .AbsTask import AbsTask


class BeIRKOTask(AbsTask):
    """Base task that loads BeIR-KO retrieval data through beir's ``HFDataLoader``.

    ``load_data`` reads the ``"hf_repo"`` and ``"hf_repo_qrels"`` entries of
    ``self.description`` (and ``"eval_splits"`` when no splits are passed).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load_data(self, eval_splits=None, **kwargs):
        """
        Load dataset from BeIR-KO benchmark.

        Fills ``self.corpus``, ``self.queries`` and ``self.relevant_docs``,
        each keyed by split name, and then sets ``self.data_loaded`` to True.
        Returns immediately when ``self.data_loaded`` is already truthy.

        Args:
            eval_splits: Iterable of split names to load. When None, falls
                back to ``self.description["eval_splits"]``.
            **kwargs: Accepted for signature compatibility; not used here.

        Raises:
            Exception: If the optional ``beir`` package cannot be imported.
                The original ImportError is attached as the cause.
        """
        # Short-circuit first: an already-loaded task does not need to touch
        # the optional dependency again.
        if self.data_loaded:
            return

        try:
            from beir.datasets.data_loader_hf import HFDataLoader
        except ImportError as err:
            # Chain the cause so the underlying import failure stays visible.
            raise Exception(
                "Retrieval tasks require beir package. Please install it with `pip install mteb[beir]`"
            ) from err

        if eval_splits is None:
            eval_splits = self.description["eval_splits"]

        hf_repo = self.description["hf_repo"]
        hf_repo_qrels = self.description["hf_repo_qrels"]

        self.corpus, self.queries, self.relevant_docs = {}, {}, {}

        for split in eval_splits:
            corpus, queries, qrels = HFDataLoader(
                hf_repo=hf_repo,
                hf_repo_qrels=hf_repo_qrels,
                streaming=False,
                keep_in_memory=False,
            ).load(split=split)

            # Convert the loader's row-wise results into plain dicts keyed by id.
            queries = {query["id"]: query["text"] for query in queries}
            corpus = {
                doc["id"]: {"title": doc["title"], "text": doc["text"]}
                for doc in corpus
            }

            self.corpus[split] = corpus
            self.queries[split] = queries
            self.relevant_docs[split] = qrels

        self.data_loaded = True
1 change: 1 addition & 0 deletions mteb/abstasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
from .CrosslingualTask import *
from .MultilingualTask import *
from .BeIRPLTask import *
from .BeIRKOTask import *
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoMiracl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoMiracl(AbsTaskRetrieval, BeIRKOTask):
    """Ko-miracl retrieval task definition."""

    @property
    def description(self):
        """Return a newly built metadata mapping describing this task."""
        task_name = "Ko-miracl"
        hub_repo = "taeminlee/Ko-miracl"  # same repo serves corpus/queries and qrels
        return dict(
            name=task_name,
            hf_repo=hub_repo,
            hf_repo_qrels=hub_repo,
            beir_name=task_name,
            description=task_name,
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoMrtydi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoMrtydi(AbsTaskRetrieval, BeIRKOTask):
    """Ko-mrtydi retrieval task definition."""

    @property
    def description(self):
        """Return a newly built metadata mapping describing this task."""
        task_name = "Ko-mrtydi"
        hub_repo = "taeminlee/Ko-mrtydi"  # same repo serves corpus/queries and qrels
        return dict(
            name=task_name,
            hf_repo=hub_repo,
            hf_repo_qrels=hub_repo,
            beir_name=task_name,
            description=task_name,
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoStrategyQA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoStrategyQA(AbsTaskRetrieval, BeIRKOTask):
    """Ko-StrategyQA retrieval task definition."""

    @property
    def description(self):
        """Return a newly built metadata mapping describing this task."""
        task_name = "Ko-StrategyQA"
        hub_repo = "taeminlee/Ko-StrategyQA"  # same repo serves corpus/queries and qrels
        return dict(
            name=task_name,
            hf_repo=hub_repo,
            hf_repo_qrels=hub_repo,
            beir_name=task_name,
            description=task_name,
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
3 changes: 3 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@
from .GerDaLIRRetrieval import *
from .XMarketRetrieval import *
from .MultiLongDocRetrieval import *
from .KoStrategyQA import *
from .KoMrtydi import *
from .KoMiracl import *
48 changes: 48 additions & 0 deletions scripts/run_mteb_korean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Example script for benchmarking all datasets constituting the MTEB Korean leaderboard & average scores"""

import logging

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# INFO level so the per-task progress messages below are emitted.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("main")

# One list per task family; the empty ones are placeholders for future tasks.
TASK_LIST_CLASSIFICATION = []
TASK_LIST_CLUSTERING = []
TASK_LIST_PAIR_CLASSIFICATION = []
TASK_LIST_RERANKING = []
TASK_LIST_RETRIEVAL = ["Ko-StrategyQA", "Ko-mrtydi", "Ko-miracl"]
TASK_LIST_STS = []

# Flattened list of every task name, in family order.
TASK_LIST = [
    *TASK_LIST_CLASSIFICATION,
    *TASK_LIST_CLUSTERING,
    *TASK_LIST_PAIR_CLASSIFICATION,
    *TASK_LIST_RERANKING,
    *TASK_LIST_RETRIEVAL,
    *TASK_LIST_STS,
]

model_name = "average_word_embeddings_komninos"
model = SentenceTransformer(model_name)

# Each task gets its own MTEB instance; results go under results/<model_name>.
for task in TASK_LIST:
    logger.info(f"Running task: {task}")
    # Remove "ko" from task_langs for running all languages
    evaluation = MTEB(tasks=[task], task_langs=["ko"])
    evaluation.run(model, output_folder=f"results/{model_name}")

0 comments on commit dadf2da

Please sign in to comment.