Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Korean Text Search Tasks to MTEB #210

Merged
merged 7 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,4 @@ def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs)
(doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
for doc in corpus
]
return self.model.encode(sentences, batch_size=batch_size, **kwargs)
return self.model.encode(sentences, batch_size=batch_size, **kwargs)
taeminlee marked this conversation as resolved.
Show resolved Hide resolved
42 changes: 42 additions & 0 deletions mteb/abstasks/BeIRKOTask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
from .AbsTask import AbsTask


class BeIRKOTask(AbsTask):
    """Abstract task that loads a Korean BeIR-format retrieval dataset
    (corpus, queries, qrels) from the Hugging Face Hub.

    Concrete subclasses supply ``description`` with the keys read below:
    ``eval_splits``, ``beir_name``, ``hf_repo`` and ``hf_repo_qrels``.
    """

    def load_data(self, eval_splits=None, **kwargs):
        """Load dataset from the BeIR-KO benchmark.

        Populates ``self.corpus``, ``self.queries`` and
        ``self.relevant_docs`` keyed by split name, then sets
        ``self.data_loaded``. Safe to call repeatedly (no-op once loaded).

        :param eval_splits: splits to load; defaults to
            ``self.description["eval_splits"]``.
        :raises ImportError: if the optional ``beir`` package is missing.
        """
        # Check the cache flag first so an already-loaded task never
        # fails on (or pays for) the optional-dependency import.
        if self.data_loaded:
            return

        try:
            from beir.datasets.data_loader_hf import HFDataLoader
        except ImportError as err:
            # Chain the original error and raise the conventional type so
            # callers can catch ImportError specifically.
            raise ImportError(
                "Retrieval tasks require beir package. Please install it with `pip install mteb[beir]`"
            ) from err

        if eval_splits is None:
            eval_splits = self.description["eval_splits"]
        dataset = self.description["beir_name"]  # noqa: F841  # kept for parity with other BeIR tasks

        hf_repo = self.description["hf_repo"]
        hf_repo_qrels = self.description["hf_repo_qrels"]

        self.corpus, self.queries, self.relevant_docs = {}, {}, {}

        # The loader is split-independent, so build it once outside the loop.
        loader = HFDataLoader(
            hf_repo=hf_repo, hf_repo_qrels=hf_repo_qrels, streaming=False, keep_in_memory=False
        )
        for split in eval_splits:
            corpus, queries, qrels = loader.load(split=split)
            # Convert HF Datasets into the plain-dict structures MTEB expects.
            queries = {query["id"]: query["text"] for query in queries}
            corpus = {doc["id"]: {"title": doc["title"], "text": doc["text"]} for doc in corpus}

            self.corpus[split], self.queries[split], self.relevant_docs[split] = corpus, queries, qrels

        self.data_loaded = True
1 change: 1 addition & 0 deletions mteb/abstasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
from .CrosslingualTask import *
from .MultilingualTask import *
from .BeIRPLTask import *
from .BeIRKOTask import *
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoMiracl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoMiracl(AbsTaskRetrieval, BeIRKOTask):
    """Korean MIRACL retrieval task served in BeIR-KO format."""

    @property
    def description(self):
        """Task metadata consumed by the MTEB evaluation harness."""
        metadata = dict(
            name="Ko-miracl",
            hf_repo="taeminlee/Ko-miracl",
            hf_repo_qrels="taeminlee/Ko-miracl",
            beir_name="Ko-miracl",
            description="Ko-miracl",
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
        return metadata
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoMrtydi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoMrtydi(AbsTaskRetrieval, BeIRKOTask):
    """Korean Mr. TyDi retrieval task served in BeIR-KO format."""

    @property
    def description(self):
        """Task metadata consumed by the MTEB evaluation harness."""
        metadata = dict(
            name="Ko-mrtydi",
            hf_repo="taeminlee/Ko-mrtydi",
            hf_repo_qrels="taeminlee/Ko-mrtydi",
            beir_name="Ko-mrtydi",
            description="Ko-mrtydi",
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
        return metadata
20 changes: 20 additions & 0 deletions mteb/tasks/Retrieval/KoStrategyQA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRKOTask import BeIRKOTask


class KoStrategyQA(AbsTaskRetrieval, BeIRKOTask):
    """Korean StrategyQA retrieval task served in BeIR-KO format."""

    @property
    def description(self):
        """Task metadata consumed by the MTEB evaluation harness."""
        metadata = dict(
            name="Ko-StrategyQA",
            hf_repo="taeminlee/Ko-StrategyQA",
            hf_repo_qrels="taeminlee/Ko-StrategyQA",
            beir_name="Ko-StrategyQA",
            description="Ko-StrategyQA",
            reference="",
            type="Retrieval",
            category="s2p",
            eval_splits=["dev"],
            eval_langs=["ko"],
            main_score="ndcg_at_10",
        )
        return metadata
3 changes: 3 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@
from .GermanDPRRetrieval import *
from .GerDaLIRRetrieval import *
from .XMarketRetrieval import *
from .KoStrategyQA import *
from .KoMrtydi import *
from .KoMiracl import *
48 changes: 48 additions & 0 deletions scripts/run_mteb_korean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Example script for benchmarking the Korean retrieval datasets in MTEB."""

import logging

from mteb import MTEB
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("main")

# Non-retrieval categories are placeholders until Korean datasets exist for them.
TASK_LIST_CLASSIFICATION = [
]

TASK_LIST_CLUSTERING = [
]

TASK_LIST_PAIR_CLASSIFICATION = [
]

TASK_LIST_RERANKING = [
]

TASK_LIST_RETRIEVAL = [
    'Ko-StrategyQA',
    'Ko-mrtydi',
    'Ko-miracl'
]

TASK_LIST_STS = [
]

TASK_LIST = (
    TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
)

model_name = "average_word_embeddings_komninos"


def main():
    """Run every Korean task with the configured model and write results."""
    model = SentenceTransformer(model_name)

    for task in TASK_LIST:
        logger.info(f"Running task: {task}")
        evaluation = MTEB(tasks=[task], task_langs=["ko"])  # Remove "ko" for running all languages
        evaluation.run(model, output_folder=f"results/{model_name}")


if __name__ == "__main__":
    main()