Skip to content

Commit

Permalink
fix: Added medical qa dataset (#333)
Browse files Browse the repository at this point in the history
* Added news classification dataset.

* Fixes on suggestions

* Added new medical qa dataset

* Update model run files and model path

* Added points for dataset.

* Fixes

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
  • Loading branch information
Sakshamrzt and KennethEnevoldsen authored Apr 10, 2024
1 parent 84cffa2 commit 80acc3e
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 9 deletions.
18 changes: 9 additions & 9 deletions docs/mmteb/points.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Points

| GitHub | Total points | New dataset | New task | Dataset annotations | (Bug)fixes | Running Models | Review PR | Paper Writing | Ideation | Coordination |
|-------------------| ------------ |-------------| -------- | ------------------- | ---------- | -------------- | -------- | -------------- | -------- | ------------- |
| KennethEnevoldsen | | 38+16 | | 1 | 9+1+2 | | | | | |
| x-tabdeveloping | | 2+16 | | | | | | | | |
| imenelydiaker | | 88 | | | | | 7 | | | |
| wissam-sib | | 88 | | | | | 1 | | | |
| GabrielSequeira | | 88 | | | | | | | | |
| schmarion | | 88 | | | | | | | | |
| MathieuCiancone | | 88 | | | | | | | | |
| Sakshamrzt | | 2 | | | | | | | | |
|-------------------| ------------ |-------------| -------- | ------------------- | ---------- | ------------ |-----------| -------------- | -------- | ------------- |
| KennethEnevoldsen | | 38+16 | | 1 | 9+1+2 | | 1 | | | |
| x-tabdeveloping | | 2+16 | | | | | | | | |
| imenelydiaker | | 88 | | | | | 7 | | | |
| wissam-sib | | 88 | | | | | 1 | | | |
| GabrielSequeira | | 88 | | | | | | | | |
| schmarion | | 88 | | | | | | | | |
| MathieuCiancone | | 88 | | | | | | | | |
| Sakshamrzt | | 4 | | | | | | | | |
| MartinBernstorff | | 2 | | | 7 | | 3 | | | |

Note that coordination and ideation are not included in the total points, but are used to determine first and last authors.
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from .en.LegalBenchConsumerContractsQARetrieval import *
from .en.LegalBenchCorporateLobbyingRetrieval import *
from .en.LegalSummarizationRetrieval import *
from .en.MedicalQARetrieval import *
from .en.MSMARCORetrieval import *
from .en.MSMARCOv2Retrieval import *
from .en.NarrativeQARetrieval import *
Expand Down
42 changes: 42 additions & 0 deletions mteb/tasks/Retrieval/en/MedicalQARetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata


class MedicalQARetrieval(AbsTaskRetrieval):
    """Retrieval task over medical question/answer pairs.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        name="MedicalQARetrieval",
        description="The dataset consists of 2048 medical question and answer pairs.",
        reference="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4",
        # Pinned to a fixed revision of the Hub dataset.
        dataset={
            "path": "mteb/medical_qa",
            "revision": "ae763399273d8b20506b80cf6f6f9a31a6a2b238",
        },
        type="Retrieval",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["en"],
        main_score="ndcg_at_10",
        date=("2017-01-01", "2019-12-31"),  # best guess,
        form=["written"],
        domains=["Medical"],
        task_subtypes=["Article retrieval"],
        license="CC0 1.0 Universal",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="""@ARTICLE{BenAbacha-BMC-2019,
author = {Asma {Ben Abacha} and Dina Demner{-}Fushman},
title = {A Question-Entailment Approach to Question Answering},
journal = {{BMC} Bioinform.},
volume = {20},
number = {1},
pages = {511:1--511:23},
year = {2019},
url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4}
} """,
        n_samples={"test": 2048},
        avg_character_length={"test": 1205.9619140625},
    )
38 changes: 38 additions & 0 deletions results/intfloat__multilingual-e5-small/MedicalQARetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"dataset_revision": "ae763399273d8b20506b80cf6f6f9a31a6a2b238",
"mteb_dataset_name": "MedicalQARetrieval",
"mteb_version": "1.5.4",
"test": {
"evaluation_time": 29.86,
"map_at_1": 0.43896,
"map_at_10": 0.56803,
"map_at_100": 0.57388,
"map_at_1000": 0.5741,
"map_at_3": 0.54167,
"map_at_5": 0.55658,
"mrr_at_1": 0.43896,
"mrr_at_10": 0.56815,
"mrr_at_100": 0.57401,
"mrr_at_1000": 0.57423,
"mrr_at_3": 0.54191,
"mrr_at_5": 0.55671,
"ndcg_at_1": 0.43896,
"ndcg_at_10": 0.62856,
"ndcg_at_100": 0.65666,
"ndcg_at_1000": 0.66229,
"ndcg_at_3": 0.57453,
"ndcg_at_5": 0.60145,
"precision_at_1": 0.43896,
"precision_at_10": 0.08174,
"precision_at_100": 0.00948,
"precision_at_1000": 0.00099,
"precision_at_3": 0.22314,
"precision_at_5": 0.14697,
"recall_at_1": 0.43896,
"recall_at_10": 0.81738,
"recall_at_100": 0.94824,
"recall_at_1000": 0.99219,
"recall_at_3": 0.66943,
"recall_at_5": 0.73486
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"dataset_revision": "ae763399273d8b20506b80cf6f6f9a31a6a2b238",
"mteb_dataset_name": "MedicalQARetrieval",
"mteb_version": "1.5.4",
"test": {
"evaluation_time": 11.98,
"map_at_1": 0.35059,
"map_at_10": 0.47558,
"map_at_100": 0.48095,
"map_at_1000": 0.48147,
"map_at_3": 0.44621,
"map_at_5": 0.46586,
"mrr_at_1": 0.35059,
"mrr_at_10": 0.47562,
"mrr_at_100": 0.48099,
"mrr_at_1000": 0.4815,
"mrr_at_3": 0.44637,
"mrr_at_5": 0.4659,
"ndcg_at_1": 0.35059,
"ndcg_at_10": 0.5363,
"ndcg_at_100": 0.5641,
"ndcg_at_1000": 0.57894,
"ndcg_at_3": 0.47757,
"ndcg_at_5": 0.51276,
"precision_at_1": 0.35059,
"precision_at_10": 0.07261,
"precision_at_100": 0.0086,
"precision_at_1000": 0.00098,
"precision_at_3": 0.18945,
"precision_at_5": 0.13066,
"recall_at_1": 0.35059,
"recall_at_10": 0.72607,
"recall_at_100": 0.86035,
"recall_at_1000": 0.97949,
"recall_at_3": 0.56836,
"recall_at_5": 0.65332
}
}
60 changes: 60 additions & 0 deletions scripts/data/medicalqaretrieval/create_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

import os
import uuid
from typing import Dict

from datasets import load_dataset
from huggingface_hub import create_repo, upload_file


def preprocess_data(example: Dict) -> Dict:
    """Reshape one raw question/answer record into retrieval-friendly fields.

    A fresh random UUID is generated for the query and another one for the
    corpus entry; the question and answer texts are carried over unchanged.

    Args:
        example: A record exposing the ``"Question"`` and ``"Answer"`` keys.

    Returns:
        A dict with the keys ``query-id``, ``query_text``, ``corpus-id`` and
        ``answer_text``, in that order.
    """
    query_id = str(uuid.uuid4())
    question = example["Question"]
    corpus_id = str(uuid.uuid4())
    answer = example["Answer"]

    record = {}
    record["query-id"] = query_id
    record["query_text"] = question
    record["corpus-id"] = corpus_id
    record["answer_text"] = answer
    return record


# One-off script: builds the corpus / queries / qrels files for the medical QA
# retrieval dataset and uploads them to the Hugging Face Hub.

repo_name = "mteb/medical_qa"
# Never hard-code credentials. Read the token from the environment; ``None``
# lets huggingface_hub fall back to the locally cached login.
create_repo(
    repo_name,
    repo_type="dataset",
    token=os.environ.get("HF_TOKEN") or None,
)


raw_dset = load_dataset("keivalya/MedQuad-MedicalQnADataset")
dset = raw_dset["train"]
# Keep only the first 2048 question/answer pairs.
trimmed_dataset = dset.select(range(2048))
updated_dataset = trimmed_dataset.map(
    preprocess_data, remove_columns=["Question", "Answer", "qtype"]
)

# Corpus: one document per answer, with an empty title.
corpus_ds = updated_dataset.map(
    lambda example: {"_id": example["corpus-id"], "text": example["answer_text"]},
    remove_columns=["query-id", "query_text", "corpus-id", "answer_text"],
)
corpus_ds = corpus_ds.add_column("title", [""] * len(corpus_ds))

# Relevance judgements: only the (query-id, corpus-id) pairs plus a score.
default_ds = updated_dataset.remove_columns(["answer_text", "query_text"])
# Each row pairs a question with its own answer, so the judgement must be
# positive; TREC-style evaluation treats a score of 0 as "not relevant".
default_ds = default_ds.add_column("score", [1] * len(default_ds))

# Queries: one entry per question.
queries_ds = updated_dataset.map(
    lambda example: {"_id": example["query-id"], "text": example["query_text"]},
    remove_columns=["corpus-id", "answer_text", "query-id", "query_text"],
)

data = {"corpus": corpus_ds, "default": default_ds, "queries": queries_ds}
# Upload every split, including the corpus, which was previously built but
# never pushed.
for splits, split_ds in data.items():
    save_path = f"{splits}.jsonl"
    split_ds.to_json(save_path)
    try:
        upload_file(
            path_or_fileobj=save_path,
            path_in_repo=save_path,
            repo_id=repo_name,
            repo_type="dataset",
        )
    finally:
        # Portable cleanup that also runs when the upload fails.
        os.remove(save_path)

0 comments on commit 80acc3e

Please sign in to comment.