Merge main v2 (#1504)
* fix: Count unique texts, data leaks in calculate metrics (#1438)

* add more stats

* add more stats

* update statistics

* fix: update task metadata to allow for null (#1448)

* Update tasks table

* 1.19.5

Automatically generated by python-semantic-release

* Fix: Made data parsing in the leaderboard figure more robust (#1450)

Bug fixes for data parsing in the main figure

* Fixed task loading (#1451)

* Fixed task result loading from disk

* Fixed task result loading from disk

* fix: publish (#1452)

* 1.19.6

Automatically generated by python-semantic-release

* fix: Fix load external results with `None` mteb_version (#1453)

* fix

* lint

* 1.19.7

Automatically generated by python-semantic-release

* WIP: Polishing up leaderboard UI (#1461)

* fix: Removed column wrapping on the table, so that it remains readable

* Added disclaimer to figure

* fix: Added links to task info table, switched out license with metric

* fix: loading pre 1.11.0 (#1460)

* small fix

* fix: fix

* 1.19.8

Automatically generated by python-semantic-release

* fix: swap touche2020 to maintain compatibility (#1469)

swap Touche2020 for parity

* 1.19.9

Automatically generated by python-semantic-release

* docs: Add sum per language for task counts (#1468)

* add sum per lang

* add sort by sum option

* make lint

* fix: pinned datasets to <3.0.0 (#1470)

* 1.19.10

Automatically generated by python-semantic-release

* feat: add CUREv1 retrieval dataset (#1459)

* feat: add CUREv1 dataset

---------

Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>
Co-authored-by: Daniel Buades Marcos <daniel@buad.es>

* feat: add missing domains to medical tasks

* feat: modify benchmark tasks

* chore: benchmark naming

---------

Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>

* Update tasks table

* 1.20.0

Automatically generated by python-semantic-release

* fix: check if `model` attr of model exists (#1499)

* check if model attr of model exists

* lint

* Fix retrieval evaluator

* 1.20.1

Automatically generated by python-semantic-release

* add cure statistics

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Márton Kardos <power.up1163@gmail.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
Co-authored-by: Napuh <55241721+Napuh@users.noreply.github.com>
Co-authored-by: Daniel Buades Marcos <daniel.buades@clinia.com>
Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>
10 people authored Nov 27, 2024
1 parent 99247b2 commit 022d355
Showing 19 changed files with 1,485 additions and 28 deletions.
19 changes: 12 additions & 7 deletions docs/create_tasks_table.py
@@ -68,7 +68,7 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
     return table
 
 
-def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
+def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
     table_dict = {}
     ## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
     for task in tasks:
@@ -82,22 +82,27 @@ def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
     ## Wrangle for polars
     pl_table_dict = []
     for lang, d in table_dict.items():
-        d.update({"lang": lang})
+        d.update({"0-lang": lang})  # for sorting columns
         pl_table_dict.append(d)
 
-    df = pl.DataFrame(pl_table_dict).sort(by="lang")
+    df = pl.DataFrame(pl_table_dict).sort(by="0-lang")
+    df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE)))
+    df = df.select(sorted(df.columns))
+    if sort_by_sum:
+        df = df.sort(by="sum", descending=True)
+
     total = df.sum()
 
     task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
-    horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE)))
+    horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1)
     table = f"""
-| Language | {task_names_md} |
+| Language | {task_names_md} | Sum |
 |{horizontal_line_md}|
 """
 
     for row in df.iter_rows():
-        table += f"| {row[-1]} "
-        for num in row[:-1]:
+        table += f"| {row[0]} "
+        for num in row[1:]:
             table += f"| {num} "
         table += "|\n"
 
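The new `sort_by_sum` flag keeps the default alphabetical ordering and only reorders rows on request. A minimal usage sketch; the `mteb.get_tasks()` call and the print are illustrative, not part of this diff:

import mteb
from create_tasks_table import create_task_lang_table  # assumes running from docs/

tasks = mteb.get_tasks()  # all registered tasks
table_md = create_task_lang_table(tasks)  # rows sorted alphabetically by language
table_md_by_sum = create_task_lang_table(tasks, sort_by_sum=True)  # highest task counts first
print(table_md_by_sum)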
2 changes: 2 additions & 0 deletions mteb/__init__.py
@@ -6,6 +6,7 @@
     MTEB_ENG_CLASSIC,
     MTEB_MAIN_RU,
     MTEB_RETRIEVAL_LAW,
+    MTEB_RETRIEVAL_MEDICAL,
     MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
     CoIR,
 )
@@ -24,6 +25,7 @@
     "MTEB_ENG_CLASSIC",
     "MTEB_MAIN_RU",
     "MTEB_RETRIEVAL_LAW",
+    "MTEB_RETRIEVAL_MEDICAL",
     "MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
     "CoIR",
     "TASKS_REGISTRY",
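With the re-export in place, the new benchmark is importable from the package root. A short inspection sketch (illustrative, not part of this commit):

from mteb import MTEB_RETRIEVAL_MEDICAL

# A Benchmark bundles a name, a task list, and reference/citation metadata.
print(MTEB_RETRIEVAL_MEDICAL.name)  # "MTEB(Medical)"
for task in MTEB_RETRIEVAL_MEDICAL.tasks:
    print(task.metadata.name)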
28 changes: 26 additions & 2 deletions mteb/benchmarks/benchmarks.py
@@ -106,7 +106,7 @@ def load_results(
             "StackExchangeClustering.v2",
             "StackExchangeClusteringP2P.v2",
             "TRECCOVID",
-            "Touche2020",
+            "Touche2020Retrieval.v3",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering.v2",
@@ -186,7 +186,7 @@ def load_results(
             "StackOverflowDupQuestions",
             "SummEval",
             "TRECCOVID",
-            "Touche2020Retrieval.v3",
+            "Touche2020",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering",
@@ -308,6 +308,29 @@ def load_results(
     citation=None,
 )
 
+MTEB_RETRIEVAL_MEDICAL = Benchmark(
+    name="MTEB(Medical)",
+    tasks=get_tasks(
+        tasks=[
+            "CUREv1",
+            "NFCorpus",
+            "TRECCOVID",
+            "TRECCOVID-PL",
+            "SciFact",
+            "SciFact-PL",
+            "MedicalQARetrieval",
+            "PublicHealthQA",
+            "MedrxivClusteringP2P.v2",
+            "MedrxivClusteringS2S.v2",
+            "CmedqaRetrieval",
+            "CMedQAv2-reranking",
+        ],
+    ),
+    description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
+    reference="",
+    citation=None,
+)
+
 MTEB_MINERS_BITEXT_MINING = Benchmark(
     name="MINERSBitextMining",
     tasks=get_tasks(
@@ -702,6 +725,7 @@ def load_results(
             "SpartQA",
             "TempReasonL1",
             "TRECCOVID",
+            "CUREv1",
             "WinoGrande",
             "BelebeleRetrieval",
             "MLQARetrieval",
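A sketch of running the new MTEB(Medical) benchmark end to end; the SentenceTransformer checkpoint and output folder below are placeholders, not part of this commit:

import mteb
from mteb import MTEB, MTEB_RETRIEVAL_MEDICAL

# Placeholder model; any model wrapper supported by mteb works here.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
evaluation = MTEB(tasks=MTEB_RETRIEVAL_MEDICAL.tasks)
results = evaluation.run(model, output_folder="results")  # placeholder output path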
(diffs for the remaining 16 changed files did not load and are not shown)