Merge main v2 (#1504)
* fix: Count unique texts, data leaks in calculate metrics (#1438)

* add more stats

* add more stats

* update statistics

* fix: update task metadata to allow for null (#1448)

* Update tasks table

* 1.19.5

Automatically generated by python-semantic-release

* Fix: Made data parsing in the leaderboard figure more robust (#1450)

Bug fixes for data parsing in the main figure

* Fixed task loading (#1451)

* Fixed task result loading from disk

* Fixed task result loading from disk

* fix: publish (#1452)

* 1.19.6

Automatically generated by python-semantic-release

* fix: Fix load external results with `None` mteb_version (#1453)

* fix

* lint

* 1.19.7

Automatically generated by python-semantic-release

* WIP: Polishing up leaderboard UI (#1461)

* fix: Removed column wrapping on the table, so that it remains readable

* Added disclaimer to figure

* fix: Added links to task info table, switched out license with metric

* fix: loading pre 1.11.0 (#1460)

* small fix

* fix: fix

* 1.19.8

Automatically generated by python-semantic-release

* fix: swap touche2020 to maintain compatibility (#1469)

swap Touche2020 for parity

* 1.19.9

Automatically generated by python-semantic-release

* docs: Add sum per language for task counts (#1468)

* add sum per lang

* add sort by sum option

* make lint

* fix: pinned datasets to <3.0.0 (#1470)

* 1.19.10

Automatically generated by python-semantic-release

* feat: add CUREv1 retrieval dataset (#1459)

* feat: add CUREv1 dataset

---------

Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>
Co-authored-by: Daniel Buades Marcos <daniel@buad.es>

* feat: add missing domains to medical tasks

* feat: modify benchmark tasks

* chore: benchmark naming

---------

Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>

* Update tasks table

* 1.20.0

Automatically generated by python-semantic-release

* fix: check if `model` attr of model exists (#1499)

* check if model attr of model exists

* lint

* Fix retrieval evaluator

* 1.20.1

Automatically generated by python-semantic-release

* add cure statistics

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Márton Kardos <power.up1163@gmail.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
Co-authored-by: Napuh <55241721+Napuh@users.noreply.github.com>
Co-authored-by: Daniel Buades Marcos <daniel.buades@clinia.com>
Co-authored-by: nadshe <nadia.sheikh@clinia.com>
Co-authored-by: olivierr42 <olivier.rousseau@clinia.com>
10 people authored Nov 27, 2024
1 parent 99247b2 commit 022d355
Showing 19 changed files with 1,485 additions and 28 deletions.
19 changes: 12 additions & 7 deletions docs/create_tasks_table.py
@@ -68,7 +68,7 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
     return table
 
 
-def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
+def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
     table_dict = {}
     ## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
     for task in tasks:
@@ -82,22 +82,27 @@ def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
     ## Wrangle for polars
     pl_table_dict = []
     for lang, d in table_dict.items():
-        d.update({"lang": lang})
+        d.update({"0-lang": lang})  # for sorting columns
         pl_table_dict.append(d)
 
-    df = pl.DataFrame(pl_table_dict).sort(by="lang")
+    df = pl.DataFrame(pl_table_dict).sort(by="0-lang")
+    df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE)))
+    df = df.select(sorted(df.columns))
+    if sort_by_sum:
+        df = df.sort(by="sum", descending=True)
+
     total = df.sum()
 
     task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
-    horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE)))
+    horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1)
     table = f"""
-| Language | {task_names_md} |
+| Language | {task_names_md} | Sum |
 |{horizontal_line_md}|
 """
 
     for row in df.iter_rows():
-        table += f"| {row[-1]} "
-        for num in row[:-1]:
+        table += f"| {row[0]} "
+        for num in row[1:]:
             table += f"| {num} "
         table += "|\n"
 
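The new `sort_by_sum` flag keeps the default alphabetical ordering and only reorders rows on request. A minimal usage sketch; the `mteb.get_tasks()` call and the print are illustrative, not part of this diff:

import mteb
from create_tasks_table import create_task_lang_table  # assumes running from docs/

tasks = mteb.get_tasks()  # all registered tasks
table_md = create_task_lang_table(tasks)  # rows sorted alphabetically by language
table_md_by_sum = create_task_lang_table(tasks, sort_by_sum=True)  # highest task counts first
print(table_md_by_sum)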
2 changes: 2 additions & 0 deletions mteb/__init__.py
@@ -6,6 +6,7 @@
     MTEB_ENG_CLASSIC,
     MTEB_MAIN_RU,
     MTEB_RETRIEVAL_LAW,
+    MTEB_RETRIEVAL_MEDICAL,
     MTEB_RETRIEVAL_WITH_INSTRUCTIONS,
     CoIR,
 )
@@ -24,6 +25,7 @@
     "MTEB_ENG_CLASSIC",
     "MTEB_MAIN_RU",
     "MTEB_RETRIEVAL_LAW",
+    "MTEB_RETRIEVAL_MEDICAL",
     "MTEB_RETRIEVAL_WITH_INSTRUCTIONS",
     "CoIR",
     "TASKS_REGISTRY",
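With the re-export in place, the new benchmark is importable from the package root. A short inspection sketch (illustrative, not part of this commit):

from mteb import MTEB_RETRIEVAL_MEDICAL

# A Benchmark bundles a name, a task list, and reference/citation metadata.
print(MTEB_RETRIEVAL_MEDICAL.name)  # "MTEB(Medical)"
for task in MTEB_RETRIEVAL_MEDICAL.tasks:
    print(task.metadata.name)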
28 changes: 26 additions & 2 deletions mteb/benchmarks/benchmarks.py
@@ -106,7 +106,7 @@ def load_results(
             "StackExchangeClustering.v2",
             "StackExchangeClusteringP2P.v2",
             "TRECCOVID",
-            "Touche2020",
+            "Touche2020Retrieval.v3",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering.v2",
@@ -186,7 +186,7 @@ def load_results(
             "StackOverflowDupQuestions",
             "SummEval",
             "TRECCOVID",
-            "Touche2020Retrieval.v3",
+            "Touche2020",
             "ToxicConversationsClassification",
             "TweetSentimentExtractionClassification",
             "TwentyNewsgroupsClustering",
@@ -308,6 +308,29 @@ def load_results(
     citation=None,
 )
 
+MTEB_RETRIEVAL_MEDICAL = Benchmark(
+    name="MTEB(Medical)",
+    tasks=get_tasks(
+        tasks=[
+            "CUREv1",
+            "NFCorpus",
+            "TRECCOVID",
+            "TRECCOVID-PL",
+            "SciFact",
+            "SciFact-PL",
+            "MedicalQARetrieval",
+            "PublicHealthQA",
+            "MedrxivClusteringP2P.v2",
+            "MedrxivClusteringS2S.v2",
+            "CmedqaRetrieval",
+            "CMedQAv2-reranking",
+        ],
+    ),
+    description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.",
+    reference="",
+    citation=None,
+)
+
 MTEB_MINERS_BITEXT_MINING = Benchmark(
     name="MINERSBitextMining",
     tasks=get_tasks(
@@ -702,6 +725,7 @@ def load_results(
             "SpartQA",
             "TempReasonL1",
             "TRECCOVID",
+            "CUREv1",
             "WinoGrande",
             "BelebeleRetrieval",
             "MLQARetrieval",
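A sketch of running the new MTEB(Medical) benchmark end to end; the SentenceTransformer checkpoint and output folder below are placeholders, not part of this commit:

import mteb
from mteb import MTEB, MTEB_RETRIEVAL_MEDICAL

# Placeholder model; any model wrapper supported by mteb works here.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
evaluation = MTEB(tasks=MTEB_RETRIEVAL_MEDICAL.tasks)
results = evaluation.run(model, output_folder="results")  # placeholder output path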
(diffs for the remaining 16 changed files did not load and are not shown)