diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 9b89d5dd4..8a5eb961c 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -60,21 +60,25 @@ def format_list(props: list[str]):
     return ", ".join(props)
 
 
-def update_task_info(task_names: str) -> str:
+def update_task_info(task_names: str) -> gr.DataFrame:
     tasks = mteb.get_tasks(tasks=task_names)
-    df = tasks.to_dataframe()
+    df = tasks.to_dataframe(
+        properties=["name", "type", "languages", "domains", "reference", "main_score"]
+    )
     df["languages"] = df["languages"].map(format_list)
     df["domains"] = df["domains"].map(format_list)
+    df["name"] = "[" + df["name"] + "](" + df["reference"] + ")"
     df = df.rename(
         columns={
             "name": "Task Name",
             "type": "Task Type",
             "languages": "Languages",
             "domains": "Domains",
-            "license": "License",
+            "main_score": "Metric",
         }
     )
-    return df
+    df = df.drop(columns="reference")
+    return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1))
 
 
 all_results = load_results().filter_models()
@@ -215,6 +219,9 @@ def update_task_info(task_names: str) -> str:
             citation = gr.Markdown(update_citation, inputs=[benchmark_select])
         with gr.Column():
             plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+            gr.Markdown(
+                "*We only display models that have been run on all tasks in the benchmark*"
+            )
     with gr.Tab("Summary"):
         summary_table.render()
     with gr.Tab("Performance per task"):
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index d9b830d23..c965a7f68 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -200,7 +200,7 @@ def scores_to_tables(
             joint_table_style,
             # column_widths=column_widths,
             datatype=column_types,
-            wrap=True,
+            # wrap=True,
         ),
         gr.DataFrame(per_task_style),
     )