
Leaderboard 2.0: added performance x n_parameters plot + more benchmark info #1437

Merged
merged 9 commits on Nov 12, 2024
25 changes: 22 additions & 3 deletions mteb/leaderboard/app.py
@@ -9,6 +9,7 @@

import mteb
from mteb.caching import json_cache
from mteb.leaderboard.figures import performance_size_plot
from mteb.leaderboard.table import scores_to_tables


@@ -32,11 +33,22 @@ def update_citation(benchmark_name: str) -> str:
return citation


def update_description(benchmark_name: str) -> str:
def update_description(
benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str]
) -> str:
benchmark = mteb.get_benchmark(benchmark_name)
description = f"## {benchmark.name}\n{benchmark.description}\n"
n_languages = len(languages)
n_task_types = len(task_types)
n_tasks = len(benchmark.tasks)
n_domains = len(domains)
description += f" - **Number of languages**: {n_languages}\n"
description += f" - **Number of datasets**: {n_tasks}\n"
description += f" - **Number of task types**: {n_task_types}\n"
description += f" - **Number of domains**: {n_domains}\n"
if str(benchmark.reference) != "None":
description += f"\n[Click for More Info]({benchmark.reference})"

return description


@@ -194,14 +206,21 @@ def update_task_info(task_names: str) -> str:
interactive=True,
)
scores = gr.State(default_scores)
description = gr.Markdown(update_description, inputs=[benchmark_select])
with gr.Row():
with gr.Column():
description = gr.Markdown(
update_description,
inputs=[benchmark_select, lang_select, type_select, domain_select],
)
citation = gr.Markdown(update_citation, inputs=[benchmark_select])
with gr.Column():
plot = gr.Plot(performance_size_plot, inputs=[summary_table])
with gr.Tab("Summary"):
summary_table.render()
with gr.Tab("Performance per task"):
per_task_table.render()
with gr.Tab("Task information"):
task_info_table = gr.DataFrame(update_task_info, inputs=[task_select])
citation = gr.Markdown(update_citation, inputs=[benchmark_select])

@gr.on(inputs=[scores, searchbar], outputs=[summary_table, per_task_table])
def update_tables(scores, search_query: str):
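For context, the language, task-type, and domain counts in the new `update_description` come from the currently selected filter values, while the dataset count still comes from the benchmark itself. A minimal sketch of calling it directly (illustrative benchmark name and filter lists; importing the app module also builds the Gradio demo, so this assumes the `leaderboard` extra is installed):

```python
from mteb.leaderboard.app import update_description

# Illustrative selections; in the app these come from the dropdown widgets.
languages = ["eng", "deu", "fra"]
task_types = ["Classification", "Retrieval", "STS"]
domains = ["Academic", "News"]

markdown = update_description("MTEB(Multilingual)", languages, task_types, domains)
# The returned markdown starts with the benchmark heading and lists
# 3 languages, the benchmark's dataset count, 3 task types and 2 domains.
print(markdown)
```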
82 changes: 82 additions & 0 deletions mteb/leaderboard/figures.py
@@ -0,0 +1,82 @@
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


def parse_n_params(text: str) -> int:
if text.endswith("M"):
return float(text[:-1]) * 1e6
if text.endswith("B"):
return float(text[:-1]) * 1e9


def parse_model_name(name: str) -> str:
name, _ = name.split("]")
return name[1:]


models_to_annotate = [
"all-MiniLM-L6-v2",
"GritLM-7B",
"LaBSE",
"multilingual-e5-large-instruct",
]


def performance_size_plot(df: pd.DataFrame) -> go.Figure:
df = df.copy()
df["Number of Parameters"] = df["Number of Parameters"].map(parse_n_params)
df["Model"] = df["Model"].map(parse_model_name)
df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int)
df["Max Tokens"] = df["Max Tokens"].map(int)
df["Log(Tokens)"] = np.log10(df["Max Tokens"])
min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max()
fig = px.scatter(
df,
x="Number of Parameters",
y="Mean (Task)",
log_x=True,
template="plotly_white",
text="model_text",
size="Embedding Dimensions",
color="Log(Tokens)",
range_color=[2, 5],
range_x=[8 * 1e6, 11 * 1e9],
range_y=[min(0, min_score * 1.25), max_score * 1.25],
hover_data={
"Max Tokens": True,
"Embedding Dimensions": True,
"Number of Parameters": True,
"Mean (Task)": True,
"Rank (Borda)": True,
"Log(Tokens)": False,
"model_text": False,
},
hover_name="Model",
)
fig.update_layout(
coloraxis_colorbar=dict(
title="Max Tokens",
tickvals=[2, 3, 4, 5],
ticktext=[
"100",
"1K",
"10K",
"100K",
],
),
hoverlabel=dict(
bgcolor="white",
font_size=16,
),
)
fig.update_traces(
textposition="top center",
)
fig.update_layout(
font=dict(size=16, color="black"),
margin=dict(b=20, t=10, l=20, r=10),
)
return fig
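For reference, a minimal sketch of the input `performance_size_plot` expects: the summary-table columns it references, with `Number of Parameters` given as a string ending in `M` or `B` (what `parse_n_params` assumes) and `Model` given as a markdown link (what `parse_model_name` assumes). The rows below are toy values, not real leaderboard scores:

```python
import pandas as pd

from mteb.leaderboard.figures import performance_size_plot

# Toy rows only; in the app this frame comes from the summary table.
toy = pd.DataFrame(
    {
        "Model": [
            "[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)",
            "[GritLM-7B](https://huggingface.co/GritLM/GritLM-7B)",
        ],
        "Number of Parameters": ["22M", "7B"],  # parsed to 22e6 / 7e9
        "Embedding Dimensions": [384, 4096],
        "Max Tokens": [512, 4096],
        "Mean (Task)": [56.1, 66.8],
        "Rank (Borda)": [12, 1],
    }
)
fig = performance_size_plot(toy)
fig.show()
```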
47 changes: 19 additions & 28 deletions mteb/leaderboard/table.py
@@ -117,11 +117,6 @@ def scores_to_tables(
joint_table = joint_table.drop(columns=["model_revision"])
model_metas = joint_table["model_name"].map(get_model_meta)
joint_table["model_link"] = model_metas.map(lambda m: m.reference)
# joint_table.insert(
# 1,
# "Rank (Mean)",
# joint_table["mean"].rank(ascending=False, method="min").astype(int),
# )
joint_table.insert(
1,
"Max Tokens",
@@ -163,36 +158,32 @@
}
)
joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))
to_format = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
joint_table[to_format] = joint_table[to_format].map(format_scores)
joint_table = joint_table.style.highlight_max(
subset=to_format,
props="font-weight: bold",
)
joint_table = joint_table.format(
"{:.2f}", subset=joint_table.data.select_dtypes("number").columns
)
joint_table = joint_table.format("{:,}", subset=["Rank (Borda)"])
joint_table = joint_table.highlight_min(
subset=["Rank (Borda)"], props="font-weight: bold"
)
numerics = per_task.select_dtypes("number").columns
per_task[numerics] = per_task[numerics].map(format_scores)
per_task = per_task.style.highlight_max(
subset=numerics, props="font-weight: bold"
).format("{:.2f}", subset=numerics)
column_widths = get_column_widths(joint_table.data)
column_widths = get_column_widths(joint_table)
# overriding for model name
column_widths[1] = "250px"
column_types = get_column_types(joint_table.data)
column_types = get_column_types(joint_table)
# setting model name column to markdown
column_types[1] = "markdown"
score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
joint_table[score_columns] *= 100
joint_table_style = (
joint_table.style.format(
{**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
)
.highlight_min("Rank (Borda)", props="font-weight: bold")
.highlight_max(subset=score_columns, props="font-weight: bold")
)
task_score_columns = per_task.select_dtypes("number").columns
per_task[task_score_columns] *= 100
per_task_style = per_task.style.format(
"{:.2f}", subset=task_score_columns
).highlight_max(subset=task_score_columns, props="font-weight: bold")
return (
gr.DataFrame(
joint_table,
column_widths=column_widths,
joint_table_style,
# column_widths=column_widths,
datatype=column_types,
wrap=True,
),
gr.DataFrame(per_task),
gr.DataFrame(per_task_style),
)
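One change worth calling out: rather than rounding the scores in the underlying frame (the old `format_scores` path), both tables are now returned as pandas Stylers, which `gr.DataFrame` can render directly (likely part of why the Gradio pin is bumped below). A standalone sketch of the pattern, with toy numbers:

```python
import pandas as pd

# Toy frame mimicking the summary table's numeric columns.
scores = pd.DataFrame(
    {
        "Rank (Borda)": [1, 2],
        "Model": ["model-a", "model-b"],
        "Mean (Task)": [0.668, 0.612],
    }
)
scores["Mean (Task)"] *= 100  # values stay numeric; formatting is display-only
styled = (
    scores.style.format({"Mean (Task)": "{:.2f}", "Rank (Borda)": "{:.0f}"})
    .highlight_min("Rank (Borda)", props="font-weight: bold")
    .highlight_max(subset=["Mean (Task)"], props="font-weight: bold")
)
# `styled` can then be passed to gr.DataFrame in place of the raw frame.
```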
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint
codecarbon = ["codecarbon"]
speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"]
peft = ["peft>=0.11.0"]
leaderboard = ["gradio>=4.44.0", "gradio_rangeslider>=0.0.6"]
leaderboard = ["gradio>=5.5.0", "gradio_rangeslider>=0.0.8"]
flagembedding = ["FlagEmbedding"]
jina = ["einops>=0.8.0"]
flash_attention = ["flash-attn>=2.6.3"]