Commit

Merge branch 'lm-sys:main' into feature/jab-api-0.3
rmontoya12 authored Nov 26, 2024
2 parents 0af9f11 + 1cd4b74 commit 2179cce
Showing 5 changed files with 71 additions and 77 deletions.
4 changes: 2 additions & 2 deletions fastchat/constants.py
@@ -9,8 +9,8 @@

# Survey Link URL (to be removed) #00729c
SURVEY_LINK = """<div style='text-align: left; margin: 20px 0;'>
<div style='display: inline-block; border: 2px solid #C41E3A; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
<span style='color: #C41E3A; font-weight: bold;'>New Launch! Jailbreak models at <a href='https://redarena.ai' style='color: #C41E3A; text-decoration: underline;'>RedTeam Arena</a>. </span>
<div style='display: inline-block; border: 2px solid #00729c; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
<span style='color: #00729c; font-weight: bold;'>New Launch! Copilot Arena: <a href='https://marketplace.visualstudio.com/items?itemName=copilot-arena.copilot-arena' style='color: #00729c; text-decoration: underline;'>VS Code Extension</a> to compare Top LLMs</span>
</div>
</div>"""
# SURVEY_LINK = ""
1 change: 0 additions & 1 deletion fastchat/serve/monitor/elo_analysis.py
@@ -12,7 +12,6 @@
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from transformers import AutoTokenizer

from fastchat.model.model_registry import get_model_info
from fastchat.serve.monitor.basic_stats import get_log_files
46 changes: 14 additions & 32 deletions fastchat/serve/monitor/monitor.py
@@ -54,19 +54,16 @@


def recompute_final_ranking(arena_df):
# compute ranking based on CI
ranking = {}
for i, model_a in enumerate(arena_df.index):
ranking[model_a] = 1
for j, model_b in enumerate(arena_df.index):
if i == j:
continue
if (
arena_df.loc[model_b]["rating_q025"]
> arena_df.loc[model_a]["rating_q975"]
):
ranking[model_a] += 1
return list(ranking.values())
q025 = arena_df["rating_q025"].values
q975 = arena_df["rating_q975"].values

sorted_q025 = np.sort(q025)
insertion_indices = np.searchsorted(sorted_q025, q975, side="right")
counts = len(sorted_q025) - insertion_indices

rankings = 1 + counts
ranking_series = pd.Series(rankings, index=arena_df.index)
return ranking_series.tolist()


def arena_hard_title(date):
@@ -81,22 +78,6 @@ def arena_hard_title(date):
return arena_hard_title


def recompute_final_ranking(arena_df):
# compute ranking based on CI
ranking = {}
for i, model_a in enumerate(arena_df.index):
ranking[model_a] = 1
for j, model_b in enumerate(arena_df.index):
if i == j:
continue
if (
arena_df.loc[model_b]["rating_q025"]
> arena_df.loc[model_a]["rating_q975"]
):
ranking[model_a] += 1
return list(ranking.values())


def update_elo_components(
max_num_files, elo_results_file, ban_ip_file, exclude_model_names
):
@@ -861,14 +842,15 @@ def build_category_leaderboard_tab(
"full_style_control",
"hard_6",
"hard_6_style_control",
"if",
"coding",
"math",
"multiturn",
"creative_writing",
"if",
"long_user",
"multiturn",
# "no_refusal",
]
selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]

language_categories = [
"english",
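Note on the monitor.py change above: recompute_final_ranking replaces the O(n^2) pairwise loop with a vectorized computation. A model's rank is 1 plus the number of models whose CI lower bound (rating_q025) lies strictly above its CI upper bound (rating_q975); sorting the lower bounds once and using np.searchsorted gives that count for every model at the same time. The self-comparison the old loop skipped can never contribute, since a model's own lower bound cannot exceed its own upper bound. A minimal sketch (not part of the commit; the toy ratings are invented for illustration) checking that the two versions agree:

import numpy as np
import pandas as pd


def ranking_loop(arena_df):
    # Old version: for each model, count models whose CI sits strictly above it.
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if (
                arena_df.loc[model_b]["rating_q025"]
                > arena_df.loc[model_a]["rating_q975"]
            ):
                ranking[model_a] += 1
    return list(ranking.values())


def ranking_vectorized(arena_df):
    # New version: one sort plus searchsorted instead of the nested loop.
    q025 = arena_df["rating_q025"].values
    q975 = arena_df["rating_q975"].values
    sorted_q025 = np.sort(q025)
    counts = len(sorted_q025) - np.searchsorted(sorted_q025, q975, side="right")
    return pd.Series(1 + counts, index=arena_df.index).tolist()


# Toy leaderboard: model-c's interval sits entirely below the other two.
toy = pd.DataFrame(
    {"rating_q025": [1290, 1250, 1180], "rating_q975": [1310, 1295, 1210]},
    index=["model-a", "model-b", "model-c"],
)
assert ranking_loop(toy) == ranking_vectorized(toy) == [1, 1, 3]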
13 changes: 8 additions & 5 deletions fastchat/serve/monitor/monitor_md.py
@@ -9,6 +9,7 @@
"gemini-1.5-pro-api-0409-preview",
"bard-jan-24-gemini-pro",
"chatgpt-4o-latest-20240808",
"chatgpt-4o-latest-20240903",
]

key_to_category_name = {
@@ -18,11 +19,12 @@
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"creative_writing": "Creative Writing",
"coding": "Coding",
"coding_style_control": "Coding w/ Style Control",
"hard_6": "Hard Prompts (Overall)",
"hard_6": "Hard Prompts",
"hard_english_6": "Hard Prompts (English)",
"hard_6_style_control": "Hard Prompts (Overall) w/ Style Control",
"hard_6_style_control": "Hard Prompts w/ Style Control",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
@@ -47,8 +49,8 @@
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Coding w/ Style Control": "Coding with Style Control",
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts (Overall) w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
@@ -64,6 +66,7 @@
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Creative Writing": "Creative Writing",
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
@@ -81,7 +84,7 @@ def make_default_md_1(mirror=False):
link_color = "#1976D2" # This color should be clear in both light and dark mode
leaderboard_md = f"""
# 🏆 Chatbot Arena LLM Leaderboard: Community-driven Evaluation for Best LLM and AI chatbots
[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
[Twitter](https://twitter.com/lmarena_ai) | [Discord](https://discord.gg/6GXcFg3TH8) | [Blog](https://blog.lmarena.ai/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Kaggle Competition](https://www.kaggle.com/competitions/wsdm-cup-multilingual-chatbot-arena)
"""

return leaderboard_md
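Note on the monitor_md.py change above: the display names produced by key_to_category_name are also the keys into cat_name_to_explanation, so renaming "Hard Prompts (Overall)" to "Hard Prompts" has to land in both dicts at once, and the creative_writing key added to selected_categories in monitor.py needs entries here as well. A small hypothetical sketch of that lookup chain (describe and its fallback are illustrative, not code from the repo):

# Hypothetical sketch: a category key resolves to a tab name, which resolves to a description.
key_to_category_name = {
    "creative_writing": "Creative Writing",
    "hard_6": "Hard Prompts",
    "hard_6_style_control": "Hard Prompts w/ Style Control",
}
cat_name_to_explanation = {
    "Creative Writing": "Creative Writing",
    "Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
}


def describe(key):
    name = key_to_category_name.get(key, key)
    # If a rename is applied in only one of the two dicts, this lookup silently falls back.
    return name, cat_name_to_explanation.get(name, name)


print(describe("hard_6")[0])  # Hard Prompts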
84 changes: 47 additions & 37 deletions tests/load_test.py
@@ -1,18 +1,15 @@
import argparse
import time, asyncio
from openai import AsyncOpenAI, AsyncAzureOpenAI
import time
import threading
from concurrent.futures import ThreadPoolExecutor
import uuid
import traceback
import numpy as np
from transformers import AutoTokenizer
from litellm import completion

# base_url - litellm proxy endpoint
# api_key - litellm proxy api-key, is created proxy with auth
litellm_client = None


async def litellm_completion(args, tokenizer, image_url=None):
# Your existing code for litellm_completion goes here
def litellm_completion(args, tokenizer, image_url=None):
try:
if image_url:
messages = [
@@ -30,16 +27,24 @@ async def litellm_completion(args, tokenizer, image_url=None):
]

start = time.time()
response = await litellm_client.chat.completions.create(

additional_api_kwargs = {}
if args.api_key:
additional_api_kwargs["api_key"] = args.api_key
if args.api_base:
additional_api_kwargs["api_base"] = args.api_base

response = completion(
model=args.model,
messages=messages,
stream=True,
**additional_api_kwargs,
)
ttft = None

itl_list = []
content = ""
async for chunk in response:
for chunk in response:
if chunk.choices[0].delta.content:
end_time = time.time()
if ttft is None:
@@ -52,43 +57,48 @@ async def litellm_completion(args, tokenizer, image_url=None):
return content, ttft, itl_list

except Exception as e:
# If there's an exception, log the error message
print(e)
with open("error_log.txt", "a") as error_log:
error_log.write(f"Error during completion: {str(e)}\n")
return str(e)


async def main(args):
def main(args):
n = args.num_total_responses
batch_size = args.req_per_sec # Requests per second
start = time.time()

all_tasks = []
all_results = []
tokenizer = AutoTokenizer.from_pretrained("gpt2")
for i in range(0, n, batch_size):
batch = range(i, min(i + batch_size, n))
for _ in batch:
if args.include_image:
# Generate a random dimension for the image
if args.randomize_image_dimensions:
y_dimension = np.random.randint(100, 1025)

with ThreadPoolExecutor(max_workers=batch_size) as executor:
for i in range(0, n, batch_size):
batch_futures = []
batch = range(i, min(i + batch_size, n))

for _ in batch:
if args.include_image:
if args.randomize_image_dimensions:
y_dimension = np.random.randint(100, 1025)
else:
y_dimension = 512
image_url = f"https://placehold.co/1024x{y_dimension}/png"
future = executor.submit(
litellm_completion, args, tokenizer, image_url
)
else:
y_dimension = 512
image_url = f"https://placehold.co/1024x{y_dimension}/png"
task = asyncio.create_task(
litellm_completion(args, tokenizer, image_url)
)
else:
task = asyncio.create_task(litellm_completion(args, tokenizer))
all_tasks.append(task)
if i + batch_size < n:
await asyncio.sleep(1) # Wait 1 second before the next batch

all_completions = await asyncio.gather(*all_tasks)
future = executor.submit(litellm_completion, args, tokenizer)
batch_futures.append(future)

# Wait for batch to complete
for future in batch_futures:
all_results.append(future.result())

if i + batch_size < n:
time.sleep(1) # Wait 1 second before next batch

successful_completions = [
c for c in all_completions if isinstance(c, tuple) and len(c) == 3
c for c in all_results if isinstance(c, tuple) and len(c) == 3
]
ttft_list = np.array([float(c[1]) for c in successful_completions])
itl_list_flattened = np.array(
@@ -101,7 +111,7 @@ async def main(args):

# Write errors to error_log.txt
with open("load_test_errors.log", "a") as error_log:
for completion in all_completions:
for completion in all_results:
if isinstance(completion, str):
error_log.write(completion + "\n")

@@ -115,15 +125,15 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="azure-gpt-3.5")
parser.add_argument("--server-address", type=str, default="http://0.0.0.0:9094")
parser.add_argument("--api-base", type=str, default=None)
parser.add_argument("--api-key", type=str, default=None)
parser.add_argument("--num-total-responses", type=int, default=50)
parser.add_argument("--req-per-sec", type=int, default=5)
parser.add_argument("--include-image", action="store_true")
parser.add_argument("--randomize-image-dimensions", action="store_true")
args = parser.parse_args()

litellm_client = AsyncOpenAI(base_url=args.server_address, api_key="sk-1234")
# Blank out contents of error_log.txt
open("load_test_errors.log", "w").close()

asyncio.run(main(args))
main(args)
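
Note on the tests/load_test.py rewrite above: the asyncio plus AsyncOpenAI client pointed at a LiteLLM proxy is replaced by direct, synchronous litellm.completion calls fanned out with a ThreadPoolExecutor, and --api-base/--api-key take over from --server-address. A minimal sketch of the per-request timing pattern the script uses (streaming time-to-first-token and inter-token latency); the commented endpoint and key are placeholders taken from the old defaults, not required values:

import time

from litellm import completion


def timed_stream(model, prompt, api_base=None, api_key=None):
    # Stream one completion and record time-to-first-token (ttft)
    # plus the gaps between streamed chunks (itl), as litellm_completion does.
    kwargs = {}
    if api_key:
        kwargs["api_key"] = api_key
    if api_base:
        kwargs["api_base"] = api_base

    start = time.time()
    response = completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        **kwargs,
    )

    ttft, prev, itl, content = None, None, [], ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:
            now = time.time()
            if ttft is None:
                ttft = now - start
            else:
                itl.append(now - prev)
            prev = now
            content += delta
    return content, ttft, itl


# Placeholder invocation, e.g. against a local LiteLLM proxy:
# timed_stream("azure-gpt-3.5", "Say hi", api_base="http://0.0.0.0:9094", api_key="sk-1234")

From the command line the script itself would be run with the new flags, e.g. python tests/load_test.py --model azure-gpt-3.5 --api-base http://0.0.0.0:9094 --api-key sk-1234 --num-total-responses 50 --req-per-sec 5.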
