diff --git a/example/llm-leaderboard/leaderboard.md b/example/llm-leaderboard/leaderboard.md index 809bc13705..f61ef9eef7 100644 --- a/example/llm-leaderboard/leaderboard.md +++ b/example/llm-leaderboard/leaderboard.md @@ -15,6 +15,8 @@ Current supported LLMs: - chatglm 6b - chatglm2 6b - aquila 7b/7b-chat +- mistral-7b-instruct +- mistral-8*7b-instruct ## Build Starwhale Runtime diff --git a/example/llm-leaderboard/src/benchmark/cmmlu.py b/example/llm-leaderboard/src/benchmark/cmmlu.py index e227198503..9249542746 100644 --- a/example/llm-leaderboard/src/benchmark/cmmlu.py +++ b/example/llm-leaderboard/src/benchmark/cmmlu.py @@ -1,9 +1,9 @@ from __future__ import annotations import re +import random import typing as t -from starwhale import dataset from starwhale.utils.debug import console from starwhale.base.uri.resource import Resource @@ -79,11 +79,27 @@ def generate_samples_prompt( if few_shot <= 0: return "" - ds = dataset(dataset_uri) + # simplify samples with some fixed questions + samples_features = [ + {"question": "病毒体核心的主要物质是", "a": "类脂", "b": "核酸", "c": "蛋白质", "d": "磷酸", "answer": "B"}, + {"question": "流行病学属于什么范畴", "a": "临床医学", "b": "生物医学", "c": "基础医学", "d": "预防医学", "answer": "D"}, + {"question": "下列选项中,属于处分行为的是", "a": "捐助行为", "b": "抛弃所有权的行为", "c": "签订货物买卖合同", "d": "委托行为", "answer": "B"}, + {"question": "对累犯从重处罚的刑罚制度,体现了我国刑法的", "a": "罪刑法定原则", "b": "惩罚与教育相结合原则", "c": "刑法适用平等原则", "d": "罪责刑相适应原则", "answer": "D"}, + {"question": "犯罪分子具有刑法规定的减轻处罚情节的,应当在()判处刑罚。", "a": "法定刑幅度内按照最低刑", "b": "法定最高刑以下", "c": "法定刑以下", "d": "法定刑以内", "answer": "C"}, + {"question": "下列短语中,是定中短语的是", "a": "打扫干净", "b": "操作方法", "c": "张华同学", "d": "已经完成", "answer": "B"}, + {"question": "在下面重叠的例子中,表示“适度、适中”意义的是", "a": "白白的", "b": "坐坐", "c": "客客气气的", "d": "散散步", "answer": "A"}, + {"question": "“员、祖、乡、分、妊、严”中包含的自由语素是", "a": "乡、分、严", "b": "祖、分、严", "c": "祖、乡、分", "d": "员、分、妊", "answer": "A"}, + {"question": "必然王国和自由王国是社会发展的", "a": "两条不同的道路", "b": "两种不同的理想", "c": "两种不同的状态", "d": "两种不同的选择", "answer": "C"}, + {"question": "在垄断资本主义阶段占统治地位的资本是", "a": "工业资本", "b": "金融资本", "c": "农业资本", "d": "银行资本", "answer": "B"}, + ] + + random.shuffle(samples_features) samples = [] total = 0 + idx = 0 for i in range(0, few_shot): - features = ds[f"{subject}/dev/{i}"].features + features = samples_features[idx] + idx = (idx + 1) % len(samples_features) question = self.generate_question(features, include_answer=True) total += len_tokens(question) if total > max_length: @@ -121,6 +137,10 @@ def _ingest_choice(self, content: str) -> str: if match: return match.group(index) + m = re.findall(r"[ABCD]", content) + if len(m) >= 1: + return m[0] + raise ValueError(f"cannot ingest ABCD choice from {content}") def calculate_score( diff --git a/example/llm-leaderboard/src/evaluation.py b/example/llm-leaderboard/src/evaluation.py index f04bd5de42..359af68104 100644 --- a/example/llm-leaderboard/src/evaluation.py +++ b/example/llm-leaderboard/src/evaluation.py @@ -3,11 +3,12 @@ import os import typing as t import threading +import dataclasses from collections import defaultdict import numpy -from starwhale import evaluation +from starwhale import argument, evaluation from starwhale.utils.debug import console try: @@ -30,27 +31,48 @@ _g_llm = None _g_benchmarks: t.Dict[str, BenchmarkBase] = {} -max_prompt_length = int(os.environ.get("MAX_PROMPT_LENGTH", 2048)) -max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", 256)) + +@dataclasses.dataclass +class ModelGenerateArguments: + max_prompt_length: int = dataclasses.field( + default=2048, metadata={"help": 
"max length of prompt"} + ) + max_new_tokens: int = dataclasses.field( + default=256, metadata={"help": "max length of generated text"} + ) + batch: int = dataclasses.field( + default=1, metadata={"help": "batch size for inference"} + ) + temperature: float = dataclasses.field( + default=0.8, metadata={"help": "temperature"} + ) + top_p: float = dataclasses.field(default=0.95, metadata={"help": "top p"}) + tensor_parallel: int = dataclasses.field( + default=1, metadata={"help": "tensor parallel for vllm"} + ) + max_model_len: int = dataclasses.field( + default=16384, metadata={"help": "max model len for vllm kv cache"} + ) # TODO: support multi-gpus evaluation # TODO: enhance selected features +@argument(ModelGenerateArguments) @evaluation.predict( resources={"nvidia.com/gpu": 1}, replicas=1, + batch_size=32, auto_log=False, ) -def predict_question(data: dict, external: dict) -> None: - # dev split is used for few shot samples - if data.get("_hf_split", "") == "dev": - return - +def predict_question( + data: t.List[dict], external: dict, argument: ModelGenerateArguments +) -> None: # TODO: record cpu/gpu/memory info per predict pod global _g_llm + with threading.Lock(): if _g_llm is None: - _g_llm = get_built_llm() + _g_llm = get_built_llm(tensor_parallel=argument.tensor_parallel, max_model_len=argument.max_model_len) global _g_benchmarks dataset_uri = external["dataset_uri"] @@ -59,34 +81,61 @@ def predict_question(data: dict, external: dict) -> None: # TODO: use dataset_info to get benchmark _g_benchmarks[dataset_name] = get_benchmark(dataset_name) - result = {} benchmark = _g_benchmarks[dataset_name]() - for shot, show_name in few_shot_choices.items(): - prompt = benchmark.generate_prompt( - data, - few_shot=shot, - dataset_uri=dataset_uri, - max_length=max_prompt_length, - len_tokens=_g_llm.calculate_tokens_length, - ) - predict_result = _g_llm.do_predict( - prompt, - benchmark_type=benchmark.get_type(), - max_new_tokens=max_new_tokens, - predict_choice_by_logits=True, + + inputs = [] + for _index, _data in zip(external["index"], data): + # dev split is used for few shot samples + if _data.get("_hf_split", "") == "dev": + continue + + for _shot, _show_name in few_shot_choices.items(): + _prompt = benchmark.generate_prompt( + _data, + few_shot=_shot, + dataset_uri=dataset_uri, + max_length=argument.max_prompt_length, + len_tokens=_g_llm.calculate_tokens_length, + ) + inputs.append((_index, _show_name, _data, _prompt)) + + predict_results = [] + for idx in range(0, len(inputs), argument.batch): + batch_prompts = [x[-1] for x in inputs[idx : idx + argument.batch]] + + if _g_llm.support_batch_inference(): + _results = _g_llm.do_predict( + batch_prompts, + benchmark_type=benchmark.get_type(), + max_new_tokens=argument.max_new_tokens, + predict_choice_by_logits=True, + ) + predict_results.extend(_results) + else: + for _prompt in batch_prompts: + _result = _g_llm.do_predict( + _prompt, + benchmark_type=benchmark.get_type(), + max_new_tokens=argument.max_new_tokens, + predict_choice_by_logits=True, + ) + predict_results.append(_result) + + for (_index, _show_name, _data, _prompt), predict_result in zip( + inputs, predict_results + ): + score = benchmark.calculate_score(predict_result, _data) + console.trace(f"prompt:\n {_prompt}") + console.trace(f"answer: {_data['answer']}, predict: {score}") + + evaluation.log( + category="results", + id=f"{benchmark.get_name()}-{_index}", + metrics={ + "input": benchmark.make_input_features_display(_data), + "output": {_show_name: score}, + }, ) - 
result[show_name] = benchmark.calculate_score(predict_result, data) - console.trace(f"prompt:\n {prompt}") - console.trace(f"answer: {data['answer']}, predict: {result[show_name]}") - - evaluation.log( - category="results", - id=f"{benchmark.get_name()}-{external['index']}", - metrics={ - "input": benchmark.make_input_features_display(data), - "output": result, - }, - ) @evaluation.evaluate(needs=[predict_question], use_predict_auto_log=False) diff --git a/example/llm-leaderboard/src/llm/__init__.py b/example/llm-leaderboard/src/llm/__init__.py index 9a0c3da6e3..dfc9827c11 100644 --- a/example/llm-leaderboard/src/llm/__init__.py +++ b/example/llm-leaderboard/src/llm/__init__.py @@ -1,4 +1,4 @@ -from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan # noqa: F401 +from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan, mistral # noqa: F401 from .base import get_llm, get_built_llm, get_supported_llm __all__ = ["get_llm", "get_supported_llm", "get_built_llm"] diff --git a/example/llm-leaderboard/src/llm/base.py b/example/llm-leaderboard/src/llm/base.py index c8c6b25bb3..145c33d17e 100644 --- a/example/llm-leaderboard/src/llm/base.py +++ b/example/llm-leaderboard/src/llm/base.py @@ -5,6 +5,7 @@ import typing as t import logging from abc import ABC, abstractmethod +from typing import Dict, List from pathlib import Path from dataclasses import dataclass @@ -37,7 +38,8 @@ class LLMModelDesc: class LLMBase(ABC): - def __init__(self, rootdir: Path | None = None) -> None: + def __init__(self, **kwargs: t.Any) -> None: + rootdir = kwargs.get("rootdir") self.rootdir = rootdir if rootdir is not None else Path.cwd() @classmethod @@ -54,6 +56,9 @@ def get_description(cls) -> LLMModelDesc: def download(self) -> None: raise NotImplementedError + def support_batch_inference(self) -> bool: + return False + def ensure_swignore(self) -> None: swi_path = self.rootdir / ".swignore" @@ -80,16 +85,29 @@ def get_pretrained_dir(self) -> Path: @abstractmethod def do_predict( self, - input_prompt: str, + input_prompt: str | t.List[str], benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, max_new_tokens: int = 50, - predict_choice_by_logits: bool = False, - ) -> t.Dict | str: + **kwargs: t.Any, + ) -> t.Dict | str | t.List: raise NotImplementedError def calculate_tokens_length(self, input_prompt: str) -> int: return len(input_prompt) + def _simplify_content(self, content: str) -> str: + content = content.strip() + + for token in ("###", "[UNK]", "", "]", ")", ")", "】"): + if content.endswith(token): + content = content[: -len(token)].strip() + + for prefix in (":", ":", "(", "(", "[", "【"): + if content.startswith(prefix): + content = content[len(prefix) :].strip() + + return content + def ensure_readme(self) -> None: readme_path = self.rootdir / "README.md" desc = self.get_description() @@ -141,9 +159,79 @@ def ensure_readme(self) -> None: ensure_file(readme_path, content) +class vLLMBase(LLMBase): + def __init__(self, **kwargs: t.Any) -> None: + super().__init__(**kwargs) + + self._model = None + self._tensor_parallel = kwargs.get("tensor_parallel", 1) + self._max_model_len = kwargs.get("max_model_len", 32768) + + @abstractmethod + def get_hf_repo_id(self) -> str: + raise NotImplementedError + + def get_base_dir(self) -> Path: + return self.get_pretrained_dir() / self.get_name() / "base" + + def download(self) -> None: + from huggingface_hub import snapshot_download + + local_dir = self.get_base_dir() + ensure_dir(local_dir) + snapshot_download( + repo_id=self.get_hf_repo_id(), 
local_dir=local_dir, max_workers=16 + ) + + def support_batch_inference(self) -> bool: + return True + + def _get_model(self) -> t.Any: + if self._model is None: + path = self.get_base_dir() + console.print(f":monkey: try to load model({path}) into memory...") + + import vllm + + self._model = vllm.LLM( + path, dtype=torch.float16, tensor_parallel_size=self._tensor_parallel, max_model_len=self._max_model_len + ) + + return self._model + + @torch.no_grad() + def do_predict( + self, + input_prompts: str | List[str], + benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, + max_new_tokens: int = 50, + **kwargs: t.Any, + ) -> Dict | str | t.List: + if isinstance(input_prompts, str): + input_prompts = [input_prompts] + + import vllm + + temperature = kwargs.get("temperature", 0.8) + top_p = kwargs.get("top_p", 0.95) + + sp = vllm.SamplingParams( + temperature=temperature, top_p=top_p, max_tokens=max_new_tokens + ) + outputs = self._get_model().generate(input_prompts, sp) + outputs.sort(key=lambda x: x.request_id) + + ret = [] + for output in outputs: + content = "".join([o.text for o in output.outputs]) + content = self._simplify_content(content) + ret.append(content) + return ret + + class HuggingfaceLLMBase(LLMBase): - def __init__(self, rootdir: Path | None = None) -> None: - super().__init__(rootdir) + def __init__(self, **kwargs: t.Any) -> None: + super().__init__(**kwargs) self._tokenizer = None self._model = None @@ -247,30 +335,17 @@ def get_generate_kwargs(self) -> t.Dict[str, t.Any]: repetition_penalty=float(os.environ.get("REPETITION_PENALTY", 1.3)), ) - def _simplify_content(self, content: str) -> str: - content = content.strip() - - for token in ("###", "[UNK]", "", "]", ")", ")", "】"): - if content.endswith(token): - content = content[: -len(token)].strip() - - for prefix in (":", ":", "(", "(", "[", "【"): - if content.startswith(prefix): - content = content[len(prefix) :].strip() - - return content - def do_predict( self, input_prompt: str, benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, max_new_tokens: int = 50, - predict_choice_by_logits: bool = False, - ) -> t.Dict | str: + **kwargs: t.Any, + ) -> t.Dict | str | t.List: # TODO: add self prompt wrapper content = self._do_predict_with_generate(input_prompt, max_new_tokens) ret = {"content": self._simplify_content(content)} - if predict_choice_by_logits: + if kwargs.get("predict_choice_by_logits", False): if benchmark_type != BenchmarkType.MultipleChoice: raise ValueError( "predict_choice_by_logits only support BenchmarkType.MultipleChoice" @@ -363,7 +438,8 @@ def get_llm(name: str, **kwargs: t.Any) -> LLMBase: return _SUPPORTED_LLM[name](**kwargs) -def get_built_llm(rootdir: Path | None = None, **kwargs: t.Any) -> LLMBase: +def get_built_llm(**kwargs: t.Any) -> LLMBase: + rootdir = kwargs.get("rootdir") rootdir = rootdir if rootdir is not None else Path.cwd() config_path = rootdir / "pretrained" / "sw_config.json" if not config_path.exists(): diff --git a/example/llm-leaderboard/src/llm/mistral.py b/example/llm-leaderboard/src/llm/mistral.py new file mode 100644 index 0000000000..6650507221 --- /dev/null +++ b/example/llm-leaderboard/src/llm/mistral.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from .base import register, vLLMBase, LLMModelDesc + + +@register() +class Mistral7BInstruct(vLLMBase): + def get_hf_repo_id(self) -> str: + return "mistralai/Mistral-7B-Instruct-v0.2" + + @classmethod + def get_name(cls) -> str: + return "mistral-7b-instruct" + + @classmethod + def get_description(cls) 
-> LLMModelDesc:
+        return LLMModelDesc(
+            params="7b",
+            intro=(
+                "We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. "
+                "Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. "
+                "Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. "
+                "We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license."
+            ),
+            license="apache-2.0",
+            author="Mistral",
+            github="https://github.com/mistralai/mistral-src",
+            type="fine-tuned",
+        )
+
+    def download(self) -> None:
+        super().download()
+
+        local_dir = self.get_base_dir()
+        # We only need safetensors files.
+        useless_fnames = (
+            "pytorch_model-00001-of-00003.bin",
+            "pytorch_model-00002-of-00003.bin",
+            "pytorch_model-00003-of-00003.bin",
+            "pytorch_model.bin.index.json",
+        )
+        for fname in useless_fnames:
+            (local_dir / fname).unlink()
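
The change to `_ingest_choice` in `cmmlu.py` adds a last-resort fallback: when none of the anchored answer patterns match, the first bare A/B/C/D character in the model output is taken as the choice. The sketch below illustrates that two-stage flow; the anchored pattern list here is abbreviated for illustration and is not the benchmark's full set.

```python
import re


def ingest_choice(content: str) -> str:
    # stage 1: anchored patterns (abbreviated; the real list in cmmlu.py is longer)
    anchored = [
        (r"答案(?:选项)?(?:是|为)?\s*([ABCD])", 1),
        (r"^\s*([ABCD])\b", 1),
    ]
    for pattern, index in anchored:
        match = re.search(pattern, content)
        if match:
            return match.group(index)

    # stage 2 (the fallback added in this change): first bare A/B/C/D anywhere
    m = re.findall(r"[ABCD]", content)
    if m:
        return m[0]

    raise ValueError(f"cannot ingest ABCD choice from {content}")


assert ingest_choice("答案是B") == "B"
assert ingest_choice("I think the correct option is (C).") == "C"
```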
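
In `evaluation.py`, the `MAX_PROMPT_LENGTH`/`MAX_NEW_TOKENS` environment variables are replaced by a `ModelGenerateArguments` dataclass that the `@argument(...)` decorator injects into `predict_question`. The sketch below shows one way such a dataclass can be mapped onto command-line flags via its field metadata; `build_parser` is a hypothetical helper written for illustration, not Starwhale's implementation.

```python
import argparse
import dataclasses


@dataclasses.dataclass
class GenerateArguments:
    # trimmed copy of ModelGenerateArguments, for illustration only
    max_prompt_length: int = dataclasses.field(
        default=2048, metadata={"help": "max length of prompt"}
    )
    max_new_tokens: int = dataclasses.field(
        default=256, metadata={"help": "max length of generated text"}
    )
    batch: int = dataclasses.field(default=1, metadata={"help": "batch size"})


def build_parser(dc: type) -> argparse.ArgumentParser:
    # one CLI flag per dataclass field, typed and documented from the field itself
    parser = argparse.ArgumentParser()
    for f in dataclasses.fields(dc):
        parser.add_argument(
            f"--{f.name.replace('_', '-')}",
            type=f.type,
            default=f.default,
            help=f.metadata.get("help", ""),
        )
    return parser


if __name__ == "__main__":
    ns = build_parser(GenerateArguments).parse_args()
    args = GenerateArguments(**vars(ns))
    print(args)
```

Running this with, say, `--max-new-tokens 64` overrides only that field and leaves the other defaults intact, which is the kind of per-field override the decorator-based version provides.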
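
The rewritten `predict_question` collects one prompt per (sample, few-shot setting) pair, then walks that list in chunks of `argument.batch`, sending a whole chunk to the backend when `support_batch_inference()` is true and falling back to per-prompt calls otherwise. A generic, self-contained version of that dispatch loop looks roughly like this (`predict` stands in for `_g_llm.do_predict`):

```python
import typing as t


def run_batched(
    prompts: t.List[str],
    predict: t.Callable[[t.Union[str, t.List[str]]], t.Any],
    batch: int,
    supports_batch: bool,
) -> t.List[t.Any]:
    results: t.List[t.Any] = []
    for idx in range(0, len(prompts), batch):
        chunk = prompts[idx : idx + batch]
        if supports_batch:
            # one call per chunk; the backend returns one result per prompt
            results.extend(predict(chunk))
        else:
            # backend only accepts a single prompt at a time
            results.extend(predict(p) for p in chunk)
    return results
```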
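
The new `vLLMBase` in `base.py` wraps the plain vLLM offline-inference API: build one `vllm.LLM` engine, then pass a list of prompts plus a `SamplingParams` object to `generate`. Below is a minimal standalone sketch of that call pattern, with a placeholder model path and the same default sampling values used in the patch.

```python
import torch
import vllm

# engine construction mirrors vLLMBase._get_model(); the path is a placeholder
llm = vllm.LLM(
    "pretrained/mistral-7b-instruct/base",
    dtype=torch.float16,
    tensor_parallel_size=1,
    max_model_len=16384,
)

# sampling defaults mirror ModelGenerateArguments: temperature=0.8, top_p=0.95
sampling = vllm.SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)

outputs = llm.generate(["The capital of France is", "1 + 1 ="], sampling)
for output in outputs:
    # each RequestOutput can hold several completions; join their text
    print("".join(o.text for o in output.outputs))
```

Because the whole prompt list goes through the engine in a single `generate` call, `vLLMBase.support_batch_inference()` returns True and `predict_question` can hand it full batches.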