diff --git a/example/llm-leaderboard/leaderboard.md b/example/llm-leaderboard/leaderboard.md index 809bc13705..f61ef9eef7 100644 --- a/example/llm-leaderboard/leaderboard.md +++ b/example/llm-leaderboard/leaderboard.md @@ -15,6 +15,8 @@ Current supported LLMs: - chatglm 6b - chatglm2 6b - aquila 7b/7b-chat +- mistral-7b-instruct +- mistral-8*7b-instruct ## Build Starwhale Runtime diff --git a/example/llm-leaderboard/src/benchmark/cmmlu.py b/example/llm-leaderboard/src/benchmark/cmmlu.py index e227198503..9249542746 100644 --- a/example/llm-leaderboard/src/benchmark/cmmlu.py +++ b/example/llm-leaderboard/src/benchmark/cmmlu.py @@ -1,9 +1,9 @@ from __future__ import annotations import re +import random import typing as t -from starwhale import dataset from starwhale.utils.debug import console from starwhale.base.uri.resource import Resource @@ -79,11 +79,27 @@ def generate_samples_prompt( if few_shot <= 0: return "" - ds = dataset(dataset_uri) + # simplify samples with some fixed questions + samples_features = [ + {"question": "病毒体核心的主要物质是", "a": "类脂", "b": "核酸", "c": "蛋白质", "d": "磷酸", "answer": "B"}, + {"question": "流行病学属于什么范畴", "a": "临床医学", "b": "生物医学", "c": "基础医学", "d": "预防医学", "answer": "D"}, + {"question": "下列选项中,属于处分行为的是", "a": "捐助行为", "b": "抛弃所有权的行为", "c": "签订货物买卖合同", "d": "委托行为", "answer": "B"}, + {"question": "对累犯从重处罚的刑罚制度,体现了我国刑法的", "a": "罪刑法定原则", "b": "惩罚与教育相结合原则", "c": "刑法适用平等原则", "d": "罪责刑相适应原则", "answer": "D"}, + {"question": "犯罪分子具有刑法规定的减轻处罚情节的,应当在()判处刑罚。", "a": "法定刑幅度内按照最低刑", "b": "法定最高刑以下", "c": "法定刑以下", "d": "法定刑以内", "answer": "C"}, + {"question": "下列短语中,是定中短语的是", "a": "打扫干净", "b": "操作方法", "c": "张华同学", "d": "已经完成", "answer": "B"}, + {"question": "在下面重叠的例子中,表示“适度、适中”意义的是", "a": "白白的", "b": "坐坐", "c": "客客气气的", "d": "散散步", "answer": "A"}, + {"question": "“员、祖、乡、分、妊、严”中包含的自由语素是", "a": "乡、分、严", "b": "祖、分、严", "c": "祖、乡、分", "d": "员、分、妊", "answer": "A"}, + {"question": "必然王国和自由王国是社会发展的", "a": "两条不同的道路", "b": "两种不同的理想", "c": "两种不同的状态", "d": "两种不同的选择", "answer": "C"}, + {"question": "在垄断资本主义阶段占统治地位的资本是", "a": "工业资本", "b": "金融资本", "c": "农业资本", "d": "银行资本", "answer": "B"}, + ] + + random.shuffle(samples_features) samples = [] total = 0 + idx = 0 for i in range(0, few_shot): - features = ds[f"{subject}/dev/{i}"].features + features = samples_features[idx] + idx = (idx + 1) % len(samples_features) question = self.generate_question(features, include_answer=True) total += len_tokens(question) if total > max_length: @@ -121,6 +137,10 @@ def _ingest_choice(self, content: str) -> str: if match: return match.group(index) + m = re.findall(r"[ABCD]", content) + if len(m) >= 1: + return m[0] + raise ValueError(f"cannot ingest ABCD choice from {content}") def calculate_score( diff --git a/example/llm-leaderboard/src/evaluation.py b/example/llm-leaderboard/src/evaluation.py index f04bd5de42..359af68104 100644 --- a/example/llm-leaderboard/src/evaluation.py +++ b/example/llm-leaderboard/src/evaluation.py @@ -3,11 +3,12 @@ import os import typing as t import threading +import dataclasses from collections import defaultdict import numpy -from starwhale import evaluation +from starwhale import argument, evaluation from starwhale.utils.debug import console try: @@ -30,27 +31,48 @@ _g_llm = None _g_benchmarks: t.Dict[str, BenchmarkBase] = {} -max_prompt_length = int(os.environ.get("MAX_PROMPT_LENGTH", 2048)) -max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", 256)) + +@dataclasses.dataclass +class ModelGenerateArguments: + max_prompt_length: int = dataclasses.field( + default=2048, metadata={"help": 
"max length of prompt"} + ) + max_new_tokens: int = dataclasses.field( + default=256, metadata={"help": "max length of generated text"} + ) + batch: int = dataclasses.field( + default=1, metadata={"help": "batch size for inference"} + ) + temperature: float = dataclasses.field( + default=0.8, metadata={"help": "temperature"} + ) + top_p: float = dataclasses.field(default=0.95, metadata={"help": "top p"}) + tensor_parallel: int = dataclasses.field( + default=1, metadata={"help": "tensor parallel for vllm"} + ) + max_model_len: int = dataclasses.field( + default=16384, metadata={"help": "max model len for vllm kv cache"} + ) # TODO: support multi-gpus evaluation # TODO: enhance selected features +@argument(ModelGenerateArguments) @evaluation.predict( resources={"nvidia.com/gpu": 1}, replicas=1, + batch_size=32, auto_log=False, ) -def predict_question(data: dict, external: dict) -> None: - # dev split is used for few shot samples - if data.get("_hf_split", "") == "dev": - return - +def predict_question( + data: t.List[dict], external: dict, argument: ModelGenerateArguments +) -> None: # TODO: record cpu/gpu/memory info per predict pod global _g_llm + with threading.Lock(): if _g_llm is None: - _g_llm = get_built_llm() + _g_llm = get_built_llm(tensor_parallel=argument.tensor_parallel, max_model_len=argument.max_model_len) global _g_benchmarks dataset_uri = external["dataset_uri"] @@ -59,34 +81,61 @@ def predict_question(data: dict, external: dict) -> None: # TODO: use dataset_info to get benchmark _g_benchmarks[dataset_name] = get_benchmark(dataset_name) - result = {} benchmark = _g_benchmarks[dataset_name]() - for shot, show_name in few_shot_choices.items(): - prompt = benchmark.generate_prompt( - data, - few_shot=shot, - dataset_uri=dataset_uri, - max_length=max_prompt_length, - len_tokens=_g_llm.calculate_tokens_length, - ) - predict_result = _g_llm.do_predict( - prompt, - benchmark_type=benchmark.get_type(), - max_new_tokens=max_new_tokens, - predict_choice_by_logits=True, + + inputs = [] + for _index, _data in zip(external["index"], data): + # dev split is used for few shot samples + if _data.get("_hf_split", "") == "dev": + continue + + for _shot, _show_name in few_shot_choices.items(): + _prompt = benchmark.generate_prompt( + _data, + few_shot=_shot, + dataset_uri=dataset_uri, + max_length=argument.max_prompt_length, + len_tokens=_g_llm.calculate_tokens_length, + ) + inputs.append((_index, _show_name, _data, _prompt)) + + predict_results = [] + for idx in range(0, len(inputs), argument.batch): + batch_prompts = [x[-1] for x in inputs[idx : idx + argument.batch]] + + if _g_llm.support_batch_inference(): + _results = _g_llm.do_predict( + batch_prompts, + benchmark_type=benchmark.get_type(), + max_new_tokens=argument.max_new_tokens, + predict_choice_by_logits=True, + ) + predict_results.extend(_results) + else: + for _prompt in batch_prompts: + _result = _g_llm.do_predict( + _prompt, + benchmark_type=benchmark.get_type(), + max_new_tokens=argument.max_new_tokens, + predict_choice_by_logits=True, + ) + predict_results.append(_result) + + for (_index, _show_name, _data, _prompt), predict_result in zip( + inputs, predict_results + ): + score = benchmark.calculate_score(predict_result, _data) + console.trace(f"prompt:\n {_prompt}") + console.trace(f"answer: {_data['answer']}, predict: {score}") + + evaluation.log( + category="results", + id=f"{benchmark.get_name()}-{_index}", + metrics={ + "input": benchmark.make_input_features_display(_data), + "output": {_show_name: score}, + }, ) - 
result[show_name] = benchmark.calculate_score(predict_result, data) - console.trace(f"prompt:\n {prompt}") - console.trace(f"answer: {data['answer']}, predict: {result[show_name]}") - - evaluation.log( - category="results", - id=f"{benchmark.get_name()}-{external['index']}", - metrics={ - "input": benchmark.make_input_features_display(data), - "output": result, - }, - ) @evaluation.evaluate(needs=[predict_question], use_predict_auto_log=False) diff --git a/example/llm-leaderboard/src/llm/__init__.py b/example/llm-leaderboard/src/llm/__init__.py index 9a0c3da6e3..dfc9827c11 100644 --- a/example/llm-leaderboard/src/llm/__init__.py +++ b/example/llm-leaderboard/src/llm/__init__.py @@ -1,4 +1,4 @@ -from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan # noqa: F401 +from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan, mistral # noqa: F401 from .base import get_llm, get_built_llm, get_supported_llm __all__ = ["get_llm", "get_supported_llm", "get_built_llm"] diff --git a/example/llm-leaderboard/src/llm/base.py b/example/llm-leaderboard/src/llm/base.py index c8c6b25bb3..145c33d17e 100644 --- a/example/llm-leaderboard/src/llm/base.py +++ b/example/llm-leaderboard/src/llm/base.py @@ -5,6 +5,7 @@ import typing as t import logging from abc import ABC, abstractmethod +from typing import Dict, List from pathlib import Path from dataclasses import dataclass @@ -37,7 +38,8 @@ class LLMModelDesc: class LLMBase(ABC): - def __init__(self, rootdir: Path | None = None) -> None: + def __init__(self, **kwargs: t.Any) -> None: + rootdir = kwargs.get("rootdir") self.rootdir = rootdir if rootdir is not None else Path.cwd() @classmethod @@ -54,6 +56,9 @@ def get_description(cls) -> LLMModelDesc: def download(self) -> None: raise NotImplementedError + def support_batch_inference(self) -> bool: + return False + def ensure_swignore(self) -> None: swi_path = self.rootdir / ".swignore" @@ -80,16 +85,29 @@ def get_pretrained_dir(self) -> Path: @abstractmethod def do_predict( self, - input_prompt: str, + input_prompt: str | t.List[str], benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, max_new_tokens: int = 50, - predict_choice_by_logits: bool = False, - ) -> t.Dict | str: + **kwargs: t.Any, + ) -> t.Dict | str | t.List: raise NotImplementedError def calculate_tokens_length(self, input_prompt: str) -> int: return len(input_prompt) + def _simplify_content(self, content: str) -> str: + content = content.strip() + + for token in ("###", "[UNK]", "", "]", ")", ")", "】"): + if content.endswith(token): + content = content[: -len(token)].strip() + + for prefix in (":", ":", "(", "(", "[", "【"): + if content.startswith(prefix): + content = content[len(prefix) :].strip() + + return content + def ensure_readme(self) -> None: readme_path = self.rootdir / "README.md" desc = self.get_description() @@ -141,9 +159,79 @@ def ensure_readme(self) -> None: ensure_file(readme_path, content) +class vLLMBase(LLMBase): + def __init__(self, **kwargs: t.Any) -> None: + super().__init__(**kwargs) + + self._model = None + self._tensor_parallel = kwargs.get("tensor_parallel", 1) + self._max_model_len = kwargs.get("max_model_len", 32768) + + @abstractmethod + def get_hf_repo_id(self) -> str: + raise NotImplementedError + + def get_base_dir(self) -> Path: + return self.get_pretrained_dir() / self.get_name() / "base" + + def download(self) -> None: + from huggingface_hub import snapshot_download + + local_dir = self.get_base_dir() + ensure_dir(local_dir) + snapshot_download( + repo_id=self.get_hf_repo_id(), 
local_dir=local_dir, max_workers=16 + ) + + def support_batch_inference(self) -> bool: + return True + + def _get_model(self) -> t.Any: + if self._model is None: + path = self.get_base_dir() + console.print(f":monkey: try to load model({path}) into memory...") + + import vllm + + self._model = vllm.LLM( + path, dtype=torch.float16, tensor_parallel_size=self._tensor_parallel, max_model_len=self._max_model_len + ) + + return self._model + + @torch.no_grad() + def do_predict( + self, + input_prompts: str | List[str], + benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, + max_new_tokens: int = 50, + **kwargs: t.Any, + ) -> Dict | str | t.List: + if isinstance(input_prompts, str): + input_prompts = [input_prompts] + + import vllm + + temperature = kwargs.get("temperature", 0.8) + top_p = kwargs.get("top_p", 0.95) + + sp = vllm.SamplingParams( + temperature=temperature, top_p=top_p, max_tokens=max_new_tokens + ) + outputs = self._get_model().generate(input_prompts, sp) + outputs.sort(key=lambda x: x.request_id) + + ret = [] + for output in outputs: + content = "".join([o.text for o in output.outputs]) + content = self._simplify_content(content) + ret.append(content) + return ret + + class HuggingfaceLLMBase(LLMBase): - def __init__(self, rootdir: Path | None = None) -> None: - super().__init__(rootdir) + def __init__(self, **kwargs: t.Any) -> None: + super().__init__(**kwargs) self._tokenizer = None self._model = None @@ -247,30 +335,17 @@ def get_generate_kwargs(self) -> t.Dict[str, t.Any]: repetition_penalty=float(os.environ.get("REPETITION_PENALTY", 1.3)), ) - def _simplify_content(self, content: str) -> str: - content = content.strip() - - for token in ("###", "[UNK]", "", "]", ")", ")", "】"): - if content.endswith(token): - content = content[: -len(token)].strip() - - for prefix in (":", ":", "(", "(", "[", "【"): - if content.startswith(prefix): - content = content[len(prefix) :].strip() - - return content - def do_predict( self, input_prompt: str, benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice, max_new_tokens: int = 50, - predict_choice_by_logits: bool = False, - ) -> t.Dict | str: + **kwargs: t.Any, + ) -> t.Dict | str | t.List: # TODO: add self prompt wrapper content = self._do_predict_with_generate(input_prompt, max_new_tokens) ret = {"content": self._simplify_content(content)} - if predict_choice_by_logits: + if kwargs.get("predict_choice_by_logits", False): if benchmark_type != BenchmarkType.MultipleChoice: raise ValueError( "predict_choice_by_logits only support BenchmarkType.MultipleChoice" @@ -363,7 +438,8 @@ def get_llm(name: str, **kwargs: t.Any) -> LLMBase: return _SUPPORTED_LLM[name](**kwargs) -def get_built_llm(rootdir: Path | None = None, **kwargs: t.Any) -> LLMBase: +def get_built_llm(**kwargs: t.Any) -> LLMBase: + rootdir = kwargs.get("rootdir") rootdir = rootdir if rootdir is not None else Path.cwd() config_path = rootdir / "pretrained" / "sw_config.json" if not config_path.exists(): diff --git a/example/llm-leaderboard/src/llm/mistral.py b/example/llm-leaderboard/src/llm/mistral.py new file mode 100644 index 0000000000..6650507221 --- /dev/null +++ b/example/llm-leaderboard/src/llm/mistral.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from .base import register, vLLMBase, LLMModelDesc + + +@register() +class Mistral7BInstruct(vLLMBase): + def get_hf_repo_id(self) -> str: + return "mistralai/Mistral-7B-Instruct-v0.2" + + @classmethod + def get_name(cls) -> str: + return "mistral-7b-instruct" + + @classmethod + def get_description(cls) 
-> LLMModelDesc:
+        return LLMModelDesc(
+            params="7b",
+            intro=(
+                "We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. "
+                "Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. "
+                "Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. "
+                "We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license."
+            ),
+            license="apache-2.0",
+            author="Mistral",
+            github="https://github.com/mistralai/mistral-src",
+            type="fine-tuned",
+        )
+
+    def download(self) -> None:
+        super().download()
+
+        local_dir = self.get_base_dir()
+        # We only need safetensors files.
+        useless_fnames = (
+            "pytorch_model-00001-of-00003.bin",
+            "pytorch_model-00002-of-00003.bin",
+            "pytorch_model-00003-of-00003.bin",
+            "pytorch_model.bin.index.json",
+        )
+        for fname in useless_fnames:
+            (local_dir / fname).unlink()
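
The change to `_ingest_choice` in `cmmlu.py` adds a last-resort fallback: when none of the anchored answer patterns match, the first bare A/B/C/D character in the model output is taken as the choice. The sketch below illustrates that two-stage flow; the anchored pattern list here is abbreviated for illustration and is not the benchmark's full set.

```python
import re


def ingest_choice(content: str) -> str:
    # stage 1: anchored patterns (abbreviated; the real list in cmmlu.py is longer)
    anchored = [
        (r"答案(?:选项)?(?:是|为)?\s*([ABCD])", 1),
        (r"^\s*([ABCD])\b", 1),
    ]
    for pattern, index in anchored:
        match = re.search(pattern, content)
        if match:
            return match.group(index)

    # stage 2 (the fallback added in this change): first bare A/B/C/D anywhere
    m = re.findall(r"[ABCD]", content)
    if m:
        return m[0]

    raise ValueError(f"cannot ingest ABCD choice from {content}")


assert ingest_choice("答案是B") == "B"
assert ingest_choice("I think the correct option is (C).") == "C"
```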
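
In `evaluation.py`, the `MAX_PROMPT_LENGTH`/`MAX_NEW_TOKENS` environment variables are replaced by a `ModelGenerateArguments` dataclass that the `@argument(...)` decorator injects into `predict_question`. The sketch below shows one way such a dataclass can be mapped onto command-line flags via its field metadata; `build_parser` is a hypothetical helper written for illustration, not Starwhale's implementation.

```python
import argparse
import dataclasses


@dataclasses.dataclass
class GenerateArguments:
    # trimmed copy of ModelGenerateArguments, for illustration only
    max_prompt_length: int = dataclasses.field(
        default=2048, metadata={"help": "max length of prompt"}
    )
    max_new_tokens: int = dataclasses.field(
        default=256, metadata={"help": "max length of generated text"}
    )
    batch: int = dataclasses.field(default=1, metadata={"help": "batch size"})


def build_parser(dc: type) -> argparse.ArgumentParser:
    # one CLI flag per dataclass field, typed and documented from the field itself
    parser = argparse.ArgumentParser()
    for f in dataclasses.fields(dc):
        parser.add_argument(
            f"--{f.name.replace('_', '-')}",
            type=f.type,
            default=f.default,
            help=f.metadata.get("help", ""),
        )
    return parser


if __name__ == "__main__":
    ns = build_parser(GenerateArguments).parse_args()
    args = GenerateArguments(**vars(ns))
    print(args)
```

Running this with, say, `--max-new-tokens 64` overrides only that field and leaves the other defaults intact, which is the kind of per-field override the decorator-based version provides.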
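
The rewritten `predict_question` collects one prompt per (sample, few-shot setting) pair, then walks that list in chunks of `argument.batch`, sending a whole chunk to the backend when `support_batch_inference()` is true and falling back to per-prompt calls otherwise. A generic, self-contained version of that dispatch loop looks roughly like this (`predict` stands in for `_g_llm.do_predict`):

```python
import typing as t


def run_batched(
    prompts: t.List[str],
    predict: t.Callable[[t.Union[str, t.List[str]]], t.Any],
    batch: int,
    supports_batch: bool,
) -> t.List[t.Any]:
    results: t.List[t.Any] = []
    for idx in range(0, len(prompts), batch):
        chunk = prompts[idx : idx + batch]
        if supports_batch:
            # one call per chunk; the backend returns one result per prompt
            results.extend(predict(chunk))
        else:
            # backend only accepts a single prompt at a time
            results.extend(predict(p) for p in chunk)
    return results
```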
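
The new `vLLMBase` in `base.py` wraps the plain vLLM offline-inference API: build one `vllm.LLM` engine, then pass a list of prompts plus a `SamplingParams` object to `generate`. Below is a minimal standalone sketch of that call pattern, with a placeholder model path and the same default sampling values used in the patch.

```python
import torch
import vllm

# engine construction mirrors vLLMBase._get_model(); the path is a placeholder
llm = vllm.LLM(
    "pretrained/mistral-7b-instruct/base",
    dtype=torch.float16,
    tensor_parallel_size=1,
    max_model_len=16384,
)

# sampling defaults mirror ModelGenerateArguments: temperature=0.8, top_p=0.95
sampling = vllm.SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)

outputs = llm.generate(["The capital of France is", "1 + 1 ="], sampling)
for output in outputs:
    # each RequestOutput can hold several completions; join their text
    print("".join(o.text for o in output.outputs))
```

Because the whole prompt list goes through the engine in a single `generate` call, `vLLMBase.support_batch_inference()` returns True and `predict_question` can hand it full batches.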