diff --git a/example/llm-leaderboard/leaderboard.md b/example/llm-leaderboard/leaderboard.md
index 809bc13705..f61ef9eef7 100644
--- a/example/llm-leaderboard/leaderboard.md
+++ b/example/llm-leaderboard/leaderboard.md
@@ -15,6 +15,8 @@ Current supported LLMs:
 - chatglm 6b
 - chatglm2 6b
 - aquila 7b/7b-chat
+- mistral-7b-instruct
+- mistral-8x7b-instruct
 
 ## Build Starwhale Runtime
 
diff --git a/example/llm-leaderboard/src/benchmark/cmmlu.py b/example/llm-leaderboard/src/benchmark/cmmlu.py
index e227198503..7c2ea92417 100644
--- a/example/llm-leaderboard/src/benchmark/cmmlu.py
+++ b/example/llm-leaderboard/src/benchmark/cmmlu.py
@@ -121,6 +121,10 @@ def _ingest_choice(self, content: str) -> str:
             if match:
                 return match.group(index)
 
+        m = re.findall(r"[ABCD]", content)
+        if len(m) >= 1:
+            return m[0]
+
         raise ValueError(f"cannot ingest ABCD choice from {content}")
 
     def calculate_score(
diff --git a/example/llm-leaderboard/src/evaluation.py b/example/llm-leaderboard/src/evaluation.py
index f04bd5de42..67834dc4e5 100644
--- a/example/llm-leaderboard/src/evaluation.py
+++ b/example/llm-leaderboard/src/evaluation.py
@@ -3,11 +3,12 @@
 import os
 import typing as t
 import threading
+import dataclasses
 from collections import defaultdict
 
 import numpy
 
-from starwhale import evaluation
+from starwhale import argument, evaluation
 from starwhale.utils.debug import console
 
 try:
@@ -30,18 +31,39 @@
 _g_llm = None
 _g_benchmarks: t.Dict[str, BenchmarkBase] = {}
 
-max_prompt_length = int(os.environ.get("MAX_PROMPT_LENGTH", 2048))
-max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", 256))
+
+@dataclasses.dataclass
+class ModelGenerateArguments:
+    max_prompt_length: int = dataclasses.field(
+        default=2048, metadata={"help": "max length of prompt"}
+    )
+    max_new_tokens: int = dataclasses.field(
+        default=256, metadata={"help": "max length of generated text"}
+    )
+    batch: int = dataclasses.field(
+        default=1, metadata={"help": "batch size for inference"}
+    )
+    temperature: float = dataclasses.field(
+        default=0.8, metadata={"help": "temperature"}
+    )
+    top_p: float = dataclasses.field(default=0.95, metadata={"help": "top p"})
+    tensor_parallel: int = dataclasses.field(
+        default=1, metadata={"help": "tensor parallel for vllm"}
+    )
 
 
 # TODO: support multi-gpus evaluation
 # TODO: enhance selected features
+@argument(ModelGenerateArguments)
 @evaluation.predict(
     resources={"nvidia.com/gpu": 1},
     replicas=1,
+    batch_size=32,
     auto_log=False,
 )
-def predict_question(data: dict, external: dict) -> None:
+def predict_question(
+    data: t.List[dict], external: dict, arguments: ModelGenerateArguments
+) -> None:
     # dev split is used for few shot samples
     if data.get("_hf_split", "") == "dev":
         return
@@ -50,7 +72,7 @@ def predict_question(data: dict, external: dict) -> None:
     global _g_llm
     with threading.Lock():
         if _g_llm is None:
-            _g_llm = get_built_llm()
+            _g_llm = get_built_llm(tensor_parallel=arguments.tensor_parallel)
 
     global _g_benchmarks
     dataset_uri = external["dataset_uri"]
@@ -59,34 +81,57 @@ def predict_question(data: dict, external: dict) -> None:
         # TODO: use dataset_info to get benchmark
         _g_benchmarks[dataset_name] = get_benchmark(dataset_name)
 
-    result = {}
     benchmark = _g_benchmarks[dataset_name]()
-    for shot, show_name in few_shot_choices.items():
-        prompt = benchmark.generate_prompt(
-            data,
-            few_shot=shot,
-            dataset_uri=dataset_uri,
-            max_length=max_prompt_length,
-            len_tokens=_g_llm.calculate_tokens_length,
-        )
-        predict_result = _g_llm.do_predict(
-            prompt,
-            benchmark_type=benchmark.get_type(),
-            max_new_tokens=max_new_tokens,
-            predict_choice_by_logits=True,
+
+    inputs = []
+    for _index, _data in zip(data, external["index"]):
+        for _shot, _show_name in few_shot_choices.items():
+            _prompt = benchmark.generate_prompt(
+                _data,
+                few_shot=_shot,
+                dataset_uri=dataset_uri,
+                max_length=arguments.max_prompt_length,
+                len_tokens=_g_llm.calculate_tokens_length,
+            )
+            inputs.append((_index, _show_name, _data, _prompt))
+
+    predict_results = []
+    for idx in range(0, len(inputs), arguments.batch):
+        batch_prompts = [x[-1] for x in inputs[idx : idx + arguments.batch]]
+
+        if _g_llm.support_batch_inference():
+            _results = _g_llm.do_batch_predict(
+                batch_prompts,
+                benchmark_type=benchmark.get_type(),
+                max_new_tokens=arguments.max_new_tokens,
+                predict_choice_by_logits=True,
+            )
+            predict_results.extend(_results)
+        else:
+            for _prompt in batch_prompts:
+                _result = _g_llm.do_predict(
+                    _prompt,
+                    benchmark_type=benchmark.get_type(),
+                    max_new_tokens=arguments.max_new_tokens,
+                    predict_choice_by_logits=True,
+                )
+                predict_results.append(_result)
+
+    for (_index, _show_name, _data, _prompt), predict_result in zip(
+        inputs, predict_results
+    ):
+        score = benchmark.calculate_score(predict_result, _data)
+        console.trace(f"prompt:\n {_prompt}")
+        console.trace(f"answer: {_data['answer']}, predict: {score}")
+
+        evaluation.log(
+            category="results",
+            id=f"{benchmark.get_name()}-{_index}",
+            metrics={
+                "input": benchmark.make_input_features_display(_data),
+                "output": {_show_name: score},
+            },
         )
-        result[show_name] = benchmark.calculate_score(predict_result, data)
-        console.trace(f"prompt:\n {prompt}")
-        console.trace(f"answer: {data['answer']}, predict: {result[show_name]}")
-
-    evaluation.log(
-        category="results",
-        id=f"{benchmark.get_name()}-{external['index']}",
-        metrics={
-            "input": benchmark.make_input_features_display(data),
-            "output": result,
-        },
-    )
 
 
 @evaluation.evaluate(needs=[predict_question], use_predict_auto_log=False)
diff --git a/example/llm-leaderboard/src/llm/base.py b/example/llm-leaderboard/src/llm/base.py
index c8c6b25bb3..b3fb14e8aa 100644
--- a/example/llm-leaderboard/src/llm/base.py
+++ b/example/llm-leaderboard/src/llm/base.py
@@ -5,6 +5,7 @@
 import typing as t
 import logging
 from abc import ABC, abstractmethod
+from typing import Dict, List
 from pathlib import Path
 from dataclasses import dataclass
 
@@ -37,7 +38,8 @@ class LLMModelDesc:
 
 
 class LLMBase(ABC):
-    def __init__(self, rootdir: Path | None = None) -> None:
+    def __init__(self, **kwargs: t.Any) -> None:
+        rootdir = kwargs.get("rootdir")
         self.rootdir = rootdir if rootdir is not None else Path.cwd()
 
     @classmethod
@@ -54,6 +56,9 @@ def get_description(cls) -> LLMModelDesc:
     def download(self) -> None:
         raise NotImplementedError
 
+    def support_batch_inference(self) -> bool:
+        return False
+
     def ensure_swignore(self) -> None:
         swi_path = self.rootdir / ".swignore"
 
@@ -80,16 +85,29 @@ def get_pretrained_dir(self) -> Path:
     @abstractmethod
     def do_predict(
         self,
-        input_prompt: str,
+        input_prompt: str | t.List[str],
         benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice,
         max_new_tokens: int = 50,
-        predict_choice_by_logits: bool = False,
-    ) -> t.Dict | str:
+        **kwargs: t.Any,
+    ) -> t.Dict | str | t.List:
         raise NotImplementedError
 
     def calculate_tokens_length(self, input_prompt: str) -> int:
         return len(input_prompt)
 
+    def _simplify_content(self, content: str) -> str:
+        content = content.strip()
+
+        for token in ("###", "[UNK]", "</s>", "]", ")", "）", "】"):
+            if content.endswith(token):
+                content = content[: -len(token)].strip()
+
+        for prefix in (":", "：", "(", "（", "[", "【"):
+            if content.startswith(prefix):
+                content = content[len(prefix) :].strip()
+
+        return content
+
     def ensure_readme(self) -> None:
         readme_path = self.rootdir / "README.md"
         desc = self.get_description()
@@ -141,9 +159,78 @@ def ensure_readme(self) -> None:
         ensure_file(readme_path, content)
 
 
+class vLLMBase(LLMBase):
+    def __init__(self, **kwargs: t.Any) -> None:
+        super().__init__(**kwargs)
+
+        self._model = None
+        self._tensor_parallel = kwargs.get("tensor_parallel", 1)
+
+    @abstractmethod
+    def get_hf_repo_id(self) -> str:
+        raise NotImplementedError
+
+    def get_base_dir(self) -> Path:
+        return self.get_pretrained_dir() / self.get_name() / "base"
+
+    def download(self) -> None:
+        from huggingface_hub import snapshot_download
+
+        local_dir = self.get_base_dir()
+        ensure_dir(local_dir)
+        snapshot_download(
+            repo_id=self.get_hf_repo_id(), local_dir=local_dir, max_workers=16
+        )
+
+    def support_batch_inference(self) -> bool:
+        return True
+
+    def _get_model(self) -> t.Any:
+        if self._model is None:
+            path = self.get_base_dir()
+            console.print(f":monkey: try to load model({path}) into memory...")
+
+            import vllm
+
+            self._model = vllm.LLM(
+                path, dtype=torch.float16, tensor_parallel_size=self._tensor_parallel
+            )
+
+        return self._model
+
+    @torch.no_grad()
+    def do_predict(
+        self,
+        input_prompts: str | List[str],
+        benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice,
+        max_new_tokens: int = 50,
+        **kwargs: t.Any,
+    ) -> Dict | str | t.List:
+        if isinstance(input_prompts, str):
+            input_prompts = [input_prompts]
+
+        import vllm
+
+        temperature = kwargs.get("temperature", 0.8)
+        top_p = kwargs.get("top_p", 0.95)
+
+        sp = vllm.SamplingParams(
+            temperature=temperature, top_p=top_p, max_tokens=max_new_tokens
+        )
+        outputs = self._get_model().generate(input_prompts, sp)
+        outputs.sort(key=lambda x: x.request_id)
+
+        ret = []
+        for output in outputs:
+            content = "".join([o.text for o in output.outputs])
+            content = self._simplify_content(content)
+            ret.append(content)
+        return ret
+
+
 class HuggingfaceLLMBase(LLMBase):
-    def __init__(self, rootdir: Path | None = None) -> None:
-        super().__init__(rootdir)
+    def __init__(self, **kwargs: t.Any) -> None:
+        super().__init__(**kwargs)
 
         self._tokenizer = None
         self._model = None
@@ -247,30 +334,17 @@ def get_generate_kwargs(self) -> t.Dict[str, t.Any]:
             repetition_penalty=float(os.environ.get("REPETITION_PENALTY", 1.3)),
         )
 
-    def _simplify_content(self, content: str) -> str:
-        content = content.strip()
-
-        for token in ("###", "[UNK]", "</s>", "]", ")", "）", "】"):
-            if content.endswith(token):
-                content = content[: -len(token)].strip()
-
-        for prefix in (":", "：", "(", "（", "[", "【"):
-            if content.startswith(prefix):
-                content = content[len(prefix) :].strip()
-
-        return content
-
     def do_predict(
         self,
         input_prompt: str,
         benchmark_type: BenchmarkType = BenchmarkType.MultipleChoice,
         max_new_tokens: int = 50,
-        predict_choice_by_logits: bool = False,
-    ) -> t.Dict | str:
+        **kwargs: t.Any,
+    ) -> t.Dict | str | t.List:
         # TODO: add self prompt wrapper
         content = self._do_predict_with_generate(input_prompt, max_new_tokens)
         ret = {"content": self._simplify_content(content)}
-        if predict_choice_by_logits:
+        if kwargs.get("predict_choice_by_logits", False):
             if benchmark_type != BenchmarkType.MultipleChoice:
                 raise ValueError(
                     "predict_choice_by_logits only support BenchmarkType.MultipleChoice"
@@ -363,7 +437,8 @@ def get_llm(name: str, **kwargs: t.Any) -> LLMBase:
     return _SUPPORTED_LLM[name](**kwargs)
 
 
-def get_built_llm(rootdir: Path | None = None, **kwargs: t.Any) -> LLMBase:
+def get_built_llm(**kwargs: t.Any) -> LLMBase:
+    rootdir = kwargs.get("rootdir")
     rootdir = rootdir if rootdir is not None else Path.cwd()
     config_path = rootdir / "pretrained" / "sw_config.json"
     if not config_path.exists():
diff --git a/example/llm-leaderboard/src/llm/mistral.py b/example/llm-leaderboard/src/llm/mistral.py
new file mode 100644
index 0000000000..6650507221
--- /dev/null
+++ b/example/llm-leaderboard/src/llm/mistral.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from .base import register, vLLMBase, LLMModelDesc
+
+
+@register()
+class Mistral7BInstruct(vLLMBase):
+    def get_hf_repo_id(self) -> str:
+        return "mistralai/Mistral-7B-Instruct-v0.2"
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "mistral-7b-instruct"
+
+    @classmethod
+    def get_description(cls) -> LLMModelDesc:
+        return LLMModelDesc(
+            params="7b",
+            intro=(
+                "We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. "
+                "Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. "
+                "Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. "
+                "We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license."
+            ),
+            license="apache-2.0",
+            author="Mistral",
+            github="https://github.com/mistralai/mistral-src",
+            type="fine-tuned",
+        )
+
+    def download(self) -> None:
+        super().download()
+
+        local_dir = self.get_base_dir()
+        # We only need safetensors files.
+        useless_fnames = (
+            "pytorch_model-00001-of-00003.bin",
+            "pytorch_model-00002-of-00003.bin",
+            "pytorch_model-00003-of-00003.bin",
+            "pytorch_model.bin.index.json",
+        )
+        for fname in useless_fnames:
+            (local_dir / fname).unlink()
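
Usage sketch (not part of the patch): the snippet below illustrates how the new vLLM-backed batch path added by `vLLMBase` is expected to be driven. It assumes the example's `src/` directory is on `PYTHONPATH`, that `register()` keys models by `get_name()` so `get_llm("mistral-7b-instruct", ...)` resolves to `Mistral7BInstruct`, and that the model snapshot has already been fetched with `download()`; the prompts and generation settings are placeholder values.

```python
# Illustrative sketch only: exercises vLLMBase.do_predict with a small batch of prompts.
# Assumes src/ is importable and the weights already live under pretrained/<name>/base.
from llm import mistral  # noqa: F401  (assumed: importing the module runs @register())
from llm.base import get_llm

llm = get_llm("mistral-7b-instruct", tensor_parallel=1)
# llm.download()  # first run only: snapshot_download() into get_base_dir()

prompts = [
    "Question: 1 + 1 = ?\nA. 1\nB. 2\nC. 3\nD. 4\nAnswer:",
    "Question: Which planet is called the Red Planet?\nA. Venus\nB. Mars\nC. Jupiter\nD. Saturn\nAnswer:",
]

assert llm.support_batch_inference()
# do_predict accepts a single prompt or a list; temperature/top_p fall back to 0.8/0.95,
# and every returned element has already been run through _simplify_content().
answers = llm.do_predict(prompts, max_new_tokens=32)
print(answers)  # one simplified completion string per prompt
```

At the handler level, `ModelGenerateArguments.batch` controls how many prompts go into each inference call, while `batch_size=32` on `@evaluation.predict` controls how many dataset rows are handed to each `predict_question` invocation (hence `data` becoming `t.List[dict]`).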