diff --git a/pyproject.toml b/pyproject.toml index dcf9df88..42011711 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ docx = "^0.2.4" pai_rag = "pai_rag.main:main" load_data = "pai_rag.tool.load_data_tool:run" load_model = "pai_rag.utils.download_models:load_models" -evaluation = "pai_rag.evaluation.eval_pipeline:run" +run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run" [[tool.poetry.source]] name = "pytorch_cpu" diff --git a/pyproject_gpu.toml b/pyproject_gpu.toml index b96a93be..3d4a4cb4 100644 --- a/pyproject_gpu.toml +++ b/pyproject_gpu.toml @@ -100,7 +100,7 @@ peft = "^0.12.0" pai_rag = "pai_rag.main:main" load_data = "pai_rag.tool.load_data_tool:run" load_model = "pai_rag.utils.download_models:load_models" -evaluation = "pai_rag.evaluation.eval_pipeline:run" +run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run" [tool.pytest.ini_options] asyncio_mode = "auto" diff --git a/src/pai_rag/app/web/ui_constants.py b/src/pai_rag/app/web/ui_constants.py index 1d67db52..0bff82d1 100644 --- a/src/pai_rag/app/web/ui_constants.py +++ b/src/pai_rag/app/web/ui_constants.py @@ -112,7 +112,7 @@ MLLM_MODEL_KEY_DICT = { "dashscope": [ "qwen-vl-max", - "qwen-vl-turbo", + "qwen-vl-plus", ] } diff --git a/src/pai_rag/config/evaluation/config.yaml b/src/pai_rag/config/evaluation/config.yaml new file mode 100644 index 00000000..76823aa8 --- /dev/null +++ b/src/pai_rag/config/evaluation/config.yaml @@ -0,0 +1,8 @@ +experiment: + # [custom knowledge dataset] + - name: "exp1" + data_path: "example_data/eval_docs" + setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" + - name: "exp2" + data_path: "example_data/eval_docs_1" + setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" diff --git a/src/pai_rag/evaluation/settings_eval.toml b/src/pai_rag/config/evaluation/settings_eval.toml similarity index 83% rename from src/pai_rag/evaluation/settings_eval.toml rename to src/pai_rag/config/evaluation/settings_eval.toml index e5b2114f..341c3949 100644 --- a/src/pai_rag/evaluation/settings_eval.toml +++ b/src/pai_rag/config/evaluation/settings_eval.toml @@ -5,16 +5,8 @@ name = "pai_rag" version = "0.1.1" [rag.agent] -type = "react" - -[rag.agent.custom_config] -agent_file_path = "" - -[rag.agent.intent_detection] -type = "" - -[rag.agent.tool] -type = "" +custom_agent_config_file = "" +agent_tool_type = "" [rag.chat_store] type = "Local" # [Local, Aliyun-Redis] @@ -23,12 +15,9 @@ password = "Aliyun-Redis user:pwd" persist_path = "localdata/eval_exp_data/storage" [rag.data_analysis] -analysis_type = "nl2pandas" +type = "pandas" nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: " -[rag.data_loader] -type = "local" - [rag.data_reader] type = "SimpleDirectoryReader" @@ -42,9 +31,6 @@ type = "SimpleDirectoryReader" source = "DashScope" embed_batch_size = 10 -[rag.embedding.multi_modal] -source = "cnclip" - [rag.index] persist_path = "localdata/eval_exp_data/storage" enable_multimodal = true @@ -60,12 +46,11 @@ vector_store.type = "FAISS" source = "DashScope" model = "qwen-turbo" -[rag.llm.function_calling_llm] -source = "" +[rag.multimodal_embedding] +source = "cnclip" -[rag.llm.multi_modal] -enable = true -source = "DashScope" +[rag.multimodal_llm] +source = "dashscope" 
model = "qwen-vl-plus" [rag.node_enhancement] @@ -81,20 +66,16 @@ enable_multimodal = true [rag.oss_store] bucket = "" -endpoint = "" -prefix = "" +endpoint = "oss-cn-hangzhou.aliyuncs.com" [rag.postprocessor] -reranker_type = "simple-weighted-reranker" # [simple-weighted-reranker, model-based-reranker] +reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker] reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large] keyword_weight = 0.3 vector_weight = 0.7 similarity_threshold = 0.5 top_n = 2 -[rag.query_engine] -type = "RetrieverQueryEngine" - [rag.query_transform] type = "" @@ -111,6 +92,6 @@ type = "SimpleSummarize" text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n" [rag.trace] -type = "pai-llm-trace" +type = "pai_trace" endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090" token = "" diff --git a/src/pai_rag/evaluation/dataset/rag_eval_dataset.py b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py new file mode 100644 index 00000000..c9555e37 --- /dev/null +++ b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py @@ -0,0 +1,111 @@ +from typing import List, Optional, Type, Dict +from llama_index.core.bridge.pydantic import Field +import json +from llama_index.core.bridge.pydantic import BaseModel +from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample + + +class EvaluationSample(RagQcaSample): + """Response Evaluation RAG example class.""" + + hitrate: Optional[float] = Field( + default_factory=None, + description="The hitrate value for retrieval evaluation.", + ) + mrr: Optional[float] = Field( + default_factory=None, + description="The mrr value for retrieval evaluation.", + ) + + faithfulness_score: Optional[float] = Field( + default_factory=None, + description="The faithfulness score for response evaluation.", + ) + + faithfulness_reason: Optional[str] = Field( + default_factory=None, + description="The faithfulness reason for response evaluation.", + ) + + correctness_score: Optional[float] = Field( + default_factory=None, + description="The correctness score for response evaluation.", + ) + + correctness_reason: Optional[str] = Field( + default_factory=None, + description="The correctness reason for response evaluation.", + ) + + @property + def class_name(self) -> str: + """Data example class name.""" + return "EvaluationSample" + + +class PaiRagEvalDataset(BaseModel): + _example_type: Type[EvaluationSample] = EvaluationSample # type: ignore[misc] + examples: List[EvaluationSample] = Field( + default=[], description="Data examples of this dataset." + ) + results: Dict[str, Dict[str, float]] = Field( + default_factory=dict, description="Evaluation result of this dataset." + ) + status: Dict[str, bool] = Field( + default_factory=dict, description="Status of this dataset." 
+ ) + + @property + def class_name(self) -> str: + """Class name.""" + return "PaiRagEvalDataset" + + def cal_mean_metric_score(self) -> float: + """Calculate the mean metric score.""" + self.results["retrieval"] = {} + self.results["response"] = {} + if self.status["retrieval"]: + self.results["retrieval"] = { + "mean_hitrate": sum(float(entry.hitrate) for entry in self.examples) + / len(self.examples), + "mean_mrr": sum(float(entry.mrr) for entry in self.examples) + / len(self.examples), + } + if self.status["response"]: + self.results["response"] = { + "mean_faithfulness_score": sum( + float(entry.faithfulness_score) for entry in self.examples + ) + / len(self.examples), + "mean_correctness_score": sum( + float(entry.correctness_score) for entry in self.examples + ) + / len(self.examples), + } + + def save_json(self, path: str) -> None: + """Save json.""" + self.cal_mean_metric_score() + + with open(path, "w", encoding="utf-8") as f: + examples = [self._example_type.dict(el) for el in self.examples] + data = { + "examples": examples, + "results": self.results, + "status": self.status, + } + + json.dump(data, f, indent=4, ensure_ascii=False) + print(f"Saved dataset to {path}.") + + @classmethod + def from_json(cls, path: str) -> "PaiRagEvalDataset": + """Load json.""" + with open(path) as f: + data = json.load(f) + + examples = [cls._example_type.parse_obj(el) for el in data["examples"]] + results = data["results"] + status = data["status"] + + return cls(examples=examples, results=results, status=status) diff --git a/src/pai_rag/evaluation/generator/rag_qca_sample.py b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py similarity index 54% rename from src/pai_rag/evaluation/generator/rag_qca_sample.py rename to src/pai_rag/evaluation/dataset/rag_qca_dataset.py index f0f37e9f..a4d0bd3d 100644 --- a/src/pai_rag/evaluation/generator/rag_qca_sample.py +++ b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py @@ -1,22 +1,15 @@ -from typing import List, Optional +from typing import List, Optional, Type from llama_index.core.bridge.pydantic import Field from llama_index.core.llama_dataset.base import BaseLlamaDataExample from llama_index.core.llama_dataset import CreatedBy +import json +from llama_index.core.bridge.pydantic import BaseModel -class LabelledRagQcaSample(BaseLlamaDataExample): - """RAG example class. Analogous to traditional ML datasets, this dataset contains +class RagQcaSample(BaseLlamaDataExample): + """Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response) to evaluate the prediction. - - Args: - query (str): The user query - query_by (CreatedBy): Query generated by human or ai (model-name) - reference_contexts (Optional[List[str]]): The contexts used for response - reference_node_id (Optional[List[str]]): The node id corresponding to the contexts - reference_answer ([str]): Reference answer to the query. An answer - that would receive full marks upon evaluation. - reference_answer_by: The reference answer generated by human or ai (model-name). """ query: str = Field( @@ -40,18 +33,6 @@ class LabelledRagQcaSample(BaseLlamaDataExample): default=None, description="What model generated the reference answer." ) - @property - def class_name(self) -> str: - """Data example class name.""" - return "LabelledRagQcaSample" - - -class PredictedRagQcaSample(LabelledRagQcaSample): - """Predicted RAG example class. 
Analogous to traditional ML datasets, this dataset contains - the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response) - to evaluate the prediction. - """ - predicted_contexts: Optional[List[str]] = Field( default_factory=None, description="The contexts used to generate the predicted answer.", @@ -71,4 +52,50 @@ class PredictedRagQcaSample(LabelledRagQcaSample): @property def class_name(self) -> str: """Data example class name.""" - return "PredictedRagQcaSample" + return "RagQcaSample" + + +class PaiRagQcaDataset(BaseModel): + _example_type: Type[RagQcaSample] = RagQcaSample # type: ignore[misc] + examples: List[RagQcaSample] = Field( + default=[], description="Data examples of this dataset." + ) + labelled: bool = Field( + default=False, description="Whether the dataset is labelled or not." + ) + predicted: bool = Field( + default=False, description="Whether the dataset is predicted or not." + ) + + @property + def class_name(self) -> str: + """Class name.""" + return "PaiRagQcaDataset" + + def save_json(self, path: str) -> None: + """Save json.""" + with open(path, "w", encoding="utf-8") as f: + examples = [self._example_type.dict(el) for el in self.examples] + data = { + "examples": examples, + "labelled": self.labelled, + "predicted": self.predicted, + } + + json.dump(data, f, indent=4, ensure_ascii=False) + print(f"Saved PaiRagQcaDataset to {path}.") + + @classmethod + def from_json(cls, path: str) -> "PaiRagQcaDataset": + """Load json.""" + with open(path) as f: + data = json.load(f) + + if len(data["examples"]) > 0: + examples = [cls._example_type.parse_obj(el) for el in data["examples"]] + labelled = data["labelled"] + predicted = data["predicted"] + + return cls(examples=examples, labelled=labelled, predicted=predicted) + else: + return None diff --git a/src/pai_rag/evaluation/eval_pipeline.py b/src/pai_rag/evaluation/eval_pipeline.py deleted file mode 100644 index 19a76f44..00000000 --- a/src/pai_rag/evaluation/eval_pipeline.py +++ /dev/null @@ -1,131 +0,0 @@ -import click -import os -import asyncio -from pathlib import Path -from pai_rag.core.rag_config_manager import RagConfigManager -from pai_rag.core.rag_data_loader import RagDataLoader -from pai_rag.core.rag_module import ( - resolve, - resolve_data_loader, - resolve_llm, - resolve_vector_index, -) -from pai_rag.evaluation.generator.labelled_qca_generator import LabelledRagQcaGenerator -from pai_rag.evaluation.generator.predicted_qca_generator import ( - PredictedRagQcaGenerator, -) -from pai_rag.integrations.llms.pai.pai_multi_modal_llm import ( - PaiMultiModalLlm, -) -import logging - - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -_BASE_DIR = Path(__file__).parent.parent -DEFAULT_APPLICATION_CONFIG_FILE = os.path.join( - _BASE_DIR, "evaluation/settings_eval.toml" -) - - -def _create_data_loader(config_file, enable_raptor: bool = False) -> RagDataLoader: - config = RagConfigManager.from_file(config_file).get_value() - data_loader = resolve_data_loader(config) - vector_index = resolve_vector_index(config) - - return data_loader, vector_index - - -def _create_labelled_qca_generator(config_file, vector_index) -> None: - config = RagConfigManager.from_file(config_file).get_value() - llm = resolve_llm(config) - qca_generator = LabelledRagQcaGenerator( - llm=llm, vector_index=vector_index, persist_path=config.rag.index.persist_path - ) - return qca_generator - - -def _create_predicted_qca_generator(config_file, vector_index) -> None: - config = 
RagConfigManager.from_file(config_file).get_value() - # llm = resolve_llm(config) - multimodal_llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) - predicted_qca_generator = PredictedRagQcaGenerator( - llm=multimodal_llm, - vector_index=vector_index, - persist_path=config.rag.index.persist_path, - ) - return predicted_qca_generator - - -@click.command() -@click.option( - "-c", - "--config", - show_default=True, - help=f"Configuration file. Default: {DEFAULT_APPLICATION_CONFIG_FILE}", - default=DEFAULT_APPLICATION_CONFIG_FILE, -) -@click.option( - "-o", - "--oss_path", - type=str, - required=False, - default=None, - show_default=True, - help="oss path (file or directory) to ingest. Example: oss://rag-demo/testdata", -) -@click.option( - "-d", - "--data_path", - type=str, - required=False, - default=None, - show_default=True, - help="data path (file or directory) to ingest.", -) -@click.option( - "-p", - "--pattern", - required=False, - type=str, - default=None, - help="data pattern to ingest.", -) -@click.option( - "-r", - "--enable_raptor", - required=False, - is_flag=True, - show_default=True, - default=False, - help="use raptor for node enhancement.", -) -def run( - config=None, - oss_path=None, - data_path=None, - pattern=None, - enable_raptor=False, -): - assert (oss_path is not None) or ( - data_path is not None - ), "Must provide either local path or oss path." - assert (oss_path is None) or ( - data_path is None - ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." - - data_loader, vector_index = _create_data_loader(config, enable_raptor) - data_loader.load_data( - file_path_or_directory=data_path, - filter_pattern=pattern, - oss_path=oss_path, - from_oss=oss_path is not None, - enable_raptor=enable_raptor, - ) - qca_generator = _create_labelled_qca_generator(config, vector_index) - asyncio.run(qca_generator.agenerate_labelled_qca_dataset()) - - predicted_qca_generator = _create_predicted_qca_generator(config, vector_index) - asyncio.run(predicted_qca_generator.agenerate_predicted_qca_dataset()) diff --git a/src/pai_rag/evaluation/evaluator/base_evaluator.py b/src/pai_rag/evaluation/evaluator/base_evaluator.py new file mode 100644 index 00000000..b3ccb56b --- /dev/null +++ b/src/pai_rag/evaluation/evaluator/base_evaluator.py @@ -0,0 +1,132 @@ +import os +from pai_rag.evaluation.metrics.retrieval.hitrate import HitRate +from pai_rag.evaluation.metrics.retrieval.mrr import MRR +from pai_rag.evaluation.metrics.response.faithfulness import Faithfulness +from pai_rag.evaluation.metrics.response.correctness import Correctness + +from llama_index.core.async_utils import run_jobs +from pai_rag.evaluation.dataset.rag_eval_dataset import ( + EvaluationSample, + PaiRagEvalDataset, +) +from pai_rag.evaluation.dataset.rag_qca_dataset import PaiRagQcaDataset + + +class BaseEvaluator: + def __init__(self, llm, persist_path: str = None): + self._llm = llm + self.persist_path = persist_path + self.hitrate = HitRate() + self.mrr = MRR() + self.retrieval_evaluators = [self.hitrate, self.mrr] + self.faithfulness_evaluator = Faithfulness( + llm=self._llm, + ) + self.correctness_evaluator = Correctness( + llm=self._llm, + ) + self.response_evaluators = [ + self.faithfulness_evaluator, + self.correctness_evaluator, + ] + self.evaluation_dataset_path = os.path.join( + self.persist_path, "evaluation_dataset.json" + ) + self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json") + self._show_progress = True + self._workers = 2 + + def 
load_qca_dataset(self) -> None: + if os.path.exists(self.qca_dataset_path): + rag_qca_dataset = PaiRagQcaDataset.from_json(self.qca_dataset_path) + if rag_qca_dataset.labelled and rag_qca_dataset.predicted: + print( + f"Labelled QCA dataset already exists at {self.qca_dataset_path}." + ) + return rag_qca_dataset + else: + raise ValueError( + "The QCA dataset exists but is not labelled and predicted. " + "Please either label it or provide a new one." + ) + else: + print("No existing QCA dataset found. You can proceed to create a new one.") + return None + + def load_evaluation_dataset(self) -> None: + if os.path.exists(self.evaluation_dataset_path): + print( + f"A evaluation dataset already exists at {self.evaluation_dataset_path}." + ) + evaluation_dataset = PaiRagEvalDataset.from_json( + self.evaluation_dataset_path + ) + return evaluation_dataset + else: + print( + "No existing evaluation dataset found. You can proceed to create a new one." + ) + return None + + async def compute_retrieval_metrics(self, qca_sample): + retrieval_eval_example = EvaluationSample(**vars(qca_sample)) + reference_node_id = retrieval_eval_example.reference_node_id + predicted_node_id = retrieval_eval_example.predicted_node_id + for metric in self.retrieval_evaluators: + metric_score = metric.compute(reference_node_id, predicted_node_id) + setattr(retrieval_eval_example, metric.metric_name, metric_score) + + return retrieval_eval_example + + async def compute_response_metrics(self, qca_sample): + response_eval_example = EvaluationSample(**vars(qca_sample)) + query = response_eval_example.query + response = response_eval_example.reference_answer + contexts = response_eval_example.predicted_contexts + for metric in self.response_evaluators: + metric_result = await metric.aevaluate(query, response, contexts) + setattr( + response_eval_example, f"{metric.metric_name}_score", metric_result[0] + ) + setattr( + response_eval_example, f"{metric.metric_name}_reason", metric_result[1] + ) + + return response_eval_example + + async def aevaluation(self, stage): + """Run evaluation with qca dataset.""" + _status = {"retrieval": False, "response": False} + evaluation_dataset = self.load_evaluation_dataset() + qca_dataset = self.load_qca_dataset() + if evaluation_dataset: + print( + f"A evaluation dataset already exists with status: [[{evaluation_dataset.status}]]" + ) + _status = evaluation_dataset.status + if _status[stage]: + return evaluation_dataset.results[stage] + else: + qca_dataset = evaluation_dataset + if qca_dataset: + print(f"Starting to generate evaluation dataset for stage: [[{stage}]]...") + eval_tasks = [] + for qca in qca_dataset.examples: + if stage == "retrieval": + eval_tasks.append(self.compute_retrieval_metrics(qca)) + elif stage == "response": + eval_tasks.append(self.compute_response_metrics(qca)) + else: + raise ValueError(f"Invalid stage: {stage}") + eval_examples = await run_jobs( + eval_tasks, self._show_progress, self._workers + ) + _status[stage] = True + eval_dataset = PaiRagEvalDataset(examples=eval_examples, status=_status) + eval_dataset.save_json(self.evaluation_dataset_path) + return eval_dataset.results[stage] + else: + raise ValueError( + "No QCA dataset found. Please provide a QCA dataset or " + "generate one first." 
+ ) diff --git a/src/pai_rag/evaluation/generator/labelled_qca_generator.py b/src/pai_rag/evaluation/generator/labelled_qca_generator.py deleted file mode 100644 index 9fe9b024..00000000 --- a/src/pai_rag/evaluation/generator/labelled_qca_generator.py +++ /dev/null @@ -1,99 +0,0 @@ -from typing import List -from llama_index.core.indices import VectorStoreIndex -from pai_rag.utils.prompt_template import ( - DEFAULT_QUESTION_GENERATION_PROMPT, - DEFAULT_TEXT_QA_PROMPT_TMPL, - DEFAULT_QUESTION_GENERATION_QUERY, -) -from llama_index.core.base.response.schema import RESPONSE_TYPE -from llama_index.core.prompts.base import PromptTemplate -import re -from llama_index.core.async_utils import run_jobs -from pai_rag.evaluation.generator.rag_qca_sample import LabelledRagQcaSample -from llama_index.core.llama_dataset import ( - CreatedBy, - CreatedByType, - LabelledRagDataset, -) -import json -import os -import logging - -logger = logging.getLogger(__name__) - - -class LabelledRagQcaGenerator: - def __init__( - self, llm, vector_index: VectorStoreIndex = None, persist_path: str = None - ): - self._llm = llm - self._vector_index = vector_index._vector_index - self.question_gen_query = DEFAULT_QUESTION_GENERATION_QUERY.format( - num_questions_per_chunk=3 - ) - self.text_question_template = PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT) - self.text_question_answer_template = PromptTemplate(DEFAULT_TEXT_QA_PROMPT_TMPL) - model_name = self._llm.metadata.model_name - self.created_by = CreatedBy(type=CreatedByType.AI, model_name=model_name) - self.persist_path = persist_path - self._show_progress = True - self._workers = 2 - - def save_labelled_qca_dataset_json(self, qas: LabelledRagDataset) -> None: - """Save json.""" - labelled_qca_dataset_path = os.path.join( - self.persist_path, "labelled_qca_dataset.json" - ) - with open(labelled_qca_dataset_path, "w", encoding="utf-8") as f: - json.dump(qas.dict(), f, indent=4, ensure_ascii=False) - logger.info(f"Saved labelled qca dataset to {labelled_qca_dataset_path}") - - async def agenerate_labelled_qca_dataset( - self, - ) -> LabelledRagDataset: - examples: List[LabelledRagQcaSample] = [] - docs = self._vector_index._docstore.docs - nodes = list(docs.values()) - query_tasks = [] - for node in nodes: - prompt_str = self.text_question_template.format( - context_str=node.text, num_questions_per_chunk=3 - ) - task = self._llm.acomplete(prompt=prompt_str) - query_tasks.append(task) - responses = await run_jobs(query_tasks, self._show_progress, self._workers) - for node, response in zip(nodes, responses): - result = str(response).strip().split("\n") - cleaned_questions = [ - re.sub(r"^\d+[\).\s]", "", question).strip() for question in result - ] - cleaned_questions = [ - question for question in cleaned_questions if len(question) > 0 - ] - qr_tasks = [] - for query in cleaned_questions: - # build summary index off of node (i.e. 
context) - prompt_str = self.text_question_answer_template.format( - context_str=node.text, query_str=query - ) - qr_task = self._llm.acomplete(prompt=prompt_str) - qr_tasks.append(qr_task) - answer_responses: List[RESPONSE_TYPE] = await run_jobs( - qr_tasks, self._show_progress, self._workers - ) - for ( - question, - answer_response, - ) in zip(cleaned_questions, answer_responses): - example = LabelledRagQcaSample( - query=question, - reference_answer=str(answer_response), - reference_contexts=[node.text], - reference_node_id=[node.node_id], - reference_answer_by=self.created_by, - query_by=self.created_by, - ) - examples.append(example) - labelled_qca_dataset = LabelledRagDataset(examples=examples) - self.save_labelled_qca_dataset_json(labelled_qca_dataset) - return labelled_qca_dataset diff --git a/src/pai_rag/evaluation/generator/predicted_qca_generator.py b/src/pai_rag/evaluation/generator/predicted_qca_generator.py deleted file mode 100644 index 2d1e1838..00000000 --- a/src/pai_rag/evaluation/generator/predicted_qca_generator.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import List -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.async_utils import run_jobs -from pai_rag.evaluation.generator.rag_qca_sample import PredictedRagQcaSample -from llama_index.core.llama_dataset import ( - CreatedBy, - CreatedByType, - LabelledRagDataset, -) - -import json -import os -import logging - -logger = logging.getLogger(__name__) - - -class PredictedRagQcaGenerator: - def __init__( - self, llm, vector_index: VectorStoreIndex = None, persist_path: str = None - ): - self._vector_index = vector_index._vector_index - self._llm = llm - model_name = self._llm.metadata.model_name - self.created_by = CreatedBy(type=CreatedByType.AI, model_name=model_name) - self._query_engine = vector_index._vector_index.as_query_engine(llm=self._llm) - self.persist_path = persist_path - self._show_progress = True - self._workers = 2 - - def from_labelled_qca_dataset(self) -> LabelledRagDataset: - """Load json.""" - labelled_qca_dataset_path = os.path.join( - self.persist_path, "labelled_qca_dataset.json" - ) - with open(labelled_qca_dataset_path, encoding="utf-8") as f: - data = json.load(f) - return data - - def save_predicted_qca_dataset_json(self, qas: LabelledRagDataset) -> None: - """Save json.""" - predicted_qca_dataset_path = os.path.join( - self.persist_path, "predicted_qca_dataset.json" - ) - with open(predicted_qca_dataset_path, "w", encoding="utf-8") as f: - json.dump(qas.dict(), f, indent=4, ensure_ascii=False) - logger.info(f"Saved labelled qca dataset to {predicted_qca_dataset_path}") - - async def agenerate_predicted_qca_dataset( - self, - ): - labelled_qca_dataset = self.from_labelled_qca_dataset() - query_tasks = [] - labelled_qca_set = [] - for qca in labelled_qca_dataset["examples"]: - task = self._query_engine.aquery(qca["query"]) - labelled_qca_set.append(qca) - query_tasks.append(task) - responses = await run_jobs(query_tasks, self._show_progress, self._workers) - examples: List[PredictedRagQcaSample] = [] - for qca, response in zip(labelled_qca_set, responses): - example = PredictedRagQcaSample( - query=qca["query"], - reference_answer=qca["reference_answer"], - reference_contexts=qca["reference_contexts"], - reference_node_id=qca["reference_node_id"], - reference_answer_by=qca["reference_answer_by"], - query_by=qca["query_by"], - predicted_contexts=[node.node.text for node in response.source_nodes], - predicted_node_id=[node.node.node_id for node in 
response.source_nodes], - predicted_answer=response.response, - predicted_answer_by=self.created_by, - ) - examples.append(example) - predicted_qca_dataset = LabelledRagDataset(examples=examples) - self.save_predicted_qca_dataset_json(predicted_qca_dataset) - return predicted_qca_dataset diff --git a/src/pai_rag/evaluation/generator/rag_qca_generator.py b/src/pai_rag/evaluation/generator/rag_qca_generator.py new file mode 100644 index 00000000..6002462c --- /dev/null +++ b/src/pai_rag/evaluation/generator/rag_qca_generator.py @@ -0,0 +1,155 @@ +from typing import List +from llama_index.core.indices import VectorStoreIndex +from pai_rag.utils.prompt_template import ( + DEFAULT_QUESTION_GENERATION_PROMPT, + DEFAULT_TEXT_QA_PROMPT_TMPL, + DEFAULT_QUESTION_GENERATION_QUERY, +) +from llama_index.core.base.response.schema import RESPONSE_TYPE +from llama_index.core.prompts.base import PromptTemplate +import re +from llama_index.core.async_utils import run_jobs +from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample, PaiRagQcaDataset +from llama_index.core.llama_dataset import ( + CreatedBy, + CreatedByType, +) +from pai_rag.integrations.synthesizer.pai_synthesizer import PaiQueryBundle + +import os +import logging +from pai_rag.integrations.query_engine.pai_retriever_query_engine import ( + PaiRetrieverQueryEngine, +) + +logger = logging.getLogger(__name__) + + +class RagQcaGenerator: + def __init__( + self, + llm, + vector_index: VectorStoreIndex = None, + query_engine: PaiRetrieverQueryEngine = None, + persist_path: str = None, + ): + self._llm = llm + self._vector_index = vector_index._vector_index + self._query_engine = query_engine + self.question_gen_query = DEFAULT_QUESTION_GENERATION_QUERY.format( + num_questions_per_chunk=3 + ) + self.text_question_template = PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT) + self.text_question_answer_template = PromptTemplate(DEFAULT_TEXT_QA_PROMPT_TMPL) + self.created_by = CreatedBy( + type=CreatedByType.AI, model_name=self._llm.metadata.model_name + ) + self.persist_path = persist_path + self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json") + self._show_progress = True + self._workers = 2 + + def load_qca_dataset(self) -> None: + if os.path.exists(self.qca_dataset_path): + rag_qca_dataset = PaiRagQcaDataset.from_json(self.qca_dataset_path) + print( + f"A RAG QCA dataset already exists at {self.qca_dataset_path} with status: [labelled: {rag_qca_dataset.labelled}, predicted: {rag_qca_dataset.predicted}]." + ) + return rag_qca_dataset + else: + print("No existing QCA dataset found. You can proceed to create a new one.") + return None + + async def agenerate_qca_dataset(self, stage): + rag_qca_dataset = self.load_qca_dataset() + if rag_qca_dataset and rag_qca_dataset.labelled: + if stage == "labelled": + print("Labelled QCA dataset already exists. Skipping labelled stage.") + return rag_qca_dataset.examples + elif stage == "predicted": + if rag_qca_dataset.predicted: + print( + "Predicted QCA dataset already exists. Skipping predicted stage." 
+ ) + return rag_qca_dataset.examples + else: + return await self.agenerate_predicted_qca_dataset(rag_qca_dataset) + else: + raise ValueError(f"Invalid stage: {stage}") + else: + return await self.agenerate_labelled_qca_dataset() + + async def agenerate_labelled_qca_sample(self, node): + prompt_str = self.text_question_template.format( + context_str=node.text, num_questions_per_chunk=1 + ) + response = await self._llm.acomplete(prompt=prompt_str, image_documents=None) + result = str(response).strip().split("\n") + cleaned_questions = [ + re.sub(r"^\d+[\).\s]", "", question).strip() for question in result + ] + cleaned_questions = [ + question for question in cleaned_questions if len(question) > 0 + ] + qr_tasks = [] + for query in cleaned_questions: + prompt_str = self.text_question_answer_template.format( + context_str=node.text, query_str=query + ) + qr_task = self._llm.acomplete(prompt=prompt_str, image_documents=None) + qr_tasks.append(qr_task) + answer_responses: List[RESPONSE_TYPE] = await run_jobs( + qr_tasks, self._show_progress, self._workers + ) + for ( + question, + answer_response, + ) in zip(cleaned_questions, answer_responses): + sample = RagQcaSample( + query=question, + reference_answer=str(answer_response), + reference_contexts=[node.text], + reference_node_id=[node.node_id], + reference_answer_by=self.created_by, + query_by=self.created_by, + ) + return sample + + async def agenerate_labelled_qca_dataset( + self, + ): + print("Starting to generate QCA dataset for [[labelled]].") + docs = self._vector_index._docstore.docs + nodes = list(docs.values()) + tasks = [] + for node in nodes: + tasks.append(self.agenerate_labelled_qca_sample(node)) + examples = await run_jobs(tasks, self._show_progress, self._workers) + labelled_qca_dataset = PaiRagQcaDataset(examples=examples, labelled=True) + labelled_qca_dataset.save_json(self.qca_dataset_path) + return labelled_qca_dataset.examples + + async def agenerate_predicted_qca_sample(self, qca_sample): + query_bundle = PaiQueryBundle(query_str=qca_sample.query) + response = await self._query_engine.aquery(query_bundle) + qca_sample.predicted_answer = response.response + qca_sample.predicted_contexts = [ + node.node.text for node in response.source_nodes + ] + qca_sample.predicted_node_id = [ + node.node.node_id for node in response.source_nodes + ] + qca_sample.predicted_answer_by = self.created_by + return qca_sample + + async def agenerate_predicted_qca_dataset(self, rag_qca_dataset): + print("Starting to generate QCA dataset for [[predicted]].") + tasks = [] + for qca_sample in rag_qca_dataset.examples: + tasks.append(self.agenerate_predicted_qca_sample(qca_sample)) + predicted_examples = await run_jobs(tasks, self._show_progress, self._workers) + predicted_qca_dataset = PaiRagQcaDataset( + examples=predicted_examples, labelled=True, predicted=True + ) + predicted_qca_dataset.save_json(self.qca_dataset_path) + return predicted_qca_dataset diff --git a/src/pai_rag/evaluation/metrics/response/base.py b/src/pai_rag/evaluation/metrics/response/base.py new file mode 100644 index 00000000..d0bad74f --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/base.py @@ -0,0 +1,73 @@ +"""Llm metric for response evaluation.""" +from abc import abstractmethod +from typing import Any, Optional, Sequence, Union + +from llama_index.core.evaluation.base import EvaluationResult +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import BasePromptTemplate, PromptTemplate +from llama_index.core.prompts.mixin import 
PromptDictType +from llama_index.core.prompts.mixin import PromptMixin, PromptMixinType + +DEFAULT_EVAL_TEMPLATE = PromptTemplate( + "Information: {query_str}\n" "Context: {context_str}\n" "Answer: " "Reason: " +) + + +class LlmMetric(PromptMixin): + """ + Llm Metric. + """ + + metric_name: str = "base" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + ) -> None: + """Init params.""" + self._llm = llm + self._raise_error = raise_error + + self._eval_template: BasePromptTemplate + if isinstance(eval_template, str): + self._eval_template = PromptTemplate(eval_template) + else: + self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + def _get_prompts(self) -> PromptDictType: + """Get prompts.""" + return { + "eval_template": self._eval_template, + } + + def _update_prompts(self, prompts: PromptDictType) -> None: + """Update prompts.""" + if "eval_template" in prompts: + self._eval_template = prompts["eval_template"] + + @abstractmethod + async def parse_eval_result(self, eval_result: str) -> float: + """Parse eval_result.""" + raise NotImplementedError + + @abstractmethod + async def aevaluate( + self, + query: str | None = None, + response: str | None = None, + contexts: Sequence[str] | None = None, + **kwargs: Any, + ) -> EvaluationResult: + """Run evaluation with query string, retrieved contexts, + and generated response string. + + Subclasses can override this method to provide custom evaluation logic and + take in additional arguments. + """ + raise NotImplementedError + + def _get_prompt_modules(self) -> PromptMixinType: + """Get prompt modules.""" + return {} diff --git a/src/pai_rag/evaluation/metrics/response/correctness.py b/src/pai_rag/evaluation/metrics/response/correctness.py new file mode 100644 index 00000000..3e8b820a --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/correctness.py @@ -0,0 +1,143 @@ +"""Correctness evaluation.""" +import asyncio +from typing import Any, Optional, Sequence, Union + +from llama_index.core.evaluation.base import EvaluationResult +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import ( + BasePromptTemplate, + ChatMessage, + ChatPromptTemplate, + MessageRole, + PromptTemplate, +) +from pai_rag.evaluation.metrics.response.base import LlmMetric + +DEFAULT_SYSTEM_TEMPLATE = """ +You are an expert evaluation system for a question answering chatbot. + +You are given the following information: +- a user query, and +- a generated answer + +You may also be given a reference answer to use for reference in your evaluation. + +Your job is to judge the relevance and correctness of the generated answer. +Output a single score that represents a holistic evaluation. +You must return your response in a line with only the score. +Do not return answers in any other format. +On a separate line provide your reasoning for the score as well. + +Follow these guidelines for scoring: +- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best. +- If the generated answer is not relevant to the user query, \ +you should give a score of 1. +- If the generated answer is relevant but contains mistakes, \ +you should give a score between 2 and 3. +- If the generated answer is relevant and fully correct, \ +you should give a score between 4 and 5. + +Example Response: +4.0 +The generated answer has the exact same metrics as the reference answer, \ + but it is not as concise. 
+ +""" + +DEFAULT_USER_TEMPLATE = """ +## User Query +{query} + +## Reference Answer +{reference_answer} + +## Generated Answer +{generated_answer} +""" + +DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate( + message_templates=[ + ChatMessage(role=MessageRole.SYSTEM, content=DEFAULT_SYSTEM_TEMPLATE), + ChatMessage(role=MessageRole.USER, content=DEFAULT_USER_TEMPLATE), + ] +) + + +class Correctness(LlmMetric): + """Correctness evaluator. + + Evaluates the correctness of a question answering system. + This evaluator depends on `reference` answer to be provided, in addition to the + query string and response string. + + It outputs a score between 1 and 5, where 1 is the worst and 5 is the best, + along with a reasoning for the score. + Passing is defined as a score greater than or equal to the given threshold. + + Args: + service_context (Optional[ServiceContext]): Service context. + eval_template (Optional[Union[BasePromptTemplate, str]]): + Template for the evaluation prompt. + score_threshold (float): Numerical threshold for passing the evaluation, + defaults to 4.0. + """ + + metric_name: str = "correctness" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + score_threshold: float = 4.0, + ) -> None: + if isinstance(eval_template, str): + eval_template = PromptTemplate(eval_template) + else: + eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + super().__init__(llm, raise_error, eval_template) + + self._score_threshold = score_threshold + + def parse_eval_result(self, eval_result: str): + if not eval_result.strip(): + # Return None or default values if the response is empty + return None, "No response" + + score_str, reasoning_str = eval_result.split("\n", 1) + + try: + score = float(score_str) + except ValueError: + score = None + + reasoning = reasoning_str.lstrip("\n") + return [score, reasoning] + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + reference: Optional[str] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("query, and response must be provided") + + raw_response = await self._llm.apredict( + prompt=self._eval_template, + query=query, + generated_answer=response, + reference_answer=reference or "(NO REFERENCE ANSWER SUPPLIED)", + ) + + # Use the parser function + return self.parse_eval_result(raw_response) diff --git a/src/pai_rag/evaluation/metrics/response/faithfulness.py b/src/pai_rag/evaluation/metrics/response/faithfulness.py new file mode 100644 index 00000000..7e545963 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/faithfulness.py @@ -0,0 +1,116 @@ +"""Faithfulness evaluation.""" +import asyncio +from typing import Any, Optional, Sequence, Union +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import ( + BasePromptTemplate, + PromptTemplate, +) +from llama_index.core.evaluation.base import EvaluationResult +from pai_rag.evaluation.metrics.response.base import LlmMetric + +DEFAULT_EVAL_TEMPLATE = PromptTemplate( + "Please tell if a given piece of information " + "is supported by the context.\n" + "You need to answer with either YES or NO.\n" + "Answer YES if any of the context supports the information, even " + "if most of the context is unrelated. 
" + "Some examples are provided below. \n\n" + "Information: Apple pie is generally double-crusted.\n" + "Context: An apple pie is a fruit pie in which the principal filling " + "ingredient is apples. \n" + "Apple pie is often served with whipped cream, ice cream " + "('apple pie à la mode'), custard or cheddar cheese.\n" + "It is generally double-crusted, with pastry both above " + "and below the filling; the upper crust may be solid or " + "latticed (woven of crosswise strips).\n" + "Answer: YES\n" + "Reason: The context explicitly states that 'It is generally double-crusted,' " + "which directly supports the information that 'Apple pie is generally double-crusted.' " + "Therefore, the information is confirmed by the context. \n\n" + "Information: Apple pies tastes bad.\n" + "Context: An apple pie is a fruit pie in which the principal filling " + "ingredient is apples. \n" + "Apple pie is often served with whipped cream, ice cream " + "('apple pie à la mode'), custard or cheddar cheese.\n" + "It is generally double-crusted, with pastry both above " + "and below the filling; the upper crust may be solid or " + "latticed (woven of crosswise strips).\n" + "Answer: NO\n" + "Reason: The context does not provide any information regarding the taste of apple pie. " + "It describes the ingredients and serving suggestions but does not support the claim that " + "'apple pies taste bad.' Therefore, the information is not supported by the context. \n" + "Information: {query_str}\n" + "Context: {context_str}\n" + "Answer: " + "Reason: " +) + + +class Faithfulness(LlmMetric): + """ + Faithfulness evaluator. + + Evaluates whether a response is faithful to the contexts + (i.e. whether the response is supported by the contexts or hallucinated.) + + This evaluator only considers the response string and the list of context strings. + + Args: + raise_error(bool): Whether to raise an error when the response is invalid. + Defaults to False. + eval_template(Optional[Union[str, BasePromptTemplate]]): + The template to use for evaluation. 
+ """ + + metric_name: str = "faithfulness" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + ) -> None: + if isinstance(eval_template, str): + eval_template = PromptTemplate(eval_template) + else: + eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + super().__init__(llm, raise_error, eval_template) + + def parse_eval_result(self, eval_result: str): + raw_response_txt = eval_result.text.lower() + if "yes" in raw_response_txt: + passing = True + else: + passing = False + if self._raise_error: + raise ValueError("The response is invalid") + score = 1.0 if passing else 0.0 + reasoning = raw_response_txt + return [score, reasoning] + + async def aevaluate( + self, + query: str | None = None, + response: str | None = None, + contexts: Sequence[str] | None = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + """Evaluate whether the response is faithful to the contexts.""" + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if contexts is None or response is None: + raise ValueError("contexts and response must be provided") + + prompt_str = self._eval_template.format( + query_str=query, + context_str="\n".join(contexts), + ) + raw_response = await self._llm.acomplete(prompt=prompt_str) + + # Use the parser function + return self.parse_eval_result(raw_response) diff --git a/src/pai_rag/evaluation/metrics/retrieval/core.py b/src/pai_rag/evaluation/metrics/retrieval/core.py new file mode 100644 index 00000000..c25024e8 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/core.py @@ -0,0 +1,36 @@ +def default_hit_rate(expected_ids, retrieved_ids): + """Default HitRate calculation: Check if there is a single hit""" + is_hit = any(id in expected_ids for id in retrieved_ids) + score = 1.0 if is_hit else 0.0 + return score + + +def granular_hit_rate(expected_ids, retrieved_ids): + """Granular HitRate calculation: Calculate all hits and divide by the number of expected docs""" + expected_set = set(expected_ids) + hits = sum(1 for doc_id in retrieved_ids if doc_id in expected_set) + score = hits / len(expected_ids) if expected_ids else 0.0 + return score + + +def default_mrr(expected_ids, retrieved_ids): + """Default MRR calculation: Reciprocal rank of the first relevant document retrieved""" + for i, id in enumerate(retrieved_ids): + if id in expected_ids: + return 1.0 / (i + 1) + return 0.0 + + +def granular_mrr(expected_ids, retrieved_ids): + """Granular MRR calculation: All relevant retrieved docs have their reciprocal ranks summed and averaged.""" + expected_set = set(expected_ids) + reciprocal_rank_sum = 0.0 + relevant_docs_count = 0 + for index, doc_id in enumerate(retrieved_ids): + if doc_id in expected_set: + relevant_docs_count += 1 + reciprocal_rank_sum += 1.0 / (index + 1) + score = ( + reciprocal_rank_sum / relevant_docs_count if relevant_docs_count > 0 else 0.0 + ) + return score diff --git a/src/pai_rag/evaluation/metrics/retrieval/hitrate.py b/src/pai_rag/evaluation/metrics/retrieval/hitrate.py new file mode 100644 index 00000000..a51c1e65 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/hitrate.py @@ -0,0 +1,51 @@ +from typing import List, Optional +from pai_rag.evaluation.metrics.retrieval.core import ( + granular_hit_rate, + default_hit_rate, +) + + +class HitRate: + """Hit rate metric: Compute hit rate with two calculation options. 
+ + - The default method checks for a single match between any of the retrieved docs and expected docs. + - The more granular method checks for all potential matches between retrieved docs and expected docs. + + Attributes: + metric_name (str): The name of the metric. + use_granular_hit_rate (bool): Determines whether to use the granular method for calculation. + """ + + metric_name: str = "hitrate" + use_granular_hit_rate: bool = False + + def compute( + self, + expected_ids: Optional[List[str]] = None, + retrieved_ids: Optional[List[str]] = None, + ): + """Compute metric based on the provided inputs. + + Parameters: + expected_ids (Optional[List[str]]): Expected document IDs. + retrieved_ids (Optional[List[str]]): Retrieved document IDs. + + Raises: + ValueError: If the necessary IDs are not provided. + + Returns: + RetrievalMetricResult: The result with the computed hit rate score. + """ + # Checking for the required arguments + if ( + retrieved_ids is None + or expected_ids is None + or not retrieved_ids + or not expected_ids + ): + raise ValueError("Retrieved ids and expected ids must be provided") + + if self.use_granular_hit_rate: + return granular_hit_rate(expected_ids, retrieved_ids) + else: + return default_hit_rate(expected_ids, retrieved_ids) diff --git a/src/pai_rag/evaluation/metrics/retrieval/mrr.py b/src/pai_rag/evaluation/metrics/retrieval/mrr.py new file mode 100644 index 00000000..ed92449b --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/mrr.py @@ -0,0 +1,51 @@ +from typing import List, Optional +from pai_rag.evaluation.metrics.retrieval.core import ( + default_mrr, + granular_mrr, +) + + +class MRR: + """MRR (Mean Reciprocal Rank) metric with two calculation options. + + - The default method calculates the reciprocal rank of the first relevant retrieved document. + - The more granular method sums the reciprocal ranks of all relevant retrieved documents and divides by the count of relevant documents. + + Attributes: + metric_name (str): The name of the metric. + use_granular_mrr (bool): Determines whether to use the granular method for calculation. + """ + + metric_name: str = "mrr" + use_granular_mrr: bool = False + + def compute( + self, + expected_ids: Optional[List[str]] = None, + retrieved_ids: Optional[List[str]] = None, + ): + """Compute MRR based on the provided inputs and selected method. + + Parameters: + expected_ids (Optional[List[str]]): Expected document IDs. + retrieved_ids (Optional[List[str]]): Retrieved document IDs. + + Raises: + ValueError: If the necessary IDs are not provided. + + Returns: + RetrievalMetricResult: The result with the computed MRR score. 
+ """ + # Checking for the required arguments + if ( + retrieved_ids is None + or expected_ids is None + or not retrieved_ids + or not expected_ids + ): + raise ValueError("Retrieved ids and expected ids must be provided") + + if self.use_granular_mrr: + return granular_mrr(expected_ids, retrieved_ids) + else: + return default_mrr(expected_ids, retrieved_ids) diff --git a/src/pai_rag/evaluation/run_evaluation_experiments.py b/src/pai_rag/evaluation/run_evaluation_experiments.py new file mode 100644 index 00000000..7cb0826b --- /dev/null +++ b/src/pai_rag/evaluation/run_evaluation_experiments.py @@ -0,0 +1,64 @@ +import yaml +import click +import logging +import time +import json +import hashlib +from pai_rag.evaluation.run_evaluation_pipeline import run_evaluation_pipeline + + +def validate_json_file(ctx, param, value): + """检查文件路径是否以 .json 结尾""" + if value is not None and not value.endswith(".json"): + raise click.BadParameter( + "Output path must be a JSON file with a .json extension." + ) + return value + + +def calculate_md5_from_json(data): + """计算 JSON 内容的 MD5 值""" + hasher = hashlib.md5() + # 将 JSON 对象转换为字符串,并确保顺序一致 + json_str = json.dumps(data, sort_keys=True) + hasher.update(json_str.encode("utf-8")) + return hasher.hexdigest() + + +def run_experiment(exp_params): + name = exp_params["name"] + logging.info(f"Running experiment with name={name}, exp_params={exp_params}") + try: + # 运行实验并获取结果 + result = run_evaluation_pipeline( + config=exp_params["setting_file"], + data_path=exp_params["data_path"], + name=name, + ) + logging.info(f"Finished experiment with name={name}") + except Exception as e: + logging.error(f"Error running experiment {name}: {e}") + + return {"name": exp_params["name"], "parameters": exp_params, "result": result} + + +@click.command() +@click.option("-i", "--input_exp_config", show_default=True) +@click.option("-o", "--output_path", callback=validate_json_file, show_default=True) +def run(input_exp_config=None, output_path=None): + with open(input_exp_config) as file: + configs = yaml.safe_load(file) + + if not output_path: + timestamp = time.strftime("%Y%m%d-%H%M%S") + file_key = calculate_md5_from_json(configs) + output_path = f"localdata/eval_exp_data/results_{file_key}_{timestamp}.json" + results = [] + for exp in configs["experiment"]: + result = run_experiment(exp) + results.append(result) + + with open(output_path, "w") as result_file: + json.dump(results, result_file, ensure_ascii=False, indent=4) + + logging.info(f"Results saved to {output_path}") diff --git a/src/pai_rag/evaluation/run_evaluation_pipeline.py b/src/pai_rag/evaluation/run_evaluation_pipeline.py new file mode 100644 index 00000000..67841a6b --- /dev/null +++ b/src/pai_rag/evaluation/run_evaluation_pipeline.py @@ -0,0 +1,99 @@ +import os +import asyncio +from pathlib import Path +from pai_rag.core.rag_config_manager import RagConfigManager +from pai_rag.core.rag_data_loader import RagDataLoader +from pai_rag.core.rag_module import ( + resolve, + resolve_data_loader, + resolve_llm, + resolve_vector_index, + resolve_query_engine, +) +from pai_rag.evaluation.generator.rag_qca_generator import RagQcaGenerator +from pai_rag.integrations.llms.pai.pai_multi_modal_llm import ( + PaiMultiModalLlm, +) +from pai_rag.evaluation.evaluator.base_evaluator import BaseEvaluator +import logging + + +logger = logging.getLogger(__name__) + +_BASE_DIR = Path(__file__).parent.parent +DEFAULT_APPLICATION_CONFIG_FILE = os.path.join( + _BASE_DIR, "evaluation/settings_eval.toml" +) + + +def _create_data_loader( + 
config_file, name, enable_raptor: bool = False +) -> RagDataLoader: + config = RagConfigManager.from_file(config_file).get_value() + + config.index.vector_store.persist_path = ( + f"{config.index.vector_store.persist_path}__{name}" + ) + data_loader = resolve_data_loader(config) + vector_index = resolve_vector_index(config) + query_engine = resolve_query_engine(config) + + return data_loader, vector_index, query_engine + + +def _create_qca_generator(config_file, name, vector_index, query_engine): + config = RagConfigManager.from_file(config_file).get_value() + multimodal_llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) + persist_path = f"{config.index.vector_store.persist_path}__{name}" + qca_generator = RagQcaGenerator( + llm=multimodal_llm, + vector_index=vector_index, + query_engine=query_engine, + persist_path=persist_path, + ) + return qca_generator + + +def _create_base_evaluator(config_file, name): + config = RagConfigManager.from_file(config_file).get_value() + llm = resolve_llm(config) + persist_path = f"{config.index.vector_store.persist_path}__{name}" + return BaseEvaluator( + llm=llm, + persist_path=persist_path, + ) + + +def run_evaluation_pipeline( + config=None, + oss_path=None, + data_path=None, + pattern=None, + enable_raptor=False, + name="default", +): + assert (oss_path is not None) or ( + data_path is not None + ), "Must provide either local path or oss path." + assert (oss_path is None) or ( + data_path is None + ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." + + data_loader, vector_index, query_engine = _create_data_loader( + config, name, enable_raptor + ) + data_loader.load_data( + file_path_or_directory=data_path, + filter_pattern=pattern, + oss_path=oss_path, + from_oss=oss_path is not None, + enable_raptor=enable_raptor, + ) + qca_generator = _create_qca_generator(config, name, vector_index, query_engine) + _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="labelled")) + _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="predicted")) + evaluator = _create_base_evaluator(config, name) + retrieval_result = asyncio.run(evaluator.aevaluation(stage="retrieval")) + response_result = asyncio.run(evaluator.aevaluation(stage="response")) + print("retrieval_result", retrieval_result, "response_result", response_result) + return {"retrieval": retrieval_result, "response": response_result} diff --git a/src/pai_rag/tools/agent_tool.py b/src/pai_rag/tools/agent_tool.py index 396ed721..122903c3 100644 --- a/src/pai_rag/tools/agent_tool.py +++ b/src/pai_rag/tools/agent_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_agent import logging @@ -61,12 +60,11 @@ def run( config = RagConfigManager.from_file(config_file).get_value() if tool_definition_file: - config.rag.agent.tool_definition_file = tool_definition_file + config.agent.tool_definition_file = tool_definition_file if python_script_file: - config.rag.agent.python_script_file = python_script_file + config.agent.python_script_file = python_script_file - rag_config = RagConfig.model_validate(config.rag) - agent = resolve_agent(rag_config) + agent = resolve_agent(config) print("**Question**: ", question) response = agent.chat(question) diff --git a/src/pai_rag/tools/load_data_tool.py b/src/pai_rag/tools/load_data_tool.py index 120581ba..d3dd94cd 100644 --- 
a/src/pai_rag/tools/load_data_tool.py +++ b/src/pai_rag/tools/load_data_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_data_loader import logging @@ -72,9 +71,8 @@ def run( ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." config = RagConfigManager.from_file(config_file).get_value() - rag_config = RagConfig.model_validate(config.rag) - data_loader = resolve_data_loader(rag_config) + data_loader = resolve_data_loader(config) data_loader.load_data( file_path_or_directory=data_path, filter_pattern=pattern, diff --git a/src/pai_rag/tools/query_tool.py b/src/pai_rag/tools/query_tool.py index 3ef6a693..8f303bc1 100644 --- a/src/pai_rag/tools/query_tool.py +++ b/src/pai_rag/tools/query_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_query_engine, setup_tracing from pai_rag.integrations.synthesizer.pai_synthesizer import PaiQueryBundle @@ -45,10 +44,9 @@ def run( logging.basicConfig(level=logging.INFO) config = RagConfigManager.from_file(config_file).get_value() - rag_config = RagConfig.model_validate(config.rag) - setup_tracing(rag_config.trace) + setup_tracing(config.trace) - query_engine = resolve_query_engine(rag_config) + query_engine = resolve_query_engine(config) print("**Question**: ", question)
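
Usage note (hedged sketch, not part of the diff above): this change replaces the old `evaluation` console script with `run_eval_exp`, which reads an experiment YAML such as src/pai_rag/config/evaluation/config.yaml via -i/--input_exp_config and writes a JSON summary via -o/--output_path. Each experiment entry can also be driven programmatically through run_evaluation_pipeline; the snippet below only mirrors the signature and the example paths introduced in this diff and is illustrative, not a canonical interface.

# Hedged sketch: run a single evaluation experiment programmatically,
# mirroring what run_eval_exp does for each entry in config.yaml.
# The settings file and data path are the example values from the new
# src/pai_rag/config/evaluation/config.yaml ("exp1") and are assumptions.
from pai_rag.evaluation.run_evaluation_pipeline import run_evaluation_pipeline

result = run_evaluation_pipeline(
    config="src/pai_rag/config/evaluation/settings_eval.toml",
    data_path="example_data/eval_docs",
    name="exp1",
)

# Per the pipeline's return value, result["retrieval"] holds mean_hitrate and
# mean_mrr, and result["response"] holds mean_faithfulness_score and
# mean_correctness_score for the generated QCA dataset.
print(result["retrieval"], result["response"])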