Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multi evaluators and experiments pipeline #247

Merged
merged 12 commits into from
Oct 25, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ docx = "^0.2.4"
pai_rag = "pai_rag.main:main"
load_data = "pai_rag.tool.load_data_tool:run"
load_model = "pai_rag.utils.download_models:load_models"
evaluation = "pai_rag.evaluation.eval_pipeline:run"
run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"

[[tool.poetry.source]]
name = "pytorch_cpu"
Expand Down
2 changes: 1 addition & 1 deletion pyproject_gpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ peft = "^0.12.0"
pai_rag = "pai_rag.main:main"
load_data = "pai_rag.tool.load_data_tool:run"
load_model = "pai_rag.utils.download_models:load_models"
evaluation = "pai_rag.evaluation.eval_pipeline:run"
run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"

[tool.pytest.ini_options]
asyncio_mode = "auto"
2 changes: 1 addition & 1 deletion src/pai_rag/app/web/ui_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
MLLM_MODEL_KEY_DICT = {
"dashscope": [
"qwen-vl-max",
"qwen-vl-turbo",
"qwen-vl-plus",
]
}

Expand Down
8 changes: 8 additions & 0 deletions src/pai_rag/config/evaluation/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
experiment:
# [custom knowledge dataset]
- name: "exp1"
data_path: "example_data/eval_docs"
setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
- name: "exp2"
data_path: "example_data/eval_docs_1"
setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,8 @@ name = "pai_rag"
version = "0.1.1"

[rag.agent]
type = "react"

[rag.agent.custom_config]
agent_file_path = ""

[rag.agent.intent_detection]
type = ""

[rag.agent.tool]
type = ""
custom_agent_config_file = ""
agent_tool_type = ""

[rag.chat_store]
type = "Local" # [Local, Aliyun-Redis]
Expand All @@ -23,12 +15,9 @@ password = "Aliyun-Redis user:pwd"
persist_path = "localdata/eval_exp_data/storage"

[rag.data_analysis]
analysis_type = "nl2pandas"
type = "pandas"
nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "

[rag.data_loader]
type = "local"

[rag.data_reader]
type = "SimpleDirectoryReader"

Expand All @@ -42,9 +31,6 @@ type = "SimpleDirectoryReader"
source = "DashScope"
embed_batch_size = 10

[rag.embedding.multi_modal]
source = "cnclip"

[rag.index]
persist_path = "localdata/eval_exp_data/storage"
enable_multimodal = true
Expand All @@ -60,12 +46,11 @@ vector_store.type = "FAISS"
source = "DashScope"
model = "qwen-turbo"

[rag.llm.function_calling_llm]
source = ""
[rag.multimodal_embedding]
source = "cnclip"

[rag.llm.multi_modal]
enable = true
source = "DashScope"
[rag.multimodal_llm]
source = "dashscope"
model = "qwen-vl-plus"

[rag.node_enhancement]
Expand All @@ -81,20 +66,16 @@ enable_multimodal = true

[rag.oss_store]
bucket = ""
endpoint = ""
prefix = ""
endpoint = "oss-cn-hangzhou.aliyuncs.com"

[rag.postprocessor]
reranker_type = "simple-weighted-reranker" # [simple-weighted-reranker, model-based-reranker]
reranker_type = "no-reranker" # [no-reranker, simple-weighted-reranker, model-based-reranker]
reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
keyword_weight = 0.3
vector_weight = 0.7
similarity_threshold = 0.5
top_n = 2

[rag.query_engine]
type = "RetrieverQueryEngine"

[rag.query_transform]
type = ""

Expand All @@ -111,6 +92,6 @@ type = "SimpleSummarize"
text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n"

[rag.trace]
type = "pai-llm-trace"
type = "pai_trace"
endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090"
token = ""
111 changes: 111 additions & 0 deletions src/pai_rag/evaluation/dataset/rag_eval_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from typing import List, Optional, Type, Dict
from llama_index.core.bridge.pydantic import Field
import json
from llama_index.core.bridge.pydantic import BaseModel
from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample


class EvaluationSample(RagQcaSample):
    """Response Evaluation RAG example class.

    Extends ``RagQcaSample`` with per-sample evaluation outputs: retrieval
    metrics (``hitrate``, ``mrr``) and response metrics (faithfulness and
    correctness, each with a score and a reason). Every field is optional
    and defaults to ``None`` until the corresponding value is filled in.
    """

    # NOTE: these fields use ``default=None`` rather than
    # ``default_factory=None``. ``None`` is not a callable factory, so passing
    # it as ``default_factory`` is equivalent to declaring no default at all,
    # which makes the field required under pydantic v2.
    hitrate: Optional[float] = Field(
        default=None,
        description="The hitrate value for retrieval evaluation.",
    )
    mrr: Optional[float] = Field(
        default=None,
        description="The mrr value for retrieval evaluation.",
    )

    faithfulness_score: Optional[float] = Field(
        default=None,
        description="The faithfulness score for response evaluation.",
    )

    faithfulness_reason: Optional[str] = Field(
        default=None,
        description="The faithfulness reason for response evaluation.",
    )

    correctness_score: Optional[float] = Field(
        default=None,
        description="The correctness score for response evaluation.",
    )

    correctness_reason: Optional[str] = Field(
        default=None,
        description="The correctness reason for response evaluation.",
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "EvaluationSample"


class PaiRagEvalDataset(BaseModel):
    """Dataset of evaluated samples together with aggregated metric results.

    ``status`` is read under the keys ``"retrieval"`` and ``"response"`` to
    decide which groups of mean metrics are computed into ``results``.
    """

    _example_type: Type[EvaluationSample] = EvaluationSample  # type: ignore[misc]
    examples: List[EvaluationSample] = Field(
        default_factory=list, description="Data examples of this dataset."
    )
    results: Dict[str, Dict[str, float]] = Field(
        default_factory=dict, description="Evaluation result of this dataset."
    )
    status: Dict[str, bool] = Field(
        default_factory=dict, description="Status of this dataset."
    )

    @property
    def class_name(self) -> str:
        """Class name."""
        return "PaiRagEvalDataset"

    def cal_mean_metric_score(self) -> None:
        """Compute mean metric scores over all examples into ``self.results``.

        ``results["retrieval"]`` and ``results["response"]`` are always reset
        to empty dicts first. A group is then filled only when its flag in
        ``self.status`` is truthy; a missing flag is treated as ``False``.
        When the dataset has no examples both groups stay empty, because a
        mean over zero samples is undefined.

        Raises:
            TypeError: If a metric group is enabled but one of the samples
                still holds ``None`` for a metric in that group.
        """
        self.results["retrieval"] = {}
        self.results["response"] = {}

        sample_count = len(self.examples)
        if sample_count == 0:
            # Nothing to average; avoid dividing by zero.
            return

        def mean_of(metric_name: str) -> float:
            """Return the arithmetic mean of one metric attribute."""
            total = sum(
                float(getattr(sample, metric_name)) for sample in self.examples
            )
            return total / sample_count

        # ``.get`` keeps a dataset with the default (empty) status usable.
        if self.status.get("retrieval", False):
            self.results["retrieval"] = {
                "mean_hitrate": mean_of("hitrate"),
                "mean_mrr": mean_of("mrr"),
            }
        if self.status.get("response", False):
            self.results["response"] = {
                "mean_faithfulness_score": mean_of("faithfulness_score"),
                "mean_correctness_score": mean_of("correctness_score"),
            }

    def save_json(self, path: str) -> None:
        """Recompute the mean metrics and write the dataset to ``path`` as JSON.

        The file is written as UTF-8 with non-ASCII characters kept verbatim.
        """
        self.cal_mean_metric_score()

        with open(path, "w", encoding="utf-8") as f:
            examples = [self._example_type.dict(el) for el in self.examples]
            data = {
                "examples": examples,
                "results": self.results,
                "status": self.status,
            }

            json.dump(data, f, indent=4, ensure_ascii=False)
            print(f"Saved dataset to {path}.")

    @classmethod
    def from_json(cls, path: str) -> "PaiRagEvalDataset":
        """Load a dataset from a JSON file produced by ``save_json``.

        The file must contain the keys ``"examples"``, ``"results"`` and
        ``"status"``.
        """
        # Read with the same encoding ``save_json`` writes; the platform
        # default may not be UTF-8 and the file can hold raw non-ASCII text.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
        results = data["results"]
        status = data["status"]

        return cls(examples=examples, results=results, status=status)
Original file line number Diff line number Diff line change
@@ -1,22 +1,15 @@
from typing import List, Optional
from typing import List, Optional, Type
from llama_index.core.bridge.pydantic import Field
from llama_index.core.llama_dataset.base import BaseLlamaDataExample
from llama_index.core.llama_dataset import CreatedBy
import json
from llama_index.core.bridge.pydantic import BaseModel


class LabelledRagQcaSample(BaseLlamaDataExample):
"""RAG example class. Analogous to traditional ML datasets, this dataset contains
class RagQcaSample(BaseLlamaDataExample):
"""Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
to evaluate the prediction.

Args:
query (str): The user query
query_by (CreatedBy): Query generated by human or ai (model-name)
reference_contexts (Optional[List[str]]): The contexts used for response
reference_node_id (Optional[List[str]]): The node id corresponding to the contexts
reference_answer ([str]): Reference answer to the query. An answer
that would receive full marks upon evaluation.
reference_answer_by: The reference answer generated by human or ai (model-name).
"""

query: str = Field(
Expand All @@ -40,18 +33,6 @@ class LabelledRagQcaSample(BaseLlamaDataExample):
default=None, description="What model generated the reference answer."
)

@property
def class_name(self) -> str:
"""Data example class name."""
return "LabelledRagQcaSample"


class PredictedRagQcaSample(LabelledRagQcaSample):
"""Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
to evaluate the prediction.
"""

predicted_contexts: Optional[List[str]] = Field(
default_factory=None,
description="The contexts used to generate the predicted answer.",
Expand All @@ -71,4 +52,50 @@ class PredictedRagQcaSample(LabelledRagQcaSample):
@property
def class_name(self) -> str:
"""Data example class name."""
return "PredictedRagQcaSample"
return "RagQcaSample"


class PaiRagQcaDataset(BaseModel):
    """Dataset of ``RagQcaSample`` examples with two progress flags.

    ``labelled`` and ``predicted`` are stored and restored alongside the
    examples by ``save_json`` / ``from_json``.
    """

    _example_type: Type[RagQcaSample] = RagQcaSample  # type: ignore[misc]
    examples: List[RagQcaSample] = Field(
        default_factory=list, description="Data examples of this dataset."
    )
    labelled: bool = Field(
        default=False, description="Whether the dataset is labelled or not."
    )
    predicted: bool = Field(
        default=False, description="Whether the dataset is predicted or not."
    )

    @property
    def class_name(self) -> str:
        """Class name."""
        return "PaiRagQcaDataset"

    def save_json(self, path: str) -> None:
        """Write the examples and both flags to ``path`` as JSON.

        The file is written as UTF-8 with non-ASCII characters kept verbatim.
        """
        with open(path, "w", encoding="utf-8") as f:
            examples = [self._example_type.dict(el) for el in self.examples]
            data = {
                "examples": examples,
                "labelled": self.labelled,
                "predicted": self.predicted,
            }

            json.dump(data, f, indent=4, ensure_ascii=False)
            print(f"Saved PaiRagQcaDataset to {path}.")

    @classmethod
    def from_json(cls, path: str) -> Optional["PaiRagQcaDataset"]:
        """Load a dataset from a JSON file produced by ``save_json``.

        Returns:
            The loaded dataset, or ``None`` when the file's ``"examples"``
            list is empty. Callers must handle the ``None`` case.
        """
        # Read with the same encoding ``save_json`` writes; the platform
        # default may not be UTF-8 and the file can hold raw non-ASCII text.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        if not data["examples"]:
            # Existing contract: an empty file yields no dataset object.
            return None

        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
        labelled = data["labelled"]
        predicted = data["predicted"]

        return cls(examples=examples, labelled=labelled, predicted=predicted)
Loading
Loading