diff --git "a/example_data/eval_docs_image/\350\267\221\351\236\213\346\216\250\350\215\220.pdf" "b/example_data/eval_docs_image/\350\267\221\351\236\213\346\216\250\350\215\220.pdf" new file mode 100644 index 00000000..8dfb5614 Binary files /dev/null and "b/example_data/eval_docs_image/\350\267\221\351\236\213\346\216\250\350\215\220.pdf" differ diff --git a/example_data/eval_docs_text/EasyRec.txt b/example_data/eval_docs_text/EasyRec.txt new file mode 100644 index 00000000..d6fc32b7 --- /dev/null +++ b/example_data/eval_docs_text/EasyRec.txt @@ -0,0 +1,44 @@ +EasyRec是一个易于使用的推荐框架¶ +EasyRec 实现了常见推荐任务中使用的最先进的机器学习模型:候选生成(匹配)、评分(排名)和多任务学习。 它通过简单的配置和超参数调整(HPO)提高了生成高性能模型的效率。 + +EasyRec视频介绍 +为什么选择 EasyRec?¶ +到处运行¶ +MaxCompute / 数据科学 / DLC / 本地 +TF1.12-1.15 / TF2.x / PAI-TF +多样的输入数据¶ +MaxCompute表 +HDFS 文件 +操作系统文件 +卡夫卡流 +本地 CSV + +配置简单¶ +灵活的功能配置和简单的模型配置 +高效、鲁棒的特征生成[淘宝使用] +漂亮的网络界面正在开发中 + +它很聪明¶ +EarlyStop / 最佳检查站保护程序 +超参数搜索/AutoFeatureCross +开发中:NAS、知识蒸馏、多模式 + +规模大、部署方便¶ +支持大规模嵌入,增量保存 +许多并行策略:ParameterServer、Mirrored、MultiWorker +轻松部署到 EAS:自动扩展、轻松监控 +一致性保证:训练和服务 + +多种模型 +DSSM / MIND / DropoutNet / CoMetricLearningI2I / PDN +W&D / DeepFM / MultiTower / DCN / DIN / BST +MMoE / ESMM / DBMTL / PLE +CMBF / 联合 + +易于定制¶ +易于实现定制模型 +无需关心数据管道 +快速向量检索¶ +在分布式环境中运行向量的 knn 算法 + +欢迎加入【EasyRec推荐算法交流群】,钉钉群号 : 32260796 diff --git a/example_data/eval_docs_text/PAI.txt b/example_data/eval_docs_text/PAI.txt new file mode 100644 index 00000000..bf115c73 --- /dev/null +++ b/example_data/eval_docs_text/PAI.txt @@ -0,0 +1,14 @@ +机器学习PAI(Platform of Artificial Intelligence)是阿里云人工智能平台,提供一站式的机器学习解决方案。本文为您介绍什么是机器学习PAI。 + +什么是机器学习 +机器学习是指机器通过统计学算法,对大量历史数据进行学习,进而利用生成的经验模型指导业务。目前机器学习主要应用在以下场景: +营销类场景:商品推荐、用户群体画像或广告精准投放。 +金融类场景:贷款发放预测、金融风险控制、股票走势预测或黄金价格预测。 +社交网络服务关系挖掘场景:微博粉丝领袖分析或社交关系链分析。 +文本类场景:新闻分类、关键词提取、文章摘要或文本内容分析。 +非结构化数据处理场景:图片分类或图片文本内容提取。 +其它各类预测场景:降雨预测或足球比赛结果预测。 +机器学习包括传统机器学习和深度学习。传统机器学习分为以下几类: +有监督学习(Supervised Learning):每个样本都有对应的期望值,通过搭建模型,实现从输入特征向量到目标值的映射。例如解决回归和分类问题。 +无监督学习(Unsupervised Learning):所有样本没有目标值,期望从数据本身发现一些潜在规律。例如解决聚类问题。 +增强学习(Reinforcement Learning):相对比较复杂,系统和外界环境不断交互,根据外界反馈决定自身行为,达到目标最优化。例如阿尔法围棋和无人驾驶。 diff --git a/src/pai_rag/config/evaluation/config.yaml b/src/pai_rag/config/evaluation/config.yaml index 76823aa8..1b286790 100644 --- a/src/pai_rag/config/evaluation/config.yaml +++ b/src/pai_rag/config/evaluation/config.yaml @@ -1,8 +1,12 @@ experiment: # [custom knowledge dataset] - name: "exp1" - data_path: "example_data/eval_docs" - setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" + eval_data_path: "example_data/eval_docs_text" + eval_model_source: "Dashscope" + eval_model_name: "qwen-max" + rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml" - name: "exp2" - data_path: "example_data/eval_docs_1" - setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" + eval_data_path: "example_data/eval_docs_image" + eval_model_source: "Dashscope" + eval_model_name: "qwen-vl-max" + rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml" diff --git a/src/pai_rag/config/evaluation/settings_eval_for_image.toml b/src/pai_rag/config/evaluation/settings_eval_for_image.toml new file mode 100644 index 00000000..5a251446 --- /dev/null +++ b/src/pai_rag/config/evaluation/settings_eval_for_image.toml @@ -0,0 +1,99 @@ +dynaconf_merge = true + +[rag] +name = "pai_rag" +version = "0.1.1" + +[rag.agent] +custom_agent_config_file = "" +agent_tool_type = "" + +[rag.chat_store] +type = "Local" # [Local, Aliyun-Redis] +host = "Aliyun-Redis host" +password = 
"Aliyun-Redis user:pwd" +persist_path = "localdata/eval_exp_data/storage" + +[rag.data_analysis] +type = "pandas" +nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: " + +[rag.data_reader] +type = "SimpleDirectoryReader" + +# embedding configurations, source support API: OpenAI,DashScope; and local model:HuggingFace +# if use API, need set OPENAI_API_KEY or DASHSCOPE_API_KEY in ENV, If HuggingFace, need set model +# eg. +# source = "HuggingFace" +# model = "bge-large-zh-v1.5" +# embed_batch_size = 10 +[rag.embedding] +source = "DashScope" +embed_batch_size = 10 + +[rag.index] +persist_path = "localdata/eval_exp_data/storage" +enable_multimodal = true +vector_store.type = "FAISS" + +# llm configurations, source support API: OpenAI,DashScope or PAI-EAS's deployment +# eg. +# source = "PaiEas" +# model = "" +# endpoint = "" +# token = "" +[rag.llm] +source = "DashScope" +model = "qwen-turbo" + +[rag.multimodal_embedding] +source = "cnclip" + +[rag.multimodal_llm] +source = "dashscope" +model = "qwen-vl-plus" + +[rag.node_enhancement] +tree_depth = 3 +max_clusters = 52 +proba_threshold = 0.10 + +[rag.node_parser] +type = "Sentence" +chunk_size = 500 +chunk_overlap = 10 +enable_multimodal = true + +[rag.oss_store] +bucket = "pai-rag" +endpoint = "oss-cn-hangzhou.aliyuncs.com" +prefix = "evaluation" + +[rag.postprocessor] +reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker] +reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large] +keyword_weight = 0.3 +vector_weight = 0.7 +similarity_threshold = 0.5 +top_n = 2 + +[rag.query_transform] +type = "" + +[rag.retriever] +similarity_top_k = 3 +retrieval_mode = "hybrid" # [hybrid, embedding, keyword, router] +query_rewrite_n = 1 # set to 1 to disable query generation +search_image = true + +[rag.search] +search_api_key = "" + +[rag.synthesizer] +type = "SimpleSummarize" +text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n" + +[rag.trace] +type = "pai_trace" +endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090" +token = "" diff --git a/src/pai_rag/config/evaluation/settings_eval.toml b/src/pai_rag/config/evaluation/settings_eval_for_text.toml similarity index 98% rename from src/pai_rag/config/evaluation/settings_eval.toml rename to src/pai_rag/config/evaluation/settings_eval_for_text.toml index 341c3949..0e7186c8 100644 --- a/src/pai_rag/config/evaluation/settings_eval.toml +++ b/src/pai_rag/config/evaluation/settings_eval_for_text.toml @@ -33,7 +33,7 @@ embed_batch_size = 10 [rag.index] persist_path = "localdata/eval_exp_data/storage" -enable_multimodal = true +enable_multimodal = false vector_store.type = "FAISS" # llm configurations, source support API: OpenAI,DashScope or PAI-EAS's deployment @@ -62,7 +62,7 @@ proba_threshold = 0.10 type = "Sentence" chunk_size = 500 chunk_overlap = 10 -enable_multimodal = true +enable_multimodal = false [rag.oss_store] bucket = "" diff --git a/src/pai_rag/evaluation/dataset/rag_eval_dataset.py b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py index c9555e37..8d8d5330 100644 --- a/src/pai_rag/evaluation/dataset/rag_eval_dataset.py +++ b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py @@ -3,6 +3,7 
@@ import json
 from llama_index.core.bridge.pydantic import BaseModel
 from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample
+from llama_index.core.llama_dataset import CreatedBy


 class EvaluationSample(RagQcaSample):
@@ -36,6 +37,9 @@ class EvaluationSample(RagQcaSample):
         default_factory=None,
         description="The correctness reason for response evaluation.",
     )
+    evaluated_by: Optional[CreatedBy] = Field(
+        default=None, description="What model generated the evaluation result."
+    )

     @property
     def class_name(self) -> str:
diff --git a/src/pai_rag/evaluation/dataset/rag_qca_dataset.py b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py
index a4d0bd3d..5af59983 100644
--- a/src/pai_rag/evaluation/dataset/rag_qca_dataset.py
+++ b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py
@@ -25,6 +25,10 @@ class RagQcaSample(BaseLlamaDataExample):
     reference_node_id: Optional[List[str]] = Field(
         default_factory=None, description="The node id corresponding to the contexts"
     )
+    reference_image_url_list: Optional[List[str]] = Field(
+        default_factory=None,
+        description="The image urls used to generate the reference answer.",
+    )
     reference_answer: str = Field(
         default_factory=str,
         description="The reference (ground-truth) answer to the example.",
@@ -41,6 +45,10 @@ class RagQcaSample(BaseLlamaDataExample):
         default_factory=None,
         description="The node id corresponding to the predicted contexts",
     )
+    predicted_image_url_list: Optional[List[str]] = Field(
+        default_factory=None,
+        description="The image urls used to generate the predicted answer.",
+    )
     predicted_answer: str = Field(
         default_factory=str,
         description="The predicted answer to the example.",
diff --git a/src/pai_rag/evaluation/evaluator/base_evaluator.py b/src/pai_rag/evaluation/evaluator/base_evaluator.py
index b3ccb56b..6a71762b 100644
--- a/src/pai_rag/evaluation/evaluator/base_evaluator.py
+++ b/src/pai_rag/evaluation/evaluator/base_evaluator.py
@@ -9,11 +9,15 @@
     EvaluationSample,
     PaiRagEvalDataset,
 )
+from llama_index.core.llama_dataset import (
+    CreatedBy,
+    CreatedByType,
+)
 from pai_rag.evaluation.dataset.rag_qca_dataset import PaiRagQcaDataset


 class BaseEvaluator:
-    def __init__(self, llm, persist_path: str = None):
+    def __init__(self, llm, persist_path: str = None, enable_multi_modal: bool = False):
         self._llm = llm
         self.persist_path = persist_path
         self.hitrate = HitRate()
@@ -32,9 +36,13 @@ def __init__(self, llm, persist_path: str = None):
         self.evaluation_dataset_path = os.path.join(
             self.persist_path, "evaluation_dataset.json"
         )
+        self.created_by = CreatedBy(
+            type=CreatedByType.AI, model_name=self._llm.metadata.model_name
+        )
         self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json")
         self._show_progress = True
         self._workers = 2
+        self.enable_multi_modal = enable_multi_modal

     def load_qca_dataset(self) -> None:
         if os.path.exists(self.qca_dataset_path):
@@ -75,22 +83,44 @@ async def compute_retrieval_metrics(self, qca_sample):
         for metric in self.retrieval_evaluators:
             metric_score = metric.compute(reference_node_id, predicted_node_id)
             setattr(retrieval_eval_example, metric.metric_name, metric_score)
+            setattr(retrieval_eval_example, "evaluated_by", self.created_by)
         return retrieval_eval_example

     async def compute_response_metrics(self, qca_sample):
         response_eval_example = EvaluationSample(**vars(qca_sample))
         query = response_eval_example.query
-        response = response_eval_example.reference_answer
+        reference_answer = response_eval_example.reference_answer
+        response_answer = response_eval_example.predicted_answer
+ reference_image_url_list = response_eval_example.reference_image_url_list contexts = response_eval_example.predicted_contexts + for metric in self.response_evaluators: - metric_result = await metric.aevaluate(query, response, contexts) + if self.enable_multi_modal: + metric_result = await metric.aevaluate_multimodal( + query, + reference_answer, + contexts, + reference_image_url_list, + response_answer, + sleep_time_in_seconds=0.5, + ) + else: + metric_result = await metric.aevaluate( + query, + reference_answer, + contexts, + response_answer, + sleep_time_in_seconds=0.5, + ) + setattr( response_eval_example, f"{metric.metric_name}_score", metric_result[0] ) setattr( response_eval_example, f"{metric.metric_name}_reason", metric_result[1] ) + setattr(response_eval_example, "evaluated_by", self.created_by) return response_eval_example diff --git a/src/pai_rag/evaluation/generator/rag_qca_generator.py b/src/pai_rag/evaluation/generator/rag_qca_generator.py index 6002462c..9e401fba 100644 --- a/src/pai_rag/evaluation/generator/rag_qca_generator.py +++ b/src/pai_rag/evaluation/generator/rag_qca_generator.py @@ -2,8 +2,9 @@ from llama_index.core.indices import VectorStoreIndex from pai_rag.utils.prompt_template import ( DEFAULT_QUESTION_GENERATION_PROMPT, + DEFAULT_MULTI_MODAL_QUESTION_GENERATION_PROMPT, DEFAULT_TEXT_QA_PROMPT_TMPL, - DEFAULT_QUESTION_GENERATION_QUERY, + DEFAULT_MULTI_MODAL_IMAGE_QA_PROMPT_TMPL, ) from llama_index.core.base.response.schema import RESPONSE_TYPE from llama_index.core.prompts.base import PromptTemplate @@ -21,6 +22,10 @@ from pai_rag.integrations.query_engine.pai_retriever_query_engine import ( PaiRetrieverQueryEngine, ) +from llama_index.multi_modal_llms.openai import OpenAIMultiModal +from llama_index.core.schema import TextNode, ImageNode +from llama_index.core.multi_modal_llms.generic_utils import load_image_urls + logger = logging.getLogger(__name__) @@ -32,15 +37,19 @@ def __init__( vector_index: VectorStoreIndex = None, query_engine: PaiRetrieverQueryEngine = None, persist_path: str = None, + enable_multi_modal: bool = False, ): self._llm = llm self._vector_index = vector_index._vector_index self._query_engine = query_engine - self.question_gen_query = DEFAULT_QUESTION_GENERATION_QUERY.format( - num_questions_per_chunk=3 - ) self.text_question_template = PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT) + self.multi_modal_question_template = PromptTemplate( + DEFAULT_MULTI_MODAL_QUESTION_GENERATION_PROMPT + ) self.text_question_answer_template = PromptTemplate(DEFAULT_TEXT_QA_PROMPT_TMPL) + self.multi_modal_question_answer_template = PromptTemplate( + DEFAULT_MULTI_MODAL_IMAGE_QA_PROMPT_TMPL + ) self.created_by = CreatedBy( type=CreatedByType.AI, model_name=self._llm.metadata.model_name ) @@ -48,6 +57,7 @@ def __init__( self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json") self._show_progress = True self._workers = 2 + self.enable_multi_modal = enable_multi_modal def load_qca_dataset(self) -> None: if os.path.exists(self.qca_dataset_path): @@ -79,11 +89,70 @@ async def agenerate_qca_dataset(self, stage): else: return await self.agenerate_labelled_qca_dataset() + async def agenerate_labelled_multimodal_qca_sample(self, node): + assert isinstance( + self._llm, OpenAIMultiModal + ), "Multi-modal LLM must be provided to understand image documents." 
+ image_url_infos = node.metadata.get("image_info_list", None) + if image_url_infos: + image_url_list = [ + image_url_info.get("image_url", None) + for image_url_info in image_url_infos + ] + image_context_str = "\n\n".join(image_url_list) + image_documents = load_image_urls(image_url_list) + + else: + image_url_list = [] + image_context_str = "" + image_documents = None + + context_str = f"{node.text}\n\n图片链接列表: \n\n{image_context_str}\n\n" + prompt_str = self.multi_modal_question_template.format( + context_str=context_str, num_questions_per_chunk=1 + ) + response = await self._llm.acomplete( + prompt=prompt_str, image_documents=image_documents + ) + result = str(response).strip().split("\n") + cleaned_questions = [ + re.sub(r"^\d+[\).\s]", "", question).strip() for question in result + ] + cleaned_questions = [ + question for question in cleaned_questions if len(question) > 0 + ] + qr_tasks = [] + for query in cleaned_questions: + prompt_str = self.multi_modal_question_answer_template.format( + context_str=context_str, query_str=query + ) + qr_task = self._llm.acomplete( + prompt=prompt_str, image_documents=image_documents + ) + qr_tasks.append(qr_task) + answer_responses: List[RESPONSE_TYPE] = await run_jobs( + qr_tasks, self._show_progress, self._workers + ) + for ( + question, + answer_response, + ) in zip(cleaned_questions, answer_responses): + sample = RagQcaSample( + query=question, + reference_answer=str(answer_response), + reference_contexts=[node.text], + reference_image_url_list=image_url_list, + reference_node_id=[node.node_id], + reference_answer_by=self.created_by, + query_by=self.created_by, + ) + return sample + async def agenerate_labelled_qca_sample(self, node): prompt_str = self.text_question_template.format( context_str=node.text, num_questions_per_chunk=1 ) - response = await self._llm.acomplete(prompt=prompt_str, image_documents=None) + response = await self._llm.acomplete(prompt=prompt_str) result = str(response).strip().split("\n") cleaned_questions = [ re.sub(r"^\d+[\).\s]", "", question).strip() for question in result @@ -96,7 +165,10 @@ async def agenerate_labelled_qca_sample(self, node): prompt_str = self.text_question_answer_template.format( context_str=node.text, query_str=query ) - qr_task = self._llm.acomplete(prompt=prompt_str, image_documents=None) + if isinstance(self._llm, OpenAIMultiModal): + qr_task = self._llm.acomplete(prompt=prompt_str, image_documents=None) + else: + qr_task = self._llm.acomplete(prompt=prompt_str) qr_tasks.append(qr_task) answer_responses: List[RESPONSE_TYPE] = await run_jobs( qr_tasks, self._show_progress, self._workers @@ -123,15 +195,51 @@ async def agenerate_labelled_qca_dataset( nodes = list(docs.values()) tasks = [] for node in nodes: - tasks.append(self.agenerate_labelled_qca_sample(node)) + if self.enable_multi_modal: + if type(node) is TextNode: + tasks.append(self.agenerate_labelled_multimodal_qca_sample(node)) + else: + tasks.append(self.agenerate_labelled_qca_sample(node)) examples = await run_jobs(tasks, self._show_progress, self._workers) labelled_qca_dataset = PaiRagQcaDataset(examples=examples, labelled=True) labelled_qca_dataset.save_json(self.qca_dataset_path) return labelled_qca_dataset.examples + async def agenerate_predicted_multimodal_qca_sample(self, qca_sample): + query_bundle = PaiQueryBundle(query_str=qca_sample.query) + response = await self._query_engine.aquery(query_bundle) + + qca_sample.predicted_answer = response.response + predicted_contexts = [] + predicted_node_id = [] + predicted_image_url_list 
= [] + for node in response.source_nodes: + if type(node.node) is TextNode: + predicted_contexts.append(node.node.text) + predicted_node_id.append(node.node.node_id) + image_url_infos = node.node.metadata.get("image_info_list", None) + if image_url_infos: + predicted_image_url_list.extend( + [ + image_url_info.get("image_url", None) + for image_url_info in image_url_infos + ] + ) + elif type(node.node) is ImageNode: + predicted_image_url_list.append( + node.node.metadata.get("image_url", None) + ) + + qca_sample.predicted_contexts = predicted_contexts + qca_sample.predicted_node_id = predicted_node_id + qca_sample.predicted_image_url_list = predicted_image_url_list + qca_sample.predicted_answer_by = self.created_by + return qca_sample + async def agenerate_predicted_qca_sample(self, qca_sample): query_bundle = PaiQueryBundle(query_str=qca_sample.query) response = await self._query_engine.aquery(query_bundle) + qca_sample.predicted_answer = response.response qca_sample.predicted_contexts = [ node.node.text for node in response.source_nodes @@ -146,7 +254,10 @@ async def agenerate_predicted_qca_dataset(self, rag_qca_dataset): print("Starting to generate QCA dataset for [[predicted]].") tasks = [] for qca_sample in rag_qca_dataset.examples: - tasks.append(self.agenerate_predicted_qca_sample(qca_sample)) + if self.enable_multi_modal: + tasks.append(self.agenerate_predicted_multimodal_qca_sample(qca_sample)) + else: + tasks.append(self.agenerate_predicted_qca_sample(qca_sample)) predicted_examples = await run_jobs(tasks, self._show_progress, self._workers) predicted_qca_dataset = PaiRagQcaDataset( examples=predicted_examples, labelled=True, predicted=True diff --git a/src/pai_rag/evaluation/metrics/response/base.py b/src/pai_rag/evaluation/metrics/response/base.py index d0bad74f..d6a4ac5a 100644 --- a/src/pai_rag/evaluation/metrics/response/base.py +++ b/src/pai_rag/evaluation/metrics/response/base.py @@ -1,17 +1,12 @@ """Llm metric for response evaluation.""" from abc import abstractmethod -from typing import Any, Optional, Sequence, Union +from typing import Any, Optional, Sequence from llama_index.core.evaluation.base import EvaluationResult from llama_index.core.llms.llm import LLM -from llama_index.core.prompts import BasePromptTemplate, PromptTemplate from llama_index.core.prompts.mixin import PromptDictType from llama_index.core.prompts.mixin import PromptMixin, PromptMixinType -DEFAULT_EVAL_TEMPLATE = PromptTemplate( - "Information: {query_str}\n" "Context: {context_str}\n" "Answer: " "Reason: " -) - class LlmMetric(PromptMixin): """ @@ -24,18 +19,11 @@ def __init__( self, llm: Optional[LLM] = None, raise_error: bool = False, - eval_template: Optional[Union[str, BasePromptTemplate]] = None, ) -> None: """Init params.""" self._llm = llm self._raise_error = raise_error - self._eval_template: BasePromptTemplate - if isinstance(eval_template, str): - self._eval_template = PromptTemplate(eval_template) - else: - self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE - def _get_prompts(self) -> PromptDictType: """Get prompts.""" return { @@ -56,8 +44,9 @@ async def parse_eval_result(self, eval_result: str) -> float: async def aevaluate( self, query: str | None = None, - response: str | None = None, + reference_answer: str | None = None, contexts: Sequence[str] | None = None, + response_answer: str | None = None, **kwargs: Any, ) -> EvaluationResult: """Run evaluation with query string, retrieved contexts, diff --git a/src/pai_rag/evaluation/metrics/response/correctness.py 
b/src/pai_rag/evaluation/metrics/response/correctness.py index 3e8b820a..423471df 100644 --- a/src/pai_rag/evaluation/metrics/response/correctness.py +++ b/src/pai_rag/evaluation/metrics/response/correctness.py @@ -1,65 +1,91 @@ """Correctness evaluation.""" import asyncio -from typing import Any, Optional, Sequence, Union +from typing import Any, Optional, Sequence, Union, List from llama_index.core.evaluation.base import EvaluationResult from llama_index.core.llms.llm import LLM from llama_index.core.prompts import ( BasePromptTemplate, - ChatMessage, - ChatPromptTemplate, - MessageRole, PromptTemplate, ) from pai_rag.evaluation.metrics.response.base import LlmMetric +from llama_index.multi_modal_llms.openai import OpenAIMultiModal +from llama_index.core.multi_modal_llms.generic_utils import load_image_urls -DEFAULT_SYSTEM_TEMPLATE = """ -You are an expert evaluation system for a question answering chatbot. - -You are given the following information: -- a user query, and -- a generated answer - -You may also be given a reference answer to use for reference in your evaluation. - -Your job is to judge the relevance and correctness of the generated answer. -Output a single score that represents a holistic evaluation. -You must return your response in a line with only the score. -Do not return answers in any other format. -On a separate line provide your reasoning for the score as well. - -Follow these guidelines for scoring: -- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best. -- If the generated answer is not relevant to the user query, \ -you should give a score of 1. -- If the generated answer is relevant but contains mistakes, \ -you should give a score between 2 and 3. -- If the generated answer is relevant and fully correct, \ -you should give a score between 4 and 5. - -Example Response: -4.0 -The generated answer has the exact same metrics as the reference answer, \ - but it is not as concise. 
- -""" - -DEFAULT_USER_TEMPLATE = """ -## User Query -{query} - -## Reference Answer -{reference_answer} - -## Generated Answer -{generated_answer} -""" - -DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate( - message_templates=[ - ChatMessage(role=MessageRole.SYSTEM, content=DEFAULT_SYSTEM_TEMPLATE), - ChatMessage(role=MessageRole.USER, content=DEFAULT_USER_TEMPLATE), - ] + +DEFAULT_EVAL_TEMPLATE = PromptTemplate( + """ + 你是一个问答聊天机器人的专家评估系统。 + 你将获得以下信息: + - 用户查询,以及 + - 生成的回答 + 你可能还会获得一个参考答案,以供评估时参考。 + 你的工作是判断生成的回答的相关性和正确性。 + 输出一个代表整体评估的单一分数。 + 你必须以仅包含分数的一行返回你的响应。 + 不要以其他格式返回答案。 + 在另一行提供你给出分数的理由。 + 请遵循以下评分指南: + - 你的分数必须在1到5之间,其中1为最低,5为最高。 + - 如果生成的回答与用户查询无关, \ + 你应该给出1分。 + - 如果生成的回答相关但包含错误, \ + 你应该给出2到3之间的分数。 + - 如果生成的回答相关且完全正确, \ + 你应该给出4到5之间的分数。 + 示例响应: + 4.0 + 生成的回答与参考答案具有完全相同的指标, \ + 但不够简洁。 + ## 用户查询 + {query} + ## 参考答案 + {reference_answer} + ## 生成的回答 + {generated_answer} + """ +) + + +DEFAULT_MULTIMODAL_EVAL_TEMPLATE = PromptTemplate( + """ + 你是一个问答聊天机器人的专家评估系统。 + 你将获得以下信息: + - 用户查询,以及 + - 生成的回答 + - 参考答案,以及 + - 参考图片链接 + + 你的工作是判断生成的回答和参考图片的相关性与正确性。 + 输出一个代表整体评估的单一分数。 + 你必须以仅包含分数的一行返回你的响应。 + 不要以其他格式返回答案。 + 在另一行提供你给出分数的理由。 + 请遵循以下评分指南: + - 你的分数必须在1到5之间,其中1为最低,5为最高。 + - 如果生成的回答与用户查询无关, + 你应该给出1分。 + - 如果生成的回答相关但包含错误, + 你应该给出2到3之间的分数。 + - 如果生成的回答相关且完全正确, + 你应该给出4到5之间的分数。 + - 另外,参考图片的质量和内容也应与用户查询和生成的回答相匹配。 + 对于相关图像,如果图像的质量高且与回答内容相关,可以增加评分。 + + 示例响应: + 4.0 + 生成的回答与参考答案具有完全相同的指标, + 但图像质量不足或不够相关。 + + ## 用户查询 + {query} + ## 参考答案 + {reference_answer} + ## 参考图片链接 + {reference_image_url_list} + ## 生成的回答 + {generated_answer} + """ ) @@ -89,14 +115,21 @@ def __init__( llm: Optional[LLM] = None, raise_error: bool = False, eval_template: Optional[Union[str, BasePromptTemplate]] = None, + multimodal_eval_template: Optional[Union[str, BasePromptTemplate]] = None, score_threshold: float = 4.0, ) -> None: + super().__init__(llm, raise_error) if isinstance(eval_template, str): - eval_template = PromptTemplate(eval_template) + self._eval_template = PromptTemplate(eval_template) else: - eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE - super().__init__(llm, raise_error, eval_template) + if isinstance(eval_template, str): + self._multimodal_eval_template = PromptTemplate(multimodal_eval_template) + else: + self._multimodal_eval_template = ( + multimodal_eval_template or DEFAULT_MULTIMODAL_EVAL_TEMPLATE + ) self._score_threshold = score_threshold @@ -117,10 +150,10 @@ def parse_eval_result(self, eval_result: str): async def aevaluate( self, - query: Optional[str] = None, - response: Optional[str] = None, - contexts: Optional[Sequence[str]] = None, - reference: Optional[str] = None, + query: str | None = None, + reference_answer: str | None = None, + contexts: Sequence[str] | None = None, + response_answer: str | None = None, sleep_time_in_seconds: int = 0, **kwargs: Any, ) -> EvaluationResult: @@ -129,15 +162,50 @@ async def aevaluate( await asyncio.sleep(sleep_time_in_seconds) - if query is None or response is None: + if query is None or response_answer is None: raise ValueError("query, and response must be provided") - raw_response = await self._llm.apredict( - prompt=self._eval_template, + prompt_str = self._eval_template.format( query=query, - generated_answer=response, - reference_answer=reference or "(NO REFERENCE ANSWER SUPPLIED)", + generated_answer=response_answer, + reference_answer=reference_answer or "(没有提供参考答案)", + ) + raw_response = await self._llm.acomplete(prompt=prompt_str) + + # Use the parser function + return 
self.parse_eval_result(str(raw_response)) + + async def aevaluate_multimodal( + self, + query: str | None = None, + reference_answer: str | None = None, + contexts: Sequence[str] | None = None, + reference_image_url_list: Optional[List[str]] = None, + response_answer: str | None = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + assert isinstance( + self._llm, OpenAIMultiModal + ), "Multi-modal LLM must be provided to understand image." + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response_answer is None: + raise ValueError("query, and response must be provided") + + prompt_str = self._multimodal_eval_template.format( + query=query, + generated_answer=response_answer, + reference_answer=reference_answer or "(没有提供参考答案)", + reference_image_url_list=reference_image_url_list or "(没有提供参考图片链接)", + ) + image_documents = load_image_urls(reference_image_url_list) + raw_response = await self._llm.acomplete( + prompt=prompt_str, image_documents=image_documents ) # Use the parser function - return self.parse_eval_result(raw_response) + return self.parse_eval_result(str(raw_response)) diff --git a/src/pai_rag/evaluation/metrics/response/faithfulness.py b/src/pai_rag/evaluation/metrics/response/faithfulness.py index 7e545963..b790066d 100644 --- a/src/pai_rag/evaluation/metrics/response/faithfulness.py +++ b/src/pai_rag/evaluation/metrics/response/faithfulness.py @@ -1,6 +1,6 @@ """Faithfulness evaluation.""" import asyncio -from typing import Any, Optional, Sequence, Union +from typing import Any, Optional, Sequence, Union, List from llama_index.core.llms.llm import LLM from llama_index.core.prompts import ( BasePromptTemplate, @@ -8,42 +8,58 @@ ) from llama_index.core.evaluation.base import EvaluationResult from pai_rag.evaluation.metrics.response.base import LlmMetric +from llama_index.multi_modal_llms.openai import OpenAIMultiModal +from llama_index.core.multi_modal_llms.generic_utils import load_image_urls DEFAULT_EVAL_TEMPLATE = PromptTemplate( - "Please tell if a given piece of information " - "is supported by the context.\n" - "You need to answer with either YES or NO.\n" - "Answer YES if any of the context supports the information, even " - "if most of the context is unrelated. " - "Some examples are provided below. \n\n" - "Information: Apple pie is generally double-crusted.\n" - "Context: An apple pie is a fruit pie in which the principal filling " - "ingredient is apples. \n" - "Apple pie is often served with whipped cream, ice cream " - "('apple pie à la mode'), custard or cheddar cheese.\n" - "It is generally double-crusted, with pastry both above " - "and below the filling; the upper crust may be solid or " - "latticed (woven of crosswise strips).\n" - "Answer: YES\n" - "Reason: The context explicitly states that 'It is generally double-crusted,' " - "which directly supports the information that 'Apple pie is generally double-crusted.' " - "Therefore, the information is confirmed by the context. \n\n" - "Information: Apple pies tastes bad.\n" - "Context: An apple pie is a fruit pie in which the principal filling " - "ingredient is apples. 
\n" - "Apple pie is often served with whipped cream, ice cream " - "('apple pie à la mode'), custard or cheddar cheese.\n" - "It is generally double-crusted, with pastry both above " - "and below the filling; the upper crust may be solid or " - "latticed (woven of crosswise strips).\n" - "Answer: NO\n" - "Reason: The context does not provide any information regarding the taste of apple pie. " - "It describes the ingredients and serving suggestions but does not support the claim that " - "'apple pies taste bad.' Therefore, the information is not supported by the context. \n" - "Information: {query_str}\n" - "Context: {context_str}\n" - "Answer: " - "Reason: " + """ + 请告诉我一段信息是否得到上下文的支持。 + 你需要回答“是”或“否”。 + 如果任何上下文支持该信息,即使大部分上下文无关,也请回答“是”。 + 下面提供了一些示例。\n\n + 信息:苹果派通常是双皮的。 + 上下文:苹果派是一种水果派,主要填充成分是苹果。 + 苹果派通常搭配鲜奶油、冰淇淋(‘苹果派à la mode’)、奶油或切达乳酪。 + 它通常是双层的,馅料上方和下方都有酥皮;上层酥皮可以是实心的或是格子状(由交叉条纹编织而成)。 + 答案:是 + 理由:上下文明确指出“它通常是双层的”,直接支持信息“苹果派通常是双皮的”。因此,该信息得到了上下文的确认。\n\n + 信息:苹果派味道不好。 + 上下文:苹果派是一种水果派,主要填充成分是苹果。 + 苹果派通常搭配鲜奶油、冰淇淋(‘苹果派à la mode’)、奶油或切达乳酪。 + 它通常是双层的,馅料上方和下方都有酥皮;上层酥皮可以是实心的或是格子状(由交叉条纹编织而成)。 + 答案:否 + 理由:上下文没有提供关于苹果派味道的任何信息。它描述了成分和搭配建议,但没有支持“苹果派味道不好”的说法。因此,该信息没有得到上下文的支持。\n + 信息:{response_str}\n + 上下文:{context_str}\n + 答案: + 理由: + """ +) + +DEFAULT_MULTIMODAL_EVAL_TEMPLATE = PromptTemplate( + """ + 请告诉我一段信息是否得到上下文的支持。 + 你需要回答“是”或“否”。 + 如果任何上下文支持该信息,即使大部分上下文无关,也请回答“是”。 + 下面提供了一些示例。\n\n + 信息:苹果派通常是双皮的。 + 上下文:苹果派是一种水果派,主要填充成分是苹果。 + 苹果派通常搭配鲜奶油、冰淇淋(‘苹果派à la mode’)、奶油或切达乳酪。 + 它通常是双层的,馅料上方和下方都有酥皮;上层酥皮可以是实心的或是格子状(由交叉条纹编织而成)。 + 答案:是 + 理由:上下文明确指出“它通常是双层的”,直接支持信息“苹果派通常是双皮的”。因此,该信息得到了上下文的确认。\n\n + 信息:苹果派味道不好。 + 上下文:苹果派是一种水果派,主要填充成分是苹果。 + 苹果派通常搭配鲜奶油、冰淇淋(‘苹果派à la mode’)、奶油或切达乳酪。 + 它通常是双层的,馅料上方和下方都有酥皮;上层酥皮可以是实心的或是格子状(由交叉条纹编织而成)。 + 答案:否 + 理由:上下文没有提供关于苹果派味道的任何信息。它描述了成分和搭配建议,但没有支持“苹果派味道不好”的说法。因此,该信息没有得到上下文的支持。\n + 信息:{response_str}\n + 上下文:{context_str}\n + 参考图片链接:{reference_image_url_list}\n + 答案: + 理由: + """ ) @@ -70,16 +86,23 @@ def __init__( llm: Optional[LLM] = None, raise_error: bool = False, eval_template: Optional[Union[str, BasePromptTemplate]] = None, + multimodal_eval_template: Optional[Union[str, BasePromptTemplate]] = None, ) -> None: + super().__init__(llm, raise_error) if isinstance(eval_template, str): - eval_template = PromptTemplate(eval_template) + self._eval_template = PromptTemplate(eval_template) else: - eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE - super().__init__(llm, raise_error, eval_template) + if isinstance(eval_template, str): + self._multimodal_eval_template = PromptTemplate(multimodal_eval_template) + else: + self._multimodal_eval_template = ( + multimodal_eval_template or DEFAULT_MULTIMODAL_EVAL_TEMPLATE + ) def parse_eval_result(self, eval_result: str): - raw_response_txt = eval_result.text.lower() + raw_response_txt = eval_result.lower() if "yes" in raw_response_txt: passing = True else: @@ -93,24 +116,67 @@ def parse_eval_result(self, eval_result: str): async def aevaluate( self, query: str | None = None, - response: str | None = None, + reference_answer: str | None = None, contexts: Sequence[str] | None = None, + response_answer: str | None = None, sleep_time_in_seconds: int = 0, **kwargs: Any, ) -> EvaluationResult: """Evaluate whether the response is faithful to the contexts.""" + del query # Unused + del reference_answer # Unused del kwargs # Unused await asyncio.sleep(sleep_time_in_seconds) - if contexts is None or response is None: + if contexts is None or response_answer is None: raise 
ValueError("contexts and response must be provided") prompt_str = self._eval_template.format( - query_str=query, + response_str=response_answer, context_str="\n".join(contexts), ) - raw_response = await self._llm.acomplete(prompt=prompt_str) + if isinstance(self._llm, OpenAIMultiModal): + raw_response = await self._llm.acomplete( + prompt=prompt_str, image_documents=None + ) + else: + raw_response = await self._llm.acomplete(prompt=prompt_str) + + # Use the parser function + return self.parse_eval_result(str(raw_response)) + + async def aevaluate_multimodal( + self, + query: str | None = None, + reference_answer: str | None = None, + contexts: Sequence[str] | None = None, + reference_image_url_list: Optional[List[str]] = None, + response_answer: str | None = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del query # Unused + del reference_answer # Unused + del kwargs # Unused + assert isinstance( + self._llm, OpenAIMultiModal + ), "Multi-modal LLM must be provided to understand image." + + await asyncio.sleep(sleep_time_in_seconds) + + if contexts is None or response_answer is None: + raise ValueError("contexts and response must be provided") + + prompt_str = self._multimodal_eval_template.format( + response_str=response_answer, + context_str="\n".join(contexts), + reference_image_url_list=reference_image_url_list or "(没有提供参考图片链接)", + ) + image_documents = load_image_urls(reference_image_url_list) + raw_response = await self._llm.acomplete( + prompt=prompt_str, image_documents=image_documents + ) # Use the parser function - return self.parse_eval_result(raw_response) + return self.parse_eval_result(str(raw_response)) diff --git a/src/pai_rag/evaluation/run_evaluation_experiments.py b/src/pai_rag/evaluation/run_evaluation_experiments.py index 7cb0826b..4c43f902 100644 --- a/src/pai_rag/evaluation/run_evaluation_experiments.py +++ b/src/pai_rag/evaluation/run_evaluation_experiments.py @@ -26,18 +26,20 @@ def calculate_md5_from_json(data): def run_experiment(exp_params): - name = exp_params["name"] - logging.info(f"Running experiment with name={name}, exp_params={exp_params}") + exp_name = exp_params["name"] + logging.info(f"Running experiment with name={exp_name}, exp_params={exp_params}") try: # 运行实验并获取结果 result = run_evaluation_pipeline( - config=exp_params["setting_file"], - data_path=exp_params["data_path"], - name=name, + config=exp_params["rag_setting_file"], + data_path=exp_params["eval_data_path"], + exp_name=exp_name, + eval_model_source=exp_params["eval_model_source"], + eval_model_name=exp_params["eval_model_name"], ) - logging.info(f"Finished experiment with name={name}") + logging.info(f"Finished experiment with name={exp_name}") except Exception as e: - logging.error(f"Error running experiment {name}: {e}") + logging.error(f"Error running experiment {exp_name}: {e}") return {"name": exp_params["name"], "parameters": exp_params, "result": result} diff --git a/src/pai_rag/evaluation/run_evaluation_pipeline.py b/src/pai_rag/evaluation/run_evaluation_pipeline.py index 67841a6b..fd799d39 100644 --- a/src/pai_rag/evaluation/run_evaluation_pipeline.py +++ b/src/pai_rag/evaluation/run_evaluation_pipeline.py @@ -2,11 +2,9 @@ import asyncio from pathlib import Path from pai_rag.core.rag_config_manager import RagConfigManager -from pai_rag.core.rag_data_loader import RagDataLoader from pai_rag.core.rag_module import ( resolve, resolve_data_loader, - resolve_llm, resolve_vector_index, resolve_query_engine, ) @@ -14,63 +12,76 @@ from 
pai_rag.integrations.llms.pai.pai_multi_modal_llm import ( PaiMultiModalLlm, ) +from pai_rag.integrations.llms.pai.pai_llm import PaiLlm from pai_rag.evaluation.evaluator.base_evaluator import BaseEvaluator import logging +from pai_rag.integrations.llms.pai.llm_config import parse_llm_config +from pai_rag.integrations.llms.pai.llm_utils import create_llm, create_multi_modal_llm + logger = logging.getLogger(__name__) _BASE_DIR = Path(__file__).parent.parent DEFAULT_APPLICATION_CONFIG_FILE = os.path.join( - _BASE_DIR, "evaluation/settings_eval.toml" + _BASE_DIR, "evaluation/settings_eval_for_text.toml" ) -def _create_data_loader( - config_file, name, enable_raptor: bool = False -) -> RagDataLoader: +def _create_components( + config_file, exp_name, eval_model_source, eval_model_name +) -> None: + """Create all components from the default config file.""" config = RagConfigManager.from_file(config_file).get_value() + mode = "image" if config.retriever.search_image else "text" + config.synthesizer.use_multimodal_llm = True if mode == "image" else False + + print(f"Creating RAG evaluation components for mode: {mode}...") config.index.vector_store.persist_path = ( - f"{config.index.vector_store.persist_path}__{name}" + f"{config.index.vector_store.persist_path}__{exp_name}" ) data_loader = resolve_data_loader(config) vector_index = resolve_vector_index(config) query_engine = resolve_query_engine(config) + eval_llm_config_data = { + "source": eval_model_source.lower(), + "model": eval_model_name, + "max_tokens": 1024, + } + eval_llm_config = parse_llm_config(eval_llm_config_data) + if mode == "text": + llm = resolve(cls=PaiLlm, llm_config=config.llm) + eval_llm = create_llm(eval_llm_config) + else: + llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) + eval_llm = create_multi_modal_llm(eval_llm_config) - return data_loader, vector_index, query_engine - - -def _create_qca_generator(config_file, name, vector_index, query_engine): - config = RagConfigManager.from_file(config_file).get_value() - multimodal_llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) - persist_path = f"{config.index.vector_store.persist_path}__{name}" qca_generator = RagQcaGenerator( - llm=multimodal_llm, + llm=llm, vector_index=vector_index, query_engine=query_engine, - persist_path=persist_path, + persist_path=config.index.vector_store.persist_path, + enable_multi_modal=True if mode == "image" else False, ) - return qca_generator - -def _create_base_evaluator(config_file, name): - config = RagConfigManager.from_file(config_file).get_value() - llm = resolve_llm(config) - persist_path = f"{config.index.vector_store.persist_path}__{name}" - return BaseEvaluator( - llm=llm, - persist_path=persist_path, + evaluator = BaseEvaluator( + llm=eval_llm, + persist_path=config.index.vector_store.persist_path, + enable_multi_modal=True if mode == "image" else False, ) + return data_loader, qca_generator, evaluator + def run_evaluation_pipeline( config=None, oss_path=None, data_path=None, pattern=None, - enable_raptor=False, - name="default", + exp_name="default", + eval_model_source=None, + eval_model_name=None, ): assert (oss_path is not None) or ( data_path is not None @@ -79,20 +90,19 @@ def run_evaluation_pipeline( data_path is None ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." 
- data_loader, vector_index, query_engine = _create_data_loader( - config, name, enable_raptor + data_loader, qca_generator, evaluator = _create_components( + config, exp_name, eval_model_source, eval_model_name ) data_loader.load_data( file_path_or_directory=data_path, filter_pattern=pattern, oss_path=oss_path, from_oss=oss_path is not None, - enable_raptor=enable_raptor, + enable_raptor=False, ) - qca_generator = _create_qca_generator(config, name, vector_index, query_engine) + _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="labelled")) _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="predicted")) - evaluator = _create_base_evaluator(config, name) retrieval_result = asyncio.run(evaluator.aevaluation(stage="retrieval")) response_result = asyncio.run(evaluator.aevaluation(stage="response")) print("retrieval_result", retrieval_result, "response_result", response_result) diff --git a/src/pai_rag/utils/prompt_template.py b/src/pai_rag/utils/prompt_template.py index f093665a..05a465a3 100644 --- a/src/pai_rag/utils/prompt_template.py +++ b/src/pai_rag/utils/prompt_template.py @@ -16,6 +16,19 @@ #08 请仔细阅读给出的内容,生成适合作为问答对数据集的{num_questions_per_chunk}个问题: """ +DEFAULT_MULTI_MODAL_QUESTION_GENERATION_PROMPT = """\ + #01 你是一个问答对数据集处理专家,擅长理解和分析多模态信息(文字和图片)。 + #02 你的任务是根据我给出的文字内容和相关图像,生成适合作为问答对数据集的问题。 + #03 问题要紧扣文件内容和图像,确保每个问题都清晰且简短。 + #04 一句话中仅包含一个问题。 + #05 生成的问题需要具体明确,能够准确反映文件内容和图像信息。 + #06 生成问题需要避免指代不明确,以下是需要避免的示例:这款产品、这些文献、这项研究等。 + #07 以下是我给出的文字内容和相关图像链接: + --------------------- + {context_str} + --------------------- + #08 请仔细阅读给出的内容和图像描述,生成适合作为问答对数据集的{num_questions_per_chunk}个问题: + """ DEFAULT_TEXT_QA_PROMPT_TMPL = """内容信息如下 --------------------- @@ -25,9 +38,19 @@ 问题: {query_str} 答案: """ -DEFAULT_QUESTION_GENERATION_QUERY = "你是一个问答对数据集处理专家。你的任务是产出 \ - {num_questions_per_chunk} 个问题。 \ - 整个文件中的问题本质上应该是多样化的。将问题限制在所提供的上下文信息范围内。" +DEFAULT_MULTI_MODAL_IMAGE_QA_PROMPT_TMPL = ( + "结合上面给出的图片和下面给出的参考材料来回答用户的问题。材料中包含一组图片链接,分别对应到前面给出的图片的地址。\n\n" + "材料:" + "---------------------\n\n" + "{context_str}\n" + "---------------------\n\n" + "请根据给定的材料回答给出的问题,回答中需要有文字描述和图片。如果材料中没有找到答案,就说没有找到相关的信息,不要编造答案。\n\n" + "如果上面有图片对你生成答案有帮助,请找到图片链接并用markdown格式给出,如![](image_url)。\n\n" + "---------------------\n\n" + "问题: {query_str}\n请返回文字和展示图片,不需要标明图片顺序" + "答案: " +) + DEFAULT_QA_GENERATE_PROMPT_TMPL_ZH = """\ 上下文信息如下。