From 292fec6e08a41b739adc0e76b3962cf78c532f02 Mon Sep 17 00:00:00 2001
From: Yuan <45984206+Yuan325@users.noreply.github.com>
Date: Mon, 18 Nov 2024 20:12:22 -0800
Subject: [PATCH] deps: Update google-cloud-aiplatform (#510)

The latest google-cloud-aiplatform release introduces breaking changes to its
evaluation API. This PR upgrades the library version and updates the
evaluation system accordingly.
---
 llm_demo/evaluation/eval_golden.py | 11 +++++--
 llm_demo/evaluation/evaluation.py  | 44 +++++++++++++--------------
 llm_demo/evaluation/metrics.py     | 49 ++++++++++++++++++++++++++++++
 llm_demo/pyproject.toml            |  6 +++-
 llm_demo/requirements.txt          |  2 +-
 5 files changed, 84 insertions(+), 28 deletions(-)
 create mode 100644 llm_demo/evaluation/metrics.py

diff --git a/llm_demo/evaluation/eval_golden.py b/llm_demo/evaluation/eval_golden.py
index bd5c445b..ee09d4bd 100644
--- a/llm_demo/evaluation/eval_golden.py
+++ b/llm_demo/evaluation/eval_golden.py
@@ -39,7 +39,8 @@ class EvalData(BaseModel):
     category: Optional[str] = Field(default=None, description="Evaluation category")
     query: Optional[str] = Field(default=None, description="User query")
     instruction: Optional[str] = Field(
-        default=None, description="Instruction to llm system"
+        default="",
+        description="Part of the input user prompt. It refers to the inference instruction that is sent to the LLM",
     )
     content: Optional[str] = Field(
         default=None,
@@ -48,16 +49,20 @@ class EvalData(BaseModel):
     tool_calls: List[ToolCall] = Field(
         default=[], description="Golden tool call for evaluation"
     )
+    prompt: Optional[str] = Field(
+        default="",
+        description="User input for the Gen AI model or application. It's optional in some cases.",
+    )
     context: Optional[List[Dict[str, Any] | List[Dict[str, Any]]]] = Field(
         default=None, description="Context given to llm in order to answer user query"
     )
     output: Optional[str] = Field(
         default=None, description="Golden output for evaluation"
     )
-    prediction_tool_calls: List[ToolCall] = Field(
+    llm_tool_calls: List[ToolCall] = Field(
         default=[], description="Tool call output from LLM"
     )
-    prediction_output: str = Field(default="", description="Final output from LLM")
+    llm_output: str = Field(default="", description="Final output from LLM")
     reset: bool = Field(
         default=True, description="Determine to reset the chat after invoke"
     )
diff --git a/llm_demo/evaluation/evaluation.py b/llm_demo/evaluation/evaluation.py
index 7772fc9d..fa6d0cac 100644
--- a/llm_demo/evaluation/evaluation.py
+++ b/llm_demo/evaluation/evaluation.py
@@ -18,19 +18,21 @@
 
 import pandas as pd
 from pydantic import BaseModel, Field
-from vertexai.preview.evaluation import EvalTask  # type: ignore
-from vertexai.preview.evaluation import _base as evaluation_base
+from vertexai.evaluation import EvalTask
+from vertexai.evaluation import _base as evaluation_base
 
 from orchestrator import BaseOrchestrator
 
 from .eval_golden import EvalData, ToolCall
+from .metrics import response_phase_metrics, retrieval_phase_metrics
 
 
 async def run_llm_for_eval(
     eval_list: List[EvalData], orc: BaseOrchestrator, session: Dict, session_id: str
 ) -> List[EvalData]:
     """
-    Generate prediction_tool_calls and prediction_output for golden dataset query.
+    Generate llm_tool_calls and llm_output for golden dataset query.
+    This function is only compatible with the langchain-tools orchestration.
""" agent = orc.get_user_session(session_id) for eval_data in eval_list: @@ -39,10 +41,10 @@ async def run_llm_for_eval( except Exception as e: print(f"error invoking agent: {e}") else: - eval_data.prediction_output = query_response.get("output") + eval_data.llm_output = query_response.get("output") - # Retrieve prediction_tool_calls from query response - prediction_tool_calls = [] + # Retrieve llm_tool_calls from query response + llm_tool_calls = [] contexts = [] for step in query_response.get("intermediate_steps"): called_tool = step[0] @@ -50,12 +52,14 @@ async def run_llm_for_eval( name=called_tool.tool, arguments=called_tool.tool_input, ) - prediction_tool_calls.append(tool_call) + llm_tool_calls.append(tool_call) context = step[-1] contexts.append(context) - eval_data.prediction_tool_calls = prediction_tool_calls + eval_data.llm_tool_calls = llm_tool_calls eval_data.context = contexts + eval_data.prompt = PROMPT + eval_data.instruction = f"Answer user query based on context given. User query is {eval_data.query}." if eval_data.reset: orc.user_session_reset(session, session_id) @@ -68,7 +72,6 @@ def evaluate_retrieval_phase( """ Run evaluation for the ability of a model to select the right tool and arguments (retrieval phase). """ - metrics = ["tool_call_quality"] # Prepare evaluation task input responses = [] references = [] @@ -85,7 +88,7 @@ def evaluate_retrieval_phase( json.dumps( { "content": e.content, - "tool_calls": [t.model_dump() for t in e.prediction_tool_calls], + "tool_calls": [t.model_dump() for t in e.llm_tool_calls], } ) ) @@ -98,7 +101,7 @@ def evaluate_retrieval_phase( # Run evaluation eval_result = EvalTask( dataset=eval_dataset, - metrics=metrics, + metrics=retrieval_phase_metrics, experiment=experiment_name, ).evaluate() return eval_result @@ -110,29 +113,24 @@ def evaluate_response_phase( """ Run evaluation for the ability of a model to generate a response based on the context given (response phase). """ - metrics = [ - "text_generation_quality", - "text_generation_factuality", - "summarization_pointwise_reference_free", - "qa_pointwise_reference_free", - ] # Prepare evaluation task input instructions = [] contexts = [] responses = [] + prompts = [] for e in eval_datas: - instructions.append( - f"Answer user query based on context given. User query is {e.query}." - ) + instructions.append(e.instruction) context_str = ( [json.dumps(c) for c in e.context] if e.context else ["no data retrieved"] ) - contexts.append(PROMPT + ", " + ", ".join(context_str)) - responses.append(e.prediction_output or "") + prompts.append(e.prompt) + contexts.append(", ".join(context_str)) + responses.append(e.llm_output or "") eval_dataset = pd.DataFrame( { "instruction": instructions, + "prompt": prompts, "context": contexts, "response": responses, } @@ -140,7 +138,7 @@ def evaluate_response_phase( # Run evaluation eval_result = EvalTask( dataset=eval_dataset, - metrics=metrics, + metrics=response_phase_metrics, experiment=experiment_name, ).evaluate() return eval_result diff --git a/llm_demo/evaluation/metrics.py b/llm_demo/evaluation/metrics.py new file mode 100644 index 00000000..d545090e --- /dev/null +++ b/llm_demo/evaluation/metrics.py @@ -0,0 +1,49 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from vertexai.evaluation import MetricPromptTemplateExamples, PointwiseMetric
+
+text_quality_metric = PointwiseMetric(
+    metric="text_quality",
+    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
+        "text_quality"
+    ),
+)
+
+summarization_quality_metric = PointwiseMetric(
+    metric="summarization_quality",
+    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
+        "summarization_quality"
+    ),
+)
+
+question_answering_quality_metric = PointwiseMetric(
+    metric="question_answering_quality",
+    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
+        "question_answering_quality"
+    ),
+)
+
+response_phase_metrics = [
+    text_quality_metric,
+    summarization_quality_metric,
+    question_answering_quality_metric,
+]
+
+retrieval_phase_metrics = [
+    "tool_call_valid",
+    "tool_name_match",
+    "tool_parameter_key_match",
+    "tool_parameter_kv_match",
+]
diff --git a/llm_demo/pyproject.toml b/llm_demo/pyproject.toml
index 7c546217..5e0dcf9e 100644
--- a/llm_demo/pyproject.toml
+++ b/llm_demo/pyproject.toml
@@ -3,4 +3,8 @@ profile = "black"
 
 [tool.mypy]
 python_version = 3.11
-warn_unused_configs = true
\ No newline at end of file
+warn_unused_configs = true
+
+[[tool.mypy.overrides]]
+module = ["vertexai.evaluation"]
+ignore_missing_imports = true
diff --git a/llm_demo/requirements.txt b/llm_demo/requirements.txt
index 6a26693c..38f96086 100644
--- a/llm_demo/requirements.txt
+++ b/llm_demo/requirements.txt
@@ -1,6 +1,6 @@
 fastapi==0.109.2
 google-auth==2.33.0
-google-cloud-aiplatform[rapid_evaluation]==1.62.0
+google-cloud-aiplatform[evaluation]==1.72.0
 itsdangerous==2.2.0
 jinja2==3.1.4
 langchain-community==0.2.9
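
For reviewers unfamiliar with the post-upgrade API, here is a minimal, self-contained sketch of the evaluation flow this patch migrates to. The project ID, location, experiment name, and one-row dataset below are placeholders, not values from this repository; only EvalTask, PointwiseMetric, MetricPromptTemplateExamples.get_prompt_template, and the column layout mirror what the patch itself uses with google-cloud-aiplatform[evaluation]==1.72.0.

# Standalone sketch (placeholder project/experiment values) of the
# post-upgrade evaluation flow that evaluate_response_phase() relies on.
import pandas as pd
import vertexai
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples, PointwiseMetric

# Placeholder GCP settings -- replace before running.
vertexai.init(project="my-project", location="us-central1")

# Model-based pointwise metric built from a bundled example prompt template,
# mirroring llm_demo/evaluation/metrics.py.
text_quality = PointwiseMetric(
    metric="text_quality",
    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
        "text_quality"
    ),
)

# Columns mirror what evaluate_response_phase() assembles from EvalData.
eval_dataset = pd.DataFrame(
    {
        "instruction": [
            "Answer user query based on context given. User query is When does flight CY 888 depart?."
        ],
        "prompt": ["<assistant system prompt>"],  # placeholder for PROMPT
        "context": ['{"flight_number": "CY 888", "departure_time": "10:00 AM"}'],
        "response": ["Flight CY 888 departs at 10:00 AM."],
    }
)

# EvalTask scores each row with every metric and logs results to the experiment.
eval_result = EvalTask(
    dataset=eval_dataset,
    metrics=[text_quality],
    experiment="response-phase-demo",  # hypothetical experiment name
).evaluate()

print(eval_result.summary_metrics)  # aggregate score per metric
print(eval_result.metrics_table)    # per-row scores and explanations

The retrieval-phase metrics ("tool_call_valid", "tool_name_match", "tool_parameter_key_match", "tool_parameter_kv_match") remain computation-based and are still passed to EvalTask as plain strings, as metrics.py does above.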