diff --git a/continuous_eval/metrics/generation/text/__init__.py b/continuous_eval/metrics/generation/text/__init__.py index 4758309..6cb69cc 100644 --- a/continuous_eval/metrics/generation/text/__init__.py +++ b/continuous_eval/metrics/generation/text/__init__.py @@ -3,14 +3,22 @@ DeterministicAnswerCorrectness, FleschKincaidReadability, ) -from continuous_eval.metrics.generation.text.semantic import ( - BertAnswerRelevance, - BertAnswerSimilarity, - DebertaAnswerScores, -) +try: + from continuous_eval.metrics.generation.text.semantic import ( + BertAnswerRelevance, + BertAnswerSimilarity, + DebertaAnswerScores, + ) +except ImportError: + pass from continuous_eval.metrics.generation.text.llm_based import ( LLMBasedFaithfulness, LLMBasedAnswerCorrectness, LLMBasedAnswerRelevance, LLMBasedStyleConsistency, -) \ No newline at end of file +) +from continuous_eval.metrics.generation.text.custom import ( + EvaluationExample, + LLMBasedCustomMetric, + ScoringFunctions, +) diff --git a/continuous_eval/metrics/generation/text/custom.py b/continuous_eval/metrics/generation/text/custom.py new file mode 100644 index 0000000..ab4e121 --- /dev/null +++ b/continuous_eval/metrics/generation/text/custom.py @@ -0,0 +1,90 @@ +import re +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +from continuous_eval.llm_factory import LLMInterface +from continuous_eval.metrics.base import LLMBasedMetric + + +def _numeric_matcher(input_val, min_val, max_val): + pattern = r"\d+(?:\.\d+)?" # Match any number (integer or float) + matches = re.findall(pattern, input_val) + if not matches: + raise ValueError(f"Could not find a number in the input: {input_val}") + return max(min_val, min(max_val, float(matches[0]))) + + +class ScoringFunctions: + @staticmethod + def Numeric( + min_val: Union[int, float] = 1, + max_val: Union[int, float] = 5, + ): + assert min_val < max_val, "min_val should be less than max_val" + return lambda input_val: _numeric_matcher(input_val, min_val, max_val) + + @staticmethod + def Identity(value: str): + return value + + +@dataclass +class EvaluationExample: + input: Union[str, Dict[str, Any]] + score: Any + justification: str + + def __str__(self): + in_str = self.input if isinstance(self.input, str) else "\n".join([f"{k}: {v}" for k, v in self.input.items()]) + return f"Input: {in_str}\nScore: {self.score}\nJustification: {self.justification}" + + +class LLMBasedCustomMetric(LLMBasedMetric): + def __init__( + self, + name: str, + definition: str, + scoring_rubric: str, + scoring_function: Callable = ScoringFunctions.Identity, + model: Optional[LLMInterface] = None, + model_parameters: Dict[str, Any] = dict(), + examples: Optional[List[EvaluationExample]] = None, + ): + super().__init__(model) + assert name, "Name is required" + assert definition, "Definition is required" + assert scoring_rubric, "Scoring rubric is required" + assert scoring_function is not None, "Scoring function is required" + self._name = name + self._definition = definition + self._scoring_rubric = scoring_rubric + self._scoring_function = scoring_function + self._model_parameters = model_parameters + self._examples = examples + + @property + def name(self): + return self._name + + def _build_prompt(self, **kwargs): + prompt = {"system_prompt": "", "user_prompt": ""} + prompt[ + "system_prompt" + ] = "You are an expert evaluator.
The user will provide a description of the criteria and grading instructions, you will apply them with objectivity.\n" + prompt["user_prompt"] = ( + "CRITERIA: \n" + self._definition + "\n\n" + "GRADING INSTRUCTIONS: \n" + self._scoring_rubric + ) + if self._examples: + prompt["user_prompt"] += "\n\nEXAMPLES: \n" + for example in self._examples: + prompt["user_prompt"] += str(example) + prompt["user_prompt"] += "\n\n" + prompt["user_prompt"] += "Following the instructions, evaluate this:\n" + for argname, argval in kwargs.items(): + prompt["user_prompt"] += f"{argname}: {argval}\n" + return prompt + + def __call__(self, **kwargs): + res = self._llm.run(prompt=self._build_prompt(**kwargs), **self._model_parameters) + score = self._scoring_function(res) + return {f"{self.name}_score": score, f"{self.name}_reasoning": res} diff --git a/continuous_eval/metrics/generation/text/semantic.py b/continuous_eval/metrics/generation/text/semantic.py index e5f791f..97a6ac1 100644 --- a/continuous_eval/metrics/generation/text/semantic.py +++ b/continuous_eval/metrics/generation/text/semantic.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Dict, List +from typing import Dict, List import pandas as pd diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs index be4c34d..0d82cc7 100644 --- a/docs/astro.config.mjs +++ b/docs/astro.config.mjs @@ -9,7 +9,7 @@ export default defineConfig({ site: 'https://docs.relari.ai', base: '/v0.3', outDir: './dist/v0.3', - trailingSlash: "always", + trailingSlash: "never", markdown: { remarkPlugins: [remarkMath], rehypePlugins: [rehypeMathjax], diff --git a/docs/src/content/docs/getting-started/Introduction.md b/docs/src/content/docs/getting-started/Introduction.md index 9c34576..79274ca 100644 --- a/docs/src/content/docs/getting-started/Introduction.md +++ b/docs/src/content/docs/getting-started/Introduction.md @@ -9,10 +9,9 @@ sidebar: ## What is continuous-eval? -`continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines. - - +`continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines. + ## How is continuous-eval different? diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx index bc7f2a2..1aca41a 100644 --- a/docs/src/content/docs/index.mdx +++ b/docs/src/content/docs/index.mdx @@ -14,7 +14,7 @@ import { Icon } from '@astrojs/starlight/components'; @@ -24,22 +24,22 @@ import { Icon } from '@astrojs/starlight/components'; diff --git a/docs/src/content/docs/metrics/Generation/LLM-Based/custom.md b/docs/src/content/docs/metrics/Generation/LLM-Based/custom.md new file mode 100644 index 0000000..c6a9736 --- /dev/null +++ b/docs/src/content/docs/metrics/Generation/LLM-Based/custom.md @@ -0,0 +1,100 @@ +--- +title: LLM-based Custom Metric +--- + +### Definition + +The class `LLMBasedCustomMetric` is a base class for creating custom LLM-based metrics. + +It requires: + +- `name`: a string used to identify the metric +- `definition`: the definition of the criteria +- `scoring_rubric`: the scoring rubric +- `scoring_function`: a function used to parse the LLM output and return a score +- `model`: an instance of `LLMInterface` (or `None` to use the default model) +- `model_parameters`: optional, a dictionary of any additional parameters to pass to the model +- `examples`: optional, a list of `EvaluationExample` objects + +The class `EvaluationExample` is used to define examples for the metric. 
It requires: + +- `input`: a string or a dictionary of the example input required by the metric +- `score`: the score the LLM should return for this example +- `justification`: a string explaining the expected score + +### Example Usage + +Let's create a custom metric to assess the conciseness of the answer to the question. +We will use the `LLMBasedCustomMetric` class to define the metric and the `EvaluationExample` class to define examples for the metric. +The metric will take the question and the generated answer as input and return a score between 1 and 3, where 1 means that the answer is too verbose and 3 means that the answer is concise. + +Let's start by defining the examples: + +```python +example_score_1 = EvaluationExample( + { + "question": " What causes sea breezes?", + "answer": "To understand what causes sea breezes, it's important to start by recognizing that the Earth is made up of various surfaces, such as land and water, which both play a significant role in the way our climate and weather patterns are formed. Now, during the daylight hours, what happens is quite fascinating. The sun, which is our primary source of light and heat, shines down upon the Earth's surface. However, not all surfaces on Earth respond to this heat in the same way. Specifically, land tends to heat up much more quickly and to a higher degree compared to water. This discrepancy in heating rates is crucial because it leads to differences in air pressure. Warmer air is less dense and tends to rise, whereas cooler air is more dense and tends to sink. So, as the land heats up, the air above it becomes warmer and rises, creating a kind of vacuum that needs to be filled. Consequently, the cooler, denser air over the water begins to move towards the land to fill this space. This movement of air from the sea to the land is what we experience as a sea breeze. It's a fascinating process that not only demonstrates the dynamic nature of our planet's climate system but also highlights the intricate interplay between the sun, the Earth's surface, and the atmosphere above it.", + }, + score=1, + justification="This answer would score lower on conciseness. While it is informative and covers the necessary scientific principles, it contains a significant amount of introductory and explanatory material that, while interesting, is not essential to answering the specific question about the cause of sea breezes.", +) + +example_score_2 = EvaluationExample( + { + "question": "What causes sea breezes?", + "answer": "Sea breezes are a result of the interesting interplay between the heating rates of land and water. Essentially, during the sunlit hours, land heats up much more rapidly compared to the ocean. This difference in heating leads to a variation in air pressure; as the warmer air over the land rises due to its lower density, a pressure difference is created. Cooler air from the sea, being denser, moves towards the land to balance this pressure difference. However, it’s not just about temperature and pressure; the Earth’s rotation also plays a part in directing the breeze, adding a slight twist to the direction the breeze comes from. This natural phenomenon is quite essential, contributing to local weather patterns and offering relief on hot days along coastal areas.", + }, + score=2, + justification="This answer would receive a score of 2 for conciseness. It provides a more detailed explanation than necessary for a straightforward question but does not delve into excessive verbosity. 
The answer introduces the basic concept accurately and includes relevant details about the cause of sea breezes. However, it also incorporates additional information about the Earth's rotation, which, while related, is not strictly necessary to understand the fundamental cause of sea breezes.", +) + +example_score_3 = EvaluationExample( + input={ + "question": "What causes sea breezes?", + "answer": "Sea breezes are caused by the differential heating of land and sea. During the day, land heats up faster than water, creating a pressure difference that drives cooler air from the sea towards the land.", + }, + score=3, + justification="This answer receives a high score for conciseness. It directly addresses the question without unnecessary details, providing the essential explanation in a clear and straightforward manner.", +) +``` + +Now, let's define the custom metric: + +```python +metric = LLMBasedCustomMetric( + name="Conciseness", + definition="Conciseness in communication refers to the expression of ideas in a clear and straightforward manner, using the fewest possible words without sacrificing clarity or completeness of information. It involves eliminating redundancy, verbosity, and unnecessary details, focusing instead on delivering the essential message efficiently. ", + scoring_rubric="""Use the following rubric to assign a score to the answer based on its conciseness: +- Score 1: The answer is overly verbose, containing a significant amount of unnecessary information, repetition, or redundant expressions that do not contribute to the understanding of the topic. +- Score 2: The answer includes some unnecessary details or slightly repetitive information, but the excess does not severely hinder understanding. +- Score 3: The answer is clear, direct, and to the point, with no unnecessary words, details, or repetition.""", + scoring_function=ScoringFunctions.Numeric(min_val=1, max_val=3), + model_parameters={"temperature": 0}, + examples=[example_score_1, example_score_2, example_score_3], +) +``` + +Then we can use the metric to evaluate the conciseness of a generated answer: + +```python +datum = { + "question": "What causes seasons to change?", + "answer": "The change in seasons is primarily caused by the Earth's tilt on its axis combined with its orbit around the Sun. This tilt leads to variations in the angle and intensity of sunlight reaching different parts of Earth at different times of the year.", +} + +print(metric(**datum)) +``` + +With the following output: + +```JSON +{ + 'Conciseness_score': 3, + 'Conciseness_reasoning': "Score: 3\nJustification: The answer is concise, clear, and directly addresses the question without any unnecessary details. It provides a straightforward explanation of how the Earth's tilt on its axis and its orbit around the Sun cause the change in seasons." +} +``` + +:::note +When using a custom metric in a `Pipeline` class, remember to use the `Metric` method `use` to register metric inputs, for example `metric.use(question=dataset.question, answer=dataset.answer)`. +::: diff --git a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md index cb7ec9f..6399ac2 100644 --- a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md +++ b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md @@ -9,20 +9,16 @@ Context Precision is used to measure information density.
$$ \text{LLM-Based Context Precision} = \frac{ - \text{Number of Relevant Chunks in Retrieved Sentences} + \text{Number of Relevant Chunks in Retrieved Contexts} }{ - \text{Total Number of Sentences in Retrieved Contexts} + \text{Total Number of Chunks in Retrieved Contexts} } $$ +$$ +\text{LLM-Based Average Precision (AP)} = \frac{1}{\text{Number of Relevant Chunks}} \sum_{j \,:\, \text{chunk at rank } j \text{ is relevant}} \text{Precision at Rank } j $$ -\text{LLM-Based Context Average Precision} = -\frac{ - \text{Number of Relevant Chunks in Retrieved Sentences} -}{ - \text{Total Number of Sentences in Retrieved Contexts} -} -$$ + ### Example Usage diff --git a/examples/llm_custom_criteria.py b/examples/llm_custom_criteria.py new file mode 100644 index 0000000..0e6ffe0 --- /dev/null +++ b/examples/llm_custom_criteria.py @@ -0,0 +1,51 @@ +from continuous_eval.metrics.generation.text import EvaluationExample, LLMBasedCustomMetric, ScoringFunctions + +# In this example we want to create a custom metric to evaluate the conciseness of a given answer to a question. +# We will use a scale from 1 to 3; let's define an example for each score (we will use them to define the metric later) +example_score_3 = EvaluationExample( + input={ + "question": "What causes sea breezes?", + "answer": "Sea breezes are caused by the differential heating of land and sea. During the day, land heats up faster than water, creating a pressure difference that drives cooler air from the sea towards the land.", + }, + score=3, + justification="This answer receives a high score for conciseness. It directly addresses the question without unnecessary details, providing the essential explanation in a clear and straightforward manner.", +) + +example_score_2 = EvaluationExample( + { + "question": "What causes sea breezes?", + "answer": "Sea breezes are a result of the interesting interplay between the heating rates of land and water. Essentially, during the sunlit hours, land heats up much more rapidly compared to the ocean. This difference in heating leads to a variation in air pressure; as the warmer air over the land rises due to its lower density, a pressure difference is created. Cooler air from the sea, being denser, moves towards the land to balance this pressure difference. However, it’s not just about temperature and pressure; the Earth’s rotation also plays a part in directing the breeze, adding a slight twist to the direction the breeze comes from. This natural phenomenon is quite essential, contributing to local weather patterns and offering relief on hot days along coastal areas.", + }, + score=2, + justification="This answer would receive a score of 2 for conciseness. It provides a more detailed explanation than necessary for a straightforward question but does not delve into excessive verbosity. The answer introduces the basic concept accurately and includes relevant details about the cause of sea breezes. However, it also incorporates additional information about the Earth's rotation, which, while related, is not strictly necessary to understand the fundamental cause of sea breezes.", +) + +example_score_1 = EvaluationExample( + { + "question": " What causes sea breezes?", + "answer": "To understand what causes sea breezes, it's important to start by recognizing that the Earth is made up of various surfaces, such as land and water, which both play a significant role in the way our climate and weather patterns are formed. Now, during the daylight hours, what happens is quite fascinating.
The sun, which is our primary source of light and heat, shines down upon the Earth's surface. However, not all surfaces on Earth respond to this heat in the same way. Specifically, land tends to heat up much more quickly and to a higher degree compared to water. This discrepancy in heating rates is crucial because it leads to differences in air pressure. Warmer air is less dense and tends to rise, whereas cooler air is more dense and tends to sink. So, as the land heats up, the air above it becomes warmer and rises, creating a kind of vacuum that needs to be filled. Consequently, the cooler, denser air over the water begins to move towards the land to fill this space. This movement of air from the sea to the land is what we experience as a sea breeze. It's a fascinating process that not only demonstrates the dynamic nature of our planet's climate system but also highlights the intricate interplay between the sun, the Earth's surface, and the atmosphere above it.", + }, + score=1, + justification="This answer would score lower on conciseness. While it is informative and covers the necessary scientific principles, it contains a significant amount of introductory and explanatory material that, while interesting, is not essential to answering the specific question about the cause of sea breezes.", +) + +# Let's initialize the metric +metric = LLMBasedCustomMetric( + name="Conciseness", + definition="Conciseness in communication refers to the expression of ideas in a clear and straightforward manner, using the fewest possible words without sacrificing clarity or completeness of information. It involves eliminating redundancy, verbosity, and unnecessary details, focusing instead on delivering the essential message efficiently. ", + scoring_rubric="""Use the following rubric to assign a score to the answer based on its conciseness: +- Score 1: The answer is overly verbose, containing a significant amount of unnecessary information, repetition, or redundant expressions that do not contribute to the understanding of the topic. +- Score 2: The answer includes some unnecessary details or slightly repetitive information, but the excess does not severely hinder understanding. +- Score 3: The answer is clear, direct, and to the point, with no unnecessary words, details, or repetition.""", + scoring_function=ScoringFunctions.Numeric(min_val=1, max_val=3), + model_parameters={"temperature": 0}, + examples=[example_score_1, example_score_2, example_score_3], +) + +# Let's calculate the metric for a sample datum +datum = { + "question": "What causes seasons to change?", + "answer": "The change in seasons is primarily caused by the Earth's tilt on its axis combined with its orbit around the Sun. This tilt leads to variations in the angle and intensity of sunlight reaching different parts of Earth at different times of the year.", +} + +print(metric(**datum)) diff --git a/pyproject.toml b/pyproject.toml index 70a07ea..416162e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "continuous-eval" -version = "0.3.3" +version = "0.3.4" description = "Open-Source Evaluation for GenAI Application Pipelines." authors = ["Yi Zhang ", "Pasquale Antonante "] readme = "README.md"
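For reference, the snippet below is a minimal sketch (not part of the patch) exercising the two pieces of `custom.py` that work without an LLM call: `ScoringFunctions.Numeric`, which extracts the first number from the raw model output and clamps it to the configured range, and `EvaluationExample`, whose string form is what `_build_prompt` appends to the user prompt. The raw output strings are made-up examples, not actual model responses.

```python
from continuous_eval.metrics.generation.text import EvaluationExample, ScoringFunctions

# ScoringFunctions.Numeric(min_val, max_val) returns a parser that pulls the
# first number out of the LLM output and clamps it to [min_val, max_val].
parse_score = ScoringFunctions.Numeric(min_val=1, max_val=3)
print(parse_score("Score: 3\nJustification: The answer is concise."))  # -> 3
print(parse_score("I would rate this 2.5 out of 3."))                  # -> 2.5
print(parse_score("Overall score: 10, far too generous."))             # -> 3 (clamped to max_val)

# EvaluationExample renders as "Input: ...\nScore: ...\nJustification: ...",
# which is how few-shot examples are embedded in the user prompt.
example = EvaluationExample(
    input={"question": "What causes sea breezes?", "answer": "Differential heating of land and sea."},
    score=3,
    justification="Concise and directly answers the question.",
)
print(example)
```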