Feature/llm custom metric #48

Merged · 5 commits · Mar 17, 2024
Changes from all commits
20 changes: 14 additions & 6 deletions continuous_eval/metrics/generation/text/__init__.py
@@ -3,14 +3,22 @@
     DeterministicAnswerCorrectness,
     FleschKincaidReadability,
 )
-from continuous_eval.metrics.generation.text.semantic import (
-    BertAnswerRelevance,
-    BertAnswerSimilarity,
-    DebertaAnswerScores,
-)
+try:
+    from continuous_eval.metrics.generation.text.semantic import (
+        BertAnswerRelevance,
+        BertAnswerSimilarity,
+        DebertaAnswerScores,
+    )
+except ImportError:
+    pass
 from continuous_eval.metrics.generation.text.llm_based import (
     LLMBasedFaithfulness,
     LLMBasedAnswerCorrectness,
     LLMBasedAnswerRelevance,
     LLMBasedStyleConsistency,
-)
+)
+from continuous_eval.metrics.generation.text.custom import (
+    EvaluationExample,
+    LLMBasedCustomMetric,
+    ScoringFunctions,
+)
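
The `try`/`except` above makes the semantic metrics optional: if their (presumably heavy) model dependencies are unavailable, the package still imports and simply does not expose those names. A minimal sketch of how downstream code might guard for that, assuming the package layout in this PR; the `HAS_SEMANTIC` flag is illustrative and not part of the library:

```python
# Illustrative guard for the now-optional semantic metrics (not part of this PR).
try:
    from continuous_eval.metrics.generation.text import BertAnswerSimilarity
    HAS_SEMANTIC = True
except ImportError:
    HAS_SEMANTIC = False

if HAS_SEMANTIC:
    metric = BertAnswerSimilarity()  # construct only when the extras are installed
```
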
90 changes: 90 additions & 0 deletions continuous_eval/metrics/generation/text/custom.py
@@ -0,0 +1,90 @@
import re
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union

from continuous_eval.llm_factory import LLMInterface
from continuous_eval.metrics.base import LLMBasedMetric


def _numeric_matcher(input_val, min_val, max_val):
    # Extract the first number (integer or float) from the LLM output and
    # clamp it to the [min_val, max_val] range.
    pattern = r"\d+(?:\.\d+)?"
    matches = re.findall(pattern, input_val)
    if not matches:
        raise ValueError(f"Could not find a number in the input: {input_val}")
    return max(min_val, min(max_val, float(matches[0])))


class ScoringFunctions:
    @staticmethod
    def Numeric(
        min_val: Union[int, float] = 1,
        max_val: Union[int, float] = 5,
    ):
        # Return a parser that maps the raw completion to a number in [min_val, max_val].
        assert min_val < max_val, "min_val should be less than max_val"
        return lambda input_val: _numeric_matcher(input_val, min_val, max_val)

    @staticmethod
    def Identity(value: str):
        # Return the raw completion unchanged.
        return value


@dataclass
class EvaluationExample:
    # A few-shot example: the metric input(s), the expected score, and why.
    input: Union[str, Dict[str, Any]]
    score: Any
    justification: str

    def __str__(self):
        in_str = self.input if isinstance(self.input, str) else "\n".join([f"{k}: {v}" for k, v in self.input.items()])
        return f"Input: {in_str}\nScore: {self.score}\nJustification: {self.justification}"


class LLMBasedCustomMetric(LLMBasedMetric):
    def __init__(
        self,
        name: str,
        definition: str,
        scoring_rubric: str,
        scoring_function: Callable = ScoringFunctions.Identity,
        model: Optional[LLMInterface] = None,
        model_parameters: Dict[str, Any] = dict(),
        examples: Optional[List[EvaluationExample]] = None,
    ):
        super().__init__(model)
        assert name, "Name is required"
        assert definition, "Definition is required"
        assert scoring_rubric, "Scoring rubric is required"
        assert scoring_function is not None, "Scoring function is required"
        self._name = name
        self._definition = definition
        self._scoring_rubric = scoring_rubric
        self._scoring_function = scoring_function
        self._model_parameters = model_parameters
        self._examples = examples

    @property
    def name(self):
        return self._name

    def _build_prompt(self, **kwargs):
        # Assemble the evaluator prompt from the criteria definition, the scoring
        # rubric, optional few-shot examples, and the inputs to evaluate.
        prompt = {"system_prompt": "", "user_prompt": ""}
        prompt["system_prompt"] = (
            "You are an expert evaluator. The user will provide a description of the criteria "
            "and grading instructions, you will apply them with objectivity.\n"
        )
        prompt["user_prompt"] = (
            "CRITERIA: \n" + self._definition + "\n\n" + "GRADING INSTRUCTIONS: \n" + self._scoring_rubric
        )
        if self._examples:
            prompt["user_prompt"] += "\n\nEXAMPLES: \n"
            for example in self._examples:
                prompt["user_prompt"] += str(example)
                prompt["user_prompt"] += "\n\n"
        prompt["user_prompt"] += "Following the instructions, evaluate this:\n"
        for argname, argval in kwargs.items():
            prompt["user_prompt"] += f"{argname}: {argval}\n"
        return prompt

    def __call__(self, **kwargs):
        # Run the LLM, then parse its completion into a score with the scoring function.
        res = self._llm.run(prompt=self._build_prompt(**kwargs), **self._model_parameters)
        score = self._scoring_function(res)
        return {f"{self.name}_score": score, f"{self.name}_reasoning": res}
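
As a quick illustration of the parser above: `ScoringFunctions.Numeric` extracts the first number in the completion and clamps it to the configured range. The strings below are hypothetical LLM outputs, not taken from this PR:

```python
from continuous_eval.metrics.generation.text import ScoringFunctions

parse = ScoringFunctions.Numeric(min_val=1, max_val=3)
print(parse("Score: 3\nJustification: concise and complete."))  # 3.0 (first number found)
print(parse("I would rate this 2.5 out of 3."))                 # 2.5
print(parse("10/10, flawless."))                                # 3.0 (10 clamped to max_val)
```
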
2 changes: 1 addition & 1 deletion continuous_eval/metrics/generation/text/semantic.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import Any, Dict, List
+from typing import Dict, List

 import pandas as pd

2 changes: 1 addition & 1 deletion docs/astro.config.mjs
@@ -9,7 +9,7 @@ export default defineConfig({
   site: 'https://docs.relari.ai',
   base: '/v0.3',
   outDir: './dist/v0.3',
-  trailingSlash: "always",
+  trailingSlash: "never",
   markdown: {
     remarkPlugins: [remarkMath],
     rehypePlugins: [rehypeMathjax],
5 changes: 2 additions & 3 deletions docs/src/content/docs/getting-started/Introduction.md
@@ -9,10 +9,9 @@

 ## What is continuous-eval?

 `continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines.

-<img src="../../module-level-eval.png"></img>
+<img src="/v0.3/module-level-eval.png"></img>

## How is continuous-eval different?

10 changes: 5 additions & 5 deletions docs/src/content/docs/index.mdx
@@ -14,7 +14,7 @@ import { Icon } from '@astrojs/starlight/components';
 <LinkCard
   title="🚀 Getting Started"
   description="Install the package and learn how to get started quickly."
-  href="./getting-started/introduction/"
+  href="v0.3/getting-started/introduction/"
 />
 </CardGrid>

@@ -24,22 +24,22 @@ import { Icon } from '@astrojs/starlight/components';
 <LinkCard
   title="🚰 Pipeline"
   description="Define your GenAI application pipeline and run evaluation over a tailored dataset."
-  href="./pipeline/pipeline/"
+  href="v0.3/pipeline/pipeline/"
 />
 <LinkCard
   title="📊 Metrics"
   description="Explore the available metrics and learn how to combine multiple metrics effectively."
-  href="./metrics/overview/"
+  href="v0.3/metrics/overview/"
 />
 <LinkCard
   title="🔍 Datasets"
   description="Explore sample datasets and try generating a synthetic evaluation dataset from documents."
-  href="./dataset/example_datasets/"
+  href="v0.3/dataset/example_datasets/"
 />
 <LinkCard
   title="💡 Examples"
   description="Discover code snippets and examples to help you understand and implement different evaluation pipelines."
-  href="./examples/basics/0_single_metric/"
+  href="v0.3/examples/basics/0_single_metric/"
 />
 </CardGrid>

100 changes: 100 additions & 0 deletions docs/src/content/docs/metrics/Generation/LLM-Based/custom.md
@@ -0,0 +1,100 @@
---
title: LLM-based Custom Metric
---

### Definition

The class `LLMBasedCustomMetric` is a base class for creating custom LLM-based metrics.

It requires:

- `name`: a string used to identify the metric
- `definition`: the definition of the criteria
- `scoring_rubric`: the scoring rubric
- `scoring_function`: a function used to parse the LLM output and return a score (a minimal sketch of a custom one follows this list)
- `model`: an instance of `LLMInterface` (or `None` to use the default model)
- `model_parameters`: optional, a dictionary of any additional parameters to pass to the model
- `examples`: optional, a list of `EvaluationExample` objects
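
Any callable that maps the raw model completion to a score can serve as `scoring_function`; `ScoringFunctions.Numeric` and `ScoringFunctions.Identity` ship with the package. A minimal sketch of a custom one, assuming (hypothetically) that the rubric asks the model to answer with a letter grade:

```python
import re

# Hypothetical custom scoring function: map a letter grade (A/B/C) in the
# LLM output to a number. Not part of continuous-eval; shown only to
# illustrate the expected signature (raw completion in, score out).
def letter_grade(output: str) -> int:
    match = re.search(r"\b([ABC])\b", output)
    if match is None:
        raise ValueError(f"Could not find a grade in: {output}")
    return {"A": 3, "B": 2, "C": 1}[match.group(1)]
```

It would then be passed to the metric as `scoring_function=letter_grade`.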

The class `EvaluationExample` is used to define examples for the metric. It requires:

- `input`: a string or a dictionary of the example input required by the metric
- `score`: the score the LLM should return for this example
- `justification`: a string explaining the expected score

### Example Usage

Let's create a custom metric to assess the conciseness of an answer to a question.
We will use the `LLMBasedCustomMetric` class to define the metric and the `EvaluationExample` class to define examples for the metric.
The metric will take the question and the generated answer as input and return a score between 1 and 3, where 1 means that the answer is too verbose and 3 means that the answer is concise.

Let's start by defining the examples:

```python
example_score_1 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "To understand what causes sea breezes, it's important to start by recognizing that the Earth is made up of various surfaces, such as land and water, which both play a significant role in the way our climate and weather patterns are formed. Now, during the daylight hours, what happens is quite fascinating. The sun, which is our primary source of light and heat, shines down upon the Earth's surface. However, not all surfaces on Earth respond to this heat in the same way. Specifically, land tends to heat up much more quickly and to a higher degree compared to water. This discrepancy in heating rates is crucial because it leads to differences in air pressure. Warmer air is less dense and tends to rise, whereas cooler air is more dense and tends to sink. So, as the land heats up, the air above it becomes warmer and rises, creating a kind of vacuum that needs to be filled. Consequently, the cooler, denser air over the water begins to move towards the land to fill this space. This movement of air from the sea to the land is what we experience as a sea breeze. It's a fascinating process that not only demonstrates the dynamic nature of our planet's climate system but also highlights the intricate interplay between the sun, the Earth's surface, and the atmosphere above it.",
    },
    score=1,
    justification="This answer would score lower on conciseness. While it is informative and covers the necessary scientific principles, it contains a significant amount of introductory and explanatory material that, while interesting, is not essential to answering the specific question about the cause of sea breezes.",
)

example_score_2 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "Sea breezes are a result of the interesting interplay between the heating rates of land and water. Essentially, during the sunlit hours, land heats up much more rapidly compared to the ocean. This difference in heating leads to a variation in air pressure; as the warmer air over the land rises due to its lower density, a pressure difference is created. Cooler air from the sea, being denser, moves towards the land to balance this pressure difference. However, it’s not just about temperature and pressure; the Earth’s rotation also plays a part in directing the breeze, adding a slight twist to the direction the breeze comes from. This natural phenomenon is quite essential, contributing to local weather patterns and offering relief on hot days along coastal areas.",
    },
    score=2,
    justification="This answer would receive a score of 2 for conciseness. It provides a more detailed explanation than necessary for a straightforward question but does not delve into excessive verbosity. The answer introduces the basic concept accurately and includes relevant details about the cause of sea breezes. However, it also incorporates additional information about the Earth's rotation, which, while related, is not strictly necessary to understand the fundamental cause of sea breezes.",
)

example_score_3 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "Sea breezes are caused by the differential heating of land and sea. During the day, land heats up faster than water, creating a pressure difference that drives cooler air from the sea towards the land.",
    },
    score=3,
    justification="This answer receives a high score for conciseness. It directly addresses the question without unnecessary details, providing the essential explanation in a clear and straightforward manner.",
)
```

Now, let's define the custom metric:

```python
metric = LLMBasedCustomMetric(
    name="Conciseness",
    definition="Conciseness in communication refers to the expression of ideas in a clear and straightforward manner, using the fewest possible words without sacrificing clarity or completeness of information. It involves eliminating redundancy, verbosity, and unnecessary details, focusing instead on delivering the essential message efficiently.",
    scoring_rubric="""Use the following rubric to assign a score to the answer based on its conciseness:
- Score 1: The answer is overly verbose, containing a significant amount of unnecessary information, repetition, or redundant expressions that do not contribute to the understanding of the topic.
- Score 2: The answer includes some unnecessary details or slightly repetitive information, but the excess does not severely hinder understanding.
- Score 3: The answer is clear, direct, and to the point, with no unnecessary words, details, or repetition.""",
    scoring_function=ScoringFunctions.Numeric(min_val=1, max_val=3),
    model_parameters={"temperature": 0},
    examples=[example_score_1, example_score_2, example_score_3],
)
```

Then we can use the metric to evaluate the conciseness of the generated answers:

```python
datum = {
    "question": "What causes seasons to change?",
    "answer": "The change in seasons is primarily caused by the Earth's tilt on its axis combined with its orbit around the Sun. This tilt leads to variations in the angle and intensity of sunlight reaching different parts of Earth at different times of the year.",
}

print(metric(**datum))
```

With the following output:

```json
{
  'Conciseness_score': 3,
  'Conciseness_reasoning': "Score: 3\nJustification: The answer is concise, clear, and directly addresses the question without any unnecessary details. It provides a straightforward explanation of how the Earth's tilt on its axis and its orbit around the Sun cause the change in seasons."
}
```
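
Note how `ScoringFunctions.Numeric(min_val=1, max_val=3)` produced `Conciseness_score` by extracting the first number (`3`) from the raw completion stored in `Conciseness_reasoning`.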

:::note
When using a custom metric in a `Pipeline` class, remember to use the `Metric` method `use` to register the metric inputs, for example `metric.use(question=dataset.question, answer=dataset.answer)`.
:::
@@ -9,20 +9,16 @@ Context Precision is used to measure information density.

 $$
 \text{LLM-Based Context Precision} =
 \frac{
-\text{Number of Relevant Chunks in Retrieved Sentences}
+\text{Number of Relevant Chunks in Retrieved Contexts}
 }{
-\text{Total Number of Sentences in Retrieved Contexts}
+\text{Total Number of Chunks in Retrieved Contexts}
 }
 $$

-$$
-\text{LLM-Based Average Precision (AP)} = \frac{1}{\text{Number of Relevant Chunks}} \sum_{j=1}^{\text{Number of Retrieved Contexts}} \text{Precision at Rank } j
-$$
 $$
 \text{LLM-Based Context Average Precision} =
 \frac{
 \text{Number of Relevant Chunks in Retrieved Sentences}
 }{
 \text{Total Number of Sentences in Retrieved Contexts}
 }
 $$
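
As an illustrative instance (hypothetical numbers): if the evaluator judges 3 of the 5 retrieved chunks relevant, then

$$
\text{LLM-Based Context Precision} = \frac{3}{5} = 0.6
$$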


### Example Usage

51 changes: 51 additions & 0 deletions examples/llm_custom_criteria.py
@@ -0,0 +1,51 @@
from continuous_eval.metrics.generation.text import EvaluationExample, LLMBasedCustomMetric, ScoringFunctions

# In this example we create a custom metric to evaluate the conciseness of a given answer to a question.
# We use a scale from 1 to 3; let's define an example for each score (we will use them to define the metric later).
example_score_3 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "Sea breezes are caused by the differential heating of land and sea. During the day, land heats up faster than water, creating a pressure difference that drives cooler air from the sea towards the land.",
    },
    score=3,
    justification="This answer receives a high score for conciseness. It directly addresses the question without unnecessary details, providing the essential explanation in a clear and straightforward manner.",
)

example_score_2 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "Sea breezes are a result of the interesting interplay between the heating rates of land and water. Essentially, during the sunlit hours, land heats up much more rapidly compared to the ocean. This difference in heating leads to a variation in air pressure; as the warmer air over the land rises due to its lower density, a pressure difference is created. Cooler air from the sea, being denser, moves towards the land to balance this pressure difference. However, it’s not just about temperature and pressure; the Earth’s rotation also plays a part in directing the breeze, adding a slight twist to the direction the breeze comes from. This natural phenomenon is quite essential, contributing to local weather patterns and offering relief on hot days along coastal areas.",
    },
    score=2,
    justification="This answer would receive a score of 2 for conciseness. It provides a more detailed explanation than necessary for a straightforward question but does not delve into excessive verbosity. The answer introduces the basic concept accurately and includes relevant details about the cause of sea breezes. However, it also incorporates additional information about the Earth's rotation, which, while related, is not strictly necessary to understand the fundamental cause of sea breezes.",
)

example_score_1 = EvaluationExample(
    input={
        "question": "What causes sea breezes?",
        "answer": "To understand what causes sea breezes, it's important to start by recognizing that the Earth is made up of various surfaces, such as land and water, which both play a significant role in the way our climate and weather patterns are formed. Now, during the daylight hours, what happens is quite fascinating. The sun, which is our primary source of light and heat, shines down upon the Earth's surface. However, not all surfaces on Earth respond to this heat in the same way. Specifically, land tends to heat up much more quickly and to a higher degree compared to water. This discrepancy in heating rates is crucial because it leads to differences in air pressure. Warmer air is less dense and tends to rise, whereas cooler air is more dense and tends to sink. So, as the land heats up, the air above it becomes warmer and rises, creating a kind of vacuum that needs to be filled. Consequently, the cooler, denser air over the water begins to move towards the land to fill this space. This movement of air from the sea to the land is what we experience as a sea breeze. It's a fascinating process that not only demonstrates the dynamic nature of our planet's climate system but also highlights the intricate interplay between the sun, the Earth's surface, and the atmosphere above it.",
    },
    score=1,
    justification="This answer would score lower on conciseness. While it is informative and covers the necessary scientific principles, it contains a significant amount of introductory and explanatory material that, while interesting, is not essential to answering the specific question about the cause of sea breezes.",
)

# Let's initialize the metric
metric = LLMBasedCustomMetric(
    name="Conciseness",
    definition="Conciseness in communication refers to the expression of ideas in a clear and straightforward manner, using the fewest possible words without sacrificing clarity or completeness of information. It involves eliminating redundancy, verbosity, and unnecessary details, focusing instead on delivering the essential message efficiently.",
    scoring_rubric="""Use the following rubric to assign a score to the answer based on its conciseness:
- Score 1: The answer is overly verbose, containing a significant amount of unnecessary information, repetition, or redundant expressions that do not contribute to the understanding of the topic.
- Score 2: The answer includes some unnecessary details or slightly repetitive information, but the excess does not severely hinder understanding.
- Score 3: The answer is clear, direct, and to the point, with no unnecessary words, details, or repetition.""",
    scoring_function=ScoringFunctions.Numeric(min_val=1, max_val=3),
    model_parameters={"temperature": 0},
    examples=[example_score_1, example_score_2, example_score_3],
)

# Let's compute the metric on a sample datum
datum = {
    "question": "What causes seasons to change?",
    "answer": "The change in seasons is primarily caused by the Earth's tilt on its axis combined with its orbit around the Sun. This tilt leads to variations in the angle and intensity of sunlight reaching different parts of Earth at different times of the year.",
}

print(metric(**datum))