Chunking for confidence if context exceeds context length #635

Merged: 6 commits, Nov 20, 2023
pyproject.toml (2 additions, 1 deletion)
@@ -39,7 +39,8 @@ dependencies = [
"jsonschema >= 4.17.3",
"tabulate >= 0.9.0",
"typer[all] >= 0.9.0",
"simple-term-menu >= 1.6.1"
"simple-term-menu >= 1.6.1",
"transformers >= 4.25.0",
]
requires-python = ">=3.6"

src/autolabel/confidence.py (2 additions, 2 deletions)
@@ -10,7 +10,7 @@

from autolabel.schema import LLMAnnotation, ConfidenceCacheEntry
from autolabel.models import BaseModel
from autolabel.cache import BaseCache, SQLAlchemyConfidenceCache
from autolabel.cache import BaseCache

from tenacity import (
before_sleep_log,
@@ -86,7 +86,7 @@ def logprob_average_per_key(
indices = []
for ind, logprob in enumerate(logprobs):
key = list(logprob.keys())[0]
if key == '"' or key == '",':
if '"' in key:
indices.append(ind)
if len(indices) != 4 * len(keys):
logger.error("Unable to find all keys in prompt")
src/autolabel/configs/config.py (18 additions)
@@ -17,6 +17,7 @@ class AutolabelConfig(BaseConfig):
EMBEDDING_CONFIG_KEY = "embedding"
PROMPT_CONFIG_KEY = "prompt"
DATASET_GENERATION_CONFIG_KEY = "dataset_generation"
CHUNKING_CONFIG_KEY = "chunking"

# Dataset config keys (config["dataset"][<key>])
LABEL_COLUMN_KEY = "label_column"
@@ -58,6 +59,10 @@ class AutolabelConfig(BaseConfig):
DATASET_GENERATION_GUIDELINES_KEY = "guidelines"
DATASET_GENERATION_NUM_ROWS_KEY = "num_rows"

# Chunking config keys (config["chunking"][<key>])
CONFIDENCE_CHUNK_SIZE_KEY = "confidence_chunk_size"
CONFIDENCE_MERGE_FUNCTION_KEY = "confidence_merge_function"

def __init__(self, config: Union[str, Dict], validate: bool = True) -> None:
super().__init__(config, validate=validate)

@@ -96,6 +101,11 @@ def _dataset_generation_config(self) -> Dict:
"""Returns information about the prompt for synthetic dataset generation"""
return self.config.get(self.DATASET_GENERATION_CONFIG_KEY, {})

@cached_property
def _chunking_config(self) -> Dict:
"""Returns information about the chunking config"""
return self.config.get(self.CHUNKING_CONFIG_KEY, {})

# project and task definition config
def task_name(self) -> str:
return self.config[self.TASK_NAME_KEY]
@@ -243,3 +253,11 @@ def dataset_generation_num_rows(self) -> int:
return self._dataset_generation_config.get(
self.DATASET_GENERATION_NUM_ROWS_KEY, 1
)

def confidence_chunk_size(self) -> int:
"""Returns the chunk size for confidence chunking"""
return self._chunking_config.get(self.CONFIDENCE_CHUNK_SIZE_KEY, 0)

def confidence_merge_function(self) -> str:
"""Returns the function to use when merging confidence scores"""
return self._chunking_config.get(self.CONFIDENCE_MERGE_FUNCTION_KEY, "max")
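For reference, a hypothetical config fragment that exercises the new section is sketched below. Only the "chunking" keys come from this PR; the surrounding keys are placeholders for illustration.

# Hypothetical config fragment: only the "chunking" section is defined by this
# PR; the other keys shown here are illustrative.
config = {
    "task_name": "LongDocExtraction",         # illustrative
    "task_type": "attribute_extraction",      # illustrative
    "chunking": {
        "confidence_chunk_size": 3000,        # 0 (the default) disables chunking
        "confidence_merge_function": "mean",  # "max" (default) or "mean"
    },
}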
src/autolabel/labeler.py (96 additions, 3 deletions)
@@ -6,6 +6,8 @@
import pickle
import asyncio
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from rich import print as pprint
from rich.console import Console
from rich.prompt import Confirm
@@ -37,6 +39,7 @@
TaskRun,
TaskStatus,
TaskType,
AggregationFunction,
)
from autolabel.tasks import TaskFactory
from autolabel.utils import (
@@ -58,9 +61,15 @@
}
METRIC_TABLE_STYLE = "cyan bold"

MERGE_FUNCTION = {
AggregationFunction.MAX: np.max,
AggregationFunction.MEAN: np.mean,
}


class LabelingAgent:
COST_KEY = "Cost in $"
CONFIDENCE_MAX_CONTEXT_LENGTH = 3400

def __init__(
self,
@@ -103,11 +112,14 @@ def __init__(
self.config, cache=self.generation_cache
)

self.confidence_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
score_type = "logprob_average"
if self.config.task_type() == TaskType.ATTRIBUTE_EXTRACTION:
score_type = "logprob_average_per_key"
self.confidence = ConfidenceCalculator(
score_type=score_type, llm=self.llm, cache=self.confidence_cache
score_type=score_type,
llm=self.llm,
cache=self.confidence_cache,
)

self.example_selector = example_selector
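The flan-t5-xxl tokenizer loaded above is used only to count tokens against CONFIDENCE_MAX_CONTEXT_LENGTH. A tiny standalone sketch, assuming the transformers download is acceptable and using made-up example text:

# Sketch of the token-counting setup (downloads the tokenizer on first use).
from transformers import AutoTokenizer

confidence_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
n = len(confidence_tokenizer.encode("some long transcript ..."))
# Chunking is used only when confidence_chunk_size is set and the full
# prompt plus response exceeds CONFIDENCE_MAX_CONTEXT_LENGTH (3400) tokens.
print(n)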
@@ -274,9 +286,80 @@ def run(
)

if self.config.confidence():
annotation.confidence_score = self.confidence.calculate(
model_generation=annotation,
full_confidence_input = (
annotation.prompt + annotation.raw_response
)
if (
not self.config.confidence_chunk_size()
or self.get_num_tokens(full_confidence_input)
< self.CONFIDENCE_MAX_CONTEXT_LENGTH
):
annotation.confidence_score = self.confidence.calculate(
model_generation=annotation,
)
else:
key_to_chunk = None
for key in chunk.keys():
# TODO(rajas): Better way to find the key to chunk
# Potentially take this as an input from the user
if (
self.get_num_tokens(chunk[key])
> self.CONFIDENCE_MAX_CONTEXT_LENGTH
):
key_to_chunk = key
break
if key_to_chunk is None:
raise ValueError(
f"Unable to find a key in the chunk with a value that is longer than {num_tokens_per_chunk} tokens."
)

empty_chunk = chunk.copy()
empty_chunk[key_to_chunk] = ""
empty_prompt = self.task.construct_prompt(
empty_chunk, examples
)
num_tokens_empty_prompt = self.get_num_tokens(
empty_prompt
)
num_tokens_per_chunk = (
self.config.confidence_chunk_size()
- num_tokens_empty_prompt
)
confidence_chunks = self.chunk_string(
chunk[key_to_chunk], num_tokens_per_chunk
)

confidence_scores = []
for confidence_chunk in confidence_chunks:
new_chunk = chunk.copy()
new_chunk[key_to_chunk] = confidence_chunk
new_prompt = self.task.construct_prompt(
new_chunk, examples
)
annotation_dict = annotation.dict()
annotation_dict["prompt"] = new_prompt
confidence_scores.append(
self.confidence.calculate(
model_generation=LLMAnnotation(
**annotation_dict
),
)
)

merge_function = MERGE_FUNCTION[
self.config.confidence_merge_function()
]
if isinstance(confidence_scores[0], dict):
merged_confidence = {}
for key in confidence_scores[0].keys():
merged_confidence[key] = merge_function(
[conf[key] for conf in confidence_scores]
)
else:
merged_confidence = merge_function(
confidence_scores
)
annotation.confidence_score = merged_confidence

annotations.append(annotation)
annotation = self.majority_annotation(annotations)
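To make the merging step above concrete: for attribute extraction the per-chunk confidences are dicts keyed by attribute, so the configured aggregation function is applied per key. A standalone sketch with invented scores:

# Standalone sketch of the merge step, with made-up per-chunk confidence dicts.
import numpy as np

confidence_scores = [
    {"title": 0.91, "author": 0.40},   # chunk 1
    {"title": 0.87, "author": 0.95},   # chunk 2
]
merge_function = np.max  # np.mean when confidence_merge_function is "mean"
merged_confidence = {
    key: merge_function([conf[key] for conf in confidence_scores])
    for key in confidence_scores[0].keys()
}
print(merged_confidence)  # {'title': 0.91, 'author': 0.95}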
@@ -651,3 +734,13 @@ def get_all_annotations(self):
return [pickle.loads(a.llm_annotation) for a in db_result]
else:
return self.all_annotations

def get_num_tokens(self, inp: str) -> int:
"""Returns the number of tokens in the prompt"""
return len(self.confidence_tokenizer.encode(str(inp)))

def chunk_string(self, inp: str, chunk_size: int) -> List[str]:
"""Chunks the input string into chunks of size chunk_size"""
tokens = self.confidence_tokenizer.encode(inp)
chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)]
return [self.confidence_tokenizer.decode(chunk) for chunk in chunks]

Review thread on chunk_string:
Reviewer (Contributor): move to utils.py?
Author: It uses the tokenizer for encoding and decoding, so it was kept here.
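For a quick standalone illustration of how chunk_string splits by token count, here is a sketch that substitutes a trivial whitespace tokenizer for the flan-t5 tokenizer, an assumption made only to keep the example dependency-free:

# Stand-in sketch: whitespace "tokens" instead of the real flan-t5 tokenizer,
# purely so the example runs without downloading a model.
from typing import List

def chunk_string(inp: str, chunk_size: int) -> List[str]:
    tokens = inp.split()  # stand-in for confidence_tokenizer.encode(inp)
    chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [" ".join(c) for c in chunks]  # stand-in for tokenizer.decode(chunk)

print(chunk_string("one two three four five six seven", 3))
# ['one two three', 'four five six', 'seven']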
src/autolabel/schema.py (7 additions)
@@ -270,3 +270,10 @@ class RefuelLLMResult(BaseModel):

"""Costs incurred during the labeling job"""
costs: Optional[List[float]] = []


class AggregationFunction(str, Enum):
"""Enum of supported aggregation functions"""

MAX = "max"
MEAN = "mean"