Merge pull request #26 from Nayjest/tokenize_remote_models
Use tiktoken to estimate the number of tokens in prompts/responses, and fit semantic search results to a target token count.
Nayjest committed Jul 31, 2024
2 parents f7a18ff + 2e4c365 commit edb2543
Showing 12 changed files with 191 additions and 22 deletions.
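
In practice, the API added by this PR reads as follows. A minimal sketch assuming an already-configured microcore environment with an embedding DB (e.g. ChromaDB); the collection name, documents, and query are illustrative, not from the commit:

import microcore as mc

mc.texts.save_many("articles", ["short doc", "a somewhat longer document", "another doc"])
results = mc.texts.search("articles", "which document is relevant?", n_results=10)
# Trim the result set, dropping the most distant hits, until it fits the token budget
fitted = results.fit_to_token_size(max_tokens=50, min_documents=1)
print(len(fitted), fitted[0].num_tokens(for_model="gpt-4"))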
12 changes: 7 additions & 5 deletions microcore/__init__.py
@@ -9,7 +9,7 @@

import os
import microcore.ui  # noqa
-from .embedding_db import SearchResult, AbstractEmbeddingDB
+from .embedding_db import SearchResult, AbstractEmbeddingDB, SearchResults
from .file_storage import storage
from ._env import configure, env, config
from .logging import use_logging
@@ -67,10 +67,10 @@ def search(
        n_results: int = 5,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return env().texts.search(collection, query, n_results, where, **kwargs)

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        return self.search(*args, **kwargs)

    def find_all(
@@ -79,7 +79,7 @@ def find_all(
        query: str | list,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return env().texts.find_all(collection, query, where, **kwargs)

    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -128,6 +128,8 @@ def delete(self, collection: str, what: str | list[str] | dict):
"LLMResponse",
"PromptWrapper",
"parse",
"SearchResult",
"SearchResults",
"dedent",
# submodules
"embedding_db",
@@ -142,4 +144,4 @@
# "wrappers",
]

__version__ = "3.9.1"
__version__ = "3.10.0"
6 changes: 3 additions & 3 deletions microcore/_env.py
@@ -21,10 +21,10 @@ class Env:
    llm_before_handlers: list[callable] = field(default_factory=list)
    llm_after_handlers: list[callable] = field(default_factory=list)
    texts: AbstractEmbeddingDB = None
-    model: "PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
-    tokenizer: "PreTrainedTokenizer" = field(
+    model: "transformers.PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
+    tokenizer: "transformers.PreTrainedTokenizer" = field(  # noqa
        default=None, init=False, repr=False
-    )  # noqa
+    )

    def __post_init__(self):
        global _env
3 changes: 3 additions & 0 deletions microcore/configuration.py
@@ -175,6 +175,9 @@ class LLMConfig(BaseConfig, _OpenAIEnvVars, _AnthropicEnvVars, _GoogleVertexAiEn
    MODEL: str = from_env()
    """Language model name"""

+    TIKTOKEN_ENCODING: str = from_env()
+    """Enforces use of a specific encoding for token size measurement"""
+
    LLM_DEFAULT_ARGS: dict = from_env(dtype=dict)
    """
    You may specify here default arguments for the LLM API calls,
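
A sketch of how the new setting might be applied, assuming configure() accepts config fields as keyword arguments like the other settings in this class (an assumption; the value can presumably also be supplied via an environment variable of the same name, per from_env()):

import microcore as mc

# Pin the encoding so token counts don't depend on resolving the model name
mc.configure(TIKTOKEN_ENCODING="cl100k_base")  # assumed kwarg form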
35 changes: 32 additions & 3 deletions microcore/embedding_db/__init__.py
@@ -1,10 +1,39 @@
+import logging
import sys
from abc import ABC, abstractmethod
from dataclasses import dataclass

+import tiktoken
+
from ..utils import ExtendedString


+class SearchResults(list):
+    def fit_to_token_size(
+        self,
+        max_tokens: int,
+        min_documents: int = None,
+        for_model: str = None,
+        encoding: str | tiktoken.Encoding = None,
+        verbose=True
+    ):
+        from ..tokenizing import fit_to_token_size
+        records, removed = fit_to_token_size(
+            self, max_tokens=max_tokens,
+            min_documents=min_documents,
+            for_model=for_model,
+            encoding=encoding
+        )
+        if verbose and removed:
+            logging.info(
+                "For fitting %d records to %d tokens, %d records were removed",
+                len(self),
+                max_tokens,
+                removed
+            )
+        return SearchResults(list(records))
+
+
class SearchResult(ExtendedString):
    """
    String containing the search result with additional information in attributes
@@ -46,7 +75,7 @@ def search(
            **kwargs: additional arguments
        """

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        """
        Alias for `search`
        """
@@ -58,13 +87,13 @@ def find_all(
        query: str | list,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return self.search(
            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
        )

    @abstractmethod
-    def get_all(self, collection: str) -> list[str | SearchResult]:
+    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
        """Return all documents in the collection"""

    def save(self, collection: str, text: str, metadata: dict = None):
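
Note how the two limits of fit_to_token_size interact (implementation in microcore/tokenizing.py below): min_documents takes precedence over max_tokens, so the closest N results always survive. A hypothetical illustration; collection and query names are not from the commit:

import microcore as mc

res = mc.texts.find_all("articles", "query")  # all matches, closest first
top = res.fit_to_token_size(max_tokens=500, min_documents=2)
# The two closest documents are always kept, even if together they already
# exceed 500 tokens; later documents are dropped once the running token
# total passes the budget.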
16 changes: 8 additions & 8 deletions microcore/embedding_db/chromadb.py
@@ -4,7 +4,7 @@
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from ..configuration import Config
-from .. import SearchResult, AbstractEmbeddingDB
+from .. import SearchResult, SearchResults, AbstractEmbeddingDB


@dataclass
@@ -25,7 +25,7 @@ def __post_init__(self):

    @classmethod
    def _wrap_results(cls, results) -> list[str | SearchResult]:
-        return [
+        return SearchResults([
            SearchResult(
                results["documents"][0][i],
                dict(
@@ -35,7 +35,7 @@ def _wrap_results(cls, results) -> list[str | SearchResult]:
                ),
            )
            for i in range(len(results["documents"][0]))
-        ]
+        ])

    def search(
        self,
@@ -50,7 +50,7 @@ def search(
                collection, embedding_function=self.embedding_function
            )
        except ValueError:
-            return []
+            return SearchResults([])

        if isinstance(query, str):
            query = [query]
@@ -61,7 +61,7 @@
        return (
            self._wrap_results(d)
            if d and d.get("documents") and d["documents"][0]
-            else []
+            else SearchResults([])
        )

    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -122,12 +122,12 @@ def get_all(self, collection: str) -> list[str | SearchResult]:
                collection, embedding_function=self.embedding_function
            )
        except ValueError:
-            return SearchResults([])
        results = chroma_collection.get()
-        return [
+        return SearchResults([
            SearchResult(
                results["documents"][i],
                {"metadata": results["metadatas"][i] or {}, "id": results["ids"][i]},
            )
            for i in range(len(results["documents"]))
-        ]
+        ])
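
Because the empty-result paths now return SearchResults([]) instead of a bare [], the fitting API stays available even when nothing matches. A small sketch, assuming a configured environment (encoding resolution consults the config); the collection name is illustrative:

import microcore as mc

empty = mc.texts.search("no_such_collection", "anything")
assert empty.fit_to_token_size(max_tokens=100) == []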
2 changes: 1 addition & 1 deletion microcore/json_parsing.py
@@ -42,7 +42,7 @@ def unwrap_json_substring(
        ...

    return (
-        input_string[start : end + 1]
+        input_string[start:end + 1]
        if brace
        else input_string if return_original_on_fail else ""
    )
2 changes: 1 addition & 1 deletion microcore/llm/local_transformers.py
@@ -16,7 +16,7 @@ def inference(prompt: str, model, tokenizer, **kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **kwargs)
    outputs = [
-        tokenizer.decode(i[len(inputs[0]) :], skip_special_tokens=skip_special_tokens)
+        tokenizer.decode(i[len(inputs[0]):], skip_special_tokens=skip_special_tokens)
        for i in outputs
    ]
    return LLMResponse(outputs[0], dict(all=outputs))
77 changes: 77 additions & 0 deletions microcore/tokenizing.py
@@ -0,0 +1,77 @@
import logging

import tiktoken
import requests.exceptions
from ._env import env


class CantLoadTikTokenEncoding(RuntimeError):
    ...


def _resolve_tiktoken_encoding(
    for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> tiktoken.Encoding:
    assert (
        for_model is None or encoding is None
    ), "You may specify encoding or for_model (LLM), but not both"
    if isinstance(encoding, tiktoken.Encoding):
        return encoding
    if for_model is None and encoding is None:
        if env().config.TIKTOKEN_ENCODING:
            return _resolve_tiktoken_encoding(encoding=env().config.TIKTOKEN_ENCODING)
        for_model = (
            env().config.LLM_DEFAULT_ARGS.get("model", None) or env().config.MODEL
        )
    if for_model:
        try:
            return tiktoken.encoding_for_model(for_model)
        except (KeyError, requests.exceptions.ConnectionError):
            logging.warning(
                f"Can't resolve tiktoken encoding for '{for_model}'. "
                f"Default encoding will be used."
            )
    encoding = encoding or "cl100k_base"
    try:
        return tiktoken.get_encoding(encoding)
    except (ValueError, requests.exceptions.ConnectionError) as e:
        raise CantLoadTikTokenEncoding(
            f"Can't load tiktoken encoding '{encoding}'"
        ) from e


def encode(
    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> list[int]:
    """Encodes string to LLM tokens"""
    return _resolve_tiktoken_encoding(for_model, encoding).encode(string)


def num_tokens_from_string(
    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> int:
    """Returns the number of tokens in a text string."""
    return len(encode(string, for_model=for_model, encoding=encoding))


def fit_to_token_size(
    docs: list[str],
    max_tokens: int,
    min_documents: int = None,
    for_model: str = None,
    encoding: str | tiktoken.Encoding = None,
) -> tuple[list[str], int]:
    """
    Fit the list of documents to the max_tokens size.
    Returns the new list of documents and the number of removed items.
    """
    encoding = _resolve_tiktoken_encoding(for_model, encoding)
    tot_size = 0
    for i, doc in enumerate(docs):
        tot_size += num_tokens_from_string(doc, encoding=encoding)
        if min_documents and i < min_documents:
            continue
        if tot_size > max_tokens:
            result = docs[:i]
            return result, len(docs) - len(result)
    return docs, 0
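
The resolution order implemented above is: an explicit encoding, then the TIKTOKEN_ENCODING setting, then the encoding registered for the configured model, and finally the cl100k_base fallback. A small sketch of direct use; exact token counts depend on the encoding, so the comments are illustrative:

from microcore import tokenizing

ids = tokenizing.encode("Hello, world!", encoding="cl100k_base")
assert tokenizing.num_tokens_from_string("Hello, world!", encoding="cl100k_base") == len(ids)

docs = ["one", "two two", "three three three"]
kept, removed = tokenizing.fit_to_token_size(docs, max_tokens=3, encoding="cl100k_base")
# With min_documents unset, kept is the longest prefix of docs whose cumulative
# token count fits the budget; removed is how many trailing documents were cut.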
16 changes: 16 additions & 0 deletions microcore/utils.py
@@ -12,6 +12,7 @@
from pathlib import Path
from typing import Any, Union, Callable

+import tiktoken
from colorama import Fore

from .configuration import Config
@@ -82,6 +83,21 @@ def method_handler(*args, **kwargs):
f"'{self.__class__.__name__}' object has no attribute '{item}'"
)

def to_tokens(
self,
for_model: str = None,
encoding: str | tiktoken.Encoding = None
):
from .tokenizing import encode
return encode(self, for_model=for_model, encoding=encoding)

def num_tokens(
self,
for_model: str = None,
encoding: str | tiktoken.Encoding = None
):
return len(self.to_tokens(for_model=for_model, encoding=encoding))


class DataclassEncoder(json.JSONEncoder):
"""@private"""
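
Since SearchResult subclasses ExtendedString, every search hit now carries its own token accounting; a quick sketch:

from microcore import SearchResult

hit = SearchResult("apple pineapple orange")
print(hit.to_tokens(encoding="cl100k_base"))   # raw token ids
print(hit.num_tokens(encoding="cl100k_base"))  # token count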
3 changes: 3 additions & 0 deletions requirements/extended.txt
@@ -1,3 +1,6 @@
-r min.txt
chromadb>=0.4.18,<0.6
anthropic>=0.19.1,<=0.25.8
+google-generativeai>=0.7.2,<1
+vertexai>=1.60.0,<2
+transformers>=4.43.3,<5
3 changes: 2 additions & 1 deletion requirements/min.txt
@@ -4,4 +4,5 @@ Jinja2~=3.1.2
colorama~=0.4.6
chardet~=5.2.0
PyYAML~=6.0
-aiohttp>=3.8.6,<4.0
+aiohttp>=3.8.6,<4.0
+tiktoken>=0.7.0,<1.0
38 changes: 38 additions & 0 deletions tests/basic/test_fit_vector_search_to_tokens.py
@@ -0,0 +1,38 @@
import microcore as mc
from microcore import SearchResult


def test_fit_vector_search_to_tokens():
    mc.texts.clear("test_collection")
    raw_items = [str(i) for i in range(10)]
    mc.texts.save_many("test_collection", raw_items)
    res = mc.texts.search("test_collection", "qwe", n_results=10)
    # Check all loaded
    assert sorted(res) == raw_items

    fres = res.fit_to_token_size(3)
    # check fit
    assert len(fres) == 3
    assert any(i in raw_items for i in fres)

    # check that distances of fitted elements are smallest
    smallest_dist = sorted(i.distance for i in res)[:3]
    fitted_dist = sorted(i.distance for i in fres)
    assert fitted_dist == smallest_dist

    assert fres[0].num_tokens() == 1


def test_fit_vector_search_to_tokens_min_docs():
    mc.texts.clear("test_collection")
    raw_items = [str(i) for i in range(10)]
    mc.texts.save_many("test_collection", raw_items)
    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(3, 4)
    assert len(res) == 4
    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(5, 3)
    assert len(res) == 5


def test_num_tokens():
    assert SearchResult("apple pineapple orange").num_tokens(encoding='cl100k_base') >= 3
    assert SearchResult("Hi").num_tokens(for_model='gpt-4') <= 2
