Merge pull request #26 from Nayjest/tokenize_remote_models
Use tiktoken to estimate the number of tokens in prompts/responses, and fit semantic search results to a target token count.
Nayjest committed Jul 31, 2024
2 parents f7a18ff + 2e4c365 commit edb2543
Showing 12 changed files with 191 additions and 22 deletions.
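
In practice, the API added by this PR reads as follows. A minimal sketch assuming an already-configured microcore environment with an embedding DB (e.g. ChromaDB); the collection name, documents, and query are illustrative, not from the commit:

import microcore as mc

mc.texts.save_many("articles", ["short doc", "a somewhat longer document", "another doc"])
results = mc.texts.search("articles", "which document is relevant?", n_results=10)
# Trim the result set, dropping the most distant hits, until it fits the token budget
fitted = results.fit_to_token_size(max_tokens=50, min_documents=1)
print(len(fitted), fitted[0].num_tokens(for_model="gpt-4"))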
12 changes: 7 additions & 5 deletions microcore/__init__.py
@@ -9,7 +9,7 @@

import os
import microcore.ui  # noqa
-from .embedding_db import SearchResult, AbstractEmbeddingDB
+from .embedding_db import SearchResult, AbstractEmbeddingDB, SearchResults
from .file_storage import storage
from ._env import configure, env, config
from .logging import use_logging
@@ -67,10 +67,10 @@ def search(
        n_results: int = 5,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return env().texts.search(collection, query, n_results, where, **kwargs)

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        return self.search(*args, **kwargs)

    def find_all(
@@ -79,7 +79,7 @@ def find_all(
        query: str | list,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return env().texts.find_all(collection, query, where, **kwargs)

    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -128,6 +128,8 @@ def delete(self, collection: str, what: str | list[str] | dict):
"LLMResponse",
"PromptWrapper",
"parse",
"SearchResult",
"SearchResults",
"dedent",
# submodules
"embedding_db",
@@ -142,4 +144,4 @@
# "wrappers",
]

__version__ = "3.9.1"
__version__ = "3.10.0"
6 changes: 3 additions & 3 deletions microcore/_env.py
@@ -21,10 +21,10 @@ class Env:
    llm_before_handlers: list[callable] = field(default_factory=list)
    llm_after_handlers: list[callable] = field(default_factory=list)
    texts: AbstractEmbeddingDB = None
-    model: "PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
-    tokenizer: "PreTrainedTokenizer" = field(
+    model: "transformers.PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
+    tokenizer: "transformers.PreTrainedTokenizer" = field(  # noqa
        default=None, init=False, repr=False
-    )  # noqa
+    )

    def __post_init__(self):
        global _env
3 changes: 3 additions & 0 deletions microcore/configuration.py
@@ -175,6 +175,9 @@ class LLMConfig(BaseConfig, _OpenAIEnvVars, _AnthropicEnvVars, _GoogleVertexAiEn
    MODEL: str = from_env()
    """Language model name"""

+    TIKTOKEN_ENCODING: str = from_env()
+    """Enforces use of a specific encoding for token size measurement"""
+
    LLM_DEFAULT_ARGS: dict = from_env(dtype=dict)
    """
    You may specify here default arguments for the LLM API calls,
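
A sketch of how the new setting might be applied, assuming configure() accepts config fields as keyword arguments like the other settings in this class (an assumption; the value can presumably also be supplied via an environment variable of the same name, per from_env()):

import microcore as mc

# Pin the encoding so token counts don't depend on resolving the model name
mc.configure(TIKTOKEN_ENCODING="cl100k_base")  # assumed kwarg form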
35 changes: 32 additions & 3 deletions microcore/embedding_db/__init__.py
@@ -1,10 +1,39 @@
+import logging
import sys
from abc import ABC, abstractmethod
from dataclasses import dataclass

+import tiktoken
+
from ..utils import ExtendedString


+class SearchResults(list):
+    def fit_to_token_size(
+        self,
+        max_tokens: int,
+        min_documents: int = None,
+        for_model: str = None,
+        encoding: str | tiktoken.Encoding = None,
+        verbose=True
+    ):
+        from ..tokenizing import fit_to_token_size
+        records, removed = fit_to_token_size(
+            self, max_tokens=max_tokens,
+            min_documents=min_documents,
+            for_model=for_model,
+            encoding=encoding
+        )
+        if verbose and removed:
+            logging.info(
+                "For fitting %d records to %d tokens, %d records were removed",
+                len(self),
+                max_tokens,
+                removed
+            )
+        return SearchResults(list(records))
+
+
class SearchResult(ExtendedString):
    """
    String containing the search result with additional information in attributes
@@ -46,7 +75,7 @@ def search(
            **kwargs: additional arguments
        """

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
        """
        Alias for `search`
        """
@@ -58,13 +87,13 @@ def find_all(
        query: str | list,
        where: dict = None,
        **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
        return self.search(
            collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
        )

    @abstractmethod
-    def get_all(self, collection: str) -> list[str | SearchResult]:
+    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
        """Return all documents in the collection"""

    def save(self, collection: str, text: str, metadata: dict = None):
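
Note how the two limits of fit_to_token_size interact (implementation in microcore/tokenizing.py below): min_documents takes precedence over max_tokens, so the closest N results always survive. A hypothetical illustration; collection and query names are not from the commit:

import microcore as mc

res = mc.texts.find_all("articles", "query")  # all matches, closest first
top = res.fit_to_token_size(max_tokens=500, min_documents=2)
# The two closest documents are always kept, even if together they already
# exceed 500 tokens; later documents are dropped once the running token
# total passes the budget.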
16 changes: 8 additions & 8 deletions microcore/embedding_db/chromadb.py
@@ -4,7 +4,7 @@
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from ..configuration import Config
-from .. import SearchResult, AbstractEmbeddingDB
+from .. import SearchResult, SearchResults, AbstractEmbeddingDB


@dataclass
@@ -25,7 +25,7 @@ def __post_init__(self):

    @classmethod
    def _wrap_results(cls, results) -> list[str | SearchResult]:
-        return [
+        return SearchResults([
            SearchResult(
                results["documents"][0][i],
                dict(
@@ -35,7 +35,7 @@ def _wrap_results(cls, results) -> list[str | SearchResult]:
                ),
            )
            for i in range(len(results["documents"][0]))
-        ]
+        ])

    def search(
        self,
@@ -50,7 +50,7 @@ def search(
                collection, embedding_function=self.embedding_function
            )
        except ValueError:
-            return []
+            return SearchResults([])

        if isinstance(query, str):
            query = [query]
@@ -61,7 +61,7 @@
        return (
            self._wrap_results(d)
            if d and d.get("documents") and d["documents"][0]
-            else []
+            else SearchResults([])
        )

    def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -122,12 +122,12 @@ def get_all(self, collection: str) -> list[str | SearchResult]:
                collection, embedding_function=self.embedding_function
            )
        except ValueError:
-            return SearchResults([])
        results = chroma_collection.get()
-        return [
+        return SearchResults([
            SearchResult(
                results["documents"][i],
                {"metadata": results["metadatas"][i] or {}, "id": results["ids"][i]},
            )
            for i in range(len(results["documents"]))
-        ]
+        ])
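
Because the empty-result paths now return SearchResults([]) instead of a bare [], the fitting API stays available even when nothing matches. A small sketch, assuming a configured environment (encoding resolution consults the config); the collection name is illustrative:

import microcore as mc

empty = mc.texts.search("no_such_collection", "anything")
assert empty.fit_to_token_size(max_tokens=100) == []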
2 changes: 1 addition & 1 deletion microcore/json_parsing.py
@@ -42,7 +42,7 @@ def unwrap_json_substring(
        ...

    return (
-        input_string[start : end + 1]
+        input_string[start:end + 1]
        if brace
        else input_string if return_original_on_fail else ""
    )
2 changes: 1 addition & 1 deletion microcore/llm/local_transformers.py
@@ -16,7 +16,7 @@ def inference(prompt: str, model, tokenizer, **kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **kwargs)
    outputs = [
-        tokenizer.decode(i[len(inputs[0]) :], skip_special_tokens=skip_special_tokens)
+        tokenizer.decode(i[len(inputs[0]):], skip_special_tokens=skip_special_tokens)
        for i in outputs
    ]
    return LLMResponse(outputs[0], dict(all=outputs))
77 changes: 77 additions & 0 deletions microcore/tokenizing.py
@@ -0,0 +1,77 @@
import logging

import tiktoken
import requests.exceptions
from ._env import env


class CantLoadTikTokenEncoding(RuntimeError):
    ...


def _resolve_tiktoken_encoding(
    for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> tiktoken.Encoding:
    assert (
        for_model is None or encoding is None
    ), "You may specify encoding or for_model (LLM), but not both"
    if isinstance(encoding, tiktoken.Encoding):
        return encoding
    if for_model is None and encoding is None:
        if env().config.TIKTOKEN_ENCODING:
            return _resolve_tiktoken_encoding(encoding=env().config.TIKTOKEN_ENCODING)
        for_model = (
            env().config.LLM_DEFAULT_ARGS.get("model", None) or env().config.MODEL
        )
    if for_model:
        try:
            return tiktoken.encoding_for_model(for_model)
        except (KeyError, requests.exceptions.ConnectionError):
            logging.warning(
                f"Can't resolve tiktoken encoding for '{for_model}'. "
                f"Default encoding will be used."
            )
    encoding = encoding or "cl100k_base"
    try:
        return tiktoken.get_encoding(encoding)
    except (ValueError, requests.exceptions.ConnectionError) as e:
        raise CantLoadTikTokenEncoding(
            f"Can't load tiktoken encoding '{encoding}'"
        ) from e


def encode(
    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> list[int]:
    """Encodes string to LLM tokens"""
    return _resolve_tiktoken_encoding(for_model, encoding).encode(string)


def num_tokens_from_string(
    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
) -> int:
    """Returns the number of tokens in a text string."""
    return len(encode(string, for_model=for_model, encoding=encoding))


def fit_to_token_size(
    docs: list[str],
    max_tokens: int,
    min_documents: int = None,
    for_model: str = None,
    encoding: str | tiktoken.Encoding = None,
) -> tuple[list[str], int]:
    """
    Fit the list of documents to the max_tokens size.
    Returns the new list of documents and the number of removed items.
    """
    encoding = _resolve_tiktoken_encoding(for_model, encoding)
    tot_size = 0
    for i, doc in enumerate(docs):
        tot_size += num_tokens_from_string(doc, encoding=encoding)
        if min_documents and i < min_documents:
            continue
        if tot_size > max_tokens:
            result = docs[:i]
            return result, len(docs) - len(result)
    return docs, 0
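
The resolution order implemented above is: an explicit encoding, then the TIKTOKEN_ENCODING setting, then the encoding registered for the configured model, and finally the cl100k_base fallback. A small sketch of direct use; exact token counts depend on the encoding, so the comments are illustrative:

from microcore import tokenizing

ids = tokenizing.encode("Hello, world!", encoding="cl100k_base")
assert tokenizing.num_tokens_from_string("Hello, world!", encoding="cl100k_base") == len(ids)

docs = ["one", "two two", "three three three"]
kept, removed = tokenizing.fit_to_token_size(docs, max_tokens=3, encoding="cl100k_base")
# With min_documents unset, kept is the longest prefix of docs whose cumulative
# token count fits the budget; removed is how many trailing documents were cut.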
16 changes: 16 additions & 0 deletions microcore/utils.py
@@ -12,6 +12,7 @@
from pathlib import Path
from typing import Any, Union, Callable

+import tiktoken
from colorama import Fore

from .configuration import Config
@@ -82,6 +83,21 @@ def method_handler(*args, **kwargs):
f"'{self.__class__.__name__}' object has no attribute '{item}'"
)

def to_tokens(
self,
for_model: str = None,
encoding: str | tiktoken.Encoding = None
):
from .tokenizing import encode
return encode(self, for_model=for_model, encoding=encoding)

def num_tokens(
self,
for_model: str = None,
encoding: str | tiktoken.Encoding = None
):
return len(self.to_tokens(for_model=for_model, encoding=encoding))


class DataclassEncoder(json.JSONEncoder):
"""@private"""
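
Since SearchResult subclasses ExtendedString, every search hit now carries its own token accounting; a quick sketch:

from microcore import SearchResult

hit = SearchResult("apple pineapple orange")
print(hit.to_tokens(encoding="cl100k_base"))   # raw token ids
print(hit.num_tokens(encoding="cl100k_base"))  # token count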
3 changes: 3 additions & 0 deletions requirements/extended.txt
@@ -1,3 +1,6 @@
-r min.txt
chromadb>=0.4.18,<0.6
anthropic>=0.19.1,<=0.25.8
+google-generativeai>=0.7.2,<1
+vertexai>=1.60.0,<2
+transformers>=4.43.3,<5
3 changes: 2 additions & 1 deletion requirements/min.txt
@@ -4,4 +4,5 @@ Jinja2~=3.1.2
colorama~=0.4.6
chardet~=5.2.0
PyYAML~=6.0
-aiohttp>=3.8.6,<4.0
+aiohttp>=3.8.6,<4.0
+tiktoken>=0.7.0,<1.0
38 changes: 38 additions & 0 deletions tests/basic/test_fit_vector_search_to_tokens.py
@@ -0,0 +1,38 @@
import microcore as mc
from microcore import SearchResult


def test_fit_vector_search_to_tokens():
    mc.texts.clear("test_collection")
    raw_items = [str(i) for i in range(10)]
    mc.texts.save_many("test_collection", raw_items)
    res = mc.texts.search("test_collection", "qwe", n_results=10)
    # Check all loaded
    assert sorted(res) == raw_items

    fres = res.fit_to_token_size(3)
    # check fit
    assert len(fres) == 3
    assert any(i in raw_items for i in fres)

    # check that distances of fitted elements are smallest
    smallest_dist = sorted(i.distance for i in res)[:3]
    fitted_dist = sorted(i.distance for i in fres)
    assert fitted_dist == smallest_dist

    assert fres[0].num_tokens() == 1


def test_fit_vector_search_to_tokens_min_docs():
    mc.texts.clear("test_collection")
    raw_items = [str(i) for i in range(10)]
    mc.texts.save_many("test_collection", raw_items)
    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(3, 4)
    assert len(res) == 4
    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(5, 3)
    assert len(res) == 5


def test_num_tokens():
    assert SearchResult("apple pineapple orange").num_tokens(encoding='cl100k_base') >= 3
    assert SearchResult("Hi").num_tokens(for_model='gpt-4') <= 2
