From b8099a1df6fa260ce6cb7d68241c6693533ac47a Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Tue, 30 Jul 2024 18:45:55 +0200
Subject: [PATCH 1/6] upd. extended requirements (include everything used in
 order to generate the docs)

---
 requirements/extended.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements/extended.txt b/requirements/extended.txt
index d45a288..3e11b0f 100644
--- a/requirements/extended.txt
+++ b/requirements/extended.txt
@@ -1,3 +1,6 @@
 -r min.txt
 chromadb>=0.4.18,<0.6
 anthropic>=0.19.1,<=0.25.8
+google-generativeai>=0.7.2,<1
+vertexai>=1.60.0,<2
+transformers>=4.43.3,<5
\ No newline at end of file

From ceea6b378164ad284520f07619fdc57e09d9b131 Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Tue, 30 Jul 2024 18:48:06 +0200
Subject: [PATCH 2/6] cs fixes

---
 microcore/json_parsing.py           | 2 +-
 microcore/llm/local_transformers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/microcore/json_parsing.py b/microcore/json_parsing.py
index b3acf1c..0f96a10 100644
--- a/microcore/json_parsing.py
+++ b/microcore/json_parsing.py
@@ -42,7 +42,7 @@ def unwrap_json_substring(
     ...
     return (
-        input_string[start : end + 1]
+        input_string[start:end + 1]
         if brace
         else input_string if return_original_on_fail else ""
     )
diff --git a/microcore/llm/local_transformers.py b/microcore/llm/local_transformers.py
index a8b55c4..0345506 100644
--- a/microcore/llm/local_transformers.py
+++ b/microcore/llm/local_transformers.py
@@ -16,7 +16,7 @@ def inference(prompt: str, model, tokenizer, **kwargs):
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     outputs = model.generate(**inputs, **kwargs)
     outputs = [
-        tokenizer.decode(i[len(inputs[0]) :], skip_special_tokens=skip_special_tokens)
+        tokenizer.decode(i[len(inputs[0]):], skip_special_tokens=skip_special_tokens)
        for i in outputs
     ]
     return LLMResponse(outputs[0], dict(all=outputs))
From c9da8d1361e1a10e42ecadd966e709ca58a616e7 Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Tue, 30 Jul 2024 18:50:13 +0200
Subject: [PATCH 3/6] use tiktoken for estimating number of tokens in prompt /
 LLM response, fitting vector db search results to target token limit

---
 microcore/__init__.py              | 12 +++--
 microcore/_env.py                  |  6 +--
 microcore/configuration.py         |  3 ++
 microcore/embedding_db/__init__.py | 29 ++++++++++--
 microcore/embedding_db/chromadb.py | 16 +++----
 microcore/tokenizing.py            | 74 ++++++++++++++++++++++++++++++
 microcore/utils.py                 | 16 +++++++
 requirements/min.txt               |  3 +-
 8 files changed, 139 insertions(+), 20 deletions(-)
 create mode 100644 microcore/tokenizing.py

diff --git a/microcore/__init__.py b/microcore/__init__.py
index ffff81d..2757297 100644
--- a/microcore/__init__.py
+++ b/microcore/__init__.py
@@ -9,7 +9,7 @@
 import os

 import microcore.ui  # noqa
-from .embedding_db import SearchResult, AbstractEmbeddingDB
+from .embedding_db import SearchResult, AbstractEmbeddingDB, SearchResults
 from .file_storage import storage
 from ._env import configure, env, config
 from .logging import use_logging
@@ -67,10 +67,10 @@ def search(
         n_results: int = 5,
         where: dict = None,
         **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
         return env().texts.search(collection, query, n_results, where, **kwargs)

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
         return self.search(*args, **kwargs)

     def find_all(
@@ -79,7 +79,7 @@ def find_all(
         self,
         collection: str,
         query: str | list,
         where: dict = None,
         **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
         return env().texts.find_all(collection, query, where, **kwargs)

     def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -128,6 +128,8 @@ def delete(self, collection: str, what: str | list[str] | dict):
     "LLMResponse",
     "PromptWrapper",
     "parse",
+    "SearchResult",
+    "SearchResults",
     "dedent",
     # submodules
     "embedding_db",
@@ -142,4 +144,4 @@
     # "wrappers",
 ]

-__version__ = "3.9.1"
+__version__ = "3.10.0"
diff --git a/microcore/_env.py b/microcore/_env.py
index 3fe2583..7ed2105 100644
--- a/microcore/_env.py
+++ b/microcore/_env.py
@@ -21,10 +21,10 @@ class Env:
     llm_before_handlers: list[callable] = field(default_factory=list)
     llm_after_handlers: list[callable] = field(default_factory=list)
     texts: AbstractEmbeddingDB = None
-    model: "PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
-    tokenizer: "PreTrainedTokenizer" = field(
+    model: "transformers.PreTrainedModel" = field(default=None, init=False, repr=False)  # noqa
+    tokenizer: "transformers.PreTrainedTokenizer" = field(  # noqa
         default=None, init=False, repr=False
-    )  # noqa
+    )

     def __post_init__(self):
         global _env
diff --git a/microcore/configuration.py b/microcore/configuration.py
index 1e61e74..5198138 100644
--- a/microcore/configuration.py
+++ b/microcore/configuration.py
@@ -175,6 +175,9 @@ class LLMConfig(BaseConfig, _OpenAIEnvVars, _AnthropicEnvVars, _GoogleVertexAiEn
     MODEL: str = from_env()
     """Language model name"""

+    TIKTOKEN_ENCODING: str = from_env()
+    """Will enforce using specific encoding for token size measurement"""
+
     LLM_DEFAULT_ARGS: dict = from_env(dtype=dict)
     """
     You may specify here default arguments for the LLM API calls,
diff --git a/microcore/embedding_db/__init__.py b/microcore/embedding_db/__init__.py
index 5defce0..dbc9a0a 100644
--- a/microcore/embedding_db/__init__.py
+++ b/microcore/embedding_db/__init__.py
@@ -1,10 +1,33 @@
+import logging
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass

+import tiktoken
+
 from ..utils import ExtendedString


+class SearchResults(list):
+    def fit_to_token_size(
+        self,
+        max_tokens: int,
+        for_model: str = None,
+        encoding: str | tiktoken.Encoding = None,
+        verbose=True
+    ):
+        from ..tokenizing import fit_to_token_size
+        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
+        if verbose and len(records) < len(self):
+            logging.info(
+                "For fitting %d records to %d tokens, %d records were removed",
+                len(self),
+                max_tokens,
+                removed
+            )
+        return SearchResults(list(records))
+
+
 class SearchResult(ExtendedString):
     """
     String containing the search result with additional information in attributes
@@ -46,7 +69,7 @@ def search(
         **kwargs: additional arguments
         """

-    def find(self, *args, **kwargs) -> list[str | SearchResult]:
+    def find(self, *args, **kwargs) -> SearchResults | list[str | SearchResult]:
         """
         Alias for `search`
         """
@@ -58,13 +81,13 @@ def find_all(
         self,
         collection: str,
         query: str | list,
         where: dict = None,
         **kwargs,
-    ) -> list[str | SearchResult]:
+    ) -> SearchResults | list[str | SearchResult]:
         return self.search(
             collection, query, n_results=sys.maxsize - 1, where=where, **kwargs
         )

     @abstractmethod
-    def get_all(self, collection: str) -> list[str | SearchResult]:
+    def get_all(self, collection: str) -> SearchResults | list[str | SearchResult]:
         """Return all documents in the collection"""

     def save(self, collection: str, text: str, metadata: dict = None):
diff --git a/microcore/embedding_db/chromadb.py b/microcore/embedding_db/chromadb.py
index 8a27e5d..00fb061 100644
--- a/microcore/embedding_db/chromadb.py
+++ b/microcore/embedding_db/chromadb.py
@@ -4,7 +4,7 @@
 from chromadb.config import Settings
 from chromadb.utils import embedding_functions
 from ..configuration import Config
-from .. import SearchResult, AbstractEmbeddingDB
+from .. import SearchResult, SearchResults, AbstractEmbeddingDB


 @dataclass
@@ -25,7 +25,7 @@ def __post_init__(self):

     @classmethod
     def _wrap_results(cls, results) -> list[str | SearchResult]:
-        return [
+        return SearchResults([
             SearchResult(
                 results["documents"][0][i],
                 dict(
@@ -35,7 +35,7 @@ def _wrap_results(cls, results) -> list[str | SearchResult]:
                 ),
             )
             for i in range(len(results["documents"][0]))
-        ]
+        ])

     def search(
         self,
@@ -50,7 +50,7 @@ def search(
                 collection, embedding_function=self.embedding_function
             )
         except ValueError:
-            return []
+            return SearchResults([])
         if isinstance(query, str):
             query = [query]

@@ -61,7 +61,7 @@ def search(
         return (
             self._wrap_results(d)
             if d and d.get("documents") and d["documents"][0]
-            else []
+            else SearchResults([])
         )

     def save_many(self, collection: str, items: list[tuple[str, dict] | str]):
@@ -122,12 +122,12 @@ def get_all(self, collection: str) -> list[str | SearchResult]:
                 collection, embedding_function=self.embedding_function
             )
         except ValueError:
-            return []
+            return SearchResults([])
         results = chroma_collection.get()
-        return [
+        return SearchResults([
             SearchResult(
                 results["documents"][i],
                 {"metadata": results["metadatas"][i] or {}, "id": results["ids"][i]},
             )
             for i in range(len(results["documents"]))
-        ]
+        ])
diff --git a/microcore/tokenizing.py b/microcore/tokenizing.py
new file mode 100644
index 0000000..21d0d74
--- /dev/null
+++ b/microcore/tokenizing.py
@@ -0,0 +1,74 @@
+import logging
+
+import tiktoken
+import requests.exceptions
+from ._env import env
+
+
+class CantLoadTikTokenEncoding(RuntimeError):
+    ...
+
+
+def _resolve_tiktoken_encoding(
+    for_model: str = None, encoding: str | tiktoken.Encoding = None
+) -> tiktoken.Encoding:
+    assert (
+        for_model is None or encoding is None
+    ), "You may specify encoding or for_model(LLM), but not both"
+    if isinstance(encoding, tiktoken.Encoding):
+        return encoding
+    if for_model is None and encoding is None:
+        if env().config.TIKTOKEN_ENCODING:
+            return _resolve_tiktoken_encoding(encoding=env().config.TIKTOKEN_ENCODING)
+        for_model = (
+            env().config.LLM_DEFAULT_ARGS.get("model", None) or env().config.MODEL
+        )
+    if for_model:
+        try:
+            return tiktoken.encoding_for_model(for_model)
+        except (KeyError, requests.exceptions.ConnectionError):
+            logging.warning(
+                f"Can't resolve tiktoken encoding for '{for_model}'. "
+                f"Default encoding will be used."
+            )
+    encoding = encoding or "cl100k_base"
+    try:
+        return tiktoken.get_encoding(encoding)
+    except (ValueError, requests.exceptions.ConnectionError) as e:
+        raise CantLoadTikTokenEncoding(
+            f"Can't load tiktoken encoding '{encoding}'"
+        ) from e
+
+
+def encode(
+    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
+) -> list[int]:
+    """Encodes string to LLM tokens"""
+    return _resolve_tiktoken_encoding(for_model, encoding).encode(string)
+
+
+def num_tokens_from_string(
+    string: str, for_model: str = None, encoding: str | tiktoken.Encoding = None
+) -> int:
+    """Returns the number of tokens in a text string."""
+    return len(encode(string, for_model=for_model, encoding=encoding))
+
+
+def fit_to_token_size(
+    docs: list[str],
+    max_tokens: int,
+    for_model: str = None,
+    encoding: str | tiktoken.Encoding = None,
+) -> tuple[list[str], int]:
+    """
+    Fit the list of documents to the max_tokens size.
+    Returns the new list of documents and qty of removed items
+    """
+    encoding = _resolve_tiktoken_encoding(for_model, encoding)
+    tot_size = 0
+    for i, doc in enumerate(docs):
+        tot_size += num_tokens_from_string(doc, encoding=encoding)
+        if tot_size > max_tokens:
+            result = docs[:i]
+            return result, len(docs) - len(result)
+    return docs, 0
diff --git a/microcore/utils.py b/microcore/utils.py
index a2de463..fd97d88 100644
--- a/microcore/utils.py
+++ b/microcore/utils.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 from typing import Any, Union, Callable

+import tiktoken
 from colorama import Fore

 from .configuration import Config
@@ -82,6 +83,21 @@ def method_handler(*args, **kwargs):
             f"'{self.__class__.__name__}' object has no attribute '{item}'"
         )

+    def to_tokens(
+        self,
+        for_model: str = None,
+        encoding: str | tiktoken.Encoding = None
+    ):
+        from .tokenizing import encode
+        return encode(self, for_model=for_model, encoding=encoding)
+
+    def num_tokens(
+        self,
+        for_model: str = None,
+        encoding: str | tiktoken.Encoding = None
+    ):
+        return len(self.to_tokens(for_model=for_model, encoding=encoding))
+

 class DataclassEncoder(json.JSONEncoder):
     """@private"""
diff --git a/requirements/min.txt b/requirements/min.txt
index f4c8e6a..ad8daf8 100644
--- a/requirements/min.txt
+++ b/requirements/min.txt
@@ -4,4 +4,5 @@ Jinja2~=3.1.2
 colorama~=0.4.6
 chardet~=5.2.0
 PyYAML~=6.0
-aiohttp>=3.8.6,<4.0
\ No newline at end of file
+aiohttp>=3.8.6,<4.0
+tiktoken>=0.7.0,<1.0
\ No newline at end of file
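Usage sketch for the token-counting helpers introduced in PATCH 3/6: a minimal,
hedged example (the sample strings and asserted bounds are illustrative,
mirroring the tests in PATCH 4/6 below). When neither for_model nor encoding is
given, the encoding is resolved from TIKTOKEN_ENCODING, then from the configured
model, falling back to cl100k_base:

    from microcore import SearchResult
    from microcore.tokenizing import num_tokens_from_string

    # Explicit encoding: resolved directly via tiktoken.get_encoding(),
    # no LLM configuration required
    assert num_tokens_from_string(
        "apple pineapple orange", encoding="cl100k_base"
    ) >= 3

    # Resolution by model name; models unknown to tiktoken fall back
    # to the default encoding with a logged warning
    assert num_tokens_from_string("Hi", for_model="gpt-4") <= 2

    # Any ExtendedString (SearchResult, LLMResponse) can count its own tokens
    assert SearchResult("Hi").num_tokens(encoding="cl100k_base") <= 2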
From da614b884f2220224bba3b74d52a57a8d8a6bc6f Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Tue, 30 Jul 2024 18:55:01 +0200
Subject: [PATCH 4/6] tiktoken tests

---
 .../basic/test_fit_vector_search_to_tokens.py | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 tests/basic/test_fit_vector_search_to_tokens.py

diff --git a/tests/basic/test_fit_vector_search_to_tokens.py b/tests/basic/test_fit_vector_search_to_tokens.py
new file mode 100644
index 0000000..a09c5ee
--- /dev/null
+++ b/tests/basic/test_fit_vector_search_to_tokens.py
@@ -0,0 +1,28 @@
+import microcore as mc
+from microcore import SearchResult
+
+
+def test_fit_vector_search_to_tokens():
+    mc.texts.clear("test_collection")
+    raw_items = [str(i) for i in range(10)]
+    mc.texts.save_many("test_collection", raw_items)
+    res = mc.texts.search("test_collection", "qwe", n_results=10)
+    # Check all loaded
+    assert sorted(res) == raw_items
+
+    fres = res.fit_to_token_size(3)
+    # check fit
+    assert len(fres) == 3
+    assert all(i in raw_items for i in fres)
+
+    # check that distances of fitted elements are smallest
+    smallest_dist = sorted(i.distance for i in res)[:3]
+    fitted_dist = sorted(i.distance for i in fres)
+    assert fitted_dist == smallest_dist
+
+    assert fres[0].num_tokens() == 1
+
+
+def test_num_tokens():
+    assert SearchResult("apple pineapple orange").num_tokens(encoding='cl100k_base') >= 3
+    assert SearchResult("Hi").num_tokens(for_model='gpt-4') <= 2
From 196ada0a1a963fbfe0a8d07453002b37e76b2ef1 Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Tue, 30 Jul 2024 19:31:46 +0200
Subject: [PATCH 5/6] fit_to_token_size(): support of min_documents: int
 argument

---
 microcore/embedding_db/__init__.py              |  8 +++++++-
 microcore/tokenizing.py                         |  3 +++
 tests/basic/test_fit_vector_search_to_tokens.py | 10 ++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/microcore/embedding_db/__init__.py b/microcore/embedding_db/__init__.py
index dbc9a0a..1162ede 100644
--- a/microcore/embedding_db/__init__.py
+++ b/microcore/embedding_db/__init__.py
@@ -12,12 +12,18 @@ class SearchResults(list):
     def fit_to_token_size(
         self,
         max_tokens: int,
+        min_documents: int = None,
         for_model: str = None,
         encoding: str | tiktoken.Encoding = None,
         verbose=True
     ):
         from ..tokenizing import fit_to_token_size
-        records, removed = fit_to_token_size(self, max_tokens, for_model, encoding)
+        records, removed = fit_to_token_size(
+            self, max_tokens=max_tokens,
+            min_documents=min_documents,
+            for_model=for_model,
+            encoding=encoding
+        )
         if verbose and len(records) < len(self):
             logging.info(
                 "For fitting %d records to %d tokens, %d records were removed",
diff --git a/microcore/tokenizing.py b/microcore/tokenizing.py
index 21d0d74..50fb1fc 100644
--- a/microcore/tokenizing.py
+++ b/microcore/tokenizing.py
@@ -57,6 +57,7 @@ def num_tokens_from_string(
 def fit_to_token_size(
     docs: list[str],
     max_tokens: int,
+    min_documents: int = None,
     for_model: str = None,
     encoding: str | tiktoken.Encoding = None,
 ) -> tuple[list[str], int]:
@@ -68,6 +69,8 @@ def fit_to_token_size(
     tot_size = 0
     for i, doc in enumerate(docs):
         tot_size += num_tokens_from_string(doc, encoding=encoding)
+        if min_documents and i < min_documents:
+            continue
         if tot_size > max_tokens:
             result = docs[:i]
             return result, len(docs) - len(result)
diff --git a/tests/basic/test_fit_vector_search_to_tokens.py b/tests/basic/test_fit_vector_search_to_tokens.py
index a09c5ee..190b6b8 100644
--- a/tests/basic/test_fit_vector_search_to_tokens.py
+++ b/tests/basic/test_fit_vector_search_to_tokens.py
@@ -23,6 +23,16 @@ def test_fit_vector_search_to_tokens():
     assert fres[0].num_tokens() == 1


+def test_fit_vector_search_to_tokens_min_docs():
+    mc.texts.clear("test_collection")
+    raw_items = [str(i) for i in range(10)]
+    mc.texts.save_many("test_collection", raw_items)
+    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(3, 4)
+    assert len(res) == 4
+    res = mc.texts.search("test_collection", "qwe", n_results=10).fit_to_token_size(5, 3)
+    assert len(res) == 5
+
+
 def test_num_tokens():
     assert SearchResult("apple pineapple orange").num_tokens(encoding='cl100k_base') >= 3
     assert SearchResult("Hi").num_tokens(for_model='gpt-4') <= 2
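Usage sketch for the min_documents argument added in PATCH 5/6: a hedged
example assuming a configured environment with an embedding DB; the collection
name, query and budgets are illustrative, not taken from the patches:

    import microcore as mc

    results = mc.texts.search("docs", "how to configure logging", n_results=20)

    # Results come back ordered by distance; trimming keeps the closest
    # prefix that fits the budget, but the first 3 documents are always
    # kept, even if they alone exceed max_tokens.
    context = results.fit_to_token_size(
        2000,                    # max_tokens
        min_documents=3,
        encoding="cl100k_base",  # optional; otherwise resolved from config/model
    )
    # With verbose=True (default), trimming is reported via logging.info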
From 2e4c3656c4f98973e0b3503fff44f6374573525d Mon Sep 17 00:00:00 2001
From: Nayjest
Date: Wed, 31 Jul 2024 19:58:34 +0200
Subject: [PATCH 6/6] CR fix

---
 microcore/embedding_db/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/microcore/embedding_db/__init__.py b/microcore/embedding_db/__init__.py
index 1162ede..c666a52 100644
--- a/microcore/embedding_db/__init__.py
+++ b/microcore/embedding_db/__init__.py
@@ -24,7 +24,7 @@ def fit_to_token_size(
             for_model=for_model,
             encoding=encoding
         )
-        if verbose and len(records) < len(self):
+        if verbose and removed:
             logging.info(
                 "For fitting %d records to %d tokens, %d records were removed",
                 len(self),