Anthropic bugfix - upgrading sdk, tokenizer usage (#597)
DhruvaBansal00 authored Oct 12, 2023
1 parent 374dc58 commit 86a1e56
Showing 2 changed files with 7 additions and 7 deletions.
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "numpy >= 1.23.0",
     "requests >= 2.27.0",
     "datasets >= 2.7.0",
-    "langchain == 0.0.210",
+    "langchain == 0.0.226",
     "nervaluate >= 0.1.8",
     "pandas >= 1.3.0",
     "scikit-learn >= 1.0.0",
@@ -66,7 +66,7 @@ openai = [
     "tiktoken >= 0.3.3"
 ]
 anthropic = [
-    "anthropic == 0.2.6"
+    "anthropic == 0.3.0"
 ]
 huggingface = [
     "transformers >= 4.25.0",
@@ -88,7 +88,7 @@ all = [
     "pre-commit",
     "openai >= 0.27.4",
     "tiktoken >= 0.3.3",
-    "anthropic == 0.2.6",
+    "anthropic == 0.3.0",
     "transformers >= 4.25.0",
     "google-cloud-aiplatform>=1.25.0",
     "google-search-results>=2.4.2",
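The pins above move the Anthropic SDK from 0.2.6 to 0.3.0 in both the anthropic extra and the all extra, and bump langchain to 0.0.226. A minimal sanity-check sketch, not part of the commit, for confirming the environment picked up the new pin after reinstalling the extras; it assumes the anthropic package exposes __version__ (the 0.3.x releases do):

# Hypothetical check, not from this commit: verify the repinned SDK is
# installed before relying on its private tokenizer module (used below).
import anthropic

assert anthropic.__version__ == "0.3.0", (
    f"expected anthropic 0.3.0, found {anthropic.__version__}"
)
print("anthropic", anthropic.__version__)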
8 changes: 4 additions & 4 deletions src/autolabel/models/anthropic.py
@@ -31,7 +31,7 @@ def __init__(self, config: AutolabelConfig, cache: BaseCache = None) -> None:
 
         try:
             from langchain.chat_models import ChatAnthropic
-            from anthropic import tokenizer
+            from anthropic._tokenizers import sync_get_tokenizer
         except ImportError:
             raise ImportError(
                 "anthropic is required to use the anthropic LLM. Please install it with the following command: pip install 'refuel-autolabel[anthropic]'"
@@ -45,7 +45,7 @@ def __init__(self, config: AutolabelConfig, cache: BaseCache = None) -> None:
         # initialize LLM
         self.llm = ChatAnthropic(model=self.model_name, **self.model_params)
 
-        self.tokenizer = tokenizer
+        self.tokenizer = sync_get_tokenizer()
 
     def _label(self, prompts: List[str]) -> RefuelLLMResult:
         prompts = [[HumanMessage(content=prompt)] for prompt in prompts]
@@ -58,9 +58,9 @@ def _label(self, prompts: List[str]) -> RefuelLLMResult:
             return self._label_individually(prompts)
 
     def get_cost(self, prompt: str, label: Optional[str] = "") -> float:
-        num_prompt_toks = self.tokenizer.count_tokens(prompt)
+        num_prompt_toks = len(self.tokenizer.encode(prompt).ids)
         if label:
-            num_label_toks = self.tokenizer.count_tokens(label)
+            num_label_toks = len(self.tokenizer.encode(label).ids)
         else:
             # get an upper bound
             num_label_toks = self.model_params["max_tokens_to_sample"]
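In anthropic 0.3.0 the module-level anthropic.tokenizer helper (and its count_tokens function) is gone, so the diff above fetches the underlying tokenizer via anthropic._tokenizers.sync_get_tokenizer and counts tokens as len(encode(text).ids); the encode(...).ids access pattern suggests a HuggingFace tokenizers.Tokenizer, so counts stay plain ints and get_cost only changes its counting expression. A minimal standalone sketch of the new counting path, assuming anthropic == 0.3.0 is installed (the count_tokens name and sample string are illustrative, not from the commit):

# Sketch of the migrated token counting (assumes anthropic == 0.3.0).
# sync_get_tokenizer() lives in a private module, so it may move between
# SDK releases -- the hard pin in pyproject.toml guards against that.
from anthropic._tokenizers import sync_get_tokenizer

tokenizer = sync_get_tokenizer()

def count_tokens(text: str) -> int:
    # encode() returns an Encoding object; its .ids field holds the token ids
    return len(tokenizer.encode(text).ids)

print(count_tokens("Hello, Claude!"))  # exact count depends on the vocabulary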
