Use Spice #543

Merged: 32 commits, Apr 3, 2024

Commits

9eade0e: add spice to reqs (biobootloader, Mar 19, 2024)
8f229d6: anthropic working on mentat (biobootloader, Mar 22, 2024)
18079f9: update check model (biobootloader, Mar 22, 2024)
51014d6: logging callback (biobootloader, Mar 22, 2024)
f5588f1: update (biobootloader, Mar 23, 2024)
2d111cf: interrupted logging (biobootloader, Mar 27, 2024)
d474ce2: spice errors (biobootloader, Mar 30, 2024)
d0c46c6: Merge branch 'main' into integrate-spice (biobootloader, Mar 30, 2024)
443fba1: spice embeddings (biobootloader, Mar 30, 2024)
d739320: spicewhisper (biobootloader, Mar 31, 2024)
f649ad7: remove (biobootloader, Mar 31, 2024)
e7d7844: comment (biobootloader, Mar 31, 2024)
5f1fe25: embedding cost logging (biobootloader, Mar 31, 2024)
ae3d9e1: cleanup (biobootloader, Mar 31, 2024)
f766d80: fixed (biobootloader, Mar 31, 2024)
528e2b9: ruff (biobootloader, Mar 31, 2024)
eeae387: convert spice error (biobootloader, Apr 1, 2024)
f9446b9: typing (biobootloader, Apr 1, 2024)
ebc9abf: get spice from pypi (biobootloader, Apr 1, 2024)
6026d0d: spice version upgrade (biobootloader, Apr 2, 2024)
da22b60: hmm (biobootloader, Apr 2, 2024)
91bc6eb: update spice (biobootloader, Apr 2, 2024)
93f192d: spice (biobootloader, Apr 2, 2024)
3f9a9e7: apache-2.0 (biobootloader, Apr 2, 2024)
36c2a5a: mocking spice response (biobootloader, Apr 3, 2024)
d727903: Merge branch 'main' into integrate-spice (biobootloader, Apr 3, 2024)
52fd9ec: fix tests (biobootloader, Apr 3, 2024)
c2bd2fa: anotha one (biobootloader, Apr 3, 2024)
4bf16e1: ruff ruff (biobootloader, Apr 3, 2024)
798e42c: format (biobootloader, Apr 3, 2024)
acb114e: docs (biobootloader, Apr 3, 2024)
4e40c30: oops (biobootloader, Apr 3, 2024)

4 changes: 2 additions & 2 deletions mentat/agent_handler.py
@@ -48,7 +48,7 @@ async def enable_agent_mode(self):
]
model = ctx.config.model
response = await ctx.llm_api_handler.call_llm_api(messages, model, False)
content = response.choices[0].message.content or ""
content = response.text
Contributor: The change to use response.text directly simplifies the code. Ensure that all instances where the LLM API response is processed are updated to this simpler approach.

Contributor: Switching to response.text for processing the LLM response simplifies the code. Ensure that this change is consistently applied across all similar instances in the codebase.

paths = [Path(path) for path in content.strip().split("\n") if Path(path).exists()]
self.agent_file_message = ""
@@ -87,7 +87,7 @@ async def _determine_commands(self) -> List[str]:
ctx.stream.send(f"Error accessing OpenAI API: {e.message}", style="error")
return []

content = response.choices[0].message.content or ""
content = response.text

messages.append(ChatCompletionAssistantMessageParam(role="assistant", content=content))
parsed_llm_response = await ctx.config.parser.parse_llm_response(content)
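
For reference, a minimal sketch of how the two response shapes in this PR are consumed, assuming SpiceResponse exposes .text for non-streaming calls and .stream() as an async iterator of text chunks, as the diffs here and in conversation.py below suggest. FakeSpiceResponse is a hypothetical stand-in for illustration only, not part of Spice or this PR.

import asyncio
from typing import AsyncIterator


class FakeSpiceResponse:
    """Hypothetical stand-in for spice.SpiceResponse, used only for illustration."""

    def __init__(self, chunks: list[str]):
        self._chunks = chunks
        # Non-streaming callers read the full text in one go.
        self.text = "".join(chunks)

    async def stream(self) -> AsyncIterator[str]:
        # Streaming callers iterate text chunks as they arrive.
        for chunk in self._chunks:
            yield chunk


async def main() -> None:
    response = FakeSpiceResponse(["first chunk\n", "second chunk"])
    # agent_handler.py, llm_feature_filter.py, revisor.py pattern (stream=False):
    print(response.text)
    # conversation.py pattern (stream=True), fed into add_newline() and the parser:
    async for chunk in response.stream():
        print(repr(chunk))


asyncio.run(main())
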
2 changes: 1 addition & 1 deletion mentat/conversation.py
@@ -227,7 +227,7 @@ async def _stream_model_response(
# TODO: control-c doesn't make sense for VSCode; send information in client agnostic way
stream.send("Streaming... use control-c to interrupt the model at any point\n")
async with stream.interrupt_catcher(parser.shutdown):
parsed_llm_response = await parser.stream_and_parse_llm_response(add_newline(response))
parsed_llm_response = await parser.stream_and_parse_llm_response(add_newline(response.stream()))
granawkins marked this conversation as resolved.
# Sampler and History require previous_file_lines
for file_edit in parsed_llm_response.file_edits:
file_edit.previous_file_lines = code_file_manager.file_lines.get(file_edit.file_path, [])
2 changes: 1 addition & 1 deletion mentat/feature_filters/llm_feature_filter.py
@@ -96,7 +96,7 @@ async def filter(
stream=False,
response_format=ResponseFormat(type="json_object"),
)
message = (llm_response.choices[0].message.content) or ""
message = llm_response.text
Contributor: Given the removal of detailed LLM API call logging, consider implementing a new mechanism to log or monitor these calls for debugging and performance analysis.

tokens = prompt_tokens(messages, model)
response_tokens = count_tokens(message, model, full_message=True)
cost_tracker.log_api_call_stats(
83 changes: 24 additions & 59 deletions mentat/llm_api_handler.py
@@ -44,6 +44,7 @@
)
from openai.types.chat.completion_create_params import ResponseFormat
from PIL import Image
from spice import Spice, SpiceResponse
Contributor: It seems like the import for Spice and SpiceResponse is duplicated. It's already imported at the top of the file, so this additional import statement can be removed.

from mentat.errors import MentatError, ReturnToUser, UserError
from mentat.session_context import SESSION_CONTEXT
@@ -97,11 +98,8 @@ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:


# Ensures that each chunk will have at most one newline character
def chunk_to_lines(chunk: ChatCompletionChunk) -> list[str]:
content = None
if len(chunk.choices) > 0:
content = chunk.choices[0].delta.content
return ("" if content is None else content).splitlines(keepends=True)
def chunk_to_lines(content: str) -> list[str]:
granawkins marked this conversation as resolved.
return content.splitlines(keepends=True)
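
A quick illustration (not from the PR) of why the simplified one-liner still satisfies the "at most one newline character" comment above: splitlines(keepends=True) keeps each trailing newline attached to its own piece.

# Illustration only:
assert "foo\nbar\nbaz".splitlines(keepends=True) == ["foo\n", "bar\n", "baz"]
assert "".splitlines(keepends=True) == []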


def get_encoding_for_model(model: str) -> tiktoken.Encoding:
@@ -264,6 +262,9 @@ def __contains__(self, key: object) -> bool:
"gpt-3.5-turbo-16k-0613": Model("gpt-3.5-turbo-16k-0613", 16385, 0.003, 0.004),
"gpt-3.5-turbo-0301": Model("gpt-3.5-turbo-0301", 4096, 0.0015, 0.002),
"text-embedding-ada-002": Model("text-embedding-ada-002", 8191, 0.0001, 0, embedding_model=True),
"claude-3-opus-20240229": Model("claude-3-opus-20240229", 200000, 0.015, 0.075),
"claude-3-sonnet-20240229": Model("claude-3-sonnet-20240229", 200000, 0.003, 0.015),
"claude-3-haiku-20240307": Model("claude-3-haiku-20240307", 200000, 0.00025, 0.00125),
}
)
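
The new Claude rows reuse the shape of the existing entries, which appear to be Model(name, context size, input price, output price). Assuming, as the neighbouring GPT rows imply, that prices are per 1,000 tokens, a rough estimate looks like this; the helper name is illustrative and not from the codebase.

def estimate_cost_usd(prompt_tokens: int, completion_tokens: int,
                      input_per_1k: float, output_per_1k: float) -> float:
    # e.g. claude-3-opus-20240229: input_per_1k=0.015, output_per_1k=0.075 (assumed per 1k tokens)
    return prompt_tokens / 1000 * input_per_1k + completion_tokens / 1000 * output_per_1k

# 10,000 prompt tokens + 1,000 completion tokens on Opus: 0.15 + 0.075 = 0.225 USD
print(estimate_cost_usd(10_000, 1_000, 0.015, 0.075))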

@@ -362,39 +363,21 @@ def initialize_client(self):
self.async_client = AsyncOpenAI(api_key=key, base_url=base_url)
self.sync_client = OpenAI(api_key=key, base_url=base_url)

self.spice_client = Spice(provider="anthropic")

try:
self.async_client.models.list() # Test the key
except AuthenticationError as e:
raise UserError(f"API gave an Authentication Error:\n{e}")

@overload
async def call_llm_api(
self,
messages: list[ChatCompletionMessageParam],
model: str,
stream: Literal[True],
response_format: ResponseFormat = ResponseFormat(type="text"),
) -> AsyncIterator[ChatCompletionChunk]:
...

@overload
async def call_llm_api(
self,
messages: list[ChatCompletionMessageParam],
model: str,
stream: Literal[False],
response_format: ResponseFormat = ResponseFormat(type="text"),
) -> ChatCompletion:
...

@api_guard
async def call_llm_api(
self,
messages: list[ChatCompletionMessageParam],
model: str,
stream: bool,
response_format: ResponseFormat = ResponseFormat(type="text"),
) -> ChatCompletion | AsyncIterator[ChatCompletionChunk]:
) -> SpiceResponse:
Contributor: It's recommended to add a comment explaining why response_format is being checked and potentially transformed here for clarity and future maintainability.

session_context = SESSION_CONTEXT.get()
config = session_context.config
cost_tracker = session_context.cost_tracker
@@ -409,52 +392,34 @@ async def call_llm_api(
start_time = default_timer()
with sentry_sdk.start_span(description="LLM Call") as span:
span.set_tag("model", model)

Contributor: The switch to using the Spice client for LLM API calls is a significant change. Ensure that the Spice client is correctly configured to handle all the features and requirements of the previous OpenAI client, including error handling, rate limiting, and response parsing.

# TODO: handle this for gpt-4-vision-preview in spice?
# OpenAI's API is bugged; when gpt-4-vision-preview is used, including the response format
# at all returns a 400 error. Additionally, gpt-4-vision-preview has a max response of 30 tokens by default.
# Until this is fixed, we have to use this workaround.
if model == "gpt-4-vision-preview":
response = await self.async_client.chat.completions.create(
model=model,
messages=messages,
temperature=config.temperature,
stream=stream,
max_tokens=4096,
)
else:
# This makes it slightly easier when using the litellm proxy or models outside of OpenAI
if response_format["type"] == "text":
response = await self.async_client.chat.completions.create(
model=model,
messages=messages,
temperature=config.temperature,
stream=stream,
max_tokens=4096,
)
else:
response = await self.async_client.chat.completions.create(
model=model,
messages=messages,
temperature=config.temperature,
stream=stream,
response_format=response_format,
max_tokens=4096,
)

# We have to cast response since pyright isn't smart enough to connect
# the dots between stream and the overloaded create function
response = await self.spice_client.call_llm(
Contributor: Given the removal of the detailed OpenAI client configuration and the switch to Spice, ensure that all necessary Spice client configurations are properly set, especially those related to API keys and endpoints.

Contributor: Consider adding a detailed comment explaining the workaround for the gpt-4-vision-preview model and its specific issues, especially if this workaround is still relevant with the Spice client.

model=model,
messages=messages,
stream=stream,
temperature=config.temperature,
response_format=response_format,
)

Contributor: Ensure comprehensive testing of the Spice client's integration, focusing on error handling, rate limiting, and response parsing to match or exceed the functionality previously provided by the OpenAI client.

if not stream:
time_elapsed = default_timer() - start_time
response_tokens = count_tokens(
cast(ChatCompletion, response).choices[0].message.content or "",
response.text,
model,
full_message=False,
)
cost_tracker.log_api_call_stats(tokens, response_tokens, model, time_elapsed)
else:
cost_tracker.last_api_call = ""
response = cost_tracker.response_logger_wrapper(
tokens, cast(AsyncStream[ChatCompletionChunk], response), model
)
# TODO: replace this tracking for stream
# response = cost_tracker.response_logger_wrapper(
# tokens, cast(AsyncStream[ChatCompletionChunk], response), model
# )

return response

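One of the commits above is "mocking spice response"; a hedged sketch of how code that now expects a SpiceResponse-like return value from call_llm_api could be tested with a plain stub exposing .text, without touching the real Spice client. The stub, helper, and test names are illustrative, not from this PR.

import asyncio
from types import SimpleNamespace
from unittest.mock import AsyncMock


async def determine_content(llm_api_handler, messages, model):
    # Hypothetical caller mirroring the non-streaming pattern used in agent_handler/revisor.
    response = await llm_api_handler.call_llm_api(messages, model, False)
    return response.text


def test_determine_content_uses_response_text():
    fake_response = SimpleNamespace(text="ok")  # stands in for a SpiceResponse
    handler = SimpleNamespace(call_llm_api=AsyncMock(return_value=fake_response))
    result = asyncio.run(determine_content(handler, [], "claude-3-opus-20240229"))
    assert result == "ok"
    handler.call_llm_api.assert_awaited_once()


test_determine_content_uses_response_text()
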
2 changes: 1 addition & 1 deletion mentat/revisor/revisor.py
@@ -71,7 +71,7 @@ async def revise_edit(file_edit: FileEdit):
style="info",
)
response = await ctx.llm_api_handler.call_llm_api(messages, model=ctx.config.model, stream=False)
message = response.choices[0].message.content or ""
message = response.text
Contributor: After the change to use response.text, verify that the revision logic for edits still functions as intended, particularly in terms of parsing and applying the suggested revisions.

messages.append(ChatCompletionAssistantMessageParam(content=message, role="assistant"))
ctx.conversation.add_transcript_message(
ModelMessage(message=message, prior_messages=messages, message_type="revisor")
10 changes: 5 additions & 5 deletions mentat/splash_messages.py
@@ -72,14 +72,14 @@ def check_version():
def check_model():
ctx = SESSION_CONTEXT.get()
model = ctx.config.model
if "gpt-4" not in model:
if "gpt-4" not in model and "opus" not in model:
Contributor: The update to the model compatibility warning is important for user guidance. Ensure that this message is clear and accurately reflects the models supported by Mentat.

Contributor: The updated warnings for model compatibility are crucial for guiding users effectively. Ensure that these messages are displayed prominently in the UI and consider adding a link to documentation or a FAQ section for users who wish to learn more about model compatibility.

Contributor: Given the updated model compatibility warnings, consider adding more detailed guidance or links to documentation within the warning messages to help users understand the implications of using unsupported models.

ctx.stream.send(
"Warning: Mentat has only been tested on GPT-4. You may experience"
" issues with quality. This model may not be able to respond in"
" mentat's edit format.",
"Warning: The only recommended models are GPT-4 and Claude 3 Opus. "
"You may experience issues with quality. This model may not be able to "
"respond in mentat's edit format.",
style="warning",
)
if "gpt-3.5" not in model:
if "gpt-3.5" not in model and "claude-3" not in model:
ctx.stream.send(
"Warning: Mentat does not know how to calculate costs or context" " size for this model.",
style="warning",
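
For reference, the two conditions in check_model after this change, written out as standalone predicates that simply mirror the expressions in the diff (illustrative only, not code from the PR):

def quality_warning_needed(model: str) -> bool:
    # Mirrors: if "gpt-4" not in model and "opus" not in model
    return "gpt-4" not in model and "opus" not in model


def cost_warning_needed(model: str) -> bool:
    # Mirrors: if "gpt-3.5" not in model and "claude-3" not in model
    return "gpt-3.5" not in model and "claude-3" not in model


assert not quality_warning_needed("claude-3-opus-20240229")  # recommended model, no quality warning
assert quality_warning_needed("claude-3-haiku-20240307")     # quality warning still shown
assert not cost_warning_needed("claude-3-haiku-20240307")    # costs/context now known for claude-3
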
19 changes: 3 additions & 16 deletions mentat/utils.py
@@ -89,9 +89,9 @@ def create_viewer(transcripts: list[Transcript]) -> Path:


async def add_newline(
iterator: AsyncIterator[ChatCompletionChunk],
iterator: AsyncIterator[str],
Contributor: The add_newline function has been modified to work with a string directly instead of ChatCompletionChunk. This change simplifies the function but requires careful review to ensure it's correctly integrated with the rest of the system, especially where streaming responses are handled.

Contributor: After modifying the add_newline function to work with strings directly, verify its integration, especially in scenarios involving streaming responses, to ensure it behaves as expected.

Contributor: The change in add_newline to work with strings directly is significant. Ensure that this function is tested in all contexts where it's used to verify its correct operation.

role: Optional[Literal["system", "user", "assistant", "tool"]] = "assistant",
) -> AsyncIterator[ChatCompletionChunk]:
) -> AsyncIterator[str]:
"""
The model often doesn't end its responses in a newline;
adding a newline makes it significantly easier for us to parse.
Expand All @@ -101,20 +101,7 @@ async def add_newline(
last_chunk = chunk
yield chunk
if last_chunk is not None:
yield ChatCompletionChunk(
id=last_chunk.id,
choices=[
Choice(
delta=ChoiceDelta(content="\n", role=role),
finish_reason=last_chunk.choices[0].finish_reason,
index=0,
)
],
created=last_chunk.created,
model=last_chunk.model,
object=last_chunk.object,
system_fingerprint=last_chunk.system_fingerprint,
)
yield "\n"
Contributor: Consider using a more robust method for determining if a file is text-encoded. The current approach might not be efficient for large files.
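
A minimal sketch of the kind of test the comments above ask for, assuming the string-based add_newline shown in this diff; the helper names are illustrative and not from the codebase.

import asyncio
from typing import AsyncIterator

from mentat.utils import add_newline  # the string-based version shown in this diff


async def _collect(it: AsyncIterator[str]) -> list[str]:
    return [chunk async for chunk in it]


async def _fake_stream() -> AsyncIterator[str]:
    yield "line one\n"
    yield "line two"  # models often omit the trailing newline


def test_add_newline_appends_trailing_newline():
    chunks = asyncio.run(_collect(add_newline(_fake_stream())))
    assert chunks == ["line one\n", "line two", "\n"]


test_add_newline_appends_trailing_newline()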



def get_relative_path(path: Path, target: Path) -> Path:
1 change: 1 addition & 0 deletions requirements.txt
@@ -29,3 +29,4 @@ typing_extensions==4.8.0
tqdm==4.66.1
webdriver_manager==4.0.1
watchfiles==0.21.0
spice @ git+https://github.com/AbanteAI/spice@main
Contributor: Adding spice as a dependency directly from a Git repository can introduce instability, as any changes to the main branch of the repository will immediately affect this project. Consider pinning the dependency to a specific commit or version tag to ensure consistent behavior.

Contributor: Once spice is available on PyPI, update this dependency to use a stable version from PyPI instead of the Git repository to ensure more predictable builds and deployments.
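
For reference, the pinned forms suggested in these comments would look roughly like this in requirements.txt; the commit reference, package name, and version number are placeholders, not actual releases.

# Pin the Git dependency to a specific commit or tag instead of the moving main branch:
spice @ git+https://github.com/AbanteAI/spice@<commit-sha-or-tag>

# Or, once a release is published on PyPI, pin a specific version:
spice==X.Y.Z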