experimental: Cohere compatible endpoints. (#644)
* feat: add generate endpoint

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: update generation

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* fix(cohere): generate endpoints

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: --wip--

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* feat: update testing clients and chat implementation

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>

* chore: disable schemas for easter eggs

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
aarnphm authored Nov 14, 2023
1 parent 0bf6ec7 commit b0ab8cc
Showing 8 changed files with 638 additions and 16 deletions.
40 changes: 40 additions & 0 deletions examples/cohere_client.py
@@ -0,0 +1,40 @@
import os

import cohere
from cohere.responses.chat import StreamTextGeneration

co = cohere.Client(api_key='na', api_url=os.getenv('OPENLLM_ENDPOINT', 'http://localhost:3000') + '/cohere')

generation = co.generate(prompt='Write me a tag line for an ice cream shop.')
print(generation.generations[0].text)

for it in co.generate(prompt='Write me a tag line for an ice cream shop.', stream=True):
  print(it.text, flush=True, end='')

response = co.chat(
  message="What is Epicurus's philosophy of life?",
  temperature=0.6,
  chat_history=[
    {'role': 'User', 'message': 'What is the meaning of life?'},
    {
      'role': 'Chatbot',
      'message': "Many thinkers have proposed theories about the meaning of life. \n\nFor instance, Jean-Paul Sartre believed that existence precedes essence, meaning that the essence, or meaning, of one's life arises after birth. Søren Kierkegaard argued that life is full of absurdity and that one must make one's own values in an indifferent world. Arthur Schopenhauer stated that one's life reflects one's will, and that the will (or life) is without aim, irrational, and full of pain. \n\nEarly thinkers such as John Locke, Jean-Jacques Rousseau and Adam Smith believed that humankind should find meaning through labour, property and social contracts. \n\nAnother way of thinking about the meaning of life is to focus on the pursuit of happiness or pleasure. Aristippus of Cyrene, a student of Socrates, founded an early Socratic school that emphasised one aspect of Socrates's teachings: that happiness is the end goal of moral action and that pleasure is the supreme good. Epicurus taught that the pursuit of modest pleasures was the greatest good, as it leads to tranquility, freedom from fear and absence of bodily pain. \n\nUltimately, the meaning of life is a subjective concept and what provides life with meaning differs for each individual.",
    },
  ],
)
print(response)

for it in co.chat(
  message="What is Epicurus's philosophy of life?",
  temperature=0.6,
  chat_history=[
    {'role': 'User', 'message': 'What is the meaning of life?'},
    {
      'role': 'Chatbot',
      'message': "Many thinkers have proposed theories about the meaning of life. \n\nFor instance, Jean-Paul Sartre believed that existence precedes essence, meaning that the essence, or meaning, of one's life arises after birth. Søren Kierkegaard argued that life is full of absurdity and that one must make one's own values in an indifferent world. Arthur Schopenhauer stated that one's life reflects one's will, and that the will (or life) is without aim, irrational, and full of pain. \n\nEarly thinkers such as John Locke, Jean-Jacques Rousseau and Adam Smith believed that humankind should find meaning through labour, property and social contracts. \n\nAnother way of thinking about the meaning of life is to focus on the pursuit of happiness or pleasure. Aristippus of Cyrene, a student of Socrates, founded an early Socratic school that emphasised one aspect of Socrates's teachings: that happiness is the end goal of moral action and that pleasure is the supreme good. Epicurus taught that the pursuit of modest pleasures was the greatest good, as it leads to tranquility, freedom from fear and absence of bodily pain. \n\nUltimately, the meaning of life is a subjective concept and what provides life with meaning differs for each individual.",
    },
  ],
  stream=True,
):
  if isinstance(it, StreamTextGeneration):
    print(it.text, flush=True, end='')
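
Note: the example above drives the new endpoints through Cohere's official Python SDK by pointing api_url at the server's /cohere mount. The same mount can also be exercised without the SDK; the sketch below assumes the mounted routes mirror Cohere's public REST layout (POST /cohere/v1/generate), which is an assumption rather than something this diff pins down.

# Hedged sketch: call the Cohere-compatible mount over plain HTTP.
# The '/cohere/v1/generate' path and response shape are assumptions based on
# Cohere's public API layout, not confirmed by this diff.
import requests

resp = requests.post(
  'http://localhost:3000/cohere/v1/generate',
  json={'prompt': 'Write me a tag line for an ice cream shop.', 'max_tokens': 64, 'stream': False},
  timeout=30,
)
resp.raise_for_status()
print(resp.json())
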
43 changes: 41 additions & 2 deletions openllm-core/src/openllm_core/_configuration.py
@@ -1,6 +1,7 @@
# mypy: disable-error-code="attr-defined,no-untyped-call,type-var,operator,arg-type,no-redef,misc"
from __future__ import annotations
import copy
import importlib.util
import logging
import os
import sys
@@ -31,15 +32,17 @@
  Self,
  overload,
)
-from .exceptions import ForbiddenAttributeError
+from .exceptions import ForbiddenAttributeError, MissingDependencyError
from .utils import LazyLoader, ReprMixin, codegen, converter, dantic, field_env_key, first_not_none, lenient_issubclass
from .utils.peft import PEFT_TASK_TYPE_TARGET_MAPPING, FineTuneConfig

if t.TYPE_CHECKING:
  import click
  import transformers
  import vllm
  from attrs import AttrsInstance

  from openllm.protocol.cohere import CohereChatRequest, CohereGenerateRequest
  from openllm.protocol.openai import ChatCompletionRequest, CompletionRequest
else:
  vllm = LazyLoader('vllm', globals(), 'vllm')
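
The TYPE_CHECKING/LazyLoader split above keeps heavy dependencies such as vllm visible to type checkers while deferring the real import to first use at runtime. A minimal sketch of that pattern (not the actual openllm_core.utils.LazyLoader implementation):

# Minimal module-proxy sketch: the real import happens on first attribute access.
import importlib
import types

class _LazyModule(types.ModuleType):
  def __init__(self, name: str, parent_globals: dict, import_path: str) -> None:
    super().__init__(name)
    self._parent_globals = parent_globals
    self._import_path = import_path

  def __getattr__(self, item: str):
    module = importlib.import_module(self._import_path)
    self._parent_globals[self.__name__] = module  # swap the proxy out after loading
    return getattr(module, item)

# vllm = _LazyModule('vllm', globals(), 'vllm')  # same call shape as the line above
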
@@ -1460,7 +1463,28 @@ def to_generation_config(self, return_as_dict: bool = False) -> transformers.Gen
  def to_sampling_config(self) -> vllm.SamplingParams:
    return self.sampling_config.build()

-  def with_openai_request(self, request: ChatCompletionRequest | CompletionRequest) -> dict[str, t.Any]:
+  @overload
+  def with_request(self, request: ChatCompletionRequest | CompletionRequest) -> dict[str, t.Any]: ...

+  @overload
+  def with_request(self, request: CohereChatRequest | CohereGenerateRequest) -> dict[str, t.Any]: ...

+  def with_request(self, request: AttrsInstance) -> dict[str, t.Any]:
+    if importlib.util.find_spec('openllm') is None:
+      raise MissingDependencyError(
+        "'openllm' is required to use 'with_request'. Make sure to install with 'pip install openllm'."
+      )
+    from openllm.protocol.cohere import CohereChatRequest, CohereGenerateRequest
+    from openllm.protocol.openai import ChatCompletionRequest, CompletionRequest

+    if isinstance(request, (ChatCompletionRequest, CompletionRequest)):
+      return self._with_openai_request(request)
+    elif isinstance(request, (CohereChatRequest, CohereGenerateRequest)):
+      return self._with_cohere_request(request)
+    else:
+      raise TypeError(f'Unknown request type {type(request)}')

+  def _with_openai_request(self, request: ChatCompletionRequest | CompletionRequest) -> dict[str, t.Any]:
    d = dict(
      temperature=first_not_none(request.temperature, self['temperature']),
      top_p=first_not_none(request.top_p, self['top_p']),
@@ -1476,6 +1500,21 @@ def with_openai_request(self, request: ChatCompletionRequest | CompletionRequest
    d['logprobs'] = first_not_none(request.logprobs, default=self['logprobs'])
    return d

+  def _with_cohere_request(self, request: CohereGenerateRequest | CohereChatRequest) -> dict[str, t.Any]:
+    d = dict(
+      max_new_tokens=first_not_none(request.max_tokens, default=self['max_new_tokens']),
+      temperature=first_not_none(request.temperature, default=self['temperature']),
+      top_k=first_not_none(request.k, default=self['top_k']),
+      top_p=first_not_none(request.p, default=self['top_p']),
+    )
+    if hasattr(request, 'num_generations'):
+      d['n'] = first_not_none(request.num_generations, default=self['n'])
+    if hasattr(request, 'frequency_penalty'):
+      d['frequency_penalty'] = first_not_none(request.frequency_penalty, default=self['frequency_penalty'])
+    if hasattr(request, 'presence_penalty'):
+      d['presence_penalty'] = first_not_none(request.presence_penalty, default=self['presence_penalty'])
+    return d

  @classmethod
  def to_click_options(cls, f: AnyCallable) -> click.Command:
    """Convert current configuration to click options.
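Both _with_openai_request and _with_cohere_request above reduce to the same merge rule: a value set on the request wins, otherwise the config default fills in. A self-contained sketch of that behaviour (the real first_not_none lives in openllm_core.utils; this reimplementation is an assumption about its semantics):

import typing as t

def first_not_none(*args: t.Any, default: t.Any = None) -> t.Any:
  # return the first non-None positional value, else the default
  return next((arg for arg in args if arg is not None), default)

config = {'temperature': 0.9, 'top_k': 50, 'top_p': 1.0, 'max_new_tokens': 256}
request = {'temperature': 0.2, 'k': None, 'p': 0.75, 'max_tokens': None}  # Cohere-style field names

merged = dict(
  max_new_tokens=first_not_none(request['max_tokens'], default=config['max_new_tokens']),
  temperature=first_not_none(request['temperature'], default=config['temperature']),
  top_k=first_not_none(request['k'], default=config['top_k']),
  top_p=first_not_none(request['p'], default=config['top_p']),
)
print(merged)  # {'max_new_tokens': 256, 'temperature': 0.2, 'top_k': 50, 'top_p': 0.75}
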
15 changes: 11 additions & 4 deletions openllm-python/src/openllm/entrypoints/__init__.py
@@ -8,21 +8,28 @@
"""

from __future__ import annotations
import importlib
import typing as t

from openllm_core.utils import LazyModule

from . import hf as hf, openai as openai

if t.TYPE_CHECKING:
  import bentoml
  import openllm

-_import_structure: dict[str, list[str]] = {'openai': [], 'hf': []}

+class IntegrationModule(t.Protocol):
+  def mount_to_svc(self, svc: bentoml.Service, llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Service: ...


+_import_structure: dict[str, list[str]] = {'openai': [], 'hf': [], 'cohere': []}


def mount_entrypoints(svc: bentoml.Service, llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Service:
-  return openai.mount_to_svc(hf.mount_to_svc(svc, llm), llm)
+  for module_name in _import_structure:
+    module = t.cast(IntegrationModule, importlib.import_module(f'.{module_name}', __name__))
+    svc = module.mount_to_svc(svc, llm)
+  return svc


__lazy = LazyModule(
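mount_entrypoints now discovers integrations by iterating _import_structure, so adding a backend is a matter of dropping in a module that satisfies the IntegrationModule protocol. A hedged sketch of the expected module surface (the module name and the ASGI wiring in the comments are illustrative, not taken from this diff):

# entrypoints/example_backend.py -- hypothetical module name
from __future__ import annotations
import typing as t

import bentoml
import openllm

def mount_to_svc(svc: bentoml.Service, llm: openllm.LLM[t.Any, t.Any]) -> bentoml.Service:
  # Build an ASGI app exposing this backend's routes, then mount it, e.g.:
  #   svc.mount_asgi_app(app, path='/example')
  # Returning the service keeps the chaining loop in mount_entrypoints working.
  return svc
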
70 changes: 66 additions & 4 deletions openllm-python/src/openllm/entrypoints/_openapi.py
@@ -396,7 +396,7 @@
summary: Describes a model offering that can be used with the API.
tags:
- HF
-x-bentoml-name: adapters_map
+x-bentoml-name: hf_adapters
responses:
  200:
    description: Return list of LoRA adapters.
@@ -416,6 +416,65 @@
          $ref: '#/components/schemas/HFErrorResponse'
    description: Not Found
"""
COHERE_GENERATE_SCHEMA = """\
---
consumes:
- application/json
description: >-
  Given a prompt, the model will return one or more predicted completions, and
  can also return the probabilities of alternative tokens at each position.
operationId: cohere__generate
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_generate
summary: Creates a completion for the provided prompt and parameters.
requestBody:
  required: true
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/CohereGenerateRequest'
      examples:
        one-shot:
          summary: One-shot input example
          value:
            prompt: This is a test
            max_tokens: 256
            temperature: 0.7
            p: 0.43
            k: 12
            num_generations: 2
            stream: false
        streaming:
          summary: Streaming input example
          value:
            prompt: This is a test
            max_tokens: 256
            temperature: 0.7
            p: 0.43
            k: 12
            num_generations: 2
            stream: true
            stop_sequences:
            - "\\n"
            - "<|endoftext|>"
"""
COHERE_CHAT_SCHEMA = """\
---
consumes:
- application/json
description: >-
  Given a list of messages comprising a conversation, the model will return a response.
operationId: cohere__chat
produces:
- application/json
tags:
- Cohere
x-bentoml-name: cohere_chat
summary: Creates a model response for the given chat conversation.
"""

_SCHEMAS = {k[:-7].lower(): v for k, v in locals().items() if k.endswith('_SCHEMA')}
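
The _SCHEMAS comprehension keys each docstring by stripping the 7-character '_SCHEMA' suffix and lowercasing, so COHERE_GENERATE_SCHEMA registers as 'cohere_generate'. A quick illustration of the naming rule:

names = ['COHERE_GENERATE_SCHEMA', 'COHERE_CHAT_SCHEMA']
print({k: k[:-7].lower() for k in names})
# {'COHERE_GENERATE_SCHEMA': 'cohere_generate', 'COHERE_CHAT_SCHEMA': 'cohere_chat'}
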

@@ -485,12 +544,15 @@ def get_schema(self, routes: list[BaseRoute], mount_path: str | None = None) ->


def get_generator(
-  title: str, components: list[type[AttrsInstance]] | None = None, tags: list[dict[str, t.Any]] | None = None
+  title: str,
+  components: list[type[AttrsInstance]] | None = None,
+  tags: list[dict[str, t.Any]] | None = None,
+  inject: bool = True,
) -> OpenLLMSchemaGenerator:
  base_schema: dict[str, t.Any] = dict(info={'title': title, 'version': API_VERSION}, version=OPENAPI_VERSION)
-  if components:
+  if components and inject:
    base_schema['components'] = {'schemas': {c.__name__: component_schema_generator(c) for c in components}}
-  if tags is not None and tags:
+  if tags is not None and tags and inject:
    base_schema['tags'] = tags
  return OpenLLMSchemaGenerator(base_schema)
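
The new inject flag lets a caller build a schema generator without writing components or tags into the base OpenAPI document; per the commit message, this backs the "disable schemas for easter eggs" change. A hypothetical call (passing CohereGenerateRequest as a component here is an assumption, not shown in this hunk):

# Keep the generator but leave component schemas/tags out of the rendered spec.
generator = get_generator(
  'cohere',
  components=[CohereGenerateRequest],  # assumed component
  tags=[{'name': 'Cohere', 'description': 'Cohere-compatible endpoints'}],
  inject=False,
)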

