supports 4 models with basic prompts
NavinKumarMNK committed Apr 11, 2024
1 parent 4b805e0 commit c65b4e4
Showing 16 changed files with 78 additions and 221 deletions.
7 changes: 6 additions & 1 deletion .readthedocs.yaml
@@ -4,7 +4,12 @@ version: 2
mkdocs:
configuration: mkdocs.yml

build:
os: ubuntu-22.04
tools:
python: "3.10"

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
- requirements: docs/requirements.txt
4 changes: 2 additions & 2 deletions README.md
@@ -1,10 +1,10 @@
# AI-Learning Platform

## Documentation
## Documentation [![Documentation Status](https://readthedocs.org/projects/ai-learning-platform/badge/?version=latest)](https://ai-learning-platform.readthedocs.io/en/latest/?badge=latest)
- This project uses `mkdocs` as the documentation service
- serve the document

```bash
pip install mkdocs
mkdocs serve
```
```
2 changes: 2 additions & 0 deletions ml_service/.gitattributes
@@ -0,0 +1,2 @@
*.html binary
*.ipynb binary
3 changes: 2 additions & 1 deletion ml_service/.gitignore
@@ -7,4 +7,5 @@ __pycache__
cudnn.tar.xz
*.tar.gz
core.*
.env
.env
.history
5 changes: 4 additions & 1 deletion ml_service/Makefile
@@ -75,4 +75,7 @@ ray-dev-ssh:
ssh -p $(PORT) root@$(IP)

rm-pycache:
find . -type d -name __pycache__ -exec rm -r {} +
find . -type d -name __pycache__ -exec rm -r {} +

llm_chat:
python3 ./test/llm_client_http.py
14 changes: 7 additions & 7 deletions ml_service/config.yaml
@@ -2,22 +2,22 @@ app:
name: vit-ray

llm:
model_name: Nous-Capybara-34B
model_name: C4AI-35B # supported = [Nous-Capybara-34B, Qwen-32B, Mistral-7B, C4AI-35B]
time_consecutive_res: 0.5
serve_config:
model: /data/nous-34b # path
download_dir: null # path to model
model: /data/c4ai-35b # supported - [/data/nous-34b, /data/qwen-32b, /data/mistral-7b, /data/c4ai-35b]
download_dir: null # download model dir
load_format: auto #safetensors # format of model {auto, pt, dummy, safetensors}
dtype: float16 # data type {auto, float32, float16, bfloat16}
max_model_len: 8192 # max length of model
max_model_len: 16384 # max length of model
worker_use_ray: false # use ray for worker
engine_use_ray: false # use ray for engine
pipeline_parallel_size: 1 # size of pipeline parallel
# pipeline_parallel_size: 1 # size of pipeline parallel
tensor_parallel_size: 4 # size of tensor parallel
# gpu_memory_utilization: 0.95 # gpu memory utilization
# gpu_memory_utilization: 0.9 # gpu memory utilization
enforce_eager: true
disable_custom_all_reduce: True
trust_remote_code: true
# trust_remote_code: true # for cohere models comment it

emb:
serve_config:
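The `serve_config` keys mirror vLLM engine arguments. A minimal sketch of how a block like this is typically consumed, assuming the keys map one-to-one onto vLLM's `AsyncEngineArgs` (an assumption about this repo's loader, not its exact code):

```python
# Hedged sketch: load config.yaml and hand the serve_config block to vLLM.
# Assumes the keys were chosen to match AsyncEngineArgs field names exactly.
import yaml
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

serve_cfg = cfg["llm"]["serve_config"]  # model, dtype, max_model_len, ...
engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**serve_cfg))
```

Keys left commented out, such as `pipeline_parallel_size` and `gpu_memory_utilization`, simply fall back to vLLM's defaults when omitted.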
21 changes: 11 additions & 10 deletions ml_service/config/model/c4ai-35b.yaml
@@ -1,5 +1,5 @@
prompt_format:
system: """<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
system: "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
# System Preamble
@@ -18,22 +18,23 @@ Here is a list of tools that you have available to you:
```python
def internet_search(query: str) -> List[Dict]:
"""Returns a list of relevant document snippets for a textual query retrieved from the internet
'''Returns a list of relevant document snippets for a textual query retrieved from the internet
Args:
query (str): Query to search the internet with
"""
'''
pass
```
```python
def directly_answer() -> List[Dict]:
"""Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
"""
'''Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
'''
pass
```"""
assistant: "{instruction}"
```<|END_OF_TURN_TOKEN|>"
assistant: "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{instruction}"
trailing_assistant: ""
user: "USER: {system}{instruction} ASSISTANT:"
system_in_user: true
default_system_message: "Always follow the rules"
user: "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{instruction}<|END_OF_TURN_TOKEN|>"
system_in_user: false
accept_sys_from_req: false
recursive_sys_prompt: true
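The leading `<BOS_TOKEN>` was dropped from the system template, presumably because the serving tokenizer already inserts it. With `system_in_user: false`, the system preamble is emitted as its own turn rather than folded into the user turn; a rough illustration of how one exchange could be assembled from these templates (the real assembly lives in `ml/llm/prompt_format.py`, this is only a simplified stand-in):

```python
# Simplified stand-in for the Command-R style prompt assembly described above.
system = "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble ...<|END_OF_TURN_TOKEN|>"
user = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{instruction}<|END_OF_TURN_TOKEN|>"
assistant = "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{instruction}"

prompt = (
    system
    + user.format(instruction="Explain backpropagation in two sentences.")
    + assistant.format(instruction="Backpropagation applies the chain rule ...")
)
```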
11 changes: 5 additions & 6 deletions ml_service/config/model/mistral-7b.yaml
@@ -1,11 +1,10 @@
prompt_format:
system: Your are a Assitant who does the job of the Tutor, and you job is to guide user who \
were engineering students in the best possible way. \
system: "Your name is MegAcad and you are an AI Assitant helps in tutoring & guiding undergraduate students \
Think carefully through the topic, step by step in a systematic manner, and allow each step to logically build on the previous one. \
Dont answer to any questions other than studies. Everyone one of your \
response should be in english. use other languages only if its necessary {instruction}"
assistant: " {instruction} </s>"
response should be in english. use other languages only if its necessary.\n"
assistant: "{instruction}"
trailing_assistant: ""
user: "<s>[INST] {system}{instruction} [/INST]"
user: " [INST] {system} {instruction} [/INST]"
system_in_user: true
default_system_message: "You are an assistant."
accept_sys_from_req: false
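Here `system_in_user: true` means the system text fills the `{system}` slot of the user template instead of forming its own turn. A rough sketch of one rendered exchange under this template (a simplified stand-in, not the formatter's exact output):

```python
# Simplified stand-in: the system text is folded into the [INST] block.
system = "Your name is MegAcad and you are an AI assistant that tutors undergraduate students ...\n"
user = " [INST] {system} {instruction} [/INST]"
assistant = "{instruction}"

prompt = (
    user.format(system=system, instruction="What is a binary search tree?")
    + assistant.format(instruction="A binary search tree keeps keys ordered so that ...")
)
```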
12 changes: 12 additions & 0 deletions ml_service/config/model/qwen-32b.yaml
@@ -0,0 +1,12 @@
prompt_format:
system: "<|im_start|>system
Your name is MegAcad and you are an AI Assitant helps in tutoring & guiding undergraduate students \
Think carefully through the topic, step by step in a systematic manner, and allow each step to logically build on the previous one. \
Dont answer to any questions other than studies. Everyone one of your \
response should be in english. Don't use other languages unless it necessary.<|im_end|>"
assistant: "<|im_start|>assistant{instruction}"
trailing_assistant: ""
user: "<|im_start|>user
{instruction}<|im_end|>"
system_in_user: false
accept_sys_from_req: false
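This Qwen template follows the ChatML convention; one rendered exchange would look roughly like the string below (whitespace approximate, since the exact breaks depend on how the YAML scalars fold):

```python
# Approximate ChatML rendering of a single exchange under this template.
prompt = (
    "<|im_start|>system\nYour name is MegAcad ...<|im_end|>"
    "<|im_start|>user\nDefine Big-O notation.<|im_end|>"
    "<|im_start|>assistant" + "Big-O notation gives an upper bound on growth ..."
)
```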
11 changes: 0 additions & 11 deletions ml_service/config/model/yi-34b.yaml

This file was deleted.

11 changes: 0 additions & 11 deletions ml_service/config/ray/ray-serve.yaml
@@ -37,17 +37,6 @@ applications:
# num_cpus: 32
# num_gpus: 4

# - name: stt
# route_prefix: /stt
# import_path: stt_serve:app
#
# deployments:
# - name: STTDeployment
# num_replicas: 1
# max_concurrent_queries: 8
# ray_actor_options:
# num_gpus: 1

- name: emb
route_prefix: /api/v1/embedder
import_path: emb_serve:main
5 changes: 0 additions & 5 deletions ml_service/llm_serve.py
@@ -120,7 +120,6 @@ async def _stream_results(self, output_generator) -> AsyncGenerator[bytes, None]
output_token_count = 0

async for request_output in output_generator:
self.logger.info(f"{request_output = }")
output = request_output.outputs[0]
output_token_count += 1
# Check if one second has passed since last stream
@@ -143,15 +142,13 @@ async def _stream_results(self, output_generator) -> AsyncGenerator[bytes, None]
# Stream any remaining output at the end
if output_token_count:
text_output = output.text[num_returned:]
self.logger.info(f"text returned {text_output}")
response = GenerateResponse(
output=text_output,
prompt_tokens=len(request_output.prompt_token_ids),
output_tokens=output_token_count,
finish_reason=output.finish_reason,
)

self.logger.info(response.json())
yield (response.json() + "\n").encode("utf-8")

async def _abort_request(self, request_id) -> None:
@@ -169,7 +166,6 @@ async def generate(
"""Generate Completion for the requested prompt"""
try:
# either prompt or messages should provided
self.logger.info(f"{request}")
if not request.prompt and not request.messages:
return create_error_response(
status_code=400,
@@ -197,7 +193,6 @@ async def generate(
sampling_params = SamplingParams(**request_dict)
request_id = self._next_request_id()

self.logger.info(f"Prompt Made from user: {prompt}")
output_generator = self.engine.generate(
prompt=prompt,
sampling_params=sampling_params,
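With the per-token logging removed, the endpoint still streams newline-delimited JSON `GenerateResponse` objects. A hedged sketch of a minimal consumer (host, port, and the sampling keys are placeholders and assumptions, not values confirmed by this diff):

```python
# Minimal consumer for the streaming /api/v1/llm/generate endpoint.
# Each flushed line is one JSON object: output, prompt_tokens,
# output_tokens, finish_reason. The sampling keys below are assumptions
# based on vLLM's SamplingParams.
import json
import httpx

url = "http://localhost:8000/api/v1/llm/generate"
payload = {"prompt": "Explain gradient descent in two sentences.",
           "max_tokens": 128, "temperature": 0.7}

with httpx.stream("POST", url, json=payload, timeout=None) as response:
    for line in response.iter_lines():
        if line.strip():
            chunk = json.loads(line)
            print(chunk["output"], end="", flush=True)
```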
6 changes: 1 addition & 5 deletions ml_service/ml/llm/prompt_format.py
@@ -49,6 +49,7 @@ class PromptFormat(BaseModel):
strip_whitespace: bool = True
trailing_assistant: str
accept_sys_from_req: bool = False
recursive_sys_prompt: bool = True

@validator("assistant")
def check_assistant(cls, value):
@@ -96,8 +97,6 @@ def generate_prompt(self, messages: List[Message]) -> str:

messages.insert(0, Message(role='system', content=self.system))

print(messages)

if all(message.role == "system" for message in messages):
raise ValueError("Only System messages are not allowed")

@@ -117,8 +116,6 @@ def generate_prompt(self, messages: List[Message]) -> str:
instruction=messages[1].content
)
)

print(prompt, messages)

for message in messages[2:]:
message_content = message.content
@@ -135,7 +132,6 @@ def generate_prompt(self, messages: List[Message]) -> str:
prompt.append(self.assistant.format(instruction=message_content))

prompt.append(self.trailing_assistant)
print(prompt)
return "".join(prompt)


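With the debug prints gone, `generate_prompt` is the one place where the per-model YAML templates are turned into a prompt string. A hedged usage sketch (the YAML path and the `Message` import location are assumptions made for illustration):

```python
# Hedged usage sketch for PromptFormat.generate_prompt.
import yaml
from ml.llm.prompt_format import Message, PromptFormat  # import path assumed

with open("config/model/mistral-7b.yaml") as f:
    fmt = PromptFormat(**yaml.safe_load(f)["prompt_format"])

messages = [Message(role="user", content="Summarise the bias-variance trade-off.")]
prompt = fmt.generate_prompt(messages)  # the system message is injected internally
print(prompt)
```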
47 changes: 0 additions & 47 deletions ml_service/stt_serve.py

This file was deleted.

ml_service/test/llm_client_http.py
@@ -6,8 +6,12 @@
import traceback
import click
import httpx

from prompt_toolkit import PromptSession
from prompt_toolkit.history import FileHistory
from rich.console import Console
from rich.text import Text
from rich.panel import Panel


httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)
@@ -72,23 +76,35 @@ def main(
logging.basicConfig(level=logging.INFO)

console = Console()

url = f"http://{host}:{port}/api/v1/llm/generate"
print("""Hi, This is MegAcad, your AI Educational Tutor
You can type the prompts (or) messages.
Please be polite. Remember AI can make mistake.""")
welcome_message = Text("""Hello, this is MegAcad, your AI Educational Tutor
You can type the prompts or messages
Please be polite towards me & Remember, I can make mistakes too """)
welcome_message = Panel(welcome_message, title="MegAcad AI") # Use 'solid' style

console.print(welcome_message)
messages = []

session = PromptSession(history=FileHistory(".history"))

i = 0
# console.begin_capture()
while i <= 30:
try:
user_message = console.input(">>> ")
user_message = session.prompt(">>> ")
if user_message == "\\q":
print("Session Exited")
break
if user_message == "\\n":
print("New Session")
elif user_message == "\\n":
messages = []
console.clear()
console.print("New Session")
console.print(welcome_message)
continue
elif user_message.strip() == "":
continue
elif user_message == "\\c":
console.clear()
continue

if is_prompt:
@@ -123,6 +139,8 @@ def main(
traceback.print_exc()

i += 1
# console.end_capture()


if __name__ == "__main__":
main()