Merge branch 'main' into tinyra-refactor
gagb committed Feb 19, 2024
2 parents df5301d + d34bd7e commit abad9ff
Showing 74 changed files with 2,558 additions and 1,040 deletions.
4 changes: 2 additions & 2 deletions OAI_CONFIG_LIST_sample
@@ -12,13 +12,13 @@
     "api_key": "<your Azure OpenAI API key here>",
     "base_url": "<your Azure OpenAI API base here>",
     "api_type": "azure",
-    "api_version": "2023-07-01-preview"
+    "api_version": "2024-02-15-preview"
 },
 {
     "model": "<your Azure OpenAI deployment name>",
     "api_key": "<your Azure OpenAI API key here>",
     "base_url": "<your Azure OpenAI API base here>",
     "api_type": "azure",
-    "api_version": "2023-07-01-preview"
+    "api_version": "2024-02-15-preview"
 }
 ]
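
For context, a config list like this sample is typically consumed with `autogen.config_list_from_json`; a minimal sketch (the file name and the filter values are illustrative):

```python
import autogen

# "OAI_CONFIG_LIST" is a copy of the sample above with real values filled in.
config_list = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST",
    filter_dict={"api_type": ["azure"]},  # keep only the Azure entries
)

# The bumped api_version ("2024-02-15-preview") is picked up from the file.
assistant = autogen.AssistantAgent("assistant", llm_config={"config_list": config_list})
```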
4 changes: 3 additions & 1 deletion README.md
@@ -13,6 +13,8 @@
   <br>
 </p> -->
 
+:fire: Jan 30: AutoGen is highlighted by Peter Lee in Microsoft Research Forum [Keynote](https://t.co/nUBSjPDjqD).
+
 :warning: Jan 23: **Breaking Change in Latest Release v0.2.8** `use_docker` defaults to `True` for code-execution. See [blog post](https://microsoft.github.io/autogen/blog/2024/01/23/Code-execution-in-docker) for details and [FAQ](https://microsoft.github.io/autogen/docs/FAQ#agents-are-throwing-due-to-docker-not-running-how-can-i-resolve-this) for troubleshooting any issues.
 
 :fire: Dec 31: [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155) is selected by [TheSequence: My Five Favorite AI Papers of 2023](https://thesequence.substack.com/p/my-five-favorite-ai-papers-of-2023).
@@ -23,7 +25,7 @@
 
 :fire: Nov 8: AutoGen is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html) 35 days after spinoff.
 
-:fire: Nov 6: AutoGen is mentioned by Satya Nadella in a [fireside chat](https://youtu.be/0pLBvgYtv6U) around 13:20.
+:fire: Nov 6: AutoGen is mentioned by Satya Nadella in a [fireside chat](https://youtu.be/0pLBvgYtv6U).
 
 :fire: Nov 1: AutoGen is the top trending repo on GitHub in October 2023.
1 change: 1 addition & 0 deletions autogen/__init__.py
@@ -2,6 +2,7 @@
 from .version import __version__
 from .oai import *
 from .agentchat import *
+from .exception_utils import *
 from .code_utils import DEFAULT_MODEL, FAST_MODEL
54 changes: 30 additions & 24 deletions autogen/agentchat/chat.py
@@ -34,31 +34,37 @@ def initiate_chats(chat_queue: List[Dict[str, Any]]) -> List[ChatResult]:
     args:
         chat_queue (List[Dict]): a list of dictionaries containing the information of the chats.
-            Each dictionary should contain the following fields:
-            recipient: the recipient agent.
-            - "context": any context information, e.g., the request message. The following fields are reserved:
-                "message" needs to be provided if the `generate_init_message` method is not overridden.
-                    Otherwise, input() will be called to get the initial message.
-                "summary_method": a string or callable specifying the method to get a summary from the chat. Default is DEFAULT_summary_method, i.e., "last_msg".
-                - Supported string are "last_msg" and "reflection_with_llm":
-                    when set "last_msg", it returns the last message of the dialog as the summary.
-                    when set "reflection_with_llm", it returns a summary extracted using an llm client.
-                        `llm_config` must be set in either the recipient or sender.
-                        "reflection_with_llm" requires the llm_config to be set in either the sender or the recipient.
-                - A callable summary_method should take the recipient and sender agent in a chat as input and return a string of summary. E.g,
-                ```python
-                def my_summary_method(
-                    sender: ConversableAgent,
-                    recipient: ConversableAgent,
-                ):
-                    return recipient.last_message(sender)["content"]
-                ```
-                "summary_prompt" can be used to specify the prompt used to extract a summary when summary_method is "reflection_with_llm".
-                    Default is None and the following default prompt will be used when "summary_method" is set to "reflection_with_llm":
-                    "Identify and extract the final solution to the originally asked question based on the conversation."
-                "carryover" can be used to specify the carryover information to be passed to this chat.
-                    If provided, we will combine this carryover with the "message" content when generating the initial chat
-                    message in `generate_init_message`.
+            Each dictionary should contain the input arguments for `ConversableAgent.initiate_chat`.
+            More specifically, each dictionary could include the following fields:
+            - "sender": the sender agent.
+            - "recipient": the recipient agent.
+            - clear_history (bool): whether to clear the chat history with the agent. Default is True.
+            - silent (bool or None): (Experimental) whether to print the messages for this conversation. Default is False.
+            - cache (Cache or None): the cache client to be used for this conversation. Default is None.
+            - max_turns (int or None): the maximum number of turns for the chat. If None, the chat will continue until a termination condition is met. Default is None.
+            - "message" needs to be provided if the `generate_init_message` method is not overridden.
+              Otherwise, input() will be called to get the initial message.
+            - "summary_method": a string or callable specifying the method to get a summary from the chat. Default is DEFAULT_summary_method, i.e., "last_msg".
+              - Supported string are "last_msg" and "reflection_with_llm":
+                when set "last_msg", it returns the last message of the dialog as the summary.
+                when set "reflection_with_llm", it returns a summary extracted using an llm client.
+                `llm_config` must be set in either the recipient or sender.
+                "reflection_with_llm" requires the llm_config to be set in either the sender or the recipient.
+              - A callable summary_method should take the recipient and sender agent in a chat as input and return a string of summary. E.g,
+              ```python
+              def my_summary_method(
+                  sender: ConversableAgent,
+                  recipient: ConversableAgent,
+              ):
+                  return recipient.last_message(sender)["content"]
+              ```
+              "summary_prompt" can be used to specify the prompt used to extract a summary when summary_method is "reflection_with_llm".
+              Default is None and the following default prompt will be used when "summary_method" is set to "reflection_with_llm":
+              "Identify and extract the final solution to the originally asked question based on the conversation."
+            - "carryover" can be used to specify the carryover information to be passed to this chat.
+              If provided, we will combine this carryover with the "message" content when generating the initial chat
+              message in `generate_init_message`.
     returns:
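
To make the documented fields concrete, here is a minimal, hypothetical sketch of a two-chat queue. The agent names, task messages, and `llm_config` contents are illustrative, not part of this commit:

```python
from autogen import ConversableAgent
from autogen.agentchat.chat import initiate_chats

# Illustrative config; replace with a real config list.
llm_config = {"config_list": [{"model": "gpt-4", "api_key": "<your key>"}]}

user = ConversableAgent("user", llm_config=False, human_input_mode="NEVER")
researcher = ConversableAgent("researcher", llm_config=llm_config)
writer = ConversableAgent("writer", llm_config=llm_config)

chat_results = initiate_chats(
    [
        {
            "sender": user,
            "recipient": researcher,
            "message": "Collect three facts about solar panels.",
            "summary_method": "reflection_with_llm",  # needs llm_config on sender or recipient
            "max_turns": 2,
        },
        {
            "sender": user,
            "recipient": writer,
            "message": "Write a short paragraph using the research.",
            "summary_method": "last_msg",
            "carryover": "Keep the tone accessible.",  # combined with "message" in generate_init_message
        },
    ]
)
print(chat_results[-1].summary)
```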
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/capabilities/context_handling.py
@@ -45,7 +45,7 @@ def add_to_agent(self, agent: ConversableAgent):
         """
         Adds TransformChatHistory capability to the given agent.
         """
-        agent.register_hook(hookable_method=agent.process_all_messages, hook=self._transform_messages)
+        agent.register_hook(hookable_method="process_all_messages", hook=self._transform_messages)
 
     def _transform_messages(self, messages: List[Dict]) -> List[Dict]:
         """
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/capabilities/teachability.py
@@ -61,7 +61,7 @@ def add_to_agent(self, agent: ConversableAgent):
         self.teachable_agent = agent
 
         # Register a hook for processing the last message.
-        agent.register_hook(hookable_method=agent.process_last_message, hook=self.process_last_message)
+        agent.register_hook(hookable_method="process_last_message", hook=self.process_last_message)
 
         # Was an llm_config passed to the constructor?
         if self.llm_config is None:
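
Both capability fixes above switch `register_hook` to identify the hookable method by its string name rather than by the bound method itself. A hedged sketch of what custom hook registration looks like after this change (the agent and hook function are illustrative):

```python
from typing import Dict, List

from autogen import ConversableAgent


def drop_empty_messages(messages: List[Dict]) -> List[Dict]:
    # Illustrative hook: filter out messages with no content before processing.
    return [m for m in messages if m.get("content")]


agent = ConversableAgent("my_agent", llm_config=False)

# The hookable method is now named by the string key used in the agent's
# hook lists, matching the two diffs above.
agent.register_hook(hookable_method="process_all_messages", hook=drop_empty_messages)
```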
151 changes: 139 additions & 12 deletions autogen/agentchat/contrib/img_utils.py
@@ -1,5 +1,7 @@
 import base64
 import copy
+import mimetypes
+import os
 import re
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -8,17 +10,63 @@
 from PIL import Image
 
 
-def get_image_data(image_file: str, use_b64=True) -> bytes:
+def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
+    """
+    Loads an image from a file and returns a PIL Image object.
+
+    Parameters:
+        image_file (str, or Image): The filename, URL, URI, or base64 string of the image file.
+
+    Returns:
+        Image.Image: The PIL Image object.
+    """
+    if isinstance(image_file, Image.Image):
+        # Already a PIL Image object
+        return image_file
+
     if image_file.startswith("http://") or image_file.startswith("https://"):
+        # A URL file
         response = requests.get(image_file)
-        content = response.content
+        content = BytesIO(response.content)
+        image = Image.open(content)
     elif re.match(r"data:image/(?:png|jpeg);base64,", image_file):
-        return re.sub(r"data:image/(?:png|jpeg);base64,", "", image_file)
+        # A URI. Remove the prefix and decode the base64 string.
+        base64_data = re.sub(r"data:image/(?:png|jpeg);base64,", "", image_file)
+        image = _to_pil(base64_data)
+    elif os.path.exists(image_file):
+        # A local file
+        image = Image.open(image_file)
     else:
-        image = Image.open(image_file).convert("RGB")
-        buffered = BytesIO()
-        image.save(buffered, format="PNG")
-        content = buffered.getvalue()
+        # base64 encoded string
+        image = _to_pil(image_file)
+
+    return image.convert("RGB")
+
+
+def get_image_data(image_file: Union[str, Image.Image], use_b64=True) -> bytes:
+    """
+    Loads an image and returns its data either as raw bytes or in base64-encoded format.
+
+    This function first loads an image from the specified file, URL, or base64 string using
+    the `get_pil_image` function. It then saves this image in memory in PNG format and
+    retrieves its binary content. Depending on the `use_b64` flag, this binary content is
+    either returned directly or as a base64-encoded string.
+
+    Parameters:
+        image_file (str, or Image): The path to the image file, a URL to an image, or a base64-encoded
+            string of the image.
+        use_b64 (bool): If True, the function returns a base64-encoded string of the image data.
+            If False, it returns the raw byte data of the image. Defaults to True.
+
+    Returns:
+        bytes: The image data in raw bytes if `use_b64` is False, or a base64-encoded string
+            if `use_b64` is True.
+    """
+    image = get_pil_image(image_file)
+
+    buffered = BytesIO()
+    image.save(buffered, format="PNG")
+    content = buffered.getvalue()
 
     if use_b64:
         return base64.b64encode(content).decode("utf-8")
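
Taken together, `get_pil_image` now normalizes every supported input form into an RGB PIL image, and `get_image_data` delegates to it. A short usage sketch (the file path and URL are illustrative):

```python
from autogen.agentchat.contrib.img_utils import get_image_data, get_pil_image

# Each of these normalizes to an RGB PIL.Image.Image:
img_from_path = get_pil_image("dog.png")                     # hypothetical local file
img_from_url = get_pil_image("https://example.com/dog.png")  # fetched with requests
img_passthrough = get_pil_image(img_from_path)               # PIL objects pass through

# get_image_data builds on get_pil_image, re-encoding the image as PNG.
b64_string = get_image_data("dog.png")                # base64 string (default)
raw_bytes = get_image_data("dog.png", use_b64=False)  # raw PNG bytes
```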
@@ -72,6 +120,22 @@ def llava_formatter(prompt: str, order_image_tokens: bool = False) -> Tuple[str,
     return new_prompt, images
 
 
+def pil_to_data_uri(image: Image.Image) -> str:
+    """
+    Converts a PIL Image object to a data URI.
+
+    Parameters:
+        image (Image.Image): The PIL Image object.
+
+    Returns:
+        str: The data URI string.
+    """
+    buffered = BytesIO()
+    image.save(buffered, format="PNG")
+    content = buffered.getvalue()
+    return convert_base64_to_data_uri(base64.b64encode(content).decode("utf-8"))
+
+
 def convert_base64_to_data_uri(base64_image):
     def _get_mime_type_from_data_uri(base64_image):
         # Decode the base64 string
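
The new `pil_to_data_uri` is the PIL-object counterpart of `convert_base64_to_data_uri`. A small sketch of the round trip (the image contents are illustrative):

```python
from PIL import Image

from autogen.agentchat.contrib.img_utils import pil_to_data_uri

# A tiny in-memory image, just to have a PIL object in hand.
image = Image.new("RGB", (8, 8), color=(255, 0, 0))

uri = pil_to_data_uri(image)
print(uri[:30])  # "data:image/png;base64,iVBORw0K"
```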
@@ -92,16 +156,19 @@ def _get_mime_type_from_data_uri(base64_image):
     return data_uri
 
 
-def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
+def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dict]]:
     """
     Formats the input prompt by replacing image tags and returns a list of text and images.
 
-    Parameters:
+    Args:
         - prompt (str): The input string that may contain image tags like <img ...>.
+        - img_format (str): what image format should be used. One of "uri", "url", "pil".
 
     Returns:
         - List[Union[str, dict]]: A list of alternating text and image dictionary items.
     """
+    assert img_format in ["uri", "url", "pil"]
+
     output = []
     last_index = 0
     image_count = 0
@@ -114,7 +181,15 @@ def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
         image_location = match.group(1)
 
         try:
-            img_data = get_image_data(image_location)
+            if img_format == "pil":
+                img_data = get_pil_image(image_location)
+            elif img_format == "uri":
+                img_data = get_image_data(image_location)
+                img_data = convert_base64_to_data_uri(img_data)
+            elif img_format == "url":
+                img_data = image_location
+            else:
+                raise ValueError(f"Unknown image format {img_format}")
         except Exception as e:
             # Warning and skip this token
             print(f"Warning! Unable to load image from {image_location}, because {e}")
@@ -124,7 +199,7 @@ def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
             output.append({"type": "text", "text": prompt[last_index : match.start()]})
 
         # Add image data to output list
-        output.append({"type": "image_url", "image_url": {"url": convert_base64_to_data_uri(img_data)}})
+        output.append({"type": "image_url", "image_url": {"url": img_data}})
 
         last_index = match.end()
         image_count += 1
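
The new `img_format` parameter controls what ends up in each `image_url` entry. A hedged sketch of the three modes, assuming the `<img path>` tag syntax used elsewhere in this module (the prompt and image path are illustrative):

```python
from autogen.agentchat.contrib.img_utils import gpt4v_formatter

prompt = "What breed is this? <img dog.png>"  # "dog.png" is a hypothetical local file

content_uri = gpt4v_formatter(prompt)                    # default "uri": base64 data URI
content_url = gpt4v_formatter(prompt, img_format="url")  # pass the location string through
content_pil = gpt4v_formatter(prompt, img_format="pil")  # load as a PIL.Image.Image

# Each call returns alternating text/image items, e.g.:
# [{'type': 'text', 'text': 'What breed is this? '},
#  {'type': 'image_url', 'image_url': {'url': ...}}]
```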
@@ -162,9 +237,61 @@ def _to_pil(data: str) -> Image.Image:
     and finally creates and returns a PIL Image object from the BytesIO object.
 
     Parameters:
-        data (str): The base64 encoded image data string.
+        data (str): The encoded image data string.
 
     Returns:
         Image.Image: The PIL Image object created from the input data.
     """
     return Image.open(BytesIO(base64.b64decode(data)))
+
+
+def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
+    """
+    Converts the PIL image URLs in the messages to base64 encoded data URIs.
+
+    This function iterates over a list of message dictionaries. For each message,
+    if it contains a 'content' key with a list of items, it looks for items
+    with an 'image_url' key. The function then converts the PIL image URL
+    (pointed to by 'image_url') to a base64 encoded data URI.
+
+    Parameters:
+        messages (List[Dict]): A list of message dictionaries. Each dictionary
+            may contain a 'content' key with a list of items,
+            some of which might be image URLs.
+
+    Returns:
+        List[Dict]: A new list of message dictionaries with PIL image URLs in the
+            'image_url' key converted to base64 encoded data URIs.
+
+    Example Input:
+        [
+            {'content': [{'type': 'text', 'text': 'You are a helpful AI assistant.'}], 'role': 'system'},
+            {'content': [
+                {'type': 'text', 'text': "What's the breed of this dog here? \n"},
+                {'type': 'image_url', 'image_url': {'url': a PIL.Image.Image}},
+                {'type': 'text', 'text': '.'}],
+            'role': 'user'}
+        ]
+
+    Example Output:
+        [
+            {'content': [{'type': 'text', 'text': 'You are a helpful AI assistant.'}], 'role': 'system'},
+            {'content': [
+                {'type': 'text', 'text': "What's the breed of this dog here? \n"},
+                {'type': 'image_url', 'image_url': {'url': a B64 Image}},
+                {'type': 'text', 'text': '.'}],
+            'role': 'user'}
+        ]
+    """
+    new_messages = []
+    for message in messages:
+        # Handle the new GPT messages format.
+        if isinstance(message, dict) and "content" in message and isinstance(message["content"], list):
+            message = copy.deepcopy(message)
+            for item in message["content"]:
+                if isinstance(item, dict) and "image_url" in item:
+                    item["image_url"]["url"] = pil_to_data_uri(item["image_url"]["url"])
+
+        new_messages.append(message)
+
+    return new_messages
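
A short sketch of applying `message_formatter_pil_to_b64` just before handing messages to an API client that cannot accept PIL objects (the message content is illustrative):

```python
from PIL import Image

from autogen.agentchat.contrib.img_utils import message_formatter_pil_to_b64

pil_img = Image.new("RGB", (8, 8))  # stand-in for a real image

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": pil_img}},
        ],
    },
]

# PIL objects are replaced with "data:image/png;base64,..." URIs; matching
# messages are deep-copied first, so the originals are not mutated.
api_ready = message_formatter_pil_to_b64(messages)
```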
4 changes: 3 additions & 1 deletion autogen/agentchat/contrib/llava_agent.py
@@ -77,7 +77,9 @@ def _image_reply(self, messages=None, sender=None, config=None):
             content_prompt = content_str(msg["content"])
             prompt += f"{SEP}{role}: {content_prompt}\n"
         prompt += "\n" + SEP + "Assistant: "
-        images = [re.sub("data:image/.+;base64,", "", im, count=1) for im in images]
+
+        # TODO: PIL to base64
+        images = [get_image_data(im) for im in images]
         print(colored(prompt, "blue"))
 
         out = ""