Merge branch 'microsoft:main' into huggingface_agent

microsoft · May 24, 2024 · 89aebb8 · 89aebb8
2 parents c158916 + 702c010
commit 89aebb8
Show file tree

Hide file tree

Showing 210 changed files with 13,832 additions and 8,361 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -88,7 +88,9 @@ jobs:
           fi
       - name: Test with pytest skipping openai tests
         if: matrix.python-version != '3.10' && matrix.os == 'ubuntu-latest'
+        # Remove the "requests<2.32.0" pin in the run step below once https://github.com/docker/docker-py/issues/3256 is resolved
         run: |
+          pip install "requests<2.32.0"
           pytest test --ignore=test/agentchat/contrib --skip-openai --durations=10 --durations-min=1.0
       - name: Test with pytest skipping openai and docker tests
         if: matrix.python-version != '3.10' && matrix.os != 'ubuntu-latest'

diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml
@@ -74,7 +74,43 @@ jobs:
         with:
           file: ./coverage.xml
           flags: unittests
-
+  # Runs the AgentEval test module with the OpenAI/Azure secrets of the `openai1`
+  # environment and uploads the resulting coverage report to Codecov.
+  AgentEvalTest:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.10"]
+    runs-on: ${{ matrix.os }}
+    environment: openai1
+    steps:
+      # checkout to pr branch
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install packages and dependencies
+        run: |
+          docker --version
+          python -m pip install --upgrade pip wheel
+          pip install -e .
+          python -c "import autogen"
+          # Quote the specifier: unquoted, the shell treats ">=5" as a redirect to a file named "=5".
+          pip install "pytest-cov>=5" pytest-asyncio
+      - name: Coverage
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
+          AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
+          OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }}
+        run: |
+          # Emit ./coverage.xml so the Codecov upload step below has a report to send.
+          pytest test/agentchat/contrib/agent_eval/test_agent_eval.py --cov=autogen --cov-report=xml
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ./coverage.xml
+          flags: unittests
   CompressionTest:
     strategy:
       matrix:

diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml
@@ -107,7 +107,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y tesseract-ocr poppler-utils
-          pip install unstructured[all-docs]==0.13.0
+          pip install --no-cache-dir unstructured[all-docs]==0.13.0
       - name: Install packages and dependencies for RetrieveChat
         run: |
           pip install -e .[retrievechat]
@@ -125,6 +125,35 @@ jobs:
           file: ./coverage.xml
           flags: unittests
 
+  # Runs the AgentEval tests with OpenAI-dependent tests skipped and uploads the
+  # resulting coverage report to Codecov.
+  AgentEvalTest:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.10"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install packages and dependencies for all tests
+        run: |
+          python -m pip install --upgrade pip wheel
+          # Quote the specifier: unquoted, the shell treats ">=5" as a redirect to a file named "=5".
+          pip install "pytest-cov>=5"
+      - name: Install packages and dependencies for AgentEval
+        run: |
+          pip install -e .
+      - name: Coverage
+        run: |
+          # Emit ./coverage.xml so the Codecov upload step below has a report to send.
+          pytest test/agentchat/contrib/agent_eval/ --skip-openai --cov=autogen --cov-report=xml
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ./coverage.xml
+          flags: unittests
+
   CompressionTest:
     runs-on: ${{ matrix.os }}
     strategy:

diff --git a/README.md b/README.md
@@ -7,34 +7,41 @@
 [![Discord](https://img.shields.io/discord/1153072414184452236?logo=discord&style=flat)](https://aka.ms/autogen-dc)
 [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow%20%40pyautogen)](https://twitter.com/pyautogen)
 
+[![NuGet version](https://badge.fury.io/nu/AutoGen.Core.svg)](https://badge.fury.io/nu/AutoGen.Core)
 
 # AutoGen
 [📚 Cite paper](#related-papers).
 <!-- <p align="center">
     <img src="https://github.com/microsoft/autogen/blob/main/website/static/img/flaml.svg"  width=200>
     <br>
 </p> -->
+:fire: May 13, 2024: [The Economist](https://www.economist.com/science-and-technology/2024/05/13/todays-ai-models-are-impressive-teams-of-them-will-be-formidable) published an article about multi-agent systems (MAS) following a January 2024 interview with [Chi Wang](https://github.com/sonichi).
+
+:fire: May 11, 2024: [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation](https://openreview.net/pdf?id=uAjxFFing2) received the best paper award in [ICLR 2024 LLM Agents Workshop](https://llmagents.github.io/).
+
+:fire: Apr 26, 2024: [AutoGen.NET](https://microsoft.github.io/autogen-for-net/) is available for .NET developers!
+
 :fire: Apr 17, 2024: Andrew Ng cited AutoGen in [The Batch newsletter](https://www.deeplearning.ai/the-batch/issue-245/) and [What's next for AI agentic workflows](https://youtu.be/sal78ACtGTc?si=JduUzN_1kDnMq0vF) at Sequoia Capital's AI Ascent (Mar 26).
 
 :fire: Mar 3, 2024: What's new in AutoGen? 📰[Blog](https://microsoft.github.io/autogen/blog/2024/03/03/AutoGen-Update); 📺[Youtube](https://www.youtube.com/watch?v=j_mtwQiaLGU).
 
 :fire: Mar 1, 2024: the first AutoGen multi-agent experiment on the challenging [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) benchmark achieved the No. 1 accuracy in all the three levels.
 
-:tada: Jan 30, 2024: AutoGen is highlighted by Peter Lee in Microsoft Research Forum [Keynote](https://t.co/nUBSjPDjqD).
+<!-- :tada: Jan 30, 2024: AutoGen is highlighted by Peter Lee in Microsoft Research Forum [Keynote](https://t.co/nUBSjPDjqD). -->
 
 :tada: Dec 31, 2023: [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155) is selected by [TheSequence: My Five Favorite AI Papers of 2023](https://thesequence.substack.com/p/my-five-favorite-ai-papers-of-2023).
 
 <!-- :fire: Nov 24: pyautogen [v0.2](https://github.com/microsoft/autogen/releases/tag/v0.2.0) is released with many updates and new features compared to v0.1.1. It switches to using openai-python v1. Please read the [migration guide](https://microsoft.github.io/autogen/docs/Installation#python). -->
 
 <!-- :fire: Nov 11: OpenAI's Assistants are available in AutoGen and interoperable with other AutoGen agents! Check out our [blogpost](https://microsoft.github.io/autogen/blog/2023/11/13/OAI-assistants) for details and examples. -->
 
-:tada: Nov 8, 2023: AutoGen is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html) 35 days after spinoff.
+:tada: Nov 8, 2023: AutoGen is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html) 35 days after spinoff from [FLAML](https://github.com/microsoft/FLAML).
 
-:tada: Nov 6, 2023: AutoGen is mentioned by Satya Nadella in a [fireside chat](https://youtu.be/0pLBvgYtv6U).
+<!-- :tada: Nov 6, 2023: AutoGen is mentioned by Satya Nadella in a [fireside chat](https://youtu.be/0pLBvgYtv6U). -->
 
-:tada: Nov 1, 2023: AutoGen is the top trending repo on GitHub in October 2023.
+<!-- :tada: Nov 1, 2023: AutoGen is the top trending repo on GitHub in October 2023. -->
 
-:tada: Oct 03, 2023: AutoGen spins off from FLAML on GitHub and has a major paper update (first version on Aug 16).
+<!-- :tada: Oct 03, 2023: AutoGen spins off from [FLAML](https://github.com/microsoft/FLAML) on GitHub. -->
 
 <!-- :tada: Aug 16: Paper about AutoGen on [arxiv](https://arxiv.org/abs/2308.08155). -->
 

diff --git a/autogen/agentchat/chat.py b/autogen/agentchat/chat.py
@@ -195,7 +195,9 @@ def initiate_chats(chat_queue: List[Dict[str, Any]]) -> List[ChatResult]:
             r.summary for i, r in enumerate(finished_chats) if i not in finished_chat_indexes_to_exclude_from_carryover
         ]
 
-        __post_carryover_processing(chat_info)
+        if not chat_info.get("silent", False):
+            __post_carryover_processing(chat_info)
+
         sender = chat_info["sender"]
         chat_res = sender.initiate_chat(**chat_info)
         finished_chats.append(chat_res)
@@ -236,7 +238,10 @@ async def _dependent_chat_future(
     if isinstance(_chat_carryover, str):
         _chat_carryover = [_chat_carryover]
     chat_info["carryover"] = _chat_carryover + [finished_chats[pre_id].summary for pre_id in finished_chats]
-    __post_carryover_processing(chat_info)
+
+    if not chat_info.get("silent", False):
+        __post_carryover_processing(chat_info)
+
     sender = chat_info["sender"]
     chat_res_future = asyncio.create_task(sender.a_initiate_chat(**chat_info))
     call_back_with_args = partial(_on_chat_future_done, chat_id=chat_id)

diff --git a/autogen/agentchat/contrib/agent_eval/README.md b/autogen/agentchat/contrib/agent_eval/README.md
@@ -0,0 +1,7 @@
+Agents for running the AgentEval pipeline.
+
+AgentEval is a process for evaluating an LLM-based system's performance on a given task.
+
+When given a task to evaluate and a few example runs, the critic and subcritic agents create evaluation criteria for evaluating a system's solution. Once the criteria have been created, the quantifier agent can evaluate subsequent task solutions based on the generated criteria.
+
+For more information see: [AgentEval Integration Roadmap](https://github.com/microsoft/autogen/issues/2162)
diff --git a/autogen/agentchat/contrib/agent_eval/agent_eval.py b/autogen/agentchat/contrib/agent_eval/agent_eval.py
@@ -0,0 +1,101 @@
+from typing import Dict, List, Literal, Optional, Union
+
+import autogen
+from autogen.agentchat.contrib.agent_eval.criterion import Criterion
+from autogen.agentchat.contrib.agent_eval.critic_agent import CriticAgent
+from autogen.agentchat.contrib.agent_eval.quantifier_agent import QuantifierAgent
+from autogen.agentchat.contrib.agent_eval.subcritic_agent import SubCriticAgent
+from autogen.agentchat.contrib.agent_eval.task import Task
+
+
+def generate_criteria(
+    llm_config: Optional[Union[Dict, Literal[False]]] = None,
+    task: Task = None,
+    additional_instructions: str = "",
+    max_round: int = 2,
+    use_subcritic: bool = False,
+):
+    """
+    Creates a list of criteria for evaluating the utility of a given task.
+
+    A user proxy, a critic agent and (optionally) a subcritic agent are placed in a
+    round-robin group chat seeded with the task's system message. The content of
+    `critic_user.last_message()` is then trimmed to its outermost `[...]` span and
+    parsed into Criterion objects.
+    Args:
+        llm_config (dict or bool): llm inference configuration.
+        task (Task): The task to evaluate. Required, despite the `None` default.
+        additional_instructions (str): Additional instructions for the criteria agent.
+        max_round (int): The maximum number of rounds to run the conversation.
+        use_subcritic (bool): Whether to use the subcritic agent to generate subcriteria.
+    Returns:
+        list: A list of Criterion objects for evaluating the utility of the given task.
+    Raises:
+        ValueError: If `task` is None.
+    """
+    # Fail fast with a clear message instead of an AttributeError on `None` after
+    # all of the agents have already been constructed.
+    if task is None:
+        raise ValueError("A task must be provided to generate criteria.")
+
+    critic = CriticAgent(
+        system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\n" + additional_instructions,
+        llm_config=llm_config,
+    )
+
+    critic_user = autogen.UserProxyAgent(
+        name="critic_user",
+        max_consecutive_auto_reply=0,  # terminate without auto-reply
+        human_input_mode="NEVER",
+        code_execution_config={"use_docker": False},
+    )
+
+    agents = [critic_user, critic]
+
+    if use_subcritic:
+        subcritic = SubCriticAgent(
+            llm_config=llm_config,
+        )
+        agents.append(subcritic)
+
+    groupchat = autogen.GroupChat(
+        agents=agents, messages=[], max_round=max_round, speaker_selection_method="round_robin"
+    )
+    critic_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)
+
+    critic_user.initiate_chat(critic_manager, message=task.get_sys_message())
+    content = critic_user.last_message()["content"]
+    # need to strip out any extra code around the returned json
+    content = content[content.find("[") : content.rfind("]") + 1]
+    return Criterion.parse_json_str(content)
+
+
+def quantify_criteria(
+    llm_config: Optional[Union[Dict, Literal[False]]] = None,
+    criteria: List[Criterion] = None,
+    task: Task = None,
+    test_case: str = "",
+    ground_truth: str = "",
+):
+    """
+    Quantifies the performance of a system using the provided criteria.
+
+    A user proxy sends the quantifier agent one message built from the task's system
+    message, the JSON-serialized criteria and the test case, then reads back the last
+    message of that chat.
+    Args:
+        llm_config (dict or bool): llm inference configuration.
+        criteria ([Criterion]): A list of criteria for evaluating the utility of a given task.
+            Required, despite the `None` default.
+        task (Task): The task to evaluate. Required, despite the `None` default.
+        test_case (str): The test case to evaluate.
+        ground_truth (str): The ground truth for the test case.
+    Returns:
+        dict: A dictionary with two keys: "actual_success" holds `ground_truth` unchanged and
+            "estimated_performance" holds the content of the last message of the quantifier chat.
+    Raises:
+        ValueError: If `task` or `criteria` is None.
+    """
+    # Both arguments are dereferenced below; reject None up front with a clear message.
+    if task is None:
+        raise ValueError("A task must be provided to quantify criteria.")
+    if criteria is None:
+        raise ValueError("A list of criteria must be provided to quantify criteria.")
+
+    quantifier = QuantifierAgent(
+        llm_config=llm_config,
+    )
+
+    quantifier_user = autogen.UserProxyAgent(
+        name="quantifier_user",
+        max_consecutive_auto_reply=0,  # terminate without auto-reply
+        human_input_mode="NEVER",
+        code_execution_config={"use_docker": False},
+    )
+
+    quantifier_user.initiate_chat(
+        quantifier,
+        message=task.get_sys_message()
+        + "Evaluation dictionary: "
+        + Criterion.write_json(criteria)
+        + "actual test case to evaluate: "
+        + test_case,
+    )
+    quantified_results = quantifier_user.last_message()
+    return {"actual_success": ground_truth, "estimated_performance": quantified_results["content"]}
diff --git a/autogen/agentchat/contrib/agent_eval/criterion.py b/autogen/agentchat/contrib/agent_eval/criterion.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import json
+from typing import List
+
+import pydantic_core
+from pydantic import BaseModel
+from pydantic.json import pydantic_encoder
+
+
+class Criterion(BaseModel):
+    """
+    Pydantic model for one agent-evaluation criterion, optionally nesting further criteria.
+    """
+
+    name: str
+    description: str
+    accepted_values: List[str]
+    sub_criteria: List[Criterion] = []
+
+    @staticmethod
+    def parse_json_str(criteria: str):
+        """
+        Build Criterion objects from a json string.
+        Args:
+            criteria (str): Json string holding a list of criterion dictionaries.
+        returns:
+            [Criterion]: One Criterion per entry of the decoded json list.
+        """
+        parsed = []
+        for entry in json.loads(criteria):
+            parsed.append(Criterion(**entry))
+        return parsed
+
+    @staticmethod
+    def write_json(criteria):
+        """
+        Serialize a list of Criterion objects to a json string.
+        Args:
+            criteria ([Criterion]): The Criterion objects to serialize.
+        Returns:
+            str: The json text (indented by 2) of each criterion's `model_dump()` output.
+        """
+        dumped = list(map(lambda item: item.model_dump(), criteria))
+        return json.dumps(dumped, indent=2)
diff --git a/autogen/agentchat/contrib/agent_eval/critic_agent.py b/autogen/agentchat/contrib/agent_eval/critic_agent.py
@@ -0,0 +1,41 @@
+from typing import Optional
+
+from autogen.agentchat.conversable_agent import ConversableAgent
+
+
+class CriticAgent(ConversableAgent):
+    """
+    ConversableAgent preconfigured with a system message that asks for a list of
+    criteria for evaluating the utility of a given task.
+    """
+
+    DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
+    Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows
+    {"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key}
+    Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
+    Output just the criteria string you have created, no code.
+    """
+
+    DEFAULT_DESCRIPTION = "An AI agent for creating list criteria for evaluating the utility of a given task."
+
+    def __init__(
+        self,
+        name="critic",
+        system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
+        description: Optional[str] = DEFAULT_DESCRIPTION,
+        **kwargs,
+    ):
+        """
+        Forward the critic's defaults, plus any extra options, to ConversableAgent.
+
+        Args:
+            name (str): agent name.
+            system_message (str): system message for the ChatCompletion inference.
+                Override it to reprogram the agent.
+            description (str): The description of the agent.
+            **kwargs (dict): Passed through unchanged; see the other kwargs of
+                [ConversableAgent](../../conversable_agent#__init__).
+        """
+        own_settings = dict(name=name, system_message=system_message, description=description)
+        super().__init__(**own_settings, **kwargs)