diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d06999db34c..a92044f15b7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -88,7 +88,9 @@ jobs: fi - name: Test with pytest skipping openai tests if: matrix.python-version != '3.10' && matrix.os == 'ubuntu-latest' + # Remove the line below once https://github.com/docker/docker-py/issues/3256 is merged run: | + pip install "requests<2.32.0" pytest test --ignore=test/agentchat/contrib --skip-openai --durations=10 --durations-min=1.0 - name: Test with pytest skipping openai and docker tests if: matrix.python-version != '3.10' && matrix.os != 'ubuntu-latest' diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index 1bf71115d6b..b1b3e35e478 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -74,7 +74,43 @@ jobs: with: file: ./coverage.xml flags: unittests - + AgentEvalTest: + strategy: + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + runs-on: ${{ matrix.os }} + environment: openai1 + steps: + # checkout to pr branch + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install packages and dependencies + run: | + docker --version + python -m pip install --upgrade pip wheel + pip install -e . + python -c "import autogen" + pip install pytest-cov>=5 pytest-asyncio + - name: Coverage + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} + OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} + run: | + pytest test/agentchat/contrib/agent_eval/test_agent_eval.py + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + flags: unittests CompressionTest: strategy: matrix: diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index f8dd1d46186..38fab877402 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -107,7 +107,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y tesseract-ocr poppler-utils - pip install unstructured[all-docs]==0.13.0 + pip install --no-cache-dir unstructured[all-docs]==0.13.0 - name: Install packages and dependencies for RetrieveChat run: | pip install -e .[retrievechat] @@ -125,6 +125,35 @@ jobs: file: ./coverage.xml flags: unittests + AgentEvalTest: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install packages and dependencies for all tests + run: | + python -m pip install --upgrade pip wheel + pip install pytest-cov>=5 + - name: Install packages and dependencies for AgentEval + run: | + pip install -e . + - name: Coverage + run: | + pytest test/agentchat/contrib/agent_eval/ --skip-openai + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + flags: unittests + CompressionTest: runs-on: ${{ matrix.os }} strategy: diff --git a/README.md b/README.md index fabbff99b63..e78d4b91aad 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![Discord](https://img.shields.io/discord/1153072414184452236?logo=discord&style=flat)](https://aka.ms/autogen-dc) [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow%20%40pyautogen)](https://twitter.com/pyautogen) +[![NuGet version](https://badge.fury.io/nu/AutoGen.Core.svg)](https://badge.fury.io/nu/AutoGen.Core) # AutoGen [📚 Cite paper](#related-papers). @@ -14,13 +15,19 @@

--> +:fire: May 13, 2024: [The Economist](https://www.economist.com/science-and-technology/2024/05/13/todays-ai-models-are-impressive-teams-of-them-will-be-formidable) published an article about multi-agent systems (MAS) following a January 2024 interview with [Chi Wang](https://github.com/sonichi). + +:fire: May 11, 2024: [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation](https://openreview.net/pdf?id=uAjxFFing2) received the best paper award in [ICLR 2024 LLM Agents Workshop](https://llmagents.github.io/). + +:fire: Apr 26, 2024: [AutoGen.NET](https://microsoft.github.io/autogen-for-net/) is available for .NET developers! + :fire: Apr 17, 2024: Andrew Ng cited AutoGen in [The Batch newsletter](https://www.deeplearning.ai/the-batch/issue-245/) and [What's next for AI agentic workflows](https://youtu.be/sal78ACtGTc?si=JduUzN_1kDnMq0vF) at Sequoia Capital's AI Ascent (Mar 26). :fire: Mar 3, 2024: What's new in AutoGen? 📰[Blog](https://microsoft.github.io/autogen/blog/2024/03/03/AutoGen-Update); 📺[Youtube](https://www.youtube.com/watch?v=j_mtwQiaLGU). :fire: Mar 1, 2024: the first AutoGen multi-agent experiment on the challenging [GAIA](https://huggingface.co/spaces/gaia-benchmark/leaderboard) benchmark achieved the No. 1 accuracy in all the three levels. -:tada: Jan 30, 2024: AutoGen is highlighted by Peter Lee in Microsoft Research Forum [Keynote](https://t.co/nUBSjPDjqD). + :tada: Dec 31, 2023: [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155) is selected by [TheSequence: My Five Favorite AI Papers of 2023](https://thesequence.substack.com/p/my-five-favorite-ai-papers-of-2023). @@ -28,13 +35,13 @@ -:tada: Nov 8, 2023: AutoGen is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html) 35 days after spinoff. +:tada: Nov 8, 2023: AutoGen is selected into [Open100: Top 100 Open Source achievements](https://www.benchcouncil.org/evaluation/opencs/annual.html) 35 days after spinoff from [FLAML](https://github.com/microsoft/FLAML). -:tada: Nov 6, 2023: AutoGen is mentioned by Satya Nadella in a [fireside chat](https://youtu.be/0pLBvgYtv6U). + -:tada: Nov 1, 2023: AutoGen is the top trending repo on GitHub in October 2023. + -:tada: Oct 03, 2023: AutoGen spins off from FLAML on GitHub and has a major paper update (first version on Aug 16). + diff --git a/autogen/agentchat/chat.py b/autogen/agentchat/chat.py index b527f8e0bae..dd489c03625 100644 --- a/autogen/agentchat/chat.py +++ b/autogen/agentchat/chat.py @@ -195,7 +195,9 @@ def initiate_chats(chat_queue: List[Dict[str, Any]]) -> List[ChatResult]: r.summary for i, r in enumerate(finished_chats) if i not in finished_chat_indexes_to_exclude_from_carryover ] - __post_carryover_processing(chat_info) + if not chat_info.get("silent", False): + __post_carryover_processing(chat_info) + sender = chat_info["sender"] chat_res = sender.initiate_chat(**chat_info) finished_chats.append(chat_res) @@ -236,7 +238,10 @@ async def _dependent_chat_future( if isinstance(_chat_carryover, str): _chat_carryover = [_chat_carryover] chat_info["carryover"] = _chat_carryover + [finished_chats[pre_id].summary for pre_id in finished_chats] - __post_carryover_processing(chat_info) + + if not chat_info.get("silent", False): + __post_carryover_processing(chat_info) + sender = chat_info["sender"] chat_res_future = asyncio.create_task(sender.a_initiate_chat(**chat_info)) call_back_with_args = partial(_on_chat_future_done, chat_id=chat_id) diff --git a/autogen/agentchat/contrib/agent_eval/README.md b/autogen/agentchat/contrib/agent_eval/README.md new file mode 100644 index 00000000000..6588a1ec611 --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/README.md @@ -0,0 +1,7 @@ +Agents for running the AgentEval pipeline. + +AgentEval is a process for evaluating a LLM-based system's performance on a given task. + +When given a task to evaluate and a few example runs, the critic and subcritic agents create evaluation criteria for evaluating a system's solution. Once the criteria has been created, the quantifier agent can evaluate subsequent task solutions based on the generated criteria. + +For more information see: [AgentEval Integration Roadmap](https://github.com/microsoft/autogen/issues/2162) diff --git a/autogen/agentchat/contrib/agent_eval/agent_eval.py b/autogen/agentchat/contrib/agent_eval/agent_eval.py new file mode 100644 index 00000000000..b48c65a66d2 --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/agent_eval.py @@ -0,0 +1,101 @@ +from typing import Dict, List, Literal, Optional, Union + +import autogen +from autogen.agentchat.contrib.agent_eval.criterion import Criterion +from autogen.agentchat.contrib.agent_eval.critic_agent import CriticAgent +from autogen.agentchat.contrib.agent_eval.quantifier_agent import QuantifierAgent +from autogen.agentchat.contrib.agent_eval.subcritic_agent import SubCriticAgent +from autogen.agentchat.contrib.agent_eval.task import Task + + +def generate_criteria( + llm_config: Optional[Union[Dict, Literal[False]]] = None, + task: Task = None, + additional_instructions: str = "", + max_round=2, + use_subcritic: bool = False, +): + """ + Creates a list of criteria for evaluating the utility of a given task. + Args: + llm_config (dict or bool): llm inference configuration. + task (Task): The task to evaluate. + additional_instructions (str): Additional instructions for the criteria agent. + max_round (int): The maximum number of rounds to run the conversation. + use_subcritic (bool): Whether to use the subcritic agent to generate subcriteria. + Returns: + list: A list of Criterion objects for evaluating the utility of the given task. + """ + critic = CriticAgent( + system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\n" + additional_instructions, + llm_config=llm_config, + ) + + critic_user = autogen.UserProxyAgent( + name="critic_user", + max_consecutive_auto_reply=0, # terminate without auto-reply + human_input_mode="NEVER", + code_execution_config={"use_docker": False}, + ) + + agents = [critic_user, critic] + + if use_subcritic: + subcritic = SubCriticAgent( + llm_config=llm_config, + ) + agents.append(subcritic) + + groupchat = autogen.GroupChat( + agents=agents, messages=[], max_round=max_round, speaker_selection_method="round_robin" + ) + critic_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config) + + critic_user.initiate_chat(critic_manager, message=task.get_sys_message()) + criteria = critic_user.last_message() + content = criteria["content"] + # need to strip out any extra code around the returned json + content = content[content.find("[") : content.rfind("]") + 1] + criteria = Criterion.parse_json_str(content) + return criteria + + +def quantify_criteria( + llm_config: Optional[Union[Dict, Literal[False]]] = None, + criteria: List[Criterion] = None, + task: Task = None, + test_case: str = "", + ground_truth: str = "", +): + """ + Quantifies the performance of a system using the provided criteria. + Args: + llm_config (dict or bool): llm inference configuration. + criteria ([Criterion]): A list of criteria for evaluating the utility of a given task. + task (Task): The task to evaluate. + test_case (str): The test case to evaluate. + ground_truth (str): The ground truth for the test case. + Returns: + dict: A dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria. + """ + quantifier = QuantifierAgent( + llm_config=llm_config, + ) + + quantifier_user = autogen.UserProxyAgent( + name="quantifier_user", + max_consecutive_auto_reply=0, # terminate without auto-reply + human_input_mode="NEVER", + code_execution_config={"use_docker": False}, + ) + + quantifier_user.initiate_chat( # noqa: F841 + quantifier, + message=task.get_sys_message() + + "Evaluation dictionary: " + + Criterion.write_json(criteria) + + "actual test case to evaluate: " + + test_case, + ) + quantified_results = quantifier_user.last_message() + return {"actual_success": ground_truth, "estimated_performance": quantified_results["content"]} diff --git a/autogen/agentchat/contrib/agent_eval/criterion.py b/autogen/agentchat/contrib/agent_eval/criterion.py new file mode 100644 index 00000000000..5efd121ec07 --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/criterion.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import json +from typing import List + +import pydantic_core +from pydantic import BaseModel +from pydantic.json import pydantic_encoder + + +class Criterion(BaseModel): + """ + A class that represents a criterion for agent evaluation. + """ + + name: str + description: str + accepted_values: List[str] + sub_criteria: List[Criterion] = list() + + @staticmethod + def parse_json_str(criteria: str): + """ + Create a list of Criterion objects from a json string. + Args: + criteria (str): Json string that represents the criteria + returns: + [Criterion]: A list of Criterion objects that represents the json criteria information. + """ + return [Criterion(**crit) for crit in json.loads(criteria)] + + @staticmethod + def write_json(criteria): + """ + Create a json string from a list of Criterion objects. + Args: + criteria ([Criterion]): A list of Criterion objects. + Returns: + str: A json string that represents the list of Criterion objects. + """ + return json.dumps([crit.model_dump() for crit in criteria], indent=2) diff --git a/autogen/agentchat/contrib/agent_eval/critic_agent.py b/autogen/agentchat/contrib/agent_eval/critic_agent.py new file mode 100644 index 00000000000..2f5e5598ba6 --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/critic_agent.py @@ -0,0 +1,41 @@ +from typing import Optional + +from autogen.agentchat.conversable_agent import ConversableAgent + + +class CriticAgent(ConversableAgent): + """ + An agent for creating list of criteria for evaluating the utility of a given task. + """ + + DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant. + Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows + {"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key} + Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description. + Output just the criteria string you have created, no code. + """ + + DEFAULT_DESCRIPTION = "An AI agent for creating list criteria for evaluating the utility of a given task." + + def __init__( + self, + name="critic", + system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE, + description: Optional[str] = DEFAULT_DESCRIPTION, + **kwargs, + ): + """ + Args: + name (str): agent name. + system_message (str): system message for the ChatCompletion inference. + Please override this attribute if you want to reprogram the agent. + description (str): The description of the agent. + **kwargs (dict): Please refer to other kwargs in + [ConversableAgent](../../conversable_agent#__init__). + """ + super().__init__( + name=name, + system_message=system_message, + description=description, + **kwargs, + ) diff --git a/autogen/agentchat/contrib/agent_eval/quantifier_agent.py b/autogen/agentchat/contrib/agent_eval/quantifier_agent.py new file mode 100644 index 00000000000..02a8f650fab --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/quantifier_agent.py @@ -0,0 +1,36 @@ +from typing import Optional + +from autogen.agentchat.conversable_agent import ConversableAgent + + +class QuantifierAgent(ConversableAgent): + """ + An agent for quantifying the performance of a system using the provided criteria. + """ + + DEFAULT_SYSTEM_MESSAGE = """"You are a helpful assistant. You quantify the output of different tasks based on the given criteria. + The criterion is given in a json list format where each element is a distinct criteria. + The each element is a dictionary as follows {"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key} + You are going to quantify each of the crieria for a given task based on the task description. + Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria. + Return only the dictionary, no code.""" + + DEFAULT_DESCRIPTION = "An AI agent for quantifing the performance of a system using the provided criteria." + + def __init__( + self, + name="quantifier", + system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE, + description: Optional[str] = DEFAULT_DESCRIPTION, + **kwargs, + ): + """ + Args: + name (str): agent name. + system_message (str): system message for the ChatCompletion inference. + Please override this attribute if you want to reprogram the agent. + description (str): The description of the agent. + **kwargs (dict): Please refer to other kwargs in + [ConversableAgent](../../conversable_agent#__init__). + """ + super().__init__(name=name, system_message=system_message, description=description, **kwargs) diff --git a/autogen/agentchat/contrib/agent_eval/subcritic_agent.py b/autogen/agentchat/contrib/agent_eval/subcritic_agent.py new file mode 100755 index 00000000000..fa994ee7bda --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/subcritic_agent.py @@ -0,0 +1,42 @@ +from typing import Optional + +from autogen.agentchat.conversable_agent import ConversableAgent + + +class SubCriticAgent(ConversableAgent): + """ + An agent for creating subcriteria from a given list of criteria for evaluating the utility of a given task. + """ + + DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant to the critic agent. You suggest sub criteria for evaluating different tasks based on the criteria provided by the critic agent (if you feel it is needed). + They should be distinguishable, quantifiable, and related to the overall theme of the critic's provided criteria. + You operate by taking in the description of the criteria. You then create a new key called sub criteria where you provide the sub criteria for the given criteria. + The value of the sub_criteria is a dictionary where the keys are the subcriteria and each value is as follows {"description": sub criteria description , "accepted_values": possible accepted inputs for this key} + Do this for each criteria provided by the critic (removing the criteria's accepted values). "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description. + Once you have created the sub criteria for the given criteria, you return the json (make sure to include the contents of the critic's dictionary in the final dictionary as well). + Make sure to return a valid json and no code""" + + DEFAULT_DESCRIPTION = "An AI agent for creating subcriteria from a given list of criteria." + + def __init__( + self, + name="subcritic", + system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE, + description: Optional[str] = DEFAULT_DESCRIPTION, + **kwargs, + ): + """ + Args: + name (str): agent name. + system_message (str): system message for the ChatCompletion inference. + Please override this attribute if you want to reprogram the agent. + description (str): The description of the agent. + **kwargs (dict): Please refer to other kwargs in + [ConversableAgent](../../conversable_agent#__init__). + """ + super().__init__( + name=name, + system_message=system_message, + description=description, + **kwargs, + ) diff --git a/autogen/agentchat/contrib/agent_eval/task.py b/autogen/agentchat/contrib/agent_eval/task.py new file mode 100644 index 00000000000..9f96fbf79e2 --- /dev/null +++ b/autogen/agentchat/contrib/agent_eval/task.py @@ -0,0 +1,37 @@ +import json + +from pydantic import BaseModel + + +class Task(BaseModel): + """ + Class representing a task for agent completion, includes example agent execution for criteria generation. + """ + + name: str + description: str + successful_response: str + failed_response: str + + def get_sys_message(self): + return f"""Task: {self.name}. + Task description: {self.description} + Task successful example: {self.successful_response} + Task failed example: {self.failed_response} + """ + + @staticmethod + def parse_json_str(task: str): + """ + Create a Task object from a json object. + Args: + json_data (dict): A dictionary that represents the task. + Returns: + Task: A Task object that represents the json task information. + """ + json_data = json.loads(task) + name = json_data.get("name") + description = json_data.get("description") + successful_response = json_data.get("successful_response") + failed_response = json_data.get("failed_response") + return Task(name, description, successful_response, failed_response) diff --git a/autogen/agentchat/contrib/capabilities/context_handling.py b/autogen/agentchat/contrib/capabilities/context_handling.py index 173811842eb..44b10259f1b 100644 --- a/autogen/agentchat/contrib/capabilities/context_handling.py +++ b/autogen/agentchat/contrib/capabilities/context_handling.py @@ -8,8 +8,8 @@ from autogen import ConversableAgent, token_count_utils warn( - "Context handling with TransformChatHistory is deprecated. " - "Please use TransformMessages from autogen/agentchat/contrib/capabilities/transform_messages.py instead.", + "Context handling with TransformChatHistory is deprecated and will be removed in `0.2.30`. " + "Please use `TransformMessages`, documentation can be found at https://microsoft.github.io/autogen/docs/topics/handling_long_contexts/intro_to_transform_messages", DeprecationWarning, stacklevel=2, ) diff --git a/autogen/agentchat/contrib/capabilities/transforms.py b/autogen/agentchat/contrib/capabilities/transforms.py index 8303843e881..bc56efd74d2 100644 --- a/autogen/agentchat/contrib/capabilities/transforms.py +++ b/autogen/agentchat/contrib/capabilities/transforms.py @@ -8,6 +8,7 @@ from autogen import token_count_utils from autogen.cache import AbstractCache, Cache +from autogen.oai.openai_utils import filter_config from .text_compressors import LLMLingua, TextCompressor @@ -130,6 +131,8 @@ def __init__( max_tokens: Optional[int] = None, min_tokens: Optional[int] = None, model: str = "gpt-3.5-turbo-0613", + filter_dict: Optional[Dict] = None, + exclude_filter: bool = True, ): """ Args: @@ -140,11 +143,17 @@ def __init__( min_tokens (Optional[int]): Minimum number of tokens in messages to apply the transformation. Must be greater than or equal to 0 if not None. model (str): The target OpenAI model for tokenization alignment. + filter_dict (None or dict): A dictionary to filter out messages that you want/don't want to compress. + If None, no filters will be applied. + exclude_filter (bool): If exclude filter is True (the default value), messages that match the filter will be + excluded from token truncation. If False, messages that match the filter will be truncated. """ self._model = model self._max_tokens_per_message = self._validate_max_tokens(max_tokens_per_message) self._max_tokens = self._validate_max_tokens(max_tokens) self._min_tokens = self._validate_min_tokens(min_tokens, max_tokens) + self._filter_dict = filter_dict + self._exclude_filter = exclude_filter def apply_transform(self, messages: List[Dict]) -> List[Dict]: """Applies token truncation to the conversation history. @@ -169,10 +178,15 @@ def apply_transform(self, messages: List[Dict]) -> List[Dict]: for msg in reversed(temp_messages): # Some messages may not have content. - if not isinstance(msg.get("content"), (str, list)): + if not _is_content_right_type(msg.get("content")): processed_messages.insert(0, msg) continue + if not _should_transform_message(msg, self._filter_dict, self._exclude_filter): + processed_messages.insert(0, msg) + processed_messages_tokens += _count_tokens(msg["content"]) + continue + expected_tokens_remained = self._max_tokens - processed_messages_tokens - self._max_tokens_per_message # If adding this message would exceed the token limit, truncate the last message to meet the total token @@ -282,6 +296,8 @@ def __init__( min_tokens: Optional[int] = None, compression_params: Dict = dict(), cache: Optional[AbstractCache] = Cache.disk(), + filter_dict: Optional[Dict] = None, + exclude_filter: bool = True, ): """ Args: @@ -293,6 +309,10 @@ def __init__( dictionary. cache (None or AbstractCache): The cache client to use to store and retrieve previously compressed messages. If None, no caching will be used. + filter_dict (None or dict): A dictionary to filter out messages that you want/don't want to compress. + If None, no filters will be applied. + exclude_filter (bool): If exclude filter is True (the default value), messages that match the filter will be + excluded from compression. If False, messages that match the filter will be compressed. """ if text_compressor is None: @@ -303,6 +323,8 @@ def __init__( self._text_compressor = text_compressor self._min_tokens = min_tokens self._compression_args = compression_params + self._filter_dict = filter_dict + self._exclude_filter = exclude_filter self._cache = cache # Optimizing savings calculations to optimize log generation @@ -334,7 +356,10 @@ def apply_transform(self, messages: List[Dict]) -> List[Dict]: processed_messages = messages.copy() for message in processed_messages: # Some messages may not have content. - if not isinstance(message.get("content"), (str, list)): + if not _is_content_right_type(message.get("content")): + continue + + if not _should_transform_message(message, self._filter_dict, self._exclude_filter): continue if _is_content_text_empty(message["content"]): @@ -397,7 +422,7 @@ def _cache_set( self, content: Union[str, List[Dict]], compressed_content: Union[str, List[Dict]], tokens_saved: int ): if self._cache: - value = (tokens_saved, json.dumps(compressed_content)) + value = (tokens_saved, compressed_content) self._cache.set(self._cache_key(content), value) def _cache_key(self, content: Union[str, List[Dict]]) -> str: @@ -427,6 +452,10 @@ def _count_tokens(content: Union[str, List[Dict[str, Any]]]) -> int: return token_count +def _is_content_right_type(content: Any) -> bool: + return isinstance(content, (str, list)) + + def _is_content_text_empty(content: Union[str, List[Dict[str, Any]]]) -> bool: if isinstance(content, str): return content == "" @@ -434,3 +463,10 @@ def _is_content_text_empty(content: Union[str, List[Dict[str, Any]]]) -> bool: return all(_is_content_text_empty(item.get("text", "")) for item in content) else: return False + + +def _should_transform_message(message: Dict[str, Any], filter_dict: Optional[Dict[str, Any]], exclude: bool) -> bool: + if not filter_dict: + return True + + return len(filter_config([message], filter_dict, exclude)) > 0 diff --git a/autogen/agentchat/contrib/compressible_agent.py b/autogen/agentchat/contrib/compressible_agent.py index 9c4e78af852..cbedb17ceed 100644 --- a/autogen/agentchat/contrib/compressible_agent.py +++ b/autogen/agentchat/contrib/compressible_agent.py @@ -13,8 +13,8 @@ logger = logging.getLogger(__name__) warn( - "Context handling with CompressibleAgent is deprecated. " - "Please use `TransformMessages`, documentation can be found at https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/capabilities/transform_messages", + "Context handling with CompressibleAgent is deprecated and will be removed in `0.2.30`. " + "Please use `TransformMessages`, documentation can be found at https://microsoft.github.io/autogen/docs/topics/handling_long_contexts/intro_to_transform_messages", DeprecationWarning, stacklevel=2, ) diff --git a/autogen/agentchat/contrib/gpt_assistant_agent.py b/autogen/agentchat/contrib/gpt_assistant_agent.py index 0f5de8adcb5..40a28bfbcfa 100644 --- a/autogen/agentchat/contrib/gpt_assistant_agent.py +++ b/autogen/agentchat/contrib/gpt_assistant_agent.py @@ -11,6 +11,7 @@ from autogen.agentchat.agent import Agent from autogen.agentchat.assistant_agent import AssistantAgent, ConversableAgent from autogen.oai.openai_utils import create_gpt_assistant, retrieve_assistants_by_name, update_gpt_assistant +from autogen.runtime_logging import log_new_agent, logging_enabled logger = logging.getLogger(__name__) @@ -65,6 +66,8 @@ def __init__( super().__init__( name=name, system_message=instructions, human_input_mode="NEVER", llm_config=openai_client_cfg, **kwargs ) + if logging_enabled(): + log_new_agent(self, locals()) # GPTAssistantAgent's azure_deployment param may cause NotFoundError (404) in client.beta.assistants.list() # See: https://github.com/microsoft/autogen/pull/1721 @@ -169,10 +172,11 @@ def __init__( # Tools are specified but overwrite_tools is False; do not update the assistant's tools logger.warning("overwrite_tools is False. Using existing tools from assistant API.") + self.update_system_message(self._openai_assistant.instructions) # lazily create threads self._openai_threads = {} self._unread_index = defaultdict(int) - self.register_reply(Agent, GPTAssistantAgent._invoke_assistant, position=2) + self.register_reply([Agent, None], GPTAssistantAgent._invoke_assistant, position=2) def _invoke_assistant( self, diff --git a/autogen/agentchat/conversable_agent.py b/autogen/agentchat/conversable_agent.py index bfd38a54d60..c3394a96bb6 100644 --- a/autogen/agentchat/conversable_agent.py +++ b/autogen/agentchat/conversable_agent.py @@ -937,6 +937,7 @@ def my_summary_method( One example key is "summary_prompt", and value is a string of text used to prompt a LLM-based agent (the sender or receiver agent) to reflect on the conversation and extract a summary when summary_method is "reflection_with_llm". The default summary_prompt is DEFAULT_SUMMARY_PROMPT, i.e., "Summarize takeaway from the conversation. Do not add any introductory phrases. If the intended request is NOT properly addressed, please point it out." + Another available key is "summary_role", which is the role of the message sent to the agent in charge of summarizing. Default is "system". message (str, dict or Callable): the initial message to be sent to the recipient. Needs to be provided. Otherwise, input() will be called to get the initial message. - If a string or a dict is provided, it will be used as the initial message. `generate_init_message` is called to generate the initial message for the agent based on this string and the context. If dict, it may contain the following reserved fields (either content or tool_calls need to be provided). @@ -1168,8 +1169,13 @@ def _reflection_with_llm_as_summary(sender, recipient, summary_args): raise ValueError("The summary_prompt must be a string.") msg_list = recipient.chat_messages_for_summary(sender) agent = sender if recipient is None else recipient + role = summary_args.get("summary_role", None) + if role and not isinstance(role, str): + raise ValueError("The summary_role in summary_arg must be a string.") try: - summary = sender._reflection_with_llm(prompt, msg_list, llm_agent=agent, cache=summary_args.get("cache")) + summary = sender._reflection_with_llm( + prompt, msg_list, llm_agent=agent, cache=summary_args.get("cache"), role=role + ) except BadRequestError as e: warnings.warn( f"Cannot extract summary using reflection_with_llm: {e}. Using an empty str as summary.", UserWarning @@ -1178,7 +1184,12 @@ def _reflection_with_llm_as_summary(sender, recipient, summary_args): return summary def _reflection_with_llm( - self, prompt, messages, llm_agent: Optional[Agent] = None, cache: Optional[AbstractCache] = None + self, + prompt, + messages, + llm_agent: Optional[Agent] = None, + cache: Optional[AbstractCache] = None, + role: Union[str, None] = None, ) -> str: """Get a chat summary using reflection with an llm client based on the conversation history. @@ -1187,10 +1198,14 @@ def _reflection_with_llm( messages (list): The messages generated as part of a chat conversation. llm_agent: the agent with an llm client. cache (AbstractCache or None): the cache client to be used for this conversation. + role (str): the role of the message, usually "system" or "user". Default is "system". """ + if not role: + role = "system" + system_msg = [ { - "role": "system", + "role": role, "content": prompt, } ] @@ -2391,6 +2406,8 @@ def register_function(self, function_map: Dict[str, Union[Callable, None]]): self._assert_valid_name(name) if func is None and name not in self._function_map.keys(): warnings.warn(f"The function {name} to remove doesn't exist", name) + if name in self._function_map: + warnings.warn(f"Function '{name}' is being overridden.", UserWarning) self._function_map.update(function_map) self._function_map = {k: v for k, v in self._function_map.items() if v is not None} @@ -2427,6 +2444,9 @@ def update_function_signature(self, func_sig: Union[str, Dict], is_remove: None) self._assert_valid_name(func_sig["name"]) if "functions" in self.llm_config.keys(): + if any(func["name"] == func_sig["name"] for func in self.llm_config["functions"]): + warnings.warn(f"Function '{func_sig['name']}' is being overridden.", UserWarning) + self.llm_config["functions"] = [ func for func in self.llm_config["functions"] if func.get("name") != func_sig["name"] ] + [func_sig] @@ -2466,7 +2486,9 @@ def update_tool_signature(self, tool_sig: Union[str, Dict], is_remove: None): f"The tool signature must be of the type dict. Received tool signature type {type(tool_sig)}" ) self._assert_valid_name(tool_sig["function"]["name"]) - if "tools" in self.llm_config.keys(): + if "tools" in self.llm_config: + if any(tool["function"]["name"] == tool_sig["function"]["name"] for tool in self.llm_config["tools"]): + warnings.warn(f"Function '{tool_sig['function']['name']}' is being overridden.", UserWarning) self.llm_config["tools"] = [ tool for tool in self.llm_config["tools"] diff --git a/autogen/agentchat/groupchat.py b/autogen/agentchat/groupchat.py index 86492455080..83c426272a2 100644 --- a/autogen/agentchat/groupchat.py +++ b/autogen/agentchat/groupchat.py @@ -1,3 +1,5 @@ +import copy +import json import logging import random import re @@ -12,6 +14,7 @@ from ..io.base import IOStream from ..runtime_logging import log_new_agent, logging_enabled from .agent import Agent +from .chat import ChatResult from .conversable_agent import ConversableAgent logger = logging.getLogger(__name__) @@ -36,6 +39,7 @@ class GroupChat: Then select the next role from {agentlist} to play. Only return the role." - select_speaker_prompt_template: customize the select speaker prompt (used in "auto" speaker selection), which appears last in the message context and generally includes the list of agents and guidance for the LLM to select the next agent. If the string contains "{agentlist}" it will be replaced with a comma-separated list of agent names in square brackets. The default value is: "Read the above conversation. Then select the next role from {agentlist} to play. Only return the role." + To ignore this prompt being used, set this to None. If set to None, ensure your instructions for selecting a speaker are in the select_speaker_message_template string. - select_speaker_auto_multiple_template: customize the follow-up prompt used when selecting a speaker fails with a response that contains multiple agent names. This prompt guides the LLM to return just one agent name. Applies only to "auto" speaker selection method. If the string contains "{agentlist}" it will be replaced with a comma-separated list of agent names in square brackets. The default value is: "You provided more than one name in your text, please return just the name of the next speaker. To determine the speaker use these prioritised rules: 1. If the context refers to themselves as a speaker e.g. "As the..." , choose that speaker's name @@ -98,15 +102,15 @@ def custom_speaker_selection_func( agents: List[Agent] messages: List[Dict] - max_round: Optional[int] = 10 - admin_name: Optional[str] = "Admin" - func_call_filter: Optional[bool] = True + max_round: int = 10 + admin_name: str = "Admin" + func_call_filter: bool = True speaker_selection_method: Union[Literal["auto", "manual", "random", "round_robin"], Callable] = "auto" - max_retries_for_selecting_speaker: Optional[int] = 2 + max_retries_for_selecting_speaker: int = 2 allow_repeat_speaker: Optional[Union[bool, List[Agent]]] = None allowed_or_disallowed_speaker_transitions: Optional[Dict] = None speaker_transitions_type: Literal["allowed", "disallowed", None] = None - enable_clear_history: Optional[bool] = False + enable_clear_history: bool = False send_introductions: bool = False select_speaker_message_template: str = """You are in a role play game. The following roles are available: {roles}. @@ -222,8 +226,8 @@ def __post_init__(self): if self.select_speaker_message_template is None or len(self.select_speaker_message_template) == 0: raise ValueError("select_speaker_message_template cannot be empty or None.") - if self.select_speaker_prompt_template is None or len(self.select_speaker_prompt_template) == 0: - raise ValueError("select_speaker_prompt_template cannot be empty or None.") + if self.select_speaker_prompt_template is not None and len(self.select_speaker_prompt_template) == 0: + self.select_speaker_prompt_template = None if self.role_for_select_speaker_messages is None or len(self.role_for_select_speaker_messages) == 0: raise ValueError("role_for_select_speaker_messages cannot be empty or None.") @@ -327,7 +331,13 @@ def select_speaker_msg(self, agents: Optional[List[Agent]] = None) -> str: return return_msg def select_speaker_prompt(self, agents: Optional[List[Agent]] = None) -> str: - """Return the floating system prompt selecting the next speaker. This is always the *last* message in the context.""" + """Return the floating system prompt selecting the next speaker. + This is always the *last* message in the context. + Will return None if the select_speaker_prompt_template is None.""" + + if self.select_speaker_prompt_template is None: + return None + if agents is None: agents = self.agents @@ -621,23 +631,35 @@ def validate_speaker_name(recipient, messages, sender, config) -> Tuple[bool, Un remove_other_reply_funcs=True, ) + # NOTE: Do we have a speaker prompt (select_speaker_prompt_template is not None)? If we don't, we need to feed in the last message to start the nested chat + # Agent for selecting a single agent name from the response speaker_selection_agent = ConversableAgent( "speaker_selection_agent", system_message=self.select_speaker_msg(agents), - chat_messages={checking_agent: messages}, + chat_messages=( + {checking_agent: messages} + if self.select_speaker_prompt_template is not None + else {checking_agent: messages[:-1]} + ), llm_config=selector.llm_config, human_input_mode="NEVER", # Suppresses some extra terminal outputs, outputs will be handled by select_speaker_auto_verbose ) + # Create the starting message + if self.select_speaker_prompt_template is not None: + start_message = { + "content": self.select_speaker_prompt(agents), + "override_role": self.role_for_select_speaker_messages, + } + else: + start_message = messages[-1] + # Run the speaker selection chat result = checking_agent.initiate_chat( speaker_selection_agent, cache=None, # don't use caching for the speaker selection chat - message={ - "content": self.select_speaker_prompt(agents), - "override_role": self.role_for_select_speaker_messages, - }, + message=start_message, max_turns=2 * max(1, max_attempts), # Limiting the chat to the number of attempts, including the initial one clear_history=False, @@ -708,6 +730,8 @@ def validate_speaker_name(recipient, messages, sender, config) -> Tuple[bool, Un remove_other_reply_funcs=True, ) + # NOTE: Do we have a speaker prompt (select_speaker_prompt_template is not None)? If we don't, we need to feed in the last message to start the nested chat + # Agent for selecting a single agent name from the response speaker_selection_agent = ConversableAgent( "speaker_selection_agent", @@ -717,11 +741,20 @@ def validate_speaker_name(recipient, messages, sender, config) -> Tuple[bool, Un human_input_mode="NEVER", # Suppresses some extra terminal outputs, outputs will be handled by select_speaker_auto_verbose ) + # Create the starting message + if self.select_speaker_prompt_template is not None: + start_message = { + "content": self.select_speaker_prompt(agents), + "override_role": self.role_for_select_speaker_messages, + } + else: + start_message = messages[-1] + # Run the speaker selection chat result = await checking_agent.a_initiate_chat( speaker_selection_agent, cache=None, # don't use caching for the speaker selection chat - message=self.select_speaker_prompt(agents), + message=start_message, max_turns=2 * max(1, max_attempts), # Limiting the chat to the number of attempts, including the initial one clear_history=False, @@ -914,6 +947,7 @@ def __init__( max_consecutive_auto_reply: Optional[int] = sys.maxsize, human_input_mode: Optional[str] = "NEVER", system_message: Optional[Union[str, List]] = "Group chat manager.", + silent: bool = False, **kwargs, ): if ( @@ -937,6 +971,8 @@ def __init__( # Store groupchat self._groupchat = groupchat + self._silent = silent + # Order of register_reply is important. # Allow sync chat if initiated using initiate_chat self.register_reply(Agent, GroupChatManager.run_chat, config=groupchat, reset_config=GroupChat.reset) @@ -989,6 +1025,7 @@ def run_chat( speaker = sender groupchat = config send_introductions = getattr(groupchat, "send_introductions", False) + silent = getattr(self, "_silent", False) if send_introductions: # Broadcast the intro @@ -1043,7 +1080,7 @@ def run_chat( reply["content"] = self.clear_agents_history(reply, groupchat) # The speaker sends the message without requesting a reply - speaker.send(reply, self, request_reply=False) + speaker.send(reply, self, request_reply=False, silent=silent) message = self.last_message(speaker) if self.client_cache is not None: for a in groupchat.agents: @@ -1064,6 +1101,7 @@ async def a_run_chat( speaker = sender groupchat = config send_introductions = getattr(groupchat, "send_introductions", False) + silent = getattr(self, "_silent", False) if send_introductions: # Broadcast the intro @@ -1108,7 +1146,7 @@ async def a_run_chat( if reply is None: break # The speaker sends the message without requesting a reply - await speaker.a_send(reply, self, request_reply=False) + await speaker.a_send(reply, self, request_reply=False, silent=silent) message = self.last_message(speaker) if self.client_cache is not None: for a in groupchat.agents: @@ -1116,6 +1154,290 @@ async def a_run_chat( a.previous_cache = None return True, None + def resume( + self, + messages: Union[List[Dict], str], + remove_termination_string: str = None, + silent: Optional[bool] = False, + ) -> Tuple[ConversableAgent, Dict]: + """Resumes a group chat using the previous messages as a starting point. Requires the agents, group chat, and group chat manager to be established + as per the original group chat. + + Args: + - messages Union[List[Dict], str]: The content of the previous chat's messages, either as a Json string or a list of message dictionaries. + - remove_termination_string str: Remove the provided string from the last message to prevent immediate termination + - silent (bool or None): (Experimental) whether to print the messages for this conversation. Default is False. + + Returns: + - Tuple[ConversableAgent, Dict]: A tuple containing the last agent who spoke and their message + """ + + # Convert messages from string to messages list, if needed + if isinstance(messages, str): + messages = self.messages_from_string(messages) + elif isinstance(messages, list) and all(isinstance(item, dict) for item in messages): + messages = copy.deepcopy(messages) + else: + raise Exception("Messages is not of type str or List[Dict]") + + # Clean up the objects, ensuring there are no messages in the agents and group chat + + # Clear agent message history + for agent in self._groupchat.agents: + if isinstance(agent, ConversableAgent): + agent.clear_history() + + # Clear Manager message history + self.clear_history() + + # Clear GroupChat messages + self._groupchat.reset() + + # Validation of message and agents + + try: + self._valid_resume_messages(messages) + except: + raise + + # Load the messages into the group chat + for i, message in enumerate(messages): + + if "name" in message: + message_speaker_agent = self._groupchat.agent_by_name(message["name"]) + else: + # If there's no name, assign the group chat manager (this is an indication the ChatResult messages was used instead of groupchat.messages as state) + message_speaker_agent = self + message["name"] = self.name + + # If it wasn't an agent speaking, it may be the manager + if not message_speaker_agent and message["name"] == self.name: + message_speaker_agent = self + + # Add previous messages to each agent (except their own messages and the last message, as we'll kick off the conversation with it) + if i != len(messages) - 1: + for agent in self._groupchat.agents: + if agent.name != message["name"]: + self.send(message, self._groupchat.agent_by_name(agent.name), request_reply=False, silent=True) + + # Add previous message to the new groupchat, if it's an admin message the name may not match so add the message directly + if message_speaker_agent: + self._groupchat.append(message, message_speaker_agent) + else: + self._groupchat.messages.append(message) + + # Last speaker agent + last_speaker_name = message["name"] + + # Last message to check for termination (we could avoid this by ignoring termination check for resume in the future) + last_message = message + + # Get last speaker as an agent + previous_last_agent = self._groupchat.agent_by_name(name=last_speaker_name) + + # If we didn't match a last speaker agent, we check that it's the group chat's admin name and assign the manager, if so + if not previous_last_agent and ( + last_speaker_name == self._groupchat.admin_name or last_speaker_name == self.name + ): + previous_last_agent = self + + # Termination removal and check + self._process_resume_termination(remove_termination_string, messages) + + if not silent: + iostream = IOStream.get_default() + iostream.print( + f"Prepared group chat with {len(messages)} messages, the last speaker is", + colored(last_speaker_name, "yellow"), + flush=True, + ) + + # Update group chat settings for resuming + self._groupchat.send_introductions = False + + return previous_last_agent, last_message + + async def a_resume( + self, + messages: Union[List[Dict], str], + remove_termination_string: str = None, + silent: Optional[bool] = False, + ) -> Tuple[ConversableAgent, Dict]: + """Resumes a group chat using the previous messages as a starting point, asynchronously. Requires the agents, group chat, and group chat manager to be established + as per the original group chat. + + Args: + - messages Union[List[Dict], str]: The content of the previous chat's messages, either as a Json string or a list of message dictionaries. + - remove_termination_string str: Remove the provided string from the last message to prevent immediate termination + - silent (bool or None): (Experimental) whether to print the messages for this conversation. Default is False. + + Returns: + - Tuple[ConversableAgent, Dict]: A tuple containing the last agent who spoke and their message + """ + + # Convert messages from string to messages list, if needed + if isinstance(messages, str): + messages = self.messages_from_string(messages) + elif isinstance(messages, list) and all(isinstance(item, dict) for item in messages): + messages = copy.deepcopy(messages) + else: + raise Exception("Messages is not of type str or List[Dict]") + + # Clean up the objects, ensuring there are no messages in the agents and group chat + + # Clear agent message history + for agent in self._groupchat.agents: + if isinstance(agent, ConversableAgent): + agent.clear_history() + + # Clear Manager message history + self.clear_history() + + # Clear GroupChat messages + self._groupchat.reset() + + # Validation of message and agents + + try: + self._valid_resume_messages(messages) + except: + raise + + # Load the messages into the group chat + for i, message in enumerate(messages): + + if "name" in message: + message_speaker_agent = self._groupchat.agent_by_name(message["name"]) + else: + # If there's no name, assign the group chat manager (this is an indication the ChatResult messages was used instead of groupchat.messages as state) + message_speaker_agent = self + message["name"] = self.name + + # If it wasn't an agent speaking, it may be the manager + if not message_speaker_agent and message["name"] == self.name: + message_speaker_agent = self + + # Add previous messages to each agent (except their own messages and the last message, as we'll kick off the conversation with it) + if i != len(messages) - 1: + for agent in self._groupchat.agents: + if agent.name != message["name"]: + await self.a_send( + message, self._groupchat.agent_by_name(agent.name), request_reply=False, silent=True + ) + + # Add previous message to the new groupchat, if it's an admin message the name may not match so add the message directly + if message_speaker_agent: + self._groupchat.append(message, message_speaker_agent) + else: + self._groupchat.messages.append(message) + + # Last speaker agent + last_speaker_name = message["name"] + + # Last message to check for termination (we could avoid this by ignoring termination check for resume in the future) + last_message = message + + # Get last speaker as an agent + previous_last_agent = self._groupchat.agent_by_name(name=last_speaker_name) + + # If we didn't match a last speaker agent, we check that it's the group chat's admin name and assign the manager, if so + if not previous_last_agent and ( + last_speaker_name == self._groupchat.admin_name or last_speaker_name == self.name + ): + previous_last_agent = self + + # Termination removal and check + self._process_resume_termination(remove_termination_string, messages) + + if not silent: + iostream = IOStream.get_default() + iostream.print( + f"Prepared group chat with {len(messages)} messages, the last speaker is", + colored(last_speaker_name, "yellow"), + flush=True, + ) + + # Update group chat settings for resuming + self._groupchat.send_introductions = False + + return previous_last_agent, last_message + + def _valid_resume_messages(self, messages: List[Dict]): + """Validates the messages used for resuming + + args: + messages (List[Dict]): list of messages to resume with + + returns: + - bool: Whether they are valid for resuming + """ + # Must have messages to start with, otherwise they should run run_chat + if not messages: + raise Exception( + "Cannot resume group chat as no messages were provided. Use GroupChatManager.run_chat or ConversableAgent.initiate_chat to start a new chat." + ) + + # Check that all agents in the chat messages exist in the group chat + for message in messages: + if message.get("name"): + if ( + not self._groupchat.agent_by_name(message["name"]) + and not message["name"] == self._groupchat.admin_name # ignore group chat's name + and not message["name"] == self.name # ignore group chat manager's name + ): + raise Exception(f"Agent name in message doesn't exist as agent in group chat: {message['name']}") + + def _process_resume_termination(self, remove_termination_string: str, messages: List[Dict]): + """Removes termination string, if required, and checks if termination may occur. + + args: + remove_termination_string (str): termination string to remove from the last message + + returns: + None + """ + + last_message = messages[-1] + + # Replace any given termination string in the last message + if remove_termination_string: + if messages[-1].get("content") and remove_termination_string in messages[-1]["content"]: + messages[-1]["content"] = messages[-1]["content"].replace(remove_termination_string, "") + + # Check if the last message meets termination (if it has one) + if self._is_termination_msg: + if self._is_termination_msg(last_message): + logger.warning("WARNING: Last message meets termination criteria and this may terminate the chat.") + + def messages_from_string(self, message_string: str) -> List[Dict]: + """Reads the saved state of messages in Json format for resume and returns as a messages list + + args: + - message_string: Json string, the saved state + + returns: + - List[Dict]: List of messages + """ + try: + state = json.loads(message_string) + except json.JSONDecodeError: + raise Exception("Messages string is not a valid JSON string") + + return state + + def messages_to_string(self, messages: List[Dict]) -> str: + """Converts the provided messages into a Json string that can be used for resuming the chat. + The state is made up of a list of messages + + args: + - messages (List[Dict]): set of messages to convert to a string + + returns: + - str: Json representation of the messages which can be persisted for resuming later + """ + + return json.dumps(messages) + def _raise_exception_on_async_reply_functions(self) -> None: """Raise an exception if any async reply functions are registered. diff --git a/autogen/code_utils.py b/autogen/code_utils.py index e1bc951f099..98ed6067066 100644 --- a/autogen/code_utils.py +++ b/autogen/code_utils.py @@ -6,8 +6,10 @@ import subprocess import sys import time +import venv from concurrent.futures import ThreadPoolExecutor, TimeoutError from hashlib import md5 +from types import SimpleNamespace from typing import Any, Callable, Dict, List, Optional, Tuple, Union import docker @@ -719,3 +721,19 @@ def implement( # cost += metrics["gen_cost"] # if metrics["succeed_assertions"] or i == len(configs) - 1: # return responses[metrics["index_selected"]], cost, i + + +def create_virtual_env(dir_path: str, **env_args) -> SimpleNamespace: + """Creates a python virtual environment and returns the context. + + Args: + dir_path (str): Directory path where the env will be created. + **env_args: Any extra args to pass to the `EnvBuilder` + + Returns: + SimpleNamespace: the virtual env context object.""" + if not env_args: + env_args = {"with_pip": True} + env_builder = venv.EnvBuilder(**env_args) + env_builder.create(dir_path) + return env_builder.ensure_directories(dir_path) diff --git a/autogen/coding/local_commandline_code_executor.py b/autogen/coding/local_commandline_code_executor.py index ed92cd527be..29172bbe922 100644 --- a/autogen/coding/local_commandline_code_executor.py +++ b/autogen/coding/local_commandline_code_executor.py @@ -1,4 +1,5 @@ import logging +import os import re import subprocess import sys @@ -6,6 +7,7 @@ from hashlib import md5 from pathlib import Path from string import Template +from types import SimpleNamespace from typing import Any, Callable, ClassVar, Dict, List, Optional, Union from typing_extensions import ParamSpec @@ -64,6 +66,7 @@ class LocalCommandLineCodeExecutor(CodeExecutor): def __init__( self, timeout: int = 60, + virtual_env_context: Optional[SimpleNamespace] = None, work_dir: Union[Path, str] = Path("."), functions: List[Union[FunctionWithRequirements[Any, A], Callable[..., Any], FunctionWithRequirementsStr]] = [], functions_module: str = "functions", @@ -82,8 +85,22 @@ def __init__( PowerShell (pwsh, powershell, ps1), HTML, CSS, and JavaScript. Execution policies determine whether each language's code blocks are executed or saved only. + ## Execution with a Python virtual environment + A python virtual env can be used to execute code and install dependencies. This has the added benefit of not polluting the + base environment with unwanted modules. + ```python + from autogen.code_utils import create_virtual_env + from autogen.coding import LocalCommandLineCodeExecutor + + venv_dir = ".venv" + venv_context = create_virtual_env(venv_dir) + + executor = LocalCommandLineCodeExecutor(virtual_env_context=venv_context) + ``` + Args: timeout (int): The timeout for code execution, default is 60 seconds. + virtual_env_context (Optional[SimpleNamespace]): The virtual environment context to use. work_dir (Union[Path, str]): The working directory for code execution, defaults to the current directory. functions (List[Union[FunctionWithRequirements[Any, A], Callable[..., Any], FunctionWithRequirementsStr]]): A list of callable functions available to the executor. functions_module (str): The module name under which functions are accessible. @@ -105,6 +122,7 @@ def __init__( self._timeout = timeout self._work_dir: Path = work_dir + self._virtual_env_context: Optional[SimpleNamespace] = virtual_env_context self._functions = functions # Setup could take some time so we intentionally wait for the first code block to do it. @@ -196,7 +214,11 @@ def _setup_functions(self) -> None: required_packages = list(set(flattened_packages)) if len(required_packages) > 0: logging.info("Ensuring packages are installed in executor.") - cmd = [sys.executable, "-m", "pip", "install"] + required_packages + if self._virtual_env_context: + py_executable = self._virtual_env_context.env_exe + else: + py_executable = sys.executable + cmd = [py_executable, "-m", "pip", "install"] + required_packages try: result = subprocess.run( cmd, cwd=self._work_dir, capture_output=True, text=True, timeout=float(self._timeout) @@ -269,9 +291,18 @@ def _execute_code_dont_check_setup(self, code_blocks: List[CodeBlock]) -> Comman program = _cmd(lang) cmd = [program, str(written_file.absolute())] + env = os.environ.copy() + + if self._virtual_env_context: + path_with_virtualenv = rf"{self._virtual_env_context.bin_path}{os.pathsep}{env['PATH']}" + env["PATH"] = path_with_virtualenv + if WIN32: + activation_script = os.path.join(self._virtual_env_context.bin_path, "activate.bat") + cmd = [activation_script, "&&", *cmd] + try: result = subprocess.run( - cmd, cwd=self._work_dir, capture_output=True, text=True, timeout=float(self._timeout) + cmd, cwd=self._work_dir, capture_output=True, text=True, timeout=float(self._timeout), env=env ) except subprocess.TimeoutExpired: logs_all += "\n" + TIMEOUT_MSG diff --git a/autogen/logger/file_logger.py b/autogen/logger/file_logger.py new file mode 100644 index 00000000000..466ed62c849 --- /dev/null +++ b/autogen/logger/file_logger.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +import json +import logging +import os +import threading +import uuid +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from openai import AzureOpenAI, OpenAI +from openai.types.chat import ChatCompletion + +from autogen.logger.base_logger import BaseLogger +from autogen.logger.logger_utils import get_current_ts, to_dict + +from .base_logger import LLMConfig + +if TYPE_CHECKING: + from autogen import Agent, ConversableAgent, OpenAIWrapper + from autogen.oai.gemini import GeminiClient + +logger = logging.getLogger(__name__) + + +class FileLogger(BaseLogger): + def __init__(self, config: Dict[str, Any]): + self.config = config + self.session_id = str(uuid.uuid4()) + + curr_dir = os.getcwd() + self.log_dir = os.path.join(curr_dir, "autogen_logs") + os.makedirs(self.log_dir, exist_ok=True) + + self.log_file = os.path.join(self.log_dir, self.config.get("filename", "runtime.log")) + try: + with open(self.log_file, "a"): + pass + except Exception as e: + logger.error(f"[file_logger] Failed to create logging file: {e}") + + self.logger = logging.getLogger(__name__) + self.logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(self.log_file) + self.logger.addHandler(file_handler) + + def start(self) -> str: + """Start the logger and return the session_id.""" + try: + self.logger.info(f"Started new session with Session ID: {self.session_id}") + except Exception as e: + logger.error(f"[file_logger] Failed to create logging file: {e}") + finally: + return self.session_id + + def log_chat_completion( + self, + invocation_id: uuid.UUID, + client_id: int, + wrapper_id: int, + request: Dict[str, Union[float, str, List[Dict[str, str]]]], + response: Union[str, ChatCompletion], + is_cached: int, + cost: float, + start_time: str, + ) -> None: + """ + Log a chat completion. + """ + thread_id = threading.get_ident() + try: + log_data = json.dumps( + { + "invocation_id": str(invocation_id), + "client_id": client_id, + "wrapper_id": wrapper_id, + "request": to_dict(request), + "response": str(response), + "is_cached": is_cached, + "cost": cost, + "start_time": start_time, + "end_time": get_current_ts(), + "thread_id": thread_id, + } + ) + + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log chat completion: {e}") + + def log_new_agent(self, agent: ConversableAgent, init_args: Dict[str, Any] = {}) -> None: + """ + Log a new agent instance. + """ + thread_id = threading.get_ident() + + try: + log_data = json.dumps( + { + "id": id(agent), + "agent_name": agent.name if hasattr(agent, "name") and agent.name is not None else "", + "wrapper_id": to_dict( + agent.client.wrapper_id if hasattr(agent, "client") and agent.client is not None else "" + ), + "session_id": self.session_id, + "current_time": get_current_ts(), + "agent_type": type(agent).__name__, + "args": to_dict(init_args), + "thread_id": thread_id, + } + ) + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log new agent: {e}") + + def log_event(self, source: Union[str, Agent], name: str, **kwargs: Dict[str, Any]) -> None: + """ + Log an event from an agent or a string source. + """ + from autogen import Agent + + # This takes an object o as input and returns a string. If the object o cannot be serialized, instead of raising an error, + # it returns a string indicating that the object is non-serializable, along with its type's qualified name obtained using __qualname__. + json_args = json.dumps(kwargs, default=lambda o: f"<>") + thread_id = threading.get_ident() + + if isinstance(source, Agent): + try: + log_data = json.dumps( + { + "source_id": id(source), + "source_name": str(source.name) if hasattr(source, "name") else source, + "event_name": name, + "agent_module": source.__module__, + "agent_class": source.__class__.__name__, + "json_state": json_args, + "timestamp": get_current_ts(), + "thread_id": thread_id, + } + ) + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log event {e}") + else: + try: + log_data = json.dumps( + { + "source_id": id(source), + "source_name": str(source.name) if hasattr(source, "name") else source, + "event_name": name, + "json_state": json_args, + "timestamp": get_current_ts(), + "thread_id": thread_id, + } + ) + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log event {e}") + + def log_new_wrapper( + self, wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLMConfig, List[LLMConfig]]] = {} + ) -> None: + """ + Log a new wrapper instance. + """ + thread_id = threading.get_ident() + + try: + log_data = json.dumps( + { + "wrapper_id": id(wrapper), + "session_id": self.session_id, + "json_state": json.dumps(init_args), + "timestamp": get_current_ts(), + "thread_id": thread_id, + } + ) + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log event {e}") + + def log_new_client( + self, client: AzureOpenAI | OpenAI | GeminiClient, wrapper: OpenAIWrapper, init_args: Dict[str, Any] + ) -> None: + """ + Log a new client instance. + """ + thread_id = threading.get_ident() + + try: + log_data = json.dumps( + { + "client_id": id(client), + "wrapper_id": id(wrapper), + "session_id": self.session_id, + "class": type(client).__name__, + "json_state": json.dumps(init_args), + "timestamp": get_current_ts(), + "thread_id": thread_id, + } + ) + self.logger.info(log_data) + except Exception as e: + self.logger.error(f"[file_logger] Failed to log event {e}") + + def get_connection(self) -> None: + """Method is intentionally left blank because there is no specific connection needed for the FileLogger.""" + pass + + def stop(self) -> None: + """Close the file handler and remove it from the logger.""" + for handler in self.logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.close() + self.logger.removeHandler(handler) diff --git a/autogen/logger/logger_factory.py b/autogen/logger/logger_factory.py index 8073c0c07d3..ed9567977bb 100644 --- a/autogen/logger/logger_factory.py +++ b/autogen/logger/logger_factory.py @@ -1,6 +1,7 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional from autogen.logger.base_logger import BaseLogger +from autogen.logger.file_logger import FileLogger from autogen.logger.sqlite_logger import SqliteLogger __all__ = ("LoggerFactory",) @@ -8,11 +9,15 @@ class LoggerFactory: @staticmethod - def get_logger(logger_type: str = "sqlite", config: Optional[Dict[str, Any]] = None) -> BaseLogger: + def get_logger( + logger_type: Literal["sqlite", "file"] = "sqlite", config: Optional[Dict[str, Any]] = None + ) -> BaseLogger: if config is None: config = {} if logger_type == "sqlite": return SqliteLogger(config) + elif logger_type == "file": + return FileLogger(config) else: raise ValueError(f"[logger_factory] Unknown logger type: {logger_type}") diff --git a/autogen/logger/sqlite_logger.py b/autogen/logger/sqlite_logger.py index 6e95a571cd0..42db83d849d 100644 --- a/autogen/logger/sqlite_logger.py +++ b/autogen/logger/sqlite_logger.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from autogen import Agent, ConversableAgent, OpenAIWrapper + from autogen.oai.gemini import GeminiClient logger = logging.getLogger(__name__) lock = threading.Lock() @@ -316,7 +317,7 @@ def log_new_wrapper(self, wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLM self._run_query(query=query, args=args) def log_new_client( - self, client: Union[AzureOpenAI, OpenAI], wrapper: OpenAIWrapper, init_args: Dict[str, Any] + self, client: Union[AzureOpenAI, OpenAI, GeminiClient], wrapper: OpenAIWrapper, init_args: Dict[str, Any] ) -> None: if self.con is None: return diff --git a/autogen/oai/client.py b/autogen/oai/client.py index 3edfa40d4ec..982d1c0d57f 100644 --- a/autogen/oai/client.py +++ b/autogen/oai/client.py @@ -435,7 +435,8 @@ def _register_default_client(self, config: Dict[str, Any], openai_config: Dict[s elif api_type is not None and api_type.startswith("google"): if gemini_import_exception: raise ImportError("Please install `google-generativeai` to use Google OpenAI API.") - self._clients.append(GeminiClient(**openai_config)) + client = GeminiClient(**openai_config) + self._clients.append(client) else: client = OpenAI(**openai_config) self._clients.append(OpenAIClient(client)) diff --git a/autogen/oai/gemini.py b/autogen/oai/gemini.py index fcf7e09c025..5c06a4def0c 100644 --- a/autogen/oai/gemini.py +++ b/autogen/oai/gemini.py @@ -5,8 +5,18 @@ llm_config={ "config_list": [{ "api_type": "google", - "model": "models/gemini-pro", - "api_key": os.environ.get("GOOGLE_API_KEY") + "model": "gemini-pro", + "api_key": os.environ.get("GOOGLE_API_KEY"), + "safety_settings": [ + {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, + {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"}, + {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_ONLY_HIGH"}, + {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_ONLY_HIGH"} + ], + "top_p":0.5, + "max_tokens": 2048, + "temperature": 1.0, + "top_k": 5 } ]} @@ -47,6 +57,17 @@ class GeminiClient: of AutoGen. """ + # Mapping, where Key is a term used by Autogen, and Value is a term used by Gemini + PARAMS_MAPPING = { + "max_tokens": "max_output_tokens", + # "n": "candidate_count", # Gemini supports only `n=1` + "stop_sequences": "stop_sequences", + "temperature": "temperature", + "top_p": "top_p", + "top_k": "top_k", + "max_output_tokens": "max_output_tokens", + } + def __init__(self, **kwargs): self.api_key = kwargs.get("api_key", None) if not self.api_key: @@ -93,12 +114,15 @@ def create(self, params: Dict) -> ChatCompletion: messages = params.get("messages", []) stream = params.get("stream", False) n_response = params.get("n", 1) - params.get("temperature", 0.5) - params.get("top_p", 1.0) - params.get("max_tokens", 4096) + + generation_config = { + gemini_term: params[autogen_term] + for autogen_term, gemini_term in self.PARAMS_MAPPING.items() + if autogen_term in params + } + safety_settings = params.get("safety_settings", {}) if stream: - # warn user that streaming is not supported warnings.warn( "Streaming is not supported for Gemini yet, and it will have no effect. Please set stream=False.", UserWarning, @@ -112,7 +136,9 @@ def create(self, params: Dict) -> ChatCompletion: gemini_messages = oai_messages_to_gemini_messages(messages) # we use chat model by default - model = genai.GenerativeModel(model_name) + model = genai.GenerativeModel( + model_name, generation_config=generation_config, safety_settings=safety_settings + ) genai.configure(api_key=self.api_key) chat = model.start_chat(history=gemini_messages[:-1]) max_retries = 5 @@ -142,7 +168,9 @@ def create(self, params: Dict) -> ChatCompletion: elif model_name == "gemini-pro-vision": # B. handle the vision model # Gemini's vision model does not support chat history yet - model = genai.GenerativeModel(model_name) + model = genai.GenerativeModel( + model_name, generation_config=generation_config, safety_settings=safety_settings + ) genai.configure(api_key=self.api_key) # chat = model.start_chat(history=gemini_messages[:-1]) # response = chat.send_message(gemini_messages[-1]) diff --git a/autogen/oai/openai_utils.py b/autogen/oai/openai_utils.py index 7e738b7bd61..1ed347f6271 100644 --- a/autogen/oai/openai_utils.py +++ b/autogen/oai/openai_utils.py @@ -16,7 +16,10 @@ NON_CACHE_KEY = ["api_key", "base_url", "api_type", "api_version"] DEFAULT_AZURE_API_VERSION = "2024-02-15-preview" OAI_PRICE1K = { - # https://openai.com/pricing + # https://openai.com/api/pricing/ + # gpt-4o + "gpt-4o": (0.005, 0.015), + "gpt-4o-2024-05-13": (0.005, 0.015), # gpt-4-turbo "gpt-4-turbo-2024-04-09": (0.01, 0.03), # gpt-4 @@ -376,11 +379,10 @@ def config_list_gpt4_gpt35( def filter_config( config_list: List[Dict[str, Any]], filter_dict: Optional[Dict[str, Union[List[Union[str, None]], Set[Union[str, None]]]]], + exclude: bool = False, ) -> List[Dict[str, Any]]: - """ - This function filters `config_list` by checking each configuration dictionary against the - criteria specified in `filter_dict`. A configuration dictionary is retained if for every - key in `filter_dict`, see example below. + """This function filters `config_list` by checking each configuration dictionary against the criteria specified in + `filter_dict`. A configuration dictionary is retained if for every key in `filter_dict`, see example below. Args: config_list (list of dict): A list of configuration dictionaries to be filtered. @@ -391,71 +393,68 @@ def filter_config( when it is found in the list of acceptable values. If the configuration's field's value is a list, then a match occurs if there is a non-empty intersection with the acceptable values. - - + exclude (bool): If False (the default value), configs that match the filter will be included in the returned + list. If True, configs that match the filter will be excluded in the returned list. Returns: list of dict: A list of configuration dictionaries that meet all the criteria specified in `filter_dict`. Example: - ```python - # Example configuration list with various models and API types - configs = [ - {'model': 'gpt-3.5-turbo'}, - {'model': 'gpt-4'}, - {'model': 'gpt-3.5-turbo', 'api_type': 'azure'}, - {'model': 'gpt-3.5-turbo', 'tags': ['gpt35_turbo', 'gpt-35-turbo']}, - ] - - # Define filter criteria to select configurations for the 'gpt-3.5-turbo' model - # that are also using the 'azure' API type - filter_criteria = { - 'model': ['gpt-3.5-turbo'], # Only accept configurations for 'gpt-3.5-turbo' - 'api_type': ['azure'] # Only accept configurations for 'azure' API type - } - - # Apply the filter to the configuration list - filtered_configs = filter_config(configs, filter_criteria) - - # The resulting `filtered_configs` will be: - # [{'model': 'gpt-3.5-turbo', 'api_type': 'azure', ...}] - - - # Define a filter to select a given tag - filter_criteria = { - 'tags': ['gpt35_turbo'], - } - - # Apply the filter to the configuration list - filtered_configs = filter_config(configs, filter_criteria) - - # The resulting `filtered_configs` will be: - # [{'model': 'gpt-3.5-turbo', 'tags': ['gpt35_turbo', 'gpt-35-turbo']}] - ``` - + ```python + # Example configuration list with various models and API types + configs = [ + {'model': 'gpt-3.5-turbo'}, + {'model': 'gpt-4'}, + {'model': 'gpt-3.5-turbo', 'api_type': 'azure'}, + {'model': 'gpt-3.5-turbo', 'tags': ['gpt35_turbo', 'gpt-35-turbo']}, + ] + # Define filter criteria to select configurations for the 'gpt-3.5-turbo' model + # that are also using the 'azure' API type + filter_criteria = { + 'model': ['gpt-3.5-turbo'], # Only accept configurations for 'gpt-3.5-turbo' + 'api_type': ['azure'] # Only accept configurations for 'azure' API type + } + # Apply the filter to the configuration list + filtered_configs = filter_config(configs, filter_criteria) + # The resulting `filtered_configs` will be: + # [{'model': 'gpt-3.5-turbo', 'api_type': 'azure', ...}] + # Define a filter to select a given tag + filter_criteria = { + 'tags': ['gpt35_turbo'], + } + # Apply the filter to the configuration list + filtered_configs = filter_config(configs, filter_criteria) + # The resulting `filtered_configs` will be: + # [{'model': 'gpt-3.5-turbo', 'tags': ['gpt35_turbo', 'gpt-35-turbo']}] + ``` Note: - If `filter_dict` is empty or None, no filtering is applied and `config_list` is returned as is. - If a configuration dictionary in `config_list` does not contain a key specified in `filter_dict`, it is considered a non-match and is excluded from the result. - If the list of acceptable values for a key in `filter_dict` includes None, then configuration dictionaries that do not have that key will also be considered a match. - """ - def _satisfies(config_value: Any, acceptable_values: Any) -> bool: - if isinstance(config_value, list): - return bool(set(config_value) & set(acceptable_values)) # Non-empty intersection - else: - return config_value in acceptable_values + """ if filter_dict: - config_list = [ - config - for config in config_list - if all(_satisfies(config.get(key), value) for key, value in filter_dict.items()) + return [ + item + for item in config_list + if all(_satisfies_criteria(item.get(key), values) != exclude for key, values in filter_dict.items()) ] return config_list +def _satisfies_criteria(value: Any, criteria_values: Any) -> bool: + if value is None: + return False + + if isinstance(value, list): + return bool(set(value) & set(criteria_values)) # Non-empty intersection + else: + return value in criteria_values + + def config_list_from_json( env_or_file: str, file_location: Optional[str] = "", @@ -782,3 +781,10 @@ def update_gpt_assistant(client: OpenAI, assistant_id: str, assistant_config: Di assistant_update_kwargs["file_ids"] = assistant_config["file_ids"] return client.beta.assistants.update(assistant_id=assistant_id, **assistant_update_kwargs) + + +def _satisfies(config_value: Any, acceptable_values: Any) -> bool: + if isinstance(config_value, list): + return bool(set(config_value) & set(acceptable_values)) # Non-empty intersection + else: + return config_value in acceptable_values diff --git a/autogen/runtime_logging.py b/autogen/runtime_logging.py index 1b9835eaa4b..ffc741482e6 100644 --- a/autogen/runtime_logging.py +++ b/autogen/runtime_logging.py @@ -3,16 +3,17 @@ import logging import sqlite3 import uuid -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union from openai import AzureOpenAI, OpenAI from openai.types.chat import ChatCompletion -from autogen.logger.base_logger import LLMConfig +from autogen.logger.base_logger import BaseLogger, LLMConfig from autogen.logger.logger_factory import LoggerFactory if TYPE_CHECKING: from autogen import Agent, ConversableAgent, OpenAIWrapper + from autogen.oai.gemini import GeminiClient logger = logging.getLogger(__name__) @@ -20,11 +21,27 @@ is_logging = False -def start(logger_type: str = "sqlite", config: Optional[Dict[str, Any]] = None) -> str: +def start( + logger: Optional[BaseLogger] = None, + logger_type: Literal["sqlite", "file"] = "sqlite", + config: Optional[Dict[str, Any]] = None, +) -> str: + """ + Start logging for the runtime. + Args: + logger (BaseLogger): A logger instance + logger_type (str): The type of logger to use (default: sqlite) + config (dict): Configuration for the logger + Returns: + session_id (str(uuid.uuid4)): a unique id for the logging session + """ global autogen_logger global is_logging - autogen_logger = LoggerFactory.get_logger(logger_type=logger_type, config=config) + if logger: + autogen_logger = logger + else: + autogen_logger = LoggerFactory.get_logger(logger_type=logger_type, config=config) try: session_id = autogen_logger.start() @@ -78,7 +95,9 @@ def log_new_wrapper(wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLMConfig autogen_logger.log_new_wrapper(wrapper, init_args) -def log_new_client(client: Union[AzureOpenAI, OpenAI], wrapper: OpenAIWrapper, init_args: Dict[str, Any]) -> None: +def log_new_client( + client: Union[AzureOpenAI, OpenAI, GeminiClient], wrapper: OpenAIWrapper, init_args: Dict[str, Any] +) -> None: if autogen_logger is None: logger.error("[runtime logging] log_new_client: autogen logger is None") return diff --git a/autogen/token_count_utils.py b/autogen/token_count_utils.py index 589d7b404a7..b71dbc428a1 100644 --- a/autogen/token_count_utils.py +++ b/autogen/token_count_utils.py @@ -34,6 +34,8 @@ def get_max_token_limit(model: str = "gpt-3.5-turbo-0613") -> int: "gpt-4-0125-preview": 128000, "gpt-4-turbo-preview": 128000, "gpt-4-vision-preview": 128000, + "gpt-4o": 128000, + "gpt-4o-2024-05-13": 128000, } return max_token_limit[model] diff --git a/dotnet/.editorconfig b/dotnet/.editorconfig index 4da1adc5de6..5a604ce0096 100644 --- a/dotnet/.editorconfig +++ b/dotnet/.editorconfig @@ -141,7 +141,7 @@ csharp_preserve_single_line_statements = true csharp_preserve_single_line_blocks = true # Code block -csharp_prefer_braces = false:none +csharp_prefer_braces = true:warning # Using statements csharp_using_directive_placement = outside_namespace:error @@ -173,6 +173,11 @@ dotnet_diagnostic.CS1573.severity = none # disable CS1570: XML comment has badly formed XML dotnet_diagnostic.CS1570.severity = none +dotnet_diagnostic.IDE0035.severity = warning # Remove unreachable code +dotnet_diagnostic.IDE0161.severity = warning # Use file-scoped namespace + +csharp_style_var_elsewhere = true:suggestion # Prefer 'var' everywhere + # disable check for generated code [*.generated.cs] generated_code = true \ No newline at end of file diff --git a/dotnet/AutoGen.sln b/dotnet/AutoGen.sln index b46b8091cf5..be40e7b61b6 100644 --- a/dotnet/AutoGen.sln +++ b/dotnet/AutoGen.sln @@ -33,7 +33,18 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.Mistral", "src\Auto EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.Mistral.Tests", "test\AutoGen.Mistral.Tests\AutoGen.Mistral.Tests.csproj", "{15441693-3659-4868-B6C1-B106F52FF3BA}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AutoGen.SemanticKernel.Tests", "test\AutoGen.SemanticKernel.Tests\AutoGen.SemanticKernel.Tests.csproj", "{1DFABC4A-8458-4875-8DCB-59F3802DAC65}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.SemanticKernel.Tests", "test\AutoGen.SemanticKernel.Tests\AutoGen.SemanticKernel.Tests.csproj", "{1DFABC4A-8458-4875-8DCB-59F3802DAC65}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.OpenAI.Tests", "test\AutoGen.OpenAI.Tests\AutoGen.OpenAI.Tests.csproj", "{D36A85F9-C172-487D-8192-6BFE5D05B4A7}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.DotnetInteractive.Tests", "test\AutoGen.DotnetInteractive.Tests\AutoGen.DotnetInteractive.Tests.csproj", "{B61388CA-DC73-4B7F-A7B2-7B9A86C9229E}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.Ollama", "src\AutoGen.Ollama\AutoGen.Ollama.csproj", "{9F9E6DED-3D92-4970-909A-70FC11F1A665}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AutoGen.Ollama.Tests", "test\AutoGen.Ollama.Tests\AutoGen.Ollama.Tests.csproj", "{03E31CAA-3728-48D3-B936-9F11CF6C18FE}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AutoGen.Ollama.Sample", "sample\AutoGen.Ollama.Sample\AutoGen.Ollama.Sample.csproj", "{93AA4D0D-6EE4-44D5-AD77-7F73A3934544}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AutoGen.SemanticKernel.Sample", "sample\AutoGen.SemanticKernel.Sample\AutoGen.SemanticKernel.Sample.csproj", "{52958A60-3FF7-4243-9058-34A6E4F55C31}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -93,6 +104,30 @@ Global {1DFABC4A-8458-4875-8DCB-59F3802DAC65}.Debug|Any CPU.Build.0 = Debug|Any CPU {1DFABC4A-8458-4875-8DCB-59F3802DAC65}.Release|Any CPU.ActiveCfg = Release|Any CPU {1DFABC4A-8458-4875-8DCB-59F3802DAC65}.Release|Any CPU.Build.0 = Release|Any CPU + {D36A85F9-C172-487D-8192-6BFE5D05B4A7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D36A85F9-C172-487D-8192-6BFE5D05B4A7}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D36A85F9-C172-487D-8192-6BFE5D05B4A7}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D36A85F9-C172-487D-8192-6BFE5D05B4A7}.Release|Any CPU.Build.0 = Release|Any CPU + {B61388CA-DC73-4B7F-A7B2-7B9A86C9229E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B61388CA-DC73-4B7F-A7B2-7B9A86C9229E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B61388CA-DC73-4B7F-A7B2-7B9A86C9229E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B61388CA-DC73-4B7F-A7B2-7B9A86C9229E}.Release|Any CPU.Build.0 = Release|Any CPU + {9F9E6DED-3D92-4970-909A-70FC11F1A665}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9F9E6DED-3D92-4970-909A-70FC11F1A665}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9F9E6DED-3D92-4970-909A-70FC11F1A665}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9F9E6DED-3D92-4970-909A-70FC11F1A665}.Release|Any CPU.Build.0 = Release|Any CPU + {03E31CAA-3728-48D3-B936-9F11CF6C18FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {03E31CAA-3728-48D3-B936-9F11CF6C18FE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {03E31CAA-3728-48D3-B936-9F11CF6C18FE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {03E31CAA-3728-48D3-B936-9F11CF6C18FE}.Release|Any CPU.Build.0 = Release|Any CPU + {93AA4D0D-6EE4-44D5-AD77-7F73A3934544}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {93AA4D0D-6EE4-44D5-AD77-7F73A3934544}.Debug|Any CPU.Build.0 = Debug|Any CPU + {93AA4D0D-6EE4-44D5-AD77-7F73A3934544}.Release|Any CPU.ActiveCfg = Release|Any CPU + {93AA4D0D-6EE4-44D5-AD77-7F73A3934544}.Release|Any CPU.Build.0 = Release|Any CPU + {52958A60-3FF7-4243-9058-34A6E4F55C31}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {52958A60-3FF7-4243-9058-34A6E4F55C31}.Debug|Any CPU.Build.0 = Debug|Any CPU + {52958A60-3FF7-4243-9058-34A6E4F55C31}.Release|Any CPU.ActiveCfg = Release|Any CPU + {52958A60-3FF7-4243-9058-34A6E4F55C31}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -111,6 +146,12 @@ Global {6585D1A4-3D97-4D76-A688-1933B61AEB19} = {18BF8DD7-0585-48BF-8F97-AD333080CE06} {15441693-3659-4868-B6C1-B106F52FF3BA} = {F823671B-3ECA-4AE6-86DA-25E920D3FE64} {1DFABC4A-8458-4875-8DCB-59F3802DAC65} = {F823671B-3ECA-4AE6-86DA-25E920D3FE64} + {D36A85F9-C172-487D-8192-6BFE5D05B4A7} = {F823671B-3ECA-4AE6-86DA-25E920D3FE64} + {B61388CA-DC73-4B7F-A7B2-7B9A86C9229E} = {F823671B-3ECA-4AE6-86DA-25E920D3FE64} + {9F9E6DED-3D92-4970-909A-70FC11F1A665} = {18BF8DD7-0585-48BF-8F97-AD333080CE06} + {03E31CAA-3728-48D3-B936-9F11CF6C18FE} = {F823671B-3ECA-4AE6-86DA-25E920D3FE64} + {93AA4D0D-6EE4-44D5-AD77-7F73A3934544} = {FBFEAD1F-29EB-4D99-A672-0CD8473E10B9} + {52958A60-3FF7-4243-9058-34A6E4F55C31} = {FBFEAD1F-29EB-4D99-A672-0CD8473E10B9} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {93384647-528D-46C8-922C-8DB36A382F0B} diff --git a/dotnet/NuGet.config b/dotnet/NuGet.config index 2eb25136c6a..1d0cf4c2bc7 100644 --- a/dotnet/NuGet.config +++ b/dotnet/NuGet.config @@ -2,8 +2,6 @@ - - diff --git a/dotnet/eng/MetaInfo.props b/dotnet/eng/MetaInfo.props index 8aff3c60226..0444dadfd5e 100644 --- a/dotnet/eng/MetaInfo.props +++ b/dotnet/eng/MetaInfo.props @@ -1,7 +1,7 @@ - 0.0.13 + 0.0.14 AutoGen https://microsoft.github.io/autogen-for-net/ https://github.com/microsoft/autogen diff --git a/dotnet/eng/Version.props b/dotnet/eng/Version.props index b9fc4367194..ae213015471 100644 --- a/dotnet/eng/Version.props +++ b/dotnet/eng/Version.props @@ -10,7 +10,7 @@ 6.8.0 2.4.2 17.7.0 - 1.0.0-beta.23523.2 + 1.0.0-beta.24229.4 8.0.0 4.0.0 diff --git a/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj b/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj index 0cafff3c0d0..afc76164906 100644 --- a/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj +++ b/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj @@ -4,7 +4,6 @@ Exe $(TestTargetFramework) enable - enable True $(NoWarn);CS8981;CS8600;CS8602;CS8604;CS8618;CS0219;SKEXP0054;SKEXP0050;SKEXP0110 diff --git a/dotnet/sample/AutoGen.BasicSamples/CodeSnippet/OpenAICodeSnippet.cs b/dotnet/sample/AutoGen.BasicSamples/CodeSnippet/OpenAICodeSnippet.cs index 022f7e9f984..cf045221223 100644 --- a/dotnet/sample/AutoGen.BasicSamples/CodeSnippet/OpenAICodeSnippet.cs +++ b/dotnet/sample/AutoGen.BasicSamples/CodeSnippet/OpenAICodeSnippet.cs @@ -84,7 +84,7 @@ public async Task CreateOpenAIChatAgentAsync() new TextMessage(Role.Assistant, "Hello", from: "user"), ], from: "user"), - new Message(Role.Assistant, "Hello", from: "user"), // Message type is going to be deprecated, please use TextMessage instead + new TextMessage(Role.Assistant, "Hello", from: "user"), // Message type is going to be deprecated, please use TextMessage instead }; foreach (var message in messages) diff --git a/dotnet/sample/AutoGen.BasicSamples/Example03_Agent_FunctionCall.cs b/dotnet/sample/AutoGen.BasicSamples/Example03_Agent_FunctionCall.cs index 57b9ea76dcb..0ef8eaa48ae 100644 --- a/dotnet/sample/AutoGen.BasicSamples/Example03_Agent_FunctionCall.cs +++ b/dotnet/sample/AutoGen.BasicSamples/Example03_Agent_FunctionCall.cs @@ -77,20 +77,30 @@ public static async Task RunAsync() // talk to the assistant agent var upperCase = await agent.SendAsync("convert to upper case: hello world"); upperCase.GetContent()?.Should().Be("HELLO WORLD"); - upperCase.Should().BeOfType>(); + upperCase.Should().BeOfType(); upperCase.GetToolCalls().Should().HaveCount(1); upperCase.GetToolCalls().First().FunctionName.Should().Be(nameof(UpperCase)); var concatString = await agent.SendAsync("concatenate strings: a, b, c, d, e"); concatString.GetContent()?.Should().Be("a b c d e"); - concatString.Should().BeOfType>(); + concatString.Should().BeOfType(); concatString.GetToolCalls().Should().HaveCount(1); concatString.GetToolCalls().First().FunctionName.Should().Be(nameof(ConcatString)); var calculateTax = await agent.SendAsync("calculate tax: 100, 0.1"); calculateTax.GetContent().Should().Be("tax is 10"); - calculateTax.Should().BeOfType>(); + calculateTax.Should().BeOfType(); calculateTax.GetToolCalls().Should().HaveCount(1); calculateTax.GetToolCalls().First().FunctionName.Should().Be(nameof(CalculateTax)); + + // parallel function calls + var calculateTaxes = await agent.SendAsync("calculate tax: 100, 0.1; calculate tax: 200, 0.2"); + calculateTaxes.GetContent().Should().Be("tax is 10\ntax is 40"); // "tax is 10\n tax is 40 + calculateTaxes.Should().BeOfType(); + calculateTaxes.GetToolCalls().Should().HaveCount(2); + calculateTaxes.GetToolCalls().First().FunctionName.Should().Be(nameof(CalculateTax)); + + // send aggregate message back to llm to get the final result + var finalResult = await agent.SendAsync(calculateTaxes); } } diff --git a/dotnet/sample/AutoGen.BasicSamples/Example04_Dynamic_GroupChat_Coding_Task.cs b/dotnet/sample/AutoGen.BasicSamples/Example04_Dynamic_GroupChat_Coding_Task.cs index c5d9a01f971..47dd8ce66c9 100644 --- a/dotnet/sample/AutoGen.BasicSamples/Example04_Dynamic_GroupChat_Coding_Task.cs +++ b/dotnet/sample/AutoGen.BasicSamples/Example04_Dynamic_GroupChat_Coding_Task.cs @@ -40,7 +40,8 @@ public static async Task RunAsync() name: "groupAdmin", systemMessage: "You are the admin of the group chat", temperature: 0f, - config: gptConfig); + config: gptConfig) + .RegisterPrintMessage(); var userProxy = new UserProxyAgent(name: "user", defaultReply: GroupChatExtension.TERMINATE, humanInputMode: HumanInputMode.NEVER) .RegisterPrintMessage(); diff --git a/dotnet/sample/AutoGen.BasicSamples/Example05_Dalle_And_GPT4V.cs b/dotnet/sample/AutoGen.BasicSamples/Example05_Dalle_And_GPT4V.cs index 9fccd7ab385..2d21615ef71 100644 --- a/dotnet/sample/AutoGen.BasicSamples/Example05_Dalle_And_GPT4V.cs +++ b/dotnet/sample/AutoGen.BasicSamples/Example05_Dalle_And_GPT4V.cs @@ -1,8 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Example05_Dalle_And_GPT4V.cs -using AutoGen; using AutoGen.Core; +using AutoGen.OpenAI; +using AutoGen.OpenAI.Extension; using Azure.AI.OpenAI; using FluentAssertions; using autogen = AutoGen.LLMConfigAPI; @@ -66,50 +67,39 @@ public static async Task RunAsync() File.Delete(imagePath); } - var dalleAgent = new AssistantAgent( - name: "dalle", - systemMessage: "You are a DALL-E agent that generate image from prompt, when conversation is terminated, return the most recent image url", - llmConfig: new ConversableAgentConfig - { - Temperature = 0, - ConfigList = gpt35Config, - FunctionContracts = new[] - { - instance.GenerateImageFunctionContract, - }, - }, + var generateImageFunctionMiddleware = new FunctionCallMiddleware( + functions: [instance.GenerateImageFunctionContract], functionMap: new Dictionary>> { { nameof(GenerateImage), instance.GenerateImageWrapper }, - }) + }); + var dalleAgent = new OpenAIChatAgent( + openAIClient: openAIClient, + modelName: "gpt-3.5-turbo", + name: "dalle", + systemMessage: "You are a DALL-E agent that generate image from prompt, when conversation is terminated, return the most recent image url") + .RegisterMessageConnector() + .RegisterStreamingMiddleware(generateImageFunctionMiddleware) .RegisterMiddleware(async (msgs, option, agent, ct) => { - // if last message contains [TERMINATE], then find the last image url and terminate the conversation - if (msgs.Last().GetContent()?.Contains("TERMINATE") is true) + if (msgs.Any(msg => msg.GetContent()?.ToLower().Contains("approve") is true)) { - var lastMessageWithImage = msgs.Last(msg => msg is ImageMessage) as ImageMessage; - var lastImageUrl = lastMessageWithImage.Url; - Console.WriteLine($"download image from {lastImageUrl} to {imagePath}"); - var httpClient = new HttpClient(); - var imageBytes = await httpClient.GetByteArrayAsync(lastImageUrl); - File.WriteAllBytes(imagePath, imageBytes); - - var messageContent = $@"{GroupChatExtension.TERMINATE} - -{lastImageUrl}"; - return new TextMessage(Role.Assistant, messageContent) - { - From = "dalle", - }; + return new TextMessage(Role.Assistant, $"The image satisfies the condition, conversation is terminated. {GroupChatExtension.TERMINATE}"); } - var reply = await agent.GenerateReplyAsync(msgs, option, ct); + var msgsWithoutImage = msgs.Where(msg => msg is not ImageMessage).ToList(); + var reply = await agent.GenerateReplyAsync(msgsWithoutImage, option, ct); if (reply.GetContent() is string content && content.Contains("IMAGE_GENERATION")) { var imageUrl = content.Split("\n").Last(); var imageMessage = new ImageMessage(Role.Assistant, imageUrl, from: reply.From); + Console.WriteLine($"download image from {imageUrl} to {imagePath}"); + var httpClient = new HttpClient(); + var imageBytes = await httpClient.GetByteArrayAsync(imageUrl, ct); + File.WriteAllBytes(imagePath, imageBytes); + return imageMessage; } else @@ -119,33 +109,25 @@ public static async Task RunAsync() }) .RegisterPrintMessage(); - var gpt4VAgent = new AssistantAgent( + var gpt4VAgent = new OpenAIChatAgent( + openAIClient: openAIClient, name: "gpt4v", + modelName: "gpt-4-vision-preview", systemMessage: @"You are a critism that provide feedback to DALL-E agent. Carefully check the image generated by DALL-E agent and provide feedback. -If the image satisfies the condition, then terminate the conversation by saying [TERMINATE]. +If the image satisfies the condition, then say [APPROVE]. Otherwise, provide detailed feedback to DALL-E agent so it can generate better image. The image should satisfy the following conditions: - There should be a cat and a mouse in the image -- The cat should be chasing after the mouse -", - llmConfig: new ConversableAgentConfig - { - Temperature = 0, - ConfigList = gpt4vConfig, - }) +- The cat should be chasing after the mouse") + .RegisterMessageConnector() .RegisterPrintMessage(); - IEnumerable conversation = new List() - { - new TextMessage(Role.User, "Hey dalle, please generate image from prompt: English short hair blue cat chase after a mouse") - }; - var maxRound = 20; await gpt4VAgent.InitiateChatAsync( receiver: dalleAgent, message: "Hey dalle, please generate image from prompt: English short hair blue cat chase after a mouse", - maxRound: maxRound); + maxRound: 10); File.Exists(imagePath).Should().BeTrue(); } diff --git a/dotnet/sample/AutoGen.Ollama.Sample/AutoGen.Ollama.Sample.csproj b/dotnet/sample/AutoGen.Ollama.Sample/AutoGen.Ollama.Sample.csproj new file mode 100644 index 00000000000..1dc94400869 --- /dev/null +++ b/dotnet/sample/AutoGen.Ollama.Sample/AutoGen.Ollama.Sample.csproj @@ -0,0 +1,24 @@ + + + Exe + $(TestTargetFramework) + enable + True + $(NoWarn);CS8981;CS8600;CS8602;CS8604;CS8618;CS0219;SKEXP0054;SKEXP0050;SKEXP0110 + + + + + + + + + + + + + PreserveNewest + + + + diff --git a/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaMA.cs b/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaMA.cs new file mode 100644 index 00000000000..e1af08c574c --- /dev/null +++ b/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaMA.cs @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Chat_With_LLaMA.cs + +using AutoGen.Core; +using AutoGen.Ollama.Extension; + +namespace AutoGen.Ollama.Sample; + +public class Chat_With_LLaMA +{ + public static async Task RunAsync() + { + using var httpClient = new HttpClient() + { + BaseAddress = new Uri("https://2xbvtxd1-11434.usw2.devtunnels.ms") + }; + + var ollamaAgent = new OllamaAgent( + httpClient: httpClient, + name: "ollama", + modelName: "llama3:latest", + systemMessage: "You are a helpful AI assistant") + .RegisterMessageConnector() + .RegisterPrintMessage(); + + var reply = await ollamaAgent.SendAsync("Can you write a piece of C# code to calculate 100th of fibonacci?"); + } +} diff --git a/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaVA.cs b/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaVA.cs new file mode 100644 index 00000000000..b1b310e3956 --- /dev/null +++ b/dotnet/sample/AutoGen.Ollama.Sample/Chat_With_LLaVA.cs @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Chat_With_LLaVA.cs + +using AutoGen.Core; +using AutoGen.Ollama.Extension; + +namespace AutoGen.Ollama.Sample; + +public class Chat_With_LLaVA +{ + public static async Task RunAsync() + { + using var httpClient = new HttpClient() + { + BaseAddress = new Uri("https://2xbvtxd1-11434.usw2.devtunnels.ms") + }; + + var ollamaAgent = new OllamaAgent( + httpClient: httpClient, + name: "ollama", + modelName: "llava:latest", + systemMessage: "You are a helpful AI assistant") + .RegisterMessageConnector() + .RegisterPrintMessage(); + + var image = Path.Combine("images", "background.png"); + var binaryData = BinaryData.FromBytes(File.ReadAllBytes(image), "image/png"); + var imageMessage = new ImageMessage(Role.User, binaryData); + var textMessage = new TextMessage(Role.User, "what's in this image?"); + var reply = await ollamaAgent.SendAsync(chatHistory: [textMessage, imageMessage]); + + // You can also use MultiModalMessage to put text and image together in one message + // In this case, all the messages in the multi-modal message will be put into single piece of message + // where the text is the concatenation of all the text messages seperated by \n + // and the images are all the images in the multi-modal message + var multiModalMessage = new MultiModalMessage(Role.User, [textMessage, imageMessage]); + + reply = await ollamaAgent.SendAsync(chatHistory: [multiModalMessage]); + } +} diff --git a/dotnet/sample/AutoGen.Ollama.Sample/Program.cs b/dotnet/sample/AutoGen.Ollama.Sample/Program.cs new file mode 100644 index 00000000000..62c92eebe7e --- /dev/null +++ b/dotnet/sample/AutoGen.Ollama.Sample/Program.cs @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Program.cs + +using AutoGen.Ollama.Sample; + +await Chat_With_LLaVA.RunAsync(); diff --git a/dotnet/sample/AutoGen.Ollama.Sample/images/background.png b/dotnet/sample/AutoGen.Ollama.Sample/images/background.png new file mode 100644 index 00000000000..ca276f81f5b --- /dev/null +++ b/dotnet/sample/AutoGen.Ollama.Sample/images/background.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300b7c9d6ba0c23a3e52fbd2e268141ddcca0434a9fb9dcf7e58e7e903d36dcf +size 2126185 diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/AutoGen.SemanticKernel.Sample.csproj b/dotnet/sample/AutoGen.SemanticKernel.Sample/AutoGen.SemanticKernel.Sample.csproj new file mode 100644 index 00000000000..6c226651292 --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/AutoGen.SemanticKernel.Sample.csproj @@ -0,0 +1,17 @@ + + + + Exe + $(TestTargetFramework) + True + $(NoWarn);CS8981;CS8600;CS8602;CS8604;CS8618;CS0219;SKEXP0054;SKEXP0050;SKEXP0110 + enable + + + + + + + + + diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Agent.cs b/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Agent.cs new file mode 100644 index 00000000000..3333cdd9ad9 --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Agent.cs @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Create_Semantic_Kernel_Agent.cs + +using AutoGen.Core; +using AutoGen.SemanticKernel.Extension; +using Microsoft.SemanticKernel; + +namespace AutoGen.SemanticKernel.Sample; + +public class Create_Semantic_Kernel_Agent +{ + public static async Task RunAsync() + { + var openAIKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? throw new Exception("Please set OPENAI_API_KEY environment variable."); + var modelId = "gpt-3.5-turbo"; + var kernel = Kernel.CreateBuilder() + .AddOpenAIChatCompletion(modelId: modelId, apiKey: openAIKey) + .Build(); + + var skAgent = new SemanticKernelAgent( + kernel: kernel, + name: "assistant", + systemMessage: "You are a helpful AI assistant") + .RegisterMessageConnector() // register message connector so it support AutoGen built-in message types like TextMessage. + .RegisterPrintMessage(); // pretty print the message to the console + + await skAgent.SendAsync("Hey tell me a long tedious joke"); + } +} diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Chat_Agent.cs b/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Chat_Agent.cs new file mode 100644 index 00000000000..0caea6f031f --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/Create_Semantic_Kernel_Chat_Agent.cs @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Create_Semantic_Kernel_Chat_Agent.cs + +using AutoGen.Core; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Agents; + +namespace AutoGen.SemanticKernel.Sample; + +public class Create_Semantic_Kernel_Chat_Agent +{ + public static async Task RunAsync() + { + var openAIKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? throw new Exception("Please set OPENAI_API_KEY environment variable."); + var modelId = "gpt-3.5-turbo"; + var kernel = Kernel.CreateBuilder() + .AddOpenAIChatCompletion(modelId: modelId, apiKey: openAIKey) + .Build(); + + // The built-in ChatCompletionAgent from semantic kernel. + var chatAgent = new ChatCompletionAgent() + { + Kernel = kernel, + Name = "assistant", + Description = "You are a helpful AI assistant", + }; + + var messageConnector = new SemanticKernelChatMessageContentConnector(); + var skAgent = new SemanticKernelChatCompletionAgent(chatAgent) + .RegisterMiddleware(messageConnector) // register message connector so it support AutoGen built-in message types like TextMessage. + .RegisterPrintMessage(); // pretty print the message to the console + + await skAgent.SendAsync("Hey tell me a long tedious joke"); + } +} diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/Program.cs b/dotnet/sample/AutoGen.SemanticKernel.Sample/Program.cs new file mode 100644 index 00000000000..5032f2d4330 --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/Program.cs @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Program.cs + +using AutoGen.SemanticKernel.Sample; + +await Use_Kernel_Functions_With_Other_Agent.RunAsync(); diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Bing_Search_With_Semantic_Kernel_Agent.cs b/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Bing_Search_With_Semantic_Kernel_Agent.cs new file mode 100644 index 00000000000..4cebc88291f --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Bing_Search_With_Semantic_Kernel_Agent.cs @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Use_Bing_Search_With_Semantic_Kernel_Agent.cs + +using AutoGen.Core; +using AutoGen.SemanticKernel.Extension; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Plugins.Web; +using Microsoft.SemanticKernel.Plugins.Web.Bing; + +namespace AutoGen.SemanticKernel.Sample; + +public class Use_Bing_Search_With_Semantic_Kernel_Agent +{ + public static async Task RunAsync() + { + var bingApiKey = Environment.GetEnvironmentVariable("BING_API_KEY") ?? throw new Exception("BING_API_KEY environment variable is not set"); + var bingSearch = new BingConnector(bingApiKey); + var webSearchPlugin = new WebSearchEnginePlugin(bingSearch); + + var openAIKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? throw new Exception("Please set OPENAI_API_KEY environment variable."); + var modelId = "gpt-3.5-turbo"; + var kernelBuilder = Kernel.CreateBuilder() + .AddOpenAIChatCompletion(modelId: modelId, apiKey: openAIKey); + kernelBuilder.Plugins.AddFromObject(webSearchPlugin); + + var kernel = kernelBuilder.Build(); + + var skAgent = new SemanticKernelAgent( + kernel: kernel, + name: "assistant", + systemMessage: "You are a helpful AI assistant") + .RegisterMessageConnector() // register message connector so it support AutoGen built-in message types like TextMessage. + .RegisterPrintMessage(); // pretty print the message to the console + + await skAgent.SendAsync("Tell me more about gpt-4-o"); + } +} diff --git a/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Kernel_Functions_With_Other_Agent.cs b/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Kernel_Functions_With_Other_Agent.cs new file mode 100644 index 00000000000..d91d727668a --- /dev/null +++ b/dotnet/sample/AutoGen.SemanticKernel.Sample/Use_Kernel_Functions_With_Other_Agent.cs @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Use_Kernel_Functions_With_Other_Agent.cs + +using AutoGen.Core; +using AutoGen.OpenAI; +using AutoGen.OpenAI.Extension; +using Azure.AI.OpenAI; +using Microsoft.SemanticKernel; + +namespace AutoGen.SemanticKernel.Sample; + +public class Use_Kernel_Functions_With_Other_Agent +{ + public static async Task RunAsync() + { + var openAIKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY") ?? throw new Exception("Please set OPENAI_API_KEY environment variable."); + var modelId = "gpt-3.5-turbo"; + var kernelBuilder = Kernel.CreateBuilder(); + var kernel = kernelBuilder.Build(); + var getWeatherFunction = KernelFunctionFactory.CreateFromMethod( + method: (string location) => $"The weather in {location} is 75 degrees Fahrenheit.", + functionName: "GetWeather", + description: "Get the weather for a location."); + var plugin = kernel.CreatePluginFromFunctions("my_plugin", [getWeatherFunction]); + + // Create a middleware to handle the plugin functions + var kernelPluginMiddleware = new KernelPluginMiddleware(kernel, plugin); + + var openAIClient = new OpenAIClient(openAIKey); + var openAIAgent = new OpenAIChatAgent( + openAIClient: openAIClient, + name: "assistant", + modelName: modelId) + .RegisterMessageConnector() // register message connector so it support AutoGen built-in message types like TextMessage. + .RegisterMiddleware(kernelPluginMiddleware) // register the middleware to handle the plugin functions + .RegisterPrintMessage(); // pretty print the message to the console + + var toolAggregateMessage = await openAIAgent.SendAsync("Tell me the weather in Seattle"); + + // The aggregate message will be converted to [ToolCallMessage, ToolCallResultMessage] when flowing into the agent + // send the aggregated message to llm to generate the final response + var finalReply = await openAIAgent.SendAsync(toolAggregateMessage); + } +} diff --git a/dotnet/src/AutoGen.Core/Extension/MessageExtension.cs b/dotnet/src/AutoGen.Core/Extension/MessageExtension.cs index 47dbad55e30..3dbba9668f9 100644 --- a/dotnet/src/AutoGen.Core/Extension/MessageExtension.cs +++ b/dotnet/src/AutoGen.Core/Extension/MessageExtension.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // MessageExtension.cs +using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -15,7 +16,9 @@ public static string FormatMessage(this IMessage message) { return message switch { +#pragma warning disable CS0618 // deprecated Message msg => msg.FormatMessage(), +#pragma warning restore CS0618 // deprecated TextMessage textMessage => textMessage.FormatMessage(), ImageMessage imageMessage => imageMessage.FormatMessage(), ToolCallMessage toolCallMessage => toolCallMessage.FormatMessage(), @@ -110,6 +113,8 @@ public static string FormatMessage(this AggregateMessage textMessage.Role == Role.System, +#pragma warning disable CS0618 // deprecated Message msg => msg.Role == Role.System, +#pragma warning restore CS0618 // deprecated _ => false, }; } /// /// Get the content from the message - /// if the message is a or , return the content - /// if the message is a and only contains one function call, return the result of that function call + /// if the message implements , return the content from the message by calling /// if the message is a where TMessage1 is and TMessage2 is and the second message only contains one function call, return the result of that function call /// for all other situation, return null. /// @@ -166,10 +172,11 @@ public static bool IsSystemMessage(this IMessage message) { return message switch { - TextMessage textMessage => textMessage.Content, + ICanGetTextContent canGetTextContent => canGetTextContent.GetContent(), + AggregateMessage aggregateMessage => string.Join("\n", aggregateMessage.Message2.ToolCalls.Where(x => x.Result is not null).Select(x => x.Result)), +#pragma warning disable CS0618 // deprecated Message msg => msg.Content, - ToolCallResultMessage toolCallResultMessage => toolCallResultMessage.ToolCalls.Count == 1 ? toolCallResultMessage.ToolCalls.First().Result : null, - AggregateMessage aggregateMessage => aggregateMessage.Message2.ToolCalls.Count == 1 ? aggregateMessage.Message2.ToolCalls.First().Result : null, +#pragma warning restore CS0618 // deprecated _ => null, }; } @@ -182,7 +189,9 @@ public static bool IsSystemMessage(this IMessage message) return message switch { TextMessage textMessage => textMessage.Role, +#pragma warning disable CS0618 // deprecated Message msg => msg.Role, +#pragma warning restore CS0618 // deprecated ImageMessage img => img.Role, MultiModalMessage multiModal => multiModal.Role, _ => null, @@ -191,8 +200,7 @@ public static bool IsSystemMessage(this IMessage message) /// /// Return the tool calls from the message if it's available. - /// if the message is a , return its tool calls - /// if the message is a and the function name and function arguments are available, return a list of tool call with one item + /// if the message implements , return the tool calls from the message by calling /// if the message is a where TMessage1 is and TMessage2 is , return the tool calls from the first message /// /// @@ -201,11 +209,13 @@ public static bool IsSystemMessage(this IMessage message) { return message switch { - ToolCallMessage toolCallMessage => toolCallMessage.ToolCalls, + ICanGetToolCalls canGetToolCalls => canGetToolCalls.GetToolCalls().ToList(), +#pragma warning disable CS0618 // deprecated Message msg => msg.FunctionName is not null && msg.FunctionArguments is not null - ? msg.Content is not null ? new List { new ToolCall(msg.FunctionName, msg.FunctionArguments, result: msg.Content) } - : new List { new ToolCall(msg.FunctionName, msg.FunctionArguments) } + ? msg.Content is not null ? [new ToolCall(msg.FunctionName, msg.FunctionArguments, result: msg.Content)] + : new List { new(msg.FunctionName, msg.FunctionArguments) } : null, +#pragma warning restore CS0618 // deprecated AggregateMessage aggregateMessage => aggregateMessage.Message1.ToolCalls, _ => null, }; diff --git a/dotnet/src/AutoGen.Core/GroupChat/Graph.cs b/dotnet/src/AutoGen.Core/GroupChat/Graph.cs index 78d92508611..02f4da50bae 100644 --- a/dotnet/src/AutoGen.Core/GroupChat/Graph.cs +++ b/dotnet/src/AutoGen.Core/GroupChat/Graph.cs @@ -8,19 +8,6 @@ namespace AutoGen.Core; -/// -/// Obsolete: please use -/// -[Obsolete("please use Graph")] -public class Workflow : Graph -{ - [Obsolete("please use Graph")] - public Workflow(IEnumerable transitions) - : base(transitions) - { - } -} - public class Graph { private readonly List transitions = new List(); diff --git a/dotnet/src/AutoGen.Core/GroupChat/GroupChat.cs b/dotnet/src/AutoGen.Core/GroupChat/GroupChat.cs index 3b6288ca0a7..cd17a21f8b9 100644 --- a/dotnet/src/AutoGen.Core/GroupChat/GroupChat.cs +++ b/dotnet/src/AutoGen.Core/GroupChat/GroupChat.cs @@ -110,7 +110,7 @@ public async Task SelectNextSpeakerAsync(IAgent currentSpeaker, IEnumera {string.Join(",", agentNames)} Each message will start with 'From name:', e.g: -From admin: +From {agentNames.First()}: //your message//."); var conv = this.ProcessConversationsForRolePlay(this.initializeMessages, conversationHistory); diff --git a/dotnet/src/AutoGen.Core/Message/IMessage.cs b/dotnet/src/AutoGen.Core/Message/IMessage.cs index 7b48f4f0d63..ad215d510e3 100644 --- a/dotnet/src/AutoGen.Core/Message/IMessage.cs +++ b/dotnet/src/AutoGen.Core/Message/IMessage.cs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // IMessage.cs +using System.Collections.Generic; + namespace AutoGen.Core; /// @@ -29,7 +31,7 @@ namespace AutoGen.Core; /// /// : an aggregate message type that contains two message types. /// This type is useful when you want to combine two message types into one unique message type. One example is when invoking a tool call and you want to return both and . -/// One example of how this type is used in AutoGen is +/// One example of how this type is used in AutoGen is and its return message /// /// /// @@ -41,6 +43,24 @@ public interface IMessage : IMessage, IStreamingMessage { } +/// +/// The interface for messages that can get text content. +/// This interface will be used by to get the content from the message. +/// +public interface ICanGetTextContent : IMessage, IStreamingMessage +{ + public string? GetContent(); +} + +/// +/// The interface for messages that can get a list of +/// +public interface ICanGetToolCalls : IMessage, IStreamingMessage +{ + public IEnumerable GetToolCalls(); +} + + public interface IStreamingMessage { string? From { get; set; } diff --git a/dotnet/src/AutoGen.Core/Message/ImageMessage.cs b/dotnet/src/AutoGen.Core/Message/ImageMessage.cs index 1239785c411..d2e2d080300 100644 --- a/dotnet/src/AutoGen.Core/Message/ImageMessage.cs +++ b/dotnet/src/AutoGen.Core/Message/ImageMessage.cs @@ -49,7 +49,9 @@ public ImageMessage(Role role, BinaryData data, string? from = null) public string BuildDataUri() { if (this.Data is null) + { throw new NullReferenceException($"{nameof(Data)}"); + } return $"data:{this.Data.MediaType};base64,{Convert.ToBase64String(this.Data.ToArray())}"; } diff --git a/dotnet/src/AutoGen.Core/Message/Message.cs b/dotnet/src/AutoGen.Core/Message/Message.cs index ec4751b9344..b31b413eca7 100644 --- a/dotnet/src/AutoGen.Core/Message/Message.cs +++ b/dotnet/src/AutoGen.Core/Message/Message.cs @@ -1,10 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Message.cs +using System; using System.Collections.Generic; namespace AutoGen.Core; +[Obsolete("This message class is deprecated, please use a specific AutoGen built-in message type instead. For more information, please visit https://microsoft.github.io/autogen-for-net/articles/Built-in-messages.html")] public class Message : IMessage { public Message( diff --git a/dotnet/src/AutoGen.Core/Message/TextMessage.cs b/dotnet/src/AutoGen.Core/Message/TextMessage.cs index ed4d7436dde..addd8728a92 100644 --- a/dotnet/src/AutoGen.Core/Message/TextMessage.cs +++ b/dotnet/src/AutoGen.Core/Message/TextMessage.cs @@ -3,7 +3,7 @@ namespace AutoGen.Core; -public class TextMessage : IMessage, IStreamingMessage +public class TextMessage : IMessage, IStreamingMessage, ICanGetTextContent { public TextMessage(Role role, string content, string? from = null) { @@ -44,9 +44,14 @@ public override string ToString() { return $"TextMessage({this.Role}, {this.Content}, {this.From})"; } + + public string? GetContent() + { + return this.Content; + } } -public class TextMessageUpdate : IStreamingMessage +public class TextMessageUpdate : IStreamingMessage, ICanGetTextContent { public TextMessageUpdate(Role role, string? content, string? from = null) { @@ -60,4 +65,9 @@ public TextMessageUpdate(Role role, string? content, string? from = null) public string? From { get; set; } public Role Role { get; set; } + + public string? GetContent() + { + return this.Content; + } } diff --git a/dotnet/src/AutoGen.Core/Message/ToolCallAggregateMessage.cs b/dotnet/src/AutoGen.Core/Message/ToolCallAggregateMessage.cs new file mode 100644 index 00000000000..7781b785ef8 --- /dev/null +++ b/dotnet/src/AutoGen.Core/Message/ToolCallAggregateMessage.cs @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// FunctionCallAggregateMessage.cs + +using System.Collections.Generic; + +namespace AutoGen.Core; + +/// +/// An aggregate message that contains a tool call message and a tool call result message. +/// This message type is used by to return both and . +/// +public class ToolCallAggregateMessage : AggregateMessage, ICanGetTextContent, ICanGetToolCalls +{ + public ToolCallAggregateMessage(ToolCallMessage message1, ToolCallResultMessage message2, string? from = null) + : base(message1, message2, from) + { + } + + public string? GetContent() + { + return this.Message2.GetContent(); + } + + public IEnumerable GetToolCalls() + { + return this.Message1.GetToolCalls(); + } +} diff --git a/dotnet/src/AutoGen.Core/Message/ToolCallMessage.cs b/dotnet/src/AutoGen.Core/Message/ToolCallMessage.cs index 8dcd98ea0ec..396dba3d3a1 100644 --- a/dotnet/src/AutoGen.Core/Message/ToolCallMessage.cs +++ b/dotnet/src/AutoGen.Core/Message/ToolCallMessage.cs @@ -26,6 +26,8 @@ public ToolCall(string functionName, string functionArgs, string result) public string FunctionArguments { get; set; } + public string? ToolCallId { get; set; } + public string? Result { get; set; } public override string ToString() @@ -34,7 +36,7 @@ public override string ToString() } } -public class ToolCallMessage : IMessage +public class ToolCallMessage : IMessage, ICanGetToolCalls { public ToolCallMessage(IEnumerable toolCalls, string? from = null) { @@ -45,7 +47,7 @@ public ToolCallMessage(IEnumerable toolCalls, string? from = null) public ToolCallMessage(string functionName, string functionArgs, string? from = null) { this.From = from; - this.ToolCalls = new List { new ToolCall(functionName, functionArgs) }; + this.ToolCalls = new List { new ToolCall(functionName, functionArgs) { ToolCallId = functionName } }; } public ToolCallMessage(ToolCallMessageUpdate update) @@ -89,6 +91,11 @@ public override string ToString() return sb.ToString(); } + + public IEnumerable GetToolCalls() + { + return this.ToolCalls; + } } public class ToolCallMessageUpdate : IStreamingMessage diff --git a/dotnet/src/AutoGen.Core/Message/ToolCallResultMessage.cs b/dotnet/src/AutoGen.Core/Message/ToolCallResultMessage.cs index 99c7740849a..fa7357c941c 100644 --- a/dotnet/src/AutoGen.Core/Message/ToolCallResultMessage.cs +++ b/dotnet/src/AutoGen.Core/Message/ToolCallResultMessage.cs @@ -7,7 +7,7 @@ namespace AutoGen.Core; -public class ToolCallResultMessage : IMessage +public class ToolCallResultMessage : IMessage, ICanGetTextContent { public ToolCallResultMessage(IEnumerable toolCalls, string? from = null) { @@ -18,7 +18,7 @@ public ToolCallResultMessage(IEnumerable toolCalls, string? from = nul public ToolCallResultMessage(string result, string functionName, string functionArgs, string? from = null) { this.From = from; - var toolCall = new ToolCall(functionName, functionArgs); + var toolCall = new ToolCall(functionName, functionArgs) { ToolCallId = functionName }; toolCall.Result = result; this.ToolCalls = [toolCall]; } @@ -30,6 +30,15 @@ public ToolCallResultMessage(string result, string functionName, string function public string? From { get; set; } + public string? GetContent() + { + var results = this.ToolCalls + .Where(x => x.Result != null) + .Select(x => x.Result); + + return string.Join("\n", results); + } + public override string ToString() { var sb = new StringBuilder(); @@ -41,16 +50,4 @@ public override string ToString() return sb.ToString(); } - - private void Validate() - { - // each tool call must have a result - foreach (var toolCall in this.ToolCalls) - { - if (string.IsNullOrEmpty(toolCall.Result)) - { - throw new System.ArgumentException($"The tool call {toolCall} does not have a result"); - } - } - } } diff --git a/dotnet/src/AutoGen.Core/Middleware/FunctionCallMiddleware.cs b/dotnet/src/AutoGen.Core/Middleware/FunctionCallMiddleware.cs index 2bc02805538..d0788077b59 100644 --- a/dotnet/src/AutoGen.Core/Middleware/FunctionCallMiddleware.cs +++ b/dotnet/src/AutoGen.Core/Middleware/FunctionCallMiddleware.cs @@ -18,8 +18,7 @@ namespace AutoGen.Core; /// Otherwise, the message will be sent to the inner agent. In this situation /// if the reply from the inner agent is , /// and the tool calls is available in this middleware's function map, the tools from the reply will be invoked, -/// and a where TMessage1 is and TMessage2 is "/> -/// will be returned. +/// and a will be returned. /// /// If the reply from the inner agent is but the tool calls is not available in this middleware's function map, /// or the reply from the inner agent is not , the original reply from the inner agent will be returned. @@ -128,13 +127,13 @@ private async Task InvokeToolCallMessagesBeforeInvokingAg if (this.functionMap?.TryGetValue(functionName, out var func) is true) { var result = await func(functionArguments); - toolCallResult.Add(new ToolCall(functionName, functionArguments, result)); + toolCallResult.Add(new ToolCall(functionName, functionArguments, result) { ToolCallId = toolCall.ToolCallId }); } else if (this.functionMap is not null) { var errorMessage = $"Function {functionName} is not available. Available functions are: {string.Join(", ", this.functionMap.Select(f => f.Key))}"; - toolCallResult.Add(new ToolCall(functionName, functionArguments, errorMessage)); + toolCallResult.Add(new ToolCall(functionName, functionArguments, errorMessage) { ToolCallId = toolCall.ToolCallId }); } else { @@ -156,14 +155,14 @@ private async Task InvokeToolCallMessagesAfterInvokingAgentAsync(ToolC if (this.functionMap?.TryGetValue(fName, out var func) is true) { var result = await func(fArgs); - toolCallResult.Add(new ToolCall(fName, fArgs, result)); + toolCallResult.Add(new ToolCall(fName, fArgs, result) { ToolCallId = toolCall.ToolCallId }); } } if (toolCallResult.Count() > 0) { var toolCallResultMessage = new ToolCallResultMessage(toolCallResult, from: agent.Name); - return new AggregateMessage(toolCallMsg, toolCallResultMessage, from: agent.Name); + return new ToolCallAggregateMessage(toolCallMsg, toolCallResultMessage, from: agent.Name); } else { diff --git a/dotnet/src/AutoGen.DotnetInteractive/AutoGen.DotnetInteractive.csproj b/dotnet/src/AutoGen.DotnetInteractive/AutoGen.DotnetInteractive.csproj index 57fcb1fce16..72c67fe7801 100644 --- a/dotnet/src/AutoGen.DotnetInteractive/AutoGen.DotnetInteractive.csproj +++ b/dotnet/src/AutoGen.DotnetInteractive/AutoGen.DotnetInteractive.csproj @@ -19,7 +19,7 @@ - + @@ -27,14 +27,12 @@ - - + - - + diff --git a/dotnet/src/AutoGen.DotnetInteractive/DotnetInteractiveFunction.cs b/dotnet/src/AutoGen.DotnetInteractive/DotnetInteractiveFunction.cs index 5587694882c..bb5504cd548 100644 --- a/dotnet/src/AutoGen.DotnetInteractive/DotnetInteractiveFunction.cs +++ b/dotnet/src/AutoGen.DotnetInteractive/DotnetInteractiveFunction.cs @@ -12,57 +12,58 @@ namespace AutoGen.DotnetInteractive; public class DotnetInteractiveFunction : IDisposable { private readonly InteractiveService? _interactiveService = null; - private string? _notebookPath; + private string _notebookPath; private readonly KernelInfoCollection _kernelInfoCollection = new KernelInfoCollection(); + /// + /// Create an instance of " + /// + /// interactive service to use. + /// notebook path if provided. public DotnetInteractiveFunction(InteractiveService interactiveService, string? notebookPath = null, bool continueFromExistingNotebook = false) { this._interactiveService = interactiveService; - this._notebookPath = notebookPath; + this._notebookPath = notebookPath ?? Path.GetTempPath() + "notebook.ipynb"; this._kernelInfoCollection.Add(new KernelInfo("csharp")); this._kernelInfoCollection.Add(new KernelInfo("markdown")); - - if (this._notebookPath != null) + if (continueFromExistingNotebook == false) { - if (continueFromExistingNotebook == false) + // remove existing notebook + if (File.Exists(this._notebookPath)) { - // remove existing notebook - if (File.Exists(this._notebookPath)) - { - File.Delete(this._notebookPath); - } + File.Delete(this._notebookPath); + } - var document = new InteractiveDocument(); + var document = new InteractiveDocument(); - using var stream = File.OpenWrite(_notebookPath); - Notebook.Write(document, stream, this._kernelInfoCollection); - stream.Flush(); - stream.Dispose(); - } - else if (continueFromExistingNotebook == true && File.Exists(this._notebookPath)) + using var stream = File.OpenWrite(_notebookPath); + Notebook.Write(document, stream, this._kernelInfoCollection); + stream.Flush(); + stream.Dispose(); + } + else if (continueFromExistingNotebook == true && File.Exists(this._notebookPath)) + { + // load existing notebook + using var readStream = File.OpenRead(this._notebookPath); + var document = Notebook.Read(readStream, this._kernelInfoCollection); + foreach (var cell in document.Elements) { - // load existing notebook - using var readStream = File.OpenRead(this._notebookPath); - var document = Notebook.Read(readStream, this._kernelInfoCollection); - foreach (var cell in document.Elements) + if (cell.KernelName == "csharp") { - if (cell.KernelName == "csharp") - { - var code = cell.Contents; - this._interactiveService.SubmitCSharpCodeAsync(code, default).Wait(); - } + var code = cell.Contents; + this._interactiveService.SubmitCSharpCodeAsync(code, default).Wait(); } } - else - { - // create an empty notebook - var document = new InteractiveDocument(); + } + else + { + // create an empty notebook + var document = new InteractiveDocument(); - using var stream = File.OpenWrite(_notebookPath); - Notebook.Write(document, stream, this._kernelInfoCollection); - stream.Flush(); - stream.Dispose(); - } + using var stream = File.OpenWrite(_notebookPath); + Notebook.Write(document, stream, this._kernelInfoCollection); + stream.Flush(); + stream.Dispose(); } } diff --git a/dotnet/src/AutoGen.DotnetInteractive/InteractiveService.cs b/dotnet/src/AutoGen.DotnetInteractive/InteractiveService.cs index 0dc34f24e44..7490b64e126 100644 --- a/dotnet/src/AutoGen.DotnetInteractive/InteractiveService.cs +++ b/dotnet/src/AutoGen.DotnetInteractive/InteractiveService.cs @@ -5,7 +5,6 @@ using System.Reactive.Linq; using System.Reflection; using Microsoft.DotNet.Interactive; -using Microsoft.DotNet.Interactive.App.Connection; using Microsoft.DotNet.Interactive.Commands; using Microsoft.DotNet.Interactive.Connection; using Microsoft.DotNet.Interactive.Events; @@ -41,7 +40,7 @@ public InteractiveService(string installingDirectory) public async Task StartAsync(string workingDirectory, CancellationToken ct = default) { - this.kernel = await this.CreateKernelAsync(workingDirectory, ct); + this.kernel = await this.CreateKernelAsync(workingDirectory, true, ct); return true; } @@ -84,7 +83,51 @@ public async Task StartAsync(string workingDirectory, CancellationToken ct return await this.SubmitCommandAsync(command, ct); } - private async Task CreateKernelAsync(string workingDirectory, CancellationToken ct = default) + public bool RestoreDotnetInteractive() + { + this.WriteLine("Restore dotnet interactive tool"); + // write RestoreInteractive.config from embedded resource to this.workingDirectory + var assembly = Assembly.GetAssembly(typeof(InteractiveService))!; + var resourceName = "AutoGen.DotnetInteractive.RestoreInteractive.config"; + using (var stream = assembly.GetManifestResourceStream(resourceName)!) + using (var fileStream = File.Create(Path.Combine(this.installingDirectory, "RestoreInteractive.config"))) + { + stream.CopyTo(fileStream); + } + + // write dotnet-tool.json from embedded resource to this.workingDirectory + + resourceName = "AutoGen.DotnetInteractive.dotnet-tools.json"; + using (var stream2 = assembly.GetManifestResourceStream(resourceName)!) + using (var fileStream2 = File.Create(Path.Combine(this.installingDirectory, "dotnet-tools.json"))) + { + stream2.CopyTo(fileStream2); + } + + var psi = new ProcessStartInfo + { + FileName = "dotnet", + Arguments = $"tool restore --configfile RestoreInteractive.config", + WorkingDirectory = this.installingDirectory, + RedirectStandardInput = true, + RedirectStandardOutput = true, + RedirectStandardError = true, + UseShellExecute = false, + CreateNoWindow = true, + }; + + using var process = new Process { StartInfo = psi }; + process.OutputDataReceived += this.PrintProcessOutput; + process.ErrorDataReceived += this.PrintProcessOutput; + process.Start(); + process.BeginErrorReadLine(); + process.BeginOutputReadLine(); + process.WaitForExit(); + + return process.ExitCode == 0; + } + + private async Task CreateKernelAsync(string workingDirectory, bool restoreWhenFail = true, CancellationToken ct = default) { try { @@ -139,13 +182,13 @@ await rootProxyKernel.SendAsync( return compositeKernel; } - catch (CommandLineInvocationException ex) when (ex.Message.Contains("Cannot find a tool in the manifest file that has a command named 'dotnet-interactive'")) + catch (CommandLineInvocationException) when (restoreWhenFail) { var success = this.RestoreDotnetInteractive(); if (success) { - return await this.CreateKernelAsync(workingDirectory, ct); + return await this.CreateKernelAsync(workingDirectory, false, ct); } throw; @@ -176,50 +219,6 @@ private void WriteLine(string data) this.Output?.Invoke(this, data); } - private bool RestoreDotnetInteractive() - { - this.WriteLine("Restore dotnet interactive tool"); - // write RestoreInteractive.config from embedded resource to this.workingDirectory - var assembly = Assembly.GetAssembly(typeof(InteractiveService))!; - var resourceName = "AutoGen.DotnetInteractive.RestoreInteractive.config"; - using (var stream = assembly.GetManifestResourceStream(resourceName)!) - using (var fileStream = File.Create(Path.Combine(this.installingDirectory, "RestoreInteractive.config"))) - { - stream.CopyTo(fileStream); - } - - // write dotnet-tool.json from embedded resource to this.workingDirectory - - resourceName = "AutoGen.DotnetInteractive.dotnet-tools.json"; - using (var stream2 = assembly.GetManifestResourceStream(resourceName)!) - using (var fileStream2 = File.Create(Path.Combine(this.installingDirectory, "dotnet-tools.json"))) - { - stream2.CopyTo(fileStream2); - } - - var psi = new ProcessStartInfo - { - FileName = "dotnet", - Arguments = $"tool restore --configfile RestoreInteractive.config", - WorkingDirectory = this.installingDirectory, - RedirectStandardInput = true, - RedirectStandardOutput = true, - RedirectStandardError = true, - UseShellExecute = false, - CreateNoWindow = true, - }; - - using var process = new Process { StartInfo = psi }; - process.OutputDataReceived += this.PrintProcessOutput; - process.ErrorDataReceived += this.PrintProcessOutput; - process.Start(); - process.BeginErrorReadLine(); - process.BeginOutputReadLine(); - process.WaitForExit(); - - return process.ExitCode == 0; - } - private void PrintProcessOutput(object sender, DataReceivedEventArgs e) { if (!string.IsNullOrEmpty(e.Data)) diff --git a/dotnet/src/AutoGen.DotnetInteractive/dotnet-tools.json b/dotnet/src/AutoGen.DotnetInteractive/dotnet-tools.json index b2677b61678..12b09e61cae 100644 --- a/dotnet/src/AutoGen.DotnetInteractive/dotnet-tools.json +++ b/dotnet/src/AutoGen.DotnetInteractive/dotnet-tools.json @@ -3,7 +3,7 @@ "isRoot": true, "tools": { "Microsoft.dotnet-interactive": { - "version": "1.0.431302", + "version": "1.0.522904", "commands": [ "dotnet-interactive" ] diff --git a/dotnet/src/AutoGen.Mistral/DTOs/ChatMessage.cs b/dotnet/src/AutoGen.Mistral/DTOs/ChatMessage.cs index c5dae2aa34d..b0fa1757c12 100644 --- a/dotnet/src/AutoGen.Mistral/DTOs/ChatMessage.cs +++ b/dotnet/src/AutoGen.Mistral/DTOs/ChatMessage.cs @@ -13,7 +13,7 @@ public class ChatMessage /// /// role. /// content. - public ChatMessage(RoleEnum? role = default(RoleEnum?), string? content = null) + public ChatMessage(RoleEnum? role = default, string? content = null) { this.Role = role; this.Content = content; @@ -67,18 +67,25 @@ public enum RoleEnum [JsonPropertyName("tool_calls")] public List? ToolCalls { get; set; } + + [JsonPropertyName("tool_call_id")] + public string? ToolCallId { get; set; } } public class FunctionContent { - public FunctionContent(FunctionCall function) + public FunctionContent(string id, FunctionCall function) { this.Function = function; + this.Id = id; } [JsonPropertyName("function")] public FunctionCall Function { get; set; } + [JsonPropertyName("id")] + public string Id { get; set; } + public class FunctionCall { public FunctionCall(string name, string arguments) diff --git a/dotnet/src/AutoGen.Mistral/Middleware/MistralChatMessageConnector.cs b/dotnet/src/AutoGen.Mistral/Middleware/MistralChatMessageConnector.cs index 3ba910aa700..95592e97fcc 100644 --- a/dotnet/src/AutoGen.Mistral/Middleware/MistralChatMessageConnector.cs +++ b/dotnet/src/AutoGen.Mistral/Middleware/MistralChatMessageConnector.cs @@ -158,7 +158,7 @@ private IMessage PostProcessMessage(ChatCompletionResponse response, IAgent from else if (finishReason == Choice.FinishReasonEnum.ToolCalls) { var functionContents = choice.Message?.ToolCalls ?? throw new ArgumentNullException("choice.Message.ToolCalls"); - var toolCalls = functionContents.Select(f => new ToolCall(f.Function.Name, f.Function.Arguments)).ToList(); + var toolCalls = functionContents.Select(f => new ToolCall(f.Function.Name, f.Function.Arguments) { ToolCallId = f.Id }).ToList(); return new ToolCallMessage(toolCalls, from: from.Name); } else @@ -257,6 +257,7 @@ private IEnumerable> ProcessToolCallResultMessage(ToolCall var message = new ChatMessage(ChatMessage.RoleEnum.Tool, content: toolCall.Result) { Name = toolCall.FunctionName, + ToolCallId = toolCall.ToolCallId, }; messages.Add(message); @@ -305,10 +306,12 @@ private IEnumerable> ProcessToolCallMessage(ToolCallMessag // convert tool call message to chat message var chatMessage = new ChatMessage(ChatMessage.RoleEnum.Assistant); chatMessage.ToolCalls = new List(); - foreach (var toolCall in toolCallMessage.ToolCalls) + for (var i = 0; i < toolCallMessage.ToolCalls.Count; i++) { + var toolCall = toolCallMessage.ToolCalls[i]; + var toolCallId = toolCall.ToolCallId ?? $"{toolCall.FunctionName}_{i}"; var functionCall = new FunctionContent.FunctionCall(toolCall.FunctionName, toolCall.FunctionArguments); - var functionContent = new FunctionContent(functionCall); + var functionContent = new FunctionContent(toolCallId, functionCall); chatMessage.ToolCalls.Add(functionContent); } diff --git a/dotnet/src/AutoGen.Ollama/Agent/OllamaAgent.cs b/dotnet/src/AutoGen.Ollama/Agent/OllamaAgent.cs new file mode 100644 index 00000000000..9ef68388d60 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Agent/OllamaAgent.cs @@ -0,0 +1,185 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaAgent.cs + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Net.Http; +using System.Runtime.CompilerServices; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using AutoGen.Core; + +namespace AutoGen.Ollama; + +/// +/// An agent that can interact with ollama models. +/// +public class OllamaAgent : IStreamingAgent +{ + private readonly HttpClient _httpClient; + private readonly string _modelName; + private readonly string _systemMessage; + private readonly OllamaReplyOptions? _replyOptions; + + public OllamaAgent(HttpClient httpClient, string name, string modelName, + string systemMessage = "You are a helpful AI assistant", + OllamaReplyOptions? replyOptions = null) + { + Name = name; + _httpClient = httpClient; + _modelName = modelName; + _systemMessage = systemMessage; + _replyOptions = replyOptions; + } + + public async Task GenerateReplyAsync( + IEnumerable messages, GenerateReplyOptions? options = null, CancellationToken cancellation = default) + { + ChatRequest request = await BuildChatRequest(messages, options); + request.Stream = false; + var httpRequest = BuildRequest(request); + using (HttpResponseMessage? response = await _httpClient.SendAsync(httpRequest, HttpCompletionOption.ResponseContentRead, cancellation)) + { + response.EnsureSuccessStatusCode(); + Stream? streamResponse = await response.Content.ReadAsStreamAsync(); + ChatResponse chatResponse = await JsonSerializer.DeserializeAsync(streamResponse, cancellationToken: cancellation) + ?? throw new Exception("Failed to deserialize response"); + var output = new MessageEnvelope(chatResponse, from: Name); + return output; + } + } + + public async IAsyncEnumerable GenerateStreamingReplyAsync( + IEnumerable messages, + GenerateReplyOptions? options = null, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + ChatRequest request = await BuildChatRequest(messages, options); + request.Stream = true; + HttpRequestMessage message = BuildRequest(request); + using (HttpResponseMessage? response = await _httpClient.SendAsync(message, HttpCompletionOption.ResponseHeadersRead, cancellationToken)) + { + response.EnsureSuccessStatusCode(); + using Stream? stream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false); + using var reader = new StreamReader(stream); + + while (!reader.EndOfStream && !cancellationToken.IsCancellationRequested) + { + string? line = await reader.ReadLineAsync(); + if (string.IsNullOrWhiteSpace(line)) + { + continue; + } + + ChatResponseUpdate? update = JsonSerializer.Deserialize(line); + if (update is { Done: false }) + { + yield return new MessageEnvelope(update, from: Name); + } + else + { + var finalUpdate = JsonSerializer.Deserialize(line) ?? throw new Exception("Failed to deserialize response"); + + yield return new MessageEnvelope(finalUpdate, from: Name); + } + } + } + } + + public string Name { get; } + + private async Task BuildChatRequest(IEnumerable messages, GenerateReplyOptions? options) + { + var request = new ChatRequest + { + Model = _modelName, + Messages = await BuildChatHistory(messages) + }; + + if (options is OllamaReplyOptions replyOptions) + { + BuildChatRequestOptions(replyOptions, request); + return request; + } + + if (_replyOptions != null) + { + BuildChatRequestOptions(_replyOptions, request); + return request; + } + return request; + } + private void BuildChatRequestOptions(OllamaReplyOptions replyOptions, ChatRequest request) + { + request.Format = replyOptions.Format == FormatType.Json ? OllamaConsts.JsonFormatType : null; + request.Template = replyOptions.Template; + request.KeepAlive = replyOptions.KeepAlive; + + if (replyOptions.Temperature != null + || replyOptions.MaxToken != null + || replyOptions.StopSequence != null + || replyOptions.Seed != null + || replyOptions.MiroStat != null + || replyOptions.MiroStatEta != null + || replyOptions.MiroStatTau != null + || replyOptions.NumCtx != null + || replyOptions.NumGqa != null + || replyOptions.NumGpu != null + || replyOptions.NumThread != null + || replyOptions.RepeatLastN != null + || replyOptions.RepeatPenalty != null + || replyOptions.TopK != null + || replyOptions.TopP != null + || replyOptions.TfsZ != null) + { + request.Options = new ModelReplyOptions + { + Temperature = replyOptions.Temperature, + NumPredict = replyOptions.MaxToken, + Stop = replyOptions.StopSequence?[0], + Seed = replyOptions.Seed, + MiroStat = replyOptions.MiroStat, + MiroStatEta = replyOptions.MiroStatEta, + MiroStatTau = replyOptions.MiroStatTau, + NumCtx = replyOptions.NumCtx, + NumGqa = replyOptions.NumGqa, + NumGpu = replyOptions.NumGpu, + NumThread = replyOptions.NumThread, + RepeatLastN = replyOptions.RepeatLastN, + RepeatPenalty = replyOptions.RepeatPenalty, + TopK = replyOptions.TopK, + TopP = replyOptions.TopP, + TfsZ = replyOptions.TfsZ + }; + } + } + private async Task> BuildChatHistory(IEnumerable messages) + { + var history = messages.Select(m => m switch + { + IMessage chatMessage => chatMessage.Content, + _ => throw new ArgumentException("Invalid message type") + }); + + // if there's no system message in the history, add one to the beginning + if (!history.Any(m => m.Role == "system")) + { + history = new[] { new Message() { Role = "system", Value = _systemMessage } }.Concat(history); + } + + return history.ToList(); + } + + private static HttpRequestMessage BuildRequest(ChatRequest request) + { + string serialized = JsonSerializer.Serialize(request); + return new HttpRequestMessage(HttpMethod.Post, OllamaConsts.ChatCompletionEndpoint) + { + Content = new StringContent(serialized, Encoding.UTF8, OllamaConsts.JsonMediaType) + }; + } +} diff --git a/dotnet/src/AutoGen.Ollama/AutoGen.Ollama.csproj b/dotnet/src/AutoGen.Ollama/AutoGen.Ollama.csproj new file mode 100644 index 00000000000..20924a476b7 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/AutoGen.Ollama.csproj @@ -0,0 +1,13 @@ + + + + netstandard2.0 + AutoGen.Ollama + True + + + + + + + diff --git a/dotnet/src/AutoGen.Ollama/DTOs/ChatRequest.cs b/dotnet/src/AutoGen.Ollama/DTOs/ChatRequest.cs new file mode 100644 index 00000000000..3b0cf04a1a0 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/ChatRequest.cs @@ -0,0 +1,53 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ChatRequest.cs + +using System.Collections.Generic; +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class ChatRequest +{ + /// + /// (required) the model name + /// + [JsonPropertyName("model")] + public string Model { get; set; } = string.Empty; + + /// + /// the messages of the chat, this can be used to keep a chat memory + /// + [JsonPropertyName("messages")] + public IList Messages { get; set; } = []; + + /// + /// the format to return a response in. Currently, the only accepted value is json + /// + [JsonPropertyName("format")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? Format { get; set; } + + /// + /// additional model parameters listed in the documentation for the Modelfile such as temperature + /// + [JsonPropertyName("options")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public ModelReplyOptions? Options { get; set; } + /// + /// the prompt template to use (overrides what is defined in the Modelfile) + /// + [JsonPropertyName("template")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? Template { get; set; } + /// + /// if false the response will be returned as a single response object, rather than a stream of objects + /// + [JsonPropertyName("stream")] + public bool Stream { get; set; } + /// + /// controls how long the model will stay loaded into memory following the request (default: 5m) + /// + [JsonPropertyName("keep_alive")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? KeepAlive { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/DTOs/ChatResponse.cs b/dotnet/src/AutoGen.Ollama/DTOs/ChatResponse.cs new file mode 100644 index 00000000000..7d8142de785 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/ChatResponse.cs @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ChatResponse.cs + +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class ChatResponse : ChatResponseUpdate +{ + /// + /// time spent generating the response + /// + [JsonPropertyName("total_duration")] + public long TotalDuration { get; set; } + + /// + /// time spent in nanoseconds loading the model + /// + [JsonPropertyName("load_duration")] + public long LoadDuration { get; set; } + + /// + /// number of tokens in the prompt + /// + [JsonPropertyName("prompt_eval_count")] + public int PromptEvalCount { get; set; } + + /// + /// time spent in nanoseconds evaluating the prompt + /// + [JsonPropertyName("prompt_eval_duration")] + public long PromptEvalDuration { get; set; } + + /// + /// number of tokens the response + /// + [JsonPropertyName("eval_count")] + public int EvalCount { get; set; } + + /// + /// time in nanoseconds spent generating the response + /// + [JsonPropertyName("eval_duration")] + public long EvalDuration { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/DTOs/ChatResponseUpdate.cs b/dotnet/src/AutoGen.Ollama/DTOs/ChatResponseUpdate.cs new file mode 100644 index 00000000000..8b4dac194f4 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/ChatResponseUpdate.cs @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ChatResponseUpdate.cs + +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class ChatResponseUpdate +{ + [JsonPropertyName("model")] + public string Model { get; set; } = string.Empty; + + [JsonPropertyName("created_at")] + public string CreatedAt { get; set; } = string.Empty; + + [JsonPropertyName("message")] + public Message? Message { get; set; } + + [JsonPropertyName("done")] + public bool Done { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/DTOs/Message.cs b/dotnet/src/AutoGen.Ollama/DTOs/Message.cs new file mode 100644 index 00000000000..2e0d891cc61 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/Message.cs @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ChatResponseUpdate.cs + +using System.Collections.Generic; +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class Message +{ + public Message() + { + } + + public Message(string role, string value) + { + Role = role; + Value = value; + } + + /// + /// the role of the message, either system, user or assistant + /// + [JsonPropertyName("role")] + public string Role { get; set; } = string.Empty; + /// + /// the content of the message + /// + [JsonPropertyName("content")] + public string Value { get; set; } = string.Empty; + + /// + /// (optional): a list of images to include in the message (for multimodal models such as llava) + /// + [JsonPropertyName("images")] + public IList? Images { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/DTOs/ModelReplyOptions.cs b/dotnet/src/AutoGen.Ollama/DTOs/ModelReplyOptions.cs new file mode 100644 index 00000000000..9d54a1bb83b --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/ModelReplyOptions.cs @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ModelReplyOptions.cs + +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +//https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values +public class ModelReplyOptions +{ + /// + /// Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + /// + [JsonPropertyName("mirostat")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? MiroStat { get; set; } + + /// + /// Influences how quickly the algorithm responds to feedback from the generated text. + /// A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) + /// + [JsonPropertyName("mirostat_eta")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public float? MiroStatEta { get; set; } + + /// + /// Controls the balance between coherence and diversity of the output. + /// A lower value will result in more focused and coherent text. (Default: 5.0) + /// + [JsonPropertyName("mirostat_tau")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public float? MiroStatTau { get; set; } + + /// + /// Sets the size of the context window used to generate the next token. (Default: 2048) + /// + [JsonPropertyName("num_ctx")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? NumCtx { get; set; } + + /// + /// The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b + /// + [JsonPropertyName("num_gqa")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? NumGqa { get; set; } + + /// + /// The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. + /// + [JsonPropertyName("num_gpu")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? NumGpu { get; set; } + + /// + /// Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. + /// It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). + /// + [JsonPropertyName("num_thread")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? NumThread { get; set; } + + /// + /// Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) + /// + [JsonPropertyName("repeat_last_n")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? RepeatLastN { get; set; } + + /// + /// Sets how strongly to penalize repetitions. + /// A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) + /// + [JsonPropertyName("repeat_penalty")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public float? RepeatPenalty { get; set; } + + /// + /// The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) + /// + [JsonPropertyName("temperature")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public float? Temperature { get; set; } + + /// + /// Sets the random number seed to use for generation. + /// Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) + /// + [JsonPropertyName("seed")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? Seed { get; set; } + + /// + /// Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. + /// Multiple stop patterns may be set by specifying multiple separate stop parameters in a modelfile. + /// + [JsonPropertyName("stop")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? Stop { get; set; } + + /// + /// Tail free sampling is used to reduce the impact of less probable tokens from the output. + /// A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) + /// + [JsonPropertyName("tfs_z")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public float? TfsZ { get; set; } + + /// + /// Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) + /// + [JsonPropertyName("num_predict")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? NumPredict { get; set; } + + /// + /// Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) + /// + [JsonPropertyName("top_k")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? TopK { get; set; } + + /// + /// Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) + /// + [JsonPropertyName("top_p")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public int? TopP { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/DTOs/OllamaReplyOptions.cs b/dotnet/src/AutoGen.Ollama/DTOs/OllamaReplyOptions.cs new file mode 100644 index 00000000000..c7c77d1db25 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/DTOs/OllamaReplyOptions.cs @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaReplyOptions.cs + +using AutoGen.Core; + +namespace AutoGen.Ollama; + +public enum FormatType +{ + None, + Json, +} + +public class OllamaReplyOptions : GenerateReplyOptions +{ + /// + /// the format to return a response in. Currently, the only accepted value is json + /// + public FormatType Format { get; set; } = FormatType.None; + + /// + /// the prompt template to use (overrides what is defined in the Modelfile) + /// + public string? Template { get; set; } + + /// + /// The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) + /// + public new float? Temperature { get; set; } + + /// + /// controls how long the model will stay loaded into memory following the request (default: 5m) + /// + public string? KeepAlive { get; set; } + + /// + /// Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + /// + public int? MiroStat { get; set; } + + /// + /// Influences how quickly the algorithm responds to feedback from the generated text. + /// A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) + /// + public float? MiroStatEta { get; set; } + + /// + /// Controls the balance between coherence and diversity of the output. + /// A lower value will result in more focused and coherent text. (Default: 5.0) + /// + public float? MiroStatTau { get; set; } + + /// + /// Sets the size of the context window used to generate the next token. (Default: 2048) + /// + public int? NumCtx { get; set; } + + /// + /// The number of GQA groups in the transformer layer. Required for some models, for example it is 8 for llama2:70b + /// + public int? NumGqa { get; set; } + + /// + /// The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. + /// + public int? NumGpu { get; set; } + + /// + /// Sets the number of threads to use during computation. By default, Ollama will detect this for optimal performance. + /// It is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). + /// + public int? NumThread { get; set; } + + /// + /// Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) + /// + public int? RepeatLastN { get; set; } + + /// + /// Sets how strongly to penalize repetitions. + /// A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) + /// + public float? RepeatPenalty { get; set; } + + /// + /// Sets the random number seed to use for generation. + /// Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) + /// + public int? Seed { get; set; } + + /// + /// Tail free sampling is used to reduce the impact of less probable tokens from the output. + /// A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) + /// + public float? TfsZ { get; set; } + + /// + /// Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) + /// + public new int? MaxToken { get; set; } + + /// + /// Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) + /// + public int? TopK { get; set; } + + /// + /// Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) + /// + public int? TopP { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/Embeddings/ITextEmbeddingService.cs b/dotnet/src/AutoGen.Ollama/Embeddings/ITextEmbeddingService.cs new file mode 100644 index 00000000000..5ce0dc8cc40 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Embeddings/ITextEmbeddingService.cs @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// ITextEmbeddingService.cs + +using System.Threading; +using System.Threading.Tasks; + +namespace AutoGen.Ollama; + +public interface ITextEmbeddingService +{ + public Task GenerateAsync(TextEmbeddingsRequest request, CancellationToken cancellationToken); +} diff --git a/dotnet/src/AutoGen.Ollama/Embeddings/OllamaTextEmbeddingService.cs b/dotnet/src/AutoGen.Ollama/Embeddings/OllamaTextEmbeddingService.cs new file mode 100644 index 00000000000..2e431e7bcb8 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Embeddings/OllamaTextEmbeddingService.cs @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaTextEmbeddingService.cs + +using System; +using System.IO; +using System.Net.Http; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; + +namespace AutoGen.Ollama; + +public class OllamaTextEmbeddingService : ITextEmbeddingService +{ + private readonly HttpClient _client; + + public OllamaTextEmbeddingService(HttpClient client) + { + _client = client; + } + public async Task GenerateAsync(TextEmbeddingsRequest request, CancellationToken cancellationToken = default) + { + using (HttpResponseMessage? response = await _client + .SendAsync(BuildPostRequest(request), HttpCompletionOption.ResponseContentRead, cancellationToken)) + { + response.EnsureSuccessStatusCode(); + + Stream? streamResponse = await response.Content.ReadAsStreamAsync(); + TextEmbeddingsResponse output = await JsonSerializer + .DeserializeAsync(streamResponse, cancellationToken: cancellationToken) + ?? throw new Exception("Failed to deserialize response"); + return output; + } + } + private static HttpRequestMessage BuildPostRequest(TextEmbeddingsRequest request) + { + string serialized = JsonSerializer.Serialize(request); + return new HttpRequestMessage(HttpMethod.Post, OllamaConsts.EmbeddingsEndpoint) + { + Content = new StringContent(serialized, Encoding.UTF8, OllamaConsts.JsonMediaType) + }; + } +} diff --git a/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsRequest.cs b/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsRequest.cs new file mode 100644 index 00000000000..7f2531c522a --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsRequest.cs @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// TextEmbeddingsRequest.cs + +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class TextEmbeddingsRequest +{ + /// + /// name of model to generate embeddings from + /// + [JsonPropertyName("model")] + public string Model { get; set; } = string.Empty; + /// + /// text to generate embeddings for + /// + [JsonPropertyName("prompt")] + public string Prompt { get; set; } = string.Empty; + /// + /// additional model parameters listed in the documentation for the Modelfile such as temperature + /// + [JsonPropertyName("options")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public ModelReplyOptions? Options { get; set; } + /// + /// controls how long the model will stay loaded into memory following the request (default: 5m) + /// + [JsonPropertyName("keep_alive")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? KeepAlive { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsResponse.cs b/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsResponse.cs new file mode 100644 index 00000000000..580059c033b --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Embeddings/TextEmbeddingsResponse.cs @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// TextEmbeddingsResponse.cs + +using System.Text.Json.Serialization; + +namespace AutoGen.Ollama; + +public class TextEmbeddingsResponse +{ + [JsonPropertyName("embedding")] + public double[]? Embedding { get; set; } +} diff --git a/dotnet/src/AutoGen.Ollama/Extension/OllamaAgentExtension.cs b/dotnet/src/AutoGen.Ollama/Extension/OllamaAgentExtension.cs new file mode 100644 index 00000000000..4c0df513ef8 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Extension/OllamaAgentExtension.cs @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaAgentExtension.cs + +using AutoGen.Core; + +namespace AutoGen.Ollama.Extension; + +public static class OllamaAgentExtension +{ + /// + /// Register an to the + /// + /// the connector to use. If null, a new instance of will be created. + public static MiddlewareStreamingAgent RegisterMessageConnector( + this OllamaAgent agent, OllamaMessageConnector? connector = null) + { + if (connector == null) + { + connector = new OllamaMessageConnector(); + } + + return agent.RegisterStreamingMiddleware(connector); + } + + /// + /// Register an to the where T is + /// + /// the connector to use. If null, a new instance of will be created. + public static MiddlewareStreamingAgent RegisterMessageConnector( + this MiddlewareStreamingAgent agent, OllamaMessageConnector? connector = null) + { + if (connector == null) + { + connector = new OllamaMessageConnector(); + } + + return agent.RegisterStreamingMiddleware(connector); + } +} diff --git a/dotnet/src/AutoGen.Ollama/Middlewares/OllamaMessageConnector.cs b/dotnet/src/AutoGen.Ollama/Middlewares/OllamaMessageConnector.cs new file mode 100644 index 00000000000..a21ec3a1c99 --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/Middlewares/OllamaMessageConnector.cs @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaMessageConnector.cs + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net.Http; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using AutoGen.Core; + +namespace AutoGen.Ollama; + +public class OllamaMessageConnector : IStreamingMiddleware +{ + public string Name => nameof(OllamaMessageConnector); + + public async Task InvokeAsync(MiddlewareContext context, IAgent agent, + CancellationToken cancellationToken = default) + { + var messages = ProcessMessage(context.Messages, agent); + IMessage reply = await agent.GenerateReplyAsync(messages, context.Options, cancellationToken); + + return reply switch + { + IMessage messageEnvelope when messageEnvelope.Content.Message?.Value is string content => new TextMessage(Role.Assistant, content, messageEnvelope.From), + IMessage messageEnvelope when messageEnvelope.Content.Message?.Value is null => throw new InvalidOperationException("Message content is null"), + _ => reply + }; + } + + public async IAsyncEnumerable InvokeAsync(MiddlewareContext context, IStreamingAgent agent, + [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + var messages = ProcessMessage(context.Messages, agent); + var chunks = new List(); + await foreach (var update in agent.GenerateStreamingReplyAsync(messages, context.Options, cancellationToken)) + { + if (update is IStreamingMessage chatResponseUpdate) + { + var response = chatResponseUpdate.Content switch + { + _ when chatResponseUpdate.Content.Message?.Value is string content => new TextMessageUpdate(Role.Assistant, content, chatResponseUpdate.From), + _ => null, + }; + + if (response != null) + { + chunks.Add(chatResponseUpdate.Content); + yield return response; + } + } + else + { + yield return update; + } + } + + if (chunks.Count == 0) + { + yield break; + } + + // if the chunks are not empty, aggregate them into a single message + var messageContent = string.Join(string.Empty, chunks.Select(c => c.Message?.Value)); + var message = new TextMessage(Role.Assistant, messageContent, agent.Name); + + yield return message; + } + + private IEnumerable ProcessMessage(IEnumerable messages, IAgent agent) + { + return messages.SelectMany(m => + { + if (m is IMessage messageEnvelope) + { + return [m]; + } + else + { + return m switch + { + TextMessage textMessage => ProcessTextMessage(textMessage, agent), + ImageMessage imageMessage => ProcessImageMessage(imageMessage, agent), + MultiModalMessage multiModalMessage => ProcessMultiModalMessage(multiModalMessage, agent), + _ => [m], + }; + } + }); + } + + private IEnumerable ProcessMultiModalMessage(MultiModalMessage multiModalMessage, IAgent agent) + { + var textMessages = multiModalMessage.Content.Where(m => m is TextMessage textMessage && textMessage.GetContent() is not null); + var imageMessages = multiModalMessage.Content.Where(m => m is ImageMessage); + + // aggregate the text messages into one message + // by concatenating the content using newline + var textContent = string.Join("\n", textMessages.Select(m => ((TextMessage)m).Content)); + + // collect all the images + var images = imageMessages.SelectMany(m => ProcessImageMessage((ImageMessage)m, agent) + .SelectMany(m => (m as IMessage)?.Content.Images)); + + var message = new Message() + { + Role = "user", + Value = textContent, + Images = images.ToList(), + }; + + return [MessageEnvelope.Create(message, agent.Name)]; + } + + private IEnumerable ProcessImageMessage(ImageMessage imageMessage, IAgent agent) + { + byte[]? data = imageMessage.Data?.ToArray(); + if (data is null) + { + if (imageMessage.Url is null) + { + throw new InvalidOperationException("Invalid ImageMessage, the data or url must be provided"); + } + + var uri = new Uri(imageMessage.Url); + // download the image from the URL + using var client = new HttpClient(); + var response = client.GetAsync(uri).Result; + if (!response.IsSuccessStatusCode) + { + throw new HttpRequestException($"Failed to download the image from {uri}"); + } + + data = response.Content.ReadAsByteArrayAsync().Result; + } + + var base64Image = Convert.ToBase64String(data); + var message = imageMessage.From switch + { + null when imageMessage.Role == Role.User => new Message { Role = "user", Images = [base64Image] }, + null => throw new InvalidOperationException("Invalid Role, the role must be user"), + _ when imageMessage.From != agent.Name => new Message { Role = "user", Images = [base64Image] }, + _ => throw new InvalidOperationException("The from field must be null or the agent name"), + }; + + return [MessageEnvelope.Create(message, agent.Name)]; + } + + private IEnumerable ProcessTextMessage(TextMessage textMessage, IAgent agent) + { + if (textMessage.Role == Role.System) + { + var message = new Message + { + Role = "system", + Value = textMessage.Content + }; + + return [MessageEnvelope.Create(message, agent.Name)]; + } + else if (textMessage.From == agent.Name) + { + var message = new Message + { + Role = "assistant", + Value = textMessage.Content + }; + + return [MessageEnvelope.Create(message, agent.Name)]; + } + else + { + var message = textMessage.From switch + { + null when textMessage.Role == Role.User => new Message { Role = "user", Value = textMessage.Content }, + null when textMessage.Role == Role.Assistant => new Message { Role = "assistant", Value = textMessage.Content }, + null => throw new InvalidOperationException("Invalid Role"), + _ when textMessage.From != agent.Name => new Message { Role = "user", Value = textMessage.Content }, + _ => throw new InvalidOperationException("The from field must be null or the agent name"), + }; + + return [MessageEnvelope.Create(message, agent.Name)]; + } + } +} diff --git a/dotnet/src/AutoGen.Ollama/OllamaConsts.cs b/dotnet/src/AutoGen.Ollama/OllamaConsts.cs new file mode 100644 index 00000000000..f305446a9aa --- /dev/null +++ b/dotnet/src/AutoGen.Ollama/OllamaConsts.cs @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaConsts.cs + +namespace AutoGen.Ollama; + +public class OllamaConsts +{ + public const string JsonFormatType = "json"; + public const string JsonMediaType = "application/json"; + public const string ChatCompletionEndpoint = "/api/chat"; + public const string EmbeddingsEndpoint = "/api/embeddings"; +} diff --git a/dotnet/src/AutoGen.OpenAI/Agent/GPTAgent.cs b/dotnet/src/AutoGen.OpenAI/Agent/GPTAgent.cs index 52070788e34..cdc6cc464d1 100644 --- a/dotnet/src/AutoGen.OpenAI/Agent/GPTAgent.cs +++ b/dotnet/src/AutoGen.OpenAI/Agent/GPTAgent.cs @@ -29,10 +29,8 @@ namespace AutoGen.OpenAI; /// public class GPTAgent : IStreamingAgent { - private readonly IDictionary>>? functionMap; private readonly OpenAIClient openAIClient; - private readonly string? modelName; - private readonly OpenAIChatAgent _innerAgent; + private readonly IStreamingAgent _innerAgent; public GPTAgent( string name, @@ -52,16 +50,23 @@ public GPTAgent( _ => throw new ArgumentException($"Unsupported config type {config.GetType()}"), }; - modelName = config switch + var modelName = config switch { AzureOpenAIConfig azureConfig => azureConfig.DeploymentName, OpenAIConfig openAIConfig => openAIConfig.ModelId, _ => throw new ArgumentException($"Unsupported config type {config.GetType()}"), }; - _innerAgent = new OpenAIChatAgent(openAIClient, name, modelName, systemMessage, temperature, maxTokens, seed, responseFormat, functions); + _innerAgent = new OpenAIChatAgent(openAIClient, name, modelName, systemMessage, temperature, maxTokens, seed, responseFormat, functions) + .RegisterMessageConnector(); + + if (functionMap is not null) + { + var functionMapMiddleware = new FunctionCallMiddleware(functionMap: functionMap); + _innerAgent = _innerAgent.RegisterStreamingMiddleware(functionMapMiddleware); + } + Name = name; - this.functionMap = functionMap; } public GPTAgent( @@ -77,10 +82,16 @@ public GPTAgent( IDictionary>>? functionMap = null) { this.openAIClient = openAIClient; - this.modelName = modelName; Name = name; - this.functionMap = functionMap; - _innerAgent = new OpenAIChatAgent(openAIClient, name, modelName, systemMessage, temperature, maxTokens, seed, responseFormat, functions); + + _innerAgent = new OpenAIChatAgent(openAIClient, name, modelName, systemMessage, temperature, maxTokens, seed, responseFormat, functions) + .RegisterMessageConnector(); + + if (functionMap is not null) + { + var functionMapMiddleware = new FunctionCallMiddleware(functionMap: functionMap); + _innerAgent = _innerAgent.RegisterStreamingMiddleware(functionMapMiddleware); + } } public string Name { get; } @@ -90,14 +101,7 @@ public async Task GenerateReplyAsync( GenerateReplyOptions? options = null, CancellationToken cancellationToken = default) { - var agent = this._innerAgent.RegisterMessageConnector(); - if (this.functionMap is not null) - { - var functionMapMiddleware = new FunctionCallMiddleware(functionMap: this.functionMap); - agent = agent.RegisterStreamingMiddleware(functionMapMiddleware); - } - - return await agent.GenerateReplyAsync(messages, options, cancellationToken); + return await _innerAgent.GenerateReplyAsync(messages, options, cancellationToken); } public IAsyncEnumerable GenerateStreamingReplyAsync( @@ -105,13 +109,6 @@ public IAsyncEnumerable GenerateStreamingReplyAsync( GenerateReplyOptions? options = null, CancellationToken cancellationToken = default) { - var agent = this._innerAgent.RegisterMessageConnector(); - if (this.functionMap is not null) - { - var functionMapMiddleware = new FunctionCallMiddleware(functionMap: this.functionMap); - agent = agent.RegisterStreamingMiddleware(functionMapMiddleware); - } - - return agent.GenerateStreamingReplyAsync(messages, options, cancellationToken); + return _innerAgent.GenerateStreamingReplyAsync(messages, options, cancellationToken); } } diff --git a/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs b/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs index b3dfb1e8668..ed795e5e8ed 100644 --- a/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs +++ b/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs @@ -12,6 +12,8 @@ public static class MessageExtension { public static string TEXT_CONTENT_TYPE = "text"; public static string IMAGE_CONTENT_TYPE = "image"; + + [Obsolete("This method is deprecated, please replace Message with one of the built-in message types.")] public static ChatRequestUserMessage ToChatRequestUserMessage(this Message message) { if (message.Value is ChatRequestUserMessage message1) @@ -50,6 +52,7 @@ public static ChatRequestUserMessage ToChatRequestUserMessage(this Message messa throw new ArgumentException("Content is null and metadata is null"); } + [Obsolete("This method is deprecated")] public static IEnumerable ToOpenAIChatRequestMessage(this IAgent agent, IMessage message) { if (message is IMessage oaiMessage) diff --git a/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs b/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs index 2bd9470ffa7..8f1825e2fa0 100644 --- a/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs +++ b/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs @@ -19,7 +19,6 @@ namespace AutoGen.OpenAI; /// - /// - /// - -/// - /// - where T is /// - where TMessage1 is and TMessage2 is /// @@ -27,6 +26,11 @@ public class OpenAIChatRequestMessageConnector : IMiddleware, IStreamingMiddlewa { private bool strictMode = false; + /// + /// Create a new instance of . + /// + /// If true, will throw an + /// When the message type is not supported. If false, it will ignore the unsupported message type. public OpenAIChatRequestMessageConnector(bool strictMode = false) { this.strictMode = strictMode; @@ -36,8 +40,7 @@ public OpenAIChatRequestMessageConnector(bool strictMode = false) public async Task InvokeAsync(MiddlewareContext context, IAgent agent, CancellationToken cancellationToken = default) { - var chatMessages = ProcessIncomingMessages(agent, context.Messages) - .Select(m => new MessageEnvelope(m)); + var chatMessages = ProcessIncomingMessages(agent, context.Messages); var reply = await agent.GenerateReplyAsync(chatMessages, context.Options, cancellationToken); @@ -49,8 +52,7 @@ public async IAsyncEnumerable InvokeAsync( IStreamingAgent agent, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - var chatMessages = ProcessIncomingMessages(agent, context.Messages) - .Select(m => new MessageEnvelope(m)); + var chatMessages = ProcessIncomingMessages(agent, context.Messages); var streamingReply = agent.GenerateStreamingReplyAsync(chatMessages, context.Options, cancellationToken); string? currentToolName = null; await foreach (var reply in streamingReply) @@ -73,7 +75,14 @@ public async IAsyncEnumerable InvokeAsync( } else { - yield return reply; + if (this.strictMode) + { + throw new InvalidOperationException($"Invalid streaming message type {reply.GetType().Name}"); + } + else + { + yield return reply; + } } } } @@ -82,16 +91,10 @@ public IMessage PostProcessMessage(IMessage message) { return message switch { - TextMessage => message, - ImageMessage => message, - MultiModalMessage => message, - ToolCallMessage => message, - ToolCallResultMessage => message, - Message => message, - AggregateMessage => message, - IMessage m => PostProcessMessage(m), - IMessage m => PostProcessMessage(m), - _ => throw new InvalidOperationException("The type of message is not supported. Must be one of TextMessage, ImageMessage, MultiModalMessage, ToolCallMessage, ToolCallResultMessage, Message, IMessage, AggregateMessage"), + IMessage m => PostProcessChatResponseMessage(m.Content, m.From), + IMessage m => PostProcessChatCompletions(m), + _ when strictMode is false => message, + _ => throw new InvalidOperationException($"Invalid return message type {message.GetType().Name}"), }; } @@ -120,12 +123,7 @@ public IMessage PostProcessMessage(IMessage message) } } - private IMessage PostProcessMessage(IMessage message) - { - return PostProcessMessage(message.Content, message.From); - } - - private IMessage PostProcessMessage(IMessage message) + private IMessage PostProcessChatCompletions(IMessage message) { // throw exception if prompt filter results is not null if (message.Content.Choices[0].FinishReason == CompletionsFinishReason.ContentFiltered) @@ -133,12 +131,12 @@ private IMessage PostProcessMessage(IMessage message) throw new InvalidOperationException("The content is filtered because its potential risk. Please try another input."); } - return PostProcessMessage(message.Content.Choices[0].Message, message.From); + return PostProcessChatResponseMessage(message.Content.Choices[0].Message, message.From); } - private IMessage PostProcessMessage(ChatResponseMessage chatResponseMessage, string? from) + private IMessage PostProcessChatResponseMessage(ChatResponseMessage chatResponseMessage, string? from) { - if (chatResponseMessage.Content is string content) + if (chatResponseMessage.Content is string content && !string.IsNullOrEmpty(content)) { return new TextMessage(Role.Assistant, content, from); } @@ -154,7 +152,7 @@ private IMessage PostProcessMessage(ChatResponseMessage chatResponseMessage, str .Where(tc => tc is ChatCompletionsFunctionToolCall) .Select(tc => (ChatCompletionsFunctionToolCall)tc); - var toolCalls = functionToolCalls.Select(tc => new ToolCall(tc.Name, tc.Arguments)); + var toolCalls = functionToolCalls.Select(tc => new ToolCall(tc.Name, tc.Arguments) { ToolCallId = tc.Id }); return new ToolCallMessage(toolCalls, from); } @@ -162,112 +160,44 @@ private IMessage PostProcessMessage(ChatResponseMessage chatResponseMessage, str throw new InvalidOperationException("Invalid ChatResponseMessage"); } - public IEnumerable ProcessIncomingMessages(IAgent agent, IEnumerable messages) + public IEnumerable ProcessIncomingMessages(IAgent agent, IEnumerable messages) { - return messages.SelectMany(m => + return messages.SelectMany(m => { - if (m.From == null) + if (m is IMessage crm) { - return ProcessIncomingMessagesWithEmptyFrom(m); - } - else if (m.From == agent.Name) - { - return ProcessIncomingMessagesForSelf(m); + return [crm]; } else { - return ProcessIncomingMessagesForOther(m); + var chatRequestMessages = m switch + { + TextMessage textMessage => ProcessTextMessage(agent, textMessage), + ImageMessage imageMessage when (imageMessage.From is null || imageMessage.From != agent.Name) => ProcessImageMessage(agent, imageMessage), + MultiModalMessage multiModalMessage when (multiModalMessage.From is null || multiModalMessage.From != agent.Name) => ProcessMultiModalMessage(agent, multiModalMessage), + ToolCallMessage toolCallMessage when (toolCallMessage.From is null || toolCallMessage.From == agent.Name) => ProcessToolCallMessage(agent, toolCallMessage), + ToolCallResultMessage toolCallResultMessage => ProcessToolCallResultMessage(toolCallResultMessage), + AggregateMessage aggregateMessage => ProcessFunctionCallMiddlewareMessage(agent, aggregateMessage), +#pragma warning disable CS0618 // deprecated + Message msg => ProcessMessage(agent, msg), +#pragma warning restore CS0618 // deprecated + _ when strictMode is false => [], + _ => throw new InvalidOperationException($"Invalid message type: {m.GetType().Name}"), + }; + + if (chatRequestMessages.Any()) + { + return chatRequestMessages.Select(cm => MessageEnvelope.Create(cm, m.From)); + } + else + { + return [m]; + } } }); } - private IEnumerable ProcessIncomingMessagesForSelf(IMessage message) - { - return message switch - { - TextMessage textMessage => ProcessIncomingMessagesForSelf(textMessage), - ImageMessage imageMessage => ProcessIncomingMessagesForSelf(imageMessage), - MultiModalMessage multiModalMessage => ProcessIncomingMessagesForSelf(multiModalMessage), - ToolCallMessage toolCallMessage => ProcessIncomingMessagesForSelf(toolCallMessage), - ToolCallResultMessage toolCallResultMessage => ProcessIncomingMessagesForSelf(toolCallResultMessage), - Message msg => ProcessIncomingMessagesForSelf(msg), - IMessage crm => ProcessIncomingMessagesForSelf(crm), - AggregateMessage aggregateMessage => ProcessIncomingMessagesForSelf(aggregateMessage), - _ => throw new NotImplementedException(), - }; - } - - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(IMessage message) - { - return message switch - { - TextMessage textMessage => ProcessIncomingMessagesWithEmptyFrom(textMessage), - ImageMessage imageMessage => ProcessIncomingMessagesWithEmptyFrom(imageMessage), - MultiModalMessage multiModalMessage => ProcessIncomingMessagesWithEmptyFrom(multiModalMessage), - ToolCallMessage toolCallMessage => ProcessIncomingMessagesWithEmptyFrom(toolCallMessage), - ToolCallResultMessage toolCallResultMessage => ProcessIncomingMessagesWithEmptyFrom(toolCallResultMessage), - Message msg => ProcessIncomingMessagesWithEmptyFrom(msg), - IMessage crm => ProcessIncomingMessagesWithEmptyFrom(crm), - AggregateMessage aggregateMessage => ProcessIncomingMessagesWithEmptyFrom(aggregateMessage), - _ => throw new NotImplementedException(), - }; - } - - private IEnumerable ProcessIncomingMessagesForOther(IMessage message) - { - return message switch - { - TextMessage textMessage => ProcessIncomingMessagesForOther(textMessage), - ImageMessage imageMessage => ProcessIncomingMessagesForOther(imageMessage), - MultiModalMessage multiModalMessage => ProcessIncomingMessagesForOther(multiModalMessage), - ToolCallMessage toolCallMessage => ProcessIncomingMessagesForOther(toolCallMessage), - ToolCallResultMessage toolCallResultMessage => ProcessIncomingMessagesForOther(toolCallResultMessage), - Message msg => ProcessIncomingMessagesForOther(msg), - IMessage crm => ProcessIncomingMessagesForOther(crm), - AggregateMessage aggregateMessage => ProcessIncomingMessagesForOther(aggregateMessage), - _ => throw new NotImplementedException(), - }; - } - - private IEnumerable ProcessIncomingMessagesForSelf(TextMessage message) - { - if (message.Role == Role.System) - { - return new[] { new ChatRequestSystemMessage(message.Content) }; - } - else - { - return new[] { new ChatRequestAssistantMessage(message.Content) }; - } - } - - private IEnumerable ProcessIncomingMessagesForSelf(ImageMessage _) - { - return [new ChatRequestAssistantMessage("// Image Message is not supported")]; - } - - private IEnumerable ProcessIncomingMessagesForSelf(MultiModalMessage _) - { - return [new ChatRequestAssistantMessage("// MultiModal Message is not supported")]; - } - - private IEnumerable ProcessIncomingMessagesForSelf(ToolCallMessage message) - { - var toolCall = message.ToolCalls.Select(tc => new ChatCompletionsFunctionToolCall(tc.FunctionName, tc.FunctionName, tc.FunctionArguments)); - var chatRequestMessage = new ChatRequestAssistantMessage(string.Empty); - foreach (var tc in toolCall) - { - chatRequestMessage.ToolCalls.Add(tc); - } - - return new[] { chatRequestMessage }; - } - - private IEnumerable ProcessIncomingMessagesForSelf(ToolCallResultMessage message) - { - return message.ToolCalls.Select(tc => new ChatRequestToolMessage(tc.Result, tc.FunctionName)); - } - + [Obsolete("This method is deprecated, please use ProcessIncomingMessages(IAgent agent, IEnumerable messages) instead.")] private IEnumerable ProcessIncomingMessagesForSelf(Message message) { if (message.Role == Role.System) @@ -303,151 +233,147 @@ private IEnumerable ProcessIncomingMessagesForSelf(Message m } } - private IEnumerable ProcessIncomingMessagesForSelf(IMessage message) - { - return new[] { message.Content }; - } - - private IEnumerable ProcessIncomingMessagesForSelf(AggregateMessage aggregateMessage) + [Obsolete("This method is deprecated, please use ProcessIncomingMessages(IAgent agent, IEnumerable messages) instead.")] + private IEnumerable ProcessIncomingMessagesForOther(Message message) { - var toolCallMessage1 = aggregateMessage.Message1; - var toolCallResultMessage = aggregateMessage.Message2; - - var assistantMessage = new ChatRequestAssistantMessage(string.Empty); - var toolCalls = toolCallMessage1.ToolCalls.Select(tc => new ChatCompletionsFunctionToolCall(tc.FunctionName, tc.FunctionName, tc.FunctionArguments)); - foreach (var tc in toolCalls) + if (message.Role == Role.System) { - assistantMessage.ToolCalls.Add(tc); + return [new ChatRequestSystemMessage(message.Content) { Name = message.From }]; } + else if (message.Content is string content && content is { Length: > 0 }) + { + if (message.FunctionName is not null) + { + return new[] { new ChatRequestToolMessage(content, message.FunctionName) }; + } - var toolCallResults = toolCallResultMessage.ToolCalls.Select(tc => new ChatRequestToolMessage(tc.Result, tc.FunctionName)); - - // return assistantMessage and tool call result messages - var messages = new List { assistantMessage }; - messages.AddRange(toolCallResults); - - return messages; + return [new ChatRequestUserMessage(message.Content) { Name = message.From }]; + } + else if (message.FunctionName is string _) + { + return [new ChatRequestUserMessage("// Message type is not supported") { Name = message.From }]; + } + else + { + throw new InvalidOperationException("Invalid Message as message from other."); + } } - private IEnumerable ProcessIncomingMessagesForOther(TextMessage message) + private IEnumerable ProcessTextMessage(IAgent agent, TextMessage message) { if (message.Role == Role.System) { - return new[] { new ChatRequestSystemMessage(message.Content) }; + return [new ChatRequestSystemMessage(message.Content) { Name = message.From }]; + } + + if (agent.Name == message.From) + { + return [new ChatRequestAssistantMessage(message.Content) { Name = agent.Name }]; } else { - return new[] { new ChatRequestUserMessage(message.Content) }; + return message.From switch + { + null when message.Role == Role.User => [new ChatRequestUserMessage(message.Content)], + null when message.Role == Role.Assistant => [new ChatRequestAssistantMessage(message.Content)], + null => throw new InvalidOperationException("Invalid Role"), + _ => [new ChatRequestUserMessage(message.Content) { Name = message.From }] + }; } } - private IEnumerable ProcessIncomingMessagesForOther(ImageMessage message) + private IEnumerable ProcessImageMessage(IAgent agent, ImageMessage message) { - return new[] { new ChatRequestUserMessage([ - new ChatMessageImageContentItem(new Uri(message.Url ?? message.BuildDataUri())), - ])}; + if (agent.Name == message.From) + { + // image message from assistant is not supported + throw new ArgumentException("ImageMessage is not supported when message.From is the same with agent"); + } + + var imageContentItem = this.CreateChatMessageImageContentItemFromImageMessage(message); + return [new ChatRequestUserMessage([imageContentItem]) { Name = message.From }]; } - private IEnumerable ProcessIncomingMessagesForOther(MultiModalMessage message) + private IEnumerable ProcessMultiModalMessage(IAgent agent, MultiModalMessage message) { + if (agent.Name == message.From) + { + // image message from assistant is not supported + throw new ArgumentException("MultiModalMessage is not supported when message.From is the same with agent"); + } + IEnumerable items = message.Content.Select(ci => ci switch { TextMessage text => new ChatMessageTextContentItem(text.Content), - ImageMessage image => new ChatMessageImageContentItem(new Uri(image.Url ?? image.BuildDataUri())), + ImageMessage image => this.CreateChatMessageImageContentItemFromImageMessage(image), _ => throw new NotImplementedException(), }); - return new[] { new ChatRequestUserMessage(items) }; + return [new ChatRequestUserMessage(items) { Name = message.From }]; } - private IEnumerable ProcessIncomingMessagesForOther(ToolCallMessage msg) + private ChatMessageImageContentItem CreateChatMessageImageContentItemFromImageMessage(ImageMessage message) { - throw new ArgumentException("ToolCallMessage is not supported when message.From is not the same with agent"); + return message.Data is null + ? new ChatMessageImageContentItem(new Uri(message.Url)) + : new ChatMessageImageContentItem(message.Data, message.Data.MediaType); } - private IEnumerable ProcessIncomingMessagesForOther(ToolCallResultMessage message) + private IEnumerable ProcessToolCallMessage(IAgent agent, ToolCallMessage message) { - return message.ToolCalls.Select(tc => new ChatRequestToolMessage(tc.Result, tc.FunctionName)); - } - - private IEnumerable ProcessIncomingMessagesForOther(Message message) - { - if (message.Role == Role.System) + if (message.From is not null && message.From != agent.Name) { - return new[] { new ChatRequestSystemMessage(message.Content) }; + throw new ArgumentException("ToolCallMessage is not supported when message.From is not the same with agent"); } - else if (message.Content is string content && content is { Length: > 0 }) - { - if (message.FunctionName is not null) - { - return new[] { new ChatRequestToolMessage(content, message.FunctionName) }; - } - return new[] { new ChatRequestUserMessage(message.Content) }; - } - else if (message.FunctionName is string _) - { - return new[] - { - new ChatRequestUserMessage("// Message type is not supported"), - }; - } - else + var toolCall = message.ToolCalls.Select((tc, i) => new ChatCompletionsFunctionToolCall(tc.ToolCallId ?? $"{tc.FunctionName}_{i}", tc.FunctionName, tc.FunctionArguments)); + var chatRequestMessage = new ChatRequestAssistantMessage(string.Empty) { Name = message.From }; + foreach (var tc in toolCall) { - throw new InvalidOperationException("Invalid Message as message from other."); + chatRequestMessage.ToolCalls.Add(tc); } - } - - private IEnumerable ProcessIncomingMessagesForOther(IMessage message) - { - return new[] { message.Content }; - } - - private IEnumerable ProcessIncomingMessagesForOther(AggregateMessage aggregateMessage) - { - // convert as user message - var resultMessage = aggregateMessage.Message2; - - return resultMessage.ToolCalls.Select(tc => new ChatRequestUserMessage(tc.Result)); - } - - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(TextMessage message) - { - return ProcessIncomingMessagesForOther(message); - } - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(ImageMessage message) - { - return ProcessIncomingMessagesForOther(message); + return [chatRequestMessage]; } - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(MultiModalMessage message) + private IEnumerable ProcessToolCallResultMessage(ToolCallResultMessage message) { - return ProcessIncomingMessagesForOther(message); + return message.ToolCalls + .Where(tc => tc.Result is not null) + .Select((tc, i) => new ChatRequestToolMessage(tc.Result, tc.ToolCallId ?? $"{tc.FunctionName}_{i}")); } - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(ToolCallMessage message) + [Obsolete("This method is deprecated, please use ProcessIncomingMessages(IAgent agent, IEnumerable messages) instead.")] + private IEnumerable ProcessMessage(IAgent agent, Message message) { - return ProcessIncomingMessagesForSelf(message); + if (message.From is not null && message.From != agent.Name) + { + return ProcessIncomingMessagesForOther(message); + } + else + { + return ProcessIncomingMessagesForSelf(message); + } } - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(ToolCallResultMessage message) + private IEnumerable ProcessFunctionCallMiddlewareMessage(IAgent agent, AggregateMessage aggregateMessage) { - return ProcessIncomingMessagesForOther(message); - } + if (aggregateMessage.From is not null && aggregateMessage.From != agent.Name) + { + // convert as user message + var resultMessage = aggregateMessage.Message2; - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(Message message) - { - return ProcessIncomingMessagesForOther(message); - } + return resultMessage.ToolCalls.Select(tc => new ChatRequestUserMessage(tc.Result) { Name = aggregateMessage.From }); + } + else + { + var toolCallMessage1 = aggregateMessage.Message1; + var toolCallResultMessage = aggregateMessage.Message2; - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(IMessage message) - { - return new[] { message.Content }; - } + var assistantMessage = this.ProcessToolCallMessage(agent, toolCallMessage1); + var toolCallResults = this.ProcessToolCallResultMessage(toolCallResultMessage); - private IEnumerable ProcessIncomingMessagesWithEmptyFrom(AggregateMessage aggregateMessage) - { - return ProcessIncomingMessagesForOther(aggregateMessage); + return assistantMessage.Concat(toolCallResults); + } } } diff --git a/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs b/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs index 6a8395ef22e..6ce242eb1ab 100644 --- a/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs +++ b/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs @@ -133,7 +133,9 @@ private IEnumerable ProcessMessageForSelf(IMessage message) { TextMessage textMessage => ProcessMessageForSelf(textMessage), MultiModalMessage multiModalMessage => ProcessMessageForSelf(multiModalMessage), +#pragma warning disable CS0618 // deprecated Message m => ProcessMessageForSelf(m), +#pragma warning restore CS0618 // deprecated _ => throw new System.NotImplementedException(), }; } @@ -145,7 +147,9 @@ private IEnumerable ProcessMessageForOthers(IMessage message TextMessage textMessage => ProcessMessageForOthers(textMessage), MultiModalMessage multiModalMessage => ProcessMessageForOthers(multiModalMessage), ImageMessage imageMessage => ProcessMessageForOthers(imageMessage), +#pragma warning disable CS0618 // deprecated Message m => ProcessMessageForOthers(m), +#pragma warning restore CS0618 // deprecated _ => throw new InvalidOperationException("unsupported message type, only support TextMessage, ImageMessage, MultiModalMessage and Message."), }; } @@ -208,7 +212,7 @@ private IEnumerable ProcessMessageForOthers(MultiModalMessag return [new ChatMessageContent(AuthorRole.User, collections)]; } - + [Obsolete("This method is deprecated, please use the specific method instead.")] private IEnumerable ProcessMessageForSelf(Message message) { if (message.Role == Role.System) @@ -229,6 +233,7 @@ private IEnumerable ProcessMessageForSelf(Message message) } } + [Obsolete("This method is deprecated, please use the specific method instead.")] private IEnumerable ProcessMessageForOthers(Message message) { if (message.Role == Role.System) diff --git a/dotnet/src/AutoGen.SourceGenerator/AutoGen.SourceGenerator.csproj b/dotnet/src/AutoGen.SourceGenerator/AutoGen.SourceGenerator.csproj index 4558160722d..37f344ed11e 100644 --- a/dotnet/src/AutoGen.SourceGenerator/AutoGen.SourceGenerator.csproj +++ b/dotnet/src/AutoGen.SourceGenerator/AutoGen.SourceGenerator.csproj @@ -14,7 +14,7 @@ - + AutoGen.SourceGenerator @@ -50,6 +50,10 @@ + + + + True diff --git a/dotnet/src/AutoGen.SourceGenerator/FunctionCallGenerator.cs b/dotnet/src/AutoGen.SourceGenerator/FunctionCallGenerator.cs index 50bdc03f0af..cd01416182b 100644 --- a/dotnet/src/AutoGen.SourceGenerator/FunctionCallGenerator.cs +++ b/dotnet/src/AutoGen.SourceGenerator/FunctionCallGenerator.cs @@ -144,7 +144,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context) private class PartialClassOutput { - public PartialClassOutput(string fullClassName, ClassDeclarationSyntax classDeclarationSyntax, IEnumerable functionContracts) + public PartialClassOutput(string fullClassName, ClassDeclarationSyntax classDeclarationSyntax, IEnumerable functionContracts) { FullClassName = fullClassName; ClassDeclarationSyntax = classDeclarationSyntax; @@ -155,10 +155,10 @@ public PartialClassOutput(string fullClassName, ClassDeclarationSyntax classDecl public ClassDeclarationSyntax ClassDeclarationSyntax { get; } - public IEnumerable FunctionContracts { get; } + public IEnumerable FunctionContracts { get; } } - private FunctionContract CreateFunctionContract(MethodDeclarationSyntax method, string? className, string? namespaceName) + private SourceGeneratorFunctionContract CreateFunctionContract(MethodDeclarationSyntax method, string? className, string? namespaceName) { // get function_call attribute var functionCallAttribute = method.AttributeLists.SelectMany(attributeList => attributeList.Attributes) @@ -208,7 +208,7 @@ private FunctionContract CreateFunctionContract(MethodDeclarationSyntax method, description = System.Text.RegularExpressions.Regex.Replace(description, @"[^\S\r\n]+\/[\/]+\s*", string.Empty); } var jsonItemType = parameter.Type!.ToString().EndsWith("[]") ? parameter.Type!.ToString().Substring(0, parameter.Type!.ToString().Length - 2) : null; - return new ParameterContract + return new SourceGeneratorParameterContract { Name = parameter.Identifier.ToString(), JsonType = parameter.Type!.ToString() switch @@ -234,7 +234,7 @@ private FunctionContract CreateFunctionContract(MethodDeclarationSyntax method, }; }); - return new FunctionContract + return new SourceGeneratorFunctionContract { ClassName = className, Namespace = namespaceName, diff --git a/dotnet/src/AutoGen.SourceGenerator/FunctionExtension.cs b/dotnet/src/AutoGen.SourceGenerator/FunctionExtension.cs index a56e4cb54f4..cfb77d26a2b 100644 --- a/dotnet/src/AutoGen.SourceGenerator/FunctionExtension.cs +++ b/dotnet/src/AutoGen.SourceGenerator/FunctionExtension.cs @@ -5,27 +5,27 @@ internal static class FunctionExtension { - public static string GetFunctionName(this FunctionContract function) + public static string GetFunctionName(this SourceGeneratorFunctionContract function) { return function.Name ?? string.Empty; } - public static string GetFunctionSchemaClassName(this FunctionContract function) + public static string GetFunctionSchemaClassName(this SourceGeneratorFunctionContract function) { return $"{function.GetFunctionName()}Schema"; } - public static string GetFunctionDefinitionName(this FunctionContract function) + public static string GetFunctionDefinitionName(this SourceGeneratorFunctionContract function) { return $"{function.GetFunctionName()}Function"; } - public static string GetFunctionWrapperName(this FunctionContract function) + public static string GetFunctionWrapperName(this SourceGeneratorFunctionContract function) { return $"{function.GetFunctionName()}Wrapper"; } - public static string GetFunctionContractName(this FunctionContract function) + public static string GetFunctionContractName(this SourceGeneratorFunctionContract function) { return $"{function.GetFunctionName()}FunctionContract"; } diff --git a/dotnet/src/AutoGen.SourceGenerator/FunctionContract.cs b/dotnet/src/AutoGen.SourceGenerator/SourceGeneratorFunctionContract.cs similarity index 81% rename from dotnet/src/AutoGen.SourceGenerator/FunctionContract.cs rename to dotnet/src/AutoGen.SourceGenerator/SourceGeneratorFunctionContract.cs index 2f26352173d..24e42affa3b 100644 --- a/dotnet/src/AutoGen.SourceGenerator/FunctionContract.cs +++ b/dotnet/src/AutoGen.SourceGenerator/SourceGeneratorFunctionContract.cs @@ -3,7 +3,7 @@ namespace AutoGen.SourceGenerator { - internal class FunctionContract + internal class SourceGeneratorFunctionContract { public string? Namespace { get; set; } @@ -15,12 +15,12 @@ internal class FunctionContract public string? ReturnDescription { get; set; } - public ParameterContract[]? Parameters { get; set; } + public SourceGeneratorParameterContract[]? Parameters { get; set; } public string? ReturnType { get; set; } } - internal class ParameterContract + internal class SourceGeneratorParameterContract { public string? Name { get; set; } diff --git a/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.cs b/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.cs index 1d455bd3041..e56db112eb7 100644 --- a/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.cs +++ b/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.cs @@ -31,7 +31,6 @@ public virtual string TransformText() // This code was generated by a tool. // //---------------------- -using Azure.AI.OpenAI; using System.Text.Json; using System.Text.Json.Serialization; using System.Threading.Tasks; @@ -152,7 +151,8 @@ public virtual string TransformText() } this.Write(" },\r\n"); } - this.Write(" };\r\n }\r\n\r\n public Azure.AI.OpenAI.FunctionDefinition "); + this.Write(" };\r\n }\r\n\r\n public global::Azure.AI.OpenAI.FunctionDefin" + + "ition "); this.Write(this.ToStringHelper.ToStringWithCulture(functionContract.GetFunctionDefinitionName())); this.Write("\r\n {\r\n get => this."); this.Write(this.ToStringHelper.ToStringWithCulture(functionContract.GetFunctionContractName())); @@ -168,7 +168,7 @@ public virtual string TransformText() public string NameSpace {get; set;} public string ClassName {get; set;} -public IEnumerable FunctionContracts {get; set;} +public IEnumerable FunctionContracts {get; set;} public bool IsStatic {get; set;} = false; } diff --git a/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.tt b/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.tt index baa2a680fe2..526dfe400ce 100644 --- a/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.tt +++ b/dotnet/src/AutoGen.SourceGenerator/Template/FunctionCallTemplate.tt @@ -8,7 +8,6 @@ // This code was generated by a tool. // //---------------------- -using Azure.AI.OpenAI; using System.Text.Json; using System.Text.Json.Serialization; using System.Threading.Tasks; @@ -98,7 +97,7 @@ namespace <#=NameSpace#> }; } - public Azure.AI.OpenAI.FunctionDefinition <#=functionContract.GetFunctionDefinitionName()#> + public global::Azure.AI.OpenAI.FunctionDefinition <#=functionContract.GetFunctionDefinitionName()#> { get => this.<#=functionContract.GetFunctionContractName()#>.ToOpenAIFunctionDefinition(); } diff --git a/dotnet/test/AutoGen.DotnetInteractive.Tests/AutoGen.DotnetInteractive.Tests.csproj b/dotnet/test/AutoGen.DotnetInteractive.Tests/AutoGen.DotnetInteractive.Tests.csproj new file mode 100644 index 00000000000..cf2c24eaf78 --- /dev/null +++ b/dotnet/test/AutoGen.DotnetInteractive.Tests/AutoGen.DotnetInteractive.Tests.csproj @@ -0,0 +1,24 @@ + + + + $(TestTargetFramework) + enable + false + True + + + + + + + + + + + + + + + + + diff --git a/dotnet/test/AutoGen.DotnetInteractive.Tests/DotnetInteractiveServiceTest.cs b/dotnet/test/AutoGen.DotnetInteractive.Tests/DotnetInteractiveServiceTest.cs new file mode 100644 index 00000000000..0e36053c45e --- /dev/null +++ b/dotnet/test/AutoGen.DotnetInteractive.Tests/DotnetInteractiveServiceTest.cs @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// DotnetInteractiveServiceTest.cs + +using FluentAssertions; +using Xunit; +using Xunit.Abstractions; + +namespace AutoGen.DotnetInteractive.Tests; + +public class DotnetInteractiveServiceTest : IDisposable +{ + private ITestOutputHelper _output; + private InteractiveService _interactiveService; + private string _workingDir; + + public DotnetInteractiveServiceTest(ITestOutputHelper output) + { + _output = output; + _workingDir = Path.Combine(Path.GetTempPath(), "test", Path.GetRandomFileName()); + if (!Directory.Exists(_workingDir)) + { + Directory.CreateDirectory(_workingDir); + } + + _interactiveService = new InteractiveService(_workingDir); + _interactiveService.StartAsync(_workingDir, default).Wait(); + } + + public void Dispose() + { + _interactiveService.Dispose(); + } + + [Fact] + public async Task ItRunCSharpCodeSnippetTestsAsync() + { + var cts = new CancellationTokenSource(); + var isRunning = await _interactiveService.StartAsync(_workingDir, cts.Token); + + isRunning.Should().BeTrue(); + + _interactiveService.IsRunning().Should().BeTrue(); + + // test code snippet + var hello_world = @" +Console.WriteLine(""hello world""); +"; + + await this.TestCSharpCodeSnippet(_interactiveService, hello_world, "hello world"); + await this.TestCSharpCodeSnippet( + _interactiveService, + code: @" +Console.WriteLine(""hello world"" +", + expectedOutput: "Error: (2,32): error CS1026: ) expected"); + + await this.TestCSharpCodeSnippet( + service: _interactiveService, + code: "throw new Exception();", + expectedOutput: "Error: System.Exception: Exception of type 'System.Exception' was thrown"); + } + + [Fact] + public async Task ItRunPowershellScriptTestsAsync() + { + // test power shell + var ps = @"Write-Output ""hello world"""; + await this.TestPowershellCodeSnippet(_interactiveService, ps, "hello world"); + } + + private async Task TestPowershellCodeSnippet(InteractiveService service, string code, string expectedOutput) + { + var result = await service.SubmitPowershellCodeAsync(code, CancellationToken.None); + result.Should().StartWith(expectedOutput); + } + + private async Task TestCSharpCodeSnippet(InteractiveService service, string code, string expectedOutput) + { + var result = await service.SubmitCSharpCodeAsync(code, CancellationToken.None); + result.Should().StartWith(expectedOutput); + } +} diff --git a/dotnet/test/AutoGen.Mistral.Tests/MistralClientAgentTests.cs b/dotnet/test/AutoGen.Mistral.Tests/MistralClientAgentTests.cs index 2b6839dd0ef..3aa61a7a71d 100644 --- a/dotnet/test/AutoGen.Mistral.Tests/MistralClientAgentTests.cs +++ b/dotnet/test/AutoGen.Mistral.Tests/MistralClientAgentTests.cs @@ -87,11 +87,15 @@ public async Task MistralAgentFunctionCallMessageTest() } """; var functionCallResult = await this.GetWeatherWrapper(weatherFunctionArgumets); - + var toolCall = new ToolCall(this.GetWeatherFunctionContract.Name!, weatherFunctionArgumets) + { + ToolCallId = "012345678", // Mistral AI requires the tool call id to be a length of 9 + Result = functionCallResult, + }; IMessage[] chatHistory = [ new TextMessage(Role.User, "what's the weather in Seattle?"), - new ToolCallMessage(this.GetWeatherFunctionContract.Name!, weatherFunctionArgumets, from: agent.Name), - new ToolCallResultMessage(functionCallResult, this.GetWeatherFunctionContract.Name!, weatherFunctionArgumets), + new ToolCallMessage([toolCall], from: agent.Name), + new ToolCallResultMessage([toolCall], weatherFunctionArgumets), ]; var reply = await agent.SendAsync(chatHistory: chatHistory); @@ -152,7 +156,7 @@ public async Task MistralAgentFunctionCallMiddlewareMessageTest() var question = new TextMessage(Role.User, "what's the weather in Seattle?"); var reply = await functionCallAgent.SendAsync(question); - reply.Should().BeOfType>(); + reply.Should().BeOfType(); // resend the reply to the same agent so it can generate the final response // because the reply's from is the agent's name diff --git a/dotnet/test/AutoGen.Ollama.Tests/AutoGen.Ollama.Tests.csproj b/dotnet/test/AutoGen.Ollama.Tests/AutoGen.Ollama.Tests.csproj new file mode 100644 index 00000000000..27f80716f1c --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/AutoGen.Ollama.Tests.csproj @@ -0,0 +1,33 @@ + + + + $(TestTargetFramework) + enable + false + True + + + + + + + + + + + + + + + + + + + PreserveNewest + + + PreserveNewest + + + + diff --git a/dotnet/test/AutoGen.Ollama.Tests/OllamaAgentTests.cs b/dotnet/test/AutoGen.Ollama.Tests/OllamaAgentTests.cs new file mode 100644 index 00000000000..c1fb466f0b0 --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/OllamaAgentTests.cs @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaAgentTests.cs + +using System.Text.Json; +using AutoGen.Core; +using AutoGen.Ollama.Extension; +using AutoGen.Tests; +using FluentAssertions; + +namespace AutoGen.Ollama.Tests; + +public class OllamaAgentTests +{ + [ApiKeyFact("OLLAMA_HOST", "OLLAMA_MODEL_NAME")] + public async Task GenerateReplyAsync_ReturnsValidMessage_WhenCalled() + { + string host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + string modelName = Environment.GetEnvironmentVariable("OLLAMA_MODEL_NAME") + ?? throw new InvalidOperationException("OLLAMA_MODEL_NAME is not set."); + OllamaAgent ollamaAgent = BuildOllamaAgent(host, modelName); + + var message = new Message("user", "hey how are you"); + var messages = new IMessage[] { MessageEnvelope.Create(message, from: modelName) }; + IMessage result = await ollamaAgent.GenerateReplyAsync(messages); + + result.Should().NotBeNull(); + result.Should().BeOfType>(); + result.From.Should().Be(ollamaAgent.Name); + } + + [ApiKeyFact("OLLAMA_HOST", "OLLAMA_MODEL_NAME")] + public async Task GenerateReplyAsync_ReturnsValidJsonMessageContent_WhenCalled() + { + string host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + string modelName = Environment.GetEnvironmentVariable("OLLAMA_MODEL_NAME") + ?? throw new InvalidOperationException("OLLAMA_MODEL_NAME is not set."); + OllamaAgent ollamaAgent = BuildOllamaAgent(host, modelName); + + var message = new Message("user", "What color is the sky at different times of the day? Respond using JSON"); + var messages = new IMessage[] { MessageEnvelope.Create(message, from: modelName) }; + IMessage result = await ollamaAgent.GenerateReplyAsync(messages, new OllamaReplyOptions + { + Format = FormatType.Json + }); + + result.Should().NotBeNull(); + result.Should().BeOfType>(); + result.From.Should().Be(ollamaAgent.Name); + + string jsonContent = ((MessageEnvelope)result).Content.Message!.Value; + bool isValidJson = IsValidJsonMessage(jsonContent); + isValidJson.Should().BeTrue(); + } + + [ApiKeyFact("OLLAMA_HOST", "OLLAMA_MODEL_NAME")] + public async Task GenerateStreamingReplyAsync_ReturnsValidMessages_WhenCalled() + { + string host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + string modelName = Environment.GetEnvironmentVariable("OLLAMA_MODEL_NAME") + ?? throw new InvalidOperationException("OLLAMA_MODEL_NAME is not set."); + OllamaAgent ollamaAgent = BuildOllamaAgent(host, modelName); + + var msg = new Message("user", "hey how are you"); + var messages = new IMessage[] { MessageEnvelope.Create(msg, from: modelName) }; + IStreamingMessage? finalReply = default; + await foreach (IStreamingMessage message in ollamaAgent.GenerateStreamingReplyAsync(messages)) + { + message.Should().NotBeNull(); + message.From.Should().Be(ollamaAgent.Name); + var streamingMessage = (IMessage)message; + if (streamingMessage.Content.Done) + { + finalReply = message; + break; + } + else + { + streamingMessage.Content.Message.Should().NotBeNull(); + streamingMessage.Content.Done.Should().BeFalse(); + } + } + + finalReply.Should().BeOfType>(); + var update = ((MessageEnvelope)finalReply!).Content; + update.Done.Should().BeTrue(); + update.TotalDuration.Should().BeGreaterThan(0); + } + + [ApiKeyFact("OLLAMA_HOST")] + public async Task ItReturnValidMessageUsingLLavaAsync() + { + var host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + var modelName = "llava:latest"; + var ollamaAgent = BuildOllamaAgent(host, modelName); + var imagePath = Path.Combine("images", "image.png"); + var base64Image = Convert.ToBase64String(File.ReadAllBytes(imagePath)); + var message = new Message() + { + Role = "user", + Value = "What's the color of the background in this image", + Images = [base64Image], + }; + + var messages = new IMessage[] { MessageEnvelope.Create(message, from: modelName) }; + var reply = await ollamaAgent.GenerateReplyAsync(messages); + + reply.Should().BeOfType>(); + var chatResponse = ((MessageEnvelope)reply).Content; + chatResponse.Message.Should().NotBeNull(); + } + + [ApiKeyFact("OLLAMA_HOST")] + public async Task ItCanProcessMultiModalMessageUsingLLavaAsync() + { + var host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + var modelName = "llava:latest"; + var ollamaAgent = BuildOllamaAgent(host, modelName) + .RegisterMessageConnector(); + var image = Path.Combine("images", "image.png"); + var binaryData = BinaryData.FromBytes(File.ReadAllBytes(image), "image/png"); + var imageMessage = new ImageMessage(Role.User, binaryData); + var textMessage = new TextMessage(Role.User, "What's in this image?"); + var multiModalMessage = new MultiModalMessage(Role.User, [textMessage, imageMessage]); + + var reply = await ollamaAgent.SendAsync(multiModalMessage); + reply.Should().BeOfType(); + reply.GetRole().Should().Be(Role.Assistant); + reply.GetContent().Should().NotBeNullOrEmpty(); + reply.From.Should().Be(ollamaAgent.Name); + } + + [ApiKeyFact("OLLAMA_HOST")] + public async Task ItCanProcessImageMessageUsingLLavaAsync() + { + var host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + var modelName = "llava:latest"; + var ollamaAgent = BuildOllamaAgent(host, modelName) + .RegisterMessageConnector(); + var image = Path.Combine("images", "image.png"); + var binaryData = BinaryData.FromBytes(File.ReadAllBytes(image), "image/png"); + var imageMessage = new ImageMessage(Role.User, binaryData); + + var reply = await ollamaAgent.SendAsync(imageMessage); + reply.Should().BeOfType(); + reply.GetRole().Should().Be(Role.Assistant); + reply.GetContent().Should().NotBeNullOrEmpty(); + reply.From.Should().Be(ollamaAgent.Name); + } + + [ApiKeyFact("OLLAMA_HOST")] + public async Task ItReturnValidStreamingMessageUsingLLavaAsync() + { + var host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + var modelName = "llava:latest"; + var ollamaAgent = BuildOllamaAgent(host, modelName); + var squareImagePath = Path.Combine("images", "square.png"); + var base64Image = Convert.ToBase64String(File.ReadAllBytes(squareImagePath)); + var imageMessage = new Message() + { + Role = "user", + Value = "What's in this image?", + Images = [base64Image], + }; + + var messages = new IMessage[] { MessageEnvelope.Create(imageMessage, from: modelName) }; + + IStreamingMessage? finalReply = default; + await foreach (IStreamingMessage message in ollamaAgent.GenerateStreamingReplyAsync(messages)) + { + message.Should().NotBeNull(); + message.From.Should().Be(ollamaAgent.Name); + var streamingMessage = (IMessage)message; + if (streamingMessage.Content.Done) + { + finalReply = message; + break; + } + else + { + streamingMessage.Content.Message.Should().NotBeNull(); + streamingMessage.Content.Done.Should().BeFalse(); + } + } + + finalReply.Should().BeOfType>(); + var update = ((MessageEnvelope)finalReply!).Content; + update.Done.Should().BeTrue(); + update.TotalDuration.Should().BeGreaterThan(0); + } + + private static bool IsValidJsonMessage(string input) + { + try + { + JsonDocument.Parse(input); + return true; + } + catch (JsonException) + { + return false; + } + catch (Exception ex) + { + Console.WriteLine("An unexpected exception occurred: " + ex.Message); + return false; + } + } + + private static OllamaAgent BuildOllamaAgent(string host, string modelName) + { + var httpClient = new HttpClient + { + BaseAddress = new Uri(host) + }; + return new OllamaAgent(httpClient, "TestAgent", modelName); + } +} diff --git a/dotnet/test/AutoGen.Ollama.Tests/OllamaMessageTests.cs b/dotnet/test/AutoGen.Ollama.Tests/OllamaMessageTests.cs new file mode 100644 index 00000000000..b19291e9767 --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/OllamaMessageTests.cs @@ -0,0 +1,176 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaMessageTests.cs + +using AutoGen.Core; +using AutoGen.Tests; +using FluentAssertions; +using Xunit; +namespace AutoGen.Ollama.Tests; + +public class OllamaMessageTests +{ + [Fact] + public async Task ItProcessUserTextMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, ct) => + { + msgs.Count().Should().Be(1); + var innerMessage = msgs.First(); + innerMessage.Should().BeOfType>(); + var message = (IMessage)innerMessage; + message.Content.Value.Should().Be("Hello"); + message.Content.Images.Should().BeNullOrEmpty(); + message.Content.Role.Should().Be("user"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(messageConnector); + + // when from is null and role is user + await agent.SendAsync("Hello"); + + // when from is user and role is user + var userMessage = new TextMessage(Role.User, "Hello", from: "user"); + await agent.SendAsync(userMessage); + + // when from is user but role is assistant + userMessage = new TextMessage(Role.Assistant, "Hello", from: "user"); + await agent.SendAsync(userMessage); + } + + [Fact] + public async Task ItProcessStreamingTextMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterStreamingMiddleware(messageConnector); + + var messageChunks = Enumerable.Range(0, 10) + .Select(i => new ChatResponseUpdate() + { + Message = new Message() + { + Value = i.ToString(), + Role = "assistant", + } + }) + .Select(m => MessageEnvelope.Create(m)); + + IStreamingMessage? finalReply = null; + await foreach (var reply in agent.GenerateStreamingReplyAsync(messageChunks)) + { + reply.Should().BeAssignableTo(); + finalReply = reply; + } + + finalReply.Should().BeOfType(); + var textMessage = (TextMessage)finalReply!; + textMessage.GetContent().Should().Be("0123456789"); + } + + [Fact] + public async Task ItProcessAssistantTextMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, ct) => + { + msgs.Count().Should().Be(1); + var innerMessage = msgs.First(); + innerMessage.Should().BeOfType>(); + var message = (IMessage)innerMessage; + message.Content.Value.Should().Be("Hello"); + message.Content.Images.Should().BeNullOrEmpty(); + message.Content.Role.Should().Be("assistant"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(messageConnector); + + // when from is null and role is assistant + var assistantMessage = new TextMessage(Role.Assistant, "Hello"); + await agent.SendAsync(assistantMessage); + + // when from is assistant and role is assistant + assistantMessage = new TextMessage(Role.Assistant, "Hello", from: "assistant"); + await agent.SendAsync(assistantMessage); + + // when from is assistant but role is user + assistantMessage = new TextMessage(Role.User, "Hello", from: "assistant"); + await agent.SendAsync(assistantMessage); + } + + [Fact] + public async Task ItProcessSystemTextMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, ct) => + { + msgs.Count().Should().Be(1); + var innerMessage = msgs.First(); + innerMessage.Should().BeOfType>(); + var message = (IMessage)innerMessage; + message.Content.Value.Should().Be("Hello"); + message.Content.Images.Should().BeNullOrEmpty(); + message.Content.Role.Should().Be("system"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(messageConnector); + + // when role is system + var systemMessage = new TextMessage(Role.System, "Hello"); + await agent.SendAsync(systemMessage); + } + + [Fact] + public async Task ItProcessImageMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, ct) => + { + msgs.Count().Should().Be(1); + var innerMessage = msgs.First(); + innerMessage.Should().BeOfType>(); + var message = (IMessage)innerMessage; + message.Content.Images!.Count.Should().Be(1); + message.Content.Role.Should().Be("user"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(messageConnector); + + var square = Path.Combine("images", "square.png"); + BinaryData imageBinaryData = BinaryData.FromBytes(File.ReadAllBytes(square), "image/png"); + var imageMessage = new ImageMessage(Role.User, imageBinaryData); + await agent.SendAsync(imageMessage); + } + + [Fact] + public async Task ItProcessMultiModalMessageAsync() + { + var messageConnector = new OllamaMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, ct) => + { + msgs.Count().Should().Be(1); + var message = msgs.First(); + message.Should().BeOfType>(); + + var multiModalMessage = (IMessage)message; + multiModalMessage.Content.Images!.Count.Should().Be(1); + multiModalMessage.Content.Value.Should().Be("Hello"); + + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(messageConnector); + + var square = Path.Combine("images", "square.png"); + BinaryData imageBinaryData = BinaryData.FromBytes(File.ReadAllBytes(square), "image/png"); + var imageMessage = new ImageMessage(Role.User, imageBinaryData); + var textMessage = new TextMessage(Role.User, "Hello"); + var multiModalMessage = new MultiModalMessage(Role.User, [textMessage, imageMessage]); + + await agent.SendAsync(multiModalMessage); + } +} diff --git a/dotnet/test/AutoGen.Ollama.Tests/OllamaTextEmbeddingServiceTests.cs b/dotnet/test/AutoGen.Ollama.Tests/OllamaTextEmbeddingServiceTests.cs new file mode 100644 index 00000000000..06522bdd823 --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/OllamaTextEmbeddingServiceTests.cs @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OllamaTextEmbeddingServiceTests.cs + +using AutoGen.Tests; +using FluentAssertions; + +namespace AutoGen.Ollama.Tests; + +public class OllamaTextEmbeddingServiceTests +{ + [ApiKeyFact("OLLAMA_HOST", "OLLAMA_EMBEDDING_MODEL_NAME")] + public async Task GenerateAsync_ReturnsEmbeddings_WhenApiResponseIsSuccessful() + { + string host = Environment.GetEnvironmentVariable("OLLAMA_HOST") + ?? throw new InvalidOperationException("OLLAMA_HOST is not set."); + string embeddingModelName = Environment.GetEnvironmentVariable("OLLAMA_EMBEDDING_MODEL_NAME") + ?? throw new InvalidOperationException("OLLAMA_EMBEDDING_MODEL_NAME is not set."); + var httpClient = new HttpClient + { + BaseAddress = new Uri(host) + }; + var request = new TextEmbeddingsRequest { Model = embeddingModelName, Prompt = "Llamas are members of the camelid family", }; + var service = new OllamaTextEmbeddingService(httpClient); + TextEmbeddingsResponse response = await service.GenerateAsync(request); + response.Should().NotBeNull(); + } +} diff --git a/dotnet/test/AutoGen.Ollama.Tests/images/image.png b/dotnet/test/AutoGen.Ollama.Tests/images/image.png new file mode 100644 index 00000000000..ca276f81f5b --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/images/image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300b7c9d6ba0c23a3e52fbd2e268141ddcca0434a9fb9dcf7e58e7e903d36dcf +size 2126185 diff --git a/dotnet/test/AutoGen.Ollama.Tests/images/square.png b/dotnet/test/AutoGen.Ollama.Tests/images/square.png new file mode 100644 index 00000000000..afb4f4cd4df --- /dev/null +++ b/dotnet/test/AutoGen.Ollama.Tests/images/square.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8323d0b8eceb752e14c29543b2e28bb2fc648ed9719095c31b7708867a4dc918 +size 491 diff --git a/dotnet/test/AutoGen.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt b/dotnet/test/AutoGen.OpenAI.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt similarity index 73% rename from dotnet/test/AutoGen.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt rename to dotnet/test/AutoGen.OpenAI.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt index 2cb58f4d88c..e8e9af84dbd 100644 --- a/dotnet/test/AutoGen.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt +++ b/dotnet/test/AutoGen.OpenAI.Tests/ApprovalTests/OpenAIMessageTests.BasicMessageTest.approved.txt @@ -3,6 +3,7 @@ "OriginalMessage": "TextMessage(system, You are a helpful AI assistant, )", "ConvertedMessages": [ { + "Name": null, "Role": "system", "Content": "You are a helpful AI assistant" } @@ -14,6 +15,7 @@ { "Role": "user", "Content": "Hello", + "Name": "user", "MultiModaItem": null } ] @@ -24,71 +26,20 @@ { "Role": "assistant", "Content": "How can I help you?", + "Name": "assistant", "TooCall": [], "FunctionCallName": null, "FunctionCallArguments": null } ] }, - { - "OriginalMessage": "Message(system, You are a helpful AI assistant, , , )", - "ConvertedMessages": [ - { - "Role": "system", - "Content": "You are a helpful AI assistant" - } - ] - }, - { - "OriginalMessage": "Message(user, Hello, user, , )", - "ConvertedMessages": [ - { - "Role": "user", - "Content": "Hello", - "MultiModaItem": null - } - ] - }, - { - "OriginalMessage": "Message(assistant, How can I help you?, assistant, , )", - "ConvertedMessages": [ - { - "Role": "assistant", - "Content": "How can I help you?", - "TooCall": [], - "FunctionCallName": null, - "FunctionCallArguments": null - } - ] - }, - { - "OriginalMessage": "Message(function, result, user, , )", - "ConvertedMessages": [ - { - "Role": "user", - "Content": "result", - "MultiModaItem": null - } - ] - }, - { - "OriginalMessage": "Message(assistant, , assistant, functionName, functionArguments)", - "ConvertedMessages": [ - { - "Role": "assistant", - "Content": null, - "TooCall": [], - "FunctionCallName": "functionName", - "FunctionCallArguments": "functionArguments" - } - ] - }, { "OriginalMessage": "ImageMessage(user, https://example.com/image.png, user)", "ConvertedMessages": [ { "Role": "user", "Content": null, + "Name": "user", "MultiModaItem": [ { "Type": "Image", @@ -107,6 +58,7 @@ { "Role": "user", "Content": null, + "Name": "user", "MultiModaItem": [ { "Type": "Text", @@ -129,6 +81,7 @@ { "Role": "assistant", "Content": "", + "Name": "assistant", "TooCall": [ { "Type": "Function", @@ -158,12 +111,12 @@ { "Role": "tool", "Content": "test", - "ToolCallId": "result" + "ToolCallId": "result_0" }, { "Role": "tool", "Content": "test", - "ToolCallId": "result" + "ToolCallId": "result_1" } ] }, @@ -173,18 +126,19 @@ { "Role": "assistant", "Content": "", + "Name": "assistant", "TooCall": [ { "Type": "Function", "Name": "test", "Arguments": "test", - "Id": "test" + "Id": "test_0" }, { "Type": "Function", "Name": "test", "Arguments": "test", - "Id": "test" + "Id": "test_1" } ], "FunctionCallName": null, @@ -198,6 +152,7 @@ { "Role": "assistant", "Content": "", + "Name": "assistant", "TooCall": [ { "Type": "Function", diff --git a/dotnet/test/AutoGen.OpenAI.Tests/AutoGen.OpenAI.Tests.csproj b/dotnet/test/AutoGen.OpenAI.Tests/AutoGen.OpenAI.Tests.csproj new file mode 100644 index 00000000000..044975354b8 --- /dev/null +++ b/dotnet/test/AutoGen.OpenAI.Tests/AutoGen.OpenAI.Tests.csproj @@ -0,0 +1,32 @@ + + + + $(TestTargetFramework) + false + True + + + + + + + + + + + + + + + + + + + + $([System.String]::Copy('%(FileName)').Split('.')[0]) + $(ProjectExt.Replace('proj', '')) + %(ParentFile)%(ParentExtension) + + + + diff --git a/dotnet/test/AutoGen.OpenAI.Tests/GlobalUsing.cs b/dotnet/test/AutoGen.OpenAI.Tests/GlobalUsing.cs new file mode 100644 index 00000000000..d66bf001ed5 --- /dev/null +++ b/dotnet/test/AutoGen.OpenAI.Tests/GlobalUsing.cs @@ -0,0 +1,4 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// GlobalUsing.cs + +global using AutoGen.Core; diff --git a/dotnet/test/AutoGen.OpenAI.Tests/MathClassTest.cs b/dotnet/test/AutoGen.OpenAI.Tests/MathClassTest.cs new file mode 100644 index 00000000000..87fc0767020 --- /dev/null +++ b/dotnet/test/AutoGen.OpenAI.Tests/MathClassTest.cs @@ -0,0 +1,223 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// MathClassTest.cs + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using AutoGen.OpenAI.Extension; +using AutoGen.Tests; +using Azure.AI.OpenAI; +using FluentAssertions; +using Xunit.Abstractions; + +namespace AutoGen.OpenAI.Tests +{ + public partial class MathClassTest + { + private readonly ITestOutputHelper _output; + + // as of 2024-05-20, aoai return 500 error when round > 1 + // I'm pretty sure that round > 5 was supported before + // So this is probably some wield regression on aoai side + // I'll keep this test case here for now, plus setting round to 1 + // so the test can still pass. + // In the future, we should rewind this test case to round > 1 (previously was 5) + private int round = 1; + public MathClassTest(ITestOutputHelper output) + { + _output = output; + } + + private Task Print(IEnumerable messages, GenerateReplyOptions? option, IAgent agent, CancellationToken ct) + { + try + { + var reply = agent.GenerateReplyAsync(messages, option, ct).Result; + + _output.WriteLine(reply.FormatMessage()); + return Task.FromResult(reply); + } + catch (Exception) + { + _output.WriteLine("Request failed"); + _output.WriteLine($"agent name: {agent.Name}"); + foreach (var message in messages) + { + _output.WriteLine(message.FormatMessage()); + } + + throw; + } + + } + + [FunctionAttribute] + public async Task CreateMathQuestion(string question, int question_index) + { + return $@"[MATH_QUESTION] +Question {question_index}: +{question} + +Student, please answer"; + } + + [FunctionAttribute] + public async Task AnswerQuestion(string answer) + { + return $@"[MATH_ANSWER] +The answer is {answer} +teacher please check answer"; + } + + [FunctionAttribute] + public async Task AnswerIsCorrect(string message) + { + return $@"[ANSWER_IS_CORRECT] +{message} +please update progress"; + } + + [FunctionAttribute] + public async Task UpdateProgress(int correctAnswerCount) + { + if (correctAnswerCount >= this.round) + { + return $@"[UPDATE_PROGRESS] +{GroupChatExtension.TERMINATE}"; + } + else + { + return $@"[UPDATE_PROGRESS] +the number of resolved question is {correctAnswerCount} +teacher, please create the next math question"; + } + } + + + [ApiKeyFact("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")] + public async Task OpenAIAgentMathChatTestAsync() + { + var key = Environment.GetEnvironmentVariable("AZURE_OPENAI_API_KEY") ?? throw new ArgumentException("AZURE_OPENAI_API_KEY is not set"); + var endPoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new ArgumentException("AZURE_OPENAI_ENDPOINT is not set"); + + var openaiClient = new OpenAIClient(new Uri(endPoint), new Azure.AzureKeyCredential(key)); + var model = "gpt-35-turbo-16k"; + var teacher = await CreateTeacherAgentAsync(openaiClient, model); + var student = await CreateStudentAssistantAgentAsync(openaiClient, model); + + var adminFunctionMiddleware = new FunctionCallMiddleware( + functions: [this.UpdateProgressFunctionContract], + functionMap: new Dictionary>> + { + { this.UpdateProgressFunction.Name!, this.UpdateProgressWrapper }, + }); + var admin = new OpenAIChatAgent( + openAIClient: openaiClient, + modelName: model, + name: "Admin", + systemMessage: $@"You are admin. You update progress after each question is answered.") + .RegisterMessageConnector() + .RegisterStreamingMiddleware(adminFunctionMiddleware) + .RegisterMiddleware(Print); + + var groupAdmin = new OpenAIChatAgent( + openAIClient: openaiClient, + modelName: model, + name: "GroupAdmin", + systemMessage: "You are group admin. You manage the group chat.") + .RegisterMessageConnector() + .RegisterMiddleware(Print); + await RunMathChatAsync(teacher, student, admin, groupAdmin); + } + + private async Task CreateTeacherAgentAsync(OpenAIClient client, string model) + { + var functionCallMiddleware = new FunctionCallMiddleware( + functions: [this.CreateMathQuestionFunctionContract, this.AnswerIsCorrectFunctionContract], + functionMap: new Dictionary>> + { + { this.CreateMathQuestionFunctionContract.Name!, this.CreateMathQuestionWrapper }, + { this.AnswerIsCorrectFunctionContract.Name!, this.AnswerIsCorrectWrapper }, + }); + + var teacher = new OpenAIChatAgent( + openAIClient: client, + name: "Teacher", + systemMessage: @"You are a preschool math teacher. +You create math question and ask student to answer it. +Then you check if the answer is correct. +If the answer is wrong, you ask student to fix it", + modelName: model) + .RegisterMessageConnector() + .RegisterStreamingMiddleware(functionCallMiddleware) + .RegisterMiddleware(Print); + + return teacher; + } + + private async Task CreateStudentAssistantAgentAsync(OpenAIClient client, string model) + { + var functionCallMiddleware = new FunctionCallMiddleware( + functions: [this.AnswerQuestionFunctionContract], + functionMap: new Dictionary>> + { + { this.AnswerQuestionFunctionContract.Name!, this.AnswerQuestionWrapper }, + }); + var student = new OpenAIChatAgent( + openAIClient: client, + name: "Student", + modelName: model, + systemMessage: @"You are a student. You answer math question from teacher.") + .RegisterMessageConnector() + .RegisterStreamingMiddleware(functionCallMiddleware) + .RegisterMiddleware(Print); + + return student; + } + + private async Task RunMathChatAsync(IAgent teacher, IAgent student, IAgent admin, IAgent groupAdmin) + { + var teacher2Student = Transition.Create(teacher, student); + var student2Teacher = Transition.Create(student, teacher); + var teacher2Admin = Transition.Create(teacher, admin); + var admin2Teacher = Transition.Create(admin, teacher); + var workflow = new Graph( + [ + teacher2Student, + student2Teacher, + teacher2Admin, + admin2Teacher, + ]); + var group = new GroupChat( + workflow: workflow, + members: [ + admin, + teacher, + student, + ], + admin: groupAdmin); + + var groupChatManager = new GroupChatManager(group); + var chatHistory = await admin.InitiateChatAsync(groupChatManager, "teacher, create question", maxRound: 50); + + chatHistory.Where(msg => msg.From == teacher.Name && msg.GetContent()?.Contains("[MATH_QUESTION]") is true) + .Count() + .Should().BeGreaterThanOrEqualTo(this.round); + + chatHistory.Where(msg => msg.From == student.Name && msg.GetContent()?.Contains("[MATH_ANSWER]") is true) + .Count() + .Should().BeGreaterThanOrEqualTo(this.round); + + chatHistory.Where(msg => msg.From == teacher.Name && msg.GetContent()?.Contains("[ANSWER_IS_CORRECT]") is true) + .Count() + .Should().BeGreaterThanOrEqualTo(this.round); + + // check if there's terminate chat message from admin + chatHistory.Where(msg => msg.From == admin.Name && msg.IsGroupChatTerminateMessage()) + .Count() + .Should().Be(1); + } + } +} diff --git a/dotnet/test/AutoGen.Tests/OpenAIChatAgentTest.cs b/dotnet/test/AutoGen.OpenAI.Tests/OpenAIChatAgentTest.cs similarity index 93% rename from dotnet/test/AutoGen.Tests/OpenAIChatAgentTest.cs rename to dotnet/test/AutoGen.OpenAI.Tests/OpenAIChatAgentTest.cs index c504eb06a18..284cd3856bb 100644 --- a/dotnet/test/AutoGen.Tests/OpenAIChatAgentTest.cs +++ b/dotnet/test/AutoGen.OpenAI.Tests/OpenAIChatAgentTest.cs @@ -5,12 +5,12 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; -using AutoGen.OpenAI; using AutoGen.OpenAI.Extension; +using AutoGen.Tests; using Azure.AI.OpenAI; using FluentAssertions; -namespace AutoGen.Tests; +namespace AutoGen.OpenAI.Tests; public partial class OpenAIChatAgentTest { @@ -79,7 +79,6 @@ public async Task OpenAIChatMessageContentConnectorTestAsync() new TextMessage(Role.Assistant, "Hello", from: "user"), ], from: "user"), - new Message(Role.Assistant, "Hello", from: "user"), // Message type is going to be deprecated, please use TextMessage instead }; foreach (var message in messages) @@ -133,7 +132,6 @@ public async Task OpenAIChatAgentToolCallTestAsync() new TextMessage(Role.Assistant, question, from: "user"), ], from: "user"), - new Message(Role.Assistant, question, from: "user"), // Message type is going to be deprecated, please use TextMessage instead }; foreach (var message in messages) @@ -202,14 +200,13 @@ public async Task OpenAIChatAgentToolCallInvokingTestAsync() new TextMessage(Role.Assistant, question, from: "user"), ], from: "user"), - new Message(Role.Assistant, question, from: "user"), // Message type is going to be deprecated, please use TextMessage instead }; foreach (var message in messages) { var reply = await functionCallAgent.SendAsync(message); - reply.Should().BeOfType>(); + reply.Should().BeOfType(); reply.From.Should().Be("assistant"); reply.GetToolCalls()!.Count().Should().Be(1); reply.GetToolCalls()!.First().FunctionName.Should().Be(this.GetWeatherAsyncFunctionContract.Name); @@ -229,7 +226,7 @@ public async Task OpenAIChatAgentToolCallInvokingTestAsync() } else { - streamingMessage.Should().BeOfType>(); + streamingMessage.Should().BeOfType(); streamingMessage.As().GetContent()!.ToLower().Should().Contain("seattle"); } } diff --git a/dotnet/test/AutoGen.OpenAI.Tests/OpenAIMessageTests.cs b/dotnet/test/AutoGen.OpenAI.Tests/OpenAIMessageTests.cs new file mode 100644 index 00000000000..81581d068ee --- /dev/null +++ b/dotnet/test/AutoGen.OpenAI.Tests/OpenAIMessageTests.cs @@ -0,0 +1,720 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// OpenAIMessageTests.cs + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Reflection; +using System.Text.Json; +using System.Threading.Tasks; +using ApprovalTests; +using ApprovalTests.Namers; +using ApprovalTests.Reporters; +using AutoGen.Tests; +using Azure.AI.OpenAI; +using FluentAssertions; +using Xunit; + +namespace AutoGen.OpenAI.Tests; + +public class OpenAIMessageTests +{ + private readonly JsonSerializerOptions jsonSerializerOptions = new JsonSerializerOptions + { + WriteIndented = true, + IgnoreReadOnlyProperties = false, + }; + + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("ApprovalTests")] + public void BasicMessageTest() + { + IMessage[] messages = [ + new TextMessage(Role.System, "You are a helpful AI assistant"), + new TextMessage(Role.User, "Hello", "user"), + new TextMessage(Role.Assistant, "How can I help you?", from: "assistant"), + new ImageMessage(Role.User, "https://example.com/image.png", "user"), + new MultiModalMessage(Role.Assistant, + [ + new TextMessage(Role.User, "Hello", "user"), + new ImageMessage(Role.User, "https://example.com/image.png", "user"), + ], "user"), + new ToolCallMessage("test", "test", "assistant"), + new ToolCallResultMessage("result", "test", "test", "user"), + new ToolCallResultMessage( + [ + new ToolCall("result", "test", "test"), + new ToolCall("result", "test", "test"), + ], "user"), + new ToolCallMessage( + [ + new ToolCall("test", "test"), + new ToolCall("test", "test"), + ], "assistant"), + new AggregateMessage( + message1: new ToolCallMessage("test", "test", "assistant"), + message2: new ToolCallResultMessage("result", "test", "test", "assistant"), "assistant"), + ]; + var openaiMessageConnectorMiddleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant"); + + var oaiMessages = messages.Select(m => (m, openaiMessageConnectorMiddleware.ProcessIncomingMessages(agent, [m]))); + VerifyOAIMessages(oaiMessages); + } + + [Fact] + public async Task ItProcessUserTextMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestUserMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("Hello"); + chatRequestMessage.Name.Should().Be("user"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + IMessage message = new TextMessage(Role.User, "Hello", "user"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItShortcutChatRequestMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + + var chatRequestMessage = (ChatRequestUserMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("hello"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var userMessage = new ChatRequestUserMessage("hello"); + var chatRequestMessage = MessageEnvelope.Create(userMessage); + await agent.GenerateReplyAsync([chatRequestMessage]); + } + + [Fact] + public async Task ItShortcutMessageWhenStrictModelIsFalseAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + + var chatRequestMessage = ((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Should().Be("hello"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var userMessage = "hello"; + var chatRequestMessage = MessageEnvelope.Create(userMessage); + await agent.GenerateReplyAsync([chatRequestMessage]); + } + + [Fact] + public async Task ItThrowExceptionWhenStrictModeIsTrueAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(true); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + // user message + var userMessage = "hello"; + var chatRequestMessage = MessageEnvelope.Create(userMessage); + Func action = async () => await agent.GenerateReplyAsync([chatRequestMessage]); + + await action.Should().ThrowAsync().WithMessage("Invalid message type: MessageEnvelope`1"); + } + + [Fact] + public async Task ItProcessAssistantTextMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestAssistantMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("How can I help you?"); + chatRequestMessage.Name.Should().Be("assistant"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // assistant message + IMessage message = new TextMessage(Role.Assistant, "How can I help you?", "assistant"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItProcessSystemTextMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestSystemMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("You are a helpful AI assistant"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // system message + IMessage message = new TextMessage(Role.System, "You are a helpful AI assistant"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItProcessImageMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestUserMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().BeNullOrEmpty(); + chatRequestMessage.Name.Should().Be("user"); + chatRequestMessage.MultimodalContentItems.Count().Should().Be(1); + chatRequestMessage.MultimodalContentItems.First().Should().BeOfType(); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + IMessage message = new ImageMessage(Role.User, "https://example.com/image.png", "user"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItThrowExceptionWhenProcessingImageMessageFromSelfAndStrictModeIsTrueAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(true); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + var imageMessage = new ImageMessage(Role.Assistant, "https://example.com/image.png", "assistant"); + Func action = async () => await agent.GenerateReplyAsync([imageMessage]); + + await action.Should().ThrowAsync().WithMessage("Invalid message type: ImageMessage"); + } + + [Fact] + public async Task ItProcessMultiModalMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestUserMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().BeNullOrEmpty(); + chatRequestMessage.Name.Should().Be("user"); + chatRequestMessage.MultimodalContentItems.Count().Should().Be(2); + chatRequestMessage.MultimodalContentItems.First().Should().BeOfType(); + chatRequestMessage.MultimodalContentItems.Last().Should().BeOfType(); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + IMessage message = new MultiModalMessage( + Role.User, + [ + new TextMessage(Role.User, "Hello", "user"), + new ImageMessage(Role.User, "https://example.com/image.png", "user"), + ], "user"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItThrowExceptionWhenProcessingMultiModalMessageFromSelfAndStrictModeIsTrueAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(true); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + var multiModalMessage = new MultiModalMessage( + Role.Assistant, + [ + new TextMessage(Role.User, "Hello", "assistant"), + new ImageMessage(Role.User, "https://example.com/image.png", "assistant"), + ], "assistant"); + + Func action = async () => await agent.GenerateReplyAsync([multiModalMessage]); + + await action.Should().ThrowAsync().WithMessage("Invalid message type: MultiModalMessage"); + } + + [Fact] + public async Task ItProcessToolCallMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestAssistantMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().BeNullOrEmpty(); + chatRequestMessage.Name.Should().Be("assistant"); + chatRequestMessage.ToolCalls.Count().Should().Be(1); + chatRequestMessage.ToolCalls.First().Should().BeOfType(); + var functionToolCall = (ChatCompletionsFunctionToolCall)chatRequestMessage.ToolCalls.First(); + functionToolCall.Name.Should().Be("test"); + functionToolCall.Id.Should().Be("test"); + functionToolCall.Arguments.Should().Be("test"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + IMessage message = new ToolCallMessage("test", "test", "assistant"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItProcessParallelToolCallMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestAssistantMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().BeNullOrEmpty(); + chatRequestMessage.Name.Should().Be("assistant"); + chatRequestMessage.ToolCalls.Count().Should().Be(2); + for (int i = 0; i < chatRequestMessage.ToolCalls.Count(); i++) + { + chatRequestMessage.ToolCalls.ElementAt(i).Should().BeOfType(); + var functionToolCall = (ChatCompletionsFunctionToolCall)chatRequestMessage.ToolCalls.ElementAt(i); + functionToolCall.Name.Should().Be("test"); + functionToolCall.Id.Should().Be($"test_{i}"); + functionToolCall.Arguments.Should().Be("test"); + } + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var toolCalls = new[] + { + new ToolCall("test", "test"), + new ToolCall("test", "test"), + }; + IMessage message = new ToolCallMessage(toolCalls, "assistant"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItThrowExceptionWhenProcessingToolCallMessageFromUserAndStrictModeIsTrueAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(strictMode: true); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + var toolCallMessage = new ToolCallMessage("test", "test", "user"); + Func action = async () => await agent.GenerateReplyAsync([toolCallMessage]); + await action.Should().ThrowAsync().WithMessage("Invalid message type: ToolCallMessage"); + } + + [Fact] + public async Task ItProcessToolCallResultMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestToolMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("result"); + chatRequestMessage.ToolCallId.Should().Be("test"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + IMessage message = new ToolCallResultMessage("result", "test", "test", "user"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItProcessParallelToolCallResultMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + msgs.Count().Should().Be(2); + + for (int i = 0; i < msgs.Count(); i++) + { + var innerMessage = msgs.ElementAt(i); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestToolMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("result"); + chatRequestMessage.ToolCallId.Should().Be($"test_{i}"); + } + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var toolCalls = new[] + { + new ToolCall("test", "test", "result"), + new ToolCall("test", "test", "result"), + }; + IMessage message = new ToolCallResultMessage(toolCalls, "user"); + await agent.GenerateReplyAsync([message]); + } + + [Fact] + public async Task ItProcessFunctionCallMiddlewareMessageFromUserAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + msgs.Count().Should().Be(1); + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestUserMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("result"); + chatRequestMessage.Name.Should().Be("user"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var toolCallMessage = new ToolCallMessage("test", "test", "user"); + var toolCallResultMessage = new ToolCallResultMessage("result", "test", "test", "user"); + var aggregateMessage = new AggregateMessage(toolCallMessage, toolCallResultMessage, "user"); + await agent.GenerateReplyAsync([aggregateMessage]); + } + + [Fact] + public async Task ItProcessFunctionCallMiddlewareMessageFromAssistantAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + msgs.Count().Should().Be(2); + var innerMessage = msgs.Last(); + innerMessage!.Should().BeOfType>(); + var chatRequestMessage = (ChatRequestToolMessage)((MessageEnvelope)innerMessage!).Content; + chatRequestMessage.Content.Should().Be("result"); + chatRequestMessage.ToolCallId.Should().Be("test"); + + var toolCallMessage = msgs.First(); + toolCallMessage!.Should().BeOfType>(); + var toolCallRequestMessage = (ChatRequestAssistantMessage)((MessageEnvelope)toolCallMessage!).Content; + toolCallRequestMessage.Content.Should().BeNullOrEmpty(); + toolCallRequestMessage.ToolCalls.Count().Should().Be(1); + toolCallRequestMessage.ToolCalls.First().Should().BeOfType(); + var functionToolCall = (ChatCompletionsFunctionToolCall)toolCallRequestMessage.ToolCalls.First(); + functionToolCall.Name.Should().Be("test"); + functionToolCall.Id.Should().Be("test"); + functionToolCall.Arguments.Should().Be("test"); + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var toolCallMessage = new ToolCallMessage("test", "test", "assistant"); + var toolCallResultMessage = new ToolCallResultMessage("result", "test", "test", "assistant"); + var aggregateMessage = new ToolCallAggregateMessage(toolCallMessage, toolCallResultMessage, "assistant"); + await agent.GenerateReplyAsync([aggregateMessage]); + } + + [Fact] + public async Task ItProcessParallelFunctionCallMiddlewareMessageFromAssistantAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(async (msgs, _, innerAgent, _) => + { + msgs.Count().Should().Be(3); + var toolCallMessage = msgs.First(); + toolCallMessage!.Should().BeOfType>(); + var toolCallRequestMessage = (ChatRequestAssistantMessage)((MessageEnvelope)toolCallMessage!).Content; + toolCallRequestMessage.Content.Should().BeNullOrEmpty(); + toolCallRequestMessage.ToolCalls.Count().Should().Be(2); + + for (int i = 0; i < toolCallRequestMessage.ToolCalls.Count(); i++) + { + toolCallRequestMessage.ToolCalls.ElementAt(i).Should().BeOfType(); + var functionToolCall = (ChatCompletionsFunctionToolCall)toolCallRequestMessage.ToolCalls.ElementAt(i); + functionToolCall.Name.Should().Be("test"); + functionToolCall.Id.Should().Be($"test_{i}"); + functionToolCall.Arguments.Should().Be("test"); + } + + for (int i = 1; i < msgs.Count(); i++) + { + var toolCallResultMessage = msgs.ElementAt(i); + toolCallResultMessage!.Should().BeOfType>(); + var toolCallResultRequestMessage = (ChatRequestToolMessage)((MessageEnvelope)toolCallResultMessage!).Content; + toolCallResultRequestMessage.Content.Should().Be("result"); + toolCallResultRequestMessage.ToolCallId.Should().Be($"test_{i - 1}"); + } + + return await innerAgent.GenerateReplyAsync(msgs); + }) + .RegisterMiddleware(middleware); + + // user message + var toolCalls = new[] + { + new ToolCall("test", "test", "result"), + new ToolCall("test", "test", "result"), + }; + var toolCallMessage = new ToolCallMessage(toolCalls, "assistant"); + var toolCallResultMessage = new ToolCallResultMessage(toolCalls, "assistant"); + var aggregateMessage = new AggregateMessage(toolCallMessage, toolCallResultMessage, "assistant"); + await agent.GenerateReplyAsync([aggregateMessage]); + } + + [Fact] + public async Task ItConvertChatResponseMessageToTextMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + // text message + var textMessage = CreateInstance(ChatRole.Assistant, "hello"); + var chatRequestMessage = MessageEnvelope.Create(textMessage); + + var message = await agent.GenerateReplyAsync([chatRequestMessage]); + message.Should().BeOfType(); + message.GetContent().Should().Be("hello"); + message.GetRole().Should().Be(Role.Assistant); + } + + [Fact] + public async Task ItConvertChatResponseMessageToToolCallMessageAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + // tool call message + var toolCallMessage = CreateInstance(ChatRole.Assistant, "", new[] { new ChatCompletionsFunctionToolCall("test", "test", "test") }, new FunctionCall("test", "test"), CreateInstance(), new Dictionary()); + var chatRequestMessage = MessageEnvelope.Create(toolCallMessage); + var message = await agent.GenerateReplyAsync([chatRequestMessage]); + message.Should().BeOfType(); + message.GetToolCalls()!.Count().Should().Be(1); + message.GetToolCalls()!.First().FunctionName.Should().Be("test"); + message.GetToolCalls()!.First().FunctionArguments.Should().Be("test"); + } + + [Fact] + public async Task ItReturnOriginalMessageWhenStrictModeIsFalseAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + // text message + var textMessage = "hello"; + var messageToSend = MessageEnvelope.Create(textMessage); + + var message = await agent.GenerateReplyAsync([messageToSend]); + message.Should().BeOfType>(); + } + + [Fact] + public async Task ItThrowInvalidOperationExceptionWhenStrictModeIsTrueAsync() + { + var middleware = new OpenAIChatRequestMessageConnector(true); + var agent = new EchoAgent("assistant") + .RegisterMiddleware(middleware); + + // text message + var textMessage = new ChatRequestUserMessage("hello"); + var messageToSend = MessageEnvelope.Create(textMessage); + Func action = async () => await agent.GenerateReplyAsync([messageToSend]); + + await action.Should().ThrowAsync().WithMessage("Invalid return message type MessageEnvelope`1"); + } + + [Fact] + public void ToOpenAIChatRequestMessageShortCircuitTest() + { + var agent = new EchoAgent("assistant"); + var middleware = new OpenAIChatRequestMessageConnector(); + ChatRequestMessage[] messages = + [ + new ChatRequestUserMessage("Hello"), + new ChatRequestAssistantMessage("How can I help you?"), + new ChatRequestSystemMessage("You are a helpful AI assistant"), + new ChatRequestFunctionMessage("result", "functionName"), + new ChatRequestToolMessage("test", "test"), + ]; + + foreach (var oaiMessage in messages) + { + IMessage message = new MessageEnvelope(oaiMessage); + var oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); + oaiMessages.Count().Should().Be(1); + //oaiMessages.First().Should().BeOfType>(); + if (oaiMessages.First() is IMessage chatRequestMessage) + { + chatRequestMessage.Content.Should().Be(oaiMessage); + } + else + { + // fail the test + Assert.True(false); + } + } + } + private void VerifyOAIMessages(IEnumerable<(IMessage, IEnumerable)> messages) + { + var jsonObjects = messages.Select(pair => + { + var (originalMessage, ms) = pair; + var objs = new List(); + foreach (var m in ms) + { + object? obj = null; + var chatRequestMessage = (m as IMessage)?.Content; + if (chatRequestMessage is ChatRequestUserMessage userMessage) + { + obj = new + { + Role = userMessage.Role.ToString(), + Content = userMessage.Content, + Name = userMessage.Name, + MultiModaItem = userMessage.MultimodalContentItems?.Select(item => + { + return item switch + { + ChatMessageImageContentItem imageContentItem => new + { + Type = "Image", + ImageUrl = GetImageUrlFromContent(imageContentItem), + } as object, + ChatMessageTextContentItem textContentItem => new + { + Type = "Text", + Text = textContentItem.Text, + } as object, + _ => throw new System.NotImplementedException(), + }; + }), + }; + } + + if (chatRequestMessage is ChatRequestAssistantMessage assistantMessage) + { + obj = new + { + Role = assistantMessage.Role.ToString(), + Content = assistantMessage.Content, + Name = assistantMessage.Name, + TooCall = assistantMessage.ToolCalls.Select(tc => + { + return tc switch + { + ChatCompletionsFunctionToolCall functionToolCall => new + { + Type = "Function", + Name = functionToolCall.Name, + Arguments = functionToolCall.Arguments, + Id = functionToolCall.Id, + } as object, + _ => throw new System.NotImplementedException(), + }; + }), + FunctionCallName = assistantMessage.FunctionCall?.Name, + FunctionCallArguments = assistantMessage.FunctionCall?.Arguments, + }; + } + + if (chatRequestMessage is ChatRequestSystemMessage systemMessage) + { + obj = new + { + Name = systemMessage.Name, + Role = systemMessage.Role.ToString(), + Content = systemMessage.Content, + }; + } + + if (chatRequestMessage is ChatRequestFunctionMessage functionMessage) + { + obj = new + { + Role = functionMessage.Role.ToString(), + Content = functionMessage.Content, + Name = functionMessage.Name, + }; + } + + if (chatRequestMessage is ChatRequestToolMessage toolCallMessage) + { + obj = new + { + Role = toolCallMessage.Role.ToString(), + Content = toolCallMessage.Content, + ToolCallId = toolCallMessage.ToolCallId, + }; + } + + objs.Add(obj ?? throw new System.NotImplementedException()); + } + + return new + { + OriginalMessage = originalMessage.ToString(), + ConvertedMessages = objs, + }; + }); + + var json = JsonSerializer.Serialize(jsonObjects, this.jsonSerializerOptions); + Approvals.Verify(json); + } + + private object? GetImageUrlFromContent(ChatMessageImageContentItem content) + { + return content.GetType().GetProperty("ImageUrl", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)?.GetValue(content); + } + + private static T CreateInstance(params object[] args) + { + var type = typeof(T); + var instance = type.Assembly.CreateInstance( + type.FullName!, false, + BindingFlags.Instance | BindingFlags.NonPublic, + null, args, null, null); + return (T)instance!; + } +} diff --git a/dotnet/test/AutoGen.SemanticKernel.Tests/KernelFunctionMiddlewareTests.cs b/dotnet/test/AutoGen.SemanticKernel.Tests/KernelFunctionMiddlewareTests.cs index f560419e8c8..658cc899e10 100644 --- a/dotnet/test/AutoGen.SemanticKernel.Tests/KernelFunctionMiddlewareTests.cs +++ b/dotnet/test/AutoGen.SemanticKernel.Tests/KernelFunctionMiddlewareTests.cs @@ -30,8 +30,8 @@ public async Task ItRegisterKernelFunctionMiddlewareFromTestPluginTests() var reply = await agent.SendAsync("what's the status of the light?"); reply.GetContent().Should().Be("off"); - reply.Should().BeOfType>(); - if (reply is AggregateMessage aggregateMessage) + reply.Should().BeOfType(); + if (reply is ToolCallAggregateMessage aggregateMessage) { var toolCallMessage = aggregateMessage.Message1; toolCallMessage.ToolCalls.Should().HaveCount(1); @@ -44,8 +44,8 @@ public async Task ItRegisterKernelFunctionMiddlewareFromTestPluginTests() reply = await agent.SendAsync("change the status of the light to on"); reply.GetContent().Should().Be("The status of the light is now on"); - reply.Should().BeOfType>(); - if (reply is AggregateMessage aggregateMessage1) + reply.Should().BeOfType(); + if (reply is ToolCallAggregateMessage aggregateMessage1) { var toolCallMessage = aggregateMessage1.Message1; toolCallMessage.ToolCalls.Should().HaveCount(1); @@ -75,8 +75,8 @@ public async Task ItRegisterKernelFunctionMiddlewareFromMethodTests() var reply = await agent.SendAsync("what's the weather in Seattle?"); reply.GetContent().Should().Be("The weather in Seattle is sunny."); - reply.Should().BeOfType>(); - if (reply is AggregateMessage getWeatherMessage) + reply.Should().BeOfType(); + if (reply is ToolCallAggregateMessage getWeatherMessage) { var toolCallMessage = getWeatherMessage.Message1; toolCallMessage.ToolCalls.Should().HaveCount(1); @@ -88,8 +88,8 @@ public async Task ItRegisterKernelFunctionMiddlewareFromMethodTests() reply = await agent.SendAsync("Create a person object with name: John, email: 12345@gmail.com, age: 30"); reply.GetContent().Should().Be("Name: John, Email: 12345@gmail.com, Age: 30"); - reply.Should().BeOfType>(); - if (reply is AggregateMessage createPersonObjectMessage) + reply.Should().BeOfType(); + if (reply is ToolCallAggregateMessage createPersonObjectMessage) { var toolCallMessage = createPersonObjectMessage.Message1; toolCallMessage.ToolCalls.Should().HaveCount(1); diff --git a/dotnet/test/AutoGen.SourceGenerator.Tests/ApprovalTests/FunctionCallTemplateTests.TestFunctionCallTemplate.approved.txt b/dotnet/test/AutoGen.SourceGenerator.Tests/ApprovalTests/FunctionCallTemplateTests.TestFunctionCallTemplate.approved.txt new file mode 100644 index 00000000000..feab4ebd607 --- /dev/null +++ b/dotnet/test/AutoGen.SourceGenerator.Tests/ApprovalTests/FunctionCallTemplateTests.TestFunctionCallTemplate.approved.txt @@ -0,0 +1,71 @@ +//---------------------- +// +// This code was generated by a tool. +// +//---------------------- +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Threading.Tasks; +using System; +using AutoGen.Core; +using AutoGen.OpenAI.Extension; + +namespace AutoGen.SourceGenerator.Tests +{ + public partial class FunctionExamples + { + + private class AddAsyncSchema + { + [JsonPropertyName(@"a")] + public System.Int32 a {get; set;} + [JsonPropertyName(@"b")] + public System.Int32 b {get; set;} + } + + public System.Threading.Tasks.Task`1[System.String] AddAsyncWrapper(string arguments) + { + var schema = JsonSerializer.Deserialize( + arguments, + new JsonSerializerOptions + { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + }); + + return AddAsync(schema.a, schema.b); + } + + public FunctionContract AddAsyncFunctionContract + { + get => new FunctionContract + { + Name = @"AddAsync", + Description = @"Add two numbers.", + ReturnType = typeof(System.Threading.Tasks.Task`1[System.String]), + Parameters = new [] + { + new FunctionParameterContract + { + Name = @"a", + Description = @"The first number.", + ParameterType = typeof(System.Int32), + IsRequired = true, + }, + new FunctionParameterContract + { + Name = @"b", + Description = @"The second number.", + ParameterType = typeof(System.Int32), + IsRequired = true, + }, + }, + }; + } + + public global::Azure.AI.OpenAI.FunctionDefinition AddAsyncFunction + { + get => this.AddAsyncFunctionContract.ToOpenAIFunctionDefinition(); + } + } +} + diff --git a/dotnet/test/AutoGen.SourceGenerator.Tests/AutoGen.SourceGenerator.Tests.csproj b/dotnet/test/AutoGen.SourceGenerator.Tests/AutoGen.SourceGenerator.Tests.csproj index c4b2a8aa8ce..0d0d91e0522 100644 --- a/dotnet/test/AutoGen.SourceGenerator.Tests/AutoGen.SourceGenerator.Tests.csproj +++ b/dotnet/test/AutoGen.SourceGenerator.Tests/AutoGen.SourceGenerator.Tests.csproj @@ -17,7 +17,7 @@ - + diff --git a/dotnet/test/AutoGen.SourceGenerator.Tests/FunctionCallTemplateTests.cs b/dotnet/test/AutoGen.SourceGenerator.Tests/FunctionCallTemplateTests.cs new file mode 100644 index 00000000000..3c1e6c8ede3 --- /dev/null +++ b/dotnet/test/AutoGen.SourceGenerator.Tests/FunctionCallTemplateTests.cs @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// FunctionCallTemplateTests.cs + +using ApprovalTests; +using ApprovalTests.Namers; +using ApprovalTests.Reporters; +using AutoGen.SourceGenerator.Template; +using Xunit; + +namespace AutoGen.SourceGenerator.Tests; + +public class FunctionCallTemplateTests +{ + [Fact] + [UseReporter(typeof(DiffReporter))] + [UseApprovalSubdirectory("ApprovalTests")] + public void TestFunctionCallTemplate() + { + var functionExample = new FunctionExamples(); + var function = functionExample.AddAsyncFunctionContract; + var functionCallTemplate = new FunctionCallTemplate() + { + ClassName = function.ClassName, + NameSpace = function.Namespace, + FunctionContracts = [new SourceGeneratorFunctionContract() + { + Name = function.Name, + Description = function.Description, + ReturnType = function.ReturnType!.ToString(), + ReturnDescription = function.ReturnDescription, + Parameters = function.Parameters!.Select(p => new SourceGeneratorParameterContract() + { + Name = p.Name, + Description = p.Description, + Type = p.ParameterType!.ToString(), + IsOptional = !p.IsRequired, + JsonType = p.ParameterType!.ToString(), + }).ToArray() + }] + }; + + var actual = functionCallTemplate.TransformText(); + + Approvals.Verify(actual); + } +} diff --git a/dotnet/test/AutoGen.Tests/EchoAgent.cs b/dotnet/test/AutoGen.Tests/EchoAgent.cs index 28a7b91bad5..9cead5ad251 100644 --- a/dotnet/test/AutoGen.Tests/EchoAgent.cs +++ b/dotnet/test/AutoGen.Tests/EchoAgent.cs @@ -3,12 +3,13 @@ using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; namespace AutoGen.Tests { - internal class EchoAgent : IAgent + public class EchoAgent : IStreamingAgent { public EchoAgent(string name) { @@ -27,5 +28,14 @@ public Task GenerateReplyAsync( return Task.FromResult(lastMessage); } + + public async IAsyncEnumerable GenerateStreamingReplyAsync(IEnumerable messages, GenerateReplyOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + foreach (var message in messages) + { + message.From = this.Name; + yield return message; + } + } } } diff --git a/dotnet/test/AutoGen.Tests/MathClassTest.cs b/dotnet/test/AutoGen.Tests/MathClassTest.cs deleted file mode 100644 index 3f1eac76246..00000000000 --- a/dotnet/test/AutoGen.Tests/MathClassTest.cs +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// MathClassTest.cs - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Threading.Tasks; -using AutoGen.OpenAI; -using FluentAssertions; -using Xunit.Abstractions; - -namespace AutoGen.Tests -{ - public partial class MathClassTest - { - private readonly ITestOutputHelper _output; - public MathClassTest(ITestOutputHelper output) - { - _output = output; - } - - [FunctionAttribute] - public async Task CreateMathQuestion(string question, int question_index) - { - return $@"// ignore this line [MATH_QUESTION] -Question #{question_index}: -{question}"; - } - - [FunctionAttribute] - public async Task AnswerQuestion(string answer) - { - return $@"// ignore this line [MATH_ANSWER] -The answer is {answer}, teacher please check answer"; - } - - [FunctionAttribute] - public async Task AnswerIsCorrect(string message) - { - return $@"// ignore this line [ANSWER_IS_CORRECT] -{message}"; - } - - [FunctionAttribute] - public async Task UpdateProgress(int correctAnswerCount) - { - if (correctAnswerCount >= 5) - { - return $@"// ignore this line [UPDATE_PROGRESS] -{GroupChatExtension.TERMINATE}"; - } - else - { - return $@"// ignore this line [UPDATE_PROGRESS] -the number of resolved question is {correctAnswerCount} -teacher, please create the next math question"; - } - } - - - [ApiKeyFact("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")] - public async Task AssistantAgentMathChatTestAsync() - { - var teacher = await CreateTeacherAssistantAgentAsync(); - var student = await CreateStudentAssistantAgentAsync(); - var key = Environment.GetEnvironmentVariable("AZURE_OPENAI_API_KEY") ?? throw new ArgumentException("AZURE_OPENAI_API_KEY is not set"); - var endPoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new ArgumentException("AZURE_OPENAI_ENDPOINT is not set"); - var model = "gpt-35-turbo-16k"; - var admin = new GPTAgent( - name: "Admin", - systemMessage: $@"You are admin. You ask teacher to create 5 math questions. You update progress after each question is answered.", - config: new AzureOpenAIConfig(endPoint, model, key), - functions: new[] - { - this.UpdateProgressFunction, - }, - functionMap: new Dictionary>> - { - { this.UpdateProgressFunction.Name, this.UpdateProgressWrapper }, - }) - .RegisterMiddleware(async (messages, options, agent, ct) => - { - // check admin reply to make sure it calls UpdateProgress function - var maxAttempt = 5; - var reply = await agent.GenerateReplyAsync(messages, options, ct); - while (maxAttempt-- > 0) - { - if (options?.Functions is { Length: 0 }) - { - return reply; - } - - var formattedMessage = reply.FormatMessage(); - this._output.WriteLine(formattedMessage); - if (reply.GetContent()?.Contains("[UPDATE_PROGRESS]") is true) - { - return reply; - } - else - { - await Task.Delay(1000); - var review = "Admin, please update progress based on conversation"; - reply = await agent.SendAsync(review, messages, ct); - } - } - - throw new Exception("Admin does not call UpdateProgress function"); - }); - - await RunMathChatAsync(teacher, student, admin); - } - - private async Task CreateTeacherAssistantAgentAsync() - { - var key = Environment.GetEnvironmentVariable("AZURE_OPENAI_API_KEY") ?? throw new ArgumentException("AZURE_OPENAI_API_KEY is not set"); - var endPoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new ArgumentException("AZURE_OPENAI_ENDPOINT is not set"); - var model = "gpt-35-turbo-16k"; - var config = new AzureOpenAIConfig(endPoint, model, key); - var llmConfig = new ConversableAgentConfig - { - ConfigList = new[] - { - config, - }, - FunctionContracts = new[] - { - this.CreateMathQuestionFunctionContract, - this.AnswerIsCorrectFunctionContract, - }, - }; - - var teacher = new AssistantAgent( - name: "Teacher", - systemMessage: $@"You are a preschool math teacher. -You create math question and ask student to answer it. -Then you check if the answer is correct. -If the answer is wrong, you ask student to fix it. -If the answer is correct, you create another math question. -", - llmConfig: llmConfig, - functionMap: new Dictionary>> - { - { this.CreateMathQuestionFunction.Name, this.CreateMathQuestionWrapper }, - { this.AnswerIsCorrectFunction.Name, this.AnswerIsCorrectWrapper }, - }); - - return teacher; - } - - private async Task CreateStudentAssistantAgentAsync() - { - var key = Environment.GetEnvironmentVariable("AZURE_OPENAI_API_KEY") ?? throw new ArgumentException("AZURE_OPENAI_API_KEY is not set"); - var endPoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new ArgumentException("AZURE_OPENAI_ENDPOINT is not set"); - var model = "gpt-35-turbo-16k"; - var config = new AzureOpenAIConfig(endPoint, model, key); - var llmConfig = new ConversableAgentConfig - { - FunctionContracts = new[] - { - this.AnswerQuestionFunctionContract, - }, - ConfigList = new[] - { - config, - }, - }; - var student = new AssistantAgent( - name: "Student", - systemMessage: $@"You are a student. Here's your workflow in pseudo code: --workflow- -answer_question -if answer is wrong - fix_answer --end- - -Here are a few examples of answer_question: --example 1- -2 - -Here are a few examples of fix_answer: --example 1- -sorry, the answer should be 2, not 3 -", - llmConfig: llmConfig, - functionMap: new Dictionary>> - { - { this.AnswerQuestionFunction.Name, this.AnswerQuestionWrapper } - }); - - return student; - } - - private async Task RunMathChatAsync(IAgent teacher, IAgent student, IAgent admin) - { - var group = new GroupChat( - [ - admin, - teacher, - student, - ], - admin); - - admin.SendIntroduction($@"Welcome to the group chat! I'm admin", group); - teacher.SendIntroduction($@"Hey I'm Teacher", group); - student.SendIntroduction($@"Hey I'm Student", group); - admin.SendIntroduction(@$"Teacher, please create pre-school math question for student and check answer. -Student, for each question, please answer it and ask teacher to check if the answer is correct. -I'll update the progress after each question is answered. -The conversation will end after 5 correct answers. -", group); - - var groupChatManager = new GroupChatManager(group); - var chatHistory = await admin.InitiateChatAsync(groupChatManager, maxRound: 50); - - // print chat history - foreach (var message in chatHistory) - { - _output.WriteLine(message.FormatMessage()); - } - - // check if there's five questions from teacher - chatHistory.Where(msg => msg.From == teacher.Name && msg.GetContent()?.Contains("[MATH_QUESTION]") is true) - .Count() - .Should().BeGreaterThanOrEqualTo(5); - - // check if there's more than five answers from student (answer might be wrong) - chatHistory.Where(msg => msg.From == student.Name && msg.GetContent()?.Contains("[MATH_ANSWER]") is true) - .Count() - .Should().BeGreaterThanOrEqualTo(5); - - // check if there's five answer_is_correct from teacher - chatHistory.Where(msg => msg.From == teacher.Name && msg.GetContent()?.Contains("[ANSWER_IS_CORRECT]") is true) - .Count() - .Should().BeGreaterThanOrEqualTo(5); - - // check if there's terminate chat message from admin - chatHistory.Where(msg => msg.From == admin.Name && msg.IsGroupChatTerminateMessage()) - .Count() - .Should().Be(1); - } - } -} diff --git a/dotnet/test/AutoGen.Tests/OpenAIMessageTests.cs b/dotnet/test/AutoGen.Tests/OpenAIMessageTests.cs deleted file mode 100644 index 6e9cd28c4cb..00000000000 --- a/dotnet/test/AutoGen.Tests/OpenAIMessageTests.cs +++ /dev/null @@ -1,382 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// OpenAIMessageTests.cs - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.Json; -using ApprovalTests; -using ApprovalTests.Namers; -using ApprovalTests.Reporters; -using AutoGen.OpenAI; -using Azure.AI.OpenAI; -using FluentAssertions; -using Xunit; - -namespace AutoGen.Tests; - -public class OpenAIMessageTests -{ - private readonly JsonSerializerOptions jsonSerializerOptions = new JsonSerializerOptions - { - WriteIndented = true, - IgnoreReadOnlyProperties = false, - }; - - [Fact] - [UseReporter(typeof(DiffReporter))] - [UseApprovalSubdirectory("ApprovalTests")] - public void BasicMessageTest() - { - IMessage[] messages = [ - new TextMessage(Role.System, "You are a helpful AI assistant"), - new TextMessage(Role.User, "Hello", "user"), - new TextMessage(Role.Assistant, "How can I help you?", from: "assistant"), - new Message(Role.System, "You are a helpful AI assistant"), - new Message(Role.User, "Hello", "user"), - new Message(Role.Assistant, "How can I help you?", from: "assistant"), - new Message(Role.Function, "result", "user"), - new Message(Role.Assistant, null, "assistant") - { - FunctionName = "functionName", - FunctionArguments = "functionArguments", - }, - new ImageMessage(Role.User, "https://example.com/image.png", "user"), - new MultiModalMessage(Role.Assistant, - [ - new TextMessage(Role.User, "Hello", "user"), - new ImageMessage(Role.User, "https://example.com/image.png", "user"), - ], "user"), - new ToolCallMessage("test", "test", "assistant"), - new ToolCallResultMessage("result", "test", "test", "user"), - new ToolCallResultMessage( - [ - new ToolCall("result", "test", "test"), - new ToolCall("result", "test", "test"), - ], "user"), - new ToolCallMessage( - [ - new ToolCall("test", "test"), - new ToolCall("test", "test"), - ], "assistant"), - new AggregateMessage( - message1: new ToolCallMessage("test", "test", "assistant"), - message2: new ToolCallResultMessage("result", "test", "test", "assistant"), "assistant"), - ]; - var openaiMessageConnectorMiddleware = new OpenAIChatRequestMessageConnector(); - var agent = new EchoAgent("assistant"); - - var oaiMessages = messages.Select(m => (m, openaiMessageConnectorMiddleware.ProcessIncomingMessages(agent, [m]))); - VerifyOAIMessages(oaiMessages); - } - - [Fact] - public void ToOpenAIChatRequestMessageTest() - { - var agent = new EchoAgent("assistant"); - var middleware = new OpenAIChatRequestMessageConnector(); - - // user message - IMessage message = new TextMessage(Role.User, "Hello", "user"); - var oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - var userMessage = (ChatRequestUserMessage)oaiMessages.First(); - userMessage.Content.Should().Be("Hello"); - - // user message test 2 - // even if Role is assistant, it should be converted to user message because it is from the user - message = new TextMessage(Role.Assistant, "Hello", "user"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - userMessage = (ChatRequestUserMessage)oaiMessages.First(); - userMessage.Content.Should().Be("Hello"); - - // user message with multimodal content - // image - message = new ImageMessage(Role.User, "https://example.com/image.png", "user"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - userMessage = (ChatRequestUserMessage)oaiMessages.First(); - userMessage.Content.Should().BeNullOrEmpty(); - userMessage.MultimodalContentItems.Count().Should().Be(1); - userMessage.MultimodalContentItems.First().Should().BeOfType(); - - // text and image - message = new MultiModalMessage( - Role.User, - [ - new TextMessage(Role.User, "Hello", "user"), - new ImageMessage(Role.User, "https://example.com/image.png", "user"), - ], "user"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - userMessage = (ChatRequestUserMessage)oaiMessages.First(); - userMessage.Content.Should().BeNullOrEmpty(); - userMessage.MultimodalContentItems.Count().Should().Be(2); - userMessage.MultimodalContentItems.First().Should().BeOfType(); - - // assistant text message - message = new TextMessage(Role.Assistant, "How can I help you?", "assistant"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - var assistantMessage = (ChatRequestAssistantMessage)oaiMessages.First(); - assistantMessage.Content.Should().Be("How can I help you?"); - - // assistant text message with single tool call - message = new ToolCallMessage("test", "test", "assistant"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - assistantMessage = (ChatRequestAssistantMessage)oaiMessages.First(); - assistantMessage.Content.Should().BeNullOrEmpty(); - assistantMessage.ToolCalls.Count().Should().Be(1); - assistantMessage.ToolCalls.First().Should().BeOfType(); - - // user should not suppose to send tool call message - message = new ToolCallMessage("test", "test", "user"); - Func action = () => middleware.ProcessIncomingMessages(agent, [message]).First(); - action.Should().Throw().WithMessage("ToolCallMessage is not supported when message.From is not the same with agent"); - - // assistant text message with multiple tool calls - message = new ToolCallMessage( - toolCalls: - [ - new ToolCall("test", "test"), - new ToolCall("test", "test"), - ], "assistant"); - - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - assistantMessage = (ChatRequestAssistantMessage)oaiMessages.First(); - assistantMessage.Content.Should().BeNullOrEmpty(); - assistantMessage.ToolCalls.Count().Should().Be(2); - - // tool call result message - message = new ToolCallResultMessage("result", "test", "test", "user"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - var toolCallMessage = (ChatRequestToolMessage)oaiMessages.First(); - toolCallMessage.Content.Should().Be("result"); - - // tool call result message with multiple tool calls - message = new ToolCallResultMessage( - toolCalls: - [ - new ToolCall("result", "test", "test"), - new ToolCall("result", "test", "test"), - ], "user"); - - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(2); - oaiMessages.First().Should().BeOfType(); - toolCallMessage = (ChatRequestToolMessage)oaiMessages.First(); - toolCallMessage.Content.Should().Be("test"); - oaiMessages.Last().Should().BeOfType(); - toolCallMessage = (ChatRequestToolMessage)oaiMessages.Last(); - toolCallMessage.Content.Should().Be("test"); - - // aggregate message test - // aggregate message with tool call and tool call result will be returned by GPT agent if the tool call is automatically invoked inside agent - message = new AggregateMessage( - message1: new ToolCallMessage("test", "test", "assistant"), - message2: new ToolCallResultMessage("result", "test", "test", "assistant"), "assistant"); - - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(2); - oaiMessages.First().Should().BeOfType(); - assistantMessage = (ChatRequestAssistantMessage)oaiMessages.First(); - assistantMessage.Content.Should().BeNullOrEmpty(); - assistantMessage.ToolCalls.Count().Should().Be(1); - - oaiMessages.Last().Should().BeOfType(); - toolCallMessage = (ChatRequestToolMessage)oaiMessages.Last(); - toolCallMessage.Content.Should().Be("result"); - - // aggregate message test 2 - // if the aggregate message is from user, it should be converted to user message - message = new AggregateMessage( - message1: new ToolCallMessage("test", "test", "user"), - message2: new ToolCallResultMessage("result", "test", "test", "user"), "user"); - - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - userMessage = (ChatRequestUserMessage)oaiMessages.First(); - userMessage.Content.Should().Be("result"); - - // aggregate message test 3 - // if the aggregate message is from user and contains multiple tool call results, it should be converted to user message - message = new AggregateMessage( - message1: new ToolCallMessage( - toolCalls: - [ - new ToolCall("test", "test"), - new ToolCall("test", "test"), - ], from: "user"), - message2: new ToolCallResultMessage( - toolCalls: - [ - new ToolCall("result", "test", "test"), - new ToolCall("result", "test", "test"), - ], from: "user"), "user"); - - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - oaiMessages.Count().Should().Be(2); - oaiMessages.First().Should().BeOfType(); - oaiMessages.Last().Should().BeOfType(); - - // system message - message = new TextMessage(Role.System, "You are a helpful AI assistant"); - oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().BeOfType(); - } - - [Fact] - public void ToOpenAIChatRequestMessageShortCircuitTest() - { - var agent = new EchoAgent("assistant"); - var middleware = new OpenAIChatRequestMessageConnector(); - ChatRequestMessage[] messages = - [ - new ChatRequestUserMessage("Hello"), - new ChatRequestAssistantMessage("How can I help you?"), - new ChatRequestSystemMessage("You are a helpful AI assistant"), - new ChatRequestFunctionMessage("result", "functionName"), - new ChatRequestToolMessage("test", "test"), - ]; - - foreach (var oaiMessage in messages) - { - IMessage message = new MessageEnvelope(oaiMessage); - var oaiMessages = middleware.ProcessIncomingMessages(agent, [message]); - oaiMessages.Count().Should().Be(1); - oaiMessages.First().Should().Be(oaiMessage); - } - } - private void VerifyOAIMessages(IEnumerable<(IMessage, IEnumerable)> messages) - { - var jsonObjects = messages.Select(pair => - { - var (originalMessage, ms) = pair; - var objs = new List(); - foreach (var m in ms) - { - object? obj = null; - if (m is ChatRequestUserMessage userMessage) - { - obj = new - { - Role = userMessage.Role.ToString(), - Content = userMessage.Content, - MultiModaItem = userMessage.MultimodalContentItems?.Select(item => - { - return item switch - { - ChatMessageImageContentItem imageContentItem => new - { - Type = "Image", - ImageUrl = GetImageUrlFromContent(imageContentItem), - } as object, - ChatMessageTextContentItem textContentItem => new - { - Type = "Text", - Text = textContentItem.Text, - } as object, - _ => throw new System.NotImplementedException(), - }; - }), - }; - } - - if (m is ChatRequestAssistantMessage assistantMessage) - { - obj = new - { - Role = assistantMessage.Role.ToString(), - Content = assistantMessage.Content, - TooCall = assistantMessage.ToolCalls.Select(tc => - { - return tc switch - { - ChatCompletionsFunctionToolCall functionToolCall => new - { - Type = "Function", - Name = functionToolCall.Name, - Arguments = functionToolCall.Arguments, - Id = functionToolCall.Id, - } as object, - _ => throw new System.NotImplementedException(), - }; - }), - FunctionCallName = assistantMessage.FunctionCall?.Name, - FunctionCallArguments = assistantMessage.FunctionCall?.Arguments, - }; - } - - if (m is ChatRequestSystemMessage systemMessage) - { - obj = new - { - Role = systemMessage.Role.ToString(), - Content = systemMessage.Content, - }; - } - - if (m is ChatRequestFunctionMessage functionMessage) - { - obj = new - { - Role = functionMessage.Role.ToString(), - Content = functionMessage.Content, - Name = functionMessage.Name, - }; - } - - if (m is ChatRequestToolMessage toolCallMessage) - { - obj = new - { - Role = toolCallMessage.Role.ToString(), - Content = toolCallMessage.Content, - ToolCallId = toolCallMessage.ToolCallId, - }; - } - - objs.Add(obj ?? throw new System.NotImplementedException()); - } - - return new - { - OriginalMessage = originalMessage.ToString(), - ConvertedMessages = objs, - }; - }); - - var json = JsonSerializer.Serialize(jsonObjects, this.jsonSerializerOptions); - Approvals.Verify(json); - } - - private object? GetImageUrlFromContent(ChatMessageImageContentItem content) - { - return content.GetType().GetProperty("ImageUrl", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)?.GetValue(content); - } -} diff --git a/dotnet/test/AutoGen.Tests/SingleAgentTest.cs b/dotnet/test/AutoGen.Tests/SingleAgentTest.cs index 79d2b9c2f3f..b784ff8da03 100644 --- a/dotnet/test/AutoGen.Tests/SingleAgentTest.cs +++ b/dotnet/test/AutoGen.Tests/SingleAgentTest.cs @@ -118,7 +118,6 @@ public async Task GPTFunctionCallAgentTestAsync() var agentWithFunction = new GPTAgent("gpt", "You are a helpful AI assistant", config, 0, functions: new[] { this.EchoAsyncFunction }); await EchoFunctionCallTestAsync(agentWithFunction); - await UpperCaseTestAsync(agentWithFunction); } [ApiKeyFact("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")] @@ -144,7 +143,6 @@ public async Task AssistantAgentFunctionCallTestAsync() llmConfig: llmConfig); await EchoFunctionCallTestAsync(assistantAgent); - await UpperCaseTestAsync(assistantAgent); } [Fact] @@ -286,7 +284,7 @@ public async Task EchoFunctionCallExecutionTestAsync(IAgent agent) reply.GetContent().Should().Be("[ECHO] Hello world"); reply.From.Should().Be(agent.Name); - reply.Should().BeOfType>(); + reply.Should().BeOfType(); } public async Task EchoFunctionCallExecutionStreamingTestAsync(IStreamingAgent agent) @@ -306,7 +304,7 @@ public async Task EchoFunctionCallExecutionStreamingTestAsync(IStreamingAgent ag finalReply = reply; } - if (finalReply is AggregateMessage aggregateMessage) + if (finalReply is ToolCallAggregateMessage aggregateMessage) { var toolCallResultMessage = aggregateMessage.Message2; toolCallResultMessage.ToolCalls.First().Result.Should().Be(answer); diff --git a/dotnet/test/AutoGen.Tests/TwoAgentTest.cs b/dotnet/test/AutoGen.Tests/TwoAgentTest.cs index 91437eaa618..5c9a350e01b 100644 --- a/dotnet/test/AutoGen.Tests/TwoAgentTest.cs +++ b/dotnet/test/AutoGen.Tests/TwoAgentTest.cs @@ -68,7 +68,7 @@ public async Task TwoAgentWeatherChatTestAsync() else { // terminate message - return new Message(Role.Assistant, GroupChatExtension.TERMINATE); + return new TextMessage(Role.Assistant, GroupChatExtension.TERMINATE); } }) .RegisterMiddleware(async (msgs, option, agent, ct) => diff --git a/dotnet/test/AutoGen.Tests/WorkflowTest.cs b/dotnet/test/AutoGen.Tests/WorkflowTest.cs index d57cf2126c4..d1d12010e39 100644 --- a/dotnet/test/AutoGen.Tests/WorkflowTest.cs +++ b/dotnet/test/AutoGen.Tests/WorkflowTest.cs @@ -30,12 +30,12 @@ public async Task TransitionTestAsync() var canTransit = await aliceToBob.CanTransitionAsync([]); canTransit.Should().BeFalse(); - canTransit = await aliceToBob.CanTransitionAsync(new[] { new Message(Role.Assistant, "Hello") }); + canTransit = await aliceToBob.CanTransitionAsync(new[] { new TextMessage(Role.Assistant, "Hello") }); canTransit.Should().BeTrue(); // if no function is provided, it should always return true var aliceToBobNoFunction = Transition.Create(alice, bob); - canTransit = await aliceToBobNoFunction.CanTransitionAsync(new[] { new Message(Role.Assistant, "Hello") }); + canTransit = await aliceToBobNoFunction.CanTransitionAsync(new[] { new TextMessage(Role.Assistant, "Hello") }); canTransit.Should().BeTrue(); } diff --git a/dotnet/website/articles/Built-in-messages.md b/dotnet/website/articles/Built-in-messages.md index 2767091bd76..3a3754a3058 100644 --- a/dotnet/website/articles/Built-in-messages.md +++ b/dotnet/website/articles/Built-in-messages.md @@ -7,12 +7,15 @@ Besides, AutoGen also provides a set of built-in message types that implement th > [!NOTE] > The minimal requirement for an agent to be used as admin in @AutoGen.Core.GroupChat is to support @AutoGen.Core.TextMessage. +> [!NOTE] +> @AutoGen.Core.Message will be deprecated in 0.0.14. Please replace it with a more specific message type like @AutoGen.Core.TextMessage, @AutoGen.Core.ImageMessage, etc. + - @AutoGen.Core.TextMessage: A message that contains a piece of text. - @AutoGen.Core.ImageMessage: A message that contains an image. - @AutoGen.Core.MultiModalMessage: A message that contains multiple modalities like text, image, etc. - @AutoGen.Core.ToolCallMessage: A message that represents a function call request. - @AutoGen.Core.ToolCallResultMessage: A message that represents a function call result. -- @AutoGen.Core.AggregateMessage`2: A message that represents an aggregate message that contains multiple sub-messages. This type of message is used by @AutoGen.Core.FunctionCallMiddleware to aggregate both @AutoGen.Core.ToolCallMessage and @AutoGen.Core.ToolCallResultMessage into a single message. +- @AutoGen.Core.ToolCallAggregateMessage: A message that contains both @AutoGen.Core.ToolCallMessage and @AutoGen.Core.ToolCallResultMessage. This type of message is used by @AutoGen.Core.FunctionCallMiddleware to aggregate both @AutoGen.Core.ToolCallMessage and @AutoGen.Core.ToolCallResultMessage into a single message. - @AutoGen.Core.MessageEnvelope`1: A message that represents an envelope that contains a message of any type. - @AutoGen.Core.Message: The original message type before 0.0.9. This message type is reserved for backward compatibility. It is recommended to replace it with a more specific message type like @AutoGen.Core.TextMessage, @AutoGen.Core.ImageMessage, etc. diff --git a/dotnet/website/articles/MistralChatAgent-count-token-usage.md b/dotnet/website/articles/MistralChatAgent-count-token-usage.md index b7f025aa11d..261845cf615 100644 --- a/dotnet/website/articles/MistralChatAgent-count-token-usage.md +++ b/dotnet/website/articles/MistralChatAgent-count-token-usage.md @@ -4,7 +4,7 @@ The following example shows how to create a `MistralAITokenCounterMiddleware` @A To collect the token usage for the entire chat session, one easy solution is simply collect all the responses from agent and sum up the token usage for each response. To collect all the agent responses, we can create a middleware which simply saves all responses to a list and register it with the agent. To get the token usage information for each response, because in the example we are using @AutoGen.Mistral.MistralClientAgent, we can simply get the token usage from the response object. > [!NOTE] -> You can find the complete example in the [Example13_OpenAIAgent_JsonMode](https://github.com/microsoft/autogen/tree/dotnet/dotnet/sample/AutoGen.BasicSamples/Example14_MistralClientAgent_TokenCount.cs). +> You can find the complete example in the [Example13_OpenAIAgent_JsonMode](https://github.com/microsoft/autogen/tree/main/dotnet/sample/AutoGen.BasicSamples/Example14_MistralClientAgent_TokenCount.cs). - Step 1: Adding using statement [!code-csharp[](../../sample/AutoGen.BasicSamples/Example14_MistralClientAgent_TokenCount.cs?name=using_statements)] diff --git a/dotnet/website/articles/OpenAIChatAgent-connect-to-third-party-api.md b/dotnet/website/articles/OpenAIChatAgent-connect-to-third-party-api.md index 2072c0f99a8..8321fc87a5c 100644 --- a/dotnet/website/articles/OpenAIChatAgent-connect-to-third-party-api.md +++ b/dotnet/website/articles/OpenAIChatAgent-connect-to-third-party-api.md @@ -1,7 +1,7 @@ The following example shows how to connect to third-party OpenAI API using @AutoGen.OpenAI.OpenAIChatAgent. > [!NOTE] -> You can find the complete code of this example in [Example16_OpenAIChatAgent_ConnectToThirdPartyBackend](https://github.com/microsoft/autogen/tree/dotnet/dotnet/sample/AutoGen.BasicSamples/Example16_OpenAIChatAgent_ConnectToThirdPartyBackend.cs). +> You can find the complete code of this example in [Example16_OpenAIChatAgent_ConnectToThirdPartyBackend](https://github.com/microsoft/autogen/tree/main/dotnet/sample/AutoGen.BasicSamples/Example16_OpenAIChatAgent_ConnectToThirdPartyBackend.cs). ## Overview A lot of LLM applications/platforms support spinning up a chat server that is compatible with OpenAI API, such as LM Studio, Ollama, Mistral etc. This means that you can connect to these servers using the @AutoGen.OpenAI.OpenAIChatAgent. diff --git a/dotnet/website/articles/OpenAIChatAgent-use-json-mode.md b/dotnet/website/articles/OpenAIChatAgent-use-json-mode.md index 4d69340f585..a822cb04633 100644 --- a/dotnet/website/articles/OpenAIChatAgent-use-json-mode.md +++ b/dotnet/website/articles/OpenAIChatAgent-use-json-mode.md @@ -9,7 +9,7 @@ JSON mode is a new feature in OpenAI which allows you to instruct model to alway ## How to enable JSON mode in OpenAIChatAgent. > [!NOTE] -> You can find the complete example in the [Example13_OpenAIAgent_JsonMode](https://github.com/microsoft/autogen/tree/dotnet/dotnet/sample/AutoGen.BasicSamples/Example13_OpenAIAgent_JsonMode.cs). +> You can find the complete example in the [Example13_OpenAIAgent_JsonMode](https://github.com/microsoft/autogen/tree/main/dotnet/sample/AutoGen.BasicSamples/Example13_OpenAIAgent_JsonMode.cs). To enable JSON mode for @AutoGen.OpenAI.OpenAIChatAgent, set `responseFormat` to `ChatCompletionsResponseFormat.JsonObject` when creating the agent. Note that when enabling JSON mode, you also need to instruct the agent to output JSON format in its system message. diff --git a/notebook/agentchat_agentoptimizer.ipynb b/notebook/agentchat_agentoptimizer.ipynb index 13df0a0d77a..7177703ab06 100644 --- a/notebook/agentchat_agentoptimizer.ipynb +++ b/notebook/agentchat_agentoptimizer.ipynb @@ -53,7 +53,7 @@ "source": [ "# MathUserProxy with function_call\n", "\n", - "This agent is a customized MathUserProxy inherits from its [partent class](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/contrib/math_user_proxy_agent.py).\n", + "This agent is a customized MathUserProxy inherits from its [parent class](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/contrib/math_user_proxy_agent.py).\n", "\n", "It supports using both function_call and python to solve math problems.\n" ] diff --git a/notebook/agentchat_capability_long_context_handling.ipynb b/notebook/agentchat_capability_long_context_handling.ipynb index 0bc1b4ffdd7..0a9d715e3e5 100644 --- a/notebook/agentchat_capability_long_context_handling.ipynb +++ b/notebook/agentchat_capability_long_context_handling.ipynb @@ -6,8 +6,8 @@ "source": [ "# Handling A Long Context via `TransformChatHistory`\n", "\n", - "
\n", - " Deprecation Notice: TransformChatHistory is no longer supported. Please use TransformMessages as the new standard method. For the latest examples, visit the notebook at notebook/agentchat_transform_messages.ipynb.\n", + "
\n", + " Deprecation Notice: TransformChatHistory is no longer supported and will be removed in version 0.2.30. Please transition to using TransformMessages as the new standard method. For a detailed introduction to this method, including how to limit the number of tokens in message context history to replace TransformChatHistory, visit our guide Introduction to Transform Messages.\n", "
\n", "\n", "This notebook illustrates how you can use the `TransformChatHistory` capability to give any `Conversable` agent an ability to handle a long context. \n", @@ -665,7 +665,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -679,9 +679,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.9" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebook/agentchat_compression.ipynb b/notebook/agentchat_compression.ipynb index 29cc2d9e224..d7f3a7204db 100644 --- a/notebook/agentchat_compression.ipynb +++ b/notebook/agentchat_compression.ipynb @@ -6,9 +6,9 @@ "source": [ "# Conversations with Chat History Compression Enabled\n", "\n", - "**CompressibleAgent will be deprecated.** \n", - "\n", - "Refer to https://github.com/microsoft/autogen/blob/main/notebook/agentchat_capability_long_context_handling.ipynb for long context handling capability.\n", + "
\n", + " Deprecation Notice: CompressibleAgent has been deprecated and will no longer be available as of version 0.2.30. Please transition to using TransformMessages, which is now the recommended approach. For a detailed guide on implementing this new standard, refer to our user guide on Compressing Text with LLMLingua. This guide provides examples for effectively utilizing LLMLingua transform as a replacement for CompressibleAgent.\n", + "
\n", "\n", "AutoGen offers conversable agents powered by LLM, tools, or humans, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participance through multi-agent conversation. Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", "\n", @@ -854,7 +854,7 @@ ], "metadata": { "kernelspec": { - "display_name": "msft", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -868,10 +868,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4 + "version": "3.11.9" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebook/agentchat_logging.ipynb b/notebook/agentchat_logging.ipynb index 2ad19e7995a..7eb4138b4cc 100644 --- a/notebook/agentchat_logging.ipynb +++ b/notebook/agentchat_logging.ipynb @@ -8,6 +8,10 @@ "\n", "AutoGen offers utilities to log data for debugging and performance analysis. This notebook demonstrates how to use them. \n", "\n", + "we log data in different modes:\n", + "- SQlite Database\n", + "- File \n", + "\n", "In general, users can initiate logging by calling `autogen.runtime_logging.start()` and stop logging by calling `autogen.runtime_logging.stop()`" ] }, @@ -287,6 +291,82 @@ " + str(round(session_cost, 4))\n", ")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Log data in File mode\n", + "\n", + "By default, the log type is set to `sqlite` as shown above, but we introduced a new parameter for the `autogen.runtime_logging.start()`\n", + "\n", + "the `logger_type = \"file\"` will start to log data in the File mode." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Logging session ID: ed493ebf-d78e-49f0-b832-69557276d557\n", + "\u001b[33muser_proxy\u001b[0m (to assistant):\n", + "\n", + "What is the height of the Eiffel Tower? Only respond with the answer and terminate\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to user_proxy):\n", + "\n", + "The height of the Eiffel Tower is 330 meters.\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "\n", + "import pandas as pd\n", + "\n", + "import autogen\n", + "from autogen import AssistantAgent, UserProxyAgent\n", + "\n", + "# Setup API key. Add your own API key to config file or environment variable\n", + "llm_config = {\n", + " \"config_list\": autogen.config_list_from_json(\n", + " env_or_file=\"OAI_CONFIG_LIST\",\n", + " ),\n", + " \"temperature\": 0.9,\n", + "}\n", + "\n", + "# Start logging with logger_type and the filename to log to\n", + "logging_session_id = autogen.runtime_logging.start(logger_type=\"file\", config={\"filename\": \"runtime.log\"})\n", + "print(\"Logging session ID: \" + str(logging_session_id))\n", + "\n", + "# Create an agent workflow and run it\n", + "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)\n", + "user_proxy = UserProxyAgent(\n", + " name=\"user_proxy\",\n", + " code_execution_config=False,\n", + " human_input_mode=\"NEVER\",\n", + " is_termination_msg=lambda msg: \"TERMINATE\" in msg[\"content\"],\n", + ")\n", + "\n", + "user_proxy.initiate_chat(\n", + " assistant, message=\"What is the height of the Eiffel Tower? Only respond with the answer and terminate\"\n", + ")\n", + "autogen.runtime_logging.stop()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This should create a `runtime.log` file in your current directory. " + ] } ], "metadata": { @@ -312,7 +392,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.9.13" } }, "nbformat": 4, diff --git a/notebook/agenteval_cq_math.ipynb b/notebook/agenteval_cq_math.ipynb index 71a19b044a7..43ea28de1a3 100644 --- a/notebook/agenteval_cq_math.ipynb +++ b/notebook/agenteval_cq_math.ipynb @@ -17,12 +17,12 @@ "source": [ "# Demonstrating the `AgentEval` framework using the task of solving math problems as an example\n", "\n", - "This notebook aims to demonstrate how to `AgentEval` implemented through [AutoGen](https://github.com/microsoft/autogen) works, where we use a math problem-solving task as an example. \n", - "`AgentEval` consists of two key components:\n", + "This notebook aims to demonstrate how to `AgentEval` implemented through [AutoGen](https://github.com/microsoft/autogen) works in an offline scenario, where we use a math problem-solving task as an example. \n", + "`AgentEval` consists of two key steps:\n", "\n", - "- `CriticAgent`: This is an LLM-based agent that generates a list criteria $(c_1, \\dots, c_n)$ to help to evaluate a utility given task.\n", + "- `generate_criteria`: This is an LLM-based function that generates a list of criteria $(c_1, \\dots, c_n)$ to help to evaluate a utility given task.\n", "\n", - "- `QuantifierAgent`: This agent quantifies the performance of any sample task based on the criteria designed by the `CriticAgent` in the following way: $(c_1=a_1, \\dots, c_n=a_n)$\n", + "- `quantify_criteria`: This function quantifies the performance of any sample task based on the criteria generated in the `generate_criteria` step in the following way: $(c_1=a_1, \\dots, c_n=a_n)$\n", "\n", "![AgentEval](../website/blog/2023-11-20-AgentEval/img/agenteval-CQ.png)\n", "\n", @@ -49,7 +49,70 @@ "id": "68lTZZyJ1_BI", "outputId": "15a55fab-e13a-4654-b8cb-ae117478d6d8" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: pyautogen>=0.2.3 in /home/vscode/.local/lib/python3.10/site-packages (0.2.17)\n", + "Requirement already satisfied: docker in /home/vscode/.local/lib/python3.10/site-packages (7.0.0)\n", + "Requirement already satisfied: diskcache in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (5.6.3)\n", + "Requirement already satisfied: flaml in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (2.1.2)\n", + "Requirement already satisfied: tiktoken in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (0.6.0)\n", + "Requirement already satisfied: openai>=1.3 in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (1.14.1)\n", + "Requirement already satisfied: pydantic!=2.6.0,<3,>=1.10 in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (2.6.4)\n", + "Requirement already satisfied: termcolor in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (2.4.0)\n", + "Requirement already satisfied: python-dotenv in /home/vscode/.local/lib/python3.10/site-packages (from pyautogen>=0.2.3) (1.0.1)\n", + "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/site-packages (from docker) (2.31.0)\n", + "Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/site-packages (from docker) (24.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/site-packages (from docker) (2.2.1)\n", + "Requirement already satisfied: tqdm>4 in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (4.66.2)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (0.27.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (1.9.0)\n", + "Requirement already satisfied: sniffio in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (1.3.1)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (4.3.0)\n", + "Requirement already satisfied: typing-extensions<5,>=4.7 in /home/vscode/.local/lib/python3.10/site-packages (from openai>=1.3->pyautogen>=0.2.3) (4.10.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /home/vscode/.local/lib/python3.10/site-packages (from pydantic!=2.6.0,<3,>=1.10->pyautogen>=0.2.3) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /home/vscode/.local/lib/python3.10/site-packages (from pydantic!=2.6.0,<3,>=1.10->pyautogen>=0.2.3) (2.16.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/site-packages (from requests>=2.26.0->docker) (2024.2.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/site-packages (from requests>=2.26.0->docker) (3.6)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/site-packages (from requests>=2.26.0->docker) (3.3.2)\n", + "Requirement already satisfied: NumPy>=1.17 in /home/vscode/.local/lib/python3.10/site-packages (from flaml->pyautogen>=0.2.3) (1.26.4)\n", + "Requirement already satisfied: regex>=2022.1.18 in /home/vscode/.local/lib/python3.10/site-packages (from tiktoken->pyautogen>=0.2.3) (2023.12.25)\n", + "Requirement already satisfied: exceptiongroup>=1.0.2 in /home/vscode/.local/lib/python3.10/site-packages (from anyio<5,>=3.5.0->openai>=1.3->pyautogen>=0.2.3) (1.2.0)\n", + "Requirement already satisfied: httpcore==1.* in /home/vscode/.local/lib/python3.10/site-packages (from httpx<1,>=0.23.0->openai>=1.3->pyautogen>=0.2.3) (1.0.4)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /home/vscode/.local/lib/python3.10/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.3->pyautogen>=0.2.3) (0.14.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: scipy in /home/vscode/.local/lib/python3.10/site-packages (1.12.0)\n", + "Requirement already satisfied: numpy<1.29.0,>=1.22.4 in /home/vscode/.local/lib/python3.10/site-packages (from scipy) (1.26.4)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: matplotlib in /home/vscode/.local/lib/python3.10/site-packages (3.8.3)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/site-packages (from matplotlib) (24.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (3.1.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (1.2.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (4.50.0)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: cycler>=0.10 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: pillow>=8 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (10.2.0)\n", + "Requirement already satisfied: numpy<2,>=1.21 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (1.26.4)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/vscode/.local/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: six>=1.5 in /home/vscode/.local/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install \"pyautogen>=0.2.3\" docker\n", "%pip install scipy\n", @@ -63,11 +126,6 @@ }, "source": [ "## Set your API Endpoint\n", - "\n", - "* The [`config_list_openai_aoai`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_openai_aoai) function tries to create a list of configurations using Azure OpenAI endpoints and OpenAI endpoints. It assumes the api keys and api bases are stored in the corresponding environment variables or local txt files:\n", - " - OpenAI API key: os.environ[\"OPENAI_API_KEY\"] or `openai_api_key_file=\"key_openai.txt\"`.\n", - " - Azure OpenAI API key: os.environ[\"AZURE_OPENAI_API_KEY\"] or `aoai_api_key_file=\"key_aoai.txt\"`. Multiple keys can be stored, one per line.\n", - " - Azure OpenAI API base: os.environ[\"AZURE_OPENAI_API_BASE\"] or `aoai_api_base_file=\"base_aoai.txt\"`. Multiple bases can be stored, one per line.\n", "* The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file. It first looks for an environment variable with a specified name. The value of the environment variable needs to be a valid json string. If that variable is not found, it looks for a json file with the same name. It filters the configs by filter_dict.\n", "\n", "You can set the value of config_list in any way you prefer. Please refer to this [notebook](https://github.com/microsoft/autogen/blob/main/notebook/oai_openai_utils.ipynb) for full code examples of the different methods.\n" @@ -90,68 +148,11 @@ "import scipy.stats as stats\n", "\n", "import autogen\n", + "from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria\n", + "from autogen.agentchat.contrib.agent_eval.criterion import Criterion\n", + "from autogen.agentchat.contrib.agent_eval.task import Task\n", "\n", - "config_list = autogen.config_list_from_json(\n", - " \"OAI_CONFIG_LIST\",\n", - " filter_dict={\n", - " \"model\": [\"gpt-4\"],\n", - " },\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fBZ-XFXy1_BJ" - }, - "source": [ - "\n", - "## Construct `CriticAgent`\n", - "\n", - "We construct the planning agent named `critic` and a user proxy agent for the critic named `critic_user`. We specify `human_input_mode` as \"NEVER\" in the user proxy agent, ensuring that it will never ask for human feedback. Additionally, we define the `ask_critic` function to send a message to the critic and retrieve the criteria from the critic.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "9XAeyjd11_BK" - }, - "outputs": [], - "source": [ - "critic = autogen.AssistantAgent(\n", - " name=\"critic\",\n", - " llm_config={\"config_list\": config_list},\n", - " system_message=\"\"\"You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be dinstinguishable, quantifieable and not redundant.\n", - " Convert the evaluation criteria into a dictionary where the keys are the criteria.\n", - " The value of each key is a dictionary as follows {\"description\": criteria description , \"accepted_values\": possible accepted inputs for this key}\n", - " Make sure the keys are criteria for assessing the given task. \"accepted_values\" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. \"description\" includes the criterion description.\n", - " Return the dictionary.\"\"\",\n", - ")\n", - "\n", - "critic_user = autogen.UserProxyAgent(\n", - " name=\"critic_user\",\n", - " max_consecutive_auto_reply=0, # terminate without auto-reply\n", - " human_input_mode=\"NEVER\",\n", - " code_execution_config={\n", - " \"use_docker\": False\n", - " }, # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.\n", - ")\n", - "\n", - "\n", - "def ask_critic(message):\n", - " \"\"\"\n", - " Initiate a chat with the critic user and return the last message received from the planner.\n", - "\n", - " Args:\n", - " - message (str): The message to be sent to the critic user.\n", - "\n", - " Returns:\n", - " - str: The content of the last message received.\n", - " \"\"\"\n", - " critic_user.initiate_chat(critic, message=message)\n", - " # return the last received from the planner\n", - " return critic_user.messagelast_message()[\"content\"]" + "config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\")" ] }, { @@ -167,207 +168,138 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "5H1WRs_wkiK0" }, - "outputs": [], - "source": [ - "def read_without_groundtruth(file_name):\n", - " \"\"\"\n", - " Read the mathproblem logs - bypassing any information about the ground truths.\n", - "\n", - " Args:\n", - " - file_name (str): The single log file that wants to get evaluated.\n", - "\n", - " Returns:\n", - " - str: The log file without any information about the ground truth answer of the problem.\n", - " \"\"\"\n", - " f = open(file_name, \"r\").readlines()\n", - " output_dictionary = \"\"\n", - " for line in f:\n", - " if \"is_correct\" not in line and \"correct_ans\" not in line and \"check_result\" not in line:\n", - " output_dictionary += line\n", - " elif \"is_correct\" in line:\n", - " correctness = line.replace(\",\", \"\").split(\":\")[-1].rstrip().strip()\n", - " return [output_dictionary, correctness]\n", - "\n", - "\n", - "# Reading one successful and one failed example of the task\n", - "response_successful = read_without_groundtruth(\n", - " \"../test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt\"\n", - ")[0]\n", - "response_failed = read_without_groundtruth(\n", - " \"../test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt\"\n", - ")[0]\n", - "\n", - "task = {\n", - " \"name\": \"Math problem solving\",\n", - " \"description\": \"Given any question, the system needs to solve the problem as consisely and accurately as possible\",\n", - " \"successful_response\": response_successful,\n", - " \"failed_response\": response_failed,\n", - "}\n", - "\n", - "sys_msg = f\"\"\"Task: {task[\"name\"]}.\n", - "Task description: {task[\"description\"]}\n", - "Task successful example: {task[\"successful_response\"]}\n", - "Task failed example: {task[\"failed_response\"]}\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vu70o024lenI" - }, - "source": [ - "# The Criteria\n", - "Now, we print the designed criteria for assessing math problems. " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "k9DsDB5hqvtG", - "outputId": "0edd7a0c-b031-4f67-efc6-1a1e77066921" - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mcritic_user\u001b[0m (to critic):\n", + "\u001b[33mcritic_user\u001b[0m (to chat_manager):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " \n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mcritic\u001b[0m (to chat_manager):\n", + "\n", + "[\n", " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", + " \"name\": \"Accuracy\",\n", + " \"description\": \"The solution must be correct and adhere strictly to mathematical principles and techniques appropriate for the problem.\",\n", + " \"accepted_values\": [\"Correct\", \"Minor errors\", \"Major errors\", \"Incorrect\"]\n", " },\n", " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", + " \"name\": \"Relevance\",\n", + " \"description\": \"The content of the response must be relevant to the question posed and should address the specific problem requirements.\",\n", + " \"accepted_values\": [\"Highly relevant\", \"Relevant\", \"Somewhat relevant\", \"Not relevant\"]\n", " },\n", " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", + " \"name\": \"Efficiency\",\n", + " \"description\": \"The solution should be derived in a time-effective manner, considering the complexity of the problem.\",\n", + " \"accepted_values\": [\"Highly efficient\", \"Efficient\", \"Inefficient\", \"Redundant\"]\n", " },\n", " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mcritic\u001b[0m (to critic_user):\n", - "\n", - "In evaluating math problem-solving tasks, we can establish certain criteria to assess the level of success in solving the math problems. Below are the criteria with their corresponding descriptions and the accepted values:\n", - "\n", - "```python\n", - "evaluation_criteria = {\n", - " \"accuracy\": {\n", - " \"description\": \"Correctness of the final answer provided.\",\n", - " \"accepted_values\": {\n", - " \"correct\": \"The given answer is correct.\",\n", - " \"incorrect\": \"The given answer is incorrect.\",\n", - " \"partial\": \"The answer is partially correct with minor errors.\"\n", - " }\n", + " \"name\": \"Logic and Structure\",\n", + " \"description\": \"The reasoning should be logical and the information structured in a clear and understandable sequence.\",\n", + " \"accepted_values\": [\"Exceptionally clear\", \"Clear\", \"Somewhat clear\", \"Confusing\"]\n", " },\n", - " \"completeness\": {\n", - " \"description\": \"The extent to which all necessary steps are included and properly documented.\",\n", - " \"accepted_values\": {\n", - " \"complete\": \"All necessary steps are included and properly documented.\",\n", - " \"incomplete\": \"Some steps are missing or not properly documented.\",\n", - " \"overly_detailed\": \"The solution contains unnecessary detail that doesn't contribute to understanding.\"\n", - " }\n", + " {\n", + " \"name\": \"Use of Resources\",\n", + " \"description\": \"The response should make appropriate and optimal use of external resources or tools (e.g., Python scripts) when necessary.\",\n", + " \"accepted_values\": [\"Optimal\", \"Appropriate\", \"Underutilized\", \"Overreliance\"]\n", " },\n", - " \"efficiency\": {\n", - " \"description\": \"The method used to solve the problem is concise and does not include redundant steps.\",\n", - " \"accepted_values\": {\n", - " \"efficient\": \"The solution is found through the most direct method with no superfluous steps.\",\n", - " \"inefficient\": \"The method used is not the most direct and may include redundant steps.\",\n", - " \"acceptable\": \"The method used is reasonably direct with little redundancy.\"\n", - " }\n", + " {\n", + " \"name\": \"Mathematical Notation\",\n", + " \"description\": \"The use of proper and standard mathematical notation in the solution and explanation.\",\n", + " \"accepted_values\": [\"Excellent\", \"Good\", \"Adequate\", \"Poor\"]\n", " },\n", - " \"methodology\": {\n", - " \"description\": \"The approach used to solve the problem, including the use of formulas, theorems, and problem-solving techniques.\",\n", - " \"accepted_values\": {\n", - " \"appropriate\": \"The methodology used is appropriate for the problem.\",\n", - " \"inappropriate\": \"The methodology used is not suitable for the problem.\",\n", - " \"partially_appropriate\": \"The methodology used is partially suitable but could be improved.\"\n", - " }\n", + " {\n", + " \"name\": \"Explanation and Justification\",\n", + " \"description\": \"There should be a clear explanation, rationale, or justification for each step taken towards the solution.\",\n", + " \"accepted_values\": [\"Thorough\", \"Adequate\", \"Insufficient\", \"Missing\"]\n", " },\n", - " \"clarity\": {\n", - " \"description\": \"The ease with which the solution can be understood by others.\",\n", - " \"accepted_values\": {\n", - " \"clear\": \"The solution is presented in a clear, logical manner that is easy to follow.\",\n", - " \"unclear\": \"The solution is difficult to follow or understand.\",\n", - " \"somewhat_clear\": \"The solution is generally clear but could be improved in some areas for better understanding.\"\n", - " }\n", + " {\n", + " \"name\": \"Correctness of Answer Format\",\n", + " \"description\": \"The answer should be presented in the format requested in the problem (e.g., interval notation, simplified form).\",\n", + " \"accepted_values\": [\"Perfectly formatted\", \"Properly formatted\", \"Slightly incorrect format\", \"Improperly formatted\"]\n", " },\n", - " \"use_of_language\": {\n", - " \"description\": \"The correctness and appropriateness of mathematical language and notation.\",\n", - " \"accepted_values\": {\n", - " \"appropriate\": \"The language and notation are mathematically sound and correctly applied.\",\n", - " \"inappropriate\": \"The language and notation have errors or are misapplied.\",\n", - " \"mostly_appropriate\": \"The language and notation are mostly correct, but there are minor errors or inconsistencies.\"\n", - " }\n", + " {\n", + " \"name\": \"Handling of Edge Cases\",\n", + " \"description\": \"The solution should correctly handle any special or edge cases that may arise in the problem.\",\n", + " \"accepted_values\": [\"Complete\", \"Most cases\", \"Some cases\", \"No consideration\"]\n", " }\n", - "}\n", - "```\n", - "\n", - "These criteria should provide a comprehensive framework for evaluating math problem-solving tasks in terms of accuracy, completeness, efficiency, and clarity.\n", + "]\n", "\n", "--------------------------------------------------------------------------------\n" ] } ], "source": [ - "current_task_name = \"_\".join(task[\"name\"].split()).lower()\n", - "gen_criteria = critic_user.initiate_chat(critic, message=sys_msg)\n", - "criteria = critic_user.last_message()\n", + "def remove_ground_truth(test_case):\n", + " test_details = json.loads(test_case)\n", + " # need to remove the ground truth from the test details\n", + " correctness = test_details.pop(\"is_correct\", None)\n", + " test_details.pop(\"correct_ans\", None)\n", + " test_details.pop(\"check_result\", None)\n", + " return str(test_details), correctness\n", + "\n", + "\n", + "# Reading one successful and one failed example of the task\n", + "success_str = open(\"../test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt\", \"r\").read()\n", + "response_successful = remove_ground_truth(success_str)[0]\n", + "failed_str = open(\"../test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt\", \"r\").read()\n", + "response_failed = remove_ground_truth(failed_str)[0]\n", + "\n", + "task = Task(\n", + " **{\n", + " \"name\": \"Math problem solving\",\n", + " \"description\": \"Given any question, the system needs to solve the problem as consisely and accurately as possible\",\n", + " \"successful_response\": response_successful,\n", + " \"failed_response\": response_failed,\n", + " }\n", + ")\n", + "\n", + "criteria = generate_criteria(task=task, llm_config={\"config_list\": config_list}, max_round=8)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vu70o024lenI" + }, + "source": [ + "# The Criteria\n", + "Now, we print the designed criteria for assessing math problems. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "k9DsDB5hqvtG", + "outputId": "0edd7a0c-b031-4f67-efc6-1a1e77066921" + }, + "outputs": [], + "source": [ + "current_task_name = \"_\".join(task.name.split()).lower()\n", "cr_file = open(f\"../test/test_files/agenteval-in-out/{current_task_name}_criteria.json\", \"w\")\n", - "cr_file.write(criteria[\"content\"])\n", + "cr_file.write(Criterion.write_json(criteria))\n", "cr_file.close()" ] }, @@ -377,7 +309,7 @@ "id": "PETPZluOEGCR" }, "source": [ - "*Note :* You can also define and use your own criteria by editing `criteria.txt`" + "*Note :* You can also define and use your own criteria in order to feed into the quantifier." ] }, { @@ -388,40 +320,21 @@ "source": [ "# The `QuantifierAgent`\n", "\n", - "Once we have the criteria, we need to quantify a new sample based on the designed criteria and its accepted values. This will be done through `QuantifierAgent` agent as follows. \n", - "We note that can skip the designed criteria by the agent and use your own defined criteria in `criteria_file`." + "Once we have the criteria, we need to quantify a new sample based on the designed criteria and its accepted values. This will be done through `quantify_criteria` from agent_eval. \n", + "Again, you can use your own defined criteria in `criteria_file`." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "id": "4uUkZJh_subA" }, "outputs": [], "source": [ "criteria_file = f\"../test/test_files/agenteval-in-out/{current_task_name}_criteria.json\"\n", - "quantifier = autogen.AssistantAgent(\n", - " name=\"quantifier\",\n", - " llm_config={\"config_list\": config_list},\n", - " system_message=\"\"\"You are a helpful assistant. You quantify the output of different tasks based on the given criteria.\n", - " The criterion is given in a dictionary format where each key is a dintinct criteria.\n", - " The value of each key is a dictionary as follows {\"description\": criteria description , \"accepted_values\": possible accepted inputs for this key}\n", - " You are going to quantify each of the crieria for a given task based on the task description.\n", - " Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria.\n", - " Return only the dictionary.\"\"\",\n", - ")\n", - "\n", - "quantifier_user = autogen.UserProxyAgent(\n", - " name=\"quantifier_user\",\n", - " max_consecutive_auto_reply=0, # terminate without auto-reply\n", - " human_input_mode=\"NEVER\",\n", - " code_execution_config={\n", - " \"use_docker\": False\n", - " }, # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.\n", - ")\n", - "\n", - "dictionary_for_eval = open(criteria_file, \"r\").read()" + "criteria = open(criteria_file, \"r\").read()\n", + "criteria = Criterion.parse_json_str(criteria)" ] }, { @@ -433,41 +346,6 @@ "## Running the quantifier on a single test case" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "zQ0H3sy8l-Ai" - }, - "outputs": [], - "source": [ - "def get_quantifier(file, criteria_file):\n", - " \"\"\"\n", - " Running quantifier agent on individual log.\n", - "\n", - " Args:\n", - " - file (str): The log path.\n", - " - file (str): The criteria jason file path\n", - " Returns:\n", - " - dict: A dictionary including the actual success of the problem as well as estimated performance by the agent eval.\n", - " {\"actual_success\":actual_label, \"estimated_performance\" : a dictionary of all the criteria and their quantified estimated performance.} }\n", - " \"\"\"\n", - " dictionary_for_eval = open(criteria_file, \"r\").read()\n", - "\n", - " test_case, actual_label = read_without_groundtruth(file)\n", - " print(\"actual label for this case: \", actual_label)\n", - " cq_results = quantifier_user.initiate_chat( # noqa: F841\n", - " quantifier,\n", - " message=sys_msg\n", - " + \"Evaluation dictionary: \"\n", - " + str(dictionary_for_eval)\n", - " + \"actual test case to evaluate: \"\n", - " + test_case,\n", - " )\n", - " quantified_results = quantifier_user.last_message()\n", - " return {\"actual_success\": actual_label, \"estimated_performance\": quantified_results[\"content\"]}" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -477,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -490,176 +368,173 @@ "name": "stdout", "output_type": "stream", "text": [ - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: In evaluating math problem-solving tasks, we can establish certain criteria to assess the level of success in solving the math problems. Below are the criteria with their corresponding descriptions and the accepted values:\n", - "\n", - "```python\n", - "evaluation_criteria = {\n", - " \"accuracy\": {\n", - " \"description\": \"Correctness of the final answer provided.\",\n", - " \"accepted_values\": {\n", - " \"correct\": \"The given answer is correct.\",\n", - " \"incorrect\": \"The given answer is incorrect.\",\n", - " \"partial\": \"The answer is partially correct with minor errors.\"\n", - " }\n", - " },\n", - " \"completeness\": {\n", - " \"description\": \"The extent to which all necessary steps are included and properly documented.\",\n", - " \"accepted_values\": {\n", - " \"complete\": \"All necessary steps are included and properly documented.\",\n", - " \"incomplete\": \"Some steps are missing or not properly documented.\",\n", - " \"overly_detailed\": \"The solution contains unnecessary detail that doesn't contribute to understanding.\"\n", - " }\n", - " },\n", - " \"efficiency\": {\n", - " \"description\": \"The method used to solve the problem is concise and does not include redundant steps.\",\n", - " \"accepted_values\": {\n", - " \"efficient\": \"The solution is found through the most direct method with no superfluous steps.\",\n", - " \"inefficient\": \"The method used is not the most direct and may include redundant steps.\",\n", - " \"acceptable\": \"The method used is reasonably direct with little redundancy.\"\n", - " }\n", - " },\n", - " \"methodology\": {\n", - " \"description\": \"The approach used to solve the problem, including the use of formulas, theorems, and problem-solving techniques.\",\n", - " \"accepted_values\": {\n", - " \"appropriate\": \"The methodology used is appropriate for the problem.\",\n", - " \"inappropriate\": \"The methodology used is not suitable for the problem.\",\n", - " \"partially_appropriate\": \"The methodology used is partially suitable but could be improved.\"\n", - " }\n", - " },\n", - " \"clarity\": {\n", - " \"description\": \"The ease with which the solution can be understood by others.\",\n", - " \"accepted_values\": {\n", - " \"clear\": \"The solution is presented in a clear, logical manner that is easy to follow.\",\n", - " \"unclear\": \"The solution is difficult to follow or understand.\",\n", - " \"somewhat_clear\": \"The solution is generally clear but could be improved in some areas for better understanding.\"\n", - " }\n", - " },\n", - " \"use_of_language\": {\n", - " \"description\": \"The correctness and appropriateness of mathematical language and notation.\",\n", - " \"accepted_values\": {\n", - " \"appropriate\": \"The language and notation are mathematically sound and correctly applied.\",\n", - " \"inappropriate\": \"The language and notation have errors or are misapplied.\",\n", - " \"mostly_appropriate\": \"The language and notation are mostly correct, but there are minor errors or inconsistencies.\"\n", - " }\n", - " }\n", - "}\n", - "```\n", - "\n", - "These criteria should provide a comprehensive framework for evaluating math problem-solving tasks in terms of accuracy, completeness, efficiency, and clarity.actual test case to evaluate: {\n", - " \"problem\": \"Find $24^{-1} \\\\pmod{11^2}$. That is, find the residue $b$ for which $24b \\\\equiv 1\\\\pmod{11^2}$.\\n\\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Since $5 \\\\times 24 = 120 = 121 - 1$, it follows that $-5 \\\\times 24 \\\\equiv 1 \\\\pmod{121}$. Adding 121 to $-5$ to make it positive, we find $(-5 + 121) \\\\times 24 \\\\equiv 116 \\\\times 24 \\\\equiv 1 \\\\pmod{121}$, so it follows that the modular inverse of $24$ is $\\\\boxed{116}$ when taken modulo $121$.\",\n", - " \"problem_id\": \"5\",\n", - " \"response_with_ans\": \"To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\\n\\n```python\\ndef mod_inverse(a, m):\\n g, x, _ = extended_gcd(a, m)\\n if g != 1:\\n raise Exception(f\\\"{a} and {m} are not coprime.\\\")\\n return x % m\\n\\ndef extended_gcd(a, b):\\n if a == 0:\\n return b, 0, 1\\n else:\\n g, x, y = extended_gcd(b % a, a)\\n return g, y - (b // a) * x, x\\n```\\n\\nLet's use the above function to find the modular inverse of 24 modulo 11^2:\\n\\n```python\\na = 24\\nm = 11**2\\nmod_inverse(a, m)\\n```\\n\\nI will execute the above code to find the modular inverse of 24 modulo 11^2.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Find $24^{-1} \\\\pmod{11^2}$. That is, find the residue $b$ for which $24b \\\\equiv 1\\\\pmod{11^2}$.\\n\\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\\n\\n```python\\ndef mod_inverse(a, m):\\n g, x, _ = extended_gcd(a, m)\\n if g != 1:\\n raise Exception(f\\\"{a} and {m} are not coprime.\\\")\\n return x % m\\n\\ndef extended_gcd(a, b):\\n if a == 0:\\n return b, 0, 1\\n else:\\n g, x, y = extended_gcd(b % a, a)\\n return g, y - (b // a) * x, x\\n```\\n\\nLet's use the above function to find the modular inverse of 24 modulo 11^2:\\n\\n```python\\na = 24\\nm = 11**2\\nmod_inverse(a, m)\\n```\\n\\nI will execute the above code to find the modular inverse of 24 modulo 11^2.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 13.481226921081543,\n", - " \"trial\": -1\n", - "}\n", - "\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Accuracy\",\n", + " \"description\": \"The solution must be correct and adhere strictly to mathematical principles and techniques appropriate for the problem.\",\n", + " \"accepted_values\": [\n", + " \"Correct\",\n", + " \"Minor errors\",\n", + " \"Major errors\",\n", + " \"Incorrect\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Conciseness\",\n", + " \"description\": \"The explanation and method provided should be direct and to the point, avoiding unnecessary steps or complexity.\",\n", + " \"accepted_values\": [\n", + " \"Very concise\",\n", + " \"Concise\",\n", + " \"Somewhat verbose\",\n", + " \"Verbose\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Relevance\",\n", + " \"description\": \"The content of the response must be relevant to the question posed and should address the specific problem requirements.\",\n", + " \"accepted_values\": [\n", + " \"Highly relevant\",\n", + " \"Relevant\",\n", + " \"Somewhat relevant\",\n", + " \"Not relevant\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Efficiency\",\n", + " \"description\": \"The solution should be derived in a time-effective manner, considering the complexity of the problem.\",\n", + " \"accepted_values\": [\n", + " \"Highly efficient\",\n", + " \"Efficient\",\n", + " \"Inefficient\",\n", + " \"Redundant\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Logic and Structure\",\n", + " \"description\": \"The reasoning should be logical and the information structured in a clear and understandable sequence.\",\n", + " \"accepted_values\": [\n", + " \"Exceptionally clear\",\n", + " \"Clear\",\n", + " \"Somewhat clear\",\n", + " \"Confusing\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Use of Resources\",\n", + " \"description\": \"The response should make appropriate and optimal use of external resources or tools (e.g., Python scripts) when necessary.\",\n", + " \"accepted_values\": [\n", + " \"Optimal\",\n", + " \"Appropriate\",\n", + " \"Underutilized\",\n", + " \"Overreliance\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Notation\",\n", + " \"description\": \"The use of proper and standard mathematical notation in the solution and explanation.\",\n", + " \"accepted_values\": [\n", + " \"Excellent\",\n", + " \"Good\",\n", + " \"Adequate\",\n", + " \"Poor\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation and Justification\",\n", + " \"description\": \"There should be a clear explanation, rationale, or justification for each step taken towards the solution.\",\n", + " \"accepted_values\": [\n", + " \"Thorough\",\n", + " \"Adequate\",\n", + " \"Insufficient\",\n", + " \"Missing\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Correctness of Answer Format\",\n", + " \"description\": \"The answer should be presented in the format requested in the problem (e.g., interval notation, simplified form).\",\n", + " \"accepted_values\": [\n", + " \"Perfectly formatted\",\n", + " \"Properly formatted\",\n", + " \"Slightly incorrect format\",\n", + " \"Improperly formatted\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Handling of Edge Cases\",\n", + " \"description\": \"The solution should correctly handle any special or edge cases that may arise in the problem.\",\n", + " \"accepted_values\": [\n", + " \"Complete\",\n", + " \"Most cases\",\n", + " \"Some cases\",\n", + " \"No consideration\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " }\n", + "]actual test case to evaluate: {'problem': 'Find $24^{-1} \\\\pmod{11^2}$. That is, find the residue $b$ for which $24b \\\\equiv 1\\\\pmod{11^2}$.\\n\\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Since $5 \\\\times 24 = 120 = 121 - 1$, it follows that $-5 \\\\times 24 \\\\equiv 1 \\\\pmod{121}$. Adding 121 to $-5$ to make it positive, we find $(-5 + 121) \\\\times 24 \\\\equiv 116 \\\\times 24 \\\\equiv 1 \\\\pmod{121}$, so it follows that the modular inverse of $24$ is $\\\\boxed{116}$ when taken modulo $121$.', 'problem_id': '5', 'response_with_ans': 'To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\\n\\n```python\\ndef mod_inverse(a, m):\\n g, x, _ = extended_gcd(a, m)\\n if g != 1:\\n raise Exception(f\"{a} and {m} are not coprime.\")\\n return x % m\\n\\ndef extended_gcd(a, b):\\n if a == 0:\\n return b, 0, 1\\n else:\\n g, x, y = extended_gcd(b % a, a)\\n return g, y - (b // a) * x, x\\n```\\n\\nLet\\'s use the above function to find the modular inverse of 24 modulo 11^2:\\n\\n```python\\na = 24\\nm = 11**2\\nmod_inverse(a, m)\\n```\\n\\nI will execute the above code to find the modular inverse of 24 modulo 11^2.', 'round': 0, 'messages': [{'content': 'Find $24^{-1} \\\\pmod{11^2}$. That is, find the residue $b$ for which $24b \\\\equiv 1\\\\pmod{11^2}$.\\n\\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.', 'role': 'user'}, {'content': 'To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\\n\\n```python\\ndef mod_inverse(a, m):\\n g, x, _ = extended_gcd(a, m)\\n if g != 1:\\n raise Exception(f\"{a} and {m} are not coprime.\")\\n return x % m\\n\\ndef extended_gcd(a, b):\\n if a == 0:\\n return b, 0, 1\\n else:\\n g, x, y = extended_gcd(b % a, a)\\n return g, y - (b // a) * x, x\\n```\\n\\nLet\\'s use the above function to find the modular inverse of 24 modulo 11^2:\\n\\n```python\\na = 24\\nm = 11**2\\nmod_inverse(a, m)\\n```\\n\\nI will execute the above code to find the modular inverse of 24 modulo 11^2.', 'role': 'assistant'}], 'time': 13.481226921081543, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", - "```json\n", "{\n", - " \"accuracy\": \"correct\",\n", - " \"completeness\": \"complete\",\n", - " \"efficiency\": \"efficient\",\n", - " \"methodology\": \"appropriate\",\n", - " \"clarity\": \"clear\",\n", - " \"use_of_language\": \"appropriate\"\n", + " \"Accuracy\": \"Correct\",\n", + " \"Conciseness\": \"Concise\",\n", + " \"Relevance\": \"Highly relevant\",\n", + " \"Efficiency\": \"Efficient\",\n", + " \"Logic and Structure\": \"Clear\",\n", + " \"Use of Resources\": \"Optimal\",\n", + " \"Mathematical Notation\": \"Good\",\n", + " \"Explanation and Justification\": \"Adequate\",\n", + " \"Correctness of Answer Format\": \"Perfectly formatted\",\n", + " \"Handling of Edge Cases\": \"Complete\"\n", "}\n", - "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual correctness: true\n", - "predicted coprrectness:\n", - " ```json\n", - "{\n", - " \"accuracy\": \"correct\",\n", - " \"completeness\": \"complete\",\n", - " \"efficiency\": \"efficient\",\n", - " \"methodology\": \"appropriate\",\n", - " \"clarity\": \"clear\",\n", - " \"use_of_language\": \"appropriate\"\n", - "}\n", - "```\n" + "actual correctness: True\n", + "predicted correctness:\n", + " {\n", + " \"Accuracy\": \"Correct\",\n", + " \"Conciseness\": \"Concise\",\n", + " \"Relevance\": \"Highly relevant\",\n", + " \"Efficiency\": \"Efficient\",\n", + " \"Logic and Structure\": \"Clear\",\n", + " \"Use of Resources\": \"Optimal\",\n", + " \"Mathematical Notation\": \"Good\",\n", + " \"Explanation and Justification\": \"Adequate\",\n", + " \"Correctness of Answer Format\": \"Perfectly formatted\",\n", + " \"Handling of Edge Cases\": \"Complete\"\n", + "}\n" ] } ], "source": [ - "test_case = \"../test/test_files/agenteval-in-out/samples/sample_test_case.json\"\n", - "quantifier_output = get_quantifier(test_case, criteria_file)\n", + "test_case = open(\"../test/test_files/agenteval-in-out/samples/sample_test_case.json\", \"r\").read()\n", + "test_case, ground_truth = remove_ground_truth(test_case)\n", + "quantifier_output = quantify_criteria(\n", + " llm_config={\"config_list\": config_list},\n", + " criteria=criteria,\n", + " task=task,\n", + " test_case=test_case,\n", + " ground_truth=ground_truth,\n", + ")\n", "print(\"actual correctness:\", quantifier_output[\"actual_success\"])\n", - "print(\"predicted coprrectness:\\n\", quantifier_output[\"estimated_performance\"])" + "print(\"predicted correctness:\\n\", quantifier_output[\"estimated_performance\"])" ] }, { @@ -676,28 +551,28 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2024-01-06 19:06:41-- https://github.com/julianakiseleva/autogen/raw/ddabd4f0e7c13a50e33cf8462e79358666371477/test/test_files/agenteval-in-out/prealgebra.zip\n", - "Resolving github.com (github.com)... 140.82.121.4\n", - "Connecting to github.com (github.com)|140.82.121.4|:443... connected.\n", + "--2024-05-08 17:42:25-- https://github.com/julianakiseleva/autogen/raw/ddabd4f0e7c13a50e33cf8462e79358666371477/test/test_files/agenteval-in-out/prealgebra.zip\n", + "Resolving github.com (github.com)... 140.82.116.3\n", + "Connecting to github.com (github.com)|140.82.116.3|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://raw.githubusercontent.com/julianakiseleva/autogen/ddabd4f0e7c13a50e33cf8462e79358666371477/test/test_files/agenteval-in-out/prealgebra.zip [following]\n", - "--2024-01-06 19:06:41-- https://raw.githubusercontent.com/julianakiseleva/autogen/ddabd4f0e7c13a50e33cf8462e79358666371477/test/test_files/agenteval-in-out/prealgebra.zip\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...\n", + "--2024-05-08 17:42:25-- https://raw.githubusercontent.com/julianakiseleva/autogen/ddabd4f0e7c13a50e33cf8462e79358666371477/test/test_files/agenteval-in-out/prealgebra.zip\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 28567 (28K) [application/zip]\n", "Saving to: ‘prealgebra.zip’\n", "\n", - "prealgebra.zip 100%[===================>] 27.90K --.-KB/s in 0.005s \n", + "prealgebra.zip 100%[===================>] 27.90K --.-KB/s in 0s \n", "\n", - "2024-01-06 19:06:41 (5.85 MB/s) - ‘prealgebra.zip’ saved [28567/28567]\n", + "2024-05-08 17:42:25 (63.0 MB/s) - ‘prealgebra.zip’ saved [28567/28567]\n", "\n", "Archive: prealgebra.zip\n", "warning: skipped \"../\" path component(s) in ../prealgebra/\n", @@ -762,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -775,358 +650,285 @@ "name": "stdout", "output_type": "stream", "text": [ - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Amaretta's birthday is July 27, and her brother Enzo's birthday is September 3. Every year, Amaretta and Enzo celebrate by eating cake every day from Amaretta's birthday through Enzo's birthday (including both birthdays). If they did this for the first time in 2008, how many cake-eating days will they have observed by the end of 2016?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"There are $39$ cake-eating days each year: the last $5$ days of July, all $31$ days of August, and the first $3$ days of September.\\n\\nThere are $9$ years in the list $$2008,2009,2010,2011,2012,2013,2014,2015,2016.$$ Besides listing them out, we can also see this by subtracting $2007$ from each year, which gives us the list $1,2,3,4,5,6,7,8,9$ (which clearly has $9$ entries).\\n\\n$39$ cake-eating days each year for $9$ years make $39\\\\cdot 9 = \\\\boxed{351}$ days in total.\",\n", - " \"problem_id\": \"3\",\n", - " \"response_with_ans\": \"To calculate the total number of cake-eating days, we will first calculate the number of days between Amaretta's birthday and Enzo's birthday in a non-leap year and in a leap year. Then, we will count the number of leap years and non-leap years in the given range (2008-2016). Finally, we will sum the total number of days for each type of year for both non-leap and leap years.\\n\\nPlease run the following Python code:\\n\\n```python\\nfrom datetime import date\\n\\namaretta_birthday = date(2008, 7, 27)\\nenzo_birthday = date(2008, 9, 3)\\n\\nnormal_year_days = (enzo_birthday - amaretta_birthday).days + 1\\nleap_year_days = normal_year_days + 1\\n\\nleap_years = [year for year in range(2008, 2017) if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)]\\nnon_leap_years_count = 9 - len(leap_years)\\n\\ntotal_cake_eating_days = non_leap_years_count * normal_year_days + len(leap_years) * leap_year_days\\nprint(total_cake_eating_days)\\n```\\n\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Amaretta's birthday is July 27, and her brother Enzo's birthday is September 3. Every year, Amaretta and Enzo celebrate by eating cake every day from Amaretta's birthday through Enzo's birthday (including both birthdays). If they did this for the first time in 2008, how many cake-eating days will they have observed by the end of 2016?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To calculate the total number of cake-eating days, we will first calculate the number of days between Amaretta's birthday and Enzo's birthday in a non-leap year and in a leap year. Then, we will count the number of leap years and non-leap years in the given range (2008-2016). Finally, we will sum the total number of days for each type of year for both non-leap and leap years.\\n\\nPlease run the following Python code:\\n\\n```python\\nfrom datetime import date\\n\\namaretta_birthday = date(2008, 7, 27)\\nenzo_birthday = date(2008, 9, 3)\\n\\nnormal_year_days = (enzo_birthday - amaretta_birthday).days + 1\\nleap_year_days = normal_year_days + 1\\n\\nleap_years = [year for year in range(2008, 2017) if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)]\\nnon_leap_years_count = 9 - len(leap_years)\\n\\ntotal_cake_eating_days = non_leap_years_count * normal_year_days + len(leap_years) * leap_year_days\\nprint(total_cake_eating_days)\\n```\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 15.208062410354614,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'Bekah has exactly three brass house number digits: 2, 3 and 5. How many distinct numbers can she form using one or more of the digits?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'If Bekah uses only one digit, she can form three numbers. If she uses two digits, she has three choices for the tens place and two for the units, so she can form six numbers. Finally, if Bekah uses all three digits, she has three choices for the hundreds place, two for the tens, and one for the units, so she can form six numbers. Thus, Bekah can form $3 + 6 + 6 = \\\\boxed{15}$ distinct numbers.', 'problem_id': '11', 'response_with_ans': 'To find the answer, we\\'ll consider all possible combinations that Bekah can make using the three digits she has:\\n1. Single-digit numbers: 2, 3, and 5\\n2. Two-digits numbers using the digits: 23, 25, 32, 35, 52, and 53\\n3. Three-digits number using all the digits: 235, 253, 325, 352, 523, and 532\\n\\nLet\\'s calculate the number of distinct numbers Bekah can form using one or more of the digits with Python code:\\n\\n```python\\nfrom itertools import permutations\\n\\ndigits = [2, 3, 5]\\n\\n# Single-digit numbers\\none_digit_numbers = set(digits)\\n\\n# Two-digits numbers\\ntwo_digit_numbers = set(int(\\'\\'.join(map(str, p))) for p in permutations(digits, 2))\\n\\n# Three-digits numbers\\nthree_digit_numbers = set(int(\\'\\'.join(map(str, p))) for p in permutations(digits, 3))\\n\\ndistinct_numbers = one_digit_numbers | two_digit_numbers | three_digit_numbers\\ntotal_distinct_numbers = len(distinct_numbers)\\n\\nprint(\"Distinct numbers:\", distinct_numbers)\\nprint(\"Total distinct numbers:\", total_distinct_numbers)\\n```\\n\\nPlease run the above Python code to calculate the total number of distinct numbers Bekah can form using one or more of the digits.', 'round': 0, 'messages': [{'content': 'Bekah has exactly three brass house number digits: 2, 3 and 5. How many distinct numbers can she form using one or more of the digits?', 'role': 'user'}, {'content': 'To find the answer, we\\'ll consider all possible combinations that Bekah can make using the three digits she has:\\n1. Single-digit numbers: 2, 3, and 5\\n2. Two-digits numbers using the digits: 23, 25, 32, 35, 52, and 53\\n3. Three-digits number using all the digits: 235, 253, 325, 352, 523, and 532\\n\\nLet\\'s calculate the number of distinct numbers Bekah can form using one or more of the digits with Python code:\\n\\n```python\\nfrom itertools import permutations\\n\\ndigits = [2, 3, 5]\\n\\n# Single-digit numbers\\none_digit_numbers = set(digits)\\n\\n# Two-digits numbers\\ntwo_digit_numbers = set(int(\\'\\'.join(map(str, p))) for p in permutations(digits, 2))\\n\\n# Three-digits numbers\\nthree_digit_numbers = set(int(\\'\\'.join(map(str, p))) for p in permutations(digits, 3))\\n\\ndistinct_numbers = one_digit_numbers | two_digit_numbers | three_digit_numbers\\ntotal_distinct_numbers = len(distinct_numbers)\\n\\nprint(\"Distinct numbers:\", distinct_numbers)\\nprint(\"Total distinct numbers:\", total_distinct_numbers)\\n```\\n\\nPlease run the above Python code to calculate the total number of distinct numbers Bekah can form using one or more of the digits.', 'role': 'assistant'}], 'time': 15.620970249176025, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"very efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely correct\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"very efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Two numbers are said to be 'relatively prime' if their greatest common factor is 1. How many integers greater than 10 and less than 30 are relatively prime with 28?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Since $28=2^2\\\\cdot 7$, a positive integer is relatively prime with $28$ if and only if it contains neither $2$ nor $7$ in its prime factorization. In other words, we want to count the number of integers between $11$ and $29$ inclusive which are divisible by neither $2$ nor $7$.\\n\\nAll of the odd numbers are not divisible by 2; there are 10 such numbers. The only one of these that is divisible by 7 is 21, so there are $10- 1 =\\\\boxed{9}$ numbers between 10 and 30 that are relatively prime with 28.\",\n", - " \"problem_id\": \"12\",\n", - " \"response_with_ans\": \"I will find the relatively prime numbers with 28 in the range greater than 10 and less than 30 using Python code. I will use the math.gcd function and a loop to check each number in the specified range.\\n\\n```python\\nimport math\\n\\ncount = 0\\nrelatively_prime_numbers = []\\nfor number in range(11, 30):\\n if math.gcd(28, number) == 1:\\n relatively_prime_numbers.append(number)\\n count += 1\\n\\nprint(f\\\"Relatively prime numbers with 28: {relatively_prime_numbers}\\\")\\nprint(f\\\"Number of relatively prime numbers with 28: {count}\\\")\\n```\\nExecute the above code to find the integers relatively prime with 28 and their count.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Two numbers are said to be 'relatively prime' if their greatest common factor is 1. How many integers greater than 10 and less than 30 are relatively prime with 28?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I will find the relatively prime numbers with 28 in the range greater than 10 and less than 30 using Python code. I will use the math.gcd function and a loop to check each number in the specified range.\\n\\n```python\\nimport math\\n\\ncount = 0\\nrelatively_prime_numbers = []\\nfor number in range(11, 30):\\n if math.gcd(28, number) == 1:\\n relatively_prime_numbers.append(number)\\n count += 1\\n\\nprint(f\\\"Relatively prime numbers with 28: {relatively_prime_numbers}\\\")\\nprint(f\\\"Number of relatively prime numbers with 28: {count}\\\")\\n```\\nExecute the above code to find the integers relatively prime with 28 and their count.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 6.9820802211761475,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'What is $.0\\\\overline{3} \\\\div .\\\\overline{03}$? Express your answer as a mixed number.', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'It is almost always easier to use fractions than decimals when dividing. So the first task is to convert these repeating decimals to fractions. First, $.0\\\\overline{3}$: \\\\[\\n10 \\\\cdot .0\\\\overline{3} = .\\\\overline{3} = \\\\frac{1}{3}\\\\\\\\\\n\\\\Rightarrow .0\\\\overline{3} = \\\\frac{1}{3} \\\\div 10 = \\\\frac{1}{3} \\\\cdot \\\\frac{1}{10} = \\\\frac{1}{30}.\\n\\\\]Next, $.\\\\overline{03}$: \\\\[\\n99 \\\\cdot .\\\\overline{03} = (100-1) \\\\cdot .\\\\overline{03} = 3.\\\\overline{03} - .\\\\overline{03} = 3\\\\\\\\\\n\\\\Rightarrow .\\\\overline{03} = \\\\frac{3}{99} = \\\\frac{3}{3 \\\\cdot 33} = \\\\frac{1}{33}.\\n\\\\]We now have the tools to make our calculation: \\\\begin{align*}\\n.0\\\\overline{3} \\\\div .\\\\overline{03} &= \\\\frac{1}{30} \\\\div \\\\frac{1}{33}= \\\\frac{1}{30} \\\\cdot \\\\frac{33}{1}\\\\\\\\\\n&= \\\\frac{33}{30} = \\\\frac{3 \\\\cdot 11}{3 \\\\cdot 10} = \\\\frac{11}{10}\\\\\\\\\\n&= \\\\frac{10+1}{10} = \\\\boxed{1\\\\frac{1}{10}}.\\n\\\\end{align*}', 'problem_id': '15', 'response_with_ans': \"Now, let's simplify the fraction:\\n\\n$\\\\frac{33}{3} = 11$\\n\\nSo, the result of the division is:\\n\\n$.0\\\\overline{3} \\\\div .\\\\overline{03} = 11$\\n\\nSince there is no remainder or mixed number component, the final answer is 11.\\n\\nTERMINATE\", 'round': 3, 'messages': [{'content': 'What is $.0\\\\overline{3} \\\\div .\\\\overline{03}$? Express your answer as a mixed number.', 'role': 'user'}, {'content': \"To solve the problem, let's first convert the repeating decimal numbers into fractions.\\n\\n1. Convert $0.\\\\overline{3}$ to a fraction:\\n\\nLet x = $0.\\\\overline{3}$\\nThen, 10x = $3.\\\\overline{3}$\\nSubtracting the two equations, we get:\\n9x = 3\\nx = $\\\\frac{1}{3}$\\n\\nSo, $0.\\\\overline{3}$ can be written as $\\\\frac{1}{3}$.\\n\\n2. Convert $.\\\\overline{03}$ to a fraction:\\n\\nLet y = $.\\\\overline{03}$\\nThen, 100y = $3.\\\\overline{03}$\\nSubtracting the two equations, we get:\\n99y = 3\\ny = $\\\\frac{1}{33}$\\n\\nSo, $.\\\\overline{03}$ can be written as $\\\\frac{1}{33}$.\\n\\nNow that we have the fractions, we can compute: \\n$(\\\\frac{1}{3}) \\\\div (\\\\frac{1}{33})$\\nTo divide fractions, we multiply the first one by the reciprocal of the second one:\\n\\n$(\\\\frac{1}{3}) \\\\times (\\\\frac{33}{1})$\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Let's compute the product of the fractions:\\n\\n$\\\\frac{1}{3} \\\\times \\\\frac{33}{1} = \\\\frac{1 \\\\times 33}{3 \\\\times 1}$\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Now, let's solve the multiplication in the numerator and the denominator:\\n\\n$\\\\frac{1 \\\\times 33}{3 \\\\times 1} = \\\\frac{33}{3}$\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Now, let's simplify the fraction:\\n\\n$\\\\frac{33}{3} = 11$\\n\\nSo, the result of the division is:\\n\\n$.0\\\\overline{3} \\\\div .\\\\overline{03} = 11$\\n\\nSince there is no remainder or mixed number component, the final answer is 11.\\n\\nTERMINATE\", 'role': 'assistant'}], 'time': 34.40860724449158, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely incorrect\",\n", + " \"Explanation Clarity\": \"moderately clear\",\n", + " \"Code Efficiency\": \"not applicable\",\n", + " \"Code Correctness\": \"not applicable\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", - " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"How many positive and negative integers is $12$ a multiple of?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"The number $12$ is a multiple of $-12, -6, -4, -3, -2, -1, 1, 2, 3, 4, 6,$ and $12,$ for a total of $\\\\boxed{12}$ integers.\",\n", - " \"problem_id\": \"1\",\n", - " \"response_with_ans\": \"```python\\nnum = 12\\ndivisors = [i for i in range(1, num + 1) if num % i == 0]\\n\\npositive_divisors_count = len(divisors)\\nnegative_divisors_count = positive_divisors_count\\ntotal_divisors_count = positive_divisors_count + negative_divisors_count\\n\\nprint(f\\\"Positive divisors of 12: {divisors}\\\")\\ndivisors_negative = [-x for x in divisors]\\nprint(f\\\"Negative divisors of 12: {divisors_negative}\\\")\\nprint(f\\\"Total divisors count: {total_divisors_count}\\\")\\n```\\n\\nExecute this code snippet to find the divisors of 12 and count how many positive and negative integers 12 is a multiple of.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"How many positive and negative integers is $12$ a multiple of?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To determine how many positive and negative integers 12 is a multiple of, we need to find the divisors of 12. Then we will double the number of divisors, as each positive divisor has a corresponding negative divisor.\\n\\nLet's use a Python code snippet to find all the divisors of 12 and count them.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\nnum = 12\\ndivisors = [i for i in range(1, num + 1) if num % i == 0]\\n\\npositive_divisors_count = len(divisors)\\nnegative_divisors_count = positive_divisors_count\\ntotal_divisors_count = positive_divisors_count + negative_divisors_count\\n\\nprint(f\\\"Positive divisors of 12: {divisors}\\\")\\ndivisors_negative = [-x for x in divisors]\\nprint(f\\\"Negative divisors of 12: {divisors_negative}\\\")\\nprint(f\\\"Total divisors count: {total_divisors_count}\\\")\\n```\\n\\nExecute this code snippet to find the divisors of 12 and count how many positive and negative integers 12 is a multiple of.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 17.360238790512085,\n", - " \"trial\": -1\n", - "}\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " }\n", + "]actual test case to evaluate: {'problem': 'How many integers $n$ satisfy both of the inequalities $4n + 3 < 25$ and $-7n + 5 < 24$?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Subtract 3 and divide by 4 on both sides of the first inequality to obtain \\\\begin{align*}\\n4n + 3 &< 25 \\\\\\\\\\n\\\\Rightarrow\\\\qquad 4n &< 22 \\\\\\\\\\n\\\\Rightarrow\\\\qquad n &< 5.5.\\n\\\\end{align*}Similarly, the second inequality yields \\\\begin{align*}\\n-7n + 5 &< 24 \\\\\\\\\\n\\\\Rightarrow\\\\qquad -7n &< 19 \\\\\\\\\\n\\\\Rightarrow\\\\qquad n &> -\\\\frac{19}{7}.\\n\\\\end{align*}Therefore, we are looking for all the integers between $-\\\\frac{19}{7}$ and $5.5$. Since $-\\\\frac{19}{7}$ is between $-3$ and $-2$ and the largest integer less than $5.5$ is 5, we need to count the number of integers between $-2$ and $5$, inclusive. There are $5$ positive integers, $2$ negative integers, and zero, so there are $\\\\boxed{8}$ integers that satisfy both $4n + 3 < 25$ and $-7n + 5 < 24$.', 'problem_id': '10', 'response_with_ans': 'Here\\'s the code to solve the inequalities and find the intersection of their solutions:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols(\"n\")\\ninequality_1 = 4 * n + 3 < 25\\ninequality_2 = -7 * n + 5 < 24\\n\\nsolution_1 = solve(inequality_1, n)\\nsolution_2 = solve(inequality_2, n)\\n\\nintersection = (max(solution_1[0], solution_2[0]), min(solution_1[1], solution_2[1]))\\n\\nprint(f\"Solution to inequality 1: {solution_1}\")\\nprint(f\"Solution to inequality 2: {solution_2}\")\\nprint(f\"Intersection of solutions: {intersection}\")\\n```\\n\\nExecute this code, and let\\'s see the solutions for both inequalities and their intersection.', 'round': 1, 'messages': [{'content': 'How many integers $n$ satisfy both of the inequalities $4n + 3 < 25$ and $-7n + 5 < 24$?', 'role': 'user'}, {'content': \"To find the number of integers $n$ that satisfy both inequalities, we'll first solve each inequality individually, and then find the intersection of the solutions.\\n\\nStep 1: Solve the inequalities\\n1. $4n + 3 < 25$\\n2. $-7n + 5 < 24$\\n\\nStep 2: Find the intersection of the solutions\\n\\nStep 3: Count the number of integers in the intersection\\n\\nFirst, let's solve the inequalities using the python code.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'Here\\'s the code to solve the inequalities and find the intersection of their solutions:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols(\"n\")\\ninequality_1 = 4 * n + 3 < 25\\ninequality_2 = -7 * n + 5 < 24\\n\\nsolution_1 = solve(inequality_1, n)\\nsolution_2 = solve(inequality_2, n)\\n\\nintersection = (max(solution_1[0], solution_2[0]), min(solution_1[1], solution_2[1]))\\n\\nprint(f\"Solution to inequality 1: {solution_1}\")\\nprint(f\"Solution to inequality 2: {solution_2}\")\\nprint(f\"Intersection of solutions: {intersection}\")\\n```\\n\\nExecute this code, and let\\'s see the solutions for both inequalities and their intersection.', 'role': 'assistant'}], 'time': 19.949471950531006, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", @@ -1135,494 +937,289 @@ " \"Code Efficiency\": \"moderately efficient\",\n", " \"Code Correctness\": \"completely correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"In isosceles right triangle $ABC$, point $D$ is on hypotenuse $\\\\overline{BC}$ such that $\\\\overline{AD}$ is an altitude of $\\\\triangle ABC$ and $DC = 5$. What is the area of triangle $ABC$?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"In isosceles right triangle $\\\\triangle ABC$ below, $\\\\overline{AD}$ is the altitude to the hypotenuse.\\n\\n[asy]\\nimport olympiad;\\nunitsize(0.8inch);\\npair A,B,C,D;\\nA = (0,1);\\nB= (1,0);\\nC = -B;\\nD = (0,0);\\ndraw(A--B--C--A,linewidth(1));\\ndraw(A--D,linewidth(0.8));\\ndraw(rightanglemark(C,A,B,s=4));\\ndraw(rightanglemark(C,D,A,s=4));\\nlabel(\\\"$A$\\\",A,N);\\nlabel(\\\"$B$\\\",B,S);\\nlabel(\\\"$C$\\\",C,S);\\nlabel(\\\"$D$\\\",D,S);\\n[/asy]\\n\\nBecause $\\\\triangle ABC$ is an isosceles right triangle, $\\\\angle ABC = 45^\\\\circ$. Since $\\\\angle ADB = 90^\\\\circ$, we know that $\\\\angle DAB = 45^\\\\circ$, so $\\\\triangle ABD$ is also a 45-45-90 triangle. Similarly, $\\\\triangle ACD$ is a 45-45-90 triangle. Therefore, $DA=DB = DC = 5$, so $BC = BD+DC = 10$, and \\\\[[ABC] = \\\\frac{(AD)(BC)}{2} = \\\\frac{(5)({10})}{2} = \\\\boxed{25}.\\\\]\",\n", - " \"problem_id\": \"13\",\n", - " \"response_with_ans\": \"```python\\nfrom sympy import Eq, solve, symbols\\r\\n\\r\\n# Step 1: Find the length of side AC using the Pythagorean theorem\\r\\na, b, c = symbols(\\\"a b c\\\")\\r\\nPythagorean = Eq(a**2 + b**2, c**2)\\r\\n\\r\\n# We know BD = 5 and it's an isosceles right triangle so:\\r\\nBD = 5\\r\\nBC = BD * 2\\r\\nside_A = BD # Side A is equal to BD\\r\\n\\r\\nisosceles_triangle = Pythagorean.subs({a: side_A, b: side_A, c: BC}) # Substitute values into the equation\\r\\nlength_AC = solve(isosceles_triangle, side_A)[0] # Solve for side_AC\\r\\n\\r\\n# Step 2: Find the length of side AD\\r\\nlength_AD = (BD * length_AC) / BC # Using the proportion\\r\\n\\r\\n# Step 3: Find the area of triangle ABC\\r\\narea_triangle_ABC = 0.5 * (length_AD * length_AC)\\r\\n\\r\\nlength_AC, length_AD, area_triangle_ABC\\r\\n```\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"In isosceles right triangle $ABC$, point $D$ is on hypotenuse $\\\\overline{BC}$ such that $\\\\overline{AD}$ is an altitude of $\\\\triangle ABC$ and $DC = 5$. What is the area of triangle $ABC$?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's first analyze the problem and draw a plan to solve it.\\n\\nPlan:\\n1. Use the Pythagorean theorem to find the length of side AC.\\n2. Since right triangle ADC is similar to triangle ABC, we can set up a proportion to find the length of side AD.\\n3. Use the area formula of a triangle to find the area of triangle ABC.\\n\\nNow, let's execute the plan using code to calculate side AC, then side AD, and finally the area of triangle ABC.\\n\\n\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\nfrom sympy import Eq, solve, symbols\\r\\n\\r\\n# Step 1: Find the length of side AC using the Pythagorean theorem\\r\\na, b, c = symbols(\\\"a b c\\\")\\r\\nPythagorean = Eq(a**2 + b**2, c**2)\\r\\n\\r\\n# We know BD = 5 and it's an isosceles right triangle so:\\r\\nBD = 5\\r\\nBC = BD * 2\\r\\nside_A = BD # Side A is equal to BD\\r\\n\\r\\nisosceles_triangle = Pythagorean.subs({a: side_A, b: side_A, c: BC}) # Substitute values into the equation\\r\\nlength_AC = solve(isosceles_triangle, side_A)[0] # Solve for side_AC\\r\\n\\r\\n# Step 2: Find the length of side AD\\r\\nlength_AD = (BD * length_AC) / BC # Using the proportion\\r\\n\\r\\n# Step 3: Find the area of triangle ABC\\r\\narea_triangle_ABC = 0.5 * (length_AD * length_AC)\\r\\n\\r\\nlength_AC, length_AD, area_triangle_ABC\\r\\n```\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 22.85700249671936,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'What is the sum of the lengths, in centimeters, of the two legs of a 30-60-90 right triangle, if the length of the hypotenuse is $2\\\\sqrt{6}$ centimeters?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'We know that the ratio of the lengths of the sides of a 30-60-90 triangle is $1:\\\\sqrt{3}:2$. We know that the length of the hypotenuse is $2\\\\sqrt{6}$ and the ratio of the length shortest leg to that of the hypotenuse is $1:2$. Therefore, the length of the shorter leg is $\\\\sqrt{6}$. Since the ratio of the shorter leg to the longer leg is $1:\\\\sqrt{3}$, the length of the longer leg is $\\\\sqrt{6} \\\\cdot \\\\sqrt{3} = 3\\\\sqrt{2}$. The sum of the lengths of these two legs is $\\\\boxed{\\\\sqrt{6} + 3\\\\sqrt{2}}$ centimeters.', 'problem_id': '17', 'response_with_ans': \"Let's solve for x and find the lengths of the legs.\\n\\n```python\\nimport math\\n\\nhypotenuse = 2 * math.sqrt(6)\\nx = hypotenuse / 2\\n\\nleg_1 = x\\nleg_2 = x * math.sqrt(3)\\nsum_of_legs = leg_1 + leg_2\\n\\nleg_1, leg_2, sum_of_legs\\n```\\n\\nNow, execute the above code to find the sum of the lengths of the two legs.\", 'round': 1, 'messages': [{'content': 'What is the sum of the lengths, in centimeters, of the two legs of a 30-60-90 right triangle, if the length of the hypotenuse is $2\\\\sqrt{6}$ centimeters?', 'role': 'user'}, {'content': \"To find the sum of the lengths of the two legs, let's first analyze a 30-60-90 right triangle. In a 30-60-90 triangle, the ratios of the side lengths are:\\n\\n1. Opposite the 30-degree angle: $x$\\n2. Opposite the 60-degree angle: $x\\\\sqrt{3}$\\n3. Hypotenuse (opposite the 90-degree angle): $2x$\\n\\nNow, we are given that the hypotenuse is $2\\\\sqrt{6}$ centimeters. We can set up the following equation and solve for $x$:\\n\\n$2x = 2\\\\sqrt{6}$\\n\\nNext, we will find the lengths of the legs and sum them up.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Let's solve for x and find the lengths of the legs.\\n\\n```python\\nimport math\\n\\nhypotenuse = 2 * math.sqrt(6)\\nx = hypotenuse / 2\\n\\nleg_1 = x\\nleg_2 = x * math.sqrt(3)\\nsum_of_legs = leg_1 + leg_2\\n\\nleg_1, leg_2, sum_of_legs\\n```\\n\\nNow, execute the above code to find the sum of the lengths of the two legs.\", 'role': 'assistant'}], 'time': 18.742590188980103, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"mostly correct\"\n", + " \"Code Efficiency\": \"very efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"John counts up from 1 to 13, and then immediately counts down again to 1, and then back up to 13, and so on, alternately counting up and down: \\\\begin{align*}\\n&(1, 2, 3,4,5,6,7,8,9,10,11,12,13,\\\\\\\\\\n&\\\\qquad\\\\qquad12,11,10,9,8,7,6,5,4,3,2,1,2,3,4,\\\\ldots ).\\n\\\\end{align*} What is the $5000^{\\\\text{th}}$ integer in his list?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"We can treat this list as a sequence with a repetitive pattern. We see the sequence repeats itself every 24 elements (from 1 to 13 then back to 2). When 5000 is divided by 24, its remainder is 8. Therefore we see the $5000^{\\\\text{th}}$ integer is the same as the $8^{\\\\text{th}}$ integer, which is $\\\\boxed{8}$.\",\n", - " \"problem_id\": \"4\",\n", - " \"response_with_ans\": \"We can solve this problem by finding which \\\"group\\\" the 5000th number belongs to, and then determine the number within that group. Each group is either counting up from 1 to 13 or counting down from 13 to 1. There are 13 numbers in each direction, but since 1 is counted twice (once at the end of a down-count and once at the beginning of an up-count), counting up and down would consist of 25 numbers (13 up + 12 down).\\n\\nLet's first find out which group the 5000th number belongs to and how many numbers in total are in the complete up and down groups before 5000th number using code:\\n\\n```python\\nnum_group = 5000 // 25\\nremainder = 5000 % 25\\ntotal_numbers_before = num_group * 25\\nprint('Number of complete groups:', num_group)\\nprint('Remainder:', remainder)\\nprint('Numbers before 5000th number:', total_numbers_before)\\n```\\n\\nAfter knowing these values, I'll determine if the 5000th number is in up-count or down-count and its value.\\n\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"John counts up from 1 to 13, and then immediately counts down again to 1, and then back up to 13, and so on, alternately counting up and down: \\\\begin{align*}\\n&(1, 2, 3,4,5,6,7,8,9,10,11,12,13,\\\\\\\\\\n&\\\\qquad\\\\qquad12,11,10,9,8,7,6,5,4,3,2,1,2,3,4,\\\\ldots ).\\n\\\\end{align*} What is the $5000^{\\\\text{th}}$ integer in his list?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"We can solve this problem by finding which \\\"group\\\" the 5000th number belongs to, and then determine the number within that group. Each group is either counting up from 1 to 13 or counting down from 13 to 1. There are 13 numbers in each direction, but since 1 is counted twice (once at the end of a down-count and once at the beginning of an up-count), counting up and down would consist of 25 numbers (13 up + 12 down).\\n\\nLet's first find out which group the 5000th number belongs to and how many numbers in total are in the complete up and down groups before 5000th number using code:\\n\\n```python\\nnum_group = 5000 // 25\\nremainder = 5000 % 25\\ntotal_numbers_before = num_group * 25\\nprint('Number of complete groups:', num_group)\\nprint('Remainder:', remainder)\\nprint('Numbers before 5000th number:', total_numbers_before)\\n```\\n\\nAfter knowing these values, I'll determine if the 5000th number is in up-count or down-count and its value.\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 16.342331409454346,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'How many positive and negative integers is $12$ a multiple of?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'The number $12$ is a multiple of $-12, -6, -4, -3, -2, -1, 1, 2, 3, 4, 6,$ and $12,$ for a total of $\\\\boxed{12}$ integers.', 'problem_id': '1', 'response_with_ans': '```python\\nnum = 12\\ndivisors = [i for i in range(1, num + 1) if num % i == 0]\\n\\npositive_divisors_count = len(divisors)\\nnegative_divisors_count = positive_divisors_count\\ntotal_divisors_count = positive_divisors_count + negative_divisors_count\\n\\nprint(f\"Positive divisors of 12: {divisors}\")\\ndivisors_negative = [-x for x in divisors]\\nprint(f\"Negative divisors of 12: {divisors_negative}\")\\nprint(f\"Total divisors count: {total_divisors_count}\")\\n```\\n\\nExecute this code snippet to find the divisors of 12 and count how many positive and negative integers 12 is a multiple of.', 'round': 1, 'messages': [{'content': 'How many positive and negative integers is $12$ a multiple of?', 'role': 'user'}, {'content': \"To determine how many positive and negative integers 12 is a multiple of, we need to find the divisors of 12. Then we will double the number of divisors, as each positive divisor has a corresponding negative divisor.\\n\\nLet's use a Python code snippet to find all the divisors of 12 and count them.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\nnum = 12\\ndivisors = [i for i in range(1, num + 1) if num % i == 0]\\n\\npositive_divisors_count = len(divisors)\\nnegative_divisors_count = positive_divisors_count\\ntotal_divisors_count = positive_divisors_count + negative_divisors_count\\n\\nprint(f\"Positive divisors of 12: {divisors}\")\\ndivisors_negative = [-x for x in divisors]\\nprint(f\"Negative divisors of 12: {divisors_negative}\")\\nprint(f\"Total divisors count: {total_divisors_count}\")\\n```\\n\\nExecute this code snippet to find the divisors of 12 and count how many positive and negative integers 12 is a multiple of.', 'role': 'assistant'}], 'time': 17.360238790512085, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"mostly effective\",\n", - " \"Calculation Correctness\": \"mostly correct\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"mostly correct\"\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", - "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", - "\n", - "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", - " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"An equilateral triangle has a side of length 12 inches. What is the area of the triangle, in square inches? Express your answer in simplest radical form.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"The area of an equilateral triangle with side length $s$ is $s^2\\\\sqrt{3}/4$. We have $s = 12$, so our area is $12^2\\\\sqrt{3}/4 = \\\\boxed{36\\\\sqrt{3}}$.\",\n", - " \"problem_id\": \"18\",\n", - " \"response_with_ans\": \"```python\\nimport math\\n\\nside_length = 12\\narea = (math.sqrt(3) * side_length**2) / 4\\nprint(area)\\n```\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"An equilateral triangle has a side of length 12 inches. What is the area of the triangle, in square inches? Express your answer in simplest radical form.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the area of an equilateral triangle with a side of length s, we can use the formula:\\n\\narea = (sqrt(3) * s\\u00b2) / 4\\n\\nIn this case, s = 12 inches. Let's calculate the area.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\nimport math\\n\\nside_length = 12\\narea = (math.sqrt(3) * side_length**2) / 4\\nprint(area)\\n```\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 14.153439283370972,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", - "\n", - "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"very efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", - "}\n", - "\n", - "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"What is the product of $0.\\\\overline{05}$ and $1.8$ expressed as a fraction in simplest form?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"First let's convert $0.\\\\overline{05}$ to a fraction. Let's define $x=0.\\\\overline{05}$. If we multiply both sides by $100$ we get $100x=5.\\\\overline{05}$ so $99x=5$ and $x=0.\\\\overline{05}=\\\\frac{5}{99}$. Since $1.8=\\\\frac{9}{5}$ we get $$\\\\frac{\\\\cancel{5}}{99}\\\\cdot\\\\frac{9}{\\\\cancel{5}}=\\\\frac{9}{99}=\\\\boxed{\\\\frac{1}{11}}.$$\",\n", - " \"problem_id\": \"9\",\n", - " \"response_with_ans\": \"```python\\nfrom fractions import Fraction\\n\\nfraction_1 = Fraction(5, 99)\\ndecimal_2 = 1.8\\n\\nproduct = fraction_1 * decimal_2\\n\\nsimplified_fraction = Fraction(product).limit_denominator()\\n\\nprint(simplified_fraction)\\n```\\n\\nPlease run the above Python code to find the product of $\\\\frac{5}{99}$ and $1.8$ expressed as a fraction in simplest form.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the product of $0.\\\\overline{05}$ and $1.8$ expressed as a fraction in simplest form?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the product of $0.\\\\overline{05}$ and $1.8$, first we need to convert the repeating decimal $0.\\\\overline{05}$ into a fraction.\\n\\nLet's call the repeating decimal $x$:\\n\\n$$x = 0.\\\\overline{05}$$\\n\\nTo convert the repeating decimal into a fraction, we can multiply both sides by $100$ to shift the decimal point two places to the right:\\n\\n$$100x = 5.\\\\overline{05}$$\\n\\nNow, subtract the first equation from the second equation:\\n\\n$$100x - x = 5.\\\\overline{05} - 0.\\\\overline{05}$$\\n$$99x = 5$$\\n\\nNow, we can solve for $x$:\\n\\n$$x = \\\\frac{5}{99}$$\\n\\nSo the repeating decimal $0.\\\\overline{05}$ is equivalent to the fraction $\\\\frac{5}{99}$.\\n\\nNow, we can find the product of $\\\\frac{5}{99}$ and $1.8$:\\n\\n$$\\\\frac{5}{99} \\\\times 1.8$$\\n\\nFinally, we will simplify the fraction. Let's write the code to compute the product and simplify the result as a fraction.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\nfrom fractions import Fraction\\n\\nfraction_1 = Fraction(5, 99)\\ndecimal_2 = 1.8\\n\\nproduct = fraction_1 * decimal_2\\n\\nsimplified_fraction = Fraction(product).limit_denominator()\\n\\nprint(simplified_fraction)\\n```\\n\\nPlease run the above Python code to find the product of $\\\\frac{5}{99}$ and $1.8$ expressed as a fraction in simplest form.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 18.04914665222168,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': \"Amaretta's birthday is July 27, and her brother Enzo's birthday is September 3. Every year, Amaretta and Enzo celebrate by eating cake every day from Amaretta's birthday through Enzo's birthday (including both birthdays). If they did this for the first time in 2008, how many cake-eating days will they have observed by the end of 2016?\", 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'There are $39$ cake-eating days each year: the last $5$ days of July, all $31$ days of August, and the first $3$ days of September.\\n\\nThere are $9$ years in the list $$2008,2009,2010,2011,2012,2013,2014,2015,2016.$$ Besides listing them out, we can also see this by subtracting $2007$ from each year, which gives us the list $1,2,3,4,5,6,7,8,9$ (which clearly has $9$ entries).\\n\\n$39$ cake-eating days each year for $9$ years make $39\\\\cdot 9 = \\\\boxed{351}$ days in total.', 'problem_id': '3', 'response_with_ans': \"To calculate the total number of cake-eating days, we will first calculate the number of days between Amaretta's birthday and Enzo's birthday in a non-leap year and in a leap year. Then, we will count the number of leap years and non-leap years in the given range (2008-2016). Finally, we will sum the total number of days for each type of year for both non-leap and leap years.\\n\\nPlease run the following Python code:\\n\\n```python\\nfrom datetime import date\\n\\namaretta_birthday = date(2008, 7, 27)\\nenzo_birthday = date(2008, 9, 3)\\n\\nnormal_year_days = (enzo_birthday - amaretta_birthday).days + 1\\nleap_year_days = normal_year_days + 1\\n\\nleap_years = [year for year in range(2008, 2017) if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)]\\nnon_leap_years_count = 9 - len(leap_years)\\n\\ntotal_cake_eating_days = non_leap_years_count * normal_year_days + len(leap_years) * leap_year_days\\nprint(total_cake_eating_days)\\n```\\n\", 'round': 0, 'messages': [{'content': \"Amaretta's birthday is July 27, and her brother Enzo's birthday is September 3. Every year, Amaretta and Enzo celebrate by eating cake every day from Amaretta's birthday through Enzo's birthday (including both birthdays). If they did this for the first time in 2008, how many cake-eating days will they have observed by the end of 2016?\", 'role': 'user'}, {'content': \"To calculate the total number of cake-eating days, we will first calculate the number of days between Amaretta's birthday and Enzo's birthday in a non-leap year and in a leap year. Then, we will count the number of leap years and non-leap years in the given range (2008-2016). Finally, we will sum the total number of days for each type of year for both non-leap and leap years.\\n\\nPlease run the following Python code:\\n\\n```python\\nfrom datetime import date\\n\\namaretta_birthday = date(2008, 7, 27)\\nenzo_birthday = date(2008, 9, 3)\\n\\nnormal_year_days = (enzo_birthday - amaretta_birthday).days + 1\\nleap_year_days = normal_year_days + 1\\n\\nleap_years = [year for year in range(2008, 2017) if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)]\\nnon_leap_years_count = 9 - len(leap_years)\\n\\ntotal_cake_eating_days = non_leap_years_count * normal_year_days + len(leap_years) * leap_year_days\\nprint(total_cake_eating_days)\\n```\\n\", 'role': 'assistant'}], 'time': 15.208062410354614, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", @@ -1635,1660 +1232,1254 @@ "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"What is $.0\\\\overline{3} \\\\div .\\\\overline{03}$? Express your answer as a mixed number.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"It is almost always easier to use fractions than decimals when dividing. So the first task is to convert these repeating decimals to fractions. First, $.0\\\\overline{3}$: \\\\[\\n10 \\\\cdot .0\\\\overline{3} = .\\\\overline{3} = \\\\frac{1}{3}\\\\\\\\\\n\\\\Rightarrow .0\\\\overline{3} = \\\\frac{1}{3} \\\\div 10 = \\\\frac{1}{3} \\\\cdot \\\\frac{1}{10} = \\\\frac{1}{30}.\\n\\\\]Next, $.\\\\overline{03}$: \\\\[\\n99 \\\\cdot .\\\\overline{03} = (100-1) \\\\cdot .\\\\overline{03} = 3.\\\\overline{03} - .\\\\overline{03} = 3\\\\\\\\\\n\\\\Rightarrow .\\\\overline{03} = \\\\frac{3}{99} = \\\\frac{3}{3 \\\\cdot 33} = \\\\frac{1}{33}.\\n\\\\]We now have the tools to make our calculation: \\\\begin{align*}\\n.0\\\\overline{3} \\\\div .\\\\overline{03} &= \\\\frac{1}{30} \\\\div \\\\frac{1}{33}= \\\\frac{1}{30} \\\\cdot \\\\frac{33}{1}\\\\\\\\\\n&= \\\\frac{33}{30} = \\\\frac{3 \\\\cdot 11}{3 \\\\cdot 10} = \\\\frac{11}{10}\\\\\\\\\\n&= \\\\frac{10+1}{10} = \\\\boxed{1\\\\frac{1}{10}}.\\n\\\\end{align*}\",\n", - " \"problem_id\": \"15\",\n", - " \"response_with_ans\": \"Now, let's simplify the fraction:\\n\\n$\\\\frac{33}{3} = 11$\\n\\nSo, the result of the division is:\\n\\n$.0\\\\overline{3} \\\\div .\\\\overline{03} = 11$\\n\\nSince there is no remainder or mixed number component, the final answer is 11.\\n\\nTERMINATE\",\n", - " \"round\": 3,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is $.0\\\\overline{3} \\\\div .\\\\overline{03}$? Express your answer as a mixed number.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To solve the problem, let's first convert the repeating decimal numbers into fractions.\\n\\n1. Convert $0.\\\\overline{3}$ to a fraction:\\n\\nLet x = $0.\\\\overline{3}$\\nThen, 10x = $3.\\\\overline{3}$\\nSubtracting the two equations, we get:\\n9x = 3\\nx = $\\\\frac{1}{3}$\\n\\nSo, $0.\\\\overline{3}$ can be written as $\\\\frac{1}{3}$.\\n\\n2. Convert $.\\\\overline{03}$ to a fraction:\\n\\nLet y = $.\\\\overline{03}$\\nThen, 100y = $3.\\\\overline{03}$\\nSubtracting the two equations, we get:\\n99y = 3\\ny = $\\\\frac{1}{33}$\\n\\nSo, $.\\\\overline{03}$ can be written as $\\\\frac{1}{33}$.\\n\\nNow that we have the fractions, we can compute: \\n$(\\\\frac{1}{3}) \\\\div (\\\\frac{1}{33})$\\nTo divide fractions, we multiply the first one by the reciprocal of the second one:\\n\\n$(\\\\frac{1}{3}) \\\\times (\\\\frac{33}{1})$\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's compute the product of the fractions:\\n\\n$\\\\frac{1}{3} \\\\times \\\\frac{33}{1} = \\\\frac{1 \\\\times 33}{3 \\\\times 1}$\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Now, let's solve the multiplication in the numerator and the denominator:\\n\\n$\\\\frac{1 \\\\times 33}{3 \\\\times 1} = \\\\frac{33}{3}$\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Now, let's simplify the fraction:\\n\\n$\\\\frac{33}{3} = 11$\\n\\nSo, the result of the division is:\\n\\n$.0\\\\overline{3} \\\\div .\\\\overline{03} = 11$\\n\\nSince there is no remainder or mixed number component, the final answer is 11.\\n\\nTERMINATE\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 34.40860724449158,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'In the diagram, $AB,$ $BC,$ $CD,$ $DE,$ $EF,$ $FG,$ $GH,$ and $HK$ all have length $4,$ and all angles are right angles, with the exception of the angles at $D$ and $F.$\\n\\n[asy]\\ndraw((0,0)--(0,4)--(4,4)--(4,8)--(6.8284,5.1716)--(9.6569,8)--(9.6569,4)--(13.6569,4)--(13.6569,0)--cycle,black+linewidth(1));\\ndraw((0,0)--(0.5,0)--(0.5,0.5)--(0,0.5)--cycle,black+linewidth(1));\\ndraw((0,4)--(0.5,4)--(0.5,3.5)--(0,3.5)--cycle,black+linewidth(1));\\ndraw((4,4)--(4,4.5)--(3.5,4.5)--(3.5,4)--cycle,black+linewidth(1));\\ndraw((6.8284,5.1716)--(7.0784,5.4216)--(6.8284,5.6716)--(6.5784,5.4216)--cycle,black+linewidth(1));\\ndraw((9.6569,4)--(10.1569,4)--(10.1569,4.5)--(9.6569,4.5)--cycle,black+linewidth(1));\\ndraw((13.6569,4)--(13.1569,4)--(13.1569,3.5)--(13.6569,3.5)--cycle,black+linewidth(1));\\ndraw((13.6569,0)--(13.1569,0)--(13.1569,0.5)--(13.6569,0.5)--cycle,black+linewidth(1));\\nlabel(\"$A$\",(0,0),W);\\nlabel(\"$B$\",(0,4),NW);\\nlabel(\"$C$\",(4,4),S);\\nlabel(\"$D$\",(4,8),N);\\nlabel(\"$E$\",(6.8284,5.1716),S);\\nlabel(\"$F$\",(9.6569,8),N);\\nlabel(\"$G$\",(9.6569,4),S);\\nlabel(\"$H$\",(13.6569,4),NE);\\nlabel(\"$K$\",(13.6569,0),E);\\n[/asy]\\n\\nDetermine the length of $DF.$\\n\\n[asy]\\ndraw((0,0)--(2.8284,-2.8284)--(5.6568,0),black+linewidth(1));\\ndraw((0,0)--(5.6568,0),black+linewidth(1)+dashed);\\ndraw((2.8284,-2.8284)--(3.0784,-2.5784)--(2.8284,-2.3284)--(2.5784,-2.5784)--cycle,black+linewidth(1));\\nlabel(\"$D$\",(0,0),N);\\nlabel(\"$E$\",(2.8284,-2.8284),S);\\nlabel(\"$F$\",(5.6568,0),N);\\n[/asy]', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Since $DE=EF=4$ and $\\\\angle DEF = 90^\\\\circ,$ by the Pythagorean Theorem, \\\\begin{align*}\\nDF^2 &= DE^2+EF^2 \\\\\\\\\\n&= 4^2+4^2 \\\\\\\\\\n&=32,\\n\\\\end{align*}so that $DF = \\\\sqrt{32}=\\\\boxed{4\\\\sqrt{2}}.$', 'problem_id': '16', 'response_with_ans': \"Now let's calculate the square of DF using Python.\\n\\n```python\\nDH = 9.6569\\nHG = 5.6569\\ncos_alpha_beta = 0\\n\\nDF_squared = DH**2 + HG**2 - 2 * DH * HG * cos_alpha_beta\\nDF_squared\\n```\", 'round': 2, 'messages': [{'content': 'In the diagram, $AB,$ $BC,$ $CD,$ $DE,$ $EF,$ $FG,$ $GH,$ and $HK$ all have length $4,$ and all angles are right angles, with the exception of the angles at $D$ and $F.$\\n\\n[asy]\\ndraw((0,0)--(0,4)--(4,4)--(4,8)--(6.8284,5.1716)--(9.6569,8)--(9.6569,4)--(13.6569,4)--(13.6569,0)--cycle,black+linewidth(1));\\ndraw((0,0)--(0.5,0)--(0.5,0.5)--(0,0.5)--cycle,black+linewidth(1));\\ndraw((0,4)--(0.5,4)--(0.5,3.5)--(0,3.5)--cycle,black+linewidth(1));\\ndraw((4,4)--(4,4.5)--(3.5,4.5)--(3.5,4)--cycle,black+linewidth(1));\\ndraw((6.8284,5.1716)--(7.0784,5.4216)--(6.8284,5.6716)--(6.5784,5.4216)--cycle,black+linewidth(1));\\ndraw((9.6569,4)--(10.1569,4)--(10.1569,4.5)--(9.6569,4.5)--cycle,black+linewidth(1));\\ndraw((13.6569,4)--(13.1569,4)--(13.1569,3.5)--(13.6569,3.5)--cycle,black+linewidth(1));\\ndraw((13.6569,0)--(13.1569,0)--(13.1569,0.5)--(13.6569,0.5)--cycle,black+linewidth(1));\\nlabel(\"$A$\",(0,0),W);\\nlabel(\"$B$\",(0,4),NW);\\nlabel(\"$C$\",(4,4),S);\\nlabel(\"$D$\",(4,8),N);\\nlabel(\"$E$\",(6.8284,5.1716),S);\\nlabel(\"$F$\",(9.6569,8),N);\\nlabel(\"$G$\",(9.6569,4),S);\\nlabel(\"$H$\",(13.6569,4),NE);\\nlabel(\"$K$\",(13.6569,0),E);\\n[/asy]\\n\\nDetermine the length of $DF.$\\n\\n[asy]\\ndraw((0,0)--(2.8284,-2.8284)--(5.6568,0),black+linewidth(1));\\ndraw((0,0)--(5.6568,0),black+linewidth(1)+dashed);\\ndraw((2.8284,-2.8284)--(3.0784,-2.5784)--(2.8284,-2.3284)--(2.5784,-2.5784)--cycle,black+linewidth(1));\\nlabel(\"$D$\",(0,0),N);\\nlabel(\"$E$\",(2.8284,-2.8284),S);\\nlabel(\"$F$\",(5.6568,0),N);\\n[/asy]', 'role': 'user'}, {'content': \"Let's use coordinate geometry to determine the coordinates of points D, E, and F, which will allow us to compute the Euclidean distance between points D and F.\\n\\n1. Assign coordinates to points A, B, C, D, E, F, G, H, and K.\\n2. Calculate the slope and equation of line DE and line FG.\\n3. Use the slopes and the coordinates of points E and G to compute the coordinates of point F.\\n4. Calculate the length of DF using the distance formula.\\n\\nLet's start by assigning coordinates:\\n\\n1. A(0, 0), B(0, 4), C(4, 4), D(4, 8), G(9.6569, 4), H(13.6569, 4), and K(13.6569, 0).\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Since the angles at points D and F are not right angles, we need to use some trigonometry to find points E and F. We already have the coordinates of points D and G.\\n\\nLet α be the angle between the horizontal line and the line DE, and β be the angle between the horizontal line and the line FG.\\n\\nLet P be the intersection point between the horizontal lines passing through points D and F. Therefore, the triangle DPF is a right triangle. \\n\\nSince we know the angles α, β, and the length DG, we can determine the length DF using the law of cosines:\\n\\nLet's calculate the length of DG first:\\n\\nDG = DH + HG = (13.6569 - 4) + (9.6569 - 4) = 9.6569 + 5.6569 = 15.3138\\n\\nNow, let α be the angle between line DH and DF, and β be the angle between line HG and DF. We can use the law of cosines to calculate the length of DF:\\n\\nDF^2 = DH^2 + HG^2 - 2 * DH * HG * cos(α + β)\\n\\nWe also know angle DAB = α, angle GHB = β and AB = FG = 4.\\n\\nWe can calculate cos(α + β) using the cosine formula for the addition of angles:\\n\\ncos(α + β) = cos(α) * cos(β) - sin(α) * sin(β)\\n\\nTo find sin(α) and cos(α), we can use right triangle with sides DA, AB, and DB. Since DB = BA = 4, we have:\\n\\ncos(α) = AD / DB = 4 / 4 = 1\\nsin(α) = BA / DB = 4 / 4 = 1\\n\\nSimilarly, for angle β, we have right triangle with sides GH, HB, and GB. Since GB = BH = 4, we have:\\n\\ncos(β) = HG / GB = 4 / 4 = 1\\nsin(β) = HB / GB = 4 / 4 = 1\\n\\nNow, let's calculate cos(α + β):\\n\\ncos(α + β) = cos(α) * cos(β) - sin(α) * sin(β) = 1 * 1 - 1 * 1 = 0\\n\\nFinally, let's calculate DF using the law of cosines:\\n\\nDF^2 = DH^2 + HG^2 - 2 * DH * HG * cos(α + β)\\nDF^2 = 9.6569^2 + 5.6569^2 - 2 * 9.6569 * 5.6569 * 0\\n\\nNow let's compute the square of DF.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Now let's calculate the square of DF using Python.\\n\\n```python\\nDH = 9.6569\\nHG = 5.6569\\ncos_alpha_beta = 0\\n\\nDF_squared = DH**2 + HG**2 - 2 * DH * HG * cos_alpha_beta\\nDF_squared\\n```\", 'role': 'assistant'}], 'time': 44.82826566696167, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"mostly effective\",\n", + " \"Mathematical Methodology\": \"inappropriate\",\n", " \"Calculation Correctness\": \"completely incorrect\",\n", " \"Explanation Clarity\": \"moderately clear\",\n", - " \"Code Efficiency\": \"not applicable\",\n", - " \"Code Correctness\": \"not applicable\"\n", + " \"Code Efficiency\": \"not at all efficient\",\n", + " \"Code Correctness\": \"completely incorrect\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"The perfect squares from $1$ through $2500,$ inclusive, are printed in a sequence of digits $1491625\\\\ldots2500.$ How many digits are in the sequence?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"We consider it by four cases:\\n\\n$\\\\bullet$ Case 1: There are $3$ perfect squares that only have $1$ digit, $1^{2},$ $2^{2},$ and $3^{2}.$\\n\\n$\\\\bullet$ Case 2: The smallest perfect square that has $2$ digits is $4^{2},$ and the largest is $9^{2},$ so that's a total of $6$ perfect squares with $2$ digits.\\n\\n$\\\\bullet$ Case 3: The smallest perfect square with $3$ digits is $10^{2},$ and the largest is $31^{2},$ yielding a total of $22.$\\n\\n$\\\\bullet$ Case 4: The smallest perfect square with $4$ digits is $32^{2},$ and the last one that is no greater than $2500$ is $50^{2},$ giving a total of $19.$\\n\\nSo we have a total of $1\\\\times3+2\\\\times6+3\\\\times22+4\\\\times19=\\\\boxed{157}$ digits.\",\n", - " \"problem_id\": \"5\",\n", - " \"response_with_ans\": \"```python\\ndef count_digits(n):\\n return len(str(n))\\n\\nperfect_squares = [i ** 2 for i in range(1, int(2500 ** 0.5) + 1)]\\ntotal_digits = sum(count_digits(square) for square in perfect_squares)\\n\\nprint(total_digits)\\n```\\n\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"The perfect squares from $1$ through $2500,$ inclusive, are printed in a sequence of digits $1491625\\\\ldots2500.$ How many digits are in the sequence?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To determine the number of digits in the sequence, we can first find all perfect squares from 1 through 2500, inclusive. Then, we can count the number of digits in each perfect square and sum them up. Let's implement this in Python code:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\ndef count_digits(n):\\n return len(str(n))\\n\\nperfect_squares = [i ** 2 for i in range(1, int(2500 ** 0.5) + 1)]\\ntotal_digits = sum(count_digits(square) for square in perfect_squares)\\n\\nprint(total_digits)\\n```\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 9.788081884384155,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'A $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle is drawn on the exterior of an equilateral triangle so the hypotenuse of the right triangle is one side of the equilateral triangle. If the shorter leg of the right triangle is 6 units, what is the distance between the two vertices that the triangles do not have in common? Express your answer in simplest radical form. [asy]\\ndraw((2,0)--(0,0)--(1,1.732)--(2,1.732)--(2,0)--(1,1.732));\\ndraw((2,1.632)--(1.9,1.632)--(1.9,1.732));\\nlabel(\"$60^\\\\circ$\",(1,1.732),2SE+E);\\nlabel(\"$30^\\\\circ$\",(2,0),5NNW+4N);\\nlabel(\"6\",(1.5,1.732),N);\\n[/asy]', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Multiply the short leg of the right triangle by $\\\\sqrt{3}$ to find that the length of the longer leg is $6\\\\sqrt{3}$ units. Double the short leg of the right triangle to find that the length of the hypotenuse of the right triangle is 12 units. Since the hypotenuse of the right triangle is a side of the equilateral triangle, the side length of the equilateral triangle is also 12 units. By the Pythagorean theorem, the distance between the two vertices that the two triangles do not have in common is $\\\\sqrt{(6\\\\sqrt{3})^2+12^2}=\\\\sqrt{252}=\\\\boxed{6\\\\sqrt{7}}$ units. [asy]\\ndraw((2,0)--(0,0)--(1,sqrt(3))--(2,sqrt(3))--(2,0)--(1,sqrt(3)));\\ndraw((2,sqrt(3)-0.1)--(1.9,sqrt(3)-0.1)--(1.9,sqrt(3)));\\ndraw((0,0)--(2,sqrt(3)));\\nlabel(\"$60^\\\\circ$\",(1,sqrt(3)),2SE+E);\\nlabel(\"$30^\\\\circ$\",(2,0),5NNW+4N);\\nlabel(\"6\",(1.5,sqrt(3)),N);\\nlabel(\"$6\\\\sqrt{3}$\",(2,sqrt(3)/2),E);\\nlabel(\"12\",(1.5,sqrt(3)/2),SW);\\nlabel(\"12\",(1,0),S);\\n[/asy]', 'problem_id': '7', 'response_with_ans': 'We have found the distance between the two vertices that the triangles do not have in common (C and D):\\n\\nx = √(252 + 72√3)\\n\\nThis is the simplest radical form for the required distance. \\n\\nTERMINATE', 'round': 3, 'messages': [{'content': 'A $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle is drawn on the exterior of an equilateral triangle so the hypotenuse of the right triangle is one side of the equilateral triangle. If the shorter leg of the right triangle is 6 units, what is the distance between the two vertices that the triangles do not have in common? Express your answer in simplest radical form. [asy]\\ndraw((2,0)--(0,0)--(1,1.732)--(2,1.732)--(2,0)--(1,1.732));\\ndraw((2,1.632)--(1.9,1.632)--(1.9,1.732));\\nlabel(\"$60^\\\\circ$\",(1,1.732),2SE+E);\\nlabel(\"$30^\\\\circ$\",(2,0),5NNW+4N);\\nlabel(\"6\",(1.5,1.732),N);\\n[/asy]', 'role': 'user'}, {'content': \"Let's denote the vertices of the equilateral triangle as A, B, and C, where A is the right angle of the $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle, B is adjacent to the $30^\\\\circ$ angle, and C is adjacent to the $60^\\\\circ$ angle. Let D be the vertex of the right triangle that is not a part of the equilateral triangle (the $60^\\\\circ$ angle).\\n\\nSince the shorter leg of the right triangle is 6 units (AD) and it is a $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle, we know that:\\n1. The longer leg (BD) has a length equal to the shorter leg times $\\\\sqrt{3}$ ($6\\\\sqrt{3}$ units).\\n2. The hypotenuse (AB) has a length twice the size of the shorter leg (12 units).\\n\\nNow, we are trying to find the distance between vertices C and D. Let's call this distance x.\\n\\nTo calculate x, we can use the law of cosines on triangle $\\\\bigtriangleup BCD$. Here, the angle at vertex B is $180^\\\\circ - 60^\\\\circ = 120^\\\\circ$. We have the lengths of sides BC and BD, so we can calculate x using the law of cosines and then simplify the result.\\n\\nThe law of cosines formula is:\\nx^2 = (BC^2) + (BD^2) - 2(BC)(BD)cos(120^\\\\circ)\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'First, we need the length of side BC, which is equal to the length of side AB (since it is an equilateral triangle). Thus, BC = 12 units. Now we can use the law of cosines formula to find the value of x^2:\\n\\nx^2 = (12^2) + (6√3)^2 - 2(12)(6√3)cos(120º)\\n\\nThe cosine of 120º is -1/2, so:\\n\\nx^2 = 144 + 108 - 2(12)(6√3)(-1/2)\\n', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Let's simplify the expression for x^2:\\n\\nx^2 = 144 + 108 + (12)(6√3)\\n\\nNow we will calculate the result:\\n\\nx^2 = 144 + 108 + 72√3\\n\\nx^2 = 252 + 72√3\\n\\nNow, to find the value of x, we'll take the square root of both sides:\\n\\nx = √(252 + 72√3)\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'We have found the distance between the two vertices that the triangles do not have in common (C and D):\\n\\nx = √(252 + 72√3)\\n\\nThis is the simplest radical form for the required distance. \\n\\nTERMINATE', 'role': 'assistant'}], 'time': 35.88237428665161, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"mostly effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"mostly correct\",\n", + " \"Explanation Clarity\": \"mostly clear\",\n", + " \"Code Efficiency\": \"N/A\",\n", + " \"Code Correctness\": \"N/A\"\n", "}\n", "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"There are $26\\\\cdot 26$ possible two-letter sequences of letters, since we have 26 choices for the first and then 26 choices for the second. But only 52 of these possibilities are valid, so our answer is $\\\\frac{52}{26\\\\cdot 26} =\\\\boxed{ \\\\frac{1}{13}}$.\",\n", - " \"problem_id\": \"8\",\n", - " \"response_with_ans\": \"We'll use a reliable source like USPS to fetch the postal abbreviations, and then calculate the probability.\\n\\n```python\\nimport requests\\nfrom bs4 import BeautifulSoup\\nimport fractions\\n\\nurl = 'https://pe.usps.com/text/pub28/28apb.htm'\\nresponse = requests.get(url)\\nsoup = BeautifulSoup(response.text, 'html.parser')\\n\\n# Find the table containing the state info\\nstate_table = soup.find_all('table')[0]\\n\\n# Extract state abbreviations from the table\\nstate_abbreviations = [row.find_all('td')[-1].text.strip() for row in state_table.find_all('tr')[3:-2]]\\n\\n# Add District of Columbia and Puerto Rico.\\nstate_abbreviations.extend([\\\"DC\\\", \\\"PR\\\"])\\n\\nnum_valid_abbreviations = len(state_abbreviations)\\ntotal_possible_combinations = 26 * 26\\nprobability = fractions.Fraction(num_valid_abbreviations, total_possible_combinations)\\n\\nprint(f\\\"Valid postal abbreviations: {state_abbreviations}\\\")\\nprint(f\\\"Total valid postal abbreviations: {num_valid_abbreviations}\\\")\\nprint(f\\\"Total possible two-letter sequences: {total_possible_combinations}\\\")\\nprint(f\\\"Probability: {probability}\\\")\\n```\\n\\nPlease run the above code, and the output will show you the list of valid postal abbreviations, the total number of valid abbreviations, the total possible two-letter sequences, and the probability.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To calculate the probability, we first need to know the total number of possible two-letter sequences and the number of valid postal abbreviations.\\n\\nTo get the total number of possible two-letter sequences, we'll calculate all possible combinations using 26 letters of the alphabet. There are 26 options for choosing the first letter and 26 options for choosing the second letter, so the possible sequences are 26 * 26.\\n\\nNext, let's find the valid postal abbreviations for all 50 states, District of Columbia, and Puerto Rico. To do this, we can create code to download the postal abbreviations.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"We'll use a reliable source like USPS to fetch the postal abbreviations, and then calculate the probability.\\n\\n```python\\nimport requests\\nfrom bs4 import BeautifulSoup\\nimport fractions\\n\\nurl = 'https://pe.usps.com/text/pub28/28apb.htm'\\nresponse = requests.get(url)\\nsoup = BeautifulSoup(response.text, 'html.parser')\\n\\n# Find the table containing the state info\\nstate_table = soup.find_all('table')[0]\\n\\n# Extract state abbreviations from the table\\nstate_abbreviations = [row.find_all('td')[-1].text.strip() for row in state_table.find_all('tr')[3:-2]]\\n\\n# Add District of Columbia and Puerto Rico.\\nstate_abbreviations.extend([\\\"DC\\\", \\\"PR\\\"])\\n\\nnum_valid_abbreviations = len(state_abbreviations)\\ntotal_possible_combinations = 26 * 26\\nprobability = fractions.Fraction(num_valid_abbreviations, total_possible_combinations)\\n\\nprint(f\\\"Valid postal abbreviations: {state_abbreviations}\\\")\\nprint(f\\\"Total valid postal abbreviations: {num_valid_abbreviations}\\\")\\nprint(f\\\"Total possible two-letter sequences: {total_possible_combinations}\\\")\\nprint(f\\\"Probability: {probability}\\\")\\n```\\n\\nPlease run the above code, and the output will show you the list of valid postal abbreviations, the total number of valid abbreviations, the total possible two-letter sequences, and the probability.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 27.01692771911621,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'The perfect squares from $1$ through $2500,$ inclusive, are printed in a sequence of digits $1491625\\\\ldots2500.$ How many digits are in the sequence?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': \"We consider it by four cases:\\n\\n$\\\\bullet$ Case 1: There are $3$ perfect squares that only have $1$ digit, $1^{2},$ $2^{2},$ and $3^{2}.$\\n\\n$\\\\bullet$ Case 2: The smallest perfect square that has $2$ digits is $4^{2},$ and the largest is $9^{2},$ so that's a total of $6$ perfect squares with $2$ digits.\\n\\n$\\\\bullet$ Case 3: The smallest perfect square with $3$ digits is $10^{2},$ and the largest is $31^{2},$ yielding a total of $22.$\\n\\n$\\\\bullet$ Case 4: The smallest perfect square with $4$ digits is $32^{2},$ and the last one that is no greater than $2500$ is $50^{2},$ giving a total of $19.$\\n\\nSo we have a total of $1\\\\times3+2\\\\times6+3\\\\times22+4\\\\times19=\\\\boxed{157}$ digits.\", 'problem_id': '5', 'response_with_ans': '```python\\ndef count_digits(n):\\n return len(str(n))\\n\\nperfect_squares = [i ** 2 for i in range(1, int(2500 ** 0.5) + 1)]\\ntotal_digits = sum(count_digits(square) for square in perfect_squares)\\n\\nprint(total_digits)\\n```\\n', 'round': 1, 'messages': [{'content': 'The perfect squares from $1$ through $2500,$ inclusive, are printed in a sequence of digits $1491625\\\\ldots2500.$ How many digits are in the sequence?', 'role': 'user'}, {'content': \"To determine the number of digits in the sequence, we can first find all perfect squares from 1 through 2500, inclusive. Then, we can count the number of digits in each perfect square and sum them up. Let's implement this in Python code:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\ndef count_digits(n):\\n return len(str(n))\\n\\nperfect_squares = [i ** 2 for i in range(1, int(2500 ** 0.5) + 1)]\\ntotal_digits = sum(count_digits(square) for square in perfect_squares)\\n\\nprint(total_digits)\\n```\\n', 'role': 'assistant'}], 'time': 9.788081884384155, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"mostly correct\"\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"What is the sum of the lengths, in centimeters, of the two legs of a 30-60-90 right triangle, if the length of the hypotenuse is $2\\\\sqrt{6}$ centimeters?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"We know that the ratio of the lengths of the sides of a 30-60-90 triangle is $1:\\\\sqrt{3}:2$. We know that the length of the hypotenuse is $2\\\\sqrt{6}$ and the ratio of the length shortest leg to that of the hypotenuse is $1:2$. Therefore, the length of the shorter leg is $\\\\sqrt{6}$. Since the ratio of the shorter leg to the longer leg is $1:\\\\sqrt{3}$, the length of the longer leg is $\\\\sqrt{6} \\\\cdot \\\\sqrt{3} = 3\\\\sqrt{2}$. The sum of the lengths of these two legs is $\\\\boxed{\\\\sqrt{6} + 3\\\\sqrt{2}}$ centimeters.\",\n", - " \"problem_id\": \"17\",\n", - " \"response_with_ans\": \"Let's solve for x and find the lengths of the legs.\\n\\n```python\\nimport math\\n\\nhypotenuse = 2 * math.sqrt(6)\\nx = hypotenuse / 2\\n\\nleg_1 = x\\nleg_2 = x * math.sqrt(3)\\nsum_of_legs = leg_1 + leg_2\\n\\nleg_1, leg_2, sum_of_legs\\n```\\n\\nNow, execute the above code to find the sum of the lengths of the two legs.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of the lengths, in centimeters, of the two legs of a 30-60-90 right triangle, if the length of the hypotenuse is $2\\\\sqrt{6}$ centimeters?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of the lengths of the two legs, let's first analyze a 30-60-90 right triangle. In a 30-60-90 triangle, the ratios of the side lengths are:\\n\\n1. Opposite the 30-degree angle: $x$\\n2. Opposite the 60-degree angle: $x\\\\sqrt{3}$\\n3. Hypotenuse (opposite the 90-degree angle): $2x$\\n\\nNow, we are given that the hypotenuse is $2\\\\sqrt{6}$ centimeters. We can set up the following equation and solve for $x$:\\n\\n$2x = 2\\\\sqrt{6}$\\n\\nNext, we will find the lengths of the legs and sum them up.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's solve for x and find the lengths of the legs.\\n\\n```python\\nimport math\\n\\nhypotenuse = 2 * math.sqrt(6)\\nx = hypotenuse / 2\\n\\nleg_1 = x\\nleg_2 = x * math.sqrt(3)\\nsum_of_legs = leg_1 + leg_2\\n\\nleg_1, leg_2, sum_of_legs\\n```\\n\\nNow, execute the above code to find the sum of the lengths of the two legs.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 18.742590188980103,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'In isosceles right triangle $ABC$, point $D$ is on hypotenuse $\\\\overline{BC}$ such that $\\\\overline{AD}$ is an altitude of $\\\\triangle ABC$ and $DC = 5$. What is the area of triangle $ABC$?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'In isosceles right triangle $\\\\triangle ABC$ below, $\\\\overline{AD}$ is the altitude to the hypotenuse.\\n\\n[asy]\\nimport olympiad;\\nunitsize(0.8inch);\\npair A,B,C,D;\\nA = (0,1);\\nB= (1,0);\\nC = -B;\\nD = (0,0);\\ndraw(A--B--C--A,linewidth(1));\\ndraw(A--D,linewidth(0.8));\\ndraw(rightanglemark(C,A,B,s=4));\\ndraw(rightanglemark(C,D,A,s=4));\\nlabel(\"$A$\",A,N);\\nlabel(\"$B$\",B,S);\\nlabel(\"$C$\",C,S);\\nlabel(\"$D$\",D,S);\\n[/asy]\\n\\nBecause $\\\\triangle ABC$ is an isosceles right triangle, $\\\\angle ABC = 45^\\\\circ$. Since $\\\\angle ADB = 90^\\\\circ$, we know that $\\\\angle DAB = 45^\\\\circ$, so $\\\\triangle ABD$ is also a 45-45-90 triangle. Similarly, $\\\\triangle ACD$ is a 45-45-90 triangle. Therefore, $DA=DB = DC = 5$, so $BC = BD+DC = 10$, and \\\\[[ABC] = \\\\frac{(AD)(BC)}{2} = \\\\frac{(5)({10})}{2} = \\\\boxed{25}.\\\\]', 'problem_id': '13', 'response_with_ans': '```python\\nfrom sympy import Eq, solve, symbols\\r\\n\\r\\n# Step 1: Find the length of side AC using the Pythagorean theorem\\r\\na, b, c = symbols(\"a b c\")\\r\\nPythagorean = Eq(a**2 + b**2, c**2)\\r\\n\\r\\n# We know BD = 5 and it\\'s an isosceles right triangle so:\\r\\nBD = 5\\r\\nBC = BD * 2\\r\\nside_A = BD # Side A is equal to BD\\r\\n\\r\\nisosceles_triangle = Pythagorean.subs({a: side_A, b: side_A, c: BC}) # Substitute values into the equation\\r\\nlength_AC = solve(isosceles_triangle, side_A)[0] # Solve for side_AC\\r\\n\\r\\n# Step 2: Find the length of side AD\\r\\nlength_AD = (BD * length_AC) / BC # Using the proportion\\r\\n\\r\\n# Step 3: Find the area of triangle ABC\\r\\narea_triangle_ABC = 0.5 * (length_AD * length_AC)\\r\\n\\r\\nlength_AC, length_AD, area_triangle_ABC\\r\\n```', 'round': 1, 'messages': [{'content': 'In isosceles right triangle $ABC$, point $D$ is on hypotenuse $\\\\overline{BC}$ such that $\\\\overline{AD}$ is an altitude of $\\\\triangle ABC$ and $DC = 5$. What is the area of triangle $ABC$?', 'role': 'user'}, {'content': \"Let's first analyze the problem and draw a plan to solve it.\\n\\nPlan:\\n1. Use the Pythagorean theorem to find the length of side AC.\\n2. Since right triangle ADC is similar to triangle ABC, we can set up a proportion to find the length of side AD.\\n3. Use the area formula of a triangle to find the area of triangle ABC.\\n\\nNow, let's execute the plan using code to calculate side AC, then side AD, and finally the area of triangle ABC.\\n\\n\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\nfrom sympy import Eq, solve, symbols\\r\\n\\r\\n# Step 1: Find the length of side AC using the Pythagorean theorem\\r\\na, b, c = symbols(\"a b c\")\\r\\nPythagorean = Eq(a**2 + b**2, c**2)\\r\\n\\r\\n# We know BD = 5 and it\\'s an isosceles right triangle so:\\r\\nBD = 5\\r\\nBC = BD * 2\\r\\nside_A = BD # Side A is equal to BD\\r\\n\\r\\nisosceles_triangle = Pythagorean.subs({a: side_A, b: side_A, c: BC}) # Substitute values into the equation\\r\\nlength_AC = solve(isosceles_triangle, side_A)[0] # Solve for side_AC\\r\\n\\r\\n# Step 2: Find the length of side AD\\r\\nlength_AD = (BD * length_AC) / BC # Using the proportion\\r\\n\\r\\n# Step 3: Find the area of triangle ABC\\r\\narea_triangle_ABC = 0.5 * (length_AD * length_AC)\\r\\n\\r\\nlength_AC, length_AD, area_triangle_ABC\\r\\n```', 'role': 'assistant'}], 'time': 22.85700249671936, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", + " \"Mathematical Methodology\": \"mostly effective\",\n", + " \"Calculation Correctness\": \"mostly correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Code Correctness\": \"mostly correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Each interior angle of a polygon measures 170 degrees. How many sides does the polygon have?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"The sum of the interior angles of a polygon is $180(n-2)$, where $n$ is the number of sides. That means each interior angle has a measure of $\\\\frac{180(n-2)}{n}$ degrees. We set this equal to 170 degrees and solve for $n$. \\\\begin{align*}\\n\\\\frac{180(n-2)}{n}&=170\\\\\\\\\\n\\\\Rightarrow \\\\qquad 180n-360&=170n\\\\\\\\\\\\Rightarrow \\\\qquad 10n&=360\\\\\\\\\\n\\\\Rightarrow\\\\qquad n&=36.\\n\\\\end{align*} The polygon has $\\\\boxed{36}$ sides.\",\n", - " \"problem_id\": \"14\",\n", - " \"response_with_ans\": \"Quick note: A single backtick should be used for inline code, and triple backticks should be used for code blocks. It seems I accidentally used single backticks for the code block in my previous response. Please use the code below instead:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nprint(sides)\\n```\\n\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Each interior angle of a polygon measures 170 degrees. How many sides does the polygon have?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"We can use the formula for the sum of the interior angles of a polygon to solve this problem. The formula is:\\n\\nsum_of_interior_angles = (n - 2) * 180\\n\\nwhere n is the number of sides.\\n\\nAs each interior angle measures 170 degrees, we can write the equation:\\n\\nn * 170 = (n - 2) * 180\\n\\nLet's solve for the number of sides (n) using Python code.\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nsides\\n\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Quick note: A single backtick should be used for inline code, and triple backticks should be used for code blocks. It seems I accidentally used single backticks for the code block in my previous response. Please use the code below instead:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nprint(sides)\\n```\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 16.42014193534851,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': \"Two numbers are said to be 'relatively prime' if their greatest common factor is 1. How many integers greater than 10 and less than 30 are relatively prime with 28?\", 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Since $28=2^2\\\\cdot 7$, a positive integer is relatively prime with $28$ if and only if it contains neither $2$ nor $7$ in its prime factorization. In other words, we want to count the number of integers between $11$ and $29$ inclusive which are divisible by neither $2$ nor $7$.\\n\\nAll of the odd numbers are not divisible by 2; there are 10 such numbers. The only one of these that is divisible by 7 is 21, so there are $10- 1 =\\\\boxed{9}$ numbers between 10 and 30 that are relatively prime with 28.', 'problem_id': '12', 'response_with_ans': 'I will find the relatively prime numbers with 28 in the range greater than 10 and less than 30 using Python code. I will use the math.gcd function and a loop to check each number in the specified range.\\n\\n```python\\nimport math\\n\\ncount = 0\\nrelatively_prime_numbers = []\\nfor number in range(11, 30):\\n if math.gcd(28, number) == 1:\\n relatively_prime_numbers.append(number)\\n count += 1\\n\\nprint(f\"Relatively prime numbers with 28: {relatively_prime_numbers}\")\\nprint(f\"Number of relatively prime numbers with 28: {count}\")\\n```\\nExecute the above code to find the integers relatively prime with 28 and their count.', 'round': 0, 'messages': [{'content': \"Two numbers are said to be 'relatively prime' if their greatest common factor is 1. How many integers greater than 10 and less than 30 are relatively prime with 28?\", 'role': 'user'}, {'content': 'I will find the relatively prime numbers with 28 in the range greater than 10 and less than 30 using Python code. I will use the math.gcd function and a loop to check each number in the specified range.\\n\\n```python\\nimport math\\n\\ncount = 0\\nrelatively_prime_numbers = []\\nfor number in range(11, 30):\\n if math.gcd(28, number) == 1:\\n relatively_prime_numbers.append(number)\\n count += 1\\n\\nprint(f\"Relatively prime numbers with 28: {relatively_prime_numbers}\")\\nprint(f\"Number of relatively prime numbers with 28: {count}\")\\n```\\nExecute the above code to find the integers relatively prime with 28 and their count.', 'role': 'assistant'}], 'time': 6.9820802211761475, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"completely clear\",\n", - " \"Code Efficiency\": \"very efficient\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Simplify $\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}$.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Both fractions have the same denominator, so we can subtract them: \\\\[\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}=\\\\frac{(2m+8)-(2-m)}{3}\\\\] Distributing the negative sign across the parentheses, we get \\\\[\\\\frac{2m+8-2-(-m)}{3}=\\\\frac{2m+8-2+m}{3}=\\\\frac{3m+6}{3}\\\\] Notice that every number in the numerator has a common factor of 3. We can use the distributive law in reverse to get \\\\[\\\\frac{3m+6}{3}=\\\\frac{3(m+2)}{3}=\\\\frac{\\\\cancel{3}(m+2)}{\\\\cancel{3}}=\\\\boxed{m+2}.\\\\]\",\n", - " \"problem_id\": \"19\",\n", - " \"response_with_ans\": \"\",\n", - " \"round\": 15,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Simplify $\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}$.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To simplify the expression, we can combine the fractions as they have the same denominator:\\n\\n$\\\\frac{2m + 8}{3} - \\\\frac{2 - m}{3} = \\\\frac{(2m + 8) - (2 - m)}{3}$\\n\\nNow, I will calculate the numerator and finish simplifying the expression.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\\n\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"My apologies for the confusion in the previous responses. Let me simplify the expression step by step now. \\n\\nWe have the expression:\\n\\n$\\\\frac{2m + 8}{3} - \\\\frac{2 - m}{3}$\\n\\nWe can combine the fractions as they have the same denominator:\\n\\n$\\\\frac{(2m + 8) - (2 - m)}{3}$\\n\\nLet's further simplify the expression using Python code.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 37.317880630493164,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'What is the product of $0.\\\\overline{05}$ and $1.8$ expressed as a fraction in simplest form?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': \"First let's convert $0.\\\\overline{05}$ to a fraction. Let's define $x=0.\\\\overline{05}$. If we multiply both sides by $100$ we get $100x=5.\\\\overline{05}$ so $99x=5$ and $x=0.\\\\overline{05}=\\\\frac{5}{99}$. Since $1.8=\\\\frac{9}{5}$ we get $$\\\\frac{\\\\cancel{5}}{99}\\\\cdot\\\\frac{9}{\\\\cancel{5}}=\\\\frac{9}{99}=\\\\boxed{\\\\frac{1}{11}}.$$\", 'problem_id': '9', 'response_with_ans': '```python\\nfrom fractions import Fraction\\n\\nfraction_1 = Fraction(5, 99)\\ndecimal_2 = 1.8\\n\\nproduct = fraction_1 * decimal_2\\n\\nsimplified_fraction = Fraction(product).limit_denominator()\\n\\nprint(simplified_fraction)\\n```\\n\\nPlease run the above Python code to find the product of $\\\\frac{5}{99}$ and $1.8$ expressed as a fraction in simplest form.', 'round': 1, 'messages': [{'content': 'What is the product of $0.\\\\overline{05}$ and $1.8$ expressed as a fraction in simplest form?', 'role': 'user'}, {'content': \"To find the product of $0.\\\\overline{05}$ and $1.8$, first we need to convert the repeating decimal $0.\\\\overline{05}$ into a fraction.\\n\\nLet's call the repeating decimal $x$:\\n\\n$$x = 0.\\\\overline{05}$$\\n\\nTo convert the repeating decimal into a fraction, we can multiply both sides by $100$ to shift the decimal point two places to the right:\\n\\n$$100x = 5.\\\\overline{05}$$\\n\\nNow, subtract the first equation from the second equation:\\n\\n$$100x - x = 5.\\\\overline{05} - 0.\\\\overline{05}$$\\n$$99x = 5$$\\n\\nNow, we can solve for $x$:\\n\\n$$x = \\\\frac{5}{99}$$\\n\\nSo the repeating decimal $0.\\\\overline{05}$ is equivalent to the fraction $\\\\frac{5}{99}$.\\n\\nNow, we can find the product of $\\\\frac{5}{99}$ and $1.8$:\\n\\n$$\\\\frac{5}{99} \\\\times 1.8$$\\n\\nFinally, we will simplify the fraction. Let's write the code to compute the product and simplify the result as a fraction.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\nfrom fractions import Fraction\\n\\nfraction_1 = Fraction(5, 99)\\ndecimal_2 = 1.8\\n\\nproduct = fraction_1 * decimal_2\\n\\nsimplified_fraction = Fraction(product).limit_denominator()\\n\\nprint(simplified_fraction)\\n```\\n\\nPlease run the above Python code to find the product of $\\\\frac{5}{99}$ and $1.8$ expressed as a fraction in simplest form.', 'role': 'assistant'}], 'time': 18.04914665222168, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"not at all clear\",\n", - " \"Code Efficiency\": \"not applicable\",\n", - " \"Code Correctness\": \"not applicable\"\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", - "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", - "\n", - "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", - " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"A $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle is drawn on the exterior of an equilateral triangle so the hypotenuse of the right triangle is one side of the equilateral triangle. If the shorter leg of the right triangle is 6 units, what is the distance between the two vertices that the triangles do not have in common? Express your answer in simplest radical form. [asy]\\ndraw((2,0)--(0,0)--(1,1.732)--(2,1.732)--(2,0)--(1,1.732));\\ndraw((2,1.632)--(1.9,1.632)--(1.9,1.732));\\nlabel(\\\"$60^\\\\circ$\\\",(1,1.732),2SE+E);\\nlabel(\\\"$30^\\\\circ$\\\",(2,0),5NNW+4N);\\nlabel(\\\"6\\\",(1.5,1.732),N);\\n[/asy]\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Multiply the short leg of the right triangle by $\\\\sqrt{3}$ to find that the length of the longer leg is $6\\\\sqrt{3}$ units. Double the short leg of the right triangle to find that the length of the hypotenuse of the right triangle is 12 units. Since the hypotenuse of the right triangle is a side of the equilateral triangle, the side length of the equilateral triangle is also 12 units. By the Pythagorean theorem, the distance between the two vertices that the two triangles do not have in common is $\\\\sqrt{(6\\\\sqrt{3})^2+12^2}=\\\\sqrt{252}=\\\\boxed{6\\\\sqrt{7}}$ units. [asy]\\ndraw((2,0)--(0,0)--(1,sqrt(3))--(2,sqrt(3))--(2,0)--(1,sqrt(3)));\\ndraw((2,sqrt(3)-0.1)--(1.9,sqrt(3)-0.1)--(1.9,sqrt(3)));\\ndraw((0,0)--(2,sqrt(3)));\\nlabel(\\\"$60^\\\\circ$\\\",(1,sqrt(3)),2SE+E);\\nlabel(\\\"$30^\\\\circ$\\\",(2,0),5NNW+4N);\\nlabel(\\\"6\\\",(1.5,sqrt(3)),N);\\nlabel(\\\"$6\\\\sqrt{3}$\\\",(2,sqrt(3)/2),E);\\nlabel(\\\"12\\\",(1.5,sqrt(3)/2),SW);\\nlabel(\\\"12\\\",(1,0),S);\\n[/asy]\",\n", - " \"problem_id\": \"7\",\n", - " \"response_with_ans\": \"We have found the distance between the two vertices that the triangles do not have in common (C and D):\\n\\nx = \\u221a(252 + 72\\u221a3)\\n\\nThis is the simplest radical form for the required distance. \\n\\nTERMINATE\",\n", - " \"round\": 3,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"A $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle is drawn on the exterior of an equilateral triangle so the hypotenuse of the right triangle is one side of the equilateral triangle. If the shorter leg of the right triangle is 6 units, what is the distance between the two vertices that the triangles do not have in common? Express your answer in simplest radical form. [asy]\\ndraw((2,0)--(0,0)--(1,1.732)--(2,1.732)--(2,0)--(1,1.732));\\ndraw((2,1.632)--(1.9,1.632)--(1.9,1.732));\\nlabel(\\\"$60^\\\\circ$\\\",(1,1.732),2SE+E);\\nlabel(\\\"$30^\\\\circ$\\\",(2,0),5NNW+4N);\\nlabel(\\\"6\\\",(1.5,1.732),N);\\n[/asy]\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's denote the vertices of the equilateral triangle as A, B, and C, where A is the right angle of the $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle, B is adjacent to the $30^\\\\circ$ angle, and C is adjacent to the $60^\\\\circ$ angle. Let D be the vertex of the right triangle that is not a part of the equilateral triangle (the $60^\\\\circ$ angle).\\n\\nSince the shorter leg of the right triangle is 6 units (AD) and it is a $30^\\\\circ$-$60^\\\\circ$-$90^\\\\circ$ triangle, we know that:\\n1. The longer leg (BD) has a length equal to the shorter leg times $\\\\sqrt{3}$ ($6\\\\sqrt{3}$ units).\\n2. The hypotenuse (AB) has a length twice the size of the shorter leg (12 units).\\n\\nNow, we are trying to find the distance between vertices C and D. Let's call this distance x.\\n\\nTo calculate x, we can use the law of cosines on triangle $\\\\bigtriangleup BCD$. Here, the angle at vertex B is $180^\\\\circ - 60^\\\\circ = 120^\\\\circ$. We have the lengths of sides BC and BD, so we can calculate x using the law of cosines and then simplify the result.\\n\\nThe law of cosines formula is:\\nx^2 = (BC^2) + (BD^2) - 2(BC)(BD)cos(120^\\\\circ)\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"First, we need the length of side BC, which is equal to the length of side AB (since it is an equilateral triangle). Thus, BC = 12 units. Now we can use the law of cosines formula to find the value of x^2:\\n\\nx^2 = (12^2) + (6\\u221a3)^2 - 2(12)(6\\u221a3)cos(120\\u00ba)\\n\\nThe cosine of 120\\u00ba is -1/2, so:\\n\\nx^2 = 144 + 108 - 2(12)(6\\u221a3)(-1/2)\\n\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's simplify the expression for x^2:\\n\\nx^2 = 144 + 108 + (12)(6\\u221a3)\\n\\nNow we will calculate the result:\\n\\nx^2 = 144 + 108 + 72\\u221a3\\n\\nx^2 = 252 + 72\\u221a3\\n\\nNow, to find the value of x, we'll take the square root of both sides:\\n\\nx = \\u221a(252 + 72\\u221a3)\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"We have found the distance between the two vertices that the triangles do not have in common (C and D):\\n\\nx = \\u221a(252 + 72\\u221a3)\\n\\nThis is the simplest radical form for the required distance. \\n\\nTERMINATE\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 35.88237428665161,\n", - " \"trial\": -1\n", - "}\n", + "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", + "\n", + "Task: Math problem solving.\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " }\n", + "]actual test case to evaluate: {'problem': 'All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'There are $26\\\\cdot 26$ possible two-letter sequences of letters, since we have 26 choices for the first and then 26 choices for the second. But only 52 of these possibilities are valid, so our answer is $\\\\frac{52}{26\\\\cdot 26} =\\\\boxed{ \\\\frac{1}{13}}$.', 'problem_id': '8', 'response_with_ans': 'We\\'ll use a reliable source like USPS to fetch the postal abbreviations, and then calculate the probability.\\n\\n```python\\nimport requests\\nfrom bs4 import BeautifulSoup\\nimport fractions\\n\\nurl = \\'https://pe.usps.com/text/pub28/28apb.htm\\'\\nresponse = requests.get(url)\\nsoup = BeautifulSoup(response.text, \\'html.parser\\')\\n\\n# Find the table containing the state info\\nstate_table = soup.find_all(\\'table\\')[0]\\n\\n# Extract state abbreviations from the table\\nstate_abbreviations = [row.find_all(\\'td\\')[-1].text.strip() for row in state_table.find_all(\\'tr\\')[3:-2]]\\n\\n# Add District of Columbia and Puerto Rico.\\nstate_abbreviations.extend([\"DC\", \"PR\"])\\n\\nnum_valid_abbreviations = len(state_abbreviations)\\ntotal_possible_combinations = 26 * 26\\nprobability = fractions.Fraction(num_valid_abbreviations, total_possible_combinations)\\n\\nprint(f\"Valid postal abbreviations: {state_abbreviations}\")\\nprint(f\"Total valid postal abbreviations: {num_valid_abbreviations}\")\\nprint(f\"Total possible two-letter sequences: {total_possible_combinations}\")\\nprint(f\"Probability: {probability}\")\\n```\\n\\nPlease run the above code, and the output will show you the list of valid postal abbreviations, the total number of valid abbreviations, the total possible two-letter sequences, and the probability.', 'round': 1, 'messages': [{'content': 'All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.', 'role': 'user'}, {'content': \"To calculate the probability, we first need to know the total number of possible two-letter sequences and the number of valid postal abbreviations.\\n\\nTo get the total number of possible two-letter sequences, we'll calculate all possible combinations using 26 letters of the alphabet. There are 26 options for choosing the first letter and 26 options for choosing the second letter, so the possible sequences are 26 * 26.\\n\\nNext, let's find the valid postal abbreviations for all 50 states, District of Columbia, and Puerto Rico. To do this, we can create code to download the postal abbreviations.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'We\\'ll use a reliable source like USPS to fetch the postal abbreviations, and then calculate the probability.\\n\\n```python\\nimport requests\\nfrom bs4 import BeautifulSoup\\nimport fractions\\n\\nurl = \\'https://pe.usps.com/text/pub28/28apb.htm\\'\\nresponse = requests.get(url)\\nsoup = BeautifulSoup(response.text, \\'html.parser\\')\\n\\n# Find the table containing the state info\\nstate_table = soup.find_all(\\'table\\')[0]\\n\\n# Extract state abbreviations from the table\\nstate_abbreviations = [row.find_all(\\'td\\')[-1].text.strip() for row in state_table.find_all(\\'tr\\')[3:-2]]\\n\\n# Add District of Columbia and Puerto Rico.\\nstate_abbreviations.extend([\"DC\", \"PR\"])\\n\\nnum_valid_abbreviations = len(state_abbreviations)\\ntotal_possible_combinations = 26 * 26\\nprobability = fractions.Fraction(num_valid_abbreviations, total_possible_combinations)\\n\\nprint(f\"Valid postal abbreviations: {state_abbreviations}\")\\nprint(f\"Total valid postal abbreviations: {num_valid_abbreviations}\")\\nprint(f\"Total possible two-letter sequences: {total_possible_combinations}\")\\nprint(f\"Probability: {probability}\")\\n```\\n\\nPlease run the above code, and the output will show you the list of valid postal abbreviations, the total number of valid abbreviations, the total possible two-letter sequences, and the probability.', 'role': 'assistant'}], 'time': 27.01692771911621, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", - " \"Problem Interpretation\": \"mostly accurate\",\n", + " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"mostly correct\",\n", - " \"Explanation Clarity\": \"moderately clear\",\n", - " \"Code Efficiency\": \"not applicable\",\n", - " \"Code Correctness\": \"not applicable\"\n", + " \"Calculation Correctness\": \"completely correct\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\\\overline{PQ}$ is $B$, the midpoint of $\\\\overline{BQ}$ is $C$, and the midpoint of $\\\\overline{PC}$ is $D$. What is the coordinate of $D$?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Since $B$ is midpoint of $\\\\overline{PQ}$, the coordinate of $B$ is $(8+48)/2 = 4+24 = 28$. Since $C$ is the midpoint of $\\\\overline{BQ}$, the coordinate of $C$ is $(28+48)/2 = 14+24=38$. Since $D$ is the midpoint of $\\\\overline{PC}$, the coordinate of $D$ is $(8+38)/2 = 4 + 19 = \\\\boxed{23}$.\\n\\n[asy]\\npair P, Q, B, C, D;\\n\\nP = (8,0);\\nQ = (48,0);\\nB = (P+Q)/2;\\nC = (B+Q)/2;\\nD = (P+C)/2;\\n\\ndot(P);\\ndot(Q);\\ndot(B);\\ndot(C);\\ndot(D);\\n\\ndraw(P--Q);\\nlabel(\\\"$P$\\\",P,S);\\nlabel(\\\"$Q$\\\",Q,S);\\nlabel(\\\"$B$\\\",B,S);\\nlabel(\\\"$C$\\\",C,S);\\nlabel(\\\"$D$\\\",D,S);\\n[/asy]\",\n", - " \"problem_id\": \"6\",\n", - " \"response_with_ans\": \"Let's find the coordinates using Python code.\\n\\n```python\\nP = 8\\nQ = 48\\n\\n# Find the midpoint of PQ (B)\\nB = (P + Q) / 2\\n\\n# Find the midpoint of BQ (C)\\nC = (B + Q) / 2\\n\\n# Find the midpoint of PC (D)\\nD = (P + C) / 2\\n\\nD\\n```\\n\\nExecute the above Python code to find the coordinate of D.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\\\overline{PQ}$ is $B$, the midpoint of $\\\\overline{BQ}$ is $C$, and the midpoint of $\\\\overline{PC}$ is $D$. What is the coordinate of $D$?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the coordinate of D, we need to follow these steps:\\n1. Find the coordinate of B (midpoint of PQ)\\n2. Find the coordinate of C (midpoint of BQ)\\n3. Find the coordinate of D (midpoint of PC)\\n\\nI'll use Python code to find the coordinates of B, C, and D.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's find the coordinates using Python code.\\n\\n```python\\nP = 8\\nQ = 48\\n\\n# Find the midpoint of PQ (B)\\nB = (P + Q) / 2\\n\\n# Find the midpoint of BQ (C)\\nC = (B + Q) / 2\\n\\n# Find the midpoint of PC (D)\\nD = (P + C) / 2\\n\\nD\\n```\\n\\nExecute the above Python code to find the coordinate of D.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 9.041668176651001,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\\\overline{PQ}$ is $B$, the midpoint of $\\\\overline{BQ}$ is $C$, and the midpoint of $\\\\overline{PC}$ is $D$. What is the coordinate of $D$?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Since $B$ is midpoint of $\\\\overline{PQ}$, the coordinate of $B$ is $(8+48)/2 = 4+24 = 28$. Since $C$ is the midpoint of $\\\\overline{BQ}$, the coordinate of $C$ is $(28+48)/2 = 14+24=38$. Since $D$ is the midpoint of $\\\\overline{PC}$, the coordinate of $D$ is $(8+38)/2 = 4 + 19 = \\\\boxed{23}$.\\n\\n[asy]\\npair P, Q, B, C, D;\\n\\nP = (8,0);\\nQ = (48,0);\\nB = (P+Q)/2;\\nC = (B+Q)/2;\\nD = (P+C)/2;\\n\\ndot(P);\\ndot(Q);\\ndot(B);\\ndot(C);\\ndot(D);\\n\\ndraw(P--Q);\\nlabel(\"$P$\",P,S);\\nlabel(\"$Q$\",Q,S);\\nlabel(\"$B$\",B,S);\\nlabel(\"$C$\",C,S);\\nlabel(\"$D$\",D,S);\\n[/asy]', 'problem_id': '6', 'response_with_ans': \"Let's find the coordinates using Python code.\\n\\n```python\\nP = 8\\nQ = 48\\n\\n# Find the midpoint of PQ (B)\\nB = (P + Q) / 2\\n\\n# Find the midpoint of BQ (C)\\nC = (B + Q) / 2\\n\\n# Find the midpoint of PC (D)\\nD = (P + C) / 2\\n\\nD\\n```\\n\\nExecute the above Python code to find the coordinate of D.\", 'round': 1, 'messages': [{'content': 'On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\\\overline{PQ}$ is $B$, the midpoint of $\\\\overline{BQ}$ is $C$, and the midpoint of $\\\\overline{PC}$ is $D$. What is the coordinate of $D$?', 'role': 'user'}, {'content': \"To find the coordinate of D, we need to follow these steps:\\n1. Find the coordinate of B (midpoint of PQ)\\n2. Find the coordinate of C (midpoint of BQ)\\n3. Find the coordinate of D (midpoint of PC)\\n\\nI'll use Python code to find the coordinates of B, C, and D.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Let's find the coordinates using Python code.\\n\\n```python\\nP = 8\\nQ = 48\\n\\n# Find the midpoint of PQ (B)\\nB = (P + Q) / 2\\n\\n# Find the midpoint of BQ (C)\\nC = (B + Q) / 2\\n\\n# Find the midpoint of PC (D)\\nD = (P + C) / 2\\n\\nD\\n```\\n\\nExecute the above Python code to find the coordinate of D.\", 'role': 'assistant'}], 'time': 9.041668176651001, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"very efficient\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", " \"Code Correctness\": \"completely correct\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Triangle $ABC$ is a right triangle. If the measure of angle $PAB$ is $x^\\\\circ$ and the measure of angle $ACB$ is expressed in the form $(Mx+N)^\\\\circ$ with $M=1$, what is the value of $M+N$?\\n\\n[asy]\\ndraw((-10,0)--(20,0),linewidth(1),Arrows);\\ndraw((0,0)--(10,10/sqrt(3))--(10+10/3,0),linewidth(1));\\n\\ndraw((10,10/sqrt(3))+dir(-150)--(10,10/sqrt(3))+dir(-150)+dir(-60)--(10,10/sqrt(3))+dir(-60),linewidth(1));\\n\\ndot((-3,0));\\n\\ndraw(dir(180)..dir(105)..dir(30),linewidth(1));\\n\\nlabel(\\\"P\\\",(-3,0),NW);\\nlabel(\\\"A\\\",(0,0),S);\\nlabel(\\\"$x^\\\\circ$\\\",(-1,1),N);\\nlabel(\\\"B\\\",(10,10/sqrt(3)),N);\\nlabel(\\\"C\\\",(10+10/3,0),NE);\\n\\n[/asy]\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Since $\\\\angle PAB$ and $\\\\angle BAC$ are supplementary, $\\\\angle BAC = 180^{\\\\circ} - x^\\\\circ$. Since the three angles of a triangle add up to $ 180^{\\\\circ} $, we have $\\\\angle ACB = 180^{\\\\circ} - 90^{\\\\circ} - (180^{\\\\circ} - x^\\\\circ) = x^\\\\circ - 90^{\\\\circ}$. Thus, $M + N = \\\\boxed{-89}$.\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"We know that $x + y = 180^\\\\circ$. From this equation, we can express $y$ in terms of $x$: $y = 180^\\\\circ - x$.\\n\\nNow we substitute the expression for $y$ in the expression of angle $ACB$:\\n$ACB = 90^\\\\circ - y = 90^\\\\circ - (180^\\\\circ - x) = x - 90^\\\\circ$.\\n\\nComparing this expression with the given form $(Mx + N)^\\\\circ$, we can see that $M = 1$ and $N = -90$. Therefore, $M + N = 1 + (-90) = -89$.\\n\\nThe value of $M + N$ is $-89$.\",\n", - " \"round\": 2,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Triangle $ABC$ is a right triangle. If the measure of angle $PAB$ is $x^\\\\circ$ and the measure of angle $ACB$ is expressed in the form $(Mx+N)^\\\\circ$ with $M=1$, what is the value of $M+N$?\\n\\n[asy]\\ndraw((-10,0)--(20,0),linewidth(1),Arrows);\\ndraw((0,0)--(10,10/sqrt(3))--(10+10/3,0),linewidth(1));\\n\\ndraw((10,10/sqrt(3))+dir(-150)--(10,10/sqrt(3))+dir(-150)+dir(-60)--(10,10/sqrt(3))+dir(-60),linewidth(1));\\n\\ndot((-3,0));\\n\\ndraw(dir(180)..dir(105)..dir(30),linewidth(1));\\n\\nlabel(\\\"P\\\",(-3,0),NW);\\nlabel(\\\"A\\\",(0,0),S);\\nlabel(\\\"$x^\\\\circ$\\\",(-1,1),N);\\nlabel(\\\"B\\\",(10,10/sqrt(3)),N);\\nlabel(\\\"C\\\",(10+10/3,0),NE);\\n\\n[/asy]\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To solve this problem, we need to find the relationship between angle $x$ and angle $ACB$. We know that triangle $ABC$ is a right triangle, so the sum of angles $ACB$ and $ABC$ must be $90^\\\\circ$. Let's call the angle $ABC$ as $y$. Therefore, angle $ACB = 90^\\\\circ - y$.\\n\\nNow, we need to find the relationship between angles $x$ and $y$. Since $PAB$ and $ABC$ share the same vertex $A$ and angle $PAB$ is opposite angle $ABC$, they are explementary (their sum is $180^\\\\circ$). So, we have $x + y = 180^\\\\circ$.\\n\\nNow, we can express $y$ in terms of $x$ and then substitute the expression for $y$ in the expression of angle $ACB$. Finally, we can compare angle $ACB$'s expression with the given form $(Mx + N)^\\\\circ$ to find the values of $M$ and $N$.\\n\\nLet me do the calculations.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"We know that $x + y = 180^\\\\circ$. From this equation, we can express $y$ in terms of $x$: $y = 180^\\\\circ - x$.\\n\\nNow we substitute the expression for $y$ in the expression of angle $ACB$:\\n$ACB = 90^\\\\circ - y = 90^\\\\circ - (180^\\\\circ - x) = x - 90^\\\\circ$.\\n\\nComparing this expression with the given form $(Mx + N)^\\\\circ$, we can see that $M = 1$ and $N = -90$. Therefore, $M + N = 1 + (-90) = -89$.\\n\\nThe value of $M + N$ is $-89$.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"TERMINATE\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 28.305670976638794,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'Triangle $ABC$ is a right triangle. If the measure of angle $PAB$ is $x^\\\\circ$ and the measure of angle $ACB$ is expressed in the form $(Mx+N)^\\\\circ$ with $M=1$, what is the value of $M+N$?\\n\\n[asy]\\ndraw((-10,0)--(20,0),linewidth(1),Arrows);\\ndraw((0,0)--(10,10/sqrt(3))--(10+10/3,0),linewidth(1));\\n\\ndraw((10,10/sqrt(3))+dir(-150)--(10,10/sqrt(3))+dir(-150)+dir(-60)--(10,10/sqrt(3))+dir(-60),linewidth(1));\\n\\ndot((-3,0));\\n\\ndraw(dir(180)..dir(105)..dir(30),linewidth(1));\\n\\nlabel(\"P\",(-3,0),NW);\\nlabel(\"A\",(0,0),S);\\nlabel(\"$x^\\\\circ$\",(-1,1),N);\\nlabel(\"B\",(10,10/sqrt(3)),N);\\nlabel(\"C\",(10+10/3,0),NE);\\n\\n[/asy]', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Since $\\\\angle PAB$ and $\\\\angle BAC$ are supplementary, $\\\\angle BAC = 180^{\\\\circ} - x^\\\\circ$. Since the three angles of a triangle add up to $ 180^{\\\\circ} $, we have $\\\\angle ACB = 180^{\\\\circ} - 90^{\\\\circ} - (180^{\\\\circ} - x^\\\\circ) = x^\\\\circ - 90^{\\\\circ}$. Thus, $M + N = \\\\boxed{-89}$.', 'problem_id': '0', 'response_with_ans': 'We know that $x + y = 180^\\\\circ$. From this equation, we can express $y$ in terms of $x$: $y = 180^\\\\circ - x$.\\n\\nNow we substitute the expression for $y$ in the expression of angle $ACB$:\\n$ACB = 90^\\\\circ - y = 90^\\\\circ - (180^\\\\circ - x) = x - 90^\\\\circ$.\\n\\nComparing this expression with the given form $(Mx + N)^\\\\circ$, we can see that $M = 1$ and $N = -90$. Therefore, $M + N = 1 + (-90) = -89$.\\n\\nThe value of $M + N$ is $-89$.', 'round': 2, 'messages': [{'content': 'Triangle $ABC$ is a right triangle. If the measure of angle $PAB$ is $x^\\\\circ$ and the measure of angle $ACB$ is expressed in the form $(Mx+N)^\\\\circ$ with $M=1$, what is the value of $M+N$?\\n\\n[asy]\\ndraw((-10,0)--(20,0),linewidth(1),Arrows);\\ndraw((0,0)--(10,10/sqrt(3))--(10+10/3,0),linewidth(1));\\n\\ndraw((10,10/sqrt(3))+dir(-150)--(10,10/sqrt(3))+dir(-150)+dir(-60)--(10,10/sqrt(3))+dir(-60),linewidth(1));\\n\\ndot((-3,0));\\n\\ndraw(dir(180)..dir(105)..dir(30),linewidth(1));\\n\\nlabel(\"P\",(-3,0),NW);\\nlabel(\"A\",(0,0),S);\\nlabel(\"$x^\\\\circ$\",(-1,1),N);\\nlabel(\"B\",(10,10/sqrt(3)),N);\\nlabel(\"C\",(10+10/3,0),NE);\\n\\n[/asy]', 'role': 'user'}, {'content': \"To solve this problem, we need to find the relationship between angle $x$ and angle $ACB$. We know that triangle $ABC$ is a right triangle, so the sum of angles $ACB$ and $ABC$ must be $90^\\\\circ$. Let's call the angle $ABC$ as $y$. Therefore, angle $ACB = 90^\\\\circ - y$.\\n\\nNow, we need to find the relationship between angles $x$ and $y$. Since $PAB$ and $ABC$ share the same vertex $A$ and angle $PAB$ is opposite angle $ABC$, they are explementary (their sum is $180^\\\\circ$). So, we have $x + y = 180^\\\\circ$.\\n\\nNow, we can express $y$ in terms of $x$ and then substitute the expression for $y$ in the expression of angle $ACB$. Finally, we can compare angle $ACB$'s expression with the given form $(Mx + N)^\\\\circ$ to find the values of $M$ and $N$.\\n\\nLet me do the calculations.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'We know that $x + y = 180^\\\\circ$. From this equation, we can express $y$ in terms of $x$: $y = 180^\\\\circ - x$.\\n\\nNow we substitute the expression for $y$ in the expression of angle $ACB$:\\n$ACB = 90^\\\\circ - y = 90^\\\\circ - (180^\\\\circ - x) = x - 90^\\\\circ$.\\n\\nComparing this expression with the given form $(Mx + N)^\\\\circ$, we can see that $M = 1$ and $N = -90$. Therefore, $M + N = 1 + (-90) = -89$.\\n\\nThe value of $M + N$ is $-89$.', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': 'TERMINATE', 'role': 'assistant'}], 'time': 28.305670976638794, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"not at all efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely correct\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"not applicable\",\n", + " \"Code Correctness\": \"not applicable\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Simplify: $\\\\frac{\\\\sqrt{2.5^2-0.7^2}}{2.7-2.5}$.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"We have \\\\begin{align*}\\n\\\\frac{\\\\sqrt{2.5^2 - 0.7^2}}{2.7-2.5} &= \\\\frac{\\\\sqrt{6.25 - 0.49}}{2.7-2.5} = \\\\frac{\\\\sqrt{5.76}}{0.2} = \\\\frac{\\\\sqrt{576/100}}{0.2}\\\\\\\\\\n&= \\\\frac{\\\\sqrt{576}/\\\\sqrt{100}}{0.2} = \\\\frac{24/10}{0.2} = \\\\frac{2.4}{0.2} = \\\\boxed{12}.\\\\end{align*}\",\n", - " \"problem_id\": \"2\",\n", - " \"response_with_ans\": \"```python\\nimport math\\n\\nnumerator = math.sqrt(2.5 ** 2 - 0.7 ** 2)\\ndenominator = 2.7 - 2.5\\n\\nresult = numerator / denominator\\nresult\\n```\\n\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Simplify: $\\\\frac{\\\\sqrt{2.5^2-0.7^2}}{2.7-2.5}$.\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To simplify the given expression, let's break it down step by step and use Python to calculate the values:\\n\\nStep 1: Calculate the values inside the square root: $2.5^2 - 0.7^2$\\nStep 2: Take the square root of the result\\nStep 3: Calculate the denominator: $2.7 - 2.5$\\nStep 4: Divide the result of Step 2 by the result of Step 3\\n\\nNow, let's execute Python code to get the result.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"```python\\nimport math\\n\\nnumerator = math.sqrt(2.5 ** 2 - 0.7 ** 2)\\ndenominator = 2.7 - 2.5\\n\\nresult = numerator / denominator\\nresult\\n```\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 8.612567901611328,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'John counts up from 1 to 13, and then immediately counts down again to 1, and then back up to 13, and so on, alternately counting up and down: \\\\begin{align*}\\n&(1, 2, 3,4,5,6,7,8,9,10,11,12,13,\\\\\\\\\\n&\\\\qquad\\\\qquad12,11,10,9,8,7,6,5,4,3,2,1,2,3,4,\\\\ldots ).\\n\\\\end{align*} What is the $5000^{\\\\text{th}}$ integer in his list?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'We can treat this list as a sequence with a repetitive pattern. We see the sequence repeats itself every 24 elements (from 1 to 13 then back to 2). When 5000 is divided by 24, its remainder is 8. Therefore we see the $5000^{\\\\text{th}}$ integer is the same as the $8^{\\\\text{th}}$ integer, which is $\\\\boxed{8}$.', 'problem_id': '4', 'response_with_ans': 'We can solve this problem by finding which \"group\" the 5000th number belongs to, and then determine the number within that group. Each group is either counting up from 1 to 13 or counting down from 13 to 1. There are 13 numbers in each direction, but since 1 is counted twice (once at the end of a down-count and once at the beginning of an up-count), counting up and down would consist of 25 numbers (13 up + 12 down).\\n\\nLet\\'s first find out which group the 5000th number belongs to and how many numbers in total are in the complete up and down groups before 5000th number using code:\\n\\n```python\\nnum_group = 5000 // 25\\nremainder = 5000 % 25\\ntotal_numbers_before = num_group * 25\\nprint(\\'Number of complete groups:\\', num_group)\\nprint(\\'Remainder:\\', remainder)\\nprint(\\'Numbers before 5000th number:\\', total_numbers_before)\\n```\\n\\nAfter knowing these values, I\\'ll determine if the 5000th number is in up-count or down-count and its value.\\n', 'round': 0, 'messages': [{'content': 'John counts up from 1 to 13, and then immediately counts down again to 1, and then back up to 13, and so on, alternately counting up and down: \\\\begin{align*}\\n&(1, 2, 3,4,5,6,7,8,9,10,11,12,13,\\\\\\\\\\n&\\\\qquad\\\\qquad12,11,10,9,8,7,6,5,4,3,2,1,2,3,4,\\\\ldots ).\\n\\\\end{align*} What is the $5000^{\\\\text{th}}$ integer in his list?', 'role': 'user'}, {'content': 'We can solve this problem by finding which \"group\" the 5000th number belongs to, and then determine the number within that group. Each group is either counting up from 1 to 13 or counting down from 13 to 1. There are 13 numbers in each direction, but since 1 is counted twice (once at the end of a down-count and once at the beginning of an up-count), counting up and down would consist of 25 numbers (13 up + 12 down).\\n\\nLet\\'s first find out which group the 5000th number belongs to and how many numbers in total are in the complete up and down groups before 5000th number using code:\\n\\n```python\\nnum_group = 5000 // 25\\nremainder = 5000 % 25\\ntotal_numbers_before = num_group * 25\\nprint(\\'Number of complete groups:\\', num_group)\\nprint(\\'Remainder:\\', remainder)\\nprint(\\'Numbers before 5000th number:\\', total_numbers_before)\\n```\\n\\nAfter knowing these values, I\\'ll determine if the 5000th number is in up-count or down-count and its value.\\n', 'role': 'assistant'}], 'time': 16.342331409454346, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"mostly effective\",\n", + " \"Calculation Correctness\": \"mostly correct\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"mostly correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"Bekah has exactly three brass house number digits: 2, 3 and 5. How many distinct numbers can she form using one or more of the digits?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"If Bekah uses only one digit, she can form three numbers. If she uses two digits, she has three choices for the tens place and two for the units, so she can form six numbers. Finally, if Bekah uses all three digits, she has three choices for the hundreds place, two for the tens, and one for the units, so she can form six numbers. Thus, Bekah can form $3 + 6 + 6 = \\\\boxed{15}$ distinct numbers.\",\n", - " \"problem_id\": \"11\",\n", - " \"response_with_ans\": \"To find the answer, we'll consider all possible combinations that Bekah can make using the three digits she has:\\n1. Single-digit numbers: 2, 3, and 5\\n2. Two-digits numbers using the digits: 23, 25, 32, 35, 52, and 53\\n3. Three-digits number using all the digits: 235, 253, 325, 352, 523, and 532\\n\\nLet's calculate the number of distinct numbers Bekah can form using one or more of the digits with Python code:\\n\\n```python\\nfrom itertools import permutations\\n\\ndigits = [2, 3, 5]\\n\\n# Single-digit numbers\\none_digit_numbers = set(digits)\\n\\n# Two-digits numbers\\ntwo_digit_numbers = set(int(''.join(map(str, p))) for p in permutations(digits, 2))\\n\\n# Three-digits numbers\\nthree_digit_numbers = set(int(''.join(map(str, p))) for p in permutations(digits, 3))\\n\\ndistinct_numbers = one_digit_numbers | two_digit_numbers | three_digit_numbers\\ntotal_distinct_numbers = len(distinct_numbers)\\n\\nprint(\\\"Distinct numbers:\\\", distinct_numbers)\\nprint(\\\"Total distinct numbers:\\\", total_distinct_numbers)\\n```\\n\\nPlease run the above Python code to calculate the total number of distinct numbers Bekah can form using one or more of the digits.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"Bekah has exactly three brass house number digits: 2, 3 and 5. How many distinct numbers can she form using one or more of the digits?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the answer, we'll consider all possible combinations that Bekah can make using the three digits she has:\\n1. Single-digit numbers: 2, 3, and 5\\n2. Two-digits numbers using the digits: 23, 25, 32, 35, 52, and 53\\n3. Three-digits number using all the digits: 235, 253, 325, 352, 523, and 532\\n\\nLet's calculate the number of distinct numbers Bekah can form using one or more of the digits with Python code:\\n\\n```python\\nfrom itertools import permutations\\n\\ndigits = [2, 3, 5]\\n\\n# Single-digit numbers\\none_digit_numbers = set(digits)\\n\\n# Two-digits numbers\\ntwo_digit_numbers = set(int(''.join(map(str, p))) for p in permutations(digits, 2))\\n\\n# Three-digits numbers\\nthree_digit_numbers = set(int(''.join(map(str, p))) for p in permutations(digits, 3))\\n\\ndistinct_numbers = one_digit_numbers | two_digit_numbers | three_digit_numbers\\ntotal_distinct_numbers = len(distinct_numbers)\\n\\nprint(\\\"Distinct numbers:\\\", distinct_numbers)\\nprint(\\\"Total distinct numbers:\\\", total_distinct_numbers)\\n```\\n\\nPlease run the above Python code to calculate the total number of distinct numbers Bekah can form using one or more of the digits.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 15.620970249176025,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'Each interior angle of a polygon measures 170 degrees. How many sides does the polygon have?', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'The sum of the interior angles of a polygon is $180(n-2)$, where $n$ is the number of sides. That means each interior angle has a measure of $\\\\frac{180(n-2)}{n}$ degrees. We set this equal to 170 degrees and solve for $n$. \\\\begin{align*}\\n\\\\frac{180(n-2)}{n}&=170\\\\\\\\\\n\\\\Rightarrow \\\\qquad 180n-360&=170n\\\\\\\\\\\\Rightarrow \\\\qquad 10n&=360\\\\\\\\\\n\\\\Rightarrow\\\\qquad n&=36.\\n\\\\end{align*} The polygon has $\\\\boxed{36}$ sides.', 'problem_id': '14', 'response_with_ans': \"Quick note: A single backtick should be used for inline code, and triple backticks should be used for code blocks. It seems I accidentally used single backticks for the code block in my previous response. Please use the code below instead:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nprint(sides)\\n```\\n\", 'round': 1, 'messages': [{'content': 'Each interior angle of a polygon measures 170 degrees. How many sides does the polygon have?', 'role': 'user'}, {'content': \"We can use the formula for the sum of the interior angles of a polygon to solve this problem. The formula is:\\n\\nsum_of_interior_angles = (n - 2) * 180\\n\\nwhere n is the number of sides.\\n\\nAs each interior angle measures 170 degrees, we can write the equation:\\n\\nn * 170 = (n - 2) * 180\\n\\nLet's solve for the number of sides (n) using Python code.\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nsides\\n\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"Quick note: A single backtick should be used for inline code, and triple backticks should be used for code blocks. It seems I accidentally used single backticks for the code block in my previous response. Please use the code below instead:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols('n')\\nequation = Eq(n * 170, (n - 2) * 180)\\nsides = solve(equation, n)[0]\\nprint(sides)\\n```\\n\", 'role': 'assistant'}], 'time': 16.42014193534851, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", - " \"Problem Interpretation\": \"completely accurate\",\n", - " \"Mathematical Methodology\": \"completely effective\",\n", - " \"Calculation Correctness\": \"completely correct\",\n", - " \"Explanation Clarity\": \"completely clear\",\n", - " \"Code Efficiency\": \"very efficient\",\n", - " \"Code Correctness\": \"completely correct\"\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely correct\",\n", + " \"Explanation Clarity\": \"very clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: false\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " }\n", + "]actual test case to evaluate: {'problem': 'An equilateral triangle has a side of length 12 inches. What is the area of the triangle, in square inches? Express your answer in simplest radical form.', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'The area of an equilateral triangle with side length $s$ is $s^2\\\\sqrt{3}/4$. We have $s = 12$, so our area is $12^2\\\\sqrt{3}/4 = \\\\boxed{36\\\\sqrt{3}}$.', 'problem_id': '18', 'response_with_ans': '```python\\nimport math\\n\\nside_length = 12\\narea = (math.sqrt(3) * side_length**2) / 4\\nprint(area)\\n```', 'round': 1, 'messages': [{'content': 'An equilateral triangle has a side of length 12 inches. What is the area of the triangle, in square inches? Express your answer in simplest radical form.', 'role': 'user'}, {'content': \"To find the area of an equilateral triangle with a side of length s, we can use the formula:\\n\\narea = (sqrt(3) * s²) / 4\\n\\nIn this case, s = 12 inches. Let's calculate the area.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\nimport math\\n\\nside_length = 12\\narea = (math.sqrt(3) * side_length**2) / 4\\nprint(area)\\n```', 'role': 'assistant'}], 'time': 14.153439283370972, 'trial': -1}\n", "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", + "\n", + "{\n", + " \"Problem Interpretation\": \"completely accurate\",\n", + " \"Mathematical Methodology\": \"completely effective\",\n", + " \"Calculation Correctness\": \"completely correct\",\n", + " \"Explanation Clarity\": \"completely clear\",\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", + "\n", + "Task: Math problem solving.\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"In the diagram, $AB,$ $BC,$ $CD,$ $DE,$ $EF,$ $FG,$ $GH,$ and $HK$ all have length $4,$ and all angles are right angles, with the exception of the angles at $D$ and $F.$\\n\\n[asy]\\ndraw((0,0)--(0,4)--(4,4)--(4,8)--(6.8284,5.1716)--(9.6569,8)--(9.6569,4)--(13.6569,4)--(13.6569,0)--cycle,black+linewidth(1));\\ndraw((0,0)--(0.5,0)--(0.5,0.5)--(0,0.5)--cycle,black+linewidth(1));\\ndraw((0,4)--(0.5,4)--(0.5,3.5)--(0,3.5)--cycle,black+linewidth(1));\\ndraw((4,4)--(4,4.5)--(3.5,4.5)--(3.5,4)--cycle,black+linewidth(1));\\ndraw((6.8284,5.1716)--(7.0784,5.4216)--(6.8284,5.6716)--(6.5784,5.4216)--cycle,black+linewidth(1));\\ndraw((9.6569,4)--(10.1569,4)--(10.1569,4.5)--(9.6569,4.5)--cycle,black+linewidth(1));\\ndraw((13.6569,4)--(13.1569,4)--(13.1569,3.5)--(13.6569,3.5)--cycle,black+linewidth(1));\\ndraw((13.6569,0)--(13.1569,0)--(13.1569,0.5)--(13.6569,0.5)--cycle,black+linewidth(1));\\nlabel(\\\"$A$\\\",(0,0),W);\\nlabel(\\\"$B$\\\",(0,4),NW);\\nlabel(\\\"$C$\\\",(4,4),S);\\nlabel(\\\"$D$\\\",(4,8),N);\\nlabel(\\\"$E$\\\",(6.8284,5.1716),S);\\nlabel(\\\"$F$\\\",(9.6569,8),N);\\nlabel(\\\"$G$\\\",(9.6569,4),S);\\nlabel(\\\"$H$\\\",(13.6569,4),NE);\\nlabel(\\\"$K$\\\",(13.6569,0),E);\\n[/asy]\\n\\nDetermine the length of $DF.$\\n\\n[asy]\\ndraw((0,0)--(2.8284,-2.8284)--(5.6568,0),black+linewidth(1));\\ndraw((0,0)--(5.6568,0),black+linewidth(1)+dashed);\\ndraw((2.8284,-2.8284)--(3.0784,-2.5784)--(2.8284,-2.3284)--(2.5784,-2.5784)--cycle,black+linewidth(1));\\nlabel(\\\"$D$\\\",(0,0),N);\\nlabel(\\\"$E$\\\",(2.8284,-2.8284),S);\\nlabel(\\\"$F$\\\",(5.6568,0),N);\\n[/asy]\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Since $DE=EF=4$ and $\\\\angle DEF = 90^\\\\circ,$ by the Pythagorean Theorem, \\\\begin{align*}\\nDF^2 &= DE^2+EF^2 \\\\\\\\\\n&= 4^2+4^2 \\\\\\\\\\n&=32,\\n\\\\end{align*}so that $DF = \\\\sqrt{32}=\\\\boxed{4\\\\sqrt{2}}.$\",\n", - " \"problem_id\": \"16\",\n", - " \"response_with_ans\": \"Now let's calculate the square of DF using Python.\\n\\n```python\\nDH = 9.6569\\nHG = 5.6569\\ncos_alpha_beta = 0\\n\\nDF_squared = DH**2 + HG**2 - 2 * DH * HG * cos_alpha_beta\\nDF_squared\\n```\",\n", - " \"round\": 2,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"In the diagram, $AB,$ $BC,$ $CD,$ $DE,$ $EF,$ $FG,$ $GH,$ and $HK$ all have length $4,$ and all angles are right angles, with the exception of the angles at $D$ and $F.$\\n\\n[asy]\\ndraw((0,0)--(0,4)--(4,4)--(4,8)--(6.8284,5.1716)--(9.6569,8)--(9.6569,4)--(13.6569,4)--(13.6569,0)--cycle,black+linewidth(1));\\ndraw((0,0)--(0.5,0)--(0.5,0.5)--(0,0.5)--cycle,black+linewidth(1));\\ndraw((0,4)--(0.5,4)--(0.5,3.5)--(0,3.5)--cycle,black+linewidth(1));\\ndraw((4,4)--(4,4.5)--(3.5,4.5)--(3.5,4)--cycle,black+linewidth(1));\\ndraw((6.8284,5.1716)--(7.0784,5.4216)--(6.8284,5.6716)--(6.5784,5.4216)--cycle,black+linewidth(1));\\ndraw((9.6569,4)--(10.1569,4)--(10.1569,4.5)--(9.6569,4.5)--cycle,black+linewidth(1));\\ndraw((13.6569,4)--(13.1569,4)--(13.1569,3.5)--(13.6569,3.5)--cycle,black+linewidth(1));\\ndraw((13.6569,0)--(13.1569,0)--(13.1569,0.5)--(13.6569,0.5)--cycle,black+linewidth(1));\\nlabel(\\\"$A$\\\",(0,0),W);\\nlabel(\\\"$B$\\\",(0,4),NW);\\nlabel(\\\"$C$\\\",(4,4),S);\\nlabel(\\\"$D$\\\",(4,8),N);\\nlabel(\\\"$E$\\\",(6.8284,5.1716),S);\\nlabel(\\\"$F$\\\",(9.6569,8),N);\\nlabel(\\\"$G$\\\",(9.6569,4),S);\\nlabel(\\\"$H$\\\",(13.6569,4),NE);\\nlabel(\\\"$K$\\\",(13.6569,0),E);\\n[/asy]\\n\\nDetermine the length of $DF.$\\n\\n[asy]\\ndraw((0,0)--(2.8284,-2.8284)--(5.6568,0),black+linewidth(1));\\ndraw((0,0)--(5.6568,0),black+linewidth(1)+dashed);\\ndraw((2.8284,-2.8284)--(3.0784,-2.5784)--(2.8284,-2.3284)--(2.5784,-2.5784)--cycle,black+linewidth(1));\\nlabel(\\\"$D$\\\",(0,0),N);\\nlabel(\\\"$E$\\\",(2.8284,-2.8284),S);\\nlabel(\\\"$F$\\\",(5.6568,0),N);\\n[/asy]\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Let's use coordinate geometry to determine the coordinates of points D, E, and F, which will allow us to compute the Euclidean distance between points D and F.\\n\\n1. Assign coordinates to points A, B, C, D, E, F, G, H, and K.\\n2. Calculate the slope and equation of line DE and line FG.\\n3. Use the slopes and the coordinates of points E and G to compute the coordinates of point F.\\n4. Calculate the length of DF using the distance formula.\\n\\nLet's start by assigning coordinates:\\n\\n1. A(0, 0), B(0, 4), C(4, 4), D(4, 8), G(9.6569, 4), H(13.6569, 4), and K(13.6569, 0).\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Since the angles at points D and F are not right angles, we need to use some trigonometry to find points E and F. We already have the coordinates of points D and G.\\n\\nLet \\u03b1 be the angle between the horizontal line and the line DE, and \\u03b2 be the angle between the horizontal line and the line FG.\\n\\nLet P be the intersection point between the horizontal lines passing through points D and F. Therefore, the triangle DPF is a right triangle. \\n\\nSince we know the angles \\u03b1, \\u03b2, and the length DG, we can determine the length DF using the law of cosines:\\n\\nLet's calculate the length of DG first:\\n\\nDG = DH + HG = (13.6569 - 4) + (9.6569 - 4) = 9.6569 + 5.6569 = 15.3138\\n\\nNow, let \\u03b1 be the angle between line DH and DF, and \\u03b2 be the angle between line HG and DF. We can use the law of cosines to calculate the length of DF:\\n\\nDF^2 = DH^2 + HG^2 - 2 * DH * HG * cos(\\u03b1 + \\u03b2)\\n\\nWe also know angle DAB = \\u03b1, angle GHB = \\u03b2 and AB = FG = 4.\\n\\nWe can calculate cos(\\u03b1 + \\u03b2) using the cosine formula for the addition of angles:\\n\\ncos(\\u03b1 + \\u03b2) = cos(\\u03b1) * cos(\\u03b2) - sin(\\u03b1) * sin(\\u03b2)\\n\\nTo find sin(\\u03b1) and cos(\\u03b1), we can use right triangle with sides DA, AB, and DB. Since DB = BA = 4, we have:\\n\\ncos(\\u03b1) = AD / DB = 4 / 4 = 1\\nsin(\\u03b1) = BA / DB = 4 / 4 = 1\\n\\nSimilarly, for angle \\u03b2, we have right triangle with sides GH, HB, and GB. Since GB = BH = 4, we have:\\n\\ncos(\\u03b2) = HG / GB = 4 / 4 = 1\\nsin(\\u03b2) = HB / GB = 4 / 4 = 1\\n\\nNow, let's calculate cos(\\u03b1 + \\u03b2):\\n\\ncos(\\u03b1 + \\u03b2) = cos(\\u03b1) * cos(\\u03b2) - sin(\\u03b1) * sin(\\u03b2) = 1 * 1 - 1 * 1 = 0\\n\\nFinally, let's calculate DF using the law of cosines:\\n\\nDF^2 = DH^2 + HG^2 - 2 * DH * HG * cos(\\u03b1 + \\u03b2)\\nDF^2 = 9.6569^2 + 5.6569^2 - 2 * 9.6569 * 5.6569 * 0\\n\\nNow let's compute the square of DF.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Now let's calculate the square of DF using Python.\\n\\n```python\\nDH = 9.6569\\nHG = 5.6569\\ncos_alpha_beta = 0\\n\\nDF_squared = DH**2 + HG**2 - 2 * DH * HG * cos_alpha_beta\\nDF_squared\\n```\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 44.82826566696167,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'Simplify: $\\\\frac{\\\\sqrt{2.5^2-0.7^2}}{2.7-2.5}$.', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'We have \\\\begin{align*}\\n\\\\frac{\\\\sqrt{2.5^2 - 0.7^2}}{2.7-2.5} &= \\\\frac{\\\\sqrt{6.25 - 0.49}}{2.7-2.5} = \\\\frac{\\\\sqrt{5.76}}{0.2} = \\\\frac{\\\\sqrt{576/100}}{0.2}\\\\\\\\\\n&= \\\\frac{\\\\sqrt{576}/\\\\sqrt{100}}{0.2} = \\\\frac{24/10}{0.2} = \\\\frac{2.4}{0.2} = \\\\boxed{12}.\\\\end{align*}', 'problem_id': '2', 'response_with_ans': '```python\\nimport math\\n\\nnumerator = math.sqrt(2.5 ** 2 - 0.7 ** 2)\\ndenominator = 2.7 - 2.5\\n\\nresult = numerator / denominator\\nresult\\n```\\n', 'round': 1, 'messages': [{'content': 'Simplify: $\\\\frac{\\\\sqrt{2.5^2-0.7^2}}{2.7-2.5}$.', 'role': 'user'}, {'content': \"To simplify the given expression, let's break it down step by step and use Python to calculate the values:\\n\\nStep 1: Calculate the values inside the square root: $2.5^2 - 0.7^2$\\nStep 2: Take the square root of the result\\nStep 3: Calculate the denominator: $2.7 - 2.5$\\nStep 4: Divide the result of Step 2 by the result of Step 3\\n\\nNow, let's execute Python code to get the result.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '```python\\nimport math\\n\\nnumerator = math.sqrt(2.5 ** 2 - 0.7 ** 2)\\ndenominator = 2.7 - 2.5\\n\\nresult = numerator / denominator\\nresult\\n```\\n', 'role': 'assistant'}], 'time': 8.612567901611328, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", "{\n", @@ -3296,150 +2487,130 @@ " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"mostly efficient\",\n", - " \"Code Correctness\": \"mostly correct\"\n", + " \"Code Efficiency\": \"moderately efficient\",\n", + " \"Code Correctness\": \"completely correct\"\n", "}\n", "\n", "--------------------------------------------------------------------------------\n", - "actual label for this case: true\n", "\u001b[33mquantifier_user\u001b[0m (to quantifier):\n", "\n", "Task: Math problem solving.\n", - "Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", - "Task successful example: {\n", - " \"problem\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Number Theory\",\n", - " \"solution\": \"Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$\",\n", - " \"problem_id\": \"0\",\n", - " \"response_with_ans\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"round\": 0,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"What is the sum of all the distinct positive two-digit factors of 144?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere's a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\\\"The sum of all the distinct positive two-digit factors of 144 is:\\\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 11.140539407730103,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Task failed example: {\n", - " \"problem\": \"Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Algebra\",\n", - " \"solution\": \"We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 24.91333508491516,\n", - " \"trial\": -1\n", - "}\n", - "\n", - "Evaluation dictionary: {\n", - " \"Problem Interpretation\": {\n", - " \"description\": \"Ability to correctly interpret the problem.\",\n", - " \"accepted_values\": [\"completely off\", \"slightly relevant\", \"relevant\", \"mostly accurate\", \"completely accurate\"]\n", - " },\n", - " \"Mathematical Methodology\": {\n", - " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", - " \"accepted_values\": [\"inappropriate\", \"barely adequate\", \"adequate\", \"mostly effective\", \"completely effective\"]\n", - " },\n", - " \"Calculation Correctness\": {\n", - " \"description\": \"Accuracy of calculations made and solutions given\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"neither\", \"mostly correct\", \"completely correct\"]\n", - " },\n", - " \"Explanation Clarity\": {\n", - " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", - " \"accepted_values\": [\"not at all clear\", \"slightly clear\", \"moderately clear\", \"very clear\", \"completely clear\"]\n", - " },\n", - " \"Code Efficiency\": {\n", - " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", - " \"accepted_values\": [\"not at all efficient\", \"slightly efficient\", \"moderately efficient\", \"very efficient\", \"extremely efficient\"]\n", - " },\n", - " \"Code Correctness\": {\n", - " \"description\": \"Correctness of the provided code\",\n", - " \"accepted_values\": [\"completely incorrect\", \"mostly incorrect\", \"partly correct\", \"mostly correct\", \"completely correct\"]\n", - " }\n", + " Task description: Given any question, the system needs to solve the problem as consisely and accurately as possible\n", + " Task successful example: {'problem': 'What is the sum of all the distinct positive two-digit factors of 144?', 'level': 'Level 5', 'type': 'Number Theory', 'solution': 'Prime factorize $144=2^4\\\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\\\cdot3^2+2^2\\\\cdot3+2^2\\\\cdot3^2+2^3\\\\cdot3+2^3\\\\cdot3^2+2^4\\\\cdot3=\\\\boxed{226}.$', 'problem_id': '0', 'response_with_ans': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'round': 0, 'messages': [{'content': 'What is the sum of all the distinct positive two-digit factors of 144?', 'role': 'user'}, {'content': 'To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\\n\\nHere\\'s a Python script to accomplish this:\\n\\n```python\\ntwo_digit_factors = []\\n\\nfor i in range(10, 100):\\n if 144 % i == 0:\\n two_digit_factors.append(i)\\n\\nsum_of_factors = sum(two_digit_factors)\\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\\n```\\n\\nPlease run this script to find the desired sum.', 'role': 'assistant'}], 'time': 11.140539407730103, 'trial': -1}\n", + " Task failed example: {'problem': 'Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'We have \\\\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\\\quad \\\\Rightarrow\\n\\\\\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (2x+10-(3x+24))(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (-x-14)(x+3)&<0 \\\\quad \\\\Rightarrow\\n\\\\\\\\ (x+14)(x+3)&>0.\\n\\\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14 0$ and then find intervals where the inequality holds true.\\n\\nLet's first find the factorized form of the quadratic:\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\\n\\n```python\\nfrom sympy import symbols, factor\\r\\nx = symbols('x')\\r\\nfactorized = factor(x**2 + 17*x + 42)\\r\\nfactorized\\r\\n```\\n\\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.\", 'role': 'assistant'}], 'time': 24.91333508491516, 'trial': -1}\n", + " Evaluation dictionary: [\n", + " {\n", + " \"name\": \"Problem Interpretation\",\n", + " \"description\": \"Ability to correctly interpret the problem.\",\n", + " \"accepted_values\": [\n", + " \"completely off\",\n", + " \"slightly relevant\",\n", + " \"relevant\",\n", + " \"mostly accurate\",\n", + " \"completely accurate\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Mathematical Methodology\",\n", + " \"description\": \"Adequacy of the chosen mathematical or algorithmic methodology for the question\",\n", + " \"accepted_values\": [\n", + " \"inappropriate\",\n", + " \"barely adequate\",\n", + " \"adequate\",\n", + " \"mostly effective\",\n", + " \"completely effective\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Calculation Correctness\",\n", + " \"description\": \"Accuracy of calculations made and solutions given\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"neither\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Explanation Clarity\",\n", + " \"description\": \"Clarity and comprehensibility of explanations, including language use and structure\",\n", + " \"accepted_values\": [\n", + " \"not at all clear\",\n", + " \"slightly clear\",\n", + " \"moderately clear\",\n", + " \"very clear\",\n", + " \"completely clear\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Efficiency\",\n", + " \"description\": \"Quality of code in terms of efficiency and elegance\",\n", + " \"accepted_values\": [\n", + " \"not at all efficient\",\n", + " \"slightly efficient\",\n", + " \"moderately efficient\",\n", + " \"very efficient\",\n", + " \"extremely efficient\"\n", + " ],\n", + " \"sub_criteria\": []\n", + " },\n", + " {\n", + " \"name\": \"Code Correctness\",\n", + " \"description\": \"Correctness of the provided code\",\n", + " \"accepted_values\": [\n", + " \"completely incorrect\",\n", + " \"mostly incorrect\",\n", + " \"partly correct\",\n", + " \"mostly correct\",\n", + " \"completely correct\"\n", + " ],\n", + " \"sub_criteria\": []\n", " }\n", - "actual test case to evaluate: {\n", - " \"problem\": \"How many integers $n$ satisfy both of the inequalities $4n + 3 < 25$ and $-7n + 5 < 24$?\",\n", - " \"level\": \"Level 5\",\n", - " \"type\": \"Prealgebra\",\n", - " \"solution\": \"Subtract 3 and divide by 4 on both sides of the first inequality to obtain \\\\begin{align*}\\n4n + 3 &< 25 \\\\\\\\\\n\\\\Rightarrow\\\\qquad 4n &< 22 \\\\\\\\\\n\\\\Rightarrow\\\\qquad n &< 5.5.\\n\\\\end{align*}Similarly, the second inequality yields \\\\begin{align*}\\n-7n + 5 &< 24 \\\\\\\\\\n\\\\Rightarrow\\\\qquad -7n &< 19 \\\\\\\\\\n\\\\Rightarrow\\\\qquad n &> -\\\\frac{19}{7}.\\n\\\\end{align*}Therefore, we are looking for all the integers between $-\\\\frac{19}{7}$ and $5.5$. Since $-\\\\frac{19}{7}$ is between $-3$ and $-2$ and the largest integer less than $5.5$ is 5, we need to count the number of integers between $-2$ and $5$, inclusive. There are $5$ positive integers, $2$ negative integers, and zero, so there are $\\\\boxed{8}$ integers that satisfy both $4n + 3 < 25$ and $-7n + 5 < 24$.\",\n", - " \"problem_id\": \"10\",\n", - " \"response_with_ans\": \"Here's the code to solve the inequalities and find the intersection of their solutions:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols(\\\"n\\\")\\ninequality_1 = 4 * n + 3 < 25\\ninequality_2 = -7 * n + 5 < 24\\n\\nsolution_1 = solve(inequality_1, n)\\nsolution_2 = solve(inequality_2, n)\\n\\nintersection = (max(solution_1[0], solution_2[0]), min(solution_1[1], solution_2[1]))\\n\\nprint(f\\\"Solution to inequality 1: {solution_1}\\\")\\nprint(f\\\"Solution to inequality 2: {solution_2}\\\")\\nprint(f\\\"Intersection of solutions: {intersection}\\\")\\n```\\n\\nExecute this code, and let's see the solutions for both inequalities and their intersection.\",\n", - " \"round\": 1,\n", - " \"messages\": [\n", - " {\n", - " \"content\": \"How many integers $n$ satisfy both of the inequalities $4n + 3 < 25$ and $-7n + 5 < 24$?\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"To find the number of integers $n$ that satisfy both inequalities, we'll first solve each inequality individually, and then find the intersection of the solutions.\\n\\nStep 1: Solve the inequalities\\n1. $4n + 3 < 25$\\n2. $-7n + 5 < 24$\\n\\nStep 2: Find the intersection of the solutions\\n\\nStep 3: Count the number of integers in the intersection\\n\\nFirst, let's solve the inequalities using the python code.\",\n", - " \"role\": \"assistant\"\n", - " },\n", - " {\n", - " \"content\": \"\",\n", - " \"role\": \"user\"\n", - " },\n", - " {\n", - " \"content\": \"Here's the code to solve the inequalities and find the intersection of their solutions:\\n\\n```python\\nfrom sympy import symbols, Eq, solve\\n\\nn = symbols(\\\"n\\\")\\ninequality_1 = 4 * n + 3 < 25\\ninequality_2 = -7 * n + 5 < 24\\n\\nsolution_1 = solve(inequality_1, n)\\nsolution_2 = solve(inequality_2, n)\\n\\nintersection = (max(solution_1[0], solution_2[0]), min(solution_1[1], solution_2[1]))\\n\\nprint(f\\\"Solution to inequality 1: {solution_1}\\\")\\nprint(f\\\"Solution to inequality 2: {solution_2}\\\")\\nprint(f\\\"Intersection of solutions: {intersection}\\\")\\n```\\n\\nExecute this code, and let's see the solutions for both inequalities and their intersection.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " ],\n", - " \"time\": 19.949471950531006,\n", - " \"trial\": -1\n", - "}\n", + "]actual test case to evaluate: {'problem': 'Simplify $\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}$.', 'level': 'Level 5', 'type': 'Prealgebra', 'solution': 'Both fractions have the same denominator, so we can subtract them: \\\\[\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}=\\\\frac{(2m+8)-(2-m)}{3}\\\\] Distributing the negative sign across the parentheses, we get \\\\[\\\\frac{2m+8-2-(-m)}{3}=\\\\frac{2m+8-2+m}{3}=\\\\frac{3m+6}{3}\\\\] Notice that every number in the numerator has a common factor of 3. We can use the distributive law in reverse to get \\\\[\\\\frac{3m+6}{3}=\\\\frac{3(m+2)}{3}=\\\\frac{\\\\cancel{3}(m+2)}{\\\\cancel{3}}=\\\\boxed{m+2}.\\\\]', 'problem_id': '19', 'response_with_ans': '', 'round': 15, 'messages': [{'content': 'Simplify $\\\\frac{2m+8}{3}-\\\\frac{2-m}{3}$.', 'role': 'user'}, {'content': 'To simplify the expression, we can combine the fractions as they have the same denominator:\\n\\n$\\\\frac{2m + 8}{3} - \\\\frac{2 - m}{3} = \\\\frac{(2m + 8) - (2 - m)}{3}$\\n\\nNow, I will calculate the numerator and finish simplifying the expression.', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '\\n', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': \"My apologies for the confusion in the previous responses. Let me simplify the expression step by step now. \\n\\nWe have the expression:\\n\\n$\\\\frac{2m + 8}{3} - \\\\frac{2 - m}{3}$\\n\\nWe can combine the fractions as they have the same denominator:\\n\\n$\\\\frac{(2m + 8) - (2 - m)}{3}$\\n\\nLet's further simplify the expression using Python code.\", 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}, {'content': '', 'role': 'user'}, {'content': '', 'role': 'assistant'}], 'time': 37.317880630493164, 'trial': -1}\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mquantifier\u001b[0m (to quantifier_user):\n", "\n", + "```json\n", "{\n", " \"Problem Interpretation\": \"completely accurate\",\n", " \"Mathematical Methodology\": \"completely effective\",\n", " \"Calculation Correctness\": \"completely correct\",\n", " \"Explanation Clarity\": \"very clear\",\n", - " \"Code Efficiency\": \"moderately efficient\",\n", - " \"Code Correctness\": \"mostly correct\"\n", + " \"Code Efficiency\": \"not applicable\",\n", + " \"Code Correctness\": \"not applicable\"\n", "}\n", + "```\n", "\n", "--------------------------------------------------------------------------------\n" ] } ], "source": [ - "# log_path = \"../test/test_files/agenteval-in-out/agentchat_results/\"\n", "criteria_file = \"../test/test_files/agenteval-in-out/samples/sample_math_criteria.json\"\n", + "criteria = Criterion.parse_json_str(open(criteria_file, \"r\").read())\n", "outcome = {}\n", "\n", "for prefix in os.listdir(log_path):\n", " for file_name in os.listdir(log_path + \"/\" + prefix):\n", " gameid = prefix + \"_\" + file_name\n", " if file_name.split(\".\")[-1] == \"json\":\n", - " outcome[gameid] = get_quantifier(log_path + \"/\" + prefix + \"/\" + file_name, criteria_file)\n", + " test_case, ground_truth = remove_ground_truth(open(log_path + \"/\" + prefix + \"/\" + file_name, \"r\").read())\n", + " quantifier_output = quantify_criteria(\n", + " llm_config={\"config_list\": config_list},\n", + " criteria=criteria,\n", + " task=task,\n", + " test_case=test_case,\n", + " ground_truth=ground_truth,\n", + " )\n", + " outcome[gameid] = quantifier_output\n", "\n", "# store the evaluated problems\n", "with open(\"../test/test_files/agenteval-in-out/evaluated_problems.json\", \"w\") as file:\n", @@ -3464,7 +2635,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3484,26 +2655,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/vscode/.local/lib/python3.10/site-packages/scipy/stats/_distn_infrastructure.py:2241: RuntimeWarning: invalid value encountered in multiply\n", + "/home/vscode/.local/lib/python3.10/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.\n", + " return _methods._mean(a, axis=axis, dtype=dtype,\n", + "/home/vscode/.local/lib/python3.10/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide\n", + " ret = ret.dtype.type(ret / rcount)\n", + "/home/vscode/.local/lib/python3.10/site-packages/scipy/stats/_distn_infrastructure.py:2244: RuntimeWarning: invalid value encountered in multiply\n", " lower_bound = _a * scale + loc\n", - "/home/vscode/.local/lib/python3.10/site-packages/scipy/stats/_distn_infrastructure.py:2242: RuntimeWarning: invalid value encountered in multiply\n", - " upper_bound = _b * scale + loc\n" + "/home/vscode/.local/lib/python3.10/site-packages/scipy/stats/_distn_infrastructure.py:2245: RuntimeWarning: invalid value encountered in multiply\n", + " upper_bound = _b * scale + loc\n", + "/home/vscode/.local/lib/python3.10/site-packages/numpy/core/_methods.py:206: RuntimeWarning: Degrees of freedom <= 0 for slice\n", + " ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,\n", + "/home/vscode/.local/lib/python3.10/site-packages/numpy/core/_methods.py:163: RuntimeWarning: invalid value encountered in divide\n", + " arrmean = um.true_divide(arrmean, div, out=arrmean,\n", + "/home/vscode/.local/lib/python3.10/site-packages/numpy/core/_methods.py:198: RuntimeWarning: invalid value encountered in scalar divide\n", + " ret = ret.dtype.type(ret / rcount)\n" ] } ], "source": [ "# computing average and 95% interval for failed and successful cases on all criteria\n", "try:\n", - " # convert the criteria to dict type if it is already not\n", - " dictionary_for_eval = eval(open(criteria_file, \"r\").read())\n", + " criteria = Criterion.parse_json_str(open(criteria_file, \"r\").read())\n", "except: # noqa: E722\n", " pass\n", "\n", - "criteria = list(dictionary_for_eval.keys())\n", + "\n", "nl2int = {}\n", - "for criterion in dictionary_for_eval:\n", + "for criterion in criteria:\n", " score = 0\n", - " for v in dictionary_for_eval[criterion][\"accepted_values\"]:\n", + " for v in criterion.accepted_values:\n", " nl2int[v] = score\n", " score += 1\n", "print(nl2int)\n", @@ -3521,17 +2701,17 @@ " try:\n", " tmp_dic = eval(outcome[game][\"estimated_performance\"])\n", " if outcome[game][\"actual_success\"] == \"false\":\n", - " task[\"f\"].append(nl2int[tmp_dic[criterion]])\n", + " task[\"f\"].append(nl2int[tmp_dic[criterion.name]])\n", " else:\n", - " task[\"s\"].append(nl2int[tmp_dic[criterion]])\n", + " task[\"s\"].append(nl2int[tmp_dic[criterion.name]])\n", " except: # noqa: E722\n", " pass\n", "\n", - " average_f[criterion] = np.mean(task[\"f\"])\n", - " average_s[criterion] = np.mean(task[\"s\"])\n", + " average_f[criterion.name] = np.mean(task[\"f\"])\n", + " average_s[criterion.name] = np.mean(task[\"s\"])\n", "\n", - " conf_interval_s[criterion] = stats.norm.interval(0.95, loc=np.mean(task[\"s\"]), scale=stats.sem(task[\"s\"]))\n", - " conf_interval_f[criterion] = stats.norm.interval(0.95, loc=np.mean(task[\"f\"]), scale=stats.sem(task[\"f\"]))" + " conf_interval_s[criterion.name] = stats.norm.interval(0.95, loc=np.mean(task[\"s\"]), scale=stats.sem(task[\"s\"]))\n", + " conf_interval_f[criterion.name] = stats.norm.interval(0.95, loc=np.mean(task[\"f\"]), scale=stats.sem(task[\"f\"]))" ] }, { @@ -3543,7 +2723,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -3553,9 +2733,17 @@ "outputId": "248cd0bc-0927-4d9f-b911-088bd76acf5d" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_394256/2108490914.py:34: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.\n", + " plt.tight_layout() # Adjust subplot parameters to fit the labels\n" + ] + }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -3596,7 +2784,7 @@ "plt.title(\n", " \"Average Values of 3 different baselines cases with 95% Confidence Intervals - math problems \", fontsize=12, pad=10\n", ") # Adjust titlepad to move the title further above\n", - "plt.xticks(index + bar_width / 2, criteria, rotation=45, fontsize=14)\n", + "plt.xticks(index + bar_width / 2, [crit.name for crit in criteria], rotation=45, fontsize=14)\n", "plt.legend(loc=\"upper center\", fontsize=14, bbox_to_anchor=(0.5, 1), ncol=3) # Adjust legend placement and ncol\n", "plt.tight_layout() # Adjust subplot parameters to fit the labels\n", "plt.ylim(0, 5)\n", diff --git a/samples/apps/autogen-studio/.gitignore b/samples/apps/autogen-studio/.gitignore index e94e41454a8..e1e3c9942ec 100644 --- a/samples/apps/autogen-studio/.gitignore +++ b/samples/apps/autogen-studio/.gitignore @@ -1,6 +1,7 @@ database.sqlite .cache/* autogenstudio/web/files/user/* +autogenstudio/test autogenstudio/web/files/ui/* OAI_CONFIG_LIST scratch/ diff --git a/samples/apps/autogen-studio/README.md b/samples/apps/autogen-studio/README.md index 49f7e3d657b..1e60b5362db 100644 --- a/samples/apps/autogen-studio/README.md +++ b/samples/apps/autogen-studio/README.md @@ -15,6 +15,8 @@ Code for AutoGen Studio is on GitHub at [microsoft/autogen](https://github.com/m > AutoGen Studio is currently under active development and we are iterating quickly. Kindly consider that we may introduce breaking changes in the releases during the upcoming weeks, and also the `README` might be outdated. We'll update the `README` as soon as we stabilize the API. > [!NOTE] Updates +> April 17: AutoGen Studio database layer is now rewritten to use [SQLModel](https://sqlmodel.tiangolo.com/) (Pydantic + SQLAlchemy). This provides entity linking (skills, models, agents and workflows are linked via association tables) and supports multiple [database backend dialects](https://docs.sqlalchemy.org/en/20/dialects/) supported in SQLAlchemy (SQLite, PostgreSQL, MySQL, Oracle, Microsoft SQL Server). The backend database can be specified a `--database-uri` argument when running the application. For example, `autogenstudio ui --database-uri sqlite:///database.sqlite` for SQLite and `autogenstudio ui --database-uri postgresql+psycopg://user:password@localhost/dbname` for PostgreSQL. + > March 12: Default directory for AutoGen Studio is now /home//.autogenstudio. You can also specify this directory using the `--appdir` argument when running the application. For example, `autogenstudio ui --appdir /path/to/folder`. This will store the database and other files in the specified directory e.g. `/path/to/folder/database.sqlite`. `.env` files in that directory will be used to set environment variables for the app. ### Capabilities / Roadmap @@ -84,7 +86,14 @@ autogenstudio ui --port 8081 ``` This will start the application on the specified port. Open your web browser and go to `http://localhost:8081/` to begin using AutoGen Studio. -AutoGen Studio also takes a `--host ` argument to specify the host address. By default, it is set to `localhost`. You can also use the `--appdir ` argument to specify the directory where the app files (e.g., database and generated user files) are stored. By default, it is set to the directory where autogen pip package is installed. + +AutoGen Studio also takes several parameters to customize the application: + +- `--host ` argument to specify the host address. By default, it is set to `localhost`. Y +- `--appdir ` argument to specify the directory where the app files (e.g., database and generated user files) are stored. By default, it is set to the a `.autogenstudio` directory in the user's home directory. +- `--port ` argument to specify the port number. By default, it is set to `8080`. +- `--reload` argument to enable auto-reloading of the server when changes are made to the code. By default, it is set to `False`. +- `--database-uri` argument to specify the database URI. Example values include `sqlite:///database.sqlite` for SQLite and `postgresql+psycopg://user:password@localhost/dbname` for PostgreSQL. If this is not specified, the database URIL defaults to a `database.sqlite` file in the `--appdir` directory. Now that you have AutoGen Studio installed and running, you are ready to explore its capabilities, including defining and modifying agent workflows, interacting with agents and sessions, and expanding agent skills. @@ -98,8 +107,6 @@ AutoGen Studio proposes some high-level concepts. **Skills**: Skills are functions (e.g., Python functions) that describe how to solve a task. In general, a good skill has a descriptive name (e.g. `generate_images`), extensive docstrings and good defaults (e.g., writing out files to disk for persistence and reuse). You can add new skills AutoGen Studio app via the provided UI. At inference time, these skills are made available to the assistant agent as they address your tasks. -AutoGen Studio comes with 3 example skills: `fetch_profile`, `find_papers`, `generate_images`. The default skills, agents and workflows are based on the [dbdefaults.json](autogentstudio/utils/dbdefaults.json) file which is used to initialize the database. - ## Example Usage Consider the following query. @@ -116,8 +123,6 @@ The agent workflow responds by _writing and executing code_ to create a python p > Note: You can also view the debug console that generates useful information to see how the agents are interacting in the background. - - ## Contribution Guide We welcome contributions to AutoGen Studio. We recommend the following general steps to contribute to the project: @@ -134,7 +139,7 @@ We welcome contributions to AutoGen Studio. We recommend the following general s **Q: How do I specify the directory where files(e.g. database) are stored?** -A: You can specify the directory where files are stored by setting the `--appdir` argument when running the application. For example, `autogenstudio ui --appdir /path/to/folder`. This will store the database and other files in the specified directory e.g. `/path/to/folder/database.sqlite`. +A: You can specify the directory where files are stored by setting the `--appdir` argument when running the application. For example, `autogenstudio ui --appdir /path/to/folder`. This will store the database (default) and other files in the specified directory e.g. `/path/to/folder/database.sqlite`. **Q: Where can I adjust the default skills, agent and workflow configurations?** A: You can modify agent configurations directly from the UI or by editing the [dbdefaults.json](autogenstudio/utils/dbdefaults.json) file which is used to initialize the database. @@ -146,7 +151,7 @@ A: To reset your conversation history, you can delete the `database.sqlite` file A: Yes, you can view the generated messages in the debug console of the web UI, providing insights into the agent interactions. Alternatively, you can inspect the `database.sqlite` file for a comprehensive record of messages. **Q: Can I use other models with AutoGen Studio?** -Yes. AutoGen standardizes on the openai model api format, and you can use any api server that offers an openai compliant endpoint. In the AutoGen Studio UI, each agent has an `llm_config` field where you can input your model endpoint details including `model`, `api key`, `base url`, `model type` and `api version`. For Azure OpenAI models, you can find these details in the Azure portal. Note that for Azure OpenAI, the `model` is the deployment name or deployment id, and the `type` is "azure". +Yes. AutoGen standardizes on the openai model api format, and you can use any api server that offers an openai compliant endpoint. In the AutoGen Studio UI, each agent has an `llm_config` field where you can input your model endpoint details including `model`, `api key`, `base url`, `model type` and `api version`. For Azure OpenAI models, you can find these details in the Azure portal. Note that for Azure OpenAI, the `model name` is the deployment id or engine, and the `model type` is "azure". For other OSS models, we recommend using a server such as vllm to instantiate an openai compliant endpoint. **Q: The server starts but I can't access the UI** diff --git a/samples/apps/autogen-studio/autogenstudio/chatmanager.py b/samples/apps/autogen-studio/autogenstudio/chatmanager.py index 674ae3506a2..84b85673f07 100644 --- a/samples/apps/autogen-studio/autogenstudio/chatmanager.py +++ b/samples/apps/autogen-studio/autogenstudio/chatmanager.py @@ -4,14 +4,18 @@ import time from datetime import datetime from queue import Queue -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import websockets from fastapi import WebSocket, WebSocketDisconnect -from .datamodel import AgentWorkFlowConfig, Message, SocketMessage -from .utils import extract_successful_code_blocks, get_modified_files, summarize_chat_history -from .workflowmanager import AutoGenWorkFlowManager +from .datamodel import Message, SocketMessage, Workflow +from .utils import ( + extract_successful_code_blocks, + get_modified_files, + summarize_chat_history, +) +from .workflowmanager import WorkflowManager class AutoGenChatManager: @@ -41,7 +45,7 @@ def chat( self, message: Message, history: List[Dict[str, Any]], - flow_config: Optional[AgentWorkFlowConfig] = None, + workflow: Any = None, connection_id: Optional[str] = None, user_dir: Optional[str] = None, **kwargs, @@ -59,78 +63,93 @@ def chat( """ # create a working director for workflow based on user_dir/session_id/time_hash - work_dir = os.path.join(user_dir, message.session_id, datetime.now().strftime("%Y%m%d_%H-%M-%S")) + work_dir = os.path.join( + user_dir, + str(message.session_id), + datetime.now().strftime("%Y%m%d_%H-%M-%S"), + ) os.makedirs(work_dir, exist_ok=True) # if no flow config is provided, use the default - if flow_config is None: - raise ValueError("flow_config must be specified") + if workflow is None: + raise ValueError("Workflow must be specified") - flow = AutoGenWorkFlowManager( - config=flow_config, + workflow_manager = WorkflowManager( + workflow=workflow, history=history, work_dir=work_dir, send_message_function=self.send, connection_id=connection_id, ) + workflow = Workflow.model_validate(workflow) + message_text = message.content.strip() start_time = time.time() - flow.run(message=f"{message_text}", clear_history=False) + workflow_manager.run(message=f"{message_text}", clear_history=False) end_time = time.time() metadata = { - "messages": flow.agent_history, - "summary_method": flow_config.summary_method, + "messages": workflow_manager.agent_history, + "summary_method": workflow.summary_method, "time": end_time - start_time, "files": get_modified_files(start_time, end_time, source_dir=work_dir), } - print("Modified files: ", len(metadata["files"])) - - output = self._generate_output(message_text, flow, flow_config) + output = self._generate_output(message_text, workflow_manager, workflow) output_message = Message( user_id=message.user_id, - root_msg_id=message.root_msg_id, role="assistant", content=output, - metadata=json.dumps(metadata), + meta=json.dumps(metadata), session_id=message.session_id, ) return output_message def _generate_output( - self, message_text: str, flow: AutoGenWorkFlowManager, flow_config: AgentWorkFlowConfig + self, + message_text: str, + workflow_manager: WorkflowManager, + workflow: Workflow, ) -> str: """ Generates the output response based on the workflow configuration and agent history. :param message_text: The text of the incoming message. - :param flow: An instance of `AutoGenWorkFlowManager`. + :param flow: An instance of `WorkflowManager`. :param flow_config: An instance of `AgentWorkFlowConfig`. :return: The output response as a string. """ output = "" - if flow_config.summary_method == "last": - successful_code_blocks = extract_successful_code_blocks(flow.agent_history) - last_message = flow.agent_history[-1]["message"]["content"] if flow.agent_history else "" + if workflow.summary_method == "last": + successful_code_blocks = extract_successful_code_blocks(workflow_manager.agent_history) + last_message = ( + workflow_manager.agent_history[-1]["message"]["content"] if workflow_manager.agent_history else "" + ) successful_code_blocks = "\n\n".join(successful_code_blocks) output = (last_message + "\n" + successful_code_blocks) if successful_code_blocks else last_message - elif flow_config.summary_method == "llm": - model = flow.config.receiver.config.llm_config.config_list[0] + elif workflow.summary_method == "llm": + client = workflow_manager.receiver.client status_message = SocketMessage( type="agent_status", - data={"status": "summarizing", "message": "Generating summary of agent dialogue"}, - connection_id=flow.connection_id, + data={ + "status": "summarizing", + "message": "Summarizing agent dialogue", + }, + connection_id=workflow_manager.connection_id, ) self.send(status_message.dict()) - output = summarize_chat_history(task=message_text, messages=flow.agent_history, model=model) + output = summarize_chat_history( + task=message_text, + messages=workflow_manager.agent_history, + client=client, + ) - elif flow_config.summary_method == "none": + elif workflow.summary_method == "none": output = "" return output @@ -141,7 +160,9 @@ class WebSocketConnectionManager: """ def __init__( - self, active_connections: List[Tuple[WebSocket, str]] = None, active_connections_lock: asyncio.Lock = None + self, + active_connections: List[Tuple[WebSocket, str]] = None, + active_connections_lock: asyncio.Lock = None, ) -> None: """ Initializes WebSocketConnectionManager with an optional list of active WebSocket connections. @@ -185,7 +206,7 @@ async def disconnect_all(self) -> None: for connection, _ in self.active_connections[:]: await self.disconnect(connection) - async def send_message(self, message: Dict, websocket: WebSocket) -> None: + async def send_message(self, message: Union[Dict, str], websocket: WebSocket) -> None: """ Sends a JSON message to a single WebSocket connection. @@ -202,7 +223,7 @@ async def send_message(self, message: Dict, websocket: WebSocket) -> None: print("Error: WebSocket connection closed normally") await self.disconnect(websocket) except Exception as e: - print(f"Error in sending message: {str(e)}") + print(f"Error in sending message: {str(e)}", message) await self.disconnect(websocket) async def broadcast(self, message: Dict) -> None: diff --git a/samples/apps/autogen-studio/autogenstudio/cli.py b/samples/apps/autogen-studio/autogenstudio/cli.py index aafb13317c8..42642bcd68a 100644 --- a/samples/apps/autogen-studio/autogenstudio/cli.py +++ b/samples/apps/autogen-studio/autogenstudio/cli.py @@ -1,10 +1,10 @@ import os +from typing import Optional import typer import uvicorn from typing_extensions import Annotated -from .utils.dbutils import DBManager from .version import VERSION app = typer.Typer() @@ -18,6 +18,7 @@ def ui( reload: Annotated[bool, typer.Option("--reload")] = False, docs: bool = False, appdir: str = None, + database_uri: Optional[str] = None, ): """ Run the AutoGen Studio UI. @@ -29,11 +30,14 @@ def ui( reload (bool, optional): Whether to reload the UI on code changes. Defaults to False. docs (bool, optional): Whether to generate API docs. Defaults to False. appdir (str, optional): Path to the AutoGen Studio app directory. Defaults to None. + database-uri (str, optional): Database URI to connect to. Defaults to None. Examples include sqlite:///autogenstudio.db, postgresql://user:password@localhost/autogenstudio. """ os.environ["AUTOGENSTUDIO_API_DOCS"] = str(docs) if appdir: os.environ["AUTOGENSTUDIO_APPDIR"] = appdir + if database_uri: + os.environ["AUTOGENSTUDIO_DATABASE_URI"] = database_uri uvicorn.run( "autogenstudio.web.app:app", diff --git a/samples/apps/autogen-studio/autogenstudio/database/__init__.py b/samples/apps/autogen-studio/autogenstudio/database/__init__.py new file mode 100644 index 00000000000..0518c24ba4f --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/__init__.py @@ -0,0 +1,3 @@ +# from .dbmanager import * +from .dbmanager import * +from .utils import * diff --git a/samples/apps/autogen-studio/autogenstudio/database/alembic.ini b/samples/apps/autogen-studio/autogenstudio/database/alembic.ini new file mode 100644 index 00000000000..cd413a26066 --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/alembic.ini @@ -0,0 +1,116 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = migrations + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to migrations/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:migrations/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/samples/apps/autogen-studio/autogenstudio/database/dbmanager.py b/samples/apps/autogen-studio/autogenstudio/database/dbmanager.py new file mode 100644 index 00000000000..00d3714b63f --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/dbmanager.py @@ -0,0 +1,472 @@ +from datetime import datetime +from typing import Optional + +from loguru import logger +from sqlalchemy import exc +from sqlmodel import Session, SQLModel, and_, create_engine, select + +from ..datamodel import ( + Agent, + AgentLink, + AgentModelLink, + AgentSkillLink, + Model, + Response, + Skill, + Workflow, + WorkflowAgentLink, +) +from .utils import init_db_samples + +valid_link_types = ["agent_model", "agent_skill", "agent_agent", "workflow_agent"] + + +class DBManager: + """A class to manage database operations""" + + def __init__(self, engine_uri: str): + connection_args = {"check_same_thread": True} if "sqlite" in engine_uri else {} + self.engine = create_engine(engine_uri, connect_args=connection_args) + # run_migration(engine_uri=engine_uri) + + def create_db_and_tables(self): + """Create a new database and tables""" + try: + SQLModel.metadata.create_all(self.engine) + try: + init_db_samples(self) + except Exception as e: + logger.info("Error while initializing database samples: " + str(e)) + except Exception as e: + logger.info("Error while creating database tables:" + str(e)) + + def upsert(self, model: SQLModel): + """Create a new entity""" + # check if the model exists, update else add + status = True + model_class = type(model) + existing_model = None + + with Session(self.engine) as session: + try: + existing_model = session.exec(select(model_class).where(model_class.id == model.id)).first() + if existing_model: + model.updated_at = datetime.now() + for key, value in model.model_dump().items(): + setattr(existing_model, key, value) + model = existing_model + session.add(model) + else: + session.add(model) + session.commit() + session.refresh(model) + except Exception as e: + session.rollback() + logger.error("Error while upserting %s", e) + status = False + + response = Response( + message=( + f"{model_class.__name__} Updated Successfully " + if existing_model + else f"{model_class.__name__} Created Successfully" + ), + status=status, + data=model.model_dump(), + ) + + return response + + def _model_to_dict(self, model_obj): + return {col.name: getattr(model_obj, col.name) for col in model_obj.__table__.columns} + + def get_items( + self, + model_class: SQLModel, + session: Session, + filters: dict = None, + return_json: bool = False, + order: str = "desc", + ): + """List all entities""" + result = [] + status = True + status_message = "" + + try: + if filters: + conditions = [getattr(model_class, col) == value for col, value in filters.items()] + statement = select(model_class).where(and_(*conditions)) + + if hasattr(model_class, "created_at") and order: + if order == "desc": + statement = statement.order_by(model_class.created_at.desc()) + else: + statement = statement.order_by(model_class.created_at.asc()) + else: + statement = select(model_class) + + if return_json: + result = [self._model_to_dict(row) for row in session.exec(statement).all()] + else: + result = session.exec(statement).all() + status_message = f"{model_class.__name__} Retrieved Successfully" + except Exception as e: + session.rollback() + status = False + status_message = f"Error while fetching {model_class.__name__}" + logger.error("Error while getting %s: %s", model_class.__name__, e) + + response: Response = Response( + message=status_message, + status=status, + data=result, + ) + return response + + def get( + self, + model_class: SQLModel, + filters: dict = None, + return_json: bool = False, + order: str = "desc", + ): + """List all entities""" + + with Session(self.engine) as session: + response = self.get_items(model_class, session, filters, return_json, order) + return response + + def delete(self, model_class: SQLModel, filters: dict = None): + """Delete an entity""" + row = None + status_message = "" + status = True + + with Session(self.engine) as session: + try: + if filters: + conditions = [getattr(model_class, col) == value for col, value in filters.items()] + row = session.exec(select(model_class).where(and_(*conditions))).all() + else: + row = session.exec(select(model_class)).all() + if row: + for row in row: + session.delete(row) + session.commit() + status_message = f"{model_class.__name__} Deleted Successfully" + else: + print(f"Row with filters {filters} not found") + logger.info("Row with filters %s not found", filters) + status_message = "Row not found" + except exc.IntegrityError as e: + session.rollback() + logger.error("Integrity ... Error while deleting: %s", e) + status_message = f"The {model_class.__name__} is linked to another entity and cannot be deleted." + status = False + except Exception as e: + session.rollback() + logger.error("Error while deleting: %s", e) + status_message = f"Error while deleting: {e}" + status = False + response = Response( + message=status_message, + status=status, + data=None, + ) + return response + + def get_linked_entities( + self, + link_type: str, + primary_id: int, + return_json: bool = False, + agent_type: Optional[str] = None, + ): + """ + Get all entities linked to the primary entity. + + Args: + link_type (str): The type of link to retrieve, e.g., "agent_model". + primary_id (int): The identifier for the primary model. + return_json (bool): Whether to return the result as a JSON object. + + Returns: + List[SQLModel]: A list of linked entities. + """ + + linked_entities = [] + + if link_type not in valid_link_types: + return [] + + status = True + status_message = "" + + with Session(self.engine) as session: + try: + if link_type == "agent_model": + # get the agent + agent = self.get_items(Agent, filters={"id": primary_id}, session=session).data[0] + linked_entities = agent.models + elif link_type == "agent_skill": + agent = self.get_items(Agent, filters={"id": primary_id}, session=session).data[0] + linked_entities = agent.skills + elif link_type == "agent_agent": + agent = self.get_items(Agent, filters={"id": primary_id}, session=session).data[0] + linked_entities = agent.agents + elif link_type == "workflow_agent": + linked_entities = session.exec( + select(Agent) + .join(WorkflowAgentLink) + .where( + WorkflowAgentLink.workflow_id == primary_id, + WorkflowAgentLink.agent_type == agent_type, + ) + ).all() + except Exception as e: + logger.error("Error while getting linked entities: %s", e) + status_message = f"Error while getting linked entities: {e}" + status = False + if return_json: + linked_entities = [self._model_to_dict(row) for row in linked_entities] + + response = Response( + message=status_message, + status=status, + data=linked_entities, + ) + + return response + + def link( + self, + link_type: str, + primary_id: int, + secondary_id: int, + agent_type: Optional[str] = None, + ) -> Response: + """ + Link two entities together. + + Args: + link_type (str): The type of link to create, e.g., "agent_model". + primary_id (int): The identifier for the primary model. + secondary_id (int): The identifier for the secondary model. + agent_type (Optional[str]): The type of agent, e.g., "sender" or receiver. + + Returns: + Response: The response of the linking operation, including success status and message. + """ + + # TBD verify that is creator of the primary entity being linked + status = True + status_message = "" + primary_model = None + secondary_model = None + + if link_type not in valid_link_types: + status = False + status_message = f"Invalid link type: {link_type}. Valid link types are: {valid_link_types}" + else: + with Session(self.engine) as session: + try: + if link_type == "agent_model": + primary_model = session.exec(select(Agent).where(Agent.id == primary_id)).first() + secondary_model = session.exec(select(Model).where(Model.id == secondary_id)).first() + if primary_model is None or secondary_model is None: + status = False + status_message = "One or both entity records do not exist." + else: + # check if the link already exists + existing_link = session.exec( + select(AgentModelLink).where( + AgentModelLink.agent_id == primary_id, + AgentModelLink.model_id == secondary_id, + ) + ).first() + if existing_link: # link already exists + return Response( + message=( + f"{secondary_model.__class__.__name__} already linked " + f"to {primary_model.__class__.__name__}" + ), + status=False, + ) + else: + primary_model.models.append(secondary_model) + elif link_type == "agent_agent": + primary_model = session.exec(select(Agent).where(Agent.id == primary_id)).first() + secondary_model = session.exec(select(Agent).where(Agent.id == secondary_id)).first() + if primary_model is None or secondary_model is None: + status = False + status_message = "One or both entity records do not exist." + else: + # check if the link already exists + existing_link = session.exec( + select(AgentLink).where( + AgentLink.parent_id == primary_id, + AgentLink.agent_id == secondary_id, + ) + ).first() + if existing_link: + return Response( + message=( + f"{secondary_model.__class__.__name__} already linked " + f"to {primary_model.__class__.__name__}" + ), + status=False, + ) + else: + primary_model.agents.append(secondary_model) + + elif link_type == "agent_skill": + primary_model = session.exec(select(Agent).where(Agent.id == primary_id)).first() + secondary_model = session.exec(select(Skill).where(Skill.id == secondary_id)).first() + if primary_model is None or secondary_model is None: + status = False + status_message = "One or both entity records do not exist." + else: + # check if the link already exists + existing_link = session.exec( + select(AgentSkillLink).where( + AgentSkillLink.agent_id == primary_id, + AgentSkillLink.skill_id == secondary_id, + ) + ).first() + if existing_link: + return Response( + message=( + f"{secondary_model.__class__.__name__} already linked " + f"to {primary_model.__class__.__name__}" + ), + status=False, + ) + else: + primary_model.skills.append(secondary_model) + elif link_type == "workflow_agent": + primary_model = session.exec(select(Workflow).where(Workflow.id == primary_id)).first() + secondary_model = session.exec(select(Agent).where(Agent.id == secondary_id)).first() + if primary_model is None or secondary_model is None: + status = False + status_message = "One or both entity records do not exist." + else: + # check if the link already exists + existing_link = session.exec( + select(WorkflowAgentLink).where( + WorkflowAgentLink.workflow_id == primary_id, + WorkflowAgentLink.agent_id == secondary_id, + WorkflowAgentLink.agent_type == agent_type, + ) + ).first() + if existing_link: + return Response( + message=( + f"{secondary_model.__class__.__name__} already linked " + f"to {primary_model.__class__.__name__}" + ), + status=False, + ) + else: + # primary_model.agents.append(secondary_model) + workflow_agent_link = WorkflowAgentLink( + workflow_id=primary_id, + agent_id=secondary_id, + agent_type=agent_type, + ) + session.add(workflow_agent_link) + # add and commit the link + session.add(primary_model) + session.commit() + status_message = ( + f"{secondary_model.__class__.__name__} successfully linked " + f"to {primary_model.__class__.__name__}" + ) + + except Exception as e: + session.rollback() + logger.error("Error while linking: %s", e) + status = False + status_message = f"Error while linking due to an exception: {e}" + + response = Response( + message=status_message, + status=status, + ) + + return response + + def unlink( + self, + link_type: str, + primary_id: int, + secondary_id: int, + agent_type: Optional[str] = None, + ) -> Response: + """ + Unlink two entities. + + Args: + link_type (str): The type of link to remove, e.g., "agent_model". + primary_id (int): The identifier for the primary model. + secondary_id (int): The identifier for the secondary model. + agent_type (Optional[str]): The type of agent, e.g., "sender" or receiver. + + Returns: + Response: The response of the unlinking operation, including success status and message. + """ + status = True + status_message = "" + + if link_type not in valid_link_types: + status = False + status_message = f"Invalid link type: {link_type}. Valid link types are: {valid_link_types}" + return Response(message=status_message, status=status) + + with Session(self.engine) as session: + try: + if link_type == "agent_model": + existing_link = session.exec( + select(AgentModelLink).where( + AgentModelLink.agent_id == primary_id, + AgentModelLink.model_id == secondary_id, + ) + ).first() + elif link_type == "agent_skill": + existing_link = session.exec( + select(AgentSkillLink).where( + AgentSkillLink.agent_id == primary_id, + AgentSkillLink.skill_id == secondary_id, + ) + ).first() + elif link_type == "agent_agent": + existing_link = session.exec( + select(AgentLink).where( + AgentLink.parent_id == primary_id, + AgentLink.agent_id == secondary_id, + ) + ).first() + elif link_type == "workflow_agent": + existing_link = session.exec( + select(WorkflowAgentLink).where( + WorkflowAgentLink.workflow_id == primary_id, + WorkflowAgentLink.agent_id == secondary_id, + WorkflowAgentLink.agent_type == agent_type, + ) + ).first() + + if existing_link: + session.delete(existing_link) + session.commit() + status_message = "Link removed successfully." + else: + status = False + status_message = "Link does not exist." + + except Exception as e: + session.rollback() + logger.error("Error while unlinking: %s", e) + status = False + status_message = f"Error while unlinking due to an exception: {e}" + + return Response(message=status_message, status=status) diff --git a/samples/apps/autogen-studio/autogenstudio/database/migrations/README b/samples/apps/autogen-studio/autogenstudio/database/migrations/README new file mode 100644 index 00000000000..2500aa1bcf7 --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/migrations/README @@ -0,0 +1 @@ +Generic single-database configuration. diff --git a/samples/apps/autogen-studio/autogenstudio/database/migrations/__init__.py b/samples/apps/autogen-studio/autogenstudio/database/migrations/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/samples/apps/autogen-studio/autogenstudio/database/migrations/env.py b/samples/apps/autogen-studio/autogenstudio/database/migrations/env.py new file mode 100644 index 00000000000..1431492ad91 --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/migrations/env.py @@ -0,0 +1,80 @@ +import os +from logging.config import fileConfig + +from alembic import context +from sqlalchemy import engine_from_config, pool +from sqlmodel import SQLModel + +from autogenstudio.datamodel import * +from autogenstudio.utils import get_db_uri + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config +config.set_main_option("sqlalchemy.url", get_db_uri()) + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = SQLModel.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure(connection=connection, target_metadata=target_metadata) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/samples/apps/autogen-studio/autogenstudio/database/migrations/script.py.mako b/samples/apps/autogen-studio/autogenstudio/database/migrations/script.py.mako new file mode 100644 index 00000000000..6ce3351093c --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/migrations/script.py.mako @@ -0,0 +1,27 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/samples/apps/autogen-studio/autogenstudio/database/utils.py b/samples/apps/autogen-studio/autogenstudio/database/utils.py new file mode 100644 index 00000000000..c14003b414c --- /dev/null +++ b/samples/apps/autogen-studio/autogenstudio/database/utils.py @@ -0,0 +1,323 @@ +# from .util import get_app_root +import os +import time +from datetime import datetime +from pathlib import Path +from typing import Any + +from alembic import command, util +from alembic.config import Config +from loguru import logger + +# from ..utils.db_utils import get_db_uri +from sqlmodel import Session, create_engine, text + +from autogen.agentchat import AssistantAgent + +from ..datamodel import ( + Agent, + AgentConfig, + AgentType, + CodeExecutionConfigTypes, + Model, + Skill, + Workflow, + WorkflowAgentLink, +) + + +def workflow_from_id(workflow_id: int, dbmanager: Any): + workflow = dbmanager.get(Workflow, filters={"id": workflow_id}).data + if not workflow or len(workflow) == 0: + raise ValueError("The specified workflow does not exist.") + workflow = workflow[0].model_dump(mode="json") + workflow_agent_links = dbmanager.get(WorkflowAgentLink, filters={"workflow_id": workflow_id}).data + + def dump_agent(agent: Agent): + exclude = [] + if agent.type != AgentType.groupchat: + exclude = [ + "admin_name", + "messages", + "max_round", + "admin_name", + "speaker_selection_method", + "allow_repeat_speaker", + ] + return agent.model_dump(warnings=False, mode="json", exclude=exclude) + + def get_agent(agent_id): + with Session(dbmanager.engine) as session: + agent: Agent = dbmanager.get_items(Agent, filters={"id": agent_id}, session=session).data[0] + agent_dict = dump_agent(agent) + agent_dict["skills"] = [Skill.model_validate(skill.model_dump(mode="json")) for skill in agent.skills] + model_exclude = [ + "id", + "agent_id", + "created_at", + "updated_at", + "user_id", + "description", + ] + models = [model.model_dump(mode="json", exclude=model_exclude) for model in agent.models] + agent_dict["models"] = [model.model_dump(mode="json") for model in agent.models] + + if len(models) > 0: + agent_dict["config"]["llm_config"] = agent_dict.get("config", {}).get("llm_config", {}) + llm_config = agent_dict["config"]["llm_config"] + if llm_config: + llm_config["config_list"] = models + agent_dict["config"]["llm_config"] = llm_config + agent_dict["agents"] = [get_agent(agent.id) for agent in agent.agents] + return agent_dict + + for link in workflow_agent_links: + agent_dict = get_agent(link.agent_id) + workflow[str(link.agent_type.value)] = agent_dict + return workflow + + +def run_migration(engine_uri: str): + database_dir = Path(__file__).parent + script_location = database_dir / "migrations" + + engine = create_engine(engine_uri) + buffer = open(script_location / "alembic.log", "w") + alembic_cfg = Config(stdout=buffer) + alembic_cfg.set_main_option("script_location", str(script_location)) + alembic_cfg.set_main_option("sqlalchemy.url", engine_uri) + + print(f"Running migrations with engine_uri: {engine_uri}") + + should_initialize_alembic = False + with Session(engine) as session: + try: + session.exec(text("SELECT * FROM alembic_version")) + except Exception: + logger.info("Alembic not initialized") + should_initialize_alembic = True + else: + logger.info("Alembic already initialized") + + if should_initialize_alembic: + try: + logger.info("Initializing alembic") + command.ensure_version(alembic_cfg) + command.upgrade(alembic_cfg, "head") + logger.info("Alembic initialized") + except Exception as exc: + logger.error(f"Error initializing alembic: {exc}") + raise RuntimeError("Error initializing alembic") from exc + + logger.info(f"Running DB migrations in {script_location}") + + try: + buffer.write(f"{datetime.now().isoformat()}: Checking migrations\n") + command.check(alembic_cfg) + except Exception as exc: + if isinstance(exc, (util.exc.CommandError, util.exc.AutogenerateDiffsDetected)): + try: + command.upgrade(alembic_cfg, "head") + time.sleep(3) + except Exception as exc: + logger.error(f"Error running migrations: {exc}") + + try: + buffer.write(f"{datetime.now().isoformat()}: Checking migrations\n") + command.check(alembic_cfg) + except util.exc.AutogenerateDiffsDetected as exc: + logger.info(f"AutogenerateDiffsDetected: {exc}") + # raise RuntimeError( + # f"There's a mismatch between the models and the database.\n{exc}") + except util.exc.CommandError as exc: + logger.error(f"CommandError: {exc}") + # raise RuntimeError(f"Error running migrations: {exc}") + + +def init_db_samples(dbmanager: Any): + workflows = dbmanager.get(Workflow).data + workflow_names = [w.name for w in workflows] + if "Default Workflow" in workflow_names and "Travel Planning Workflow" in workflow_names: + logger.info("Database already initialized with Default and Travel Planning Workflows") + return + logger.info("Initializing database with Default and Travel Planning Workflows") + # models + gpt_4_model = Model( + model="gpt-4-1106-preview", description="OpenAI GPT-4 model", user_id="guestuser@gmail.com", api_type="open_ai" + ) + azure_model = Model( + model="gpt4-turbo", + description="Azure OpenAI model", + user_id="guestuser@gmail.com", + api_type="azure", + base_url="https://api.your azureendpoint.com/v1", + ) + zephyr_model = Model( + model="zephyr", + description="Local Huggingface Zephyr model via vLLM, LMStudio or Ollama", + base_url="http://localhost:1234/v1", + user_id="guestuser@gmail.com", + api_type="open_ai", + ) + + google_gemini_model = Model( + model="gemini-1.5-pro-latest", + description="Google's Gemini model", + user_id="guestuser@gmail.com", + api_type="google", + ) + + # skills + + generate_image_skill = Skill( + name="generate_images", + description="Generate and save images based on a user's query.", + content='\nfrom typing import List\nimport uuid\nimport requests # to perform HTTP requests\nfrom pathlib import Path\n\nfrom openai import OpenAI\n\n\ndef generate_and_save_images(query: str, image_size: str = "1024x1024") -> List[str]:\n """\n Function to paint, draw or illustrate images based on the users query or request. Generates images from a given query using OpenAI\'s DALL-E model and saves them to disk. Use the code below anytime there is a request to create an image.\n\n :param query: A natural language description of the image to be generated.\n :param image_size: The size of the image to be generated. (default is "1024x1024")\n :return: A list of filenames for the saved images.\n """\n\n client = OpenAI() # Initialize the OpenAI client\n response = client.images.generate(model="dall-e-3", prompt=query, n=1, size=image_size) # Generate images\n\n # List to store the file names of saved images\n saved_files = []\n\n # Check if the response is successful\n if response.data:\n for image_data in response.data:\n # Generate a random UUID as the file name\n file_name = str(uuid.uuid4()) + ".png" # Assuming the image is a PNG\n file_path = Path(file_name)\n\n img_url = image_data.url\n img_response = requests.get(img_url)\n if img_response.status_code == 200:\n # Write the binary content to a file\n with open(file_path, "wb") as img_file:\n img_file.write(img_response.content)\n print(f"Image saved to {file_path}")\n saved_files.append(str(file_path))\n else:\n print(f"Failed to download the image from {img_url}")\n else:\n print("No image data found in the response!")\n\n # Return the list of saved files\n return saved_files\n\n\n# Example usage of the function:\n# generate_and_save_images("A cute baby sea otter")\n', + user_id="guestuser@gmail.com", + ) + + # agents + user_proxy_config = AgentConfig( + name="user_proxy", + description="User Proxy Agent Configuration", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message="You are a helpful assistant", + code_execution_config=CodeExecutionConfigTypes.local, + default_auto_reply="TERMINATE", + llm_config=False, + ) + user_proxy = Agent( + user_id="guestuser@gmail.com", type=AgentType.userproxy, config=user_proxy_config.model_dump(mode="json") + ) + + painter_assistant_config = AgentConfig( + name="default_assistant", + description="Assistant Agent", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message=AssistantAgent.DEFAULT_SYSTEM_MESSAGE, + code_execution_config=CodeExecutionConfigTypes.none, + llm_config={}, + ) + painter_assistant = Agent( + user_id="guestuser@gmail.com", type=AgentType.assistant, config=painter_assistant_config.model_dump(mode="json") + ) + + planner_assistant_config = AgentConfig( + name="planner_assistant", + description="Assistant Agent", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message="You are a helpful assistant that can suggest a travel plan for a user. You are the primary cordinator who will receive suggestions or advice from other agents (local_assistant, language_assistant). You must ensure that the finally plan integrates the suggestions from other agents or team members. YOUR FINAL RESPONSE MUST BE THE COMPLETE PLAN. When the plan is complete and all perspectives are integrated, you can respond with TERMINATE.", + code_execution_config=CodeExecutionConfigTypes.none, + llm_config={}, + ) + planner_assistant = Agent( + user_id="guestuser@gmail.com", type=AgentType.assistant, config=planner_assistant_config.model_dump(mode="json") + ) + + local_assistant_config = AgentConfig( + name="local_assistant", + description="Local Assistant Agent", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message="You are a local assistant that can suggest local activities or places to visit for a user. You can suggest local activities, places to visit, restaurants to eat at, etc. You can also provide information about the weather, local events, etc. You can provide information about the local area, but you cannot suggest a complete travel plan. You can only provide information about the local area.", + code_execution_config=CodeExecutionConfigTypes.none, + llm_config={}, + ) + local_assistant = Agent( + user_id="guestuser@gmail.com", type=AgentType.assistant, config=local_assistant_config.model_dump(mode="json") + ) + + language_assistant_config = AgentConfig( + name="language_assistant", + description="Language Assistant Agent", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message="You are a helpful assistant that can review travel plans, providing feedback on important/critical tips about how best to address language or communication challenges for the given destination. If the plan already includes language tips, you can mention that the plan is satisfactory, with rationale.", + code_execution_config=CodeExecutionConfigTypes.none, + llm_config={}, + ) + language_assistant = Agent( + user_id="guestuser@gmail.com", + type=AgentType.assistant, + config=language_assistant_config.model_dump(mode="json"), + ) + + # group chat + travel_groupchat_config = AgentConfig( + name="travel_groupchat", + admin_name="groupchat", + description="Group Chat Agent Configuration", + human_input_mode="NEVER", + max_consecutive_auto_reply=25, + system_message="You are a group chat manager", + code_execution_config=CodeExecutionConfigTypes.none, + default_auto_reply="TERMINATE", + llm_config={}, + speaker_selection_method="auto", + ) + travel_groupchat_agent = Agent( + user_id="guestuser@gmail.com", type=AgentType.groupchat, config=travel_groupchat_config.model_dump(mode="json") + ) + + # workflows + default_workflow = Workflow(name="Default Workflow", description="Default workflow", user_id="guestuser@gmail.com") + + travel_workflow = Workflow( + name="Travel Planning Workflow", description="Travel workflow", user_id="guestuser@gmail.com" + ) + + with Session(dbmanager.engine) as session: + session.add(zephyr_model) + session.add(google_gemini_model) + session.add(azure_model) + session.add(gpt_4_model) + session.add(generate_image_skill) + session.add(user_proxy) + session.add(painter_assistant) + session.add(travel_groupchat_agent) + session.add(planner_assistant) + session.add(local_assistant) + session.add(language_assistant) + + session.add(default_workflow) + session.add(travel_workflow) + session.commit() + + dbmanager.link(link_type="agent_model", primary_id=painter_assistant.id, secondary_id=gpt_4_model.id) + dbmanager.link(link_type="agent_skill", primary_id=painter_assistant.id, secondary_id=generate_image_skill.id) + dbmanager.link( + link_type="workflow_agent", primary_id=default_workflow.id, secondary_id=user_proxy.id, agent_type="sender" + ) + dbmanager.link( + link_type="workflow_agent", + primary_id=default_workflow.id, + secondary_id=painter_assistant.id, + agent_type="receiver", + ) + + # link agents to travel groupchat agent + + dbmanager.link(link_type="agent_agent", primary_id=travel_groupchat_agent.id, secondary_id=planner_assistant.id) + dbmanager.link(link_type="agent_agent", primary_id=travel_groupchat_agent.id, secondary_id=local_assistant.id) + dbmanager.link( + link_type="agent_agent", primary_id=travel_groupchat_agent.id, secondary_id=language_assistant.id + ) + dbmanager.link(link_type="agent_agent", primary_id=travel_groupchat_agent.id, secondary_id=user_proxy.id) + dbmanager.link(link_type="agent_model", primary_id=travel_groupchat_agent.id, secondary_id=gpt_4_model.id) + dbmanager.link(link_type="agent_model", primary_id=planner_assistant.id, secondary_id=gpt_4_model.id) + dbmanager.link(link_type="agent_model", primary_id=local_assistant.id, secondary_id=gpt_4_model.id) + dbmanager.link(link_type="agent_model", primary_id=language_assistant.id, secondary_id=gpt_4_model.id) + + dbmanager.link( + link_type="workflow_agent", primary_id=travel_workflow.id, secondary_id=user_proxy.id, agent_type="sender" + ) + dbmanager.link( + link_type="workflow_agent", + primary_id=travel_workflow.id, + secondary_id=travel_groupchat_agent.id, + agent_type="receiver", + ) + logger.info("Successfully initialized database with Default and Travel Planning Workflows") diff --git a/samples/apps/autogen-studio/autogenstudio/datamodel.py b/samples/apps/autogen-studio/autogenstudio/datamodel.py index 083bddccfcf..3dbd46c357e 100644 --- a/samples/apps/autogen-studio/autogenstudio/datamodel.py +++ b/samples/apps/autogen-studio/autogenstudio/datamodel.py @@ -1,318 +1,262 @@ -import uuid -from dataclasses import asdict, field from datetime import datetime +from enum import Enum from typing import Any, Callable, Dict, List, Literal, Optional, Union -from pydantic.dataclasses import dataclass - - -@dataclass -class Message(object): - user_id: str +from sqlalchemy import ForeignKey, Integer, orm +from sqlmodel import ( + JSON, + Column, + DateTime, + Field, + Relationship, + SQLModel, + func, +) +from sqlmodel import ( + Enum as SqlEnum, +) + +SQLModel.model_config["protected_namespaces"] = () +# pylint: disable=protected-access + + +class Message(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable + user_id: Optional[str] = None role: str content: str - root_msg_id: Optional[str] = None - msg_id: Optional[str] = None - timestamp: Optional[str] = None - personalize: Optional[bool] = False - ra: Optional[str] = None - code: Optional[str] = None - metadata: Optional[Any] = None - session_id: Optional[str] = None - - def __post_init__(self): - if self.msg_id is None: - self.msg_id = str(uuid.uuid4()) - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - - def dict(self): - result = asdict(self) - return result - - -@dataclass -class Skill(object): - title: str - content: str - file_name: Optional[str] = None - id: Optional[str] = None - description: Optional[str] = None - timestamp: Optional[str] = None + session_id: Optional[int] = Field( + default=None, sa_column=Column(Integer, ForeignKey("session.id", ondelete="CASCADE")) + ) + connection_id: Optional[str] = None + meta: Optional[Dict] = Field(default={}, sa_column=Column(JSON)) + + +class Session(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable user_id: Optional[str] = None + workflow_id: Optional[int] = Field(default=None, foreign_key="workflow.id") + name: Optional[str] = None + description: Optional[str] = None - def __post_init__(self): - if self.id is None: - self.id = str(uuid.uuid4()) - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.user_id is None: - self.user_id = "default" - - def dict(self): - result = asdict(self) - return result +class AgentSkillLink(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + agent_id: int = Field(default=None, primary_key=True, foreign_key="agent.id") + skill_id: int = Field(default=None, primary_key=True, foreign_key="skill.id") -# web api data models +class AgentModelLink(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + agent_id: int = Field(default=None, primary_key=True, foreign_key="agent.id") + model_id: int = Field(default=None, primary_key=True, foreign_key="model.id") -# autogenflow data models -@dataclass -class Model: - """Data model for Model Config item in LLMConfig for AutoGen""" - model: str - api_key: Optional[str] = None - base_url: Optional[str] = None - api_type: Optional[str] = None - api_version: Optional[str] = None - id: Optional[str] = None - timestamp: Optional[str] = None +class Skill(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable user_id: Optional[str] = None + name: str + content: str description: Optional[str] = None + secrets: Optional[Dict] = Field(default={}, sa_column=Column(JSON)) + libraries: Optional[Dict] = Field(default={}, sa_column=Column(JSON)) + agents: List["Agent"] = Relationship(back_populates="skills", link_model=AgentSkillLink) - def dict(self): - result = asdict(self) - return result - def __post_init__(self): - if self.id is None: - self.id = str(uuid.uuid4()) - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.user_id is None: - self.user_id = "default" - - -@dataclass -class LLMConfig: +class LLMConfig(SQLModel, table=False): """Data model for LLM Config for AutoGen""" - config_list: List[Any] = field(default_factory=list) + config_list: List[Any] = Field(default_factory=list) temperature: float = 0 cache_seed: Optional[Union[int, None]] = None timeout: Optional[int] = None - max_tokens: Optional[int] = None + max_tokens: Optional[int] = 1000 extra_body: Optional[dict] = None - def dict(self): - result = asdict(self) - result["config_list"] = [c.dict() for c in self.config_list] - return result +class ModelTypes(str, Enum): + openai = "open_ai" + google = "google" + azure = "azure" -@dataclass -class AgentConfig: - """Data model for Agent Config for AutoGen""" - name: str - llm_config: Optional[Union[LLMConfig, bool]] = False +class Model(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable + user_id: Optional[str] = None + model: str + api_key: Optional[str] = None + base_url: Optional[str] = None + api_type: ModelTypes = Field(default=ModelTypes.openai, sa_column=Column(SqlEnum(ModelTypes))) + api_version: Optional[str] = None + description: Optional[str] = None + agents: List["Agent"] = Relationship(back_populates="models", link_model=AgentModelLink) + + +class CodeExecutionConfigTypes(str, Enum): + local = "local" + docker = "docker" + none = "none" + + +class AgentConfig(SQLModel, table=False): + name: Optional[str] = None human_input_mode: str = "NEVER" max_consecutive_auto_reply: int = 10 system_message: Optional[str] = None is_termination_msg: Optional[Union[bool, str, Callable]] = None - code_execution_config: Optional[Union[bool, str, Dict[str, Any]]] = None + code_execution_config: CodeExecutionConfigTypes = Field( + default=CodeExecutionConfigTypes.local, sa_column=Column(SqlEnum(CodeExecutionConfigTypes)) + ) default_auto_reply: Optional[str] = "" description: Optional[str] = None + llm_config: Optional[Union[LLMConfig, bool]] = Field(default=False, sa_column=Column(JSON)) - def dict(self): - result = asdict(self) - if isinstance(result["llm_config"], LLMConfig): - result["llm_config"] = result["llm_config"].dict() - return result - - -@dataclass -class AgentFlowSpec: - """Data model to help flow load agents from config""" - - type: Literal["assistant", "userproxy"] - config: AgentConfig - id: Optional[str] = None - timestamp: Optional[str] = None - user_id: Optional[str] = None - skills: Optional[Union[None, List[Skill]]] = None - - def __post_init__(self): - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.id is None: - self.id = str(uuid.uuid4()) - if self.user_id is None: - self.user_id = "default" - - def dict(self): - result = asdict(self) - return result - - -@dataclass -class GroupChatConfig: - """Data model for GroupChat Config for AutoGen""" - - agents: List[AgentFlowSpec] = field(default_factory=list) - admin_name: str = "Admin" - messages: List[Dict] = field(default_factory=list) - max_round: Optional[int] = 10 admin_name: Optional[str] = "Admin" + messages: Optional[List[Dict]] = Field(default_factory=list) + max_round: Optional[int] = 100 speaker_selection_method: Optional[str] = "auto" - # TODO: match the new group chat default and support transition spec - allow_repeat_speaker: Optional[Union[bool, List[AgentConfig]]] = True + allow_repeat_speaker: Optional[Union[bool, List["AgentConfig"]]] = True - def dict(self): - result = asdict(self) - result["agents"] = [a.dict() for a in self.agents] - return result +class AgentType(str, Enum): + assistant = "assistant" + userproxy = "userproxy" + groupchat = "groupchat" -@dataclass -class GroupChatFlowSpec: - """Data model to help flow load agents from config""" - type: Literal["groupchat"] - config: AgentConfig = field(default_factory=AgentConfig) - groupchat_config: Optional[GroupChatConfig] = field(default_factory=GroupChatConfig) - id: Optional[str] = None - timestamp: Optional[str] = None - user_id: Optional[str] = None - skills: Optional[Union[None, List[Skill]]] = None +class WorkflowAgentType(str, Enum): + sender = "sender" + receiver = "receiver" + planner = "planner" - def __post_init__(self): - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.id is None: - self.id = str(uuid.uuid4()) - if self.user_id is None: - self.user_id = "default" - def dict(self): - result = asdict(self) - # result["config"] = self.config.dict() - # result["groupchat_config"] = self.groupchat_config.dict() - return result +class WorkflowAgentLink(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + workflow_id: int = Field(default=None, primary_key=True, foreign_key="workflow.id") + agent_id: int = Field(default=None, primary_key=True, foreign_key="agent.id") + agent_type: WorkflowAgentType = Field( + default=WorkflowAgentType.sender, + sa_column=Column(SqlEnum(WorkflowAgentType), primary_key=True), + ) -@dataclass -class AgentWorkFlowConfig: - """Data model for Flow Config for AutoGen""" +class AgentLink(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + parent_id: Optional[int] = Field(default=None, foreign_key="agent.id", primary_key=True) + agent_id: Optional[int] = Field(default=None, foreign_key="agent.id", primary_key=True) + +class Agent(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable + user_id: Optional[str] = None + type: AgentType = Field(default=AgentType.assistant, sa_column=Column(SqlEnum(AgentType))) + config: AgentConfig = Field(default_factory=AgentConfig, sa_column=Column(JSON)) + skills: List[Skill] = Relationship(back_populates="agents", link_model=AgentSkillLink) + models: List[Model] = Relationship(back_populates="agents", link_model=AgentModelLink) + workflows: List["Workflow"] = Relationship(link_model=WorkflowAgentLink, back_populates="agents") + parents: List["Agent"] = Relationship( + back_populates="agents", + link_model=AgentLink, + sa_relationship_kwargs=dict( + primaryjoin="Agent.id==AgentLink.agent_id", + secondaryjoin="Agent.id==AgentLink.parent_id", + ), + ) + agents: List["Agent"] = Relationship( + back_populates="parents", + link_model=AgentLink, + sa_relationship_kwargs=dict( + primaryjoin="Agent.id==AgentLink.parent_id", + secondaryjoin="Agent.id==AgentLink.agent_id", + ), + ) + + +class WorkFlowType(str, Enum): + twoagents = "twoagents" + groupchat = "groupchat" + + +class WorkFlowSummaryMethod(str, Enum): + last = "last" + none = "none" + llm = "llm" + + +class Workflow(SQLModel, table=True): + __table_args__ = {"sqlite_autoincrement": True} + id: Optional[int] = Field(default=None, primary_key=True) + created_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), server_default=func.now()), + ) # pylint: disable=not-callable + updated_at: datetime = Field( + default_factory=datetime.now, + sa_column=Column(DateTime(timezone=True), onupdate=func.now()), + ) # pylint: disable=not-callable + user_id: Optional[str] = None name: str description: str - sender: AgentFlowSpec - receiver: Union[AgentFlowSpec, GroupChatFlowSpec] - type: Literal["twoagents", "groupchat"] = "twoagents" - id: Optional[str] = None - user_id: Optional[str] = None - timestamp: Optional[str] = None - # how the agent message summary is generated. last: only last message is used, none: no summary, llm: use llm to generate summary - summary_method: Optional[Literal["last", "none", "llm"]] = "last" - - def init_spec(self, spec: Dict): - """initialize the agent spec""" - if not isinstance(spec, dict): - spec = spec.dict() - if spec["type"] == "groupchat": - return GroupChatFlowSpec(**spec) - else: - return AgentFlowSpec(**spec) - - def __post_init__(self): - if self.id is None: - self.id = str(uuid.uuid4()) - self.sender = self.init_spec(self.sender) - self.receiver = self.init_spec(self.receiver) - if self.user_id is None: - self.user_id = "default" - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - - def dict(self): - result = asdict(self) - result["sender"] = self.sender.dict() - result["receiver"] = self.receiver.dict() - return result - - -@dataclass -class Session(object): - """Data model for AutoGen Chat Session""" - - user_id: str - id: Optional[str] = None - timestamp: Optional[str] = None - flow_config: AgentWorkFlowConfig = None - name: Optional[str] = None - description: Optional[str] = None + agents: List[Agent] = Relationship(back_populates="workflows", link_model=WorkflowAgentLink) + type: WorkFlowType = Field(default=WorkFlowType.twoagents, sa_column=Column(SqlEnum(WorkFlowType))) + summary_method: Optional[WorkFlowSummaryMethod] = Field( + default=WorkFlowSummaryMethod.last, + sa_column=Column(SqlEnum(WorkFlowSummaryMethod)), + ) - def __post_init__(self): - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.id is None: - self.id = str(uuid.uuid4()) - - def dict(self): - result = asdict(self) - result["flow_config"] = self.flow_config.dict() - return result - - -@dataclass -class Gallery(object): - """Data model for Gallery Item""" - - session: Session - messages: List[Message] - tags: List[str] - id: Optional[str] = None - timestamp: Optional[str] = None - - def __post_init__(self): - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - if self.id is None: - self.id = str(uuid.uuid4()) - - def dict(self): - result = asdict(self) - return result - - -@dataclass -class ChatWebRequestModel(object): - """Data model for Chat Web Request for Web End""" - - message: Message - flow_config: AgentWorkFlowConfig - - -@dataclass -class DeleteMessageWebRequestModel(object): - user_id: str - msg_id: str - session_id: Optional[str] = None - - -@dataclass -class DBWebRequestModel(object): - user_id: str - msg_id: Optional[str] = None - session: Optional[Session] = None - skill: Optional[Skill] = None - tags: Optional[List[str]] = None - agent: Optional[AgentFlowSpec] = None - workflow: Optional[AgentWorkFlowConfig] = None - model: Optional[Model] = None - message: Optional[Message] = None - connection_id: Optional[str] = None + +class Response(SQLModel): + message: str + status: bool + data: Optional[Any] = None -@dataclass -class SocketMessage(object): +class SocketMessage(SQLModel, table=False): connection_id: str data: Dict[str, Any] type: str - - def dict(self): - result = asdict(self) - return result diff --git a/samples/apps/autogen-studio/autogenstudio/utils/__init__.py b/samples/apps/autogen-studio/autogenstudio/utils/__init__.py index f37b0b0486a..16281fe0b66 100644 --- a/samples/apps/autogen-studio/autogenstudio/utils/__init__.py +++ b/samples/apps/autogen-studio/autogenstudio/utils/__init__.py @@ -1,2 +1 @@ -from .dbutils import * from .utils import * diff --git a/samples/apps/autogen-studio/autogenstudio/utils/dbutils.py b/samples/apps/autogen-studio/autogenstudio/utils/dbutils.py deleted file mode 100644 index dca0fc6b0a6..00000000000 --- a/samples/apps/autogen-studio/autogenstudio/utils/dbutils.py +++ /dev/null @@ -1,860 +0,0 @@ -import json -import logging -import os -import sqlite3 -import threading -from typing import Any, Dict, List, Optional, Tuple - -from ..datamodel import AgentFlowSpec, AgentWorkFlowConfig, Gallery, Message, Model, Session, Skill -from ..version import __version__ as __db_version__ - -VERSION_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS version ( - - version TEXT NOT NULL, - UNIQUE (version) - ) - """ - -MODELS_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS models ( - id TEXT NOT NULL, - user_id TEXT NOT NULL, - timestamp DATETIME NOT NULL, - model TEXT, - api_key TEXT, - base_url TEXT, - api_type TEXT, - api_version TEXT, - description TEXT, - UNIQUE (id, user_id) - ) - """ - - -MESSAGES_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS messages ( - user_id TEXT NOT NULL, - session_id TEXT, - root_msg_id TEXT NOT NULL, - msg_id TEXT, - role TEXT NOT NULL, - content TEXT NOT NULL, - metadata TEXT, - timestamp DATETIME, - UNIQUE (user_id, root_msg_id, msg_id) - ) - """ - -SESSIONS_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS sessions ( - id TEXT NOT NULL, - user_id TEXT NOT NULL, - timestamp DATETIME NOT NULL, - name TEXT, - flow_config TEXT, - UNIQUE (user_id, id) - ) - """ - -SKILLS_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS skills ( - id TEXT NOT NULL, - user_id TEXT NOT NULL, - timestamp DATETIME NOT NULL, - content TEXT, - title TEXT, - file_name TEXT, - UNIQUE (id, user_id) - ) - """ -AGENTS_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS agents ( - - id TEXT NOT NULL, - user_id TEXT NOT NULL, - timestamp DATETIME NOT NULL, - config TEXT, - type TEXT, - skills TEXT, - UNIQUE (id, user_id) - ) - """ - -WORKFLOWS_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS workflows ( - id TEXT NOT NULL, - user_id TEXT NOT NULL, - timestamp DATETIME NOT NULL, - sender TEXT, - receiver TEXT, - type TEXT, - name TEXT, - description TEXT, - summary_method TEXT, - UNIQUE (id, user_id) - ) - """ - -GALLERY_TABLE_SQL = """ - CREATE TABLE IF NOT EXISTS gallery ( - id TEXT NOT NULL, - session TEXT, - messages TEXT, - tags TEXT, - timestamp DATETIME NOT NULL, - UNIQUE ( id) - ) - """ - - -lock = threading.Lock() -logger = logging.getLogger() - - -class DBManager: - """ - A database manager class that handles the creation and interaction with an SQLite database. - """ - - def __init__(self, path: str = "database.sqlite", **kwargs: Any) -> None: - """ - Initializes the DBManager object, creates a database if it does not exist, and establishes a connection. - - Args: - path (str): The file path to the SQLite database file. - **kwargs: Additional keyword arguments to pass to the sqlite3.connect method. - """ - - self.path = path - # check if the database exists, if not create it - # self.reset_db() - if not os.path.exists(self.path): - logger.info("Creating database") - self.init_db(path=self.path, **kwargs) - - try: - self.conn = sqlite3.connect(self.path, check_same_thread=False, **kwargs) - self.cursor = self.conn.cursor() - self.migrate() - except Exception as e: - logger.error("Error connecting to database: %s", e) - raise e - - def migrate(self): - """ - Run migrations to update the database schema. - """ - self.add_column_if_not_exists("sessions", "name", "TEXT") - self.add_column_if_not_exists("models", "description", "TEXT") - - def add_column_if_not_exists(self, table: str, column: str, column_type: str): - """ - Adds a new column to the specified table if it does not exist. - - Args: - table (str): The table name where the column should be added. - column (str): The column name that should be added. - column_type (str): The data type of the new column. - """ - try: - self.cursor.execute(f"PRAGMA table_info({table})") - column_names = [row[1] for row in self.cursor.fetchall()] - if column not in column_names: - self.cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} {column_type}") - self.conn.commit() - logger.info(f"Migration: New '{column}' column has been added to the '{table}' table.") - else: - logger.info(f"'{column}' column already exists in the '{table}' table.") - - except Exception as e: - print(f"Error while checking and updating '{table}' table: {e}") - - def reset_db(self): - """ - Reset the database by deleting the database file and creating a new one. - """ - print("resetting db") - if os.path.exists(self.path): - os.remove(self.path) - self.init_db(path=self.path) - - def init_db(self, path: str = "database.sqlite", **kwargs: Any) -> None: - """ - Initializes the database by creating necessary tables. - - Args: - path (str): The file path to the SQLite database file. - **kwargs: Additional keyword arguments to pass to the sqlite3.connect method. - """ - # Connect to the database (or create a new one if it doesn't exist) - self.conn = sqlite3.connect(path, check_same_thread=False, **kwargs) - self.cursor = self.conn.cursor() - - # Create the version table - self.cursor.execute(VERSION_TABLE_SQL) - self.cursor.execute("INSERT INTO version (version) VALUES (?)", (__db_version__,)) - - # Create the models table - self.cursor.execute(MODELS_TABLE_SQL) - - # Create the messages table - self.cursor.execute(MESSAGES_TABLE_SQL) - - # Create a sessions table - self.cursor.execute(SESSIONS_TABLE_SQL) - - # Create a skills - self.cursor.execute(SKILLS_TABLE_SQL) - - # Create a gallery table - self.cursor.execute(GALLERY_TABLE_SQL) - - # Create a agents table - self.cursor.execute(AGENTS_TABLE_SQL) - - # Create a workflows table - self.cursor.execute(WORKFLOWS_TABLE_SQL) - - # init skills table with content of defaultskills.json in current directory - current_dir = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(current_dir, "dbdefaults.json"), "r", encoding="utf-8") as json_file: - data = json.load(json_file) - skills = data["skills"] - agents = data["agents"] - models = data["models"] - for model in models: - model = Model(**model) - self.cursor.execute( - "INSERT INTO models (id, user_id, timestamp, model, api_key, base_url, api_type, api_version, description) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", - ( - model.id, - "default", - model.timestamp, - model.model, - model.api_key, - model.base_url, - model.api_type, - model.api_version, - model.description, - ), - ) - - for skill in skills: - skill = Skill(**skill) - - self.cursor.execute( - "INSERT INTO skills (id, user_id, timestamp, content, title, file_name) VALUES (?, ?, ?, ?, ?, ?)", - (skill.id, "default", skill.timestamp, skill.content, skill.title, skill.file_name), - ) - for agent in agents: - agent = AgentFlowSpec(**agent) - agent.skills = [skill.dict() for skill in agent.skills] if agent.skills else None - self.cursor.execute( - "INSERT INTO agents (id, user_id, timestamp, config, type, skills) VALUES (?, ?, ?, ?, ?, ?)", - ( - agent.id, - "default", - agent.timestamp, - json.dumps(agent.config.dict()), - agent.type, - json.dumps(agent.skills), - ), - ) - - for workflow in data["workflows"]: - workflow = AgentWorkFlowConfig(**workflow) - self.cursor.execute( - "INSERT INTO workflows (id, user_id, timestamp, sender, receiver, type, name, description, summary_method) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?)", - ( - workflow.id, - "default", - workflow.timestamp, - json.dumps(workflow.sender.dict()), - json.dumps(workflow.receiver.dict()), - workflow.type, - workflow.name, - workflow.description, - workflow.summary_method, - ), - ) - - # Commit the changes and close the connection - self.conn.commit() - - def query(self, query: str, args: Tuple = (), return_json: bool = False) -> List[Dict[str, Any]]: - """ - Executes a given SQL query and returns the results. - - Args: - query (str): The SQL query to execute. - args (Tuple): The arguments to pass to the SQL query. - return_json (bool): If True, the results will be returned as a list of dictionaries. - - Returns: - List[Dict[str, Any]]: The result of the SQL query. - """ - try: - with lock: - self.cursor.execute(query, args) - result = self.cursor.fetchall() - self.commit() - if return_json: - result = [dict(zip([key[0] for key in self.cursor.description], row)) for row in result] - return result - except Exception as e: - logger.error("Error running query with query %s and args %s: %s", query, args, e) - raise e - - def commit(self) -> None: - """ - Commits the current transaction Modelto the database. - """ - self.conn.commit() - - def close(self) -> None: - """ - Closes the database connection. - """ - self.conn.close() - - -def get_models(user_id: str, dbmanager: DBManager) -> List[dict]: - """ - Get all models for a given user from the database. - - Args: - user_id: The user id to get models for - dbmanager: The DBManager instance to interact with the database - - Returns: - A list of model configurations - """ - query = "SELECT * FROM models WHERE user_id = ? OR user_id = ?" - args = (user_id, "default") - results = dbmanager.query(query, args, return_json=True) - return results - - -def upsert_model(model: Model, dbmanager: DBManager) -> List[dict]: - """ - Insert or update a model configuration in the database. - - Args: - model: The Model object containing model configuration data - dbmanager: The DBManager instance to interact with the database - - Returns: - A list of model configurations - """ - - # Check if the model config with the provided id already exists in the database - existing_model = get_item_by_field("models", "id", model.id, dbmanager) - - if existing_model: - # If the model config exists, update it with the new data - updated_data = { - "model": model.model, - "api_key": model.api_key, - "base_url": model.base_url, - "api_type": model.api_type, - "api_version": model.api_version, - "user_id": model.user_id, - "timestamp": model.timestamp, - "description": model.description, - } - update_item("models", model.id, updated_data, dbmanager) - else: - # If the model config does not exist, insert a new one - query = """ - INSERT INTO models (id, user_id, timestamp, model, api_key, base_url, api_type, api_version, description) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) - """ - args = ( - model.id, - model.user_id, - model.timestamp, - model.model, - model.api_key, - model.base_url, - model.api_type, - model.api_version, - model.description, - ) - dbmanager.query(query=query, args=args) - - # Return the inserted or updated model config - models = get_models(model.user_id, dbmanager) - return models - - -def delete_model(model: Model, dbmanager: DBManager) -> List[dict]: - """ - Delete a model configuration from the database where id = model.id and user_id = model.user_id. - - Args: - model: The Model object containing model configuration data - dbmanager: The DBManager instance to interact with the database - - Returns: - A list of model configurations - """ - - query = "DELETE FROM models WHERE id = ? AND user_id = ?" - args = (model.id, model.user_id) - dbmanager.query(query=query, args=args) - - # Return the remaining model configs - models = get_models(model.user_id, dbmanager) - return models - - -def create_message(message: Message, dbmanager: DBManager) -> List[dict]: - """ - Save a message in the database using the provided database manager. - - :param message: The Message object containing message data - :param dbmanager: The DBManager instance used to interact with the database - """ - query = "INSERT INTO messages (user_id, root_msg_id, msg_id, role, content, metadata, timestamp, session_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?)" - args = ( - message.user_id, - message.root_msg_id, - message.msg_id, - message.role, - message.content, - message.metadata, - message.timestamp, - message.session_id, - ) - dbmanager.query(query=query, args=args) - messages = get_messages(user_id=message.user_id, session_id=message.session_id, dbmanager=dbmanager) - return messages - - -def get_messages(user_id: str, session_id: str, dbmanager: DBManager) -> List[dict]: - """ - Load messages for a specific user and session from the database, sorted by timestamp. - - :param user_id: The ID of the user whose messages are to be loaded - :param session_id: The ID of the session whose messages are to be loaded - :param dbmanager: The DBManager instance to interact with the database - - :return: A list of dictionaries, each representing a message - """ - query = "SELECT * FROM messages WHERE user_id = ? AND session_id = ?" - args = (user_id, session_id) - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=False) - return result - - -def get_sessions(user_id: str, dbmanager: DBManager) -> List[dict]: - """ - Load sessions for a specific user from the database, sorted by timestamp. - - :param user_id: The ID of the user whose sessions are to be loaded - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a session - """ - query = "SELECT * FROM sessions WHERE user_id = ?" - args = (user_id,) - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=True) - for row in result: - row["flow_config"] = json.loads(row["flow_config"]) - return result - - -def create_session(user_id: str, session: Session, dbmanager: DBManager) -> List[dict]: - """ - Create a new session for a specific user in the database. - - :param user_id: The ID of the user whose session is to be created - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a session - """ - query = "INSERT INTO sessions (user_id, id, timestamp, flow_config) VALUES (?, ?, ?,?)" - args = (session.user_id, session.id, session.timestamp, json.dumps(session.flow_config.dict())) - dbmanager.query(query=query, args=args) - sessions = get_sessions(user_id=user_id, dbmanager=dbmanager) - - return sessions - - -def rename_session(name: str, session: Session, dbmanager: DBManager) -> List[dict]: - """ - Edit a session for a specific user in the database. - - :param name: The new name of the session - :param session: The Session object containing session data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a session - """ - - query = "UPDATE sessions SET name = ? WHERE id = ?" - args = (name, session.id) - dbmanager.query(query=query, args=args) - sessions = get_sessions(user_id=session.user_id, dbmanager=dbmanager) - - return sessions - - -def delete_session(session: Session, dbmanager: DBManager) -> List[dict]: - """ - Delete a specific session and all messages for that session in the database. - - :param session: The Session object containing session data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of the remaining sessions - """ - - query = "DELETE FROM sessions WHERE id = ?" - args = (session.id,) - dbmanager.query(query=query, args=args) - - query = "DELETE FROM messages WHERE session_id = ?" - args = (session.id,) - dbmanager.query(query=query, args=args) - - return get_sessions(user_id=session.user_id, dbmanager=dbmanager) - - -def create_gallery(session: Session, dbmanager: DBManager, tags: List[str] = []) -> Gallery: - """ - Publish a session to the gallery table in the database. Fetches the session messages first, then saves session and messages object to the gallery database table. - :param session: The Session object containing session data - :param dbmanager: The DBManager instance used to interact with the database - :param tags: A list of tags to associate with the session - :return: A gallery object containing the session and messages objects - """ - - messages = get_messages(user_id=session.user_id, session_id=session.id, dbmanager=dbmanager) - gallery_item = Gallery(session=session, messages=messages, tags=tags) - query = "INSERT INTO gallery (id, session, messages, tags, timestamp) VALUES (?, ?, ?, ?,?)" - args = ( - gallery_item.id, - json.dumps(gallery_item.session.dict()), - json.dumps([message.dict() for message in gallery_item.messages]), - json.dumps(gallery_item.tags), - gallery_item.timestamp, - ) - dbmanager.query(query=query, args=args) - return gallery_item - - -def get_gallery(gallery_id, dbmanager: DBManager) -> List[Gallery]: - """ - Load gallery items from the database, sorted by timestamp. If gallery_id is provided, only the gallery item with the matching gallery_id will be returned. - - :param gallery_id: The ID of the gallery item to be loaded - :param dbmanager: The DBManager instance to interact with the database - :return: A list of Gallery objects - """ - - if gallery_id: - query = "SELECT * FROM gallery WHERE id = ?" - args = (gallery_id,) - else: - query = "SELECT * FROM gallery" - args = () - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=True) - gallery = [] - for row in result: - gallery_item = Gallery( - id=row["id"], - session=Session(**json.loads(row["session"])), - messages=[Message(**message) for message in json.loads(row["messages"])], - tags=json.loads(row["tags"]), - timestamp=row["timestamp"], - ) - gallery.append(gallery_item) - return gallery - - -def get_skills(user_id: str, dbmanager: DBManager) -> List[Skill]: - """ - Load skills from the database, sorted by timestamp. Load skills where id = user_id or user_id = default. - - :param user_id: The ID of the user whose skills are to be loaded - :param dbmanager: The DBManager instance to interact with the database - :return: A list of Skill objects - """ - - query = "SELECT * FROM skills WHERE user_id = ? OR user_id = ?" - args = (user_id, "default") - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=True) - skills = [] - for row in result: - skill = Skill(**row) - skills.append(skill) - return skills - - -def upsert_skill(skill: Skill, dbmanager: DBManager) -> List[Skill]: - """ - Insert or update a skill for a specific user in the database. - - If the skill with the given ID already exists, it will be updated with the new data. - Otherwise, a new skill will be created. - - :param skill: The Skill object containing skill data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a skill - """ - - existing_skill = get_item_by_field("skills", "id", skill.id, dbmanager) - - if existing_skill: - updated_data = { - "user_id": skill.user_id, - "timestamp": skill.timestamp, - "content": skill.content, - "title": skill.title, - "file_name": skill.file_name, - } - update_item("skills", skill.id, updated_data, dbmanager) - else: - query = "INSERT INTO skills (id, user_id, timestamp, content, title, file_name) VALUES (?, ?, ?, ?, ?, ?)" - args = (skill.id, skill.user_id, skill.timestamp, skill.content, skill.title, skill.file_name) - dbmanager.query(query=query, args=args) - - skills = get_skills(user_id=skill.user_id, dbmanager=dbmanager) - - return skills - - -def delete_skill(skill: Skill, dbmanager: DBManager) -> List[Skill]: - """ - Delete a skill for a specific user in the database. - - :param skill: The Skill object containing skill data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a skill - """ - # delete where id = skill.id and user_id = skill.user_id - query = "DELETE FROM skills WHERE id = ? AND user_id = ?" - args = (skill.id, skill.user_id) - dbmanager.query(query=query, args=args) - - return get_skills(user_id=skill.user_id, dbmanager=dbmanager) - - -def delete_message( - user_id: str, msg_id: str, session_id: str, dbmanager: DBManager, delete_all: bool = False -) -> List[dict]: - """ - Delete a specific message or all messages for a user and session from the database. - - :param user_id: The ID of the user whose messages are to be deleted - :param msg_id: The ID of the specific message to be deleted (ignored if delete_all is True) - :param session_id: The ID of the session whose messages are to be deleted - :param dbmanager: The DBManager instance to interact with the database - :param delete_all: If True, all messages for the user will be deleted - :return: A list of the remaining messages if not all were deleted, otherwise an empty list - """ - - if delete_all: - query = "DELETE FROM messages WHERE user_id = ? AND session_id = ?" - args = (user_id, session_id) - dbmanager.query(query=query, args=args) - return [] - else: - query = "DELETE FROM messages WHERE user_id = ? AND msg_id = ? AND session_id = ?" - args = (user_id, msg_id, session_id) - dbmanager.query(query=query, args=args) - messages = get_messages(user_id=user_id, session_id=session_id, dbmanager=dbmanager) - return messages - - -def get_agents(user_id: str, dbmanager: DBManager) -> List[AgentFlowSpec]: - """ - Load agents from the database, sorted by timestamp. Load agents where id = user_id or user_id = default. - - :param user_id: The ID of the user whose agents are to be loaded - :param dbmanager: The DBManager instance to interact with the database - :return: A list of AgentFlowSpec objects - """ - - query = "SELECT * FROM agents WHERE user_id = ? OR user_id = ?" - args = (user_id, "default") - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=True) - agents = [] - for row in result: - row["config"] = json.loads(row["config"]) - row["skills"] = json.loads(row["skills"] or "[]") - agent = AgentFlowSpec(**row) - agents.append(agent) - return agents - - -def upsert_agent(agent_flow_spec: AgentFlowSpec, dbmanager: DBManager) -> List[Dict[str, Any]]: - """ - Insert or update an agent for a specific user in the database. - - If the agent with the given ID already exists, it will be updated with the new data. - Otherwise, a new agent will be created. - - :param agent_flow_spec: The AgentFlowSpec object containing agent configuration - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing an agent after insertion or update - """ - - existing_agent = get_item_by_field("agents", "id", agent_flow_spec.id, dbmanager) - - if existing_agent: - updated_data = { - "user_id": agent_flow_spec.user_id, - "timestamp": agent_flow_spec.timestamp, - "config": json.dumps(agent_flow_spec.config.dict()), - "type": agent_flow_spec.type, - "skills": json.dumps([x.dict() for x in agent_flow_spec.skills] if agent_flow_spec.skills else []), - } - update_item("agents", agent_flow_spec.id, updated_data, dbmanager) - else: - query = "INSERT INTO agents (id, user_id, timestamp, config, type, skills) VALUES (?, ?, ?, ?, ?,?)" - config_json = json.dumps(agent_flow_spec.config.dict()) - args = ( - agent_flow_spec.id, - agent_flow_spec.user_id, - agent_flow_spec.timestamp, - config_json, - agent_flow_spec.type, - json.dumps([x.dict() for x in agent_flow_spec.skills] if agent_flow_spec.skills else []), - ) - dbmanager.query(query=query, args=args) - - agents = get_agents(user_id=agent_flow_spec.user_id, dbmanager=dbmanager) - return agents - - -def delete_agent(agent: AgentFlowSpec, dbmanager: DBManager) -> List[Dict[str, Any]]: - """ - Delete an agent for a specific user from the database. - - :param agent: The AgentFlowSpec object containing agent configuration - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing an agent after deletion - """ - - # delete based on agent.id and agent.user_id - query = "DELETE FROM agents WHERE id = ? AND user_id = ?" - args = (agent.id, agent.user_id) - dbmanager.query(query=query, args=args) - - return get_agents(user_id=agent.user_id, dbmanager=dbmanager) - - -def get_item_by_field(table: str, field: str, value: Any, dbmanager: DBManager) -> Optional[Dict[str, Any]]: - query = f"SELECT * FROM {table} WHERE {field} = ?" - args = (value,) - result = dbmanager.query(query=query, args=args) - return result[0] if result else None - - -def update_item(table: str, item_id: str, updated_data: Dict[str, Any], dbmanager: DBManager) -> None: - set_clause = ", ".join([f"{key} = ?" for key in updated_data.keys()]) - query = f"UPDATE {table} SET {set_clause} WHERE id = ?" - args = (*updated_data.values(), item_id) - dbmanager.query(query=query, args=args) - - -def get_workflows(user_id: str, dbmanager: DBManager) -> List[Dict[str, Any]]: - """ - Load workflows for a specific user from the database, sorted by timestamp. - - :param user_id: The ID of the user whose workflows are to be loaded - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a workflow - """ - query = "SELECT * FROM workflows WHERE user_id = ? OR user_id = ?" - args = (user_id, "default") - result = dbmanager.query(query=query, args=args, return_json=True) - # Sort by timestamp ascending - result = sorted(result, key=lambda k: k["timestamp"], reverse=True) - workflows = [] - for row in result: - row["sender"] = json.loads(row["sender"]) - row["receiver"] = json.loads(row["receiver"]) - workflow = AgentWorkFlowConfig(**row) - workflows.append(workflow) - return workflows - - -def upsert_workflow(workflow: AgentWorkFlowConfig, dbmanager: DBManager) -> List[Dict[str, Any]]: - """ - Insert or update a workflow for a specific user in the database. - - If the workflow with the given ID already exists, it will be updated with the new data. - Otherwise, a new workflow will be created. - - :param workflow: The AgentWorkFlowConfig object containing workflow data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a workflow after insertion or update - """ - existing_workflow = get_item_by_field("workflows", "id", workflow.id, dbmanager) - - # print(workflow.receiver) - - if existing_workflow: - updated_data = { - "user_id": workflow.user_id, - "timestamp": workflow.timestamp, - "sender": json.dumps(workflow.sender.dict()), - "receiver": json.dumps( - [receiver.dict() for receiver in workflow.receiver] - if isinstance(workflow.receiver, list) - else workflow.receiver.dict() - ), - "type": workflow.type, - "name": workflow.name, - "description": workflow.description, - "summary_method": workflow.summary_method, - } - update_item("workflows", workflow.id, updated_data, dbmanager) - else: - query = "INSERT INTO workflows (id, user_id, timestamp, sender, receiver, type, name, description, summary_method) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?)" - args = ( - workflow.id, - workflow.user_id, - workflow.timestamp, - json.dumps(workflow.sender.dict()), - json.dumps( - [receiver.dict() for receiver in workflow.receiver] - if isinstance(workflow.receiver, list) - else workflow.receiver.dict() - ), - workflow.type, - workflow.name, - workflow.description, - workflow.summary_method, - ) - dbmanager.query(query=query, args=args) - - return get_workflows(user_id=workflow.user_id, dbmanager=dbmanager) - - -def delete_workflow(workflow: AgentWorkFlowConfig, dbmanager: DBManager) -> List[Dict[str, Any]]: - """ - Delete a workflow for a specific user from the database. If the workflow does not exist, do nothing. - - :param workflow: The AgentWorkFlowConfig object containing workflow data - :param dbmanager: The DBManager instance to interact with the database - :return: A list of dictionaries, each representing a workflow after deletion - """ - - # delete where workflow.id =id and workflow.user_id = user_id - - query = "DELETE FROM workflows WHERE id = ? AND user_id = ?" - args = (workflow.id, workflow.user_id) - dbmanager.query(query=query, args=args) - - return get_workflows(user_id=workflow.user_id, dbmanager=dbmanager) diff --git a/samples/apps/autogen-studio/autogenstudio/utils/utils.py b/samples/apps/autogen-studio/autogenstudio/utils/utils.py index 49a8ac91acd..ed533ec3883 100644 --- a/samples/apps/autogen-studio/autogenstudio/utils/utils.py +++ b/samples/apps/autogen-studio/autogenstudio/utils/utils.py @@ -3,15 +3,17 @@ import os import re import shutil +from datetime import datetime from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union from dotenv import load_dotenv +from loguru import logger -import autogen -from autogen.oai.client import OpenAIWrapper +from autogen.coding import DockerCommandLineCodeExecutor, LocalCommandLineCodeExecutor +from autogen.oai.client import ModelClient, OpenAIWrapper -from ..datamodel import AgentConfig, AgentFlowSpec, AgentWorkFlowConfig, LLMConfig, Model, Skill +from ..datamodel import CodeExecutionConfigTypes, Model, Skill from ..version import APP_NAME @@ -25,6 +27,23 @@ def md5_hash(text: str) -> str: return hashlib.md5(text.encode()).hexdigest() +def check_and_cast_datetime_fields(obj: Any) -> Any: + if hasattr(obj, "created_at") and isinstance(obj.created_at, str): + obj.created_at = str_to_datetime(obj.created_at) + + if hasattr(obj, "updated_at") and isinstance(obj.updated_at, str): + obj.updated_at = str_to_datetime(obj.updated_at) + + return obj + + +def str_to_datetime(dt_str: str) -> datetime: + if dt_str[-1] == "Z": + # Replace 'Z' with '+00:00' for UTC timezone + dt_str = dt_str[:-1] + "+00:00" + return datetime.fromisoformat(dt_str) + + def clear_folder(folder_path: str) -> None: """ Clear the contents of a folder. @@ -98,7 +117,16 @@ def get_file_type(file_path: str) -> str: CSV_EXTENSIONS = {".csv", ".xlsx"} # Supported image extensions - IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".svg", ".webp"} + IMAGE_EXTENSIONS = { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".svg", + ".webp", + } # Supported (web) video extensions VIDEO_EXTENSIONS = {".mp4", ".webm", ".ogg", ".mov", ".avi", ".wmv"} @@ -199,20 +227,42 @@ def get_modified_files(start_timestamp: float, end_timestamp: float, source_dir: return modified_files -def init_app_folders(app_file_path: str) -> Dict[str, str]: +def get_app_root() -> str: """ - Initialize folders needed for a web server, such as static file directories - and user-specific data directories. Also load any .env file if it exists. + Get the root directory of the application. - :param root_file_path: The root directory where webserver folders will be created - :return: A dictionary with the path of each created folder + :return: The root directory of the application. """ - app_name = f".{APP_NAME}" default_app_root = os.path.join(os.path.expanduser("~"), app_name) if not os.path.exists(default_app_root): os.makedirs(default_app_root, exist_ok=True) app_root = os.environ.get("AUTOGENSTUDIO_APPDIR") or default_app_root + return app_root + + +def get_db_uri(app_root: str) -> str: + """ + Get the default database URI for the application. + + :param app_root: The root directory of the application. + :return: The default database URI. + """ + db_uri = f"sqlite:///{os.path.join(app_root, 'database.sqlite')}" + db_uri = os.environ.get("AUTOGENSTUDIO_DATABASE_URI") or db_uri + logger.info(f"Using database URI: {db_uri}") + return db_uri + + +def init_app_folders(app_file_path: str) -> Dict[str, str]: + """ + Initialize folders needed for a web server, such as static file directories + and user-specific data directories. Also load any .env file if it exists. + + :param root_file_path: The root directory where webserver folders will be created + :return: A dictionary with the path of each created folder + """ + app_root = get_app_root() if not os.path.exists(app_root): os.makedirs(app_root, exist_ok=True) @@ -220,7 +270,7 @@ def init_app_folders(app_file_path: str) -> Dict[str, str]: # load .env file if it exists env_file = os.path.join(app_root, ".env") if os.path.exists(env_file): - print(f"Loading environment variables from {env_file}") + logger.info(f"Loaded environment variables from {env_file}") load_dotenv(env_file) files_static_root = os.path.join(app_root, "files/") @@ -233,8 +283,9 @@ def init_app_folders(app_file_path: str) -> Dict[str, str]: "files_static_root": files_static_root, "static_folder_root": static_folder_root, "app_root": app_root, + "database_engine_uri": get_db_uri(app_root=app_root), } - print(f"Initialized application data folder: {app_root}") + logger.info(f"Initialized application data folder: {app_root}") return folders @@ -258,11 +309,11 @@ def get_skills_from_prompt(skills: List[Skill], work_dir: str) -> str: for skill in skills: prompt += f""" -##### Begin of {skill.title} ##### +##### Begin of {skill.name} ##### {skill.content} -#### End of {skill.title} #### +#### End of {skill.name} #### """ @@ -290,7 +341,6 @@ def delete_files_in_folder(folders: Union[str, List[str]]) -> None: for folder in folders: # Check if the folder exists if not os.path.isdir(folder): - print(f"The folder {folder} does not exist.") continue # List all the entries in the directory @@ -306,56 +356,7 @@ def delete_files_in_folder(folders: Union[str, List[str]]) -> None: shutil.rmtree(path) except Exception as e: # Print the error message and skip - print(f"Failed to delete {path}. Reason: {e}") - - -def get_default_agent_config(work_dir: str) -> AgentWorkFlowConfig: - """ - Get a default agent flow config . - """ - - llm_config = LLMConfig( - config_list=[{"model": "gpt-4"}], - temperature=0, - ) - - USER_PROXY_INSTRUCTIONS = """If the request has been addressed sufficiently, summarize the answer and end with the word TERMINATE. Otherwise, ask a follow-up question. - """ - - userproxy_spec = AgentFlowSpec( - type="userproxy", - config=AgentConfig( - name="user_proxy", - human_input_mode="NEVER", - system_message=USER_PROXY_INSTRUCTIONS, - code_execution_config={ - "work_dir": work_dir, - "use_docker": False, - }, - max_consecutive_auto_reply=10, - llm_config=llm_config, - is_termination_msg=lambda x: x.get("content", "").rstrip().endswith("TERMINATE"), - ), - ) - - assistant_spec = AgentFlowSpec( - type="assistant", - config=AgentConfig( - name="primary_assistant", - system_message=autogen.AssistantAgent.DEFAULT_SYSTEM_MESSAGE, - llm_config=llm_config, - ), - ) - - flow_config = AgentWorkFlowConfig( - name="default", - sender=userproxy_spec, - receiver=assistant_spec, - type="default", - description="Default agent flow config", - ) - - return flow_config + logger.info(f"Failed to delete {path}. Reason: {e}") def extract_successful_code_blocks(messages: List[Dict[str, str]]) -> List[str]: @@ -392,7 +393,7 @@ def sanitize_model(model: Model): Sanitize model dictionary to remove None values and empty strings and only keep valid keys. """ if isinstance(model, Model): - model = model.dict() + model = model.model_dump() valid_keys = ["model", "base_url", "api_key", "api_type", "api_version"] # only add key if value is not None sanitized_model = {k: v for k, v in model.items() if (v is not None and v != "") and k in valid_keys} @@ -410,16 +411,36 @@ def test_model(model: Model): return response.choices[0].message.content -# summarize_chat_history (messages, model) .. returns a summary of the chat history +def load_code_execution_config(code_execution_type: CodeExecutionConfigTypes, work_dir: str): + """ + Load the code execution configuration based on the code execution type. + :param code_execution_type: The code execution type. + :param work_dir: The working directory to store code execution files. + :return: The code execution configuration. -def summarize_chat_history(task: str, messages: List[Dict[str, str]], model: Model): + """ + work_dir = Path(work_dir) + work_dir.mkdir(exist_ok=True) + executor = None + if code_execution_type == CodeExecutionConfigTypes.local: + executor = LocalCommandLineCodeExecutor(work_dir=work_dir) + elif code_execution_type == CodeExecutionConfigTypes.docker: + executor = DockerCommandLineCodeExecutor(work_dir=work_dir) + elif code_execution_type == CodeExecutionConfigTypes.none: + return False + else: + raise ValueError(f"Invalid code execution type: {code_execution_type}") + code_execution_config = { + "executor": executor, + } + return code_execution_config + + +def summarize_chat_history(task: str, messages: List[Dict[str, str]], client: ModelClient): """ Summarize the chat history using the model endpoint and returning the response. """ - - sanitized_model = sanitize_model(model) - client = OpenAIWrapper(config_list=[sanitized_model]) summarization_system_prompt = f""" You are a helpful assistant that is able to review the chat history between a set of agents (userproxy agents, assistants etc) as they try to address a given TASK and provide a summary. Be SUCCINCT but also comprehensive enough to allow others (who cannot see the chat history) understand and recreate the solution. @@ -427,7 +448,7 @@ def summarize_chat_history(task: str, messages: List[Dict[str, str]], model: Mod === {task} === - The summary should focus on extracting the actual solution to the task from the chat history (assuming the task was addressed) such that any other agent reading the summary will understand what the actual solution is. Use a neutral tone and DO NOT directly mention the agents. Instead only focus on the actions that were carried out (e.g. do not say 'assistant agent generated some code visualization code ..' instead say say 'visualization code was generated ..' ). + The summary should focus on extracting the actual solution to the task from the chat history (assuming the task was addressed) such that any other agent reading the summary will understand what the actual solution is. Use a neutral tone and DO NOT directly mention the agents. Instead only focus on the actions that were carried out (e.g. do not say 'assistant agent generated some code visualization code ..' instead say say 'visualization code was generated ..'. The answer should be framed as a response to the user task. E.g. if the task is "What is the height of the Eiffel tower", the summary should be "The height of the Eiffel Tower is ..."). """ summarization_prompt = [ { diff --git a/samples/apps/autogen-studio/autogenstudio/version.py b/samples/apps/autogen-studio/autogenstudio/version.py index 18b7f42aac3..bafe37f75b1 100644 --- a/samples/apps/autogen-studio/autogenstudio/version.py +++ b/samples/apps/autogen-studio/autogenstudio/version.py @@ -1,3 +1,3 @@ -VERSION = "0.0.54" +VERSION = "0.0.56rc9" __version__ = VERSION APP_NAME = "autogenstudio" diff --git a/samples/apps/autogen-studio/autogenstudio/web/app.py b/samples/apps/autogen-studio/autogenstudio/web/app.py index 6d5412e9fed..76ab8139ebc 100644 --- a/samples/apps/autogen-studio/autogenstudio/web/app.py +++ b/samples/apps/autogen-studio/autogenstudio/web/app.py @@ -1,25 +1,23 @@ import asyncio -import json import os import queue import threading import traceback from contextlib import asynccontextmanager +from typing import Any -from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect +from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles +from loguru import logger from openai import OpenAIError from ..chatmanager import AutoGenChatManager, WebSocketConnectionManager -from ..datamodel import ( - DBWebRequestModel, - DeleteMessageWebRequestModel, - Message, - Session, -) -from ..utils import DBManager, dbutils, init_app_folders, md5_hash, test_model -from ..version import APP_NAME, VERSION +from ..database import workflow_from_id +from ..database.dbmanager import DBManager +from ..datamodel import Agent, Message, Model, Response, Session, Skill, Workflow +from ..utils import check_and_cast_datetime_fields, init_app_folders, md5_hash, test_model +from ..version import VERSION managers = {"chat": None} # manage calls to autogen # Create thread-safe queue for messages between api thread and autogen threads @@ -27,18 +25,29 @@ active_connections = [] active_connections_lock = asyncio.Lock() websocket_manager = WebSocketConnectionManager( - active_connections=active_connections, active_connections_lock=active_connections_lock + active_connections=active_connections, + active_connections_lock=active_connections_lock, ) def message_handler(): while True: message = message_queue.get() - print("Active Connections: ", [client_id for _, client_id in websocket_manager.active_connections]) - print("Current message connection id: ", message["connection_id"]) + logger.info( + "** Processing Agent Message on Queue: Active Connections: " + + str([client_id for _, client_id in websocket_manager.active_connections]) + + " **" + ) for connection, socket_client_id in websocket_manager.active_connections: if message["connection_id"] == socket_client_id: + logger.info( + f"Sending message to connection_id: {message['connection_id']}. Connection ID: {socket_client_id}" + ) asyncio.run(websocket_manager.send_message(message, connection)) + else: + logger.info( + f"Skipping message for connection_id: {message['connection_id']}. Connection ID: {socket_client_id}" + ) message_queue.task_done() @@ -46,10 +55,19 @@ def message_handler(): message_handler_thread.start() +app_file_path = os.path.dirname(os.path.abspath(__file__)) +folders = init_app_folders(app_file_path) +ui_folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui") + +database_engine_uri = folders["database_engine_uri"] +dbmanager = DBManager(engine_uri=database_engine_uri) + + @asynccontextmanager async def lifespan(app: FastAPI): print("***** App started *****") managers["chat"] = AutoGenChatManager(message_queue=message_queue) + dbmanager.create_db_and_tables() yield # Close all active connections @@ -75,477 +93,312 @@ async def lifespan(app: FastAPI): ) -app_file_path = os.path.dirname(os.path.abspath(__file__)) -# init folders skills, workdir, static, files etc -folders = init_app_folders(app_file_path) -ui_folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui") - api = FastAPI(root_path="/api") # mount an api route such that the main route serves the ui and the /api app.mount("/api", api) app.mount("/", StaticFiles(directory=ui_folder_path, html=True), name="ui") -api.mount("/files", StaticFiles(directory=folders["files_static_root"], html=True), name="files") +api.mount( + "/files", + StaticFiles(directory=folders["files_static_root"], html=True), + name="files", +) -db_path = os.path.join(folders["app_root"], "database.sqlite") -dbmanager = DBManager(path=db_path) # manage database operations # manage websocket connections -@api.post("/messages") -async def add_message(req: DBWebRequestModel): - message = Message(**req.message.dict()) - user_history = dbutils.get_messages(user_id=message.user_id, session_id=req.message.session_id, dbmanager=dbmanager) - - # save incoming message to db - dbutils.create_message(message=message, dbmanager=dbmanager) - user_dir = os.path.join(folders["files_static_root"], "user", md5_hash(message.user_id)) - os.makedirs(user_dir, exist_ok=True) - +def create_entity(model: Any, model_class: Any, filters: dict = None): + """Create a new entity""" + model = check_and_cast_datetime_fields(model) try: - response_message: Message = managers["chat"].chat( - message=message, - history=user_history, - user_dir=user_dir, - flow_config=req.workflow, - connection_id=req.connection_id, - ) - - # save agent's response to db - messages = dbutils.create_message(message=response_message, dbmanager=dbmanager) - response = { - "status": True, - "message": "Message processed successfully", - "data": messages, - # "metadata": json.loads(response_message.metadata), - } - return response - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while processing message: " + str(ex_error), - } - - -@api.get("/messages") -async def get_messages(user_id: str = None, session_id: str = None): - if user_id is None: - raise HTTPException(status_code=400, detail="user_id is required") - try: - user_history = dbutils.get_messages(user_id=user_id, session_id=session_id, dbmanager=dbmanager) - - return { - "status": True, - "data": user_history, - "message": "Messages retrieved successfully", - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving messages: " + str(ex_error), - } - + response: Response = dbmanager.upsert(model) + return response.model_dump(mode="json") -@api.get("/gallery") -async def get_gallery_items(gallery_id: str = None): - try: - gallery = dbutils.get_gallery(gallery_id=gallery_id, dbmanager=dbmanager) - return { - "status": True, - "data": gallery, - "message": "Gallery items retrieved successfully", - } except Exception as ex_error: print(ex_error) return { "status": False, - "message": "Error occurred while retrieving messages: " + str(ex_error), + "message": f"Error occurred while creating {model_class.__name__}: " + str(ex_error), } -@api.get("/sessions") -async def get_user_sessions(user_id: str = None): - """Return a list of all sessions for a user""" - if user_id is None: - raise HTTPException(status_code=400, detail="user_id is required") +def list_entity( + model_class: Any, + filters: dict = None, + return_json: bool = True, + order: str = "desc", +): + """List all entities for a user""" + return dbmanager.get(model_class, filters=filters, return_json=return_json, order=order) - try: - user_sessions = dbutils.get_sessions(user_id=user_id, dbmanager=dbmanager) - return { - "status": True, - "data": user_sessions, - "message": "Sessions retrieved successfully", - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving sessions: " + str(ex_error), - } +def delete_entity(model_class: Any, filters: dict = None): + """Delete an entity""" + return dbmanager.delete(filters=filters, model_class=model_class) -@api.post("/sessions") -async def create_user_session(req: DBWebRequestModel): - """Create a new session for a user""" - # print(req.session, "**********" ) - - try: - session = Session(user_id=req.session.user_id, flow_config=req.session.flow_config) - user_sessions = dbutils.create_session(user_id=req.user_id, session=session, dbmanager=dbmanager) - - return { - "status": True, - "message": "Session created successfully", - "data": user_sessions, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while creating session: " + str(ex_error), - } - - -@api.post("/sessions/rename") -async def rename_user_session(name: str, req: DBWebRequestModel): - """Rename a session for a user""" - print("Rename: " + name) - print("renaming session for user: " + req.user_id + " to: " + name) - try: - session = dbutils.rename_session(name=name, session=req.session, dbmanager=dbmanager) - return { - "status": True, - "message": "Session renamed successfully", - "data": session, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while renaming session: " + str(ex_error), - } - - -@api.post("/sessions/publish") -async def publish_user_session_to_gallery(req: DBWebRequestModel): - """Create a new session for a user""" - - try: - gallery_item = dbutils.create_gallery(req.session, tags=req.tags, dbmanager=dbmanager) - return { - "status": True, - "message": "Session successfully published", - "data": gallery_item, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while publishing session: " + str(ex_error), - } +@api.get("/skills") +async def list_skills(user_id: str): + """List all skills for a user""" + filters = {"user_id": user_id} + return list_entity(Skill, filters=filters) -@api.delete("/sessions/delete") -async def delete_user_session(req: DBWebRequestModel): - """Delete a session for a user""" - try: - sessions = dbutils.delete_session(session=req.session, dbmanager=dbmanager) - return { - "status": True, - "message": "Session deleted successfully", - "data": sessions, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while deleting session: " + str(ex_error), - } +@api.post("/skills") +async def create_skill(skill: Skill): + """Create a new skill""" + filters = {"user_id": skill.user_id} + return create_entity(skill, Skill, filters=filters) -@api.post("/messages/delete") -async def remove_message(req: DeleteMessageWebRequestModel): - """Delete a message from the database""" +@api.delete("/skills/delete") +async def delete_skill(skill_id: int, user_id: str): + """Delete a skill""" + filters = {"id": skill_id, "user_id": user_id} + return delete_entity(Skill, filters=filters) - try: - messages = dbutils.delete_message( - user_id=req.user_id, msg_id=req.msg_id, session_id=req.session_id, dbmanager=dbmanager - ) - return { - "status": True, - "message": "Message deleted successfully", - "data": messages, - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while deleting message: " + str(ex_error), - } +@api.get("/models") +async def list_models(user_id: str): + """List all models for a user""" + filters = {"user_id": user_id} + return list_entity(Model, filters=filters) -@api.get("/skills") -async def get_user_skills(user_id: str): - try: - skills = dbutils.get_skills(user_id, dbmanager=dbmanager) - return { - "status": True, - "message": "Skills retrieved successfully", - "data": skills, - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving skills: " + str(ex_error), - } +@api.post("/models") +async def create_model(model: Model): + """Create a new model""" + return create_entity(model, Model) -@api.post("/skills") -async def create_user_skills(req: DBWebRequestModel): +@api.post("/models/test") +async def test_model_endpoint(model: Model): + """Test a model""" try: - skills = dbutils.upsert_skill(skill=req.skill, dbmanager=dbmanager) + response = test_model(model) return { "status": True, - "message": "Skills retrieved successfully", - "data": skills, + "message": "Model tested successfully", + "data": response, } - - except Exception as ex_error: - print(ex_error) + except (OpenAIError, Exception) as ex_error: return { "status": False, - "message": "Error occurred while creating skills: " + str(ex_error), + "message": "Error occurred while testing model: " + str(ex_error), } -@api.delete("/skills/delete") -async def delete_user_skills(req: DBWebRequestModel): - """Delete a skill for a user""" +@api.delete("/models/delete") +async def delete_model(model_id: int, user_id: str): + """Delete a model""" + filters = {"id": model_id, "user_id": user_id} + return delete_entity(Model, filters=filters) - try: - skills = dbutils.delete_skill(req.skill, dbmanager=dbmanager) - return { - "status": True, - "message": "Skill deleted successfully", - "data": skills, - } +@api.get("/agents") +async def list_agents(user_id: str): + """List all agents for a user""" + filters = {"user_id": user_id} + return list_entity(Agent, filters=filters) - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while deleting skill: " + str(ex_error), - } +@api.post("/agents") +async def create_agent(agent: Agent): + """Create a new agent""" + return create_entity(agent, Agent) -@api.get("/agents") -async def get_user_agents(user_id: str): - try: - agents = dbutils.get_agents(user_id, dbmanager=dbmanager) - return { - "status": True, - "message": "Agents retrieved successfully", - "data": agents, - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving agents: " + str(ex_error), - } +@api.delete("/agents/delete") +async def delete_agent(agent_id: int, user_id: str): + """Delete an agent""" + filters = {"id": agent_id, "user_id": user_id} + return delete_entity(Agent, filters=filters) -@api.post("/agents") -async def create_user_agents(req: DBWebRequestModel): - """Create a new agent for a user""" +@api.post("/agents/link/model/{agent_id}/{model_id}") +async def link_agent_model(agent_id: int, model_id: int): + """Link a model to an agent""" + return dbmanager.link(link_type="agent_model", primary_id=agent_id, secondary_id=model_id) - try: - agents = dbutils.upsert_agent(agent_flow_spec=req.agent, dbmanager=dbmanager) - return { - "status": True, - "message": "Agent created successfully", - "data": agents, - } +@api.delete("/agents/link/model/{agent_id}/{model_id}") +async def unlink_agent_model(agent_id: int, model_id: int): + """Unlink a model from an agent""" + return dbmanager.unlink(link_type="agent_model", primary_id=agent_id, secondary_id=model_id) - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while creating agent: " + str(ex_error), - } +@api.get("/agents/link/model/{agent_id}") +async def get_agent_models(agent_id: int): + """Get all models linked to an agent""" + return dbmanager.get_linked_entities("agent_model", agent_id, return_json=True) -@api.delete("/agents/delete") -async def delete_user_agent(req: DBWebRequestModel): - """Delete an agent for a user""" - try: - agents = dbutils.delete_agent(agent=req.agent, dbmanager=dbmanager) +@api.post("/agents/link/skill/{agent_id}/{skill_id}") +async def link_agent_skill(agent_id: int, skill_id: int): + """Link an a skill to an agent""" + return dbmanager.link(link_type="agent_skill", primary_id=agent_id, secondary_id=skill_id) - return { - "status": True, - "message": "Agent deleted successfully", - "data": agents, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while deleting agent: " + str(ex_error), - } +@api.delete("/agents/link/skill/{agent_id}/{skill_id}") +async def unlink_agent_skill(agent_id: int, skill_id: int): + """Unlink an a skill from an agent""" + return dbmanager.unlink(link_type="agent_skill", primary_id=agent_id, secondary_id=skill_id) -@api.get("/models") -async def get_user_models(user_id: str): - try: - models = dbutils.get_models(user_id, dbmanager=dbmanager) +@api.get("/agents/link/skill/{agent_id}") +async def get_agent_skills(agent_id: int): + """Get all skills linked to an agent""" + return dbmanager.get_linked_entities("agent_skill", agent_id, return_json=True) - return { - "status": True, - "message": "Models retrieved successfully", - "data": models, - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving models: " + str(ex_error), - } +@api.post("/agents/link/agent/{primary_agent_id}/{secondary_agent_id}") +async def link_agent_agent(primary_agent_id: int, secondary_agent_id: int): + """Link an agent to another agent""" + return dbmanager.link( + link_type="agent_agent", + primary_id=primary_agent_id, + secondary_id=secondary_agent_id, + ) -@api.post("/models") -async def create_user_models(req: DBWebRequestModel): - """Create a new model for a user""" - try: - models = dbutils.upsert_model(model=req.model, dbmanager=dbmanager) +@api.delete("/agents/link/agent/{primary_agent_id}/{secondary_agent_id}") +async def unlink_agent_agent(primary_agent_id: int, secondary_agent_id: int): + """Unlink an agent from another agent""" + return dbmanager.unlink( + link_type="agent_agent", + primary_id=primary_agent_id, + secondary_id=secondary_agent_id, + ) - return { - "status": True, - "message": "Model created successfully", - "data": models, - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while creating model: " + str(ex_error), - } +@api.get("/agents/link/agent/{agent_id}") +async def get_linked_agents(agent_id: int): + """Get all agents linked to an agent""" + return dbmanager.get_linked_entities("agent_agent", agent_id, return_json=True) -@api.post("/models/test") -async def test_user_models(req: DBWebRequestModel): - """Test a model to verify it works""" +@api.get("/workflows") +async def list_workflows(user_id: str): + """List all workflows for a user""" + filters = {"user_id": user_id} + return list_entity(Workflow, filters=filters) - try: - response = test_model(model=req.model) - return { - "status": True, - "message": "Model tested successfully", - "data": response, - } - except OpenAIError as oai_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while testing model: " + str(oai_error), - } - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while testing model: " + str(ex_error), - } +@api.get("/workflows/{workflow_id}") +async def get_workflow(workflow_id: int, user_id: str): + """Get a workflow""" + filters = {"id": workflow_id, "user_id": user_id} + return list_entity(Workflow, filters=filters) -@api.delete("/models/delete") -async def delete_user_model(req: DBWebRequestModel): - """Delete a model for a user""" +@api.post("/workflows") +async def create_workflow(workflow: Workflow): + """Create a new workflow""" + return create_entity(workflow, Workflow) - try: - models = dbutils.delete_model(model=req.model, dbmanager=dbmanager) - return { - "status": True, - "message": "Model deleted successfully", - "data": models, - } +@api.delete("/workflows/delete") +async def delete_workflow(workflow_id: int, user_id: str): + """Delete a workflow""" + filters = {"id": workflow_id, "user_id": user_id} + return delete_entity(Workflow, filters=filters) + + +@api.post("/workflows/link/agent/{workflow_id}/{agent_id}/{agent_type}") +async def link_workflow_agent(workflow_id: int, agent_id: int, agent_type: str): + """Link an agent to a workflow""" + return dbmanager.link( + link_type="workflow_agent", + primary_id=workflow_id, + secondary_id=agent_id, + agent_type=agent_type, + ) + + +@api.delete("/workflows/link/agent/{workflow_id}/{agent_id}/{agent_type}") +async def unlink_workflow_agent(workflow_id: int, agent_id: int, agent_type: str): + """Unlink an agent from a workflow""" + return dbmanager.unlink( + link_type="workflow_agent", + primary_id=workflow_id, + secondary_id=agent_id, + agent_type=agent_type, + ) + + +@api.get("/workflows/link/agent/{workflow_id}/{agent_type}") +async def get_linked_workflow_agents(workflow_id: int, agent_type: str): + """Get all agents linked to a workflow""" + return dbmanager.get_linked_entities( + link_type="workflow_agent", + primary_id=workflow_id, + agent_type=agent_type, + return_json=True, + ) - except Exception as ex_error: - print(traceback.format_exc()) - return { - "status": False, - "message": "Error occurred while deleting model: " + str(ex_error), - } +@api.get("/sessions") +async def list_sessions(user_id: str): + """List all sessions for a user""" + filters = {"user_id": user_id} + return list_entity(Session, filters=filters) -@api.get("/workflows") -async def get_user_workflows(user_id: str): - try: - workflows = dbutils.get_workflows(user_id, dbmanager=dbmanager) - return { - "status": True, - "message": "Workflows retrieved successfully", - "data": workflows, - } - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while retrieving workflows: " + str(ex_error), - } +@api.post("/sessions") +async def create_session(session: Session): + """Create a new session""" + return create_entity(session, Session) -@api.post("/workflows") -async def create_user_workflow(req: DBWebRequestModel): - """Create a new workflow for a user""" - try: - workflow = dbutils.upsert_workflow(workflow=req.workflow, dbmanager=dbmanager) - return { - "status": True, - "message": "Workflow created successfully", - "data": workflow, - } +@api.delete("/sessions/delete") +async def delete_session(session_id: int, user_id: str): + """Delete a session""" + filters = {"id": session_id, "user_id": user_id} + return delete_entity(Session, filters=filters) - except Exception as ex_error: - print(ex_error) - return { - "status": False, - "message": "Error occurred while creating workflow: " + str(ex_error), - } +@api.get("/sessions/{session_id}/messages") +async def list_messages(user_id: str, session_id: int): + """List all messages for a use session""" + filters = {"user_id": user_id, "session_id": session_id} + return list_entity(Message, filters=filters, order="asc", return_json=True) -@api.delete("/workflows/delete") -async def delete_user_workflow(req: DBWebRequestModel): - """Delete a workflow for a user""" +@api.post("/sessions/{session_id}/workflow/{workflow_id}/run") +async def run_session_workflow(message: Message, session_id: int, workflow_id: int): + """Runs a workflow on provided message""" try: - workflow = dbutils.delete_workflow(workflow=req.workflow, dbmanager=dbmanager) - return { - "status": True, - "message": "Workflow deleted successfully", - "data": workflow, - } + user_message_history = ( + dbmanager.get( + Message, + filters={"user_id": message.user_id, "session_id": message.session_id}, + return_json=True, + ).data + if session_id is not None + else [] + ) + # save incoming message + dbmanager.upsert(message) + user_dir = os.path.join(folders["files_static_root"], "user", md5_hash(message.user_id)) + os.makedirs(user_dir, exist_ok=True) + workflow = workflow_from_id(workflow_id, dbmanager=dbmanager) + agent_response: Message = managers["chat"].chat( + message=message, + history=user_message_history, + user_dir=user_dir, + workflow=workflow, + connection_id=message.connection_id, + ) + response: Response = dbmanager.upsert(agent_response) + return response.model_dump(mode="json") except Exception as ex_error: - print(ex_error) + print(traceback.format_exc()) return { "status": False, - "message": "Error occurred while deleting workflow: " + str(ex_error), + "message": "Error occurred while processing message: " + str(ex_error), } @@ -558,11 +411,16 @@ async def get_version(): } +# websockets + + async def process_socket_message(data: dict, websocket: WebSocket, client_id: str): print(f"Client says: {data['type']}") if data["type"] == "user_message": - user_request_body = DBWebRequestModel(**data["data"]) - response = await add_message(user_request_body) + user_message = Message(**data["data"]) + session_id = data["data"].get("session_id", None) + workflow_id = data["data"].get("workflow_id", None) + response = await run_session_workflow(message=user_message, session_id=session_id, workflow_id=workflow_id) response_socket_message = { "type": "agent_response", "data": response, diff --git a/samples/apps/autogen-studio/autogenstudio/workflowmanager.py b/samples/apps/autogen-studio/autogenstudio/workflowmanager.py index c5475e58d83..8b41caab428 100644 --- a/samples/apps/autogen-studio/autogenstudio/workflowmanager.py +++ b/samples/apps/autogen-studio/autogenstudio/workflowmanager.py @@ -1,23 +1,26 @@ import os from datetime import datetime -from typing import Dict, List, Optional, Union - -from requests import Session +from typing import Any, Dict, List, Optional, Union import autogen -from .datamodel import AgentConfig, AgentFlowSpec, AgentWorkFlowConfig, Message, SocketMessage -from .utils import clear_folder, get_skills_from_prompt, sanitize_model +from .datamodel import ( + Agent, + AgentType, + Message, + SocketMessage, +) +from .utils import clear_folder, get_skills_from_prompt, load_code_execution_config, sanitize_model -class AutoGenWorkFlowManager: +class WorkflowManager: """ AutoGenWorkFlowManager class to load agents from a provided configuration and run a chat between them """ def __init__( self, - config: AgentWorkFlowConfig, + workflow: Dict, history: Optional[List[Message]] = None, work_dir: str = None, clear_work_dir: bool = True, @@ -33,20 +36,57 @@ def __init__( history: An optional list of previous messages to populate the agents' history. """ + # TODO - improved typing for workflow self.send_message_function = send_message_function self.connection_id = connection_id self.work_dir = work_dir or "work_dir" if clear_work_dir: clear_folder(self.work_dir) - self.config = config - # given the config, return an AutoGen agent object - self.sender = self.load(config.sender) - # given the config, return an AutoGen agent object - self.receiver = self.load(config.receiver) + self.workflow = workflow + self.sender = self.load(workflow.get("sender")) + self.receiver = self.load(workflow.get("receiver")) self.agent_history = [] if history: - self.populate_history(history) + self._populate_history(history) + + def _serialize_agent( + self, + agent: Agent, + mode: str = "python", + include: Optional[List[str]] = {"config"}, + exclude: Optional[List[str]] = None, + ) -> Dict: + """ """ + # exclude = ["id","created_at", "updated_at","user_id","type"] + exclude = exclude or {} + include = include or {} + if agent.type != AgentType.groupchat: + exclude.update( + { + "config": { + "admin_name", + "messages", + "max_round", + "admin_name", + "speaker_selection_method", + "allow_repeat_speaker", + } + } + ) + else: + include = { + "config": { + "admin_name", + "messages", + "max_round", + "admin_name", + "speaker_selection_method", + "allow_repeat_speaker", + } + } + result = agent.model_dump(warnings=False, exclude=exclude, include=include, mode=mode) + return result["config"] def process_message( self, @@ -84,25 +124,14 @@ def process_message( if request_reply is not False or sender_type == "groupchat": self.agent_history.append(message_payload) # add to history if self.send_message_function: # send over the message queue - socket_msg = SocketMessage(type="agent_message", data=message_payload, connection_id=self.connection_id) + socket_msg = SocketMessage( + type="agent_message", + data=message_payload, + connection_id=self.connection_id, + ) self.send_message_function(socket_msg.dict()) - def _sanitize_history_message(self, message: str) -> str: - """ - Sanitizes the message e.g. remove references to execution completed - - Args: - message: The message to be sanitized. - - Returns: - The sanitized message. - """ - to_replace = ["execution succeeded", "exitcode"] - for replace in to_replace: - message = message.replace(replace, "") - return message - - def populate_history(self, history: List[Message]) -> None: + def _populate_history(self, history: List[Message]) -> None: """ Populates the agent message history from the provided list of messages. @@ -127,19 +156,12 @@ def populate_history(self, history: List[Message]) -> None: silent=True, ) - def sanitize_agent_spec(self, agent_spec: AgentFlowSpec) -> AgentFlowSpec: - """ - Sanitizes the agent spec by setting loading defaults - - Args: - config: The agent configuration to be sanitized. - agent_type: The type of the agent. + def sanitize_agent(self, agent: Dict) -> Agent: + """ """ - Returns: - The sanitized agent configuration. - """ - - agent_spec.config.is_termination_msg = agent_spec.config.is_termination_msg or ( + skills = agent.get("skills", []) + agent = Agent.model_validate(agent) + agent.config.is_termination_msg = agent.config.is_termination_msg or ( lambda x: "TERMINATE" in x.get("content", "").rstrip()[-20:] ) @@ -149,40 +171,33 @@ def get_default_system_message(agent_type: str) -> str: else: return "You are a helpful AI Assistant." - # sanitize llm_config if present - if agent_spec.config.llm_config is not False: + if agent.config.llm_config is not False: config_list = [] - for llm in agent_spec.config.llm_config.config_list: + for llm in agent.config.llm_config.config_list: # check if api_key is present either in llm or env variable if "api_key" not in llm and "OPENAI_API_KEY" not in os.environ: - error_message = f"api_key is not present in llm_config or OPENAI_API_KEY env variable for agent ** {agent_spec.config.name}**. Update your workflow to provide an api_key to use the LLM." + error_message = f"api_key is not present in llm_config or OPENAI_API_KEY env variable for agent ** {agent.config.name}**. Update your workflow to provide an api_key to use the LLM." raise ValueError(error_message) # only add key if value is not None sanitized_llm = sanitize_model(llm) config_list.append(sanitized_llm) - agent_spec.config.llm_config.config_list = config_list - if agent_spec.config.code_execution_config is not False: - code_execution_config = agent_spec.config.code_execution_config or {} - code_execution_config["work_dir"] = self.work_dir - # tbd check if docker is installed - code_execution_config["use_docker"] = False - agent_spec.config.code_execution_config = code_execution_config - - if agent_spec.skills: - # get skill prompt, also write skills to a file named skills.py - skills_prompt = "" - skills_prompt = get_skills_from_prompt(agent_spec.skills, self.work_dir) - if agent_spec.config.system_message: - agent_spec.config.system_message = agent_spec.config.system_message + "\n\n" + skills_prompt - else: - agent_spec.config.system_message = ( - get_default_system_message(agent_spec.type) + "\n\n" + skills_prompt - ) - - return agent_spec - - def load(self, agent_spec: AgentFlowSpec) -> autogen.Agent: + agent.config.llm_config.config_list = config_list + + agent.config.code_execution_config = load_code_execution_config( + agent.config.code_execution_config, work_dir=self.work_dir + ) + + if skills: + skills_prompt = "" + skills_prompt = get_skills_from_prompt(skills, self.work_dir) + if agent.config.system_message: + agent.config.system_message = agent.config.system_message + "\n\n" + skills_prompt + else: + agent.config.system_message = get_default_system_message(agent.type) + "\n\n" + skills_prompt + return agent + + def load(self, agent: Any) -> autogen.Agent: """ Loads an agent based on the provided agent specification. @@ -192,43 +207,40 @@ def load(self, agent_spec: AgentFlowSpec) -> autogen.Agent: Returns: An instance of the loaded agent. """ - agent_spec = self.sanitize_agent_spec(agent_spec) - if agent_spec.type == "groupchat": - agents = [ - self.load(self.sanitize_agent_spec(agent_config)) for agent_config in agent_spec.groupchat_config.agents - ] - group_chat_config = agent_spec.groupchat_config.dict() - group_chat_config["agents"] = agents + if not agent: + raise ValueError( + "An agent configuration in this workflow is empty. Please provide a valid agent configuration." + ) + + linked_agents = agent.get("agents", []) + agent = self.sanitize_agent(agent) + if agent.type == "groupchat": + groupchat_agents = [self.load(agent) for agent in linked_agents] + group_chat_config = self._serialize_agent(agent) + group_chat_config["agents"] = groupchat_agents groupchat = autogen.GroupChat(**group_chat_config) agent = ExtendedGroupChatManager( - groupchat=groupchat, **agent_spec.config.dict(), message_processor=self.process_message + groupchat=groupchat, + message_processor=self.process_message, + llm_config=agent.config.llm_config.model_dump(), ) return agent else: - agent = self.load_agent_config(agent_spec.config, agent_spec.type) + if agent.type == "assistant": + agent = ExtendedConversableAgent( + **self._serialize_agent(agent), + message_processor=self.process_message, + ) + elif agent.type == "userproxy": + agent = ExtendedConversableAgent( + **self._serialize_agent(agent), + message_processor=self.process_message, + ) + else: + raise ValueError(f"Unknown agent type: {agent.type}") return agent - def load_agent_config(self, agent_config: AgentConfig, agent_type: str) -> autogen.Agent: - """ - Loads an agent based on the provided agent configuration. - - Args: - agent_config: The configuration of the agent to be loaded. - agent_type: The type of the agent to be loaded. - - Returns: - An instance of the loaded agent. - """ - if agent_type == "assistant": - agent = ExtendedConversableAgent(**agent_config.dict(), message_processor=self.process_message) - elif agent_type == "userproxy": - agent = ExtendedConversableAgent(**agent_config.dict(), message_processor=self.process_message) - else: - raise ValueError(f"Unknown agent type: {agent_type}") - - return agent - def run(self, message: str, clear_history: bool = False) -> None: """ Initiates a chat between the sender and receiver agents with an initial message @@ -262,6 +274,9 @@ def receive( super().receive(message, sender, request_reply, silent) +"" + + class ExtendedGroupChatManager(autogen.GroupChatManager): def __init__(self, message_processor=None, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/samples/apps/autogen-studio/frontend/src/components/atoms.tsx b/samples/apps/autogen-studio/frontend/src/components/atoms.tsx index 8bc70f89a90..c4c1368a123 100644 --- a/samples/apps/autogen-studio/frontend/src/components/atoms.tsx +++ b/samples/apps/autogen-studio/frontend/src/components/atoms.tsx @@ -4,53 +4,18 @@ import { Cog8ToothIcon, XMarkIcon, ClipboardIcon, - PlusIcon, - UserGroupIcon, - UsersIcon, - ExclamationTriangleIcon, InformationCircleIcon, } from "@heroicons/react/24/outline"; import React, { ReactNode, useEffect, useRef, useState } from "react"; import Icon from "./icons"; -import { - Button, - Divider, - Dropdown, - Input, - MenuProps, - Modal, - Select, - Slider, - Table, - Space, - Tooltip, - message, - theme, -} from "antd"; +import { Modal, Table, Tooltip, theme } from "antd"; import Editor from "@monaco-editor/react"; import Papa from "papaparse"; import remarkGfm from "remark-gfm"; import ReactMarkdown from "react-markdown"; import { atomDark } from "react-syntax-highlighter/dist/esm/styles/prism"; import { Prism as SyntaxHighlighter } from "react-syntax-highlighter"; -import { - checkAndSanitizeInput, - fetchJSON, - getServerUrl, - obscureString, - truncateText, -} from "./utils"; -import { - IAgentFlowSpec, - IFlowConfig, - IGroupChatFlowSpec, - ILLMConfig, - IModelConfig, - ISkill, - IStatus, -} from "./types"; -import TextArea from "antd/es/input/TextArea"; -import { appContext } from "../hooks/provider"; +import { truncateText } from "./utils"; const { useToken } = theme; interface CodeProps { @@ -162,12 +127,13 @@ export const Card = ({ border = hoverable ? border : "border-secondary"; return ( -
-
+
{title && (
{title} @@ -176,7 +142,7 @@ export const Card = ({
{subtitle}
{children}
-
+ ); }; @@ -303,7 +269,7 @@ export const MessageBox = ({ title, children, className }: IProps) => { export const GroupView = ({ children, title, - className = " bg-primary ", + className = "text-primary bg-primary ", }: any) => { return (
@@ -590,19 +556,21 @@ export const ControlRowView = ({ value, control, className, + truncateLength = 20, }: { title: string; description: string; - value: string | number; + value: string | number | boolean; control: any; className?: string; + truncateLength?: number; }) => { return (
{title} - {truncateText(value + "", 20)} + {truncateText(value + "", truncateLength)} {" "} @@ -614,291 +582,6 @@ export const ControlRowView = ({ ); }; -export const ModelSelector = ({ - configs, - setConfigs, - className, -}: { - configs: IModelConfig[]; - setConfigs: (configs: IModelConfig[]) => void; - className?: string; -}) => { - // const [configs, setConfigs] = useState(modelConfigs); - const [isModalVisible, setIsModalVisible] = useState(false); - const [newModelConfig, setNewModelConfig] = useState( - null - ); - const [editIndex, setEditIndex] = useState(null); - const [loading, setLoading] = useState(false); - const [error, setError] = useState(null); - - const [models, setModels] = useState([]); - const serverUrl = getServerUrl(); - - const { user } = React.useContext(appContext); - const listModelsUrl = `${serverUrl}/models?user_id=${user?.email}`; - - // const sanitizeModelConfig = (config: IModelConfig) => { - // const sanitizedConfig: IModelConfig = { model: config.model }; - // if (config.api_key) sanitizedConfig.api_key = config.api_key; - // if (config.base_url) sanitizedConfig.base_url = config.base_url; - // if (config.api_type) sanitizedConfig.api_type = config.api_type; - // if (config.api_version) sanitizedConfig.api_version = config.api_version; - // return sanitizedConfig; - // }; - - const handleRemoveConfig = (index: number) => { - const updatedConfigs = configs.filter((_, i) => i !== index); - - setConfigs(updatedConfigs); - }; - - const showModal = (config: IModelConfig | null, index: number | null) => { - setNewModelConfig(config); - setEditIndex(index); - setIsModalVisible(true); - }; - - const fetchModels = () => { - setError(null); - setLoading(true); - // const fetch; - const payLoad = { - method: "GET", - headers: { - "Content-Type": "application/json", - }, - }; - - const onSuccess = (data: any) => { - if (data && data.status) { - // message.success(data.message); - setModels(data.data); - } else { - message.error(data.message); - } - setLoading(false); - }; - const onError = (err: any) => { - setError(err); - message.error(err.message); - setLoading(false); - }; - fetchJSON(listModelsUrl, payLoad, onSuccess, onError); - }; - - useEffect(() => { - fetchModels(); - }, []); - - const modelItems: MenuProps["items"] = - models.length > 0 - ? models.map((model: IModelConfig, index: number) => ({ - key: index, - label: ( - <> -
{model.model}
-
- {truncateText(model.description || "", 20)} -
- - ), - value: index, - })) - : [ - { - key: -1, - label: <>No models found, - value: 0, - }, - ]; - - const modelOnClick: MenuProps["onClick"] = ({ key }) => { - const selectedIndex = parseInt(key.toString()); - let selectedModel = models[selectedIndex]; - const updatedConfigs = [...configs, selectedModel]; - setConfigs(updatedConfigs); - }; - - const menuStyle: React.CSSProperties = { - boxShadow: "none", - }; - - const { token } = useToken(); - const contentStyle: React.CSSProperties = { - backgroundColor: token.colorBgElevated, - borderRadius: token.borderRadiusLG, - boxShadow: token.boxShadowSecondary, - }; - - const addModelsMessage = ( - - {" "} - Please - create models in the Model tab - - ); - - const AddModelsDropDown = () => { - return ( - ( -
- {React.cloneElement(menu as React.ReactElement, { - style: menuStyle, - })} - {models.length === 0 && ( - <> - - -
{addModelsMessage}
- - )} -
- )} - > -
- add -
-
- ); - }; - - const handleOk = () => { - if (newModelConfig?.model.trim()) { - const sanitizedConfig = newModelConfig; - - if (editIndex !== null) { - // Edit existing model - const updatedConfigs = [...configs]; - updatedConfigs[editIndex] = sanitizedConfig; - setConfigs(updatedConfigs); - } else { - // Add new model - setConfigs([...configs, sanitizedConfig]); - } - setIsModalVisible(false); - setNewModelConfig(null); - setEditIndex(null); - } else { - // Handle case where 'model' field is empty - // Could provide user feedback here (e.g., input validation error) - message.error("Model name cannot be empty"); - } - }; - - const handleCancel = () => { - setIsModalVisible(false); - setNewModelConfig(null); - setEditIndex(null); - }; - - const updateNewModelConfig = (field: keyof IModelConfig, value: string) => { - setNewModelConfig((prevState) => - prevState ? { ...prevState, [field]: value } : null - ); - }; - - const modelButtons = configs.map((config, i) => { - const tooltipText = ( - <> -
{config.model}
- {config.base_url &&
{config.base_url}
} - {config.api_key &&
{obscureString(config.api_key, 3)}
} -
- {truncateText(config.description || "", 90)} -
- - ); - return ( -
showModal(config, i)} - > -
- {" "} - -
{config.model}
{" "} -
-
{ - e.stopPropagation(); // Prevent opening the modal to edit - handleRemoveConfig(i); - }} - className="ml-1 text-primary hover:text-accent duration-300" - > - -
-
-
- ); - }); - - return ( -
-
- {modelButtons} - -
- - Cancel - , - , - ]} - > -
Enter parameters for your model.
- updateNewModelConfig("model", e.target.value)} - /> - updateNewModelConfig("api_key", e.target.value)} - /> - updateNewModelConfig("base_url", e.target.value)} - /> - updateNewModelConfig("api_type", e.target.value)} - /> - - updateNewModelConfig("api_version", e.target.value)} - /> -
-
- ); -}; - export const BounceLoader = ({ className, title = "", @@ -937,7 +620,7 @@ export const ImageLoader = ({ Dynamic content setIsLoading(false)} @@ -1077,946 +760,6 @@ export const PdfViewer = ({ url }: { url: string }) => { ); }; -export const AgentFlowSpecView = ({ - title = "Agent Specification", - flowSpec, - setFlowSpec, -}: { - title: string; - flowSpec: IAgentFlowSpec; - setFlowSpec: (newFlowSpec: IAgentFlowSpec) => void; - editMode?: boolean; -}) => { - // Local state for the FlowView component - const [localFlowSpec, setLocalFlowSpec] = - React.useState(flowSpec); - - // Required to monitor localAgent updates that occur in GroupChatFlowSpecView and reflect updates. - useEffect(() => { - setLocalFlowSpec(flowSpec); - }, [flowSpec]); - - // Event handlers for updating local state and propagating changes - - const onControlChange = (value: any, key: string) => { - if (key === "llm_config") { - if (value.config_list.length === 0) { - value = false; - } - } - const updatedFlowSpec = { - ...localFlowSpec, - config: { ...localFlowSpec.config, [key]: value }, - }; - - setLocalFlowSpec(updatedFlowSpec); - setFlowSpec(updatedFlowSpec); - }; - - const llm_config: ILLMConfig = localFlowSpec?.config?.llm_config || { - config_list: [], - temperature: 0.1, - }; - - const nameValidation = checkAndSanitizeInput(flowSpec?.config?.name); - - return ( - <> -
{title}
- {flowSpec?.config?.name}
- className="mb-4 bg-primary " - > - - { - onControlChange(e.target.value, "name"); - }} - /> - {!nameValidation.status && ( -
- {nameValidation.message} -
- )} - - } - /> - - { - onControlChange(e.target.value, "description"); - }} - /> - } - /> - - { - onControlChange(value, "max_consecutive_auto_reply"); - }} - /> - } - /> - - { - onControlChange(e.target.value, "default_auto_reply"); - }} - /> - } - /> - - { - onControlChange(value, "human_input_mode"); - }} - options={ - [ - { label: "NEVER", value: "NEVER" }, - // { label: "TERMINATE", value: "TERMINATE" }, - // { label: "ALWAYS", value: "ALWAYS" }, - ] as any - } - /> - } - /> - - {llm_config && llm_config.config_list.length > 0 && ( - { - onControlChange(e.target.value, "system_message"); - }} - /> - } - /> - )} - - {llm_config && ( - { - const llm_config = { - ...(flowSpec.config.llm_config || { temperature: 0.1 }), - config_list, - }; - onControlChange(llm_config, "llm_config"); - }} - /> - } - /> - )} - - {llm_config && llm_config.config_list.length > 0 && ( - { - const llm_config = { - ...flowSpec.config.llm_config, - temperature: value, - }; - onControlChange(llm_config, "llm_config"); - }} - /> - } - /> - )} - - { - { - const updatedFlowSpec = { - ...localFlowSpec, - skills, - }; - setLocalFlowSpec(updatedFlowSpec); - setFlowSpec(updatedFlowSpec); - }} - /> - } - /> - } - - - ); -}; - -interface SkillSelectorProps { - skills: ISkill[]; - setSkills: (skills: ISkill[]) => void; - className?: string; -} - -export const SkillSelector: React.FC = ({ - skills, - setSkills, - className, -}) => { - const [isModalVisible, setIsModalVisible] = useState(false); - const [showSkillModal, setShowSkillModal] = React.useState(false); - const [newSkill, setNewSkill] = useState(null); - - const [localSkills, setLocalSkills] = useState(skills); - const [selectedSkill, setSelectedSkill] = useState(null); - - const handleRemoveSkill = (index: number) => { - const updatedSkills = localSkills.filter((_, i) => i !== index); - setLocalSkills(updatedSkills); - setSkills(updatedSkills); - }; - - const handleAddSkill = () => { - if (newSkill) { - const updatedSkills = [...localSkills, newSkill]; - setLocalSkills(updatedSkills); - setSkills(updatedSkills); - setNewSkill(null); - } - }; - - useEffect(() => { - if (selectedSkill) { - setShowSkillModal(true); - } - }, [selectedSkill]); - - return ( - <> - { - setShowSkillModal(false); - setSelectedSkill(null); - }} - onCancel={() => { - setShowSkillModal(false); - setSelectedSkill(null); - }} - > - {selectedSkill && ( -
-
{selectedSkill.file_name}
- -
- )} -
- -
- {localSkills.map((skill, index) => ( -
- { - setSelectedSkill(skill); - }} - className=" inline-block " - > - {skill.title} - - handleRemoveSkill(index)} - className="ml-1 text-primary hover:text-accent duration-300 w-4 h-4 inline-block" - /> -
- ))} - -
{ - setIsModalVisible(true); - }} - > - add -
-
- - setIsModalVisible(false)} - footer={[ - , - , - ]} - > - - - - ); -}; - -export const SkillLoader = ({ - skill, - setSkill, -}: { - skill: ISkill | null; - setSkill: (skill: ISkill | null) => void; -}) => { - const [skills, setSkills] = useState([]); - const [loading, setLoading] = useState(false); - const [error, setError] = React.useState({ - status: true, - message: "All good", - }); - const serverUrl = getServerUrl(); - const { user } = React.useContext(appContext); - const listSkillsUrl = `${serverUrl}/skills?user_id=${user?.email}`; - - const fetchSkills = () => { - setError(null); - setLoading(true); - // const fetch; - const payLoad = { - method: "GET", - headers: { - "Content-Type": "application/json", - }, - }; - - const onSuccess = (data: any) => { - if (data && data.status) { - message.success(data.message); - setSkills(data.data); - if (data.data.length > 0) { - setSkill(data.data[0]); - } - } else { - message.error(data.message); - } - setLoading(false); - }; - const onError = (err: any) => { - setError(err); - message.error(err.message); - setLoading(false); - }; - fetchJSON(listSkillsUrl, payLoad, onSuccess, onError); - }; - - useEffect(() => { - fetchSkills(); - }, []); - - const skillOptions = skills.map((skill: ISkill, index: number) => ({ - label: skill.title, - value: index, - })); - return ( -
- - - {skills && ( - <> - ({ - label: spec.config.name, - value: index, - }))} - /> -
- )} - {/* {JSON.stringify(localAgent)} */} - - ); -}; - -export const AgentSelector = ({ - flowSpec, - setFlowSpec, -}: { - flowSpec: IAgentFlowSpec | null; - setFlowSpec: (agent: IAgentFlowSpec | null) => void; -}) => { - const [isModalVisible, setIsModalVisible] = useState(false); - - return ( -
-
setIsModalVisible(true)} - className="hover:bg-secondary h-full duration-300 border border-dashed rounded p-2" - > - {flowSpec && ( -
- {flowSpec.type === "groupchat" ? ( - - ) : ( - - )} - {flowSpec.config.name} -
- {" "} - {flowSpec.config.description || flowSpec.config.name} -
-
- {" "} - - {(flowSpec.skills && flowSpec.skills?.length) || 0} skills - - - | max replies: {flowSpec.config.max_consecutive_auto_reply} - -
-
- )} -
- { - <> - { - setFlowSpec(agent); - }} - /> - - } -
- ); -}; -export const FlowConfigViewer = ({ - flowConfig, - setFlowConfig, -}: { - flowConfig: IFlowConfig; - setFlowConfig: (newFlowConfig: IFlowConfig) => void; -}) => { - // Local state for sender and receiver FlowSpecs - const [senderFlowSpec, setSenderFlowSpec] = - React.useState(flowConfig.sender); - - const [localFlowConfig, setLocalFlowConfig] = - React.useState(flowConfig); - - const [receiverFlowSpec, setReceiverFlowSpec] = - React.useState(flowConfig.receiver); - - // Update the local state and propagate changes to the parent component - const updateSenderFlowSpec = (newFlowSpec: IAgentFlowSpec | null) => { - setSenderFlowSpec(newFlowSpec); - if (newFlowSpec) { - setFlowConfig({ ...flowConfig, sender: newFlowSpec }); - } - }; - - const updateReceiverFlowSpec = (newFlowSpec: IAgentFlowSpec | null) => { - setReceiverFlowSpec(newFlowSpec); - if (newFlowSpec) { - setFlowConfig({ ...flowConfig, receiver: newFlowSpec }); - } - }; - - const updateFlowConfig = (key: string, value: string) => { - // When an updatedFlowConfig is created using localFlowConfig, if the contents of FlowConfigViewer Modal are changed after the Agent Specification Modal is updated, the updated contents of the Agent Specification Modal are not saved. Fixed to localFlowConfig->flowConfig. Fixed a bug. - const updatedFlowConfig = { ...flowConfig, [key]: value }; - console.log("updatedFlowConfig: ", updatedFlowConfig); - setLocalFlowConfig(updatedFlowConfig); - setFlowConfig(updatedFlowConfig); - }; - - return ( - <> - {/*
{flowConfig.name}
*/} - updateFlowConfig("name", e.target.value)} - /> - } - /> - - updateFlowConfig("description", e.target.value)} - /> - } - /> - - updateFlowConfig("summary_method", value)} - options={ - [ - { label: "last", value: "last" }, - { label: "none", value: "none" }, - { label: "llm", value: "llm" }, - ] as any - } - /> - } - /> -
-
-
Sender
- -
-
-
Receiver
- -
-
- - ); -}; - export const MonacoEditor = ({ value, editorRef, diff --git a/samples/apps/autogen-studio/frontend/src/components/header.tsx b/samples/apps/autogen-studio/frontend/src/components/header.tsx index 8ec85326923..d0adf2e0a3a 100644 --- a/samples/apps/autogen-studio/frontend/src/components/header.tsx +++ b/samples/apps/autogen-studio/frontend/src/components/header.tsx @@ -25,7 +25,7 @@ const Header = ({ meta, link }: any) => { const links: any[] = [ { name: "Build", href: "/build" }, { name: "Playground", href: "/" }, - { name: "Gallery", href: "/gallery" }, + // { name: "Gallery", href: "/gallery" }, // { name: "Data Explorer", href: "/explorer" }, ]; diff --git a/samples/apps/autogen-studio/frontend/src/components/types.ts b/samples/apps/autogen-studio/frontend/src/components/types.ts index 522682a4884..eba39144602 100644 --- a/samples/apps/autogen-studio/frontend/src/components/types.ts +++ b/samples/apps/autogen-studio/frontend/src/components/types.ts @@ -2,14 +2,13 @@ export type NotificationType = "success" | "info" | "warning" | "error"; export interface IMessage { user_id: string; - root_msg_id: string; - msg_id?: string; role: string; content: string; - timestamp?: string; - personalize?: boolean; - ra?: string; - session_id?: string; + created_at?: string; + updated_at?: string; + session_id?: number; + connection_id?: string; + workflow_id?: number; } export interface IStatus { @@ -21,7 +20,7 @@ export interface IStatus { export interface IChatMessage { text: string; sender: "user" | "bot"; - metadata?: any; + meta?: any; msg_id: string; } @@ -30,6 +29,7 @@ export interface ILLMConfig { timeout?: number; cache_seed?: number | null; temperature: number; + max_tokens: number; } export interface IAgentConfig { @@ -40,47 +40,36 @@ export interface IAgentConfig { system_message: string | ""; is_termination_msg?: boolean | string; default_auto_reply?: string | null; - code_execution_config?: boolean | string | { [key: string]: any } | null; + code_execution_config?: "none" | "local" | "docker"; description?: string; -} -export interface IAgentFlowSpec { - type: "assistant" | "userproxy" | "groupchat"; - config: IAgentConfig; - timestamp?: string; - id?: string; - skills?: Array; - user_id?: string; + admin_name?: string; + messages?: Array; + max_round?: number; + speaker_selection_method?: string; + allow_repeat_speaker?: boolean; } -export interface IGroupChatConfig { - agents: Array; - admin_name: string; - messages: Array; - max_round: number; - speaker_selection_method: "auto" | "round_robin" | "random"; - allow_repeat_speaker: boolean | Array; -} - -export interface IGroupChatFlowSpec { - type: "groupchat"; +export interface IAgent { + type?: "assistant" | "userproxy" | "groupchat"; config: IAgentConfig; - groupchat_config: IGroupChatConfig; - id?: string; - timestamp?: string; + created_at?: string; + updated_at?: string; + id?: number; + skills?: Array; user_id?: string; - description?: string; } -export interface IFlowConfig { +export interface IWorkflow { name: string; description: string; - sender: IAgentFlowSpec; - receiver: IAgentFlowSpec | IGroupChatFlowSpec; + sender: IAgent; + receiver: IAgent; type: "twoagents" | "groupchat"; - timestamp?: string; + created_at?: string; + updated_at?: string; summary_method?: "none" | "last" | "llm"; - id?: string; + id?: number; user_id?: string; } @@ -89,11 +78,12 @@ export interface IModelConfig { api_key?: string; api_version?: string; base_url?: string; - api_type?: string; + api_type?: "open_ai" | "azure" | "google"; user_id?: string; - timestamp?: string; + created_at?: string; + updated_at?: string; description?: string; - id?: string; + id?: number; } export interface IMetadataFile { @@ -105,27 +95,29 @@ export interface IMetadataFile { } export interface IChatSession { - id: string; + id?: number; user_id: string; - timestamp: string; - flow_config: IFlowConfig; + workflow_id?: number; + created_at?: string; + updated_at?: string; name: string; } export interface IGalleryItem { - id: string; + id: number; messages: Array; session: IChatSession; tags: Array; - timestamp: string; + created_at: string; + updated_at: string; } export interface ISkill { - title: string; - file_name?: string; + name: string; content: string; - id?: string; - timestamp?: string; + id?: number; description?: string; user_id?: string; + created_at?: string; + updated_at?: string; } diff --git a/samples/apps/autogen-studio/frontend/src/components/utils.ts b/samples/apps/autogen-studio/frontend/src/components/utils.ts index 73b9f42207c..2264f5c66a2 100644 --- a/samples/apps/autogen-studio/frontend/src/components/utils.ts +++ b/samples/apps/autogen-studio/frontend/src/components/utils.ts @@ -1,12 +1,11 @@ import { + IAgent, IAgentConfig, - IAgentFlowSpec, - IFlowConfig, - IGroupChatFlowSpec, ILLMConfig, IModelConfig, ISkill, IStatus, + IWorkflow, } from "./types"; export const getServerUrl = () => { @@ -66,7 +65,8 @@ export function fetchJSON( url: string | URL, payload: any = {}, onSuccess: (data: any) => void, - onError: (error: IStatus) => void + onError: (error: IStatus) => void, + onFinal: () => void = () => {} ) { return fetch(url, payload) .then(function (response) { @@ -95,6 +95,9 @@ export function fetchJSON( status: false, message: `There was an error connecting to server. (${err}) `, }); + }) + .finally(() => { + onFinal(); }); } export const capitalize = (s: string) => { @@ -243,60 +246,138 @@ export const formatDuration = (seconds: number) => { return parts.length > 0 ? parts.join(" ") : "0 sec"; }; -export const sampleAgentConfig = (user_id: string = "guestuser@gmail.com") => { - const sampleAgent: IAgentFlowSpec = { +export const sampleModelConfig = (modelType: string = "open_ai") => { + const openaiConfig: IModelConfig = { + model: "gpt-4-1106-preview", + api_type: "open_ai", + description: "OpenAI GPT-4 model", + }; + const azureConfig: IModelConfig = { + model: "gpt-4", + api_type: "azure", + api_version: "v1", + base_url: "https://youazureendpoint.azure.com/", + description: "Azure model", + }; + + const googleConfig: IModelConfig = { + model: "gemini-1.0-pro", + api_type: "google", + description: "Google Gemini Model model", + }; + + switch (modelType) { + case "open_ai": + return openaiConfig; + case "azure": + return azureConfig; + case "google": + return googleConfig; + default: + return openaiConfig; + } +}; + +export const getRandomIntFromDateAndSalt = (salt: number = 43444) => { + const currentDate = new Date(); + const seed = currentDate.getTime() + salt; + const randomValue = Math.sin(seed) * 10000; + const randomInt = Math.floor(randomValue) % 100; + return randomInt; +}; + +export const sampleAgentConfig = (agent_type: string = "assistant") => { + const llm_config: ILLMConfig = { + config_list: [], + temperature: 0.1, + timeout: 600, + cache_seed: null, + max_tokens: 1000, + }; + + const userProxyConfig: IAgentConfig = { + name: "userproxy", + human_input_mode: "NEVER", + description: "User Proxy", + max_consecutive_auto_reply: 25, + system_message: "You are a helpful assistant.", + default_auto_reply: "TERMINATE", + llm_config: false, + code_execution_config: "local", + }; + const userProxyFlowSpec: IAgent = { + type: "userproxy", + config: userProxyConfig, + }; + + const assistantConfig: IAgentConfig = { + name: "primary_assistant", + description: "Primary Assistant", + llm_config: llm_config, + human_input_mode: "NEVER", + max_consecutive_auto_reply: 25, + code_execution_config: "none", + system_message: + "You are a helpful AI assistant. Solve tasks using your coding and language skills. In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. 1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. Reply 'TERMINATE' in the end when everything is done.", + }; + + const assistantFlowSpec: IAgent = { type: "assistant", - user_id: user_id, - config: { - name: "sample_assistant", - description: "Sample assistant", - llm_config: { - config_list: [ - { - model: "gpt-4-1106-preview", - }, - ], - temperature: 0.1, - timeout: 600, - cache_seed: null, - }, - human_input_mode: "NEVER", - code_execution_config: false, - max_consecutive_auto_reply: 8, - system_message: - "You are a helpful AI assistant. Solve tasks using your coding and language skills. In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. 1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. Reply 'TERMINATE' in the end when everything is done.", + config: assistantConfig, + }; + + const groupChatAssistantConfig = Object.assign( + { + admin_name: "groupchat_assistant", + messages: [], + max_round: 10, + speaker_selection_method: "auto", + allow_repeat_speaker: false, }, + assistantConfig + ); + groupChatAssistantConfig.name = "groupchat_assistant"; + groupChatAssistantConfig.system_message = + "You are a helpful assistant skilled at cordinating a group of other assistants to solve a task. "; + groupChatAssistantConfig.description = "Group Chat Assistant"; + + const groupChatFlowSpec: IAgent = { + type: "groupchat", + config: groupChatAssistantConfig, }; - return sampleAgent; + + if (agent_type === "userproxy") { + return userProxyFlowSpec; + } else if (agent_type === "assistant") { + return assistantFlowSpec; + } else if (agent_type === "groupchat") { + return groupChatFlowSpec; + } else { + return assistantFlowSpec; + } }; export const sampleWorkflowConfig = (type = "twoagents") => { - const llm_model_config: IModelConfig[] = [ - { - model: "gpt-4-1106-preview", - }, - ]; + const llm_model_config: IModelConfig[] = []; const llm_config: ILLMConfig = { config_list: llm_model_config, temperature: 0.1, timeout: 600, cache_seed: null, + max_tokens: 1000, }; const userProxyConfig: IAgentConfig = { name: "userproxy", human_input_mode: "NEVER", - max_consecutive_auto_reply: 5, + max_consecutive_auto_reply: 15, system_message: "You are a helpful assistant.", default_auto_reply: "TERMINATE", llm_config: false, - code_execution_config: { - work_dir: null, - use_docker: false, - }, + code_execution_config: "local", }; - const userProxyFlowSpec: IAgentFlowSpec = { + const userProxyFlowSpec: IAgent = { type: "userproxy", config: userProxyConfig, }; @@ -306,17 +387,17 @@ export const sampleWorkflowConfig = (type = "twoagents") => { llm_config: llm_config, human_input_mode: "NEVER", max_consecutive_auto_reply: 8, - code_execution_config: false, + code_execution_config: "none", system_message: "You are a helpful AI assistant. Solve tasks using your coding and language skills. In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. 1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. Reply 'TERMINATE' in the end when everything is done.", }; - const assistantFlowSpec: IAgentFlowSpec = { + const assistantFlowSpec: IAgent = { type: "assistant", config: assistantConfig, }; - const workFlowConfig: IFlowConfig = { + const workFlowConfig: IWorkflow = { name: "Default Agent Workflow", description: "Default Agent Workflow", sender: userProxyFlowSpec, @@ -324,26 +405,27 @@ export const sampleWorkflowConfig = (type = "twoagents") => { type: "twoagents", }; - const groupChatAssistantConfig = Object.assign({}, assistantConfig); - groupChatAssistantConfig.name = "groupchat_assistant"; - groupChatAssistantConfig.system_message = - "You are a helpful assistant skilled at cordinating a group of other assistants to solve a task. "; - - const groupChatFlowSpec: IGroupChatFlowSpec = { - type: "groupchat", - config: groupChatAssistantConfig, - groupchat_config: { - agents: [assistantFlowSpec, assistantFlowSpec], + const groupChatAssistantConfig = Object.assign( + { admin_name: "groupchat_assistant", messages: [], max_round: 10, speaker_selection_method: "auto", allow_repeat_speaker: false, + description: "Group Chat Assistant", }, - description: "Default Group Workflow", + assistantConfig + ); + groupChatAssistantConfig.name = "groupchat_assistant"; + groupChatAssistantConfig.system_message = + "You are a helpful assistant skilled at cordinating a group of other assistants to solve a task. "; + + const groupChatFlowSpec: IAgent = { + type: "groupchat", + config: groupChatAssistantConfig, }; - const groupChatWorkFlowConfig: IFlowConfig = { + const groupChatWorkFlowConfig: IWorkflow = { name: "Default Group Workflow", description: "Default Group Workflow", sender: userProxyFlowSpec, @@ -359,79 +441,72 @@ export const sampleWorkflowConfig = (type = "twoagents") => { return workFlowConfig; }; -export const getModels = () => { - const models = [ - { - model: "gpt-4-1106-preview", - }, - { - model: "gpt-3.5-turbo-16k", - }, - { - model: "TheBloke/zephyr-7B-alpha-AWQ", - base_url: "http://localhost:8000/v1", - }, - ]; - return models; -}; - export const getSampleSkill = () => { const content = ` - ## This is a sample skill. Replace with your own skill function - ## In general, a good skill must have 3 sections: - ## 1. Imports (import libraries needed for your skill) - ## 2. Function definition AND docstrings (this helps the LLM understand what the function does and how to use it) - ## 3. Function body (the actual code that implements the function) - - import numpy as np - import matplotlib.pyplot as plt - from matplotlib import font_manager as fm - - def save_cat_ascii_art_to_png(filename='ascii_cat.png'): - """ - Creates ASCII art of a cat and saves it to a PNG file. - - :param filename: str, the name of the PNG file to save the ASCII art. - """ - # ASCII art string - cat_art = [ - " /\_/\ ", - " ( o.o ) ", - " > ^ < " - ] - - # Determine shape of output array - height = len(cat_art) - width = max(len(line) for line in cat_art) - - # Create a figure and axis to display ASCII art - fig, ax = plt.subplots(figsize=(width, height)) - ax.axis('off') # Hide axes - - # Get a monospace font - prop = fm.FontProperties(family='monospace') - - # Display ASCII art using text - for y, line in enumerate(cat_art): - ax.text(0, height-y-1, line, fontproperties=prop, fontsize=12) - - # Adjust layout - plt.tight_layout() - - # Save figure to file - plt.savefig(filename, dpi=120, bbox_inches='tight', pad_inches=0.1) - plt.close(fig)`; +from typing import List +import uuid +import requests # to perform HTTP requests +from pathlib import Path + +from openai import OpenAI + + +def generate_and_save_images(query: str, image_size: str = "1024x1024") -> List[str]: + """ + Function to paint, draw or illustrate images based on the users query or request. Generates images from a given query using OpenAI's DALL-E model and saves them to disk. Use the code below anytime there is a request to create an image. + + :param query: A natural language description of the image to be generated. + :param image_size: The size of the image to be generated. (default is "1024x1024") + :return: A list of filenames for the saved images. + """ + + client = OpenAI() # Initialize the OpenAI client + response = client.images.generate(model="dall-e-3", prompt=query, n=1, size=image_size) # Generate images + + # List to store the file names of saved images + saved_files = [] + + # Check if the response is successful + if response.data: + for image_data in response.data: + # Generate a random UUID as the file name + file_name = str(uuid.uuid4()) + ".png" # Assuming the image is a PNG + file_path = Path(file_name) + + img_url = image_data.url + img_response = requests.get(img_url) + if img_response.status_code == 200: + # Write the binary content to a file + with open(file_path, "wb") as img_file: + img_file.write(img_response.content) + print(f"Image saved to {file_path}") + saved_files.append(str(file_path)) + else: + print(f"Failed to download the image from {img_url}") + else: + print("No image data found in the response!") + + # Return the list of saved files + return saved_files + + +# Example usage of the function: +# generate_and_save_images("A cute baby sea otter") + `; const skill: ISkill = { - title: "save_cat_ascii_art_to_png", - description: "save cat ascii art to png", + name: "generate_images", + description: "Generate and save images based on a user's query.", content: content, }; return skill; }; -export const timeAgo = (dateString: string): string => { +export const timeAgo = ( + dateString: string, + returnFormatted: boolean = false +): string => { // if dateStr is empty, return empty string if (!dateString) { return ""; @@ -454,10 +529,20 @@ export const timeAgo = (dateString: string): string => { const minutesAgo = Math.floor(timeDifference / (1000 * 60)); const hoursAgo = Math.floor(minutesAgo / 60); - // Format the date into a readable format e.g. "November 27" - const options: Intl.DateTimeFormatOptions = { month: "long", day: "numeric" }; + // Format the date into a readable format e.g. "November 27, 2021, 3:45 PM" + const options: Intl.DateTimeFormatOptions = { + month: "long", + day: "numeric", + year: "numeric", + hour: "numeric", + minute: "numeric", + }; const formattedDate = timestamp.toLocaleDateString(undefined, options); + if (returnFormatted) { + return formattedDate; + } + // Determine the time difference string let timeAgoStr: string; if (minutesAgo < 1) { @@ -527,7 +612,7 @@ export const fetchVersion = () => { */ export const sanitizeConfig = ( data: any, - keys: string[] = ["api_key", "id"] + keys: string[] = ["api_key", "id", "created_at", "updated_at"] ): any => { if (Array.isArray(data)) { return data.map((item) => sanitizeConfig(item, keys)); diff --git a/samples/apps/autogen-studio/frontend/src/components/views/builder/agents.tsx b/samples/apps/autogen-studio/frontend/src/components/views/builder/agents.tsx index be8a30f7247..8800ebfbdd3 100644 --- a/samples/apps/autogen-studio/frontend/src/components/views/builder/agents.tsx +++ b/samples/apps/autogen-studio/frontend/src/components/views/builder/agents.tsx @@ -8,24 +8,17 @@ import { } from "@heroicons/react/24/outline"; import { Dropdown, MenuProps, Modal, message } from "antd"; import * as React from "react"; -import { IAgentFlowSpec, IStatus } from "../../types"; +import { IAgent, IStatus } from "../../types"; import { appContext } from "../../../hooks/provider"; import { fetchJSON, getServerUrl, - sampleAgentConfig, sanitizeConfig, timeAgo, truncateText, } from "../../utils"; -import { - AgentFlowSpecView, - BounceLoader, - Card, - CardHoverBar, - LaunchButton, - LoadingOverlay, -} from "../../atoms"; +import { BounceLoader, Card, CardHoverBar, LoadingOverlay } from "../../atoms"; +import { AgentViewer } from "./utils/agentconfig"; const AgentsView = ({}: any) => { const [loading, setLoading] = React.useState(false); @@ -37,25 +30,30 @@ const AgentsView = ({}: any) => { const { user } = React.useContext(appContext); const serverUrl = getServerUrl(); const listAgentsUrl = `${serverUrl}/agents?user_id=${user?.email}`; - const saveAgentsUrl = `${serverUrl}/agents`; - const deleteAgentUrl = `${serverUrl}/agents/delete`; - const [agents, setAgents] = React.useState([]); - const [selectedAgent, setSelectedAgent] = - React.useState(null); + const [agents, setAgents] = React.useState([]); + const [selectedAgent, setSelectedAgent] = React.useState(null); const [showNewAgentModal, setShowNewAgentModal] = React.useState(false); const [showAgentModal, setShowAgentModal] = React.useState(false); - const sampleAgent = sampleAgentConfig(user?.email || ""); - const [newAgent, setNewAgent] = React.useState( - sampleAgent - ); + const sampleAgent = { + config: { + name: "sample_agent", + description: "Sample agent description", + human_input_mode: "NEVER", + max_consecutive_auto_reply: 3, + system_message: "", + }, + }; + const [newAgent, setNewAgent] = React.useState(sampleAgent); - const deleteAgent = (agent: IAgentFlowSpec) => { + const deleteAgent = (agent: IAgent) => { setError(null); setLoading(true); + + const deleteAgentUrl = `${serverUrl}/agents/delete?user_id=${user?.email}&agent_id=${agent.id}`; // const fetch; const payLoad = { method: "DELETE", @@ -71,8 +69,7 @@ const AgentsView = ({}: any) => { const onSuccess = (data: any) => { if (data && data.status) { message.success(data.message); - console.log("agents", data.data); - setAgents(data.data); + fetchAgents(); } else { message.error(data.message); } @@ -98,8 +95,6 @@ const AgentsView = ({}: any) => { const onSuccess = (data: any) => { if (data && data.status) { - // message.success(data.message); - setAgents(data.data); } else { message.error(data.message); @@ -114,42 +109,6 @@ const AgentsView = ({}: any) => { fetchJSON(listAgentsUrl, payLoad, onSuccess, onError); }; - const saveAgent = (agent: IAgentFlowSpec) => { - setError(null); - setLoading(true); - // const fetch; - - const payLoad = { - method: "POST", - headers: { - Accept: "application/json", - "Content-Type": "application/json", - }, - body: JSON.stringify({ - user_id: user?.email, - agent: agent, - }), - }; - - const onSuccess = (data: any) => { - if (data && data.status) { - message.success(data.message); - // console.log("agents", data.data); - setAgents(data.data); - } else { - message.error(data.message); - } - setLoading(false); - setNewAgent(sampleAgent); - }; - const onError = (err: any) => { - setError(err); - message.error(err.message); - setLoading(false); - }; - fetchJSON(saveAgentsUrl, payLoad, onSuccess, onError); - }; - React.useEffect(() => { if (user) { // console.log("fetching messages", messages); @@ -157,7 +116,7 @@ const AgentsView = ({}: any) => { } }, []); - const agentRows = (agents || []).map((agent: IAgentFlowSpec, i: number) => { + const agentRows = (agents || []).map((agent: IAgent, i: number) => { const cardItems = [ { title: "Download", @@ -185,11 +144,10 @@ const AgentsView = ({}: any) => { let newAgent = { ...agent }; newAgent.config.name = `${agent.config.name}_copy`; newAgent.user_id = user?.email; - newAgent.timestamp = new Date().toISOString(); + newAgent.updated_at = new Date().toISOString(); if (newAgent.id) { delete newAgent.id; } - setNewAgent(newAgent); setShowNewAgentModal(true); }, @@ -206,27 +164,41 @@ const AgentsView = ({}: any) => { }, ]; return ( -
-
- {truncateText(agent.config.name, 25)}
- } - onClick={() => { - setSelectedAgent(agent); - setShowAgentModal(true); - }} - > -
- {" "} - {truncateText(agent.config.description || "", 70)} +
  • + + {truncateText(agent.config.name || "", 25)}
  • -
    {timeAgo(agent.timestamp || "")}
    - - -
    -
    + } + onClick={() => { + setSelectedAgent(agent); + setShowAgentModal(true); + }} + > + +
    + {timeAgo(agent.updated_at || "")} +
    + + + ); }); @@ -237,45 +209,39 @@ const AgentsView = ({}: any) => { setShowAgentModal, handler, }: { - agent: IAgentFlowSpec | null; - setAgent: (agent: IAgentFlowSpec | null) => void; + agent: IAgent | null; + setAgent: (agent: IAgent | null) => void; showAgentModal: boolean; setShowAgentModal: (show: boolean) => void; - handler?: (agent: IAgentFlowSpec | null) => void; + handler?: (agent: IAgent | null) => void; }) => { - const [localAgent, setLocalAgent] = React.useState( - agent - ); + const [localAgent, setLocalAgent] = React.useState(agent); + + const closeModal = () => { + setShowAgentModal(false); + if (handler) { + handler(localAgent); + } + }; return ( - Agent Specification{" "} - - {agent?.config?.name || ""} - {" "} - - } + title={<>Agent Configuration} width={800} open={showAgentModal} onOk={() => { - setAgent(null); - setShowAgentModal(false); - if (handler) { - handler(localAgent); - } + closeModal(); }} onCancel={() => { - setAgent(null); - setShowAgentModal(false); + closeModal(); }} + footer={[]} > {agent && ( - )} {/* {JSON.stringify(localAgent)} */} @@ -344,10 +310,8 @@ const AgentsView = ({}: any) => { setAgent={setSelectedAgent} setShowAgentModal={setShowAgentModal} showAgentModal={showAgentModal} - handler={(agent: IAgentFlowSpec | null) => { - if (agent) { - saveAgent(agent); - } + handler={(agent: IAgent | null) => { + fetchAgents(); }} /> @@ -356,10 +320,8 @@ const AgentsView = ({}: any) => { setAgent={setNewAgent} setShowAgentModal={setShowNewAgentModal} showAgentModal={showNewAgentModal} - handler={(agent: IAgentFlowSpec | null) => { - if (agent) { - saveAgent(agent); - } + handler={(agent: IAgent | null) => { + fetchAgents(); }} /> @@ -397,7 +359,7 @@ const AgentsView = ({}: any) => { {agents && agents.length > 0 && (
    -
    {agentRows}
    +
      {agentRows}
    )} diff --git a/samples/apps/autogen-studio/frontend/src/components/views/builder/models.tsx b/samples/apps/autogen-studio/frontend/src/components/views/builder/models.tsx index be2c11099e3..2a3b0506d79 100644 --- a/samples/apps/autogen-studio/frontend/src/components/views/builder/models.tsx +++ b/samples/apps/autogen-studio/frontend/src/components/views/builder/models.tsx @@ -2,7 +2,6 @@ import { ArrowDownTrayIcon, ArrowUpTrayIcon, DocumentDuplicateIcon, - ExclamationTriangleIcon, InformationCircleIcon, PlusIcon, TrashIcon, @@ -18,8 +17,15 @@ import { timeAgo, truncateText, } from "../../utils"; -import { BounceLoader, Card, CardHoverBar, LoadingOverlay } from "../../atoms"; +import { + BounceLoader, + Card, + CardHoverBar, + ControlRowView, + LoadingOverlay, +} from "../../atoms"; import TextArea from "antd/es/input/TextArea"; +import { ModelConfigView } from "./utils/modelconfig"; const ModelsView = ({}: any) => { const [loading, setLoading] = React.useState(false); @@ -31,8 +37,7 @@ const ModelsView = ({}: any) => { const { user } = React.useContext(appContext); const serverUrl = getServerUrl(); const listModelsUrl = `${serverUrl}/models?user_id=${user?.email}`; - const saveModelsUrl = `${serverUrl}/models`; - const deleteModelUrl = `${serverUrl}/models/delete`; + const createModelUrl = `${serverUrl}/models`; const testModelUrl = `${serverUrl}/models/test`; const defaultModel: IModelConfig = { @@ -50,28 +55,23 @@ const ModelsView = ({}: any) => { ); const [showNewModelModal, setShowNewModelModal] = React.useState(false); - const [showModelModal, setShowModelModal] = React.useState(false); const deleteModel = (model: IModelConfig) => { setError(null); setLoading(true); - // const fetch; + const deleteModelUrl = `${serverUrl}/models/delete?user_id=${user?.email}&model_id=${model.id}`; const payLoad = { method: "DELETE", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({ - user_id: user?.email, - model: model, - }), }; const onSuccess = (data: any) => { if (data && data.status) { message.success(data.message); - setModels(data.data); + fetchModels(); } else { message.error(data.message); } @@ -111,9 +111,10 @@ const ModelsView = ({}: any) => { fetchJSON(listModelsUrl, payLoad, onSuccess, onError); }; - const saveModel = (model: IModelConfig) => { + const createModel = (model: IModelConfig) => { setError(null); setLoading(true); + model.user_id = user?.email; const payLoad = { method: "POST", @@ -121,17 +122,14 @@ const ModelsView = ({}: any) => { Accept: "application/json", "Content-Type": "application/json", }, - body: JSON.stringify({ - user_id: user?.email, - model: model, - }), + body: JSON.stringify(model), }; const onSuccess = (data: any) => { if (data && data.status) { message.success(data.message); - // console.log("models", data.data); - setModels(data.data); + const updatedModels = [data.data].concat(models || []); + setModels(updatedModels); } else { message.error(data.message); } @@ -142,7 +140,7 @@ const ModelsView = ({}: any) => { message.error(err.message); setLoading(false); }; - fetchJSON(saveModelsUrl, payLoad, onSuccess, onError); + fetchJSON(createModelUrl, payLoad, onSuccess, onError); }; React.useEffect(() => { @@ -180,7 +178,7 @@ const ModelsView = ({}: any) => { let newModel = { ...model }; newModel.model = `${model.model} Copy`; newModel.user_id = user?.email; - newModel.timestamp = new Date().toISOString(); + newModel.updated_at = new Date().toISOString(); if (newModel.id) { delete newModel.id; } @@ -200,27 +198,35 @@ const ModelsView = ({}: any) => { }, ]; return ( -
    -
    - {truncateText(model.model || "", 20)}
    - } - onClick={() => { - setSelectedModel(model); - setShowModelModal(true); - }} +
  • + {truncateText(model.model || "", 20)}
  • + } + onClick={() => { + setSelectedModel(model); + setShowModelModal(true); + }} + > +
    + {" "} + {truncateText(model.description || model.model || "", 70)} +
    +
    -
    - {" "} - {truncateText(model.description || model.model || "", 70)} -
    -
    {timeAgo(model.timestamp || "")}
    - - -
    -
    + {timeAgo(model.updated_at || "")} +
    + + + ); }); @@ -231,47 +237,20 @@ const ModelsView = ({}: any) => { setShowModelModal, handler, }: { - model: IModelConfig | null; + model: IModelConfig; setModel: (model: IModelConfig | null) => void; showModelModal: boolean; setShowModelModal: (show: boolean) => void; handler?: (agent: IModelConfig) => void; }) => { - const [loadingModelTest, setLoadingModelTest] = React.useState(false); - const [modelStatus, setModelStatus] = React.useState(null); - - const [localModel, setLocalModel] = React.useState( - model - ); - const testModel = (model: IModelConfig) => { - setModelStatus(null); - setLoadingModelTest(true); - const payLoad = { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - user_id: user?.email, - model: model, - }), - }; + const [localModel, setLocalModel] = React.useState(model); - const onSuccess = (data: any) => { - if (data && data.status) { - message.success(data.message); - setModelStatus(data.data); - } else { - message.error(data.message); - } - setLoadingModelTest(false); - setModelStatus(data); - }; - const onError = (err: any) => { - message.error(err.message); - setLoadingModelTest(false); - }; - fetchJSON(testModelUrl, payLoad, onSuccess, onError); + const closeModal = () => { + setModel(null); + setShowModelModal(false); + if (handler) { + handler(model); + } }; return ( @@ -284,137 +263,21 @@ const ModelsView = ({}: any) => { } width={800} open={showModelModal} - footer={[ - , - , - , - ]} + footer={[]} onOk={() => { - setModel(null); - setShowModelModal(false); - if (handler) { - if (localModel) { - handler(localModel); - } - } + closeModal(); }} onCancel={() => { - setModel(null); - setShowModelModal(false); + closeModal(); }} > -
    -
    Enter parameters for your model.
    - { - setLocalModel({ ...localModel, model: e.target.value }); - }} - /> - { - if (localModel) { - setLocalModel({ ...localModel, api_key: e.target.value }); - } - }} - /> - { - if (localModel) { - setLocalModel({ ...localModel, base_url: e.target.value }); - } - }} - /> - { - if (localModel) { - setLocalModel({ ...localModel, api_type: e.target.value }); - } - }} + {model && ( + - { - if (localModel) { - setLocalModel({ ...localModel, api_version: e.target.value }); - } - }} - /> -