From 0e996a9a86dbf74907411a623523b4e87df87fb6 Mon Sep 17 00:00:00 2001
From: jarvis8x7b <157810922+jarvis8x7b@users.noreply.github.com>
Date: Thu, 8 Feb 2024 01:27:00 +0800
Subject: [PATCH] feat: update working scaleai test script

---
 test_scaleai.py | 247 ++++++++++++++++-------------------------------
 1 file changed, 81 insertions(+), 166 deletions(-)

diff --git a/test_scaleai.py b/test_scaleai.py
index 64cc76c8..a7604501 100644
--- a/test_scaleai.py
+++ b/test_scaleai.py
@@ -6,14 +6,13 @@
 from typing import List, Optional
 from uuid import uuid4
 from datetime import datetime
-from fastapi.encoders import jsonable_encoder
-import requests
 import scaleapi
 from dotenv import load_dotenv
-from scaleapi.exceptions import ScaleDuplicateResource
+from scaleapi.exceptions import ScaleDuplicateResource, ScaleInvalidRequest
 from scaleapi.tasks import TaskType
 
 from commons.llm.openai_proxy import Provider
+from commons.llm.prompts import ScoreRange
 from commons.objects import ScoreItem, ScoresResponse
 from commons.reward_model.models import ModelUtils
@@ -26,7 +25,47 @@
 scale_api_key = os.getenv("SCALE_API_KEY")
 client = scaleapi.ScaleClient(api_key=scale_api_key)
-url = "https://api.scale.com/v1/projects"
+import textwrap
+
+
+score_range = ScoreRange(lower=1, upper=10)
+
+
+def _build_project_level_instruction(score_range):
+    instruction = f"""
+    # Instructions
+    Score each prompt and its completions, where {score_range.lower} is the lowest score and {score_range.upper} is the highest score.
+    """
+    return textwrap.dedent(instruction)
+
+
+def _build_task_level_instruction(score_range):
+    instruction = f"""
+    # Instructions
+    Score each prompt and its completions, where {score_range.lower} is the lowest score and {score_range.upper} is the highest score.
+    """
+    return textwrap.dedent(instruction)
+
+
+def _build_fields_dict(score_range):
+    return {
+        "type": "number",
+        "field_id": "Quality Score",
+        "title": "Quality Score",
+        "required": True,
+        "use_slider": True,
+        "min": score_range.lower,
+        "max": score_range.upper,
+        # text that labellers will see at the "min" end of the slider
+        "prefix": "lowest quality",
+        # text that labellers will see at the "max" end of the slider
+        "suffix": "highest quality",
+        # prompt shown above the slider
+        "description": f"Rate the completions' quality compared to the prompt from {score_range.lower} (lowest quality) to {score_range.upper} (highest quality)",
+        # "hint": None,
+    }
 
 
 def create_project(project_name: str) -> Optional[bool]:
@@ -37,80 +76,28 @@
     project_payload = {
         "task_type": TaskType.TextCollection,
         "project_name": project_name,
-        "rapid": False,
         "studio": True,
-        "pipeline": "standard_task",
         "params": {
-            "instruction": "# Instructions\n\nScore each prompt and completions, where 1 is the lowest score and 10 is the highest score.",
-            # # specify this so for studio projects
-            "fields": [
-                {
-                    "type": "number",
-                    "field_id": "Quality Score",
-                    "title": "Quality Score",
-                    "required": True,
-                    "use_slider": True,
-                    "min": 1,
-                    "max": 10,
-                    "prefix": "lowest quality",
-                    "suffix": "highest quality",
-                    # prompt that human labellers will see
-                    "description": "Rate the completions quality compared to the prompt from 1 (lowest quality) to 10 (highest quality)",
-                    # "hint": None,
-                },
-            ],
+            "instruction": _build_project_level_instruction(score_range),
+            "fields": [_build_fields_dict(score_range)],
         },
     }
-
-    headers = {
-        "accept": "application/json",
-        "content-type": "application/json",
-        "authorization": "Basic bGl2ZV84YjIxOWMzNDM5MmI0NjVlYTQwZDU1MzQ3ODNjYjVmZTo=",
-    }
     try:
-        response = requests.post(
-            url, headers=headers, json=jsonable_encoder(project_payload)
-        )
-        print(response.status_code)
-        print(response.json())
-        # project = client.create_project(**project_payload)
+        _ = client.create_project(**project_payload)
     except ScaleDuplicateResource as e:
-        # for some reason will get duplicate resource error although project was created
+        # may get duplicate resource error although project was created properly...
        pass
-    is_project_created = project_name in [p.name for p in client.get_projects()]
-    print(f"Successfully created project? {is_project_created}")
-    return is_project_created
+    if is_project_created := project_name in [p.name for p in client.get_projects()]:
+        batch_name = build_batch_name(project_name)
+        try:
+            # need to create first batch in order to "complete setup"
+            _ = client.create_batch(project=project_name, batch_name=batch_name)
+        except Exception as e:
+            pass
-def _ensure_project_setup_completed(project_name: str):
-    project = client.get_project(project_name)
-    print(f"Before: {project=}")
-    payload = {
-        "patch": True,
-        "pipelineName": "standard_task",
-        "numReviews": 0,
-        # "params": {
-        #     "pipeline": "standard_task",
-        #     "fields": [
-        #         {
-        #             "type": "number",
-        #             "field_id": "Quality Score",
-        #             "title": "Quality Score",
-        #             "required": True,
-        #             "use_slider": True,
-        #             "min": 1,
-        #             "max": 10,
-        #             "prefix": "lowest quality",
-        #             "suffix": "highest quality",
-        #             "description": "Rate the completions quality compared to the prompt from 1 (lowest quality) to 10 (highest quality)",
-        #         },
-        #     ],
-        # },
-    }
-    client.update_project(project_name, **payload)
-
-    project = client.get_project(project_name)
-    print(f"After: {project=}")
+    print(f"Successfully created project? {is_project_created}")
+    return is_project_created
 
 
 def dedupe_dataset():
@@ -143,124 +130,54 @@
     return deduplicated_data
 
 
-def build_batch_name(project_name: str, is_calibration) -> str:
+def build_batch_name(project_name: str) -> str:
     current_date = datetime.now().strftime("%Y_%m_%d")
-    # TODO remove after testing
-    batch_name = f"{project_name}_{current_date}_2"
-    if not is_calibration:
-        return batch_name
-    return batch_name + "_calibration"
+    batch_name = f"{project_name}_{current_date}"
+    return batch_name
 
 
 def create_task(project_name: str):
     batch = None
-    batch_name = build_batch_name(project_name, False)
+    batch_name = build_batch_name(project_name)
     try:
         batch = client.create_batch(project=project_name, batch_name=batch_name)
+        print("Created batch successfully")
     except ScaleDuplicateResource as e:
         print("Batch already exists... skipping creation")
skipping creation") batch = client.get_batch(batch_name) pass - print(f"Batch json: {batch._json}") - - # # convert completions into attachments for human to view - # def format_prompt_and_completion(prompt: str, completion: Completion) -> dict: - # return { - # "type": "text", - # "content": f"# Prompt: {prompt} # Completion: {completion.text}", - # } - - # attachments = [{"type": "text", "content": c.text} for c in completions] payload = { - "instruction": "**Instructions:** Please annotate all the things", + "instruction": _build_task_level_instruction(score_range), + # TODO use multiple prompt / completion pairs "attachments": [ { "type": "text", "content": "# Prompt:\nwhat is your name?\n\n# Completion:\n my name is Alice, nice to meet you!", }, + { + "type": "text", + "content": "# Prompt:\nwhat is your name?\n\n# Completion:\n my name is Bob", + }, ], "responses_required": 1, "priority": 30, "project": project_name, "batch": batch_name, - "callback_url": "https://webhook.site/#!/71d292d7-4ef0-41a6-b8d5-b4e1717da367", + # TODO + "callback_url": "https://webhook.site/71d292d7-4ef0-41a6-b8d5-b4e1717da367", "title": "title", "description": "desc", "unique_id": str(uuid4()), - "fields": [ - { - "type": "number", - "field_id": "Quality Score", - "title": "Quality Score", - "description": "Rate the completions quality compared to the prompt from 1 (lowest quality) to 10 (highest quality)", - "required": False, - "use_slider": True, - "min": 1, - "max": 10, - # "prefix": "lowest quality", - # "suffix": "highest quality", - # prompt that human labellers will see - }, - ], - "isProcessed": False, + "fields": [_build_fields_dict(score_range)], } - task_payload = { - "project": project_name, - "batch": batch_name, - "callback_url": "https://webhook.site/#!/71d292d7-4ef0-41a6-b8d5-b4e1717da367", - "attachments": [ - { - "type": "text", - "content": "# Prompt:what is your name?\n\n# Completion: my name is Alice.", - }, - ], - "description": "This project blah blah description", - "responses_required": 1, - "instruction": "# Instructions\n\nScore each prompt and completions, where 1 is the lowest score and 10 is the highest score.", - "fields": [ - { - "type": "number", - "field_id": "Quality Score", - "title": "Quality Score", - "required": True, - "use_slider": True, - "min": 1, - "max": 10, - "prefix": "lowest quality", - "suffix": "highest quality", - # prompt that human labellers will see - "description": "Rate the completions quality compared to the prompt from 1 (lowest quality) to 10 (highest quality)", - # "hint": None, - }, - ], - "priority": 30, - # "params": { - # # "fields": [ - # # { - # # "type": "number", - # # "field_id": "Quality Score", - # # "title": "Quality Score", - # # "required": True, - # # "use_slider": True, - # # "min": 1, - # # "max": 10, - # # "prefix": "lowest quality", - # # "suffix": "highest quality", - # # # prompt that human labellers will see - # # "description": "Rate the completions quality compared to the prompt from 1 (lowest quality) to 10 (highest quality)", - # # # "hint": None, - # # }, - # # ], - # # specify this so that we can use num_reviews = - # # "pipeline": "standard_task", - # }, - } - # task = client.create_evaluation_task(TaskType.TextCollection, **task_payload) - # task = client.create_task(TaskType.TextCollection, **task_payload) task = client.create_task(TaskType.TextCollection, **payload) - batch.finalize() + try: + batch.finalize() + except ScaleInvalidRequest as e: + print(f"Error occured while finalising batch, exception: {str(e)}") + 
+        pass
 
 
 def create_eval_tasks(project_name):
@@ -269,7 +186,7 @@
         for line in f:
             eval_data.append(json.loads(line))
 
-    batch_name = build_batch_name(project_name, True)
+    batch_name = build_batch_name(project_name) + "_calibration"
     batch = None
     try:
         batch = client.create_batch(
@@ -301,7 +218,7 @@
         task_payload = {
             "project": project_name,
             "batch": batch_name,
-            "callback_url": "https://webhook.site/#!/71d292d7-4ef0-41a6-b8d5-b4e1717da367",
+            "callback_url": "https://webhook.site/71d292d7-4ef0-41a6-b8d5-b4e1717da367",
             "attachments": attachments,
             "expected_response": {
                 "Quality Score": {
@@ -332,7 +249,7 @@
         )
         print(f"Created task... for {batch_name}")
         count += 1
 
-    # finalised_res = batch.finalize()
+    finalised_res = batch.finalize()
     # print(f"Finalised batch, response: {finalised_res}")
@@ -381,7 +298,7 @@
 def one_time_finalise(project_name):
     batch = None
-    batch_name = build_batch_name(project_name, False)
+    batch_name = build_batch_name(project_name)
     try:
         batch = client.create_batch(project=project_name, batch_name=batch_name)
     except ScaleDuplicateResource as e:
@@ -394,14 +311,12 @@
 if __name__ == "__main__":
-    project_name = "human_feedback23"
-    # print(client.get_projects())
+    project_name = "human_feedback_PLEASE_WORK"
     create_project(project_name)
-    # _ensure_project_setup_completed(project_name)
+    create_task(project_name)
     # dedupe_dataset()
     # asyncio.run(add_scores_to_dataset())
     # create_eval_tasks(project_name)
-    # create_task(project_name)
     # one_time_finalise(project_name)
     # print(client.get_projects())