Skip to content

Commit

Permalink
feature: get initial prompts & completions from synthetic dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
jarvis8x7b committed Feb 2, 2024
1 parent ffdc937 commit 3551591
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 58 deletions.
109 changes: 67 additions & 42 deletions commons/dataset.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from enum import StrEnum
from typing import Dict, Iterable, List
from typing import Dict, Iterable, List, Tuple
import random
import bittensor as bt

from datasets import load_dataset, interleave_datasets

from template.protocol import Completion, RankingRequest

seed = 42


Expand Down Expand Up @@ -66,42 +68,6 @@ def get_openai_webgpt_comparisons():
)


# NOTE this serves as a start for prompt/completion pairs to be generated because at first there will be no requests coming in
oasst1 = load_dataset(
DatasetName.OPENASSISTANT_OASST1,
split="train",
streaming=True,
)
oasst2 = load_dataset(
DatasetName.OPENASSISTANT_OASST2,
split="train",
streaming=True,
)


def _is_oasst_prompt(row):
return not row["parent_id"] and row["role"] == "prompter"


# ensure we only grab the 'text' fields from the dataset
oasst1_prompts = oasst1.filter(lambda row: _is_oasst_prompt(row)).map(
lambda row: {"text": row["text"]}
)
oasst2_prompts = oasst2.filter(lambda row: _is_oasst_prompt(row)).map(
lambda row: {"text": row["text"]}
)
ALL_OASST_PROMPTS = "all_oasst_prompts"
seed_datasets = {
ALL_OASST_PROMPTS: iter(
interleave_datasets(
[oasst1_prompts, oasst2_prompts],
probabilities=[0.5, 0.5],
seed=seed,
stopping_strategy="all_exhausted",
)
)
}

eval_datasets = {
DatasetName.ANTHROPIC_HHRLHF: iter(get_anthropic_hhrlhf()),
DatasetName.STANFORD_SHP: iter(get_stanford_shp()),
Expand Down Expand Up @@ -132,10 +98,69 @@ def get_batch() -> List[Dict]:
return [next_circular(eval_datasets, key) for _ in range(batch_size)]


# # NOTE this serves as a start for prompt/completion pairs to be generated because at first there will be no requests coming in
# oasst1 = load_dataset(
# DatasetName.OPENASSISTANT_OASST1,
# split="train",
# streaming=True,
# )
# oasst2 = load_dataset(
# DatasetName.OPENASSISTANT_OASST2,
# split="train",
# streaming=True,
# )


# def _is_oasst_prompt(row):
# return not row["parent_id"] and row["role"] == "prompter"


# # ensure we only grab the 'text' fields from the dataset
# oasst1_prompts = oasst1.filter(lambda row: _is_oasst_prompt(row)).map(
# lambda row: {"text": row["text"]}
# )
# oasst2_prompts = oasst2.filter(lambda row: _is_oasst_prompt(row)).map(
# lambda row: {"text": row["text"]}
# )
# ALL_OASST_PROMPTS = "all_oasst_prompts"
# seed_datasets = {
# ALL_OASST_PROMPTS: iter(
# interleave_datasets(
# [oasst1_prompts, oasst2_prompts],
# probabilities=[0.5, 0.5],
# seed=seed,
# stopping_strategy="all_exhausted",
# )
# )
# }

# TODO: change name to actual dataset name
seed_dataset_name = "prooompt/test_dataset"


def get_seed_dataset():
    """Open a fresh streaming iterator over the seed dataset's train split.

    Streaming avoids downloading the full dataset up front; callers wrap the
    result in ``iter(...)`` and re-call this function once it is exhausted.
    """
    load_kwargs = {"split": "train", "streaming": True}
    return load_dataset(seed_dataset_name, **load_kwargs)


seed_dataset = iter(get_seed_dataset())


class SeedDataManager:
    """Serves prompt/completion pairs from the module-level streaming seed dataset.

    Used to bootstrap ranking requests while no real requests are coming in.
    """

    @staticmethod
    def get_prompt_and_completions(n: int = 1) -> Tuple[str, List[str]]:
        """Return ``(prompt, completions)`` from the next row of the seed dataset.

        Restarts the stream transparently when it is exhausted.

        Args:
            n: accepted for caller compatibility (the validator passes ``n=1``);
               currently a single prompt/completions pair is always returned.

        Returns:
            The prompt text and the list of completion texts for one row.
        """
        global seed_dataset
        try:
            row = next(seed_dataset)
        except StopIteration:
            # Stream exhausted: re-open the dataset and retry once.
            seed_dataset = iter(get_seed_dataset())
            row = next(seed_dataset)
        return SeedDataManager._map_seed_data(row)

    @staticmethod
    def _map_seed_data(row: Dict) -> Tuple[str, List[str]]:
        """Flatten one dataset row into (prompt, [completion texts]).

        Assumes the row schema has a "prompt" string and a "responses" list of
        dicts each carrying a "response" string — TODO confirm against the
        published seed dataset.
        """
        prompt = row["prompt"]
        completions = [c["response"] for c in row["responses"]]
        return prompt, completions
23 changes: 7 additions & 16 deletions neurons/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import bittensor as bt
from commons.data_manager import DataManager
from commons.dataset import SeedDataManager
from commons.objects import DendriteQueryResponse
from commons.consensus import Consensus

Expand Down Expand Up @@ -54,7 +55,7 @@ async def _update_score_and_send_feedback(self):
await self._forward_consensus(
synapse=d.result, hotkeys=list(d.hotkey_to_scores.keys())
)
asyncio.sleep(5)
await asyncio.sleep(5)

async def forward(self):
"""
Expand All @@ -65,24 +66,14 @@ async def forward(self):
- Rewarding the miners
- Updating the scores
"""

# TODO change to real data
prompt, completions = SeedDataManager.get_prompt_and_completions(n=1)
request = RankingRequest(
n_completions=3,
n_completions=len(completions),
pid=get_new_uuid(),
prompt="What is your name?",
completions=[
Completion(
text="My name is Assistant, and I am a helpful assisstant created by OpenAI.",
),
Completion(
text="My name is Llama, and I am an assistant created by Meta.",
),
Completion(
text="My name is GPT-3, and I am an AI created by OpenAI.",
),
],
prompt=prompt,
completions=[Completion(text=c) for c in completions],
)

miner_uids = get_random_uids(
metagraph=self.metagraph,
k=self.config.neuron.sample_size,
Expand Down

0 comments on commit 3551591

Please sign in to comment.