Skip to content

Commit

Permalink
Add method for generating synthetic QA pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
homanp committed Oct 6, 2023
1 parent f835472 commit 81f19b1
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 15 deletions.
4 changes: 3 additions & 1 deletion lib/api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ async def ingest(body: dict):
documents = await embedding_service.generate_documents()
nodes = await embedding_service.generate_chunks(documents=documents)
# embeddings = await embedding_service.generate_embeddings(nodes=nodes)
finetunning_service = await get_finetuning_service(nodes=nodes, provider="openai")
finetunning_service = await get_finetuning_service(
nodes=nodes, provider="openai", batch_size=5
)
await finetunning_service.generate_dataset()
# print(embeddings)
return {"success": True, "data": None}
32 changes: 23 additions & 9 deletions lib/service/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,37 +23,47 @@ async def generate_dataset(self) -> List[Tuple[str, ndarray]]:

class OpenAIFinetuningService(FinetuningService):
    """Generate a synthetic question/answer fine-tuning dataset with OpenAI chat models.

    Nodes (document chunks) are processed in batches of ``batch_size``; for each
    node the model is asked to emit QA pairs in ``GPT_DATA_FORMAT``, and the
    results are appended to ``dataset.jsonl`` (one JSON object per line).
    """

    def __init__(
        self,
        nodes: List[Union[Document, None]],
        num_questions_per_chunk: int = 10,
        batch_size: int = 10,
    ):
        """Store generation settings.

        Args:
            nodes: Document chunks to generate QA pairs from.
            num_questions_per_chunk: How many QA pairs to request per chunk.
            batch_size: How many concurrent completion requests per batch.
        """
        super().__init__(nodes=nodes)
        self.num_questions_per_chunk = num_questions_per_chunk
        self.batch_size = batch_size

    async def generate_prompt_and_completion(self, node):
        """Ask the chat model for QA pairs covering ``node.text``; return raw text."""
        prompt = generate_qa_pair_prompt(
            context=node.text,
            # Fix: was hard-coded to 10, silently ignoring num_questions_per_chunk.
            num_of_qa_paris=self.num_questions_per_chunk,
            format=GPT_DATA_FORMAT,
        )
        completion = await openai.ChatCompletion.acreate(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # low temperature keeps output close to the format spec
        )
        return completion.choices[0].message.content

    async def generate_dataset(self):
        """Write generated QA pairs to ``dataset.jsonl``, batching API calls.

        NOTE(review): assumes the model separates JSON objects with blank lines
        ("\n\n") as instructed by the prompt — malformed responses are written
        through unvalidated; confirm downstream tolerance.
        """
        with open("dataset.jsonl", "w") as f:
            # Process nodes in chunks of batch_size to bound concurrency.
            for i in range(0, len(self.nodes), self.batch_size):
                tasks = [
                    self.generate_prompt_and_completion(node)
                    for node in self.nodes[i : i + self.batch_size]
                ]
                qa_pairs = await asyncio.gather(*tasks)
                for qa_pair in qa_pairs:
                    # Model returns multiple JSON objects separated by blank lines;
                    # emit each on its own line to produce valid JSONL.
                    for json_obj in qa_pair.split("\n\n"):
                        f.write(json_obj + "\n")


async def get_finetuning_service(
    nodes: List[Union[Document, None]],
    provider: str = "openai",
    num_questions_per_chunk: int = 10,
    batch_size: int = 10,
):
    """Factory: resolve *provider* to a configured FinetuningService instance.

    Args:
        nodes: Document chunks the service will generate QA pairs from.
        provider: Registered provider key (currently only ``"openai"``).
        num_questions_per_chunk: Forwarded to the service constructor.
        batch_size: Forwarded to the service constructor.

    Raises:
        ValueError: If *provider* has no registered service class.
    """
    services = {
        "openai": OpenAIFinetuningService,
    }
    service = services.get(provider)
    if service is None:
        raise ValueError(f"Unsupported provider: {provider}")
    return service(
        nodes=nodes,
        num_questions_per_chunk=num_questions_per_chunk,
        batch_size=batch_size,
    )
10 changes: 5 additions & 5 deletions lib/service/prompts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Example record shown to the model so its output matches the OpenAI chat
# fine-tuning format. Uses JSON double-quoted keys/values (valid JSON) —
# single-quoted Python-dict style would not parse as JSONL downstream.
GPT_DATA_FORMAT = (
    "{"
    '"messages": ['
    '{"role": "system", "content": "You are an AI agent that\'s an expert at answering questions."}, '
    '{"role": "user", "content": "What\'s the capital of France?"}, '
    '{"role": "assistant", "content": "Paris, as if everyone doesn\'t know that already."}'
    "]"
    "}"
)
Expand All @@ -14,7 +14,7 @@ def generate_qa_pair_prompt(format: str, context: str, num_of_qa_paris: int = 10
"You are an AI assistant tasked with generating question and answer pairs"
"for the given context using the given format. Only answer in the format with"
f"no other text. You should create the following number of question/answer pairs: {num_of_qa_paris}"
"Return the question/answer pairs as a Python List."
"Return the question/answer pairs as a JSONL."
"Each dict in the list should have the full context provided,"
"a relevant question to the context and an answer to the question.\n\n"
f"Format:\n {format}\n\n"
Expand Down

0 comments on commit 81f19b1

Please sign in to comment.