Skip to content

Commit

Permalink
Add method for generating synthetic QA pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
homanp committed Oct 6, 2023
1 parent f835472 commit 81f19b1
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 15 deletions.
4 changes: 3 additions & 1 deletion lib/api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ async def ingest(body: dict):
documents = await embedding_service.generate_documents()
nodes = await embedding_service.generate_chunks(documents=documents)
# embeddings = await embedding_service.generate_embeddings(nodes=nodes)
finetunning_service = await get_finetuning_service(nodes=nodes, provider="openai")
finetunning_service = await get_finetuning_service(
nodes=nodes, provider="openai", batch_size=5
)
await finetunning_service.generate_dataset()
# print(embeddings)
return {"success": True, "data": None}
32 changes: 23 additions & 9 deletions lib/service/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,37 +23,47 @@ async def generate_dataset(self) -> List[Tuple[str, ndarray]]:

class OpenAIFinetuningService(FinetuningService):
    """Generate a synthetic question/answer fine-tuning dataset with OpenAI chat models.

    Nodes (document chunks) are processed in batches of ``batch_size``; for each
    node the model is asked to emit QA pairs in ``GPT_DATA_FORMAT``, and the
    results are appended to ``dataset.jsonl`` (one JSON object per line).
    """

    def __init__(
        self,
        nodes: List[Union[Document, None]],
        num_questions_per_chunk: int = 10,
        batch_size: int = 10,
    ):
        """Store generation settings.

        Args:
            nodes: Document chunks to generate QA pairs from.
            num_questions_per_chunk: How many QA pairs to request per chunk.
            batch_size: How many concurrent completion requests per batch.
        """
        super().__init__(nodes=nodes)
        self.num_questions_per_chunk = num_questions_per_chunk
        self.batch_size = batch_size

    async def generate_prompt_and_completion(self, node):
        """Ask the chat model for QA pairs covering ``node.text``; return raw text."""
        prompt = generate_qa_pair_prompt(
            context=node.text,
            # Fix: was hard-coded to 10, silently ignoring num_questions_per_chunk.
            num_of_qa_paris=self.num_questions_per_chunk,
            format=GPT_DATA_FORMAT,
        )
        completion = await openai.ChatCompletion.acreate(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # low temperature keeps output close to the format spec
        )
        return completion.choices[0].message.content

    async def generate_dataset(self):
        """Write generated QA pairs to ``dataset.jsonl``, batching API calls.

        NOTE(review): assumes the model separates JSON objects with blank lines
        ("\n\n") as instructed by the prompt — malformed responses are written
        through unvalidated; confirm downstream tolerance.
        """
        with open("dataset.jsonl", "w") as f:
            # Process nodes in chunks of batch_size to bound concurrency.
            for i in range(0, len(self.nodes), self.batch_size):
                tasks = [
                    self.generate_prompt_and_completion(node)
                    for node in self.nodes[i : i + self.batch_size]
                ]
                qa_pairs = await asyncio.gather(*tasks)
                for qa_pair in qa_pairs:
                    # Model returns multiple JSON objects separated by blank lines;
                    # emit each on its own line to produce valid JSONL.
                    for json_obj in qa_pair.split("\n\n"):
                        f.write(json_obj + "\n")


async def get_finetuning_service(
    nodes: List[Union[Document, None]],
    provider: str = "openai",
    num_questions_per_chunk: int = 10,
    batch_size: int = 10,
):
    """Factory: resolve *provider* to a configured FinetuningService instance.

    Args:
        nodes: Document chunks the service will generate QA pairs from.
        provider: Registered provider key (currently only ``"openai"``).
        num_questions_per_chunk: Forwarded to the service constructor.
        batch_size: Forwarded to the service constructor.

    Raises:
        ValueError: If *provider* has no registered service class.
    """
    services = {
        "openai": OpenAIFinetuningService,
    }
    service = services.get(provider)
    if service is None:
        raise ValueError(f"Unsupported provider: {provider}")
    return service(
        nodes=nodes,
        num_questions_per_chunk=num_questions_per_chunk,
        batch_size=batch_size,
    )
10 changes: 5 additions & 5 deletions lib/service/prompts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Example record shown to the model so its output matches the OpenAI chat
# fine-tuning format. Uses JSON double-quoted keys/values (valid JSON) —
# single-quoted Python-dict style would not parse as JSONL downstream.
GPT_DATA_FORMAT = (
    "{"
    '"messages": ['
    '{"role": "system", "content": "You are an AI agent that\'s an expert at answering questions."}, '
    '{"role": "user", "content": "What\'s the capital of France?"}, '
    '{"role": "assistant", "content": "Paris, as if everyone doesn\'t know that already."}'
    "]"
    "}"
)
Expand All @@ -14,7 +14,7 @@ def generate_qa_pair_prompt(format: str, context: str, num_of_qa_paris: int = 10
"You are an AI assistant tasked with generating question and answer pairs"
"for the given context using the given format. Only answer in the format with"
f"no other text. You should create the following number of question/answer pairs: {num_of_qa_paris}"
"Return the question/answer pairs as a Python List."
"Return the question/answer pairs as a JSONL."
"Each dict in the list should have the full context provided,"
"a relevant question to the context and an answer to the question.\n\n"
f"Format:\n {format}\n\n"
Expand Down

0 comments on commit 81f19b1

Please sign in to comment.