-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from homanp/finetune-gpt-3.5
WIP
- Loading branch information
Showing
12 changed files
with
255 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from pydantic import BaseModel


class IngestRequest(BaseModel):
    """Request payload for an ingest job."""

    # Callback URL to notify once the ingest job completes.
    webhook_url: str
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import asyncio | ||
import os | ||
import uuid | ||
from abc import ABC, abstractmethod | ||
from typing import Dict, List, Tuple, Union | ||
|
||
import openai | ||
from decouple import config | ||
from llama_index import Document | ||
from numpy import ndarray | ||
|
||
from lib.service.prompts import GPT_DATA_FORMAT, generate_qa_pair_prompt | ||
|
||
# Configure the shared OpenAI client once at import time from the environment.
openai.api_key = config("OPENAI_API_KEY")
|
||
|
||
class FinetuningService(ABC):
    """Abstract base for services that build a training dataset from
    document nodes and launch a fine-tuning job with a provider."""

    def __init__(self, nodes: List[Union[Document, None]]):
        # Chunked document nodes the dataset is generated from.
        self.nodes = nodes

    @abstractmethod
    async def generate_dataset(self) -> str:
        """Write a training file from ``self.nodes`` and return its path.

        (Annotation fixed: the concrete implementation returns a file
        path string, not ``List[Tuple[str, ndarray]]``.)
        """

    @abstractmethod
    async def finetune(self, training_file: str) -> Dict:
        """Start a fine-tuning job for ``training_file``; return job info."""

    async def cleanup(self, training_file: str) -> None:
        """Delete the local training file once the job has been submitted.

        BUG FIX: this was marked ``@abstractmethod``; since
        ``OpenAIFinetuningService`` never overrides it, the subclass could
        not be instantiated (``TypeError`` from ABC). It is now a concrete
        default implementation that subclasses may override.
        """
        os.remove(training_file)
|
||
|
||
class OpenAIFinetuningService(FinetuningService):
    """Generates a QA-pair dataset with GPT-3.5 and launches an OpenAI
    fine-tuning job from it."""

    def __init__(
        self,
        nodes: List[Union[Document, None]],
        num_questions_per_chunk: int = 10,
        batch_size: int = 10,
    ):
        super().__init__(nodes=nodes)
        # Number of QA pairs requested per document chunk.
        self.num_questions_per_chunk = num_questions_per_chunk
        # Max number of concurrent completion requests per batch.
        self.batch_size = batch_size

    async def generate_prompt_and_completion(self, node):
        """Ask GPT-3.5 for QA pairs covering a single node's text."""
        # BUG FIX: honor the configured num_questions_per_chunk — it was
        # hard-coded to 10, silently ignoring the constructor argument.
        prompt = generate_qa_pair_prompt(
            context=node.text,
            num_of_qa_paris=self.num_questions_per_chunk,
            format=GPT_DATA_FORMAT,
        )
        completion = await openai.ChatCompletion.acreate(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,  # deterministic output for dataset generation
        )
        return completion.choices[0].message.content

    async def generate_dataset(self) -> str:
        """Write QA pairs for every node to a JSONL file; return its path."""
        training_file = f"{uuid.uuid4()}.jsonl"
        with open(training_file, "w") as f:
            # Process nodes in chunks of batch_size so at most batch_size
            # completion requests run concurrently.
            for i in range(0, len(self.nodes), self.batch_size):
                tasks = [
                    self.generate_prompt_and_completion(node)
                    for node in self.nodes[i : i + self.batch_size]
                ]
                qa_pairs = await asyncio.gather(*tasks)
                for qa_pair in qa_pairs:
                    # The model separates JSON objects with blank lines;
                    # split them into one-object-per-line JSONL rows.
                    for json_obj in qa_pair.split("\n\n"):
                        f.write(json_obj + "\n")
        # BUG FIX: previously returned None despite the declared -> str,
        # leaving callers no way to locate the generated training file.
        return training_file

    async def finetune(self, training_file: str) -> Dict:
        """Upload ``training_file`` and start an OpenAI fine-tuning job."""
        # BUG FIX: close the upload handle deterministically — the file
        # object passed to File.create was never closed before.
        # NOTE(review): File.create is a blocking call inside an async
        # method; consider File.acreate if the SDK version supports it.
        with open(training_file, "rb") as fp:
            file = openai.File.create(file=fp, purpose="fine-tune")
        finetune = await openai.FineTuningJob.acreate(
            training_file=file.get("id"), model="gpt-3.5-turbo"
        )
        # Include the local path so callers can clean the file up later.
        return {**finetune, "training_file": training_file}
|
||
|
||
async def get_finetuning_service(
    nodes: List[Union[Document, None]],
    provider: str = "openai",
    num_questions_per_chunk: int = 10,
    batch_size: int = 10,
):
    """Factory: resolve ``provider`` to a configured FinetuningService.

    Raises ValueError for providers with no registered implementation.
    """
    registry = {
        "openai": OpenAIFinetuningService,
        # Add other providers here
    }
    service_class = registry.get(provider)
    if service_class is None:
        raise ValueError(f"Unsupported provider: {provider}")
    return service_class(
        nodes=nodes,
        num_questions_per_chunk=num_questions_per_chunk,
        batch_size=batch_size,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from typing import List, Union | ||
|
||
import openai | ||
from llama_index import Document | ||
from prefect import flow, task | ||
|
||
from lib.service.embedding import EmbeddingService | ||
from lib.service.finetune import get_finetuning_service | ||
from lib.utils.prisma import prisma | ||
from prisma.models import Datasource | ||
|
||
|
||
@task
async def create_vector_embeddings(
    datasource: Datasource,
) -> List[Union[Document, None]]:
    """Chunk the datasource's documents and persist their embeddings."""
    service = EmbeddingService(datasource=datasource)
    documents = await service.generate_documents()
    chunks = await service.generate_chunks(documents=documents)
    await service.generate_embeddings(nodes=chunks)
    return chunks
|
||
|
||
@task
async def create_finetuned_model(datasource: Datasource):
    """Build a QA dataset from the datasource, run a fine-tuning job,
    and return the retrieved job object."""
    embedding_service = EmbeddingService(datasource=datasource)
    documents = await embedding_service.generate_documents()
    nodes = await embedding_service.generate_chunks(documents=documents)
    finetunning_service = await get_finetuning_service(
        nodes=nodes, provider="openai", batch_size=5
    )
    # BUG FIX: capture the dataset path and pass it on — previously the
    # path was discarded and finetune() was called without its required
    # ``training_file`` argument, raising TypeError at runtime.
    training_file = await finetunning_service.generate_dataset()
    finetune_job = await finetunning_service.finetune(training_file=training_file)
    # BUG FIX: finetune() returns a plain dict, so use .get("id") rather
    # than attribute access, and poll the FineTuningJob endpoint that
    # matches the job created via FineTuningJob.acreate — the original
    # awaited the synchronous legacy FineTune.retrieve call.
    finetune = await openai.FineTuningJob.aretrieve(id=finetune_job.get("id"))
    await finetunning_service.cleanup(training_file=finetune_job.get("training_file"))
    return finetune
|
||
|
||
@flow(name="create_embeddings", description="Create embeddings", retries=0)
async def create_embeddings(datasource: Datasource):
    """Prefect flow: generate and store vector embeddings for a datasource."""
    await create_vector_embeddings(datasource=datasource)
|
||
|
||
@flow(name="create_finetune", description="Create a finetune", retries=0)
async def create_finetune(datasource: Datasource):
    """Prefect flow: fine-tune a model and record the job on the datasource."""
    result = await create_finetuned_model(datasource=datasource)
    await prisma.datasource.update(
        where={"id": datasource.id},
        data={"finetune": result},
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# flake8: noqa

# One example line of the OpenAI chat fine-tuning JSONL format. It is
# embedded in the generation prompt to show the model the exact shape
# each generated QA pair must follow.
GPT_DATA_FORMAT = (
    "{"
    '"messages": ['
    '{"role": "system", "content": "You are an AI agent that\'s an expert at answering questions."}, '
    '{"role": "user", "content": "What\'s the capital of France?"}, '
    '{"role": "assistant", "content": "Paris, as if everyone doesn\'t know that already."}'
    "]"
    "}"
)
|
||
|
||
def generate_qa_pair_prompt(format: str, context: str, num_of_qa_paris: int = 10):
    """Build the instruction prompt asking the model for QA pairs.

    ``num_of_qa_paris`` keeps its (misspelled) name for backward
    compatibility: callers pass it as a keyword argument.
    """
    # BUG FIX: the original fragments concatenated without separators,
    # producing run-together words in the prompt such as "pairsfor",
    # "withno", "{n}Return" and "JSONL.Each".
    prompt = (
        "You are an AI assistant tasked with generating question and answer pairs "
        "for the given context using the given format. Only answer in the format with "
        f"no other text. You should create the following number of question/answer pairs: {num_of_qa_paris}. "
        "Return the question/answer pairs as a JSONL. "
        "Each dict in the list should have the full context provided, "
        "a relevant question to the context and an answer to the question.\n\n"
        f"Format:\n {format}\n\n"
        f"Context:\n {context}"
    )
    return prompt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 3 additions & 0 deletions
3
prisma/migrations/20231007192636_datasource_finetune_webhook/migration.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
-- Add the fine-tune reference and webhook callback columns to Datasource.
-- NOTE(review): this migration may already be applied; comment-only edits
-- still change its Prisma checksum — confirm before committing.
-- AlterTable
ALTER TABLE "Datasource" ADD COLUMN "finetuneId" TEXT,
ADD COLUMN "webhookUrl" TEXT;
12 changes: 12 additions & 0 deletions
12
prisma/migrations/20231007193258_datasource_fields_update/migration.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
/*
  Warnings:

  - You are about to drop the column `finetuneId` on the `Datasource` table. All the data in the column will be lost.
  - You are about to drop the column `webhookUrl` on the `Datasource` table. All the data in the column will be lost.

*/
-- Rename to snake_case: drop the camelCase columns and recreate them as
-- "finetune_id" / "webhook_url" (data in the old columns is lost).
-- AlterTable
ALTER TABLE "Datasource" DROP COLUMN "finetuneId",
DROP COLUMN "webhookUrl",
ADD COLUMN "finetune_id" TEXT,
ADD COLUMN "webhook_url" TEXT;
9 changes: 9 additions & 0 deletions
9
prisma/migrations/20231007194311_datasource_finetune_object/migration.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
/*
  Warnings:

  - You are about to drop the column `finetune_id` on the `Datasource` table. All the data in the column will be lost.

*/
-- Replace the scalar id column with a JSONB column holding the whole
-- fine-tune job object.
-- AlterTable
ALTER TABLE "Datasource" DROP COLUMN "finetune_id",
ADD COLUMN "finetune" JSONB;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters