Merge pull request #26 from homanp/remove-db

Remove db

homanp authored Oct 18, 2023
2 parents 4063a6e + 5fbef6f commit 566586e
Showing 38 changed files with 1,396 additions and 1,970 deletions.
19 changes: 0 additions & 19 deletions .dockerignore

This file was deleted.

2 changes: 0 additions & 2 deletions .env.example
@@ -1,5 +1,3 @@
-DATABASE_URL=
-DATABASE_MIGRATION_URL=
 OPENAI_API_KEY=
 HF_API_KEY=
 PINECONE_API_KEY=
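With the database gone, only the provider API keys above remain in the example environment. A minimal sketch of how a consumer might read them; the loading code is illustrative and not part of this PR:

import os

# DATABASE_URL and DATABASE_MIGRATION_URL were removed in this PR;
# only the provider API keys are expected now.
openai_api_key = os.environ["OPENAI_API_KEY"]
hf_api_key = os.environ["HF_API_KEY"]
pinecone_api_key = os.environ["PINECONE_API_KEY"]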
3 changes: 2 additions & 1 deletion .gitignore
@@ -7,4 +7,5 @@ superenv/
 .DS_Store
 venv/
 /.vscode
-/.codesandbox
+/.codesandbox
+.pypirc
31 changes: 0 additions & 31 deletions Dockerfile

This file was deleted.

Binary file added dist/nagato_ai-0.0.5-py3-none-any.whl
Binary file added dist/nagato_ai-0.0.5.tar.gz
3 changes: 3 additions & 0 deletions lib/__init__.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+
+from .service import create_finetuned_model, create_vector_embeddings
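This re-export makes the two new service entry points importable from the package root, e.g. from lib import create_finetuned_model, create_vector_embeddings; a fuller usage sketch follows the new lib/service/__init__.py below.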
31 changes: 0 additions & 31 deletions lib/api/ingest.py

This file was deleted.

13 changes: 0 additions & 13 deletions lib/api/invoke.py

This file was deleted.

28 changes: 0 additions & 28 deletions lib/api/webhook.py

This file was deleted.

69 changes: 0 additions & 69 deletions lib/main.py

This file was deleted.

12 changes: 0 additions & 12 deletions lib/models/ingest.py

This file was deleted.

10 changes: 0 additions & 10 deletions lib/routers.py

This file was deleted.

48 changes: 48 additions & 0 deletions lib/service/__init__.py
@@ -0,0 +1,48 @@
+from typing import List, Union
+
+import openai
+from llama_index import Document
+
+from lib.service.embedding import EmbeddingService
+from lib.service.finetune import get_finetuning_service
+
+
+def create_vector_embeddings(
+    type: str, finetune_id: str, url: str = None, content: str = None
+) -> List[Union[Document, None]]:
+    embedding_service = EmbeddingService(type=type, content=content, url=url)
+    documents = embedding_service.generate_documents()
+    nodes = embedding_service.generate_chunks(documents=documents)
+    embedding_service.generate_embeddings(nodes=nodes, finetune_id=finetune_id)
+    return nodes
+
+
+def create_finetuned_model(
+    provider: str,
+    base_model: str,
+    type: str,
+    url: str = None,
+    content: str = None,
+    webhook_url: str = None,
+):
+    embedding_service = EmbeddingService(type=type, url=url, content=content)
+    documents = embedding_service.generate_documents()
+    nodes = embedding_service.generate_chunks(documents=documents)
+    finetunning_service = get_finetuning_service(
+        nodes=nodes,
+        provider=provider,
+        batch_size=5,
+        base_model=base_model,
+        num_questions_per_chunk=1,
+    )
+    training_file = finetunning_service.generate_dataset()
+    formatted_training_file = finetunning_service.validate_dataset(
+        training_file=training_file
+    )
+    finetune = finetunning_service.finetune(
+        training_file=formatted_training_file, webhook_url=webhook_url
+    )
+    if provider == "OPENAI":
+        finetune = openai.FineTune.retrieve(id=finetune.get("id"))
+    finetunning_service.cleanup(training_file=finetune.get("training_file"))
+    return finetune
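These two functions are the package's new public entry points, replacing the deleted FastAPI routes under lib/api/. A minimal usage sketch, assuming valid provider API keys in the environment; the URL, finetune_id, and base model below are placeholders for illustration, not values from this PR:

from lib import create_finetuned_model, create_vector_embeddings

# Placeholder inputs for illustration only.
nodes = create_vector_embeddings(
    type="PDF",
    finetune_id="example-finetune-id",
    url="https://example.com/paper.pdf",
)

finetune = create_finetuned_model(
    provider="OPENAI",
    base_model="gpt-3.5-turbo",
    type="PDF",
    url="https://example.com/paper.pdf",
)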
33 changes: 17 additions & 16 deletions lib/service/embedding.py
@@ -9,48 +9,49 @@
 from sentence_transformers import SentenceTransformer
 
 from lib.service.vectordb import get_vector_service
-from prisma.models import Datasource
 
 
 class EmbeddingService:
-    def __init__(self, datasource: Datasource):
-        self.datasource = datasource
+    def __init__(self, type: str, url: str = None, content: str = None):
+        self.type = type
+        self.url = url
+        self.content = content
 
     def get_datasource_suffix(self) -> str:
         suffixes = {"TXT": ".txt", "PDF": ".pdf", "MARKDOWN": ".md"}
         try:
-            return suffixes[self.datasource.type]
+            return suffixes[self.type]
         except KeyError:
             raise ValueError("Unsupported datasource type")
 
-    async def generate_documents(self) -> List[Document]:
+    def generate_documents(self) -> List[Document]:
         with NamedTemporaryFile(
             suffix=self.get_datasource_suffix(), delete=True
         ) as temp_file:
-            if self.datasource.url:
-                content = requests.get(self.datasource.url).content
+            if self.url:
+                content = requests.get(self.url).content
             else:
-                content = self.datasource.content
+                content = self.content
             temp_file.write(content)
             temp_file.flush()
             reader = SimpleDirectoryReader(input_files=[temp_file.name])
             docs = reader.load_data()
             return docs
 
-    async def generate_chunks(
-        self, documents: List[Document]
-    ) -> List[Union[Document, None]]:
+    def generate_chunks(self, documents: List[Document]) -> List[Union[Document, None]]:
         parser = SimpleNodeParser.from_defaults(chunk_size=350, chunk_overlap=20)
         nodes = parser.get_nodes_from_documents(documents, show_progress=True)
         return nodes
 
-    async def generate_embeddings(
-        self, nodes: List[Union[Document, None]]
+    def generate_embeddings(
+        self,
+        nodes: List[Union[Document, None]],
+        finetune_id: str,
     ) -> List[ndarray]:
-        vectordb = await get_vector_service(
+        vectordb = get_vector_service(
             provider="pinecone",
             index_name="all-minilm-l6-v2",
-            namespace=self.datasource.id,
+            namespace=finetune_id,
             dimension=384,
         )
         model = SentenceTransformer(
@@ -65,7 +66,7 @@ async def generate_embeddings(
                 {**node.metadata, "content": node.text},
             )
             embeddings.append(embedding)
-        await vectordb.upsert(vectors=embeddings)
+        vectordb.upsert(vectors=embeddings)
         return embeddings
 
     # def generate_query(self):
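Since generate_documents, generate_chunks, and generate_embeddings are now synchronous, callers no longer need an event loop. A minimal sketch of driving the class directly; the URL and finetune_id are placeholder values:

from lib.service.embedding import EmbeddingService

# Placeholder inputs; any reachable PDF URL would do.
service = EmbeddingService(type="PDF", url="https://example.com/doc.pdf")
documents = service.generate_documents()
nodes = service.generate_chunks(documents=documents)
service.generate_embeddings(nodes=nodes, finetune_id="example-namespace")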