diff --git a/.gitignore b/.gitignore index 7eea21f..8e309b6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ superenv/ venv/ /.vscode /.codesandbox -.pypirc \ No newline at end of file +.pypirc +dist/ diff --git a/dist/nagato_ai-0.0.8-py3-none-any.whl b/dist/nagato_ai-0.0.8-py3-none-any.whl deleted file mode 100644 index 6b88791..0000000 Binary files a/dist/nagato_ai-0.0.8-py3-none-any.whl and /dev/null differ diff --git a/dist/nagato_ai-0.0.8.tar.gz b/dist/nagato_ai-0.0.8.tar.gz deleted file mode 100644 index 1588d16..0000000 Binary files a/dist/nagato_ai-0.0.8.tar.gz and /dev/null differ diff --git a/nagato/service/finetune.py b/nagato/service/finetune.py index ff407eb..44471e2 100644 --- a/nagato/service/finetune.py +++ b/nagato/service/finetune.py @@ -1,5 +1,6 @@ # flake8: noqa +import sys import requests import json import os @@ -70,7 +71,9 @@ def generate_dataset(self) -> str: with open(training_file, "w") as f: with ThreadPoolExecutor() as executor: progress_bar = tqdm( - total=total_pairs, desc="Generating synthetic Q&A pairs" + total=total_pairs, + desc="Generating synthetic Q&A pairs", + file=sys.stdout, ) for i in range( 0, len(self.nodes), self.batch_size @@ -163,7 +166,11 @@ def validate_dataset(self, training_file: str) -> str: with open(training_file, "r") as f: lines = f.readlines() total_lines = len(lines) - progress_bar = tqdm(total=total_lines, desc="Validating lines") + progress_bar = tqdm( + total=total_lines, + desc="Validating lines", + file=sys.stdout, + ) for i, line in enumerate(lines, start=1): try: data = json.loads(line) diff --git a/nagato/utils/logger.py b/nagato/utils/logger.py index ab807f1..8775341 100644 --- a/nagato/utils/logger.py +++ b/nagato/utils/logger.py @@ -20,7 +20,7 @@ def setup_logger(): style="%", ) - logger = colorlog.getLogger("example") + logger = colorlog.getLogger(__name__) handler = logging.StreamHandler() handler.setFormatter(formatter) logger.addHandler(handler) diff --git a/poetry.lock b/poetry.lock index 734ccd4..62daa27 100644 --- a/poetry.lock +++ b/poetry.lock @@ -154,19 +154,6 @@ files = [ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] -[[package]] -name = "asyncio" -version = "3.4.3" -description = "reference implementation of PEP 3156" -optional = false -python-versions = "*" -files = [ - {file = "asyncio-3.4.3-cp33-none-win32.whl", hash = "sha256:b62c9157d36187eca799c378e572c969f0da87cd5fc42ca372d92cdb06e7e1de"}, - {file = "asyncio-3.4.3-cp33-none-win_amd64.whl", hash = "sha256:c46a87b48213d7464f22d9a497b9eef8c1928b68320a2fa94240f969f6fec08c"}, - {file = "asyncio-3.4.3-py3-none-any.whl", hash = "sha256:c4d18b22701821de07bd6aea8b53d21449ec0ec5680645e5317062ea21817d2d"}, - {file = "asyncio-3.4.3.tar.gz", hash = "sha256:83360ff8bc97980e4ff25c964c7bd3923d333d177aa4f7fb736b019f26c7cb41"}, -] - [[package]] name = "attrs" version = "23.1.0" @@ -434,16 +421,6 @@ files = [ marshmallow = ">=3.18.0,<4.0.0" typing-inspect = ">=0.4.0,<1" -[[package]] -name = "decouple" -version = "0.0.7" -description = "Decoupling logic" -optional = false -python-versions = ">=3.7" -files = [ - {file = "decouple-0.0.7.tar.gz", hash = "sha256:c253c1cc62b76e9720a066f334552548c6de1884bc3bdf456ddb62d61b867b09"}, -] - [[package]] name = "dnspython" version = "2.4.2" @@ -2455,6 +2432,22 @@ files = [ {file = "sentencepiece-0.1.99.tar.gz", hash = "sha256:189c48f5cb2949288f97ccdb97f0473098d9c3dcf5a3d99d4eabe719ec27297f"}, ] +[[package]] +name = "setuptools" +version = "68.2.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"}, + {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -2954,13 +2947,13 @@ files = [ [[package]] name = "unstructured" -version = "0.10.24" +version = "0.10.25" description = "A library that prepares raw documents for downstream ML tasks." optional = false python-versions = ">=3.7.0" files = [ - {file = "unstructured-0.10.24-py3-none-any.whl", hash = "sha256:19bcc5392f2df517a8d2807e338a363025de5e52a9e0b8289ccbe4bc7f08c117"}, - {file = "unstructured-0.10.24.tar.gz", hash = "sha256:c991daf72cce18bcd09535e2e29cd3ff463bb7ab66206e181b1a9f5ced5cf82f"}, + {file = "unstructured-0.10.25-py3-none-any.whl", hash = "sha256:25c67c91970fa425e4afa876a1ec5aba90f082a552e12231fc7de95cb13aea73"}, + {file = "unstructured-0.10.25.tar.gz", hash = "sha256:b826b3f1a48c39d13097d4ad97a37b394ec8d4ccdb4df1847c3920a4cf57c46e"}, ] [package.dependencies] @@ -2982,9 +2975,10 @@ tabulate = "*" [package.extras] airtable = ["pyairtable"] -all-docs = ["ebooklib", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.7)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +all-docs = ["ebooklib", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.9)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] azure = ["adlfs", "fsspec (==2023.9.1)"] azure-cognitive-search = ["azure-search-documents"] +bedrock = ["boto3", "langchain"] biomed = ["bs4"] box = ["boxfs", "fsspec (==2023.9.1)"] confluence = ["atlassian-python-api"] @@ -2995,15 +2989,16 @@ doc = ["python-docx (>=1.0.1)"] docx = ["python-docx (>=1.0.1)"] dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] elasticsearch = ["elasticsearch", "jq"] +embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] epub = ["ebooklib"] gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] github = ["pygithub (>1.58.0)"] gitlab = ["python-gitlab"] google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.7)", "unstructured.pytesseract (>=0.3.12)"] +image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.9)", "unstructured.pytesseract (>=0.3.12)"] jira = ["atlassian-python-api"] -local-inference = ["ebooklib", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.7)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +local-inference = ["ebooklib", "markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.9)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] msg = ["msg-parser"] notion = ["htmlBuilder", "notion-client"] @@ -3013,7 +3008,7 @@ openai = ["langchain", "openai", "tiktoken"] org = ["pypandoc"] outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.7)", "unstructured.pytesseract (>=0.3.12)"] +pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.9)", "unstructured.pytesseract (>=0.3.12)"] ppt = ["python-pptx (<=0.6.21)"] pptx = ["python-pptx (<=0.6.21)"] reddit = ["praw"] @@ -3175,4 +3170,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ba72fbfeacab49ba9925e940dcfbfa625e1950aedef0f2d568dc81cc1c1b30e5" +content-hash = "11e5ccb14c0954e143f89b189dbcadfb93f4663b47fc24f6339e5e34d16ddee6" diff --git a/pyproject.toml b/pyproject.toml index edd4066..2136269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "nagato-ai" -version = "0.0.8" +version = "0.0.15" description = "" authors = ["Ismail Pelaseyed"] readme = "./README.md" @@ -18,7 +18,6 @@ unstructured = "^0.10.16" requests = "^2.31.0" colorlog = "^6.7.0" vulture = "^2.7" -asyncio = "^3.4.3" llama-index = "^0.8.37" pypdf = "^3.16.2" tiktoken = "^0.5.1" @@ -27,7 +26,7 @@ replicate = "^0.15.4" wheel = "^0.41.0" python-dotenv = "^1.0.0" tqdm = "^4.66.1" -decouple = "^0.0.7" +setuptools = "^68.2.2" [build-system] requires = ["poetry-core"] diff --git a/setup.py b/setup.py index f3f0d1a..3ae8cef 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ -from setuptools import find_packages, setup +from setuptools import setup setup( name="nagato-ai", - version="0.0.1", - packages=find_packages(), - description="The open framework for finetuning LLMs on private data", + version="0.0.14", + packages=["nagato"], + description="The open framework for Q&A finetuning LLMs on private data", long_description=open("README.md").read(), long_description_content_type="text/markdown", author="Ismail Pelaseyed", @@ -14,4 +14,25 @@ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", ], + install_requires=[ + "python-decouple>=3.8", + "pydantic>=1.10.7", + "flake8>=6.0.0", + "ruff>=0.0.265", + "black>=23.3.0", + "pinecone-client>=2.2.2", + "unstructured>=0.10.16", + "requests>=2.31.0", + "colorlog>=6.7.0", + "vulture>=2.7", + "llama-index>=0.8.37", + "pypdf>=3.16.2", + "tiktoken>=0.5.1", + "sentence-transformers>=2.2.2", + "replicate>=0.15.4", + "wheel>=0.41.0", + "python-dotenv>=1.0.0", + "tqdm>=4.66.1", + "setuptools>=68.2.2", + ], )