From 9d173280f2edeef2581f82f17be576d24bc3a0fc Mon Sep 17 00:00:00 2001 From: su77ungr <69374354+su77ungr@users.noreply.github.com> Date: Tue, 16 May 2023 22:39:33 +0200 Subject: [PATCH] Fixed HF download + gui stable (#65) * Update load_env.py * Create utils.py * Update gui.py * Update startLLM.py * Update ask_libgen.py * Update ingest.py * Delete meta.json * Update README.md --- README.md | 32 +++++----------- casalioy/ask_libgen.py | 8 ++-- casalioy/gui.py | 3 +- casalioy/ingest.py | 4 +- casalioy/load_env.py | 42 ++++----------------- casalioy/startLLM.py | 6 ++- casalioy/utils.py | 85 ++++++++++++++++++++++++++++++++++++++++++ meta.json | 1 - 8 files changed, 113 insertions(+), 68 deletions(-) create mode 100644 casalioy/utils.py delete mode 100644 meta.json diff --git a/README.md b/README.md index f66861c..f4af62f 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,6 @@ for older docker without GUI use `casalioy:latest` might deprecate soon > Fetch the default models -``` -cd models -wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin && -wget https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin -cd ../ -``` - > All set! Proceed with ingesting your [dataset](#ingesting-your-own-dataset) ### Build it from source @@ -74,20 +67,12 @@ pip uninstall -y llama-cpp-python CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python ``` -> Download the 2 models and place them in a folder called `./models`: - -- LLM: default - is [ggml-vic7b-q5_1](https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin) -- Embedding: default - to [ggml-model-q4_0](https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin). - > > Edit the example.env to fit your models and rename it to .env ```env # Generic -# Generic MODEL_N_CTX=1024 -TEXT_EMBEDDINGS_MODEL=all-MiniLM-L6-v2 +TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2 TEXT_EMBEDDINGS_MODEL_TYPE=HF # LlamaCpp or HF USE_MLOCK=true @@ -99,10 +84,12 @@ INGEST_CHUNK_OVERLAP=50 # Generation MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp -MODEL_PATH=models/ggml-vic7b-q5_1.bin +MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin MODEL_TEMP=0.8 MODEL_STOP=[STOP] CHAIN_TYPE=stuff +N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db +N_FORWARD_DOCUMENTS=6 # How many documents to forward to the LLM, chosen among those retrieved ``` This should look like this @@ -111,14 +98,13 @@ This should look like this └── repo ├── startLLM.py ├── casalioy - │ └── ingest.py, load_env.py, startLLM.py, gui.py, __init__.py + │ └── ingest.py, load_env.py, startLLM.py, gui.py, ... ├── source_documents │ └── sample.csv - │ └── shor.pdfstate_of_the_union.txt - │ └── state_of_the_union.txt + │ └── ... ├── models │ ├── ggml-vic7b-q5_1.bin - │ └── ggml-model-q4_0.bin + │ └── ... └── .env, convert.py, Dockerfile ``` @@ -126,7 +112,7 @@ This should look like this To automatically ingest different data types (.txt, .pdf, .csv, .epub, .html, .docx, .pptx, .eml, .msg) -> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/tree/main/source_documents) +> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/main/source_documents/) > inside `source_documents` to run tests with. ```shell @@ -181,7 +167,6 @@ streamlit run casalioy/gui.py | Model | BoolQ | PIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | Avg. 
| |:-------------------------------------------------------------------------------------------------------------------------------------------------|:-----:|:----:|:---------:|:----------:|:-----:|:-----:|:----:|:----:| -| [ggml-vic-7b-uncensored](https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin) | 73.4 | 74.8 | 63.4 | 64.7 | 54.9 | 36.0 | 40.2 | 58.2 | | [GPT4All-13b-snoozy q5](https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/blob/main/GPT4All-13B-snoozy.ggml.q5_1.bin) | 83.3 | 79.2 | 75.0 | 71.3 | 60.9 | 44.2 | 43.4 | 65.3 | ### models inside of the GPT-J ecosphere @@ -224,6 +209,7 @@ leaving your environment, and with reasonable performance.

+ # Disclaimer The contents of this repository are provided "as is" and without warranties of any kind, whether express or implied. We diff --git a/casalioy/ask_libgen.py b/casalioy/ask_libgen.py index 17088d0..3d634aa 100644 --- a/casalioy/ask_libgen.py +++ b/casalioy/ask_libgen.py @@ -20,13 +20,12 @@ model_temp, n_gpu_layers, persist_directory, - print_HTML, - prompt_HTML, use_mlock, ) from casalioy.startLLM import QASystem +from casalioy.utils import print_HTML, prompt_HTML -max_doc_size_mb = 10 +max_doc_size_mb = 5 out_path = Path("source_documents/libgen") logging.getLogger().setLevel(logging.WARNING) # because libgenesis changes it @@ -38,9 +37,10 @@ def load_documents(keyword: str, n: int = 3) -> None: """load random documents from LG using keyword""" - lg = Libgen() + lg = Libgen(result_limit=100) result = asyncio.run(lg.search(keyword)) dl_N = 0 + print_HTML(f"Searching for interesting documents (max {n})") with ProgressBar() as pb: for item_id in pb(result): if dl_N >= n: diff --git a/casalioy/gui.py b/casalioy/gui.py index 86cca2c..bde1001 100644 --- a/casalioy/gui.py +++ b/casalioy/gui.py @@ -1,14 +1,13 @@ """LLM through a GUI""" import streamlit as st -from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, print_HTML, use_mlock +from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, use_mlock, print_HTML from streamlit_chat import message from streamlit_extras.add_vertical_space import add_vertical_space from streamlit_extras.colored_header import colored_header from casalioy import startLLM from casalioy.startLLM import QASystem - title = "CASALIOY" diff --git a/casalioy/ingest.py b/casalioy/ingest.py index 49543ae..358229b 100644 --- a/casalioy/ingest.py +++ b/casalioy/ingest.py @@ -20,11 +20,13 @@ UnstructuredPowerPointLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter -from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory, print_HTML, prompt_HTML +from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory from prompt_toolkit import PromptSession from prompt_toolkit.shortcuts import ProgressBar from qdrant_client import QdrantClient, models +from casalioy.utils import print_HTML, prompt_HTML + class Ingester: """ingest documents""" diff --git a/casalioy/load_env.py b/casalioy/load_env.py index 88021e2..34306de 100644 --- a/casalioy/load_env.py +++ b/casalioy/load_env.py @@ -5,18 +5,16 @@ from dotenv import load_dotenv from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings from langchain.prompts import PromptTemplate -from prompt_toolkit import HTML, PromptSession, print_formatted_text -from prompt_toolkit.styles import Style -from pyexpat import ExpatError -load_dotenv() +from casalioy.utils import download_if_repo, print_HTML -# generic +load_dotenv() text_embeddings_model = os.environ.get("TEXT_EMBEDDINGS_MODEL") text_embeddings_model_type = os.environ.get("TEXT_EMBEDDINGS_MODEL_TYPE") model_n_ctx = int(os.environ.get("MODEL_N_CTX")) use_mlock = os.environ.get("USE_MLOCK").lower() == "true" +print_HTML # ingest persist_directory = os.environ.get("PERSIST_DIRECTORY") documents_directory = os.environ.get("DOCUMENTS_DIRECTORY") @@ -30,8 +28,12 @@ model_stop = os.environ.get("MODEL_STOP", "") model_stop = model_stop.split(",") if model_stop else [] chain_type = os.environ.get("CHAIN_TYPE", 
"refine") +n_retrieve_documents = int(os.environ.get("N_RETRIEVE_DOCUMENTS", 25)) +n_forward_documents = int(os.environ.get("N_FORWARD_DOCUMENTS", 3)) n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", 0)) +text_embeddings_model = download_if_repo(text_embeddings_model) +model_path = download_if_repo(model_path) def get_embedding_model() -> tuple[HuggingFaceEmbeddings | LlamaCppEmbeddings, Callable]: """get the text embedding model @@ -86,33 +88,3 @@ def get_prompt_template_kwargs() -> dict[str, PromptTemplate]: } case _: return {} - - -style = Style.from_dict( - { - "r": "italic gray", # remark - "w": "italic yellow", # warning - "d": "bold red", # danger - "b": "bold", - "i": "italic", - "question": "ansicyan", - "answer": "ansigreen", - "source": "ansimagenta", - } -) - - -def print_HTML(text: str, **kwargs) -> None: - """print formatted HTML text""" - try: - print_formatted_text(HTML(text).format(**kwargs), style=style) - except (ExpatError, IndexError): - print(text) - - -def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str: - """print formatted HTML text""" - try: - return session.prompt(HTML(prompt).format(**kwargs), style=style) - except (ExpatError, IndexError): - print(prompt) diff --git a/casalioy/startLLM.py b/casalioy/startLLM.py index 38ab3b7..daf439e 100644 --- a/casalioy/startLLM.py +++ b/casalioy/startLLM.py @@ -18,12 +18,13 @@ model_stop, model_temp, model_type, + n_forward_documents, n_gpu_layers, + n_retrieve_documents, persist_directory, - print_HTML, - prompt_HTML, use_mlock, ) +from casalioy.utils import print_HTML, prompt_HTML class QASystem: @@ -86,6 +87,7 @@ def __init__( return_source_documents=True, chain_type_kwargs=get_prompt_template_kwargs(), ) + self.qa.retriever.search_kwargs = {**self.qa.retriever.search_kwargs, "k": n_forward_documents, "fetch_k": n_retrieve_documents} def prompt_once(self, query: str) -> tuple[str, str]: """run a prompt""" diff --git a/casalioy/utils.py b/casalioy/utils.py new file mode 100644 index 0000000..1a0bd1d --- /dev/null +++ b/casalioy/utils.py @@ -0,0 +1,85 @@ +"""some useful functions""" +from pathlib import Path + +from huggingface_hub import snapshot_download +from huggingface_hub.utils import HFValidationError, validate_repo_id +from prompt_toolkit import HTML, PromptSession, print_formatted_text +from prompt_toolkit.styles import Style +from pyexpat import ExpatError +from requests import HTTPError + +style = Style.from_dict( + { + "r": "italic gray", # remark + "w": "italic yellow", # warning + "d": "bold red", # danger + "b": "bold", + "i": "italic", + "question": "ansicyan", + "answer": "ansigreen", + "source": "ansimagenta", + } +) + + +def print_HTML(text: str, **kwargs) -> None: + """print formatted HTML text""" + try: + for k, v in kwargs.items(): # necessary + kwargs[k] = str(v).replace("\f", "") + text = text.replace("\f", "") + print_formatted_text(HTML(text).format(**kwargs), style=style) + except ExpatError: + print(text) + + +def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str: + """print formatted HTML text""" + try: + for k, v in kwargs.items(): # necessary + kwargs[k] = str(v).replace("\f", "") + prompt = prompt.replace("\f", "") + return session.prompt(HTML(prompt).format(**kwargs), style=style) + except ExpatError: + return input(prompt) + + +def download_if_repo(path: str, file: str = None, allow_patterns: str | list[str] = None) -> str: + """download model from HF if not local""" + if allow_patterns is None: + allow_patterns = ["*.bin", "*.json"] + p = 
Path("models/"+path) + if p.is_file() or p.is_dir(): + print(p, "already installed") + return str(p) + + try: + split = path.split("/") + is_dataset = split[0] == "datasets" + if is_dataset: + split = split[1:] + path = "/".join(split) + + if path.endswith(".bin"): + path, file = "/".join(split[: 3 if is_dataset else 2]), split[-1] + validate_repo_id(path) + print_HTML("Downloading {model} from HF", model=path) + new_path = Path( + snapshot_download( + repo_id=path, + allow_patterns=file or allow_patterns, + local_dir=f"models/{path}", + repo_type="dataset" if is_dataset else None, + local_dir_use_symlinks=False, + ) + ) + if file is not None: + files = [f for f in new_path.iterdir() if f.is_file() and f.name.endswith(".bin")] + if len(files) > 1: + names = "\n".join([f" - {f.name}" for f in files]) + raise ValueError(f"Multiple model files found: \n\n{names}\n\n") + new_path = files[0] + return str(new_path.resolve()) + + except (HFValidationError, HTTPError) as e: + print_HTML("Could not download model {model} from HF: {e}", model=path, e=e) diff --git a/meta.json b/meta.json deleted file mode 100644 index 5af20a4..0000000 --- a/meta.json +++ /dev/null @@ -1 +0,0 @@ -{"collections": {"db": {"vectors": {"size": 4096, "distance": "Cosine", "hnsw_config": null, "quantization_config": null}, "shard_number": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null}}, "aliases": {}}