From 9d173280f2edeef2581f82f17be576d24bc3a0fc Mon Sep 17 00:00:00 2001 From: su77ungr <69374354+su77ungr@users.noreply.github.com> Date: Tue, 16 May 2023 22:39:33 +0200 Subject: [PATCH] Fixed HF download + gui stable (#65) * Update load_env.py * Create utils.py * Update gui.py * Update startLLM.py * Update ask_libgen.py * Update ingest.py * Delete meta.json * Update README.md --- README.md | 32 +++++----------- casalioy/ask_libgen.py | 8 ++-- casalioy/gui.py | 3 +- casalioy/ingest.py | 4 +- casalioy/load_env.py | 42 ++++----------------- casalioy/startLLM.py | 6 ++- casalioy/utils.py | 85 ++++++++++++++++++++++++++++++++++++++++++ meta.json | 1 - 8 files changed, 113 insertions(+), 68 deletions(-) create mode 100644 casalioy/utils.py delete mode 100644 meta.json diff --git a/README.md b/README.md index f66861c..f4af62f 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,6 @@ for older docker without GUI use `casalioy:latest` might deprecate soon > Fetch the default models -``` -cd models -wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin && -wget https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin -cd ../ -``` - > All set! Proceed with ingesting your [dataset](#ingesting-your-own-dataset) ### Build it from source @@ -74,20 +67,12 @@ pip uninstall -y llama-cpp-python CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python ``` -> Download the 2 models and place them in a folder called `./models`: - -- LLM: default - is [ggml-vic7b-q5_1](https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin) -- Embedding: default - to [ggml-model-q4_0](https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin). - > > Edit the example.env to fit your models and rename it to .env ```env # Generic -# Generic MODEL_N_CTX=1024 -TEXT_EMBEDDINGS_MODEL=all-MiniLM-L6-v2 +TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2 TEXT_EMBEDDINGS_MODEL_TYPE=HF # LlamaCpp or HF USE_MLOCK=true @@ -99,10 +84,12 @@ INGEST_CHUNK_OVERLAP=50 # Generation MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp -MODEL_PATH=models/ggml-vic7b-q5_1.bin +MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin MODEL_TEMP=0.8 MODEL_STOP=[STOP] CHAIN_TYPE=stuff +N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db +N_FORWARD_DOCUMENTS=6 # How many documents to forward to the LLM, chosen among those retrieved ``` This should look like this @@ -111,14 +98,13 @@ This should look like this └── repo ├── startLLM.py ├── casalioy - │ └── ingest.py, load_env.py, startLLM.py, gui.py, __init__.py + │ └── ingest.py, load_env.py, startLLM.py, gui.py, ... ├── source_documents │ └── sample.csv - │ └── shor.pdfstate_of_the_union.txt - │ └── state_of_the_union.txt + │ └── ... ├── models │ ├── ggml-vic7b-q5_1.bin - │ └── ggml-model-q4_0.bin + │ └── ... └── .env, convert.py, Dockerfile ``` @@ -126,7 +112,7 @@ This should look like this To automatically ingest different data types (.txt, .pdf, .csv, .epub, .html, .docx, .pptx, .eml, .msg) -> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/tree/main/source_documents) +> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/main/source_documents/) > inside `source_documents` to run tests with. ```shell @@ -181,7 +167,6 @@ streamlit run casalioy/gui.py | Model | BoolQ | PIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | Avg. 
| |:-------------------------------------------------------------------------------------------------------------------------------------------------|:-----:|:----:|:---------:|:----------:|:-----:|:-----:|:----:|:----:| -| [ggml-vic-7b-uncensored](https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin) | 73.4 | 74.8 | 63.4 | 64.7 | 54.9 | 36.0 | 40.2 | 58.2 | | [GPT4All-13b-snoozy q5](https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/blob/main/GPT4All-13B-snoozy.ggml.q5_1.bin) | 83.3 | 79.2 | 75.0 | 71.3 | 60.9 | 44.2 | 43.4 | 65.3 | ### models inside of the GPT-J ecosphere @@ -224,6 +209,7 @@ leaving your environment, and with reasonable performance.

+ # Disclaimer The contents of this repository are provided "as is" and without warranties of any kind, whether express or implied. We diff --git a/casalioy/ask_libgen.py b/casalioy/ask_libgen.py index 17088d0..3d634aa 100644 --- a/casalioy/ask_libgen.py +++ b/casalioy/ask_libgen.py @@ -20,13 +20,12 @@ model_temp, n_gpu_layers, persist_directory, - print_HTML, - prompt_HTML, use_mlock, ) from casalioy.startLLM import QASystem +from casalioy.utils import print_HTML, prompt_HTML -max_doc_size_mb = 10 +max_doc_size_mb = 5 out_path = Path("source_documents/libgen") logging.getLogger().setLevel(logging.WARNING) # because libgenesis changes it @@ -38,9 +37,10 @@ def load_documents(keyword: str, n: int = 3) -> None: """load random documents from LG using keyword""" - lg = Libgen() + lg = Libgen(result_limit=100) result = asyncio.run(lg.search(keyword)) dl_N = 0 + print_HTML(f"Searching for interesting documents (max {n})") with ProgressBar() as pb: for item_id in pb(result): if dl_N >= n: diff --git a/casalioy/gui.py b/casalioy/gui.py index 86cca2c..bde1001 100644 --- a/casalioy/gui.py +++ b/casalioy/gui.py @@ -1,14 +1,13 @@ """LLM through a GUI""" import streamlit as st -from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, print_HTML, use_mlock +from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, use_mlock, print_HTML from streamlit_chat import message from streamlit_extras.add_vertical_space import add_vertical_space from streamlit_extras.colored_header import colored_header from casalioy import startLLM from casalioy.startLLM import QASystem - title = "CASALIOY" diff --git a/casalioy/ingest.py b/casalioy/ingest.py index 49543ae..358229b 100644 --- a/casalioy/ingest.py +++ b/casalioy/ingest.py @@ -20,11 +20,13 @@ UnstructuredPowerPointLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter -from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory, print_HTML, prompt_HTML +from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory from prompt_toolkit import PromptSession from prompt_toolkit.shortcuts import ProgressBar from qdrant_client import QdrantClient, models +from casalioy.utils import print_HTML, prompt_HTML + class Ingester: """ingest documents""" diff --git a/casalioy/load_env.py b/casalioy/load_env.py index 88021e2..34306de 100644 --- a/casalioy/load_env.py +++ b/casalioy/load_env.py @@ -5,18 +5,16 @@ from dotenv import load_dotenv from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings from langchain.prompts import PromptTemplate -from prompt_toolkit import HTML, PromptSession, print_formatted_text -from prompt_toolkit.styles import Style -from pyexpat import ExpatError -load_dotenv() +from casalioy.utils import download_if_repo, print_HTML -# generic +load_dotenv() text_embeddings_model = os.environ.get("TEXT_EMBEDDINGS_MODEL") text_embeddings_model_type = os.environ.get("TEXT_EMBEDDINGS_MODEL_TYPE") model_n_ctx = int(os.environ.get("MODEL_N_CTX")) use_mlock = os.environ.get("USE_MLOCK").lower() == "true" +print_HTML # ingest persist_directory = os.environ.get("PERSIST_DIRECTORY") documents_directory = os.environ.get("DOCUMENTS_DIRECTORY") @@ -30,8 +28,12 @@ model_stop = os.environ.get("MODEL_STOP", "") model_stop = model_stop.split(",") if model_stop else [] chain_type = os.environ.get("CHAIN_TYPE", 
"refine") +n_retrieve_documents = int(os.environ.get("N_RETRIEVE_DOCUMENTS", 25)) +n_forward_documents = int(os.environ.get("N_FORWARD_DOCUMENTS", 3)) n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", 0)) +text_embeddings_model = download_if_repo(text_embeddings_model) +model_path = download_if_repo(model_path) def get_embedding_model() -> tuple[HuggingFaceEmbeddings | LlamaCppEmbeddings, Callable]: """get the text embedding model @@ -86,33 +88,3 @@ def get_prompt_template_kwargs() -> dict[str, PromptTemplate]: } case _: return {} - - -style = Style.from_dict( - { - "r": "italic gray", # remark - "w": "italic yellow", # warning - "d": "bold red", # danger - "b": "bold", - "i": "italic", - "question": "ansicyan", - "answer": "ansigreen", - "source": "ansimagenta", - } -) - - -def print_HTML(text: str, **kwargs) -> None: - """print formatted HTML text""" - try: - print_formatted_text(HTML(text).format(**kwargs), style=style) - except (ExpatError, IndexError): - print(text) - - -def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str: - """print formatted HTML text""" - try: - return session.prompt(HTML(prompt).format(**kwargs), style=style) - except (ExpatError, IndexError): - print(prompt) diff --git a/casalioy/startLLM.py b/casalioy/startLLM.py index 38ab3b7..daf439e 100644 --- a/casalioy/startLLM.py +++ b/casalioy/startLLM.py @@ -18,12 +18,13 @@ model_stop, model_temp, model_type, + n_forward_documents, n_gpu_layers, + n_retrieve_documents, persist_directory, - print_HTML, - prompt_HTML, use_mlock, ) +from casalioy.utils import print_HTML, prompt_HTML class QASystem: @@ -86,6 +87,7 @@ def __init__( return_source_documents=True, chain_type_kwargs=get_prompt_template_kwargs(), ) + self.qa.retriever.search_kwargs = {**self.qa.retriever.search_kwargs, "k": n_forward_documents, "fetch_k": n_retrieve_documents} def prompt_once(self, query: str) -> tuple[str, str]: """run a prompt""" diff --git a/casalioy/utils.py b/casalioy/utils.py new file mode 100644 index 0000000..1a0bd1d --- /dev/null +++ b/casalioy/utils.py @@ -0,0 +1,85 @@ +"""some useful functions""" +from pathlib import Path + +from huggingface_hub import snapshot_download +from huggingface_hub.utils import HFValidationError, validate_repo_id +from prompt_toolkit import HTML, PromptSession, print_formatted_text +from prompt_toolkit.styles import Style +from pyexpat import ExpatError +from requests import HTTPError + +style = Style.from_dict( + { + "r": "italic gray", # remark + "w": "italic yellow", # warning + "d": "bold red", # danger + "b": "bold", + "i": "italic", + "question": "ansicyan", + "answer": "ansigreen", + "source": "ansimagenta", + } +) + + +def print_HTML(text: str, **kwargs) -> None: + """print formatted HTML text""" + try: + for k, v in kwargs.items(): # necessary + kwargs[k] = str(v).replace("\f", "") + text = text.replace("\f", "") + print_formatted_text(HTML(text).format(**kwargs), style=style) + except ExpatError: + print(text) + + +def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str: + """print formatted HTML text""" + try: + for k, v in kwargs.items(): # necessary + kwargs[k] = str(v).replace("\f", "") + prompt = prompt.replace("\f", "") + return session.prompt(HTML(prompt).format(**kwargs), style=style) + except ExpatError: + return input(prompt) + + +def download_if_repo(path: str, file: str = None, allow_patterns: str | list[str] = None) -> str: + """download model from HF if not local""" + if allow_patterns is None: + allow_patterns = ["*.bin", "*.json"] + p = 
Path("models/"+path) + if p.is_file() or p.is_dir(): + print(p, "already installed") + return str(p) + + try: + split = path.split("/") + is_dataset = split[0] == "datasets" + if is_dataset: + split = split[1:] + path = "/".join(split) + + if path.endswith(".bin"): + path, file = "/".join(split[: 3 if is_dataset else 2]), split[-1] + validate_repo_id(path) + print_HTML("Downloading {model} from HF", model=path) + new_path = Path( + snapshot_download( + repo_id=path, + allow_patterns=file or allow_patterns, + local_dir=f"models/{path}", + repo_type="dataset" if is_dataset else None, + local_dir_use_symlinks=False, + ) + ) + if file is not None: + files = [f for f in new_path.iterdir() if f.is_file() and f.name.endswith(".bin")] + if len(files) > 1: + names = "\n".join([f" - {f.name}" for f in files]) + raise ValueError(f"Multiple model files found: \n\n{names}\n\n") + new_path = files[0] + return str(new_path.resolve()) + + except (HFValidationError, HTTPError) as e: + print_HTML("Could not download model {model} from HF: {e}", model=path, e=e) diff --git a/meta.json b/meta.json deleted file mode 100644 index 5af20a4..0000000 --- a/meta.json +++ /dev/null @@ -1 +0,0 @@ -{"collections": {"db": {"vectors": {"size": 4096, "distance": "Cosine", "hnsw_config": null, "quantization_config": null}, "shard_number": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null}}, "aliases": {}}