Skip to content

Commit

Permalink
Fixed HF download + gui stable (#65)
Browse files Browse the repository at this point in the history
* Update load_env.py

* Create utils.py

* Update gui.py

* Update startLLM.py

* Update ask_libgen.py

* Update ingest.py

* Delete meta.json

* Update README.md
  • Loading branch information
su77ungr authored May 16, 2023
1 parent 5c8e465 commit 9d17328
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 68 deletions.
32 changes: 9 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,6 @@ for older docker without GUI use `casalioy:latest` might deprecate soon

> Fetch the default models
```
cd models
wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin &&
wget https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin
cd ../
```

> All set! Proceed with ingesting your [dataset](#ingesting-your-own-dataset)
### Build it from source
Expand All @@ -74,20 +67,12 @@ pip uninstall -y llama-cpp-python
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python
```

> Download the 2 models and place them in a folder called `./models`:
- LLM: default
is [ggml-vic7b-q5_1](https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin)
- Embedding: default
to [ggml-model-q4_0](https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin).

> > Edit the example.env to fit your models and rename it to .env
```env
# Generic
# Generic
MODEL_N_CTX=1024
TEXT_EMBEDDINGS_MODEL=all-MiniLM-L6-v2
TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2
TEXT_EMBEDDINGS_MODEL_TYPE=HF # LlamaCpp or HF
USE_MLOCK=true
Expand All @@ -99,10 +84,12 @@ INGEST_CHUNK_OVERLAP=50
# Generation
MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
MODEL_PATH=models/ggml-vic7b-q5_1.bin
MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin
MODEL_TEMP=0.8
MODEL_STOP=[STOP]
CHAIN_TYPE=stuff
N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db
N_FORWARD_DOCUMENTS=6 # How many documents to forward to the LLM, chosen among those retrieved
```

This should look like this
Expand All @@ -111,22 +98,21 @@ This should look like this
└── repo
├── startLLM.py
├── casalioy
│ └── ingest.py, load_env.py, startLLM.py, gui.py, __init__.py
│ └── ingest.py, load_env.py, startLLM.py, gui.py, ...
├── source_documents
│ └── sample.csv
│ └── shor.pdfstate_of_the_union.txt
│ └── state_of_the_union.txt
│ └── ...
├── models
│ ├── ggml-vic7b-q5_1.bin
│ └── ggml-model-q4_0.bin
│ └── ...
└── .env, convert.py, Dockerfile
```

## Ingesting your own dataset

To automatically ingest different data types (.txt, .pdf, .csv, .epub, .html, .docx, .pptx, .eml, .msg)

> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/tree/main/source_documents)
> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/tree/main/source_documents)
> inside `source_documents` to run tests with.
```shell
Expand Down Expand Up @@ -181,7 +167,6 @@ streamlit run casalioy/gui.py

| Model | BoolQ | PIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | Avg. |
|:-------------------------------------------------------------------------------------------------------------------------------------------------|:-----:|:----:|:---------:|:----------:|:-----:|:-----:|:----:|:----:|
| [ggml-vic-7b-uncensored](https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin) | 73.4 | 74.8 | 63.4 | 64.7 | 54.9 | 36.0 | 40.2 | 58.2 |
| [GPT4All-13b-snoozy q5](https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/blob/main/GPT4All-13B-snoozy.ggml.q5_1.bin) | 83.3 | 79.2 | 75.0 | 71.3 | 60.9 | 44.2 | 43.4 | 65.3 |

### models inside of the GPT-J ecosphere
Expand Down Expand Up @@ -224,6 +209,7 @@ leaving your environment, and with reasonable performance.

<br><br>


# Disclaimer

The contents of this repository are provided "as is" and without warranties of any kind, whether express or implied. We
Expand Down
8 changes: 4 additions & 4 deletions casalioy/ask_libgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@
model_temp,
n_gpu_layers,
persist_directory,
print_HTML,
prompt_HTML,
use_mlock,
)
from casalioy.startLLM import QASystem
from casalioy.utils import print_HTML, prompt_HTML

max_doc_size_mb = 10
max_doc_size_mb = 5
out_path = Path("source_documents/libgen")

logging.getLogger().setLevel(logging.WARNING) # because libgenesis changes it
Expand All @@ -38,9 +37,10 @@

def load_documents(keyword: str, n: int = 3) -> None:
"""load random documents from LG using keyword"""
lg = Libgen()
lg = Libgen(result_limit=100)
result = asyncio.run(lg.search(keyword))
dl_N = 0
print_HTML(f"<r>Searching for interesting documents (max {n})</r>")
with ProgressBar() as pb:
for item_id in pb(result):
if dl_N >= n:
Expand Down
3 changes: 1 addition & 2 deletions casalioy/gui.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
"""LLM through a GUI"""

import streamlit as st
from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, print_HTML, use_mlock
from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, use_mlock, print_HTML
from streamlit_chat import message
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.colored_header import colored_header

from casalioy import startLLM
from casalioy.startLLM import QASystem

title = "CASALIOY"


Expand Down
4 changes: 3 additions & 1 deletion casalioy/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
UnstructuredPowerPointLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory, print_HTML, prompt_HTML
from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory
from prompt_toolkit import PromptSession
from prompt_toolkit.shortcuts import ProgressBar
from qdrant_client import QdrantClient, models

from casalioy.utils import print_HTML, prompt_HTML


class Ingester:
"""ingest documents"""
Expand Down
42 changes: 7 additions & 35 deletions casalioy/load_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings
from langchain.prompts import PromptTemplate
from prompt_toolkit import HTML, PromptSession, print_formatted_text
from prompt_toolkit.styles import Style
from pyexpat import ExpatError

load_dotenv()
from casalioy.utils import download_if_repo, print_HTML

# generic
load_dotenv()
text_embeddings_model = os.environ.get("TEXT_EMBEDDINGS_MODEL")
text_embeddings_model_type = os.environ.get("TEXT_EMBEDDINGS_MODEL_TYPE")
model_n_ctx = int(os.environ.get("MODEL_N_CTX"))
use_mlock = os.environ.get("USE_MLOCK").lower() == "true"

print_HTML
# ingest
persist_directory = os.environ.get("PERSIST_DIRECTORY")
documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
Expand All @@ -30,8 +28,12 @@
model_stop = os.environ.get("MODEL_STOP", "")
model_stop = model_stop.split(",") if model_stop else []
chain_type = os.environ.get("CHAIN_TYPE", "refine")
n_retrieve_documents = int(os.environ.get("N_RETRIEVE_DOCUMENTS", 25))
n_forward_documents = int(os.environ.get("N_FORWARD_DOCUMENTS", 3))
n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", 0))

text_embeddings_model = download_if_repo(text_embeddings_model)
model_path = download_if_repo(model_path)

def get_embedding_model() -> tuple[HuggingFaceEmbeddings | LlamaCppEmbeddings, Callable]:
"""get the text embedding model
Expand Down Expand Up @@ -86,33 +88,3 @@ def get_prompt_template_kwargs() -> dict[str, PromptTemplate]:
}
case _:
return {}


# Shared prompt_toolkit style sheet, passed as `style=` by print_HTML and prompt_HTML below.
# Keys are the tag names used in the markup strings (e.g. "<r>...</r>");
# values are prompt_toolkit style strings.
style = Style.from_dict(
    {
        "r": "italic gray",  # remark
        "w": "italic yellow",  # warning
        "d": "bold red",  # danger
        "b": "bold",
        "i": "italic",
        "question": "ansicyan",
        "answer": "ansigreen",
        "source": "ansimagenta",
    }
)


def print_HTML(text: str, **kwargs) -> None:
    """Print `text` as styled prompt_toolkit HTML markup.

    `kwargs` are substituted into the markup through ``HTML.format``. When
    building the markup raises ``ExpatError`` or ``IndexError``, the raw,
    unformatted `text` is printed with the built-in ``print`` instead.
    """
    try:
        markup = HTML(text).format(**kwargs)
        print_formatted_text(markup, style=style)
    except (ExpatError, IndexError):
        # Best-effort fallback: show the template as-is rather than crash.
        print(text)


def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str:
    """Ask the user for input, showing `prompt` as styled prompt_toolkit HTML markup.

    Args:
        session: prompt session used to read the user's input.
        prompt: markup template; `kwargs` are substituted through ``HTML.format``.

    Returns:
        The text entered by the user.
    """
    try:
        return session.prompt(HTML(prompt).format(**kwargs), style=style)
    except (ExpatError, IndexError):
        # The markup could not be built: fall back to a plain, unstyled prompt.
        # Previously this branch only printed the prompt and implicitly returned
        # None, so callers never received any user input.
        return input(prompt)
6 changes: 4 additions & 2 deletions casalioy/startLLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
model_stop,
model_temp,
model_type,
n_forward_documents,
n_gpu_layers,
n_retrieve_documents,
persist_directory,
print_HTML,
prompt_HTML,
use_mlock,
)
from casalioy.utils import print_HTML, prompt_HTML


class QASystem:
Expand Down Expand Up @@ -86,6 +87,7 @@ def __init__(
return_source_documents=True,
chain_type_kwargs=get_prompt_template_kwargs(),
)
self.qa.retriever.search_kwargs = {**self.qa.retriever.search_kwargs, "k": n_forward_documents, "fetch_k": n_retrieve_documents}

def prompt_once(self, query: str) -> tuple[str, str]:
"""run a prompt"""
Expand Down
85 changes: 85 additions & 0 deletions casalioy/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""some useful functions"""
from pathlib import Path

from huggingface_hub import snapshot_download
from huggingface_hub.utils import HFValidationError, validate_repo_id
from prompt_toolkit import HTML, PromptSession, print_formatted_text
from prompt_toolkit.styles import Style
from pyexpat import ExpatError
from requests import HTTPError

# Shared prompt_toolkit style sheet, passed as `style=` by print_HTML and prompt_HTML below.
# Keys are the tag names used in the markup strings (e.g. "<r>...</r>");
# values are prompt_toolkit style strings.
style = Style.from_dict(
    {
        "r": "italic gray",  # remark
        "w": "italic yellow",  # warning
        "d": "bold red",  # danger
        "b": "bold",
        "i": "italic",
        "question": "ansicyan",
        "answer": "ansigreen",
        "source": "ansimagenta",
    }
)


def print_HTML(text: str, **kwargs) -> None:
    """Print `text` as styled prompt_toolkit HTML markup.

    Every keyword value is converted with ``str`` and, like `text` itself, has
    its form-feed characters removed before being substituted into the markup.
    If building the markup raises ``ExpatError``, the (form-feed-free) template
    is printed with the built-in ``print`` instead.
    """
    try:
        # NOTE(review): the original marks the form-feed stripping as "necessary";
        # presumably the XML parser behind HTML rejects "\f" - confirm.
        cleaned = {name: str(value).replace("\f", "") for name, value in kwargs.items()}
        text = text.replace("\f", "")
        markup = HTML(text).format(**cleaned)
        print_formatted_text(markup, style=style)
    except ExpatError:
        print(text)


def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str:
    """Ask the user for input, showing `prompt` as styled prompt_toolkit HTML markup.

    Every keyword value is converted with ``str`` and, like `prompt` itself, has
    its form-feed characters removed before being substituted into the markup.
    If building the markup raises ``ExpatError``, the user is asked through the
    built-in ``input`` with the (form-feed-free) template instead.

    Returns:
        The text entered by the user.
    """
    try:
        # NOTE(review): the original marks the form-feed stripping as "necessary";
        # presumably the XML parser behind HTML rejects "\f" - confirm.
        cleaned = {name: str(value).replace("\f", "") for name, value in kwargs.items()}
        prompt = prompt.replace("\f", "")
        markup = HTML(prompt).format(**cleaned)
        return session.prompt(markup, style=style)
    except ExpatError:
        return input(prompt)


def download_if_repo(path: str, file: str = None, allow_patterns: str | list[str] = None) -> str:
"""download model from HF if not local"""
if allow_patterns is None:
allow_patterns = ["*.bin", "*.json"]
p = Path("models/"+path)
if p.is_file() or p.is_dir():
print(p, "already installed")
return str(p)

try:
split = path.split("/")
is_dataset = split[0] == "datasets"
if is_dataset:
split = split[1:]
path = "/".join(split)

if path.endswith(".bin"):
path, file = "/".join(split[: 3 if is_dataset else 2]), split[-1]
validate_repo_id(path)
print_HTML("<r>Downloading {model} from HF</r>", model=path)
new_path = Path(
snapshot_download(
repo_id=path,
allow_patterns=file or allow_patterns,
local_dir=f"models/{path}",
repo_type="dataset" if is_dataset else None,
local_dir_use_symlinks=False,
)
)
if file is not None:
files = [f for f in new_path.iterdir() if f.is_file() and f.name.endswith(".bin")]
if len(files) > 1:
names = "\n".join([f" - {f.name}" for f in files])
raise ValueError(f"Multiple model files found: \n\n{names}\n\n")
new_path = files[0]
return str(new_path.resolve())

except (HFValidationError, HTTPError) as e:
print_HTML("<w>Could not download model {model} from HF: {e}</w>", model=path, e=e)
1 change: 0 additions & 1 deletion meta.json

This file was deleted.

0 comments on commit 9d17328

Please sign in to comment.