Update AI Q&A Python script
Update langchain modules (now langchain_community is used).
Alpine is no longer supported for these libraries due to
problems installing their dependencies on it.

Groq models are also supported if the proper API key is
exported in your shell.
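The key detection described in the commit message can be sketched as follows; `detect_backend` is a hypothetical helper name used for illustration, not part of the commit:

```python
import os

# Sketch of the backend selection implied by the commit message:
# OPENAI_API_KEY is mandatory, GROQ_API_KEY optionally enables Groq models.
def detect_backend(env=os.environ):
    if "OPENAI_API_KEY" not in env:
        raise SystemExit("OPENAI_API_KEY is mandatory")
    return "groq" if "GROQ_API_KEY" in env else "openai"
```

The actual script keeps the OpenAI LLM as default and swaps in the Groq chat model only when the Groq key is present.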
testillano committed Aug 22, 2024
1 parent dc38c63 commit 1c0ffc3
Showing 6 changed files with 108 additions and 76 deletions.
9 changes: 2 additions & 7 deletions Dockerfile.training
Original file line number Diff line number Diff line change
@@ -19,13 +19,8 @@ RUN if [ "${base_os}" = "alpine" ] ; then apk update ; elif [ "${base_os}" = "ub
RUN if [ "${base_os}" = "alpine" ] ; then apk add dos2unix ; elif [ "${base_os}" = "ubuntu" ] ; then apt-get install -y dos2unix netcat-openbsd ; fi

## Installation for questions & answers (langchain & OpenAI)
RUN if [ "${enable_qa}" = "true" -a "${base_os}" = "alpine" ] ; then apk add python3.9 py3-pip python3-dev ; fi
RUN if [ "${enable_qa}" = "true" -a "${base_os}" = "ubuntu" ] ; then apt-get install -y python3.9 python3-pip ; fi

RUN if [ "${enable_qa}" = "true" -a "${base_os}" = "alpine" ] ; then pip3 install torch -f https://download.pytorch.org/whl/torch_stable.html ; fi

# Why split pip3 installed modules: https://github.com/pypa/pip/issues/1386
RUN if [ "${enable_qa}" = "true" ] ; then pip3 install "langchain==0.0.138" "unstructured==0.5.12" && pip3 install "chromadb==0.3.21" "tiktoken==0.3.3" && pip3 install "openai==0.27.4" ; fi
RUN if [ "${enable_qa}" = "true" -a "${base_os}" = "ubuntu" ] ; then apt-get install -y python3 python3-pip python3.12-venv ; fi
RUN if [ "${enable_qa}" = "true" -a "${base_os}" = "ubuntu" ] ; then python3 -m venv /opt/venv ; /bin/bash -c "source /opt/venv/bin/activate && pip3 install -r tools/questions-and-answers/requirements.txt" ; fi

# Cleanup
RUN if [ "${base_os}" = "alpine" ] ; then rm -rf /var/cache/apk/* ; elif [ "${base_os}" = "ubuntu" ] ; then apt-get clean ; fi
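The virtual-environment pattern the Dockerfile now uses (`python3 -m venv /opt/venv`) can be reproduced with a short sketch; a throwaway directory is used here instead of `/opt/venv`, and `--without-pip` keeps the sketch light:

```python
import os
import subprocess
import sys
import tempfile

# Sketch of the 'python3 -m venv /opt/venv' step from the Dockerfile,
# using a temporary directory to stay side-effect free.
venv_dir = os.path.join(tempfile.mkdtemp(), "venv")
subprocess.run([sys.executable, "-m", "venv", "--without-pip", venv_dir], check=True)

# Inside the image, dependencies would then be installed with the venv's pip:
# /opt/venv/bin/pip3 install -r tools/questions-and-answers/requirements.txt
print(os.path.isdir(venv_dir))
```

Splitting the heavy Q&A dependencies into a venv keeps them isolated from the system Python, which is why the Ubuntu path installs `python3.12-venv`.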
2 changes: 1 addition & 1 deletion README.md
@@ -1554,7 +1554,7 @@ bash-5.1# ./h2agent --verbose &
#### Questions and answers
A conversational bot is available in `./tools/questions-and-answers` directory. It is implemented in python using *langchain* and *OpenAI* (ChatGPT) technology. Check its [README.md](./tools/questions-and-answers/README.md) file to learn more about.
A conversational bot is available in the `./tools/questions-and-answers` directory. It is implemented in Python using *langchain* and *OpenAI* (ChatGPT) technology. A *Groq* model can also be used if the proper API key is detected. Check its [README.md](./tools/questions-and-answers/README.md) file to learn more.
#### Play
6 changes: 3 additions & 3 deletions tools/questions-and-answers/README.md
@@ -3,15 +3,15 @@
Install the requirements by running:

```bash
$ pip3 install -r requirements.txt
$ pip3 install -r tools/questions-and-answers/requirements.txt
```

Then, execute the Python script:

```bash
$ python3.9 tools/questions-and-answers/run.py
$ python3 tools/questions-and-answers/run.py
```

Please note that the conversational bot has certain limitations and, although it relies on the documentation of this project, some answers may be incorrect or incomplete. It is always recommended to carefully read all the documentation, especially the main [README.md](../../README.md) file.

You may use this python script within the training docker image, but all the dependencies must be installed there (a prompt when executing `./tools/training.sh`, ask for it because those dependencies are quite big and the user could prefer to run natively if those requirements are already installed outside).
You may use this Python script within the training docker image, but all the dependencies must be installed there (a prompt shown when executing `./tools/training.sh` asks for confirmation, because those dependencies are quite big and you may prefer to run natively if the requirements are already installed outside). Remember that you will need to activate the Python virtual environment inside the training image to use the script: `source /opt/venv/bin/activate`.
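A quick way to check from Python whether the virtual environment is actually active is to compare interpreter prefixes; this is a sketch based on the standard `sys` module (`sys.prefix` points at the venv directory while `sys.base_prefix` keeps pointing at the base interpreter):

```python
import sys

# Inside an activated venv, sys.prefix differs from sys.base_prefix;
# outside any venv, both are the same path.
def in_virtualenv():
    return sys.prefix != getattr(sys, "base_prefix", sys.prefix)

print("venv active:", in_virtualenv())
```

If this prints `False` inside the training image, running `source /opt/venv/bin/activate` first should fix the imports.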
13 changes: 7 additions & 6 deletions tools/questions-and-answers/requirements.txt
@@ -1,6 +1,7 @@
langchain>=0.0.138
unstructured>=0.5.12
chromadb>=0.3.21
tiktoken>=0.3.3
openai>=0.27.4

langchain_community>=0.2.12
langchain_openai>=0.1.22
langchain_groq>=0.1.9
chromadb>=0.5.3
langchain_chroma>=0.1.3
unstructured>=0.15.7
markdown>=3.7
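Whether the new requirements are actually installed can be checked programmatically with the standard `importlib.metadata` module; this is a sketch, and the reporting helper name is hypothetical:

```python
from importlib.metadata import PackageNotFoundError, version

# Sketch: report installed versions for packages listed in requirements.txt.
# Missing packages map to None instead of raising.
def report(packages):
    results = {}
    for pkg in packages:
        try:
            results[pkg] = version(pkg)
        except PackageNotFoundError:
            results[pkg] = None
    return results

print(report(["langchain_community", "chromadb", "markdown"]))
```

Any `None` in the output means the corresponding `pip3 install -r requirements.txt` step has not been run in the active environment.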
137 changes: 85 additions & 52 deletions tools/questions-and-answers/run.py
@@ -1,17 +1,29 @@
#######################################################
# QUESTIONS AND ANSWERS FROM DOCUMENTS - OpenAI-based #
#######################################################
################################################
# QUESTIONS AND ANSWERS FROM PROJECT MARKDOWNS #
################################################

# Imports
import os, sys, glob, pickle

from langchain_community.document_loaders import UnstructuredMarkdownLoader # pip3 install --upgrade requests
from langchain.text_splitter import MarkdownTextSplitter
#from langchain_community.document_loaders import PyPDFLoader
#from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_groq import ChatGroq

from langchain.chains import ConversationalRetrievalChain # RetrievalQA is simpler (no context)

from langchain_chroma import Chroma

# Script location
SCR_PATH = os.path.abspath(__file__)
SCR_DIR, SCR_BN = os.path.split(SCR_PATH)
REPO_DIR = os.path.abspath(SCR_DIR + "/../..")
TARGET_DIR = os.path.abspath(SCR_DIR + "/../..")

# Chat history
CHAT_HISTORY = REPO_DIR + "/." + SCR_BN + "-chat-history"
CHAT_HISTORY = "chat-history"

# Basic checks:
# Python3 version
@@ -22,58 +34,78 @@
    print("Python version must be >= 3.8.1 (current: {x}.{y}.{z}). Try aliasing it, i.e.: alias python3='/usr/bin/python3.9'".format(x=major, y=minor, z=micro))
    sys.exit(1)

# OpenAI API key
try:
    apikey = os.environ["OPENAI_API_KEY"]
except:
    print("Please, export your OpenAI API KEY over 'OPENAI_API_KEY' environment variable")
    print("You may create the key here: https://platform.openai.com/account/api-keys")
    sys.exit(1)
# API KEYs:
def has_apikey(key, mandatory = True):
    result = False
    try:
        dummy = os.environ[key]
        result = True
    except:
        if mandatory:
            print("ERROR: you must define '{}' environment variable".format(key))
            sys.exit(1)
        else:
            print("WARNING: you may want to define '{}' environment variable".format(key))

# Load documents
print("Loading markdown documents under this directory ({}) ...".format(REPO_DIR))
wildcard=REPO_DIR + '/**/*.md'
markdowns = glob.glob(wildcard, recursive = True)
#print(markdowns)

from langchain.document_loaders import UnstructuredMarkdownLoader
loaders = [UnstructuredMarkdownLoader(os.path.join(SCR_DIR, md)) for md in markdowns]
#print("Loading URL sources ...")
#from langchain.document_loaders import UnstructuredURLLoader
#loaders.append(UnstructuredURLLoader(["https://prezi.com/p/1ijxuu0rt-sj/?present=1)"]))
documents = []
for loader in loaders:
    documents.extend(loader.load())

# create index
#from langchain.indexes import VectorstoreIndexCreator
#index = VectorstoreIndexCreator().from_loaders(loaders)
#index.query_with_sources("here the query...")

# Indexing data:
print("Indexing data ...")

# Split the document into chunks:
from langchain.text_splitter import MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
    return result

# Check OpenAI api key:
has_apikey("OPENAI_API_KEY")

# Check optional Groq api key:
has_groq = has_apikey("GROQ_API_KEY", False)

# Select which embeddings we want to use
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create the vectorstore to use as the index
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(texts, embeddings)
# Load documents into RAG: read if exists
vectorstore_path = "./vectorstore-data"
if not os.path.exists(vectorstore_path):
    print("Creating vectorstore ...")

# Expose this index in a retriever interface
    wildcard = TARGET_DIR + '/**/*.md' # or '/**/*.pdf' for PDF files
    files = glob.glob(wildcard, recursive = True)

    loaders = [UnstructuredMarkdownLoader(os.path.join(SCR_DIR, f)) for f in files]
    #loaders.append(UnstructuredURLLoader(["https://prezi.com/p/1ijxuu0rt-sj/?present=1)"])) # For URLs
    #loaders = [PyPDFLoader(os.path.join(SCR_DIR, f)) for f in files] # For PDF files

    documents = []
    for loader in loaders:
        documents.extend(loader.load())

    # Split the document into chunks:
    text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
    fragments = text_splitter.split_documents(documents)
    # For PDF files:
    #text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=150, length_function=len)
    #fragments = text_splitter.split_documents(documents)

    # Create the vectorstore to use as the index
    vectorstore = Chroma.from_documents(fragments, embeddings, persist_directory=vectorstore_path)

else:
    print("Loading vectorstore ...")
    vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)


# Expose vectorstore index in a retriever interface
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":9})

# Create a chain to answer questions
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever) # , return_source_documents=True)
#vectordbkwargs = {"search_distance": 0.9}
# Create a chain to answer questions (temperature 0 to be more deterministic, less creative)
llm = OpenAI(temperature=0)

if has_groq:
    llm = ChatGroq(
        model="mixtral-8x7b-32768",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        # other params...
    )

qa = ConversationalRetrievalChain.from_llm(llm, retriever)

# Chat history: read if exists
chat_history = []
@@ -87,12 +119,13 @@
while True:
    query = input("\nAsk me anything (0 = quit): ")
    if query == "0": break
    result = qa({"question": query, "chat_history": chat_history}) # , "vectordbkwargs": vectordbkwargs1})
    result = qa.invoke({"question": query, "chat_history": chat_history})
    answer = result["answer"]
    print(answer)
    chat_history.append([query, answer])
    chat_history.append((query, answer))

print("\n[saving chat history ...]\n")
# TODO: chat history rotation is needed to avoid huge data and, therefore, greater cost:
print("\n[saving chat history for next chatbot sessions ...]\n")
with open(CHAT_HISTORY, 'wb') as f: # write in binary mode
    pickle.dump(chat_history, f) # serialize the array and write it to the file

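The chat-history persistence at the end of run.py boils down to a pickle round-trip, sketched here with a throwaway file (the history content shown is illustrative):

```python
import os
import pickle
import tempfile

# Sketch of run.py's chat-history save/load cycle: tuple entries, binary mode.
history = [("what is h2agent?", "an HTTP/2 mock agent")]  # illustrative content

path = os.path.join(tempfile.mkdtemp(), "chat-history")
with open(path, "wb") as f:
    pickle.dump(history, f)          # serialize the list of (question, answer) tuples

with open(path, "rb") as f:
    restored = pickle.load(f)        # deserialize on the next session start
```

This is why the commit switches `chat_history.append` from lists to tuples: the pickled structure then matches the `(question, answer)` pairs that `ConversationalRetrievalChain` expects back.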
17 changes: 10 additions & 7 deletions tools/training.sh
@@ -15,15 +15,18 @@ read base_os
bargs="--build-arg base_os=${base_os}"
bargs+=" --build-arg base_tag=latest"

# OpenAI Questions & Answers:
echo "Do you want to install 'OpenAI Q&A helper' dependencies (y/n) ? [n]:"
echo " (warning: image size would be increased from 250MB to more than 8GB !)"
read opt
[ -z "${opt}" ] && opt=n
# OpenAI Questions & Answers; only supported for ubuntu base:
qa=false
[ "${opt}" = "y" ] && qa=true
if [ "${base_os}" = "ubuntu" ]
then
echo "Do you want to install 'OpenAI/Groq Q&A helper' dependencies (y/n) ? [n]:"
echo " (warning: image size would be increased from 250MB to more than 8GB !)"
read opt
[ -z "${opt}" ] && opt=n
[ "${opt}" = "y" ] && qa=true
fi
bargs+=" --build-arg enable_qa=${qa}"

cd ${git_root_dir}
docker build --rm ${bargs} -f Dockerfile.training ${NO_CACHE} -t testillano/h2agent_training:latest . || exit 1
docker run -it --rm --entrypoint=/bin/bash -e OPENAI_API_KEY=${OPENAI_API_KEY} testillano/h2agent_training:latest || exit 1
docker run -it --rm --entrypoint=/bin/bash -e OPENAI_API_KEY=${OPENAI_API_KEY} -e GROQ_API_KEY=${GROQ_API_KEY} testillano/h2agent_training:latest || exit 1
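The `docker run` line above forwards both API keys into the container with `-e` flags. That argument construction can be sketched in Python; `docker_env_args` is a hypothetical helper name for illustration:

```python
import os

# Sketch: build the '-e KEY=value' argument pairs that training.sh passes to
# 'docker run' so the API keys reach the container environment.
def docker_env_args(keys=("OPENAI_API_KEY", "GROQ_API_KEY"), env=os.environ):
    args = []
    for key in keys:
        args.extend(["-e", "{}={}".format(key, env.get(key, ""))])
    return args

print(docker_env_args(env={"OPENAI_API_KEY": "sk-demo"}))
```

An unset key simply becomes an empty value inside the container, which the script then reports as missing (mandatory for OpenAI, a warning for Groq).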
