Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue 25/create chat with repo component #32

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ config private.yml

# Byte-compiled / optimized / DLL files
__pycache__/
.DS_Store
*/.DS_Store
*.py[cod]
*$py.class

Expand Down Expand Up @@ -162,5 +164,3 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# AI doc
.project_hierarchy.json
264 changes: 160 additions & 104 deletions repo_agent/chat_with_repo/chat_with_repo.py
Original file line number Diff line number Diff line change
@@ -1,81 +1,137 @@

import os
import gradio as gr
import chromadb
import openai
from llama_index import Document, VectorStoreIndex, ServiceContext, SimpleDirectoryReader, StorageContext, load_index_from_storage
import json
import logging
from llama_index import Document,VectorStoreIndex,ServiceContext,SimpleDirectoryReader,StorageContext,load_index_from_storage
from llama_index.llms import OpenAI
from llama_index.node_parser import HierarchicalNodeParser, get_leaf_nodes
from llama_index.node_parser import HierarchicalNodeParser,get_leaf_nodes
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings import OpenAIEmbedding
from llama_index.retrievers import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine

# Class Definitions
class DocumentIndexer:
def __init__(self, dir_path, required_exts):
    """Prepare an (initially empty) hierarchical document indexer.

    Args:
        dir_path: Root directory to read source documents from.
        required_exts: File extensions to ingest (e.g. [".md", ".py"]).
    """
    # Where to read from and which files to accept.
    self.dir_path = dir_path
    self.required_exts = required_exts
    # Parsed artefacts — filled in by load_documents()/parse_documents().
    self.documents = []
    self.nodes = []
    self.leaf_nodes = []
    self.nodes_by_id = {}
    # llama-index plumbing shared by the save/load paths.
    embed_model = OpenAIEmbedding()
    self.embed_model = embed_model
    self.storage_context = StorageContext.from_defaults()
    self.service_context = ServiceContext.from_defaults(embed_model=embed_model)
    # Built later by save_storedb()/load_storedb().
    self.automerging_index = None

def load_documents(self):
    """Read every matching file under dir_path (recursively) and fuse
    all of their text into a single Document.

    Side effect: stores the individual documents on self.documents.
    Returns the fused Document.
    """
    loaded = SimpleDirectoryReader(
        input_dir=self.dir_path,
        required_exts=self.required_exts,
        recursive=True,
    ).load_data()
    self.documents = loaded
    merged_text = "\n\n".join(doc.text for doc in loaded)
    return Document(text=merged_text)

def parse_documents(self, document):
    """Split *document* into a 2048/512/128-token node hierarchy.

    Populates self.nodes (all levels), self.leaf_nodes (finest level,
    used for embedding) and self.nodes_by_id (lookup table).
    """
    parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
    all_nodes = parser.get_nodes_from_documents([document])
    self.nodes = all_nodes
    self.leaf_nodes = get_leaf_nodes(all_nodes)
    self.nodes_by_id = {n.node_id: n for n in all_nodes}


def save_storedb(self):
    """Persist all nodes to the local Chroma store (./chroma_db) and
    build the auto-merging vector index over the leaf nodes."""
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection("quickstart")
    store = ChromaVectorStore(chroma_collection=collection)

    ctx = StorageContext.from_defaults(vector_store=store)
    # Parent nodes go into the docstore so leaf hits can auto-merge upward.
    ctx.docstore.add_documents(self.nodes)

    self.automerging_index = VectorStoreIndex(
        self.leaf_nodes,
        storage_context=ctx,
        service_context=self.service_context,
    )

def load_storedb(self):
# load the index from chromadb
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
self.automerging_index = VectorStoreIndex.from_vector_store(
vector_store,
service_context=self.service_context,
from chromadb.utils import embedding_functions


class RepoAssistant:
def __init__(self, api_key, api_base, db_path, log_file):
    """Wire up logging, the Chroma collection and the OpenAI client.

    Args:
        api_key: OpenAI API key (also used for embeddings).
        api_base: OpenAI-compatible API base URL.
        db_path: Path to the project-hierarchy JSON produced by the doc generator.
        log_file: Path of the file that receives DEBUG-level logs.
    """
    # Module-scoped logger writing DEBUG records to *log_file*.
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(log_file, encoding='utf-8')
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    self.logger.addHandler(handler)

    # Connection/config state, then the vector store and LLM client.
    self.api_key = api_key
    self.api_base = api_base
    self.db_path = db_path
    self.init_chroma_collection()
    self.llm = OpenAI(api_key=api_key, api_base=api_base)

def read_json_file(self, file_path):
    """Load and return the JSON document stored at *file_path*.

    Raises FileNotFoundError / json.JSONDecodeError on missing or
    malformed input, exactly like json.load itself.
    """
    with open(file_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def extract_md_contents(self, json_data):
    """Collect every "md_content" value from the hierarchy JSON.

    Expects json_data shaped like {"files": [{"objects": [{...}]}]} and
    returns the md_content strings in document order.

    Fix: the original indexed json_data["files"] and file["objects"]
    directly, so a hierarchy file missing either key raised KeyError;
    .get with empty-list defaults makes absent sections yield [].
    """
    md_contents = []
    for file_entry in json_data.get("files", []):
        for obj in file_entry.get("objects", []):
            if "md_content" in obj:
                md_contents.append(obj["md_content"])
    return md_contents

def init_chroma_collection(self):
    """Create the (in-memory) Chroma collection backing retrieval,
    embedding documents with OpenAI's text-embedding-ada-002."""
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=self.api_key,
        api_base=self.api_base,
        model_name="text-embedding-ada-002",
    )
    client = chromadb.Client()
    self.chroma_collection = client.create_collection(
        "test",
        embedding_function=embedder,
    )



class ChatbotResponder:
def __init__(self, automerging_engine):
    # Query engine (auto-merging retriever engine) used to answer chat turns.
    self.automerging_engine = automerging_engine

def format_chat_prompt(self,message, chat_history, instruction):
def search_in_json_nested(self, file_path, search_text):
    """Depth-first search of the JSON file at *file_path* for the first
    object whose 'name' field contains *search_text* (case-insensitive).

    Returns the matching dict, or one of the sentinel message strings
    ("No matching item found." / "File not found." / "Invalid JSON
    file." / "An error occurred: ...") — callers type-check the result.
    """
    def walk(node, needle):
        # First hit wins; traversal follows JSON insertion order.
        if isinstance(node, dict):
            if 'name' in node and needle in node['name'].lower():
                return node
            for child in node.values():
                if isinstance(child, (dict, list)):
                    found = walk(child, needle)
                    if found:
                        return found
        elif isinstance(node, list):
            for element in node:
                found = walk(element, needle)
                if found:
                    return found
        return None

    try:
        with open(file_path, 'r', encoding='utf-8') as fp:
            document = json.load(fp)
        match = walk(document, search_text.lower())
        return match if match else "No matching item found."
    except FileNotFoundError:
        return "File not found."
    except json.JSONDecodeError:
        return "Invalid JSON file."
    except Exception as e:
        return f"An error occurred: {e}"

def NerQuery(self, message):
    """Ask the LLM to pull the single most relevant class or function
    name out of *message*; the query/response pair is logged at DEBUG."""
    prompt = (
        "Extract the most relevant class or function from the following input:\n"
        + message
        + "\nOutput:"
    )
    answer = self.llm.complete(prompt)
    self.logger.debug(f"Input: {message}, Output: {answer}")
    return answer

def queryblock(self, message):
    """Look *message* up in the project-hierarchy JSON and return the
    matched object's code_content (or the search's sentinel message)
    as a string."""
    hit = self.search_in_json_nested(self.db_path, message)
    if isinstance(hit, dict):
        # A real match — surface just its source code.
        hit = hit['code_content']
    return str(hit)

def create_vector_store(self, md_contents):
    """Index *md_contents* into the Chroma collection, using each
    document's list position (as a string) for its id."""
    doc_ids = [str(position) for position, _ in enumerate(md_contents)]
    self.chroma_collection.add(ids=doc_ids, documents=md_contents)


def rag(self, query, retrieved_documents):
    """Answer *query* with the LLM, grounded only in the retrieved
    repo snippets (classic retrieval-augmented generation prompt)."""
    information = "\n\n".join(retrieved_documents)
    messages = f"You are a helpful expert repo research assistant. Your users are asking questions about information contained in repo . You will be shown the user's question, and the relevant information from the repo. Answer the user's question using only this information.\nQuestion: {query}. \nInformation: {information}"
    return self.llm.complete(messages)

def Tree(self, query):
    """Ask the LLM to render *query* as a hierarchy/tree structure."""
    prompt = f"Please analyze the following text and generate a tree structure based on its hierarchy:\n\n{query}"
    return self.llm.complete(prompt)

def format_chat_prompt(self, message, chat_history, instruction):
# ...
prompt = f"System:{instruction}"
for turn in chat_history:
user_message, bot_message = turn
Expand All @@ -84,46 +140,46 @@ def format_chat_prompt(self,message, chat_history, instruction):
return prompt

def respond(self, message, chat_history, instruction):
    # Answer one chat turn: build the combined prompt, retrieve context
    # from the Chroma collection, answer via RAG, then append a hierarchy
    # tree and the code block for the extracted keyword.
    # NOTE(review): this span appears to merge two diff versions of the
    # method — the automerging-engine result below is computed and then
    # immediately overwritten by the chroma/rag result; confirm which
    # path the PR actually keeps.
    prompt = self.format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    auto_merging_response = self.automerging_engine.query(prompt)
    bot_message = str(auto_merging_response)
    results = self.chroma_collection.query(query_texts=[prompt], n_results=5)
    self.logger.debug(f"Results: {results}")
    retrieved_documents = results['documents'][0]
    response = self.rag(prompt,retrieved_documents)
    bot_message = str(response)
    keyword=str(self.NerQuery(bot_message))
    # Compose: answer + tree view + fenced code block for the keyword.
    bot_message=bot_message +'\n'+ str(self.Tree(bot_message))+'\n'+'```python'+'\n'+self.queryblock(keyword)+'\n'+'```'
    # bot_message=bot_message +'\n'+ str(self.Tree(bot_message))
    chat_history.append((message, bot_message))
    return "", chat_history

# Main Execution Block
def setup_gradio_interface(self):
    """Build the Gradio chat UI bound to self.respond and launch it
    (queued, with a public share link)."""
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(height=540)  # sized to fit a notebook cell
        msg = gr.Textbox(label="Prompt")
        with gr.Accordion(label="Advanced options", open=False):
            system = gr.Textbox(
                label="System message",
                lines=2,
                value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.",
            )
        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

        # Both the Submit button and pressing Enter send the prompt.
        btn.click(self.respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])
        msg.submit(self.respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])

    gr.close_all()
    demo.queue().launch(share=True)

def main(api_key='sk-', api_base='https://example.com',
         db_path="./project_hierachy.json", log_file="./log.txt"):
    """Boot the repo assistant: load the hierarchy JSON, index its
    markdown content into the vector store, and launch the Gradio UI.

    Fix: the API key, API base, hierarchy path and log path were
    hard-coded locals; they are now parameters whose defaults are the
    old values, so a bare ``main()`` behaves exactly as before while
    callers can inject real configuration.

    NOTE(review): "project_hierachy.json" looks like a misspelling of
    "project_hierarchy.json" — kept as-is because it must match the
    file the doc generator writes; confirm against the producer.
    """
    assistant = RepoAssistant(api_key, api_base, db_path, log_file)
    json_data = assistant.read_json_file(db_path)
    md_contents = assistant.extract_md_contents(json_data)
    assistant.create_vector_store(md_contents)
    assistant.setup_gradio_interface()

if __name__ == "__main__":
    # NOTE(review): this entry block mixes the PR's removed standalone
    # script (DocumentIndexer + ChatbotResponder wiring and an inline
    # Gradio UI) with the new entry point `main()` called at the end;
    # only one of the two paths belongs in the final file.
    openai.api_key = 'sk-'
    openai.base_url = 'https://example.com'

    # Initialize DocumentIndexer and ChatbotResponder
    indexer = DocumentIndexer("../../Markdown_Docs/", [".md", ".py"])
    document = indexer.load_documents()
    indexer.parse_documents(document)
    indexer.save_storedb()
    indexer.load_storedb()
    automerging_index = indexer.automerging_index
    automerging_retriever = automerging_index.as_retriever(similarity_top_k=12)
    # NOTE(review): `retriever` is built but never used below — the query
    # engine is constructed from the plain `automerging_retriever`.
    retriever = AutoMergingRetriever(
        automerging_retriever,
        automerging_index.storage_context,
        verbose=True
    )
    auto_merging_engine = RetrieverQueryEngine.from_args(automerging_retriever)
    responder = ChatbotResponder(auto_merging_engine)

    # Gradio Interface Setup
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(height=240)
        msg = gr.Textbox(label="Prompt")
        with gr.Accordion(label="Advanced options", open=False):
            system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")

        btn = gr.Button("Submit")
        clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

        btn.click(responder.respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])
        msg.submit(responder.respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])

    gr.close_all()
    demo.queue().launch(share=True)
    main()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed repo_agent/chat_with_repo/chroma_db/chroma.sqlite3
Binary file not shown.
Loading