From 0c1cf4f09ea5e2a51008961c49352208f1ddc803 Mon Sep 17 00:00:00 2001 From: Eric-Shang Date: Wed, 4 Dec 2024 08:15:12 +0000 Subject: [PATCH] Add the ability to add new documents to existing graph --- .../graph_rag/neo4j_graph_query_engine.py | 23 ++- notebook/agentchat_graph_rag_neo4j.ipynb | 164 +++++++++++++++++- 2 files changed, 179 insertions(+), 8 deletions(-) diff --git a/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py b/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py index 5b45e3bd11..86f375468f 100644 --- a/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py +++ b/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py @@ -108,9 +108,28 @@ def init_db(self, input_doc: List[Document] | None = None): def add_records(self, new_records: List) -> bool: """ - Add a record to the knowledge graph. + Add new records to the knowledge graph. + + Args: + new_records (List[Document]): List of new documents to add. + + Returns: + bool: True if successful, False otherwise. """ - pass + if self.graph_store is None: + raise ValueError("Knowledge graph is not initialized. Please call init_db first.") + + try: + # Load new documents + new_documents = SimpleDirectoryReader(input_files=[doc.path_or_url for doc in new_records]).load_data() + + for doc in new_documents: + self.index.insert(doc) + + return True + except Exception as e: + print(f"Error adding records: {e}") + return False def query(self, question: str, n_results: int = 1, **kwargs) -> GraphStoreQueryResult: """ diff --git a/notebook/agentchat_graph_rag_neo4j.ipynb b/notebook/agentchat_graph_rag_neo4j.ipynb index e3dcbfd702..e2ece7591a 100644 --- a/notebook/agentchat_graph_rag_neo4j.ipynb +++ b/notebook/agentchat_graph_rag_neo4j.ipynb @@ -116,7 +116,7 @@ "### A Simple Example\n", "\n", "In this example, the graph schema is auto-generated. This allows you to load data without specifying the specific types of entities and relationships that will make up the database (however, this may not be optimal and not cost efficient). \n", - "First, we create a Neo4j property graph with Paul Grahma's essay." + "First, we create a Neo4j property graph (knowledge graph) with Paul Graham's essay."
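For orientation, a minimal usage sketch of the new add_records API, assuming an already-initialized Neo4jGraphQueryEngine named query_engine (init_db has been called) and that Document and DocumentType are imported from autogen.agentchat.contrib.graph_rag.document (import path assumed here); the sample file path is the one the notebook diff below uses:

    # Minimal sketch: query_engine is assumed to be a Neo4jGraphQueryEngine whose init_db has already run.
    from autogen.agentchat.contrib.graph_rag.document import Document, DocumentType

    # Wrap the extra file in Document objects; add_records loads them and inserts them into the existing index.
    new_docs = [Document(doctype=DocumentType.TEXT, path_or_url="../test/agentchat/contrib/graph_rag/the_matrix.txt")]
    succeeded = query_engine.add_records(new_docs)  # True on success, False if insertion failed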
] }, { @@ -259,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -267,10 +267,10 @@ "output_type": "stream", "text": [ "Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.AggregationSkippedNull} {category: UNRECOGNIZED} {title: The query contains an aggregation function that skips null values.} {description: null value eliminated in set function.} {position: None} for query: \"MATCH (n:`Chunk`)\\nWITH collect(distinct substring(toString(coalesce(n.`doc_id`, '')), 0, 50)) AS `doc_id_values`,\\n collect(distinct substring(toString(coalesce(n.`document_id`, '')), 0, 50)) AS `document_id_values`,\\n collect(distinct substring(toString(coalesce(n.`creation_date`, '')), 0, 50)) AS `creation_date_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_type`, '')), 0, 50)) AS `_node_type_values`,\\n collect(distinct substring(toString(coalesce(n.`file_type`, '')), 0, 50)) AS `file_type_values`,\\n collect(distinct substring(toString(coalesce(n.`last_modified_date`, '')), 0, 50)) AS `last_modified_date_values`,\\n collect(distinct substring(toString(coalesce(n.`file_name`, '')), 0, 50)) AS `file_name_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_content`, '')), 0, 50)) AS `_node_content_values`,\\n collect(distinct substring(toString(coalesce(n.`ref_doc_id`, '')), 0, 50)) AS `ref_doc_id_values`,\\n min(size(coalesce(n.`embedding`, []))) AS `embedding_size_min`, max(size(coalesce(n.`embedding`, []))) AS `embedding_size_max`,\\n collect(distinct substring(toString(coalesce(n.`text`, '')), 0, 50)) AS `text_values`,\\n collect(distinct substring(toString(coalesce(n.`file_path`, '')), 0, 50)) AS `file_path_values`,\\n min(n.`file_size`) AS `file_size_min`,\\n max(n.`file_size`) AS `file_size_max`,\\n count(distinct n.`file_size`) AS `file_size_distinct`,\\n collect(distinct substring(toString(coalesce(n.`id`, '')), 0, 50)) AS `id_values`\\nRETURN {`doc_id`: {values:`doc_id_values`[..10], distinct_count: size(`doc_id_values`)}, `document_id`: {values:`document_id_values`[..10], distinct_count: size(`document_id_values`)}, `creation_date`: {values:`creation_date_values`[..10], distinct_count: size(`creation_date_values`)}, `_node_type`: {values:`_node_type_values`[..10], distinct_count: size(`_node_type_values`)}, `file_type`: {values:`file_type_values`[..10], distinct_count: size(`file_type_values`)}, `last_modified_date`: {values:`last_modified_date_values`[..10], distinct_count: size(`last_modified_date_values`)}, `file_name`: {values:`file_name_values`[..10], distinct_count: size(`file_name_values`)}, `_node_content`: {values:`_node_content_values`[..10], distinct_count: size(`_node_content_values`)}, `ref_doc_id`: {values:`ref_doc_id_values`[..10], distinct_count: size(`ref_doc_id_values`)}, `embedding`: {min_size: `embedding_size_min`, max_size: `embedding_size_max`}, `text`: {values:`text_values`[..10], distinct_count: size(`text_values`)}, `file_path`: {values:`file_path_values`[..10], distinct_count: size(`file_path_values`)}, `file_size`: {min: toString(`file_size_min`), max: toString(`file_size_max`), distinct_count: `file_size_distinct`}, `id`: {values:`id_values`[..10], distinct_count: size(`id_values`)}} AS output\"\n", - "Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 19.52it/s]\n", - "Extracting paths from text with schema: 100%|██████████| 22/22 [00:21<00:00, 1.03it/s]\n", - "Generating embeddings: 100%|██████████| 1/1 [00:01<00:00, 
1.08s/it]\n", - "Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.10it/s]\n", + "Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 12.06it/s]\n", + "Extracting paths from text with schema: 100%|██████████| 22/22 [00:36<00:00, 1.66s/it]\n", + "Generating embeddings: 100%|██████████| 1/1 [00:01<00:00, 1.53s/it]\n", + "Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 1.08it/s]\n", "Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.AggregationSkippedNull} {category: UNRECOGNIZED} {title: The query contains an aggregation function that skips null values.} {description: null value eliminated in set function.} {position: None} for query: \"MATCH (n:`Chunk`)\\nWITH collect(distinct substring(toString(coalesce(n.`doc_id`, '')), 0, 50)) AS `doc_id_values`,\\n collect(distinct substring(toString(coalesce(n.`document_id`, '')), 0, 50)) AS `document_id_values`,\\n collect(distinct substring(toString(coalesce(n.`creation_date`, '')), 0, 50)) AS `creation_date_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_type`, '')), 0, 50)) AS `_node_type_values`,\\n collect(distinct substring(toString(coalesce(n.`file_type`, '')), 0, 50)) AS `file_type_values`,\\n collect(distinct substring(toString(coalesce(n.`last_modified_date`, '')), 0, 50)) AS `last_modified_date_values`,\\n collect(distinct substring(toString(coalesce(n.`file_name`, '')), 0, 50)) AS `file_name_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_content`, '')), 0, 50)) AS `_node_content_values`,\\n collect(distinct substring(toString(coalesce(n.`ref_doc_id`, '')), 0, 50)) AS `ref_doc_id_values`,\\n min(size(coalesce(n.`embedding`, []))) AS `embedding_size_min`, max(size(coalesce(n.`embedding`, []))) AS `embedding_size_max`,\\n collect(distinct substring(toString(coalesce(n.`text`, '')), 0, 50)) AS `text_values`,\\n collect(distinct substring(toString(coalesce(n.`file_path`, '')), 0, 50)) AS `file_path_values`,\\n min(n.`file_size`) AS `file_size_min`,\\n max(n.`file_size`) AS `file_size_max`,\\n count(distinct n.`file_size`) AS `file_size_distinct`,\\n collect(distinct substring(toString(coalesce(n.`id`, '')), 0, 50)) AS `id_values`\\nRETURN {`doc_id`: {values:`doc_id_values`[..10], distinct_count: size(`doc_id_values`)}, `document_id`: {values:`document_id_values`[..10], distinct_count: size(`document_id_values`)}, `creation_date`: {values:`creation_date_values`[..10], distinct_count: size(`creation_date_values`)}, `_node_type`: {values:`_node_type_values`[..10], distinct_count: size(`_node_type_values`)}, `file_type`: {values:`file_type_values`[..10], distinct_count: size(`file_type_values`)}, `last_modified_date`: {values:`last_modified_date_values`[..10], distinct_count: size(`last_modified_date_values`)}, `file_name`: {values:`file_name_values`[..10], distinct_count: size(`file_name_values`)}, `_node_content`: {values:`_node_content_values`[..10], distinct_count: size(`_node_content_values`)}, `ref_doc_id`: {values:`ref_doc_id_values`[..10], distinct_count: size(`ref_doc_id_values`)}, `embedding`: {min_size: `embedding_size_min`, max_size: `embedding_size_max`}, `text`: {values:`text_values`[..10], distinct_count: size(`text_values`)}, `file_path`: {values:`file_path_values`[..10], distinct_count: size(`file_path_values`)}, `file_size`: {min: toString(`file_size_min`), max: toString(`file_size_max`), distinct_count: `file_size_distinct`}, `id`: {values:`id_values`[..10], distinct_count: size(`id_values`)}} AS output\"\n" ] } @@ -427,6 
+427,158 @@ "\n", "user_proxy.initiate_chat(graph_rag_agent, message=\"What happened at Interleaf and Viaweb?\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### You can add new documents to the existing knowledge graph!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 4.98it/s]\n", + "Extracting paths from text with schema: 100%|██████████| 4/4 [00:05<00:00, 1.30s/it]\n", + "Generating embeddings: 100%|██████████| 1/1 [00:01<00:00, 1.11s/it]\n", + "Generating embeddings: 100%|██████████| 1/1 [00:01<00:00, 1.13s/it]\n", + "Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.AggregationSkippedNull} {category: UNRECOGNIZED} {title: The query contains an aggregation function that skips null values.} {description: null value eliminated in set function.} {position: None} for query: \"MATCH (n:`Chunk`)\\nWITH collect(distinct substring(toString(coalesce(n.`doc_id`, '')), 0, 50)) AS `doc_id_values`,\\n collect(distinct substring(toString(coalesce(n.`document_id`, '')), 0, 50)) AS `document_id_values`,\\n collect(distinct substring(toString(coalesce(n.`creation_date`, '')), 0, 50)) AS `creation_date_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_type`, '')), 0, 50)) AS `_node_type_values`,\\n collect(distinct substring(toString(coalesce(n.`file_type`, '')), 0, 50)) AS `file_type_values`,\\n collect(distinct substring(toString(coalesce(n.`last_modified_date`, '')), 0, 50)) AS `last_modified_date_values`,\\n collect(distinct substring(toString(coalesce(n.`file_name`, '')), 0, 50)) AS `file_name_values`,\\n collect(distinct substring(toString(coalesce(n.`_node_content`, '')), 0, 50)) AS `_node_content_values`,\\n collect(distinct substring(toString(coalesce(n.`ref_doc_id`, '')), 0, 50)) AS `ref_doc_id_values`,\\n min(size(coalesce(n.`embedding`, []))) AS `embedding_size_min`, max(size(coalesce(n.`embedding`, []))) AS `embedding_size_max`,\\n collect(distinct substring(toString(coalesce(n.`text`, '')), 0, 50)) AS `text_values`,\\n collect(distinct substring(toString(coalesce(n.`file_path`, '')), 0, 50)) AS `file_path_values`,\\n min(n.`file_size`) AS `file_size_min`,\\n max(n.`file_size`) AS `file_size_max`,\\n count(distinct n.`file_size`) AS `file_size_distinct`,\\n collect(distinct substring(toString(coalesce(n.`id`, '')), 0, 50)) AS `id_values`\\nRETURN {`doc_id`: {values:`doc_id_values`[..10], distinct_count: size(`doc_id_values`)}, `document_id`: {values:`document_id_values`[..10], distinct_count: size(`document_id_values`)}, `creation_date`: {values:`creation_date_values`[..10], distinct_count: size(`creation_date_values`)}, `_node_type`: {values:`_node_type_values`[..10], distinct_count: size(`_node_type_values`)}, `file_type`: {values:`file_type_values`[..10], distinct_count: size(`file_type_values`)}, `last_modified_date`: {values:`last_modified_date_values`[..10], distinct_count: size(`last_modified_date_values`)}, `file_name`: {values:`file_name_values`[..10], distinct_count: size(`file_name_values`)}, `_node_content`: {values:`_node_content_values`[..10], distinct_count: size(`_node_content_values`)}, `ref_doc_id`: {values:`ref_doc_id_values`[..10], distinct_count: size(`ref_doc_id_values`)}, `embedding`: {min_size: `embedding_size_min`, max_size: `embedding_size_max`}, `text`: {values:`text_values`[..10], distinct_count: 
size(`text_values`)}, `file_path`: {values:`file_path_values`[..10], distinct_count: size(`file_path_values`)}, `file_size`: {min: toString(`file_size_min`), max: toString(`file_size_max`), distinct_count: `file_size_distinct`}, `id`: {values:`id_values`[..10], distinct_count: size(`id_values`)}} AS output\"\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_path = \"../test/agentchat/contrib/graph_rag/the_matrix.txt\"\n", + "input_documents = [Document(doctype=DocumentType.TEXT, path_or_url=input_path)]\n", + "\n", + "_ = query_engine.add_records(input_documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Now let's create a new graph RAG agent and ask some questions related to both documents" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "What happened for movie 'The Matrix'?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "The movie 'The Matrix' received mixed reviews from critics. Some critics praised it for being an ambitious and captivating sci-fi film with electrifying action and strong characters, while others criticized it as a high-tech assault on the senses and a pretentious insult to the mind. Despite the varying opinions, the movie has garnered a cult following and is considered a groundbreaking film in the cyberpunk genre.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "What did Paul Graham do?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "Paul Graham worked on various projects such as Viaweb, Y Combinator, and painting. 
He also wrote essays on different topics, including startups and programming languages like Lisp.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "Did Paul Graham do anything for the movie 'The Matrix'?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "Paul Graham did not do anything for the movie 'The Matrix' based on the provided context information.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "Did he watch this movie?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "He watched the movie.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "which movie?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "The Matrix\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33muser_proxy\u001b[0m (to paul_graham_agent):\n", + "\n", + "Do you mean Paul Graham watched the 'The Matrix'?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mpaul_graham_agent\u001b[0m (to user_proxy):\n", + "\n", + "No, there is no mention or indication in the provided context information that Paul Graham watched 'The Matrix'.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_id=None, chat_history=[{'content': \"What happened for movie 'The Matrix'?\", 'role': 'assistant', 'name': 'user_proxy'}, {'content': \"The movie 'The Matrix' received mixed reviews from critics. Some critics praised it for being an ambitious and captivating sci-fi film with electrifying action and strong characters, while others criticized it as a high-tech assault on the senses and a pretentious insult to the mind. Despite the varying opinions, the movie has garnered a cult following and is considered a groundbreaking film in the cyberpunk genre.\", 'role': 'user', 'name': 'paul_graham_agent'}, {'content': 'What did Paul Graham do?', 'role': 'assistant', 'name': 'user_proxy'}, {'content': 'Paul Graham worked on various projects such as Viaweb, Y Combinator, and painting. 
He also wrote essays on different topics, including startups and programming languages like Lisp.', 'role': 'user', 'name': 'paul_graham_agent'}, {'content': \"Did Paul Graham do anything for the movie 'The Matrix'?\", 'role': 'assistant', 'name': 'user_proxy'}, {'content': \"Paul Graham did not do anything for the movie 'The Matrix' based on the provided context information.\", 'role': 'user', 'name': 'paul_graham_agent'}, {'content': 'Did he watch this movie?', 'role': 'assistant', 'name': 'user_proxy'}, {'content': 'He watched the movie.', 'role': 'user', 'name': 'paul_graham_agent'}, {'content': 'which movie?', 'role': 'assistant', 'name': 'user_proxy'}, {'content': 'The Matrix', 'role': 'user', 'name': 'paul_graham_agent'}, {'content': \"Do you mean Paul Graham watched the 'The Matrix'?\", 'role': 'assistant', 'name': 'user_proxy'}, {'content': \"No, there is no mention or indication in the provided context information that Paul Graham watched 'The Matrix'.\", 'role': 'user', 'name': 'paul_graham_agent'}], summary=\"No, there is no mention or indication in the provided context information that Paul Graham watched 'The Matrix'.\", cost={'usage_including_cached_inference': {'total_cost': 0}, 'usage_excluding_cached_inference': {'total_cost': 0}}, human_input=['What did Paul Graham do?', \"Did Paul Graham do anything for the movie 'The Matrix'?\", 'Did he watch this movie?', 'which movie?', \"Do you mean Paul Graham watched the 'The Matrix'?\", 'exit'])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from autogen.agentchat.contrib.graph_rag.neo4j_graph_rag_capability import Neo4jGraphCapability\n", + "\n", + "# Create a ConversableAgent (no LLM configuration)\n", + "graph_rag_agent = ConversableAgent(\n", + " name=\"paul_graham_agent\",\n", + " human_input_mode=\"NEVER\",\n", + ")\n", + "\n", + "# Associate the capability with the agent\n", + "graph_rag_capability = Neo4jGraphCapability(query_engine)\n", + "graph_rag_capability.add_to_agent(graph_rag_agent)\n", + "\n", + "# Create a user proxy agent to converse with our RAG agent\n", + "user_proxy = UserProxyAgent(\n", + " name=\"user_proxy\",\n", + " human_input_mode=\"ALWAYS\",\n", + ")\n", + "\n", + "user_proxy.initiate_chat(graph_rag_agent, message=\"What happened for movie 'The Matrix'?\")" + ] } ], "metadata": {