From 71a5a3b143e7f6a80cfc35d9234a87519bbd1398 Mon Sep 17 00:00:00 2001
From: Saurav Panda <sgp65@cornell.edu>
Date: Sun, 15 Sep 2024 22:20:12 -0700
Subject: [PATCH 1/2] feat: added git prepare-commit-msg hook setup

---
 cli/kaizen_cli/cli.py                   |  2 +
 cli/kaizen_cli/config/default_config.py |  6 +--
 cli/kaizen_cli/hooks/prepare-commit-msg |  8 ++++
 cli/kaizen_cli/hooks/setup.py           | 58 +++++++++++++++++++++++++
 cli/poetry.lock                         |  4 +-
 cli/pyproject.toml                      |  2 +-
 kaizen/utils/config.py                  | 26 ++---------
 7 files changed, 75 insertions(+), 31 deletions(-)
 create mode 100644 cli/kaizen_cli/hooks/prepare-commit-msg
 create mode 100644 cli/kaizen_cli/hooks/setup.py

diff --git a/cli/kaizen_cli/cli.py b/cli/kaizen_cli/cli.py
index af4a3d67..580850a4 100644
--- a/cli/kaizen_cli/cli.py
+++ b/cli/kaizen_cli/cli.py
@@ -3,6 +3,7 @@
 from .commands.config_commands import config
 from .commands.unit_test_commands import unit_test
 from .commands.reviewer_commands import reviewer
+from .hooks.setup import hooks
 from kaizen.generator.e2e_tests import E2ETestGenerator
 
 
@@ -23,6 +24,7 @@ def ui_tests(url):
 cli.add_command(config)
 cli.add_command(unit_test)
 cli.add_command(reviewer)
+cli.add_command(hooks)
 
 if __name__ == "__main__":
     cli()
diff --git a/cli/kaizen_cli/config/default_config.py b/cli/kaizen_cli/config/default_config.py
index 8440aabc..4f8d99c4 100644
--- a/cli/kaizen_cli/config/default_config.py
+++ b/cli/kaizen_cli/config/default_config.py
@@ -4,11 +4,7 @@
         "models": [
             {
                 "model_name": "default",
-                "litellm_params": {
-                    "model": "azure/gpt-4o-mini",
-                    "api_key": "os.environ/AZURE_API_KEY",
-                    "api_base": "os.environ/AZURE_API_BASE",
-                },
+                "litellm_params": {"model": "ollama/phi3"},
             },
         ]
     },
diff --git a/cli/kaizen_cli/hooks/prepare-commit-msg b/cli/kaizen_cli/hooks/prepare-commit-msg
new file mode 100644
index 00000000..42dbf5c9
--- /dev/null
+++ b/cli/kaizen_cli/hooks/prepare-commit-msg
@@ -0,0 +1,8 @@
+#!/bin/sh
+# hooks/prepare-commit-msg
+
+# Generate a message; if the CLI is missing or fails, keep git's own message.
+commit_msg=$(kaizen-cli generate-commit-msg) || exit 0
+
+# Only overwrite the commit message file when a non-empty message was produced
+[ -z "$commit_msg" ] || printf '%s\n' "$commit_msg" > "$1"
\ No newline at end of file
diff --git a/cli/kaizen_cli/hooks/setup.py b/cli/kaizen_cli/hooks/setup.py
new file mode 100644
index 00000000..107f64c3
--- /dev/null
+++ b/cli/kaizen_cli/hooks/setup.py
@@ -0,0 +1,58 @@
+import os
+import shutil
+import click
+
+HOOK_TYPES = ["prepare-commit-msg"]
+
+
+# Parent command group; subcommands attach through @hooks.command().
+@click.group()
+def hooks():
+    """Manage git hooks"""
+
+
+@hooks.command()
+@click.argument("hook_type", type=click.Choice(HOOK_TYPES))
+def install(hook_type):
+    """Install a specific git hook"""
+    # Hook scripts sit next to this module, so no extra "hooks" path segment.
+    source = os.path.join(os.path.dirname(__file__), hook_type)
+    # Destination is relative to the CWD: run from the repository root.
+    destination = os.path.join(".git", "hooks", hook_type)
+    if not os.path.exists(source):
+        click.echo(f"Error: Hook script for {hook_type} not found.")
+        return
+    try:
+        shutil.copy(source, destination)
+        os.chmod(destination, 0o755)
+        click.echo(f"{hook_type} hook installed successfully")
+    except IOError as e:
+        click.echo(f"Error installing {hook_type} hook: {str(e)}")
+
+
+@hooks.command()
+def install_all():
+    """Install all available git hooks"""
+    ctx = click.get_current_context()
+    for name in HOOK_TYPES:
+        ctx.invoke(install, hook_type=name)
+
+
+@hooks.command()
+@click.argument("hook_type", type=click.Choice(HOOK_TYPES))
+def uninstall(hook_type):
+    """Uninstall a specific git hook"""
+    target = os.path.join(".git", "hooks", hook_type)
+    if not os.path.exists(target):
+        click.echo(f"{hook_type} hook not found")
+        return
+    os.remove(target)
+    click.echo(f"{hook_type} hook uninstalled successfully")
+
+
+@hooks.command()
+def uninstall_all():
+    """Uninstall all git hooks"""
+    ctx = click.get_current_context()
+    for name in HOOK_TYPES:
+        ctx.invoke(uninstall, hook_type=name)
diff --git a/cli/poetry.lock b/cli/poetry.lock
index 00b165c4..2172bad6 100644
--- a/cli/poetry.lock
+++ b/cli/poetry.lock
@@ -1454,7 +1454,7 @@ referencing = ">=0.31.0"
 
 [[package]]
 name = "kaizen-cloudcode"
-version = "0.4.11"
+version = "0.4.12"
 description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly."
 optional = false
 python-versions = "^3.9.0"
@@ -4370,4 +4370,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9.0"
-content-hash = "7751c3aba479207989d1f2f49251c8daed3adc5170f9abb7eece699436ff135b"
+content-hash = "d8dd0866cce15fa13755cf421b868b7e7fe5509dc7af511b18b210198ffeb8de"
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
index de954377..ec4df3d3 100644
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -8,7 +8,7 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.9.0"
 click = "^8.1.3"
-kaizen-cloudcode = "^0.4.11"
+kaizen-cloudcode = "^0.4.12"
 
 [tool.poetry.group.dev.dependencies]
 kaizen-cloudcode = {path = "..", develop = true, optional = true}
diff --git a/kaizen/utils/config.py b/kaizen/utils/config.py
index bd9fa343..e31af6de 100644
--- a/kaizen/utils/config.py
+++ b/kaizen/utils/config.py
@@ -1,5 +1,4 @@
 import json
-import os
 from pathlib import Path
 
 
@@ -9,6 +8,9 @@ def __init__(self, config_data=None):
         if Path(config_file).is_file():
             with open(config_file, "r") as f:
                 self.config_data = json.loads(f.read())
+        elif Path("~/.kaizen_config.json").expanduser().is_file():
+            with open(Path("~/.kaizen_config.json").expanduser(), "r") as f:
+                self.config_data = json.loads(f.read())
         else:
             print(f"Couldnt find config at {config_file} loading default vals")
             self.config_data = {
@@ -28,7 +30,6 @@ def __init__(self, config_data=None):
 
     def update_config_data(self, new_config_data):
         self.config_data.update(new_config_data)
-        self.validate_config_settings(self.config_data)
 
     def get_config_data(self):
         return self.config_data
@@ -38,24 +39,3 @@ def get_language_model_config(self):
 
     def get_github_app_config(self):
         return self.config_data["github_app"]
-
-    def validate_config_settings(self):
-        "Make sure relvant enviorment variables are set"
-        if self.config_data.get("github_app", {}).get("check_signature", False):
-            if not os.environ.get("GITHUB_APP_WEBHOOK_SECRET"):
-                raise EnvironmentError(
-                    "The environment variable 'GITHUB_APP_WEBHOOK_SECRET' is not set."
-                )
-
-        if self.config_data.get("language_model", {}).get("provider", {}) == "litellm":
-            if self.config_data.get("language_model", {}).get(
-                "enable_observability_logging", False
-            ):
-                if not os.environ.get("SUPABASE_URL"):
-                    raise EnvironmentError(
-                        "The environment variable 'SUPABASE_URL' is not set."
-                    )
-                if not os.environ.get("SUPABASE_KEY"):
-                    raise EnvironmentError(
-                        "The environment variable 'SUPABASE_KEY' is not set."
-                    )

From 3c893a3e7b9fddcf72d4a8936fb4568299c68105 Mon Sep 17 00:00:00 2001
From: Saurav Panda <sgp65@cornell.edu>
Date: Mon, 16 Sep 2024 01:38:13 -0700
Subject: [PATCH 2/2] expanded chunking to more block types

---
 kaizen/retriever/code_chunker.py          | 100 ++++++++++++++---
 kaizen/retriever/llama_index_retriever.py | 129 +++++++++++++---------
 kaizen/retriever/qdrant_vector_store.py   |   5 +-
 kaizen/retriever/tree_sitter_utils.py     |   8 +-
 pyproject.toml                            |   2 +-
 5 files changed, 167 insertions(+), 77 deletions(-)

diff --git a/kaizen/retriever/code_chunker.py b/kaizen/retriever/code_chunker.py
index 32ecd549..086d6d3f 100644
--- a/kaizen/retriever/code_chunker.py
+++ b/kaizen/retriever/code_chunker.py
@@ -7,24 +7,55 @@
 def chunk_code(code: str, language: str) -> ParsedBody:
     parser = ParserFactory.get_parser(language)
     tree = parser.parse(code.encode("utf8"))
-
+    code_bytes = code.encode("utf8")
     body: ParsedBody = {
+        "imports": [],
+        "global_variables": [],
+        "type_definitions": [],
         "functions": {},
+        "async_functions": {},
         "classes": {},
         "hooks": {},
         "components": {},
+        "jsx_elements": [],
         "other_blocks": [],
     }
-    # code_bytes = code.encode("utf8")
 
     def process_node(node):
-        result = parse_code(code, language)
+        result = parse_code(node, code_bytes)
         if result:
-            # Assuming parse_code is modified to return line numbers
             start_line = result.get("start_line", 0)
             end_line = result.get("end_line", 0)
 
-            if result["type"] == "function":
+            if result["type"] == "import_statement":
+                body["imports"].append(
+                    {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+                )
+            elif (
+                result["type"] == "variable_declaration"
+                and node.parent.type == "program"
+            ):
+                body["global_variables"].append(
+                    {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+                )
+            elif result["type"] in ["type_alias", "interface_declaration"]:
+                body["type_definitions"].append(
+                    {
+                        "name": result["name"],
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+                )
+            elif result["type"] == "function":
                 if is_react_hook(result["name"]):
                     body["hooks"][result["name"]] = {
                         "code": result["code"],
@@ -32,18 +63,44 @@ def process_node(node):
                         "end_line": end_line,
                     }
                 elif is_react_component(result["code"]):
-                    body["components"][result["name"]] = result["code"]
+                    body["components"][result["name"]] = {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+                elif "async" in result["code"].split()[0]:
+                    body["async_functions"][result["name"]] = {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
                 else:
-                    body["functions"][result["name"]] = result["code"]
+                    body["functions"][result["name"]] = {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
             elif result["type"] == "class":
                 if is_react_component(result["code"]):
-                    body["components"][result["name"]] = result["code"]
+                    body["components"][result["name"]] = {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
                 else:
-                    body["classes"][result["name"]] = result["code"]
-            elif result["type"] == "component":
-                body["components"][result["name"]] = result["code"]
-            elif result["type"] == "impl":
-                body["classes"][result["name"]] = result["code"]
+                    body["classes"][result["name"]] = {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+            elif result["type"] == "jsx_element":
+                body["jsx_elements"].append(
+                    {
+                        "code": result["code"],
+                        "start_line": start_line,
+                        "end_line": end_line,
+                    }
+                )
         else:
             for child in node.children:
                 process_node(child)
@@ -55,8 +112,14 @@ def process_node(node):
     for section in body.values():
         if isinstance(section, dict):
             for code_block in section.values():
-                start = code.index(code_block)
-                collected_ranges.append((start, start + len(code_block)))
+                collected_ranges.append(
+                    (code_block["start_line"], code_block["end_line"])
+                )
+        elif isinstance(section, list):
+            for code_block in section:
+                collected_ranges.append(
+                    (code_block["start_line"], code_block["end_line"])
+                )
 
     collected_ranges.sort()
     last_end = 0
@@ -76,5 +139,10 @@ def is_react_hook(name: str) -> bool:
 
 def is_react_component(code: str) -> bool:
     return (
-        "React" in code or "jsx" in code.lower() or "tsx" in code.lower() or "<" in code
+        "React" in code
+        or "jsx" in code.lower()
+        or "tsx" in code.lower()
+        or "<" in code
+        or "props" in code
+        or "render" in code
     )
diff --git a/kaizen/retriever/llama_index_retriever.py b/kaizen/retriever/llama_index_retriever.py
index b70116bd..3fecd181 100644
--- a/kaizen/retriever/llama_index_retriever.py
+++ b/kaizen/retriever/llama_index_retriever.py
@@ -12,7 +12,7 @@
 from llama_index.embeddings.litellm import LiteLLMEmbedding
 from sqlalchemy import create_engine, text
 from kaizen.retriever.qdrant_vector_store import QdrantVectorStore
-
+import json
 
 # Set up logging
 logging.basicConfig(
@@ -43,10 +43,18 @@ def __init__(self, repo_id=1):
         )
         logger.info("RepositoryAnalyzer initialized successfully")
 
-    def setup_repository(self, repo_path: str, node_query: str = None):
+    def setup_repository(
+        self,
+        repo_path: str,
+        node_query: str = None,
+        file_query: str = None,
+        function_query: str = None,
+    ):
         self.total_usage = self.llm_provider.DEFAULT_USAGE
         self.total_files_processed = 0
         self.node_query = node_query
+        self.file_query = file_query
+        self.function_query = function_query
         self.embedding_usage = {"prompt_tokens": 10, "total_tokens": 10}
         logger.info(f"Starting repository setup for: {repo_path}")
         self.parse_repository(repo_path)
@@ -130,7 +138,7 @@ def process_code_block(
             return  # Skip this code block
 
         language = self.get_language_from_extension(file_path)
-        abstraction, usage = self.generate_abstraction(code, language)
+        abstraction, usage = self.generate_abstraction(code, language, section)
         self.total_usage = self.llm_provider.update_usage(
             total_usage=self.total_usage, current_usage=usage
         )
@@ -185,70 +193,85 @@ def store_abstraction_and_embedding(self, function_id: int, abstraction: str):
         logger.debug(f"Abstraction and embedding stored for function_id: {function_id}")
 
     def generate_abstraction(
-        self, code_block: str, language: str, max_tokens: int = 300
+        self, code_block: str, language: str, section: str, max_tokens: int = 300
     ) -> str:
         prompt = f"""Analyze the following {language} code block and generate a structured abstraction. 
-Your response should be in YAML format and include the following sections:
+Your response should be in JSON format and include the following sections:
+
+{{
+  "summary": "A concise one-sentence summary of the function's primary purpose.",
 
-summary: A concise one-sentence summary of the function's primary purpose.
+  "functionality": "A detailed explanation of what the function does, including its main steps and logic. Use multiple lines if needed for clarity.",
 
-functionality: |
-  A detailed explanation of what the function does, including its main steps and logic.
-  Use multiple lines if needed for clarity.
+  "inputs": [
+    {{
+      "name": "The parameter name",
+      "type": "The parameter type",
+      "description": "A brief description of the parameter's purpose",
+      "default_value": "The default value, if any (or null if not applicable)"
+    }}
+  ],
 
-inputs:
-  - name: The parameter name
-    type: The parameter type
-    description: A brief description of the parameter's purpose
-    default_value: The default value, if any (or null if not applicable)
+  "output": {{
+    "type": "The return type of the function",
+    "description": "A description of what is returned and under what conditions. Use multiple lines if needed."
+  }},
 
-output:
-  type: The return type of the function
-  description: |
-    A description of what is returned and under what conditions.
-    Use multiple lines if needed.
+  "dependencies": [
+    {{
+      "name": "Name of the external library or module",
+      "purpose": "Brief explanation of its use in this function"
+    }}
+  ],
 
-dependencies:
-  - name: Name of the external library or module
-    purpose: Brief explanation of its use in this function
+  "algorithms": [
+    {{
+      "name": "Name of the algorithm or data structure",
+      "description": "Brief explanation of its use and importance"
+    }}
+  ],
 
-algorithms:
-  - name: Name of the algorithm or data structure
-    description: Brief explanation of its use and importance
+  "edge_cases": [
+    "A list of potential edge cases or special conditions the function handles or should handle"
+  ],
 
-edge_cases:
-  - A list of potential edge cases or special conditions the function handles or should handle
+  "error_handling": "A description of how errors are handled or propagated. Include specific error types if applicable.",
 
-error_handling: |
-  A description of how errors are handled or propagated.
-  Include specific error types if applicable.
+  "usage_context": "A brief explanation of how this function might be used by parent functions or in a larger system. Include typical scenarios and any important considerations for its use.",
 
-usage_context: |
-  A brief explanation of how this function might be used by parent functions or in a larger system.
-  Include typical scenarios and any important considerations for its use.
+  "complexity": {{
+    "time": "Estimated time complexity (e.g., O(n))",
+    "space": "Estimated space complexity (e.g., O(1))",
+    "explanation": "Brief explanation of the complexity analysis"
+  }},
 
-complexity:
-  time: Estimated time complexity (e.g., O(n))
-  space: Estimated space complexity (e.g., O(1))
+  "tags": ["List", "of", "relevant", "tags"],
 
-code_snippet: |
-  ```{language}
-  {code_block}
-  ```
+  "testing_considerations": "Suggestions for unit tests or test cases to cover key functionality and edge cases",
 
-Provide your analysis in this clear, structured YAML format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly indented under their respective keys.
+  "version_compatibility": "Information about language versions or dependency versions this code is compatible with",
+
+  "performance_considerations": "Any notes on performance optimizations or potential bottlenecks",
+
+  "security_considerations": "Any security-related notes or best practices relevant to this code",
+
+  "maintainability_score": "A subjective score from 1-10 on how easy the code is to maintain, with a brief explanation"
+}}
+
+Provide your analysis in this clear, structured JSON format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly formatted as strings.
 
 Code to analyze:
-```{language}
-{code_block}
-```
+Language: {language}
+Block Type: {section}
+Code Block: 
+```{code_block}```
         """
 
         estimated_prompt_tokens = len(tokenizer.encode(prompt))
         adjusted_max_tokens = min(max(150, estimated_prompt_tokens), 1000)
 
         try:
-            abstraction, usage = self.llm_provider.chat_completion(
+            abstraction, usage = self.llm_provider.chat_completion_with_json(
                 prompt="",
                 messages=[
                     {
@@ -259,7 +282,7 @@ def generate_abstraction(
                 ],
                 custom_model={"max_tokens": adjusted_max_tokens, "model": "small"},
             )
-            return abstraction, usage
+            return json.dumps(abstraction), usage
 
         except Exception as e:
             raise e
@@ -272,21 +295,19 @@ def store_code_in_db(
         section: str,
         name: str,
         start_line: int,
-        file_query: str = None,
-        function_query: str = None,
     ) -> int:
         logger.debug(f"Storing code in DB: {file_path} - {section} - {name}")
         with self.engine.begin() as connection:
             # Insert into files table (assuming this part is already correct)
-            if not file_query:
-                file_query = """
+            if not self.file_query:
+                self.file_query = """
                         INSERT INTO files (repo_id, file_path, file_name, file_ext, programming_language)
                     VALUES (:repo_id, :file_path, :file_name, :file_ext, :programming_language)
                     ON CONFLICT (repo_id, file_path) DO UPDATE SET file_path = EXCLUDED.file_path
                     RETURNING file_id
                     """
             file_id = connection.execute(
-                text(file_query),
+                text(self.file_query),
                 {
                     "repo_id": self.repo_id,
                     "file_path": file_path,
@@ -297,15 +318,15 @@ def store_code_in_db(
             ).scalar_one()
 
             # Insert into function_abstractions table
-            if not function_query:
-                function_query = """
+            if not self.function_query:
+                self.function_query = """
                     INSERT INTO function_abstractions 
                     (file_id, function_name, function_signature, abstract_functionality, start_line, end_line)
                     VALUES (:file_id, :function_name, :function_signature, :abstract_functionality, :start_line, :end_line)
                     RETURNING function_id
                         """
             function_id = connection.execute(
-                text(function_query),
+                text(self.function_query),
                 {
                     "file_id": file_id,
                     "function_name": name,
diff --git a/kaizen/retriever/qdrant_vector_store.py b/kaizen/retriever/qdrant_vector_store.py
index d973c574..b655abf1 100644
--- a/kaizen/retriever/qdrant_vector_store.py
+++ b/kaizen/retriever/qdrant_vector_store.py
@@ -11,6 +11,7 @@ class QdrantVectorStore:
     def __init__(self, collection_name, vector_size, max_retries=3, retry_delay=2):
         self.HOST = os.getenv("QDRANT_HOST", "localhost")
         self.PORT = os.getenv("QDRANT_PORT", "6333")
+        self.QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
         self.collection_name = collection_name
         self.max_retries = max_retries
         self.retry_delay = retry_delay
@@ -49,7 +50,9 @@ def _create_collection(self, vector_size):
 
     def add(self, nodes):
         points = [
-            PointStruct(id=node.id_, vector=node.embedding, payload=node.metadata)
+            PointStruct(
+                id=node["id"], vector=node["embedding"], payload=node["metadata"]
+            )
             for node in nodes
         ]
         self.client.upsert(collection_name=self.collection_name, points=points)
diff --git a/kaizen/retriever/tree_sitter_utils.py b/kaizen/retriever/tree_sitter_utils.py
index 6f3f9af2..0f1f63ab 100644
--- a/kaizen/retriever/tree_sitter_utils.py
+++ b/kaizen/retriever/tree_sitter_utils.py
@@ -101,13 +101,11 @@ def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]:
         return None
 
 
-def parse_code(code: str, language: str) -> Dict[str, Any]:
+def parse_code(node: Any, code_bytes: bytes) -> Dict[str, Any]:
     try:
-        parser = ParserFactory.get_parser(language)
-        tree = parser.parse(bytes(code, "utf8"))
-        return traverse_tree(tree.root_node, code.encode("utf8"))
+        return traverse_tree(node, code_bytes)
     except Exception as e:
-        logger.error(f"Failed to parse {language} code: {str(e)}")
+        logger.error(f"Failed to parse code: {str(e)}")
         raise
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 3522272e..945d30fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kaizen-cloudcode"
-version = "0.4.12"
+version = "0.4.13"
 description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly."
 authors = ["Saurav Panda <saurav.panda@cloudcode.ai>"]
 license = "Apache2.0"