From 71a5a3b143e7f6a80cfc35d9234a87519bbd1398 Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Sun, 15 Sep 2024 22:20:12 -0700 Subject: [PATCH 1/2] feat: added github pre commit hook setup --- cli/kaizen_cli/cli.py | 2 + cli/kaizen_cli/config/default_config.py | 6 +-- cli/kaizen_cli/hooks/prepare-commit-msg | 8 ++++ cli/kaizen_cli/hooks/setup.py | 58 +++++++++++++++++++++++++ cli/poetry.lock | 4 +- cli/pyproject.toml | 2 +- kaizen/utils/config.py | 26 ++--------- 7 files changed, 75 insertions(+), 31 deletions(-) create mode 100644 cli/kaizen_cli/hooks/prepare-commit-msg create mode 100644 cli/kaizen_cli/hooks/setup.py diff --git a/cli/kaizen_cli/cli.py b/cli/kaizen_cli/cli.py index af4a3d67..580850a4 100644 --- a/cli/kaizen_cli/cli.py +++ b/cli/kaizen_cli/cli.py @@ -3,6 +3,7 @@ from .commands.config_commands import config from .commands.unit_test_commands import unit_test from .commands.reviewer_commands import reviewer +from .hooks.setup import hooks from kaizen.generator.e2e_tests import E2ETestGenerator @@ -23,6 +24,7 @@ def ui_tests(url): cli.add_command(config) cli.add_command(unit_test) cli.add_command(reviewer) +cli.add_command(hooks) if __name__ == "__main__": cli() diff --git a/cli/kaizen_cli/config/default_config.py b/cli/kaizen_cli/config/default_config.py index 8440aabc..4f8d99c4 100644 --- a/cli/kaizen_cli/config/default_config.py +++ b/cli/kaizen_cli/config/default_config.py @@ -4,11 +4,7 @@ "models": [ { "model_name": "default", - "litellm_params": { - "model": "azure/gpt-4o-mini", - "api_key": "os.environ/AZURE_API_KEY", - "api_base": "os.environ/AZURE_API_BASE", - }, + "litellm_params": {"model": "ollama/phi3"}, }, ] }, diff --git a/cli/kaizen_cli/hooks/prepare-commit-msg b/cli/kaizen_cli/hooks/prepare-commit-msg new file mode 100644 index 00000000..42dbf5c9 --- /dev/null +++ b/cli/kaizen_cli/hooks/prepare-commit-msg @@ -0,0 +1,8 @@ +#!/bin/sh +# hooks/prepare-commit-msg + +# Run your CLI command and capture the output +commit_msg=$(kaizen-cli generate-commit-msg) + +# Overwrite the commit message file with the generated message +echo "$commit_msg" > "$1" \ No newline at end of file diff --git a/cli/kaizen_cli/hooks/setup.py b/cli/kaizen_cli/hooks/setup.py new file mode 100644 index 00000000..107f64c3 --- /dev/null +++ b/cli/kaizen_cli/hooks/setup.py @@ -0,0 +1,58 @@ +import os +import shutil +import click + +HOOK_TYPES = ["prepare-commit-msg"] + + +@click.group() +def hooks(): + """Manage git hooks""" + pass + + +@hooks.command() +@click.argument("hook_type", type=click.Choice(HOOK_TYPES)) +def install(hook_type): + """Install a specific git hook""" + source = os.path.join(os.path.dirname(__file__), "hooks", hook_type) + destination = os.path.join(".git", "hooks", hook_type) + + if not os.path.exists(source): + click.echo(f"Error: Hook script for {hook_type} not found.") + return + + try: + shutil.copy(source, destination) + os.chmod(destination, 0o755) + click.echo(f"{hook_type} hook installed successfully") + except IOError as e: + click.echo(f"Error installing {hook_type} hook: {str(e)}") + + +@hooks.command() +def install_all(): + """Install all available git hooks""" + for hook_type in HOOK_TYPES: + ctx = click.get_current_context() + ctx.invoke(install, hook_type=hook_type) + + +@hooks.command() +@click.argument("hook_type", type=click.Choice(HOOK_TYPES)) +def uninstall(hook_type): + """Uninstall a specific git hook""" + hook_path = os.path.join(".git", "hooks", hook_type) + if os.path.exists(hook_path): + os.remove(hook_path) + click.echo(f"{hook_type} hook uninstalled successfully") + else: + click.echo(f"{hook_type} hook not found") + + +@hooks.command() +def uninstall_all(): + """Uninstall all git hooks""" + for hook_type in HOOK_TYPES: + ctx = click.get_current_context() + ctx.invoke(uninstall, hook_type=hook_type) diff --git a/cli/poetry.lock b/cli/poetry.lock index 00b165c4..2172bad6 100644 --- a/cli/poetry.lock +++ b/cli/poetry.lock @@ -1454,7 +1454,7 @@ referencing = ">=0.31.0" [[package]] name = "kaizen-cloudcode" -version = "0.4.11" +version = "0.4.12" description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly." optional = false python-versions = "^3.9.0" @@ -4370,4 +4370,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.9.0" -content-hash = "7751c3aba479207989d1f2f49251c8daed3adc5170f9abb7eece699436ff135b" +content-hash = "d8dd0866cce15fa13755cf421b868b7e7fe5509dc7af511b18b210198ffeb8de" diff --git a/cli/pyproject.toml b/cli/pyproject.toml index de954377..ec4df3d3 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.9.0" click = "^8.1.3" -kaizen-cloudcode = "^0.4.11" +kaizen-cloudcode = "^0.4.12" [tool.poetry.group.dev.dependencies] kaizen-cloudcode = {path = "..", develop = true, optional = true} diff --git a/kaizen/utils/config.py b/kaizen/utils/config.py index bd9fa343..e31af6de 100644 --- a/kaizen/utils/config.py +++ b/kaizen/utils/config.py @@ -1,5 +1,4 @@ import json -import os from pathlib import Path @@ -9,6 +8,9 @@ def __init__(self, config_data=None): if Path(config_file).is_file(): with open(config_file, "r") as f: self.config_data = json.loads(f.read()) + elif Path("~/.kaizen_config.json").is_file(): + with open("~/.kaizen_config.json", "r") as f: + self.config_data = json.loads(f.read()) else: print(f"Couldnt find config at {config_file} loading default vals") self.config_data = { @@ -28,7 +30,6 @@ def __init__(self, config_data=None): def update_config_data(self, new_config_data): self.config_data.update(new_config_data) - self.validate_config_settings(self.config_data) def get_config_data(self): return self.config_data @@ -38,24 +39,3 @@ def get_language_model_config(self): def get_github_app_config(self): return self.config_data["github_app"] - - def validate_config_settings(self): - "Make sure relvant enviorment variables are set" - if self.config_data.get("github_app", {}).get("check_signature", False): - if not os.environ.get("GITHUB_APP_WEBHOOK_SECRET"): - raise EnvironmentError( - "The environment variable 'GITHUB_APP_WEBHOOK_SECRET' is not set." - ) - - if self.config_data.get("language_model", {}).get("provider", {}) == "litellm": - if self.config_data.get("language_model", {}).get( - "enable_observability_logging", False - ): - if not os.environ.get("SUPABASE_URL"): - raise EnvironmentError( - "The environment variable 'SUPABASE_URL' is not set." - ) - if not os.environ.get("SUPABASE_KEY"): - raise EnvironmentError( - "The environment variable 'SUPABASE_KEY' is not set." - ) From 3c893a3e7b9fddcf72d4a8936fb4568299c68105 Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Mon, 16 Sep 2024 01:38:13 -0700 Subject: [PATCH 2/2] expaned chunking to more different blocks --- kaizen/retriever/code_chunker.py | 100 ++++++++++++++--- kaizen/retriever/llama_index_retriever.py | 129 +++++++++++++--------- kaizen/retriever/qdrant_vector_store.py | 5 +- kaizen/retriever/tree_sitter_utils.py | 8 +- pyproject.toml | 2 +- 5 files changed, 167 insertions(+), 77 deletions(-) diff --git a/kaizen/retriever/code_chunker.py b/kaizen/retriever/code_chunker.py index 32ecd549..086d6d3f 100644 --- a/kaizen/retriever/code_chunker.py +++ b/kaizen/retriever/code_chunker.py @@ -7,24 +7,55 @@ def chunk_code(code: str, language: str) -> ParsedBody: parser = ParserFactory.get_parser(language) tree = parser.parse(code.encode("utf8")) - + code_bytes = code.encode("utf8") body: ParsedBody = { + "imports": [], + "global_variables": [], + "type_definitions": [], "functions": {}, + "async_functions": {}, "classes": {}, "hooks": {}, "components": {}, + "jsx_elements": [], "other_blocks": [], } - # code_bytes = code.encode("utf8") def process_node(node): - result = parse_code(code, language) + result = parse_code(node, code_bytes) if result: - # Assuming parse_code is modified to return line numbers start_line = result.get("start_line", 0) end_line = result.get("end_line", 0) - if result["type"] == "function": + if result["type"] == "import_statement": + body["imports"].append( + { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + ) + elif ( + result["type"] == "variable_declaration" + and node.parent.type == "program" + ): + body["global_variables"].append( + { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + ) + elif result["type"] in ["type_alias", "interface_declaration"]: + body["type_definitions"].append( + { + "name": result["name"], + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + ) + elif result["type"] == "function": if is_react_hook(result["name"]): body["hooks"][result["name"]] = { "code": result["code"], @@ -32,18 +63,44 @@ def process_node(node): "end_line": end_line, } elif is_react_component(result["code"]): - body["components"][result["name"]] = result["code"] + body["components"][result["name"]] = { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + elif "async" in result["code"].split()[0]: + body["async_functions"][result["name"]] = { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } else: - body["functions"][result["name"]] = result["code"] + body["functions"][result["name"]] = { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } elif result["type"] == "class": if is_react_component(result["code"]): - body["components"][result["name"]] = result["code"] + body["components"][result["name"]] = { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } else: - body["classes"][result["name"]] = result["code"] - elif result["type"] == "component": - body["components"][result["name"]] = result["code"] - elif result["type"] == "impl": - body["classes"][result["name"]] = result["code"] + body["classes"][result["name"]] = { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + elif result["type"] == "jsx_element": + body["jsx_elements"].append( + { + "code": result["code"], + "start_line": start_line, + "end_line": end_line, + } + ) else: for child in node.children: process_node(child) @@ -55,8 +112,14 @@ def process_node(node): for section in body.values(): if isinstance(section, dict): for code_block in section.values(): - start = code.index(code_block) - collected_ranges.append((start, start + len(code_block))) + collected_ranges.append( + (code_block["start_line"], code_block["end_line"]) + ) + elif isinstance(section, list): + for code_block in section: + collected_ranges.append( + (code_block["start_line"], code_block["end_line"]) + ) collected_ranges.sort() last_end = 0 @@ -76,5 +139,10 @@ def is_react_hook(name: str) -> bool: def is_react_component(code: str) -> bool: return ( - "React" in code or "jsx" in code.lower() or "tsx" in code.lower() or "<" in code + "React" in code + or "jsx" in code.lower() + or "tsx" in code.lower() + or "<" in code + or "props" in code + or "render" in code ) diff --git a/kaizen/retriever/llama_index_retriever.py b/kaizen/retriever/llama_index_retriever.py index b70116bd..3fecd181 100644 --- a/kaizen/retriever/llama_index_retriever.py +++ b/kaizen/retriever/llama_index_retriever.py @@ -12,7 +12,7 @@ from llama_index.embeddings.litellm import LiteLLMEmbedding from sqlalchemy import create_engine, text from kaizen.retriever.qdrant_vector_store import QdrantVectorStore - +import json # Set up logging logging.basicConfig( @@ -43,10 +43,18 @@ def __init__(self, repo_id=1): ) logger.info("RepositoryAnalyzer initialized successfully") - def setup_repository(self, repo_path: str, node_query: str = None): + def setup_repository( + self, + repo_path: str, + node_query: str = None, + file_query: str = None, + function_query: str = None, + ): self.total_usage = self.llm_provider.DEFAULT_USAGE self.total_files_processed = 0 self.node_query = node_query + self.file_query = file_query + self.function_query = function_query self.embedding_usage = {"prompt_tokens": 10, "total_tokens": 10} logger.info(f"Starting repository setup for: {repo_path}") self.parse_repository(repo_path) @@ -130,7 +138,7 @@ def process_code_block( return # Skip this code block language = self.get_language_from_extension(file_path) - abstraction, usage = self.generate_abstraction(code, language) + abstraction, usage = self.generate_abstraction(code, language, section) self.total_usage = self.llm_provider.update_usage( total_usage=self.total_usage, current_usage=usage ) @@ -185,70 +193,85 @@ def store_abstraction_and_embedding(self, function_id: int, abstraction: str): logger.debug(f"Abstraction and embedding stored for function_id: {function_id}") def generate_abstraction( - self, code_block: str, language: str, max_tokens: int = 300 + self, code_block: str, language: str, section: str, max_tokens: int = 300 ) -> str: prompt = f"""Analyze the following {language} code block and generate a structured abstraction. -Your response should be in YAML format and include the following sections: +Your response should be in JSON format and include the following sections: + +{{ + "summary": "A concise one-sentence summary of the function's primary purpose.", -summary: A concise one-sentence summary of the function's primary purpose. + "functionality": "A detailed explanation of what the function does, including its main steps and logic. Use multiple lines if needed for clarity.", -functionality: | - A detailed explanation of what the function does, including its main steps and logic. - Use multiple lines if needed for clarity. + "inputs": [ + {{ + "name": "The parameter name", + "type": "The parameter type", + "description": "A brief description of the parameter's purpose", + "default_value": "The default value, if any (or null if not applicable)" + }} + ], -inputs: - - name: The parameter name - type: The parameter type - description: A brief description of the parameter's purpose - default_value: The default value, if any (or null if not applicable) + "output": {{ + "type": "The return type of the function", + "description": "A description of what is returned and under what conditions. Use multiple lines if needed." + }}, -output: - type: The return type of the function - description: | - A description of what is returned and under what conditions. - Use multiple lines if needed. + "dependencies": [ + {{ + "name": "Name of the external library or module", + "purpose": "Brief explanation of its use in this function" + }} + ], -dependencies: - - name: Name of the external library or module - purpose: Brief explanation of its use in this function + "algorithms": [ + {{ + "name": "Name of the algorithm or data structure", + "description": "Brief explanation of its use and importance" + }} + ], -algorithms: - - name: Name of the algorithm or data structure - description: Brief explanation of its use and importance + "edge_cases": [ + "A list of potential edge cases or special conditions the function handles or should handle" + ], -edge_cases: - - A list of potential edge cases or special conditions the function handles or should handle + "error_handling": "A description of how errors are handled or propagated. Include specific error types if applicable.", -error_handling: | - A description of how errors are handled or propagated. - Include specific error types if applicable. + "usage_context": "A brief explanation of how this function might be used by parent functions or in a larger system. Include typical scenarios and any important considerations for its use.", -usage_context: | - A brief explanation of how this function might be used by parent functions or in a larger system. - Include typical scenarios and any important considerations for its use. + "complexity": {{ + "time": "Estimated time complexity (e.g., O(n))", + "space": "Estimated space complexity (e.g., O(1))", + "explanation": "Brief explanation of the complexity analysis" + }}, -complexity: - time: Estimated time complexity (e.g., O(n)) - space: Estimated space complexity (e.g., O(1)) + "tags": ["List", "of", "relevant", "tags"], -code_snippet: | - ```{language} - {code_block} - ``` + "testing_considerations": "Suggestions for unit tests or test cases to cover key functionality and edge cases", -Provide your analysis in this clear, structured YAML format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly indented under their respective keys. + "version_compatibility": "Information about language versions or dependency versions this code is compatible with", + + "performance_considerations": "Any notes on performance optimizations or potential bottlenecks", + + "security_considerations": "Any security-related notes or best practices relevant to this code", + + "maintainability_score": "A subjective score from 1-10 on how easy the code is to maintain, with a brief explanation" +}} + +Provide your analysis in this clear, structured JSON format. If any section is not applicable, use an empty list [] or null value as appropriate. Ensure that multi-line descriptions are properly formatted as strings. Code to analyze: -```{language} -{code_block} -``` +Language: {language} +Block Type: {section} +Code Block: +```{code_block}``` """ estimated_prompt_tokens = len(tokenizer.encode(prompt)) adjusted_max_tokens = min(max(150, estimated_prompt_tokens), 1000) try: - abstraction, usage = self.llm_provider.chat_completion( + abstraction, usage = self.llm_provider.chat_completion_with_json( prompt="", messages=[ { @@ -259,7 +282,7 @@ def generate_abstraction( ], custom_model={"max_tokens": adjusted_max_tokens, "model": "small"}, ) - return abstraction, usage + return json.dumps(abstraction), usage except Exception as e: raise e @@ -272,21 +295,19 @@ def store_code_in_db( section: str, name: str, start_line: int, - file_query: str = None, - function_query: str = None, ) -> int: logger.debug(f"Storing code in DB: {file_path} - {section} - {name}") with self.engine.begin() as connection: # Insert into files table (assuming this part is already correct) - if not file_query: - file_query = """ + if not self.file_query: + self.file_query = """ INSERT INTO files (repo_id, file_path, file_name, file_ext, programming_language) VALUES (:repo_id, :file_path, :file_name, :file_ext, :programming_language) ON CONFLICT (repo_id, file_path) DO UPDATE SET file_path = EXCLUDED.file_path RETURNING file_id """ file_id = connection.execute( - text(file_query), + text(self.file_query), { "repo_id": self.repo_id, "file_path": file_path, @@ -297,15 +318,15 @@ def store_code_in_db( ).scalar_one() # Insert into function_abstractions table - if not function_query: - function_query = """ + if not self.function_query: + self.function_query = """ INSERT INTO function_abstractions (file_id, function_name, function_signature, abstract_functionality, start_line, end_line) VALUES (:file_id, :function_name, :function_signature, :abstract_functionality, :start_line, :end_line) RETURNING function_id """ function_id = connection.execute( - text(function_query), + text(self.function_query), { "file_id": file_id, "function_name": name, diff --git a/kaizen/retriever/qdrant_vector_store.py b/kaizen/retriever/qdrant_vector_store.py index d973c574..b655abf1 100644 --- a/kaizen/retriever/qdrant_vector_store.py +++ b/kaizen/retriever/qdrant_vector_store.py @@ -11,6 +11,7 @@ class QdrantVectorStore: def __init__(self, collection_name, vector_size, max_retries=3, retry_delay=2): self.HOST = os.getenv("QDRANT_HOST", "localhost") self.PORT = os.getenv("QDRANT_PORT", "6333") + self.QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") self.collection_name = collection_name self.max_retries = max_retries self.retry_delay = retry_delay @@ -49,7 +50,9 @@ def _create_collection(self, vector_size): def add(self, nodes): points = [ - PointStruct(id=node.id_, vector=node.embedding, payload=node.metadata) + PointStruct( + id=node["id"], vector=node["embedding"], payload=node["metadata"] + ) for node in nodes ] self.client.upsert(collection_name=self.collection_name, points=points) diff --git a/kaizen/retriever/tree_sitter_utils.py b/kaizen/retriever/tree_sitter_utils.py index 6f3f9af2..0f1f63ab 100644 --- a/kaizen/retriever/tree_sitter_utils.py +++ b/kaizen/retriever/tree_sitter_utils.py @@ -101,13 +101,11 @@ def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]: return None -def parse_code(code: str, language: str) -> Dict[str, Any]: +def parse_code(node: Any, code_bytes: bytes) -> Dict[str, Any]: try: - parser = ParserFactory.get_parser(language) - tree = parser.parse(bytes(code, "utf8")) - return traverse_tree(tree.root_node, code.encode("utf8")) + return traverse_tree(node, code_bytes) except Exception as e: - logger.error(f"Failed to parse {language} code: {str(e)}") + logger.error(f"Failed to parse code: {str(e)}") raise diff --git a/pyproject.toml b/pyproject.toml index 3522272e..945d30fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "kaizen-cloudcode" -version = "0.4.12" +version = "0.4.13" description = "An intelligent coding companion that accelerates your development workflow by providing efficient assistance, enabling you to craft high-quality code more rapidly." authors = ["Saurav Panda "] license = "Apache2.0"