From 7f5ae4aa9563c1b94308d295a96b6c63216cbf5b Mon Sep 17 00:00:00 2001 From: Nick Sullivan Date: Sun, 16 Jul 2023 23:01:12 -0700 Subject: [PATCH] I know kung fu! Add learning functionality from repositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a new feature that allows the AI Code Bot to learn from a given repository. The bot can now clone a repository, load its documents, and store them in a local vector store for future use. This will enhance the bot's ability to provide contextually relevant suggestions and responses. Additionally, this commit includes the necessary updates to the configuration and helper functions to support this new feature. The requirements have also been updated to include the necessary dependencies. Lastly, a new test case has been added to ensure the correct parsing of GitHub URLs. 🧪 --- aicodebot/cli.py | 35 +++++++++- aicodebot/coder.py | 29 +++++++- aicodebot/config.py | 13 ++++ aicodebot/learn.py | 125 ++++++++++++++++++++++++++++++++++ requirements/requirements.in | 1 + requirements/requirements.txt | 26 +++---- tests/test_coder.py | 17 ++++- 7 files changed, 230 insertions(+), 16 deletions(-) create mode 100644 aicodebot/learn.py diff --git a/aicodebot/cli.py b/aicodebot/cli.py index 56758b9..0a2d517 100644 --- a/aicodebot/cli.py +++ b/aicodebot/cli.py @@ -1,7 +1,8 @@ from aicodebot import version as aicodebot_version from aicodebot.coder import CREATIVE_TEMPERATURE, DEFAULT_MAX_TOKENS, Coder -from aicodebot.config import get_config_file, read_config +from aicodebot.config import get_config_file, get_local_data_dir, read_config from aicodebot.helpers import RichLiveCallbackHandler, create_and_write_file, exec_and_get_output, logger +from aicodebot.learn import load_documents_from_repo, store_documents from aicodebot.prompts import DEFAULT_PERSONALITY, PERSONALITIES, generate_files_context, get_prompt from langchain.chains import LLMChain from 
@cli.command
@click.option("-v", "--verbose", count=True)
@click.option("-r", "--repo-url", required=True, help="The URL of the repository to learn from")
def learn(repo_url, verbose):
    """Learn new skills and gain additional knowledge from a repository"""
    # Clone the supplied repo locally (or update an existing clone), load its
    # documents, and store them in a local vector store.
    #
    # --repo-url is required: without it the URL parser would be handed None
    # and fail with an unhelpful error instead of a usage message.
    import time  # local import: only this command needs it

    setup_config()

    # Parsing validates the URL up front; only the repo name is used to build
    # the local paths below.
    _owner, repo_name = Coder.parse_github_url(repo_url)

    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    started = time.monotonic()

    local_data_dir = get_local_data_dir()
    repo_dir = local_data_dir / "repos" / repo_name
    vector_store_dir = local_data_dir / "vector_stores" / repo_name

    Coder.clone_repo(repo_url, repo_dir)
    console.print("✅ Repo cloned.")

    console.log("Loading documents")
    documents = load_documents_from_repo(repo_dir)
    # Nothing has been indexed yet at this point, so only report the load.
    console.print("✅ Repo loaded.")

    with console.status("Storing the repo in the vector store", spinner=DEFAULT_SPINNER):
        store_documents(documents, vector_store_dir)

    # Report a plain number of seconds; formatting a timedelta here would
    # print something like "0:00:12.345678 seconds".
    console.print(f"✅ Repo loaded and indexed in {time.monotonic() - started:.1f} seconds.")
@staticmethod
def clone_repo(repo_url, repo_dir):
    """Clone a git repository into ``repo_dir``, or update an existing checkout.

    Args:
        repo_url: URL of the repository to clone.
        repo_dir: Target directory (``str`` or ``Path``).

    Raises:
        subprocess.CalledProcessError: If a git command exits with a non-zero status.
    """
    repo_dir = Path(repo_dir)

    # Only treat the directory as a checkout when it has its own .git entry.
    # Mere existence is not enough: git looks for a repository in parent
    # directories too, so `git reset --hard` run inside a plain directory
    # could discard changes in an enclosing repository.
    if (repo_dir / ".git").exists():
        logger.info(f"Repo {repo_dir} already exists, updating it instead")
        # Reset it first to make sure we don't have any local changes
        subprocess.run(["git", "reset", "--hard"], cwd=repo_dir, check=True, stdout=subprocess.DEVNULL)
        subprocess.run(["git", "pull"], cwd=repo_dir, check=True)
    else:
        logger.info(f"Cloning {repo_url} to {repo_dir}")
        # "--" ends option parsing, so a URL starting with "-" cannot be
        # interpreted as a git option.
        subprocess.run(["git", "clone", "--", str(repo_url), str(repo_dir)], check=True)
def get_local_data_dir():
    """Return the local data directory, creating it and its subdirectories if needed.

    The location is read from the ``AICODEBOT_LOCAL_DATA_DIR`` environment
    variable and defaults to ``~/.aicodebot_data``.

    Returns:
        Path: The data directory; ``repos`` and ``vector_stores`` exist inside it.
    """
    data_dir = Path(os.getenv("AICODEBOT_LOCAL_DATA_DIR", str(Path.home() / ".aicodebot_data")))
    if not data_dir.exists():
        logger.debug(f"Creating local data directory {data_dir}")

    # Always ensure the full layout. The directory may already exist without
    # its subdirectories (e.g. it was created by hand), and a custom location
    # may have missing parent directories.
    for directory in (data_dir, data_dir / "repos", data_dir / "vector_stores"):
        directory.mkdir(parents=True, exist_ok=True)

    return data_dir
def store_documents(documents, vector_store_dir):
    """Split documents into chunks and store them in a FAISS vector store.

    If an index already exists at ``vector_store_dir / "faiss_index"`` it is
    loaded and returned as-is; ``documents`` are not added to it.

    Args:
        documents: Iterable of loaded documents. Each one must provide
            ``page_content`` and ``metadata`` with ``file_type`` and ``file_path``.
        vector_store_dir: Directory (``str`` or ``Path``) that holds the index.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If a new store has to be created but there is nothing to index.
    """
    vector_store_file = Path(vector_store_dir) / "faiss_index"
    embeddings = OpenAIEmbeddings()
    if vector_store_file.exists():
        logger.info(f"Loading existing vector store {vector_store_file}")
        return FAISS.load_local(vector_store_file, embeddings)

    logger.info(f"Creating new vector store {vector_store_file}")

    language_extension_map = {
        ".py": Language.PYTHON,
        ".ipynb": Language.PYTHON,
        ".js": Language.JS,
        ".ts": Language.JS,
        ".html": Language.HTML,
        ".md": Language.MARKDOWN,
        ".mdx": Language.MARKDOWN,
        ".go": Language.GO,
        ".java": Language.JAVA,
        ".c": Language.CPP,
        ".cpp": Language.CPP,
        ".php": Language.PHP,
        ".rb": Language.RUBY,
        ".xml": Language.HTML,
    }
    known_text_types = (".txt", ".md", ".yml", ".yaml")

    # The plain-text splitter has a fixed configuration, so build it once.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1_000, chunk_overlap=200)

    files = 0
    chunks = []
    for document in documents:
        file_type = document.metadata["file_type"].lower()
        files += 1

        # Remove magic text that breaks processing
        content = document.page_content.replace("<|end" + "of" + "text|>", "")  # noqa: ISC003

        language = language_extension_map.get(file_type)
        if language is not None:
            # Use a recursive, language-aware splitter for code files
            logger.debug(f"Processing {document.metadata['file_path']} as {language.value} code")
            # NOTE(review): chunk_size=50 is kept from the original; it looks
            # very small for code - confirm it is intentional.
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=language, chunk_size=50, chunk_overlap=0
            )
        else:
            # TODO: Check if it's a text file
            if file_type not in known_text_types:
                logger.info(f"Processing {document.metadata['file_path']} as a text file")
            splitter = text_splitter

        # Pass the metadata along so every chunk keeps its file_path/file_type;
        # without it the chunks cannot be traced back to their source file.
        chunks += splitter.create_documents([content], metadatas=[document.metadata])

    if not chunks:
        # Fail with a clear message instead of handing an empty list to FAISS.
        raise ValueError(f"No content to index for {vector_store_dir}")

    logger.info(f"Storing {len(chunks)} chunks from {files} files in {vector_store_dir}")
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(vector_store_file)
    return vector_store
Please run `aicodebot learn $githuburl` first.") + + embeddings = OpenAIEmbeddings() + return FAISS.load_local(vector_store_file, embeddings) diff --git a/requirements/requirements.in b/requirements/requirements.in index 6bb3e16..7888e20 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -7,6 +7,7 @@ beautifulsoup4 # needed by langchain click # command line interface helpers +faiss-cpu GitPython langchain loguru diff --git a/requirements/requirements.txt b/requirements/requirements.txt index ef2c016..756a65d 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile +# pip-compile requirements.in # aiohttp==3.8.4 # via @@ -15,7 +15,7 @@ async-timeout==4.0.2 attrs==23.1.0 # via aiohttp beautifulsoup4==4.12.2 - # via -r requirements/requirements.in + # via -r requirements.in certifi==2023.5.7 # via requests charset-normalizer==3.1.0 @@ -23,9 +23,11 @@ charset-normalizer==3.1.0 # aiohttp # requests click==8.1.4 - # via -r requirements/requirements.in + # via -r requirements.in dataclasses-json==0.5.8 # via langchain +faiss-cpu==1.7.4 + # via -r requirements.in frozenlist==1.3.3 # via # aiohttp @@ -33,19 +35,17 @@ frozenlist==1.3.3 gitdb==4.0.10 # via gitpython gitpython==3.1.32 - # via -r requirements/requirements.in -greenlet==2.0.2 - # via sqlalchemy + # via -r requirements.in idna==3.4 # via # requests # yarl langchain==0.0.231 - # via -r requirements/requirements.in + # via -r requirements.in langchainplus-sdk==0.0.20 # via langchain loguru==0.7.0 - # via -r requirements/requirements.in + # via -r requirements.in markdown-it-py==3.0.0 # via rich marshmallow==3.19.0 @@ -69,13 +69,13 @@ numpy==1.25.0 # langchain # numexpr openai==0.27.8 - # via -r requirements/requirements.in + # via -r requirements.in openapi-schema-pydantic==1.2.4 # via langchain packaging==23.1 # via marshmallow 
def test_parse_github_url():
    """HTTPS and SSH GitHub URLs yield (owner, repo); anything else raises ValueError."""
    accepted_urls = (
        "https://github.com/owner/repo.git",  # https URL
        "git@github.com:owner/repo.git",  # git (SSH) URL
    )
    for url in accepted_urls:
        owner, repo = Coder.parse_github_url(url)
        assert owner == "owner"
        assert repo == "repo"

    # Invalid URL
    with pytest.raises(ValueError):
        Coder.parse_github_url("not a valid url")