diff --git a/aicodebot/cli.py b/aicodebot/cli.py
index 56758b9..0a2d517 100644
--- a/aicodebot/cli.py
+++ b/aicodebot/cli.py
@@ -1,7 +1,8 @@
 from aicodebot import version as aicodebot_version
 from aicodebot.coder import CREATIVE_TEMPERATURE, DEFAULT_MAX_TOKENS, Coder
-from aicodebot.config import get_config_file, read_config
+from aicodebot.config import get_config_file, get_local_data_dir, read_config
 from aicodebot.helpers import RichLiveCallbackHandler, create_and_write_file, exec_and_get_output, logger
+from aicodebot.learn import load_documents_from_repo, store_documents
 from aicodebot.prompts import DEFAULT_PERSONALITY, PERSONALITIES, generate_files_context, get_prompt
 from langchain.chains import LLMChain
 from langchain.memory import ConversationTokenBufferMemory
@@ -352,6 +353,39 @@ def fun_fact(verbose, response_token_size):
         chain.run(f"programming and artificial intelligence in the year {year}")
 
 
+@cli.command
+@click.option("-v", "--verbose", count=True)
+@click.option("-r", "--repo-url", required=True, help="The URL of the repository to learn from")
+def learn(repo_url, verbose):
+    """Learn new skills and gain additional knowledge from a repository"""
+    # Clone the supplied repo locally, load its files, and store them in a
+    # local vector store, which can later be queried to give the LLM context
+    # for a prompt
+
+    setup_config()
+
+    # Only the repo name is used, to name the local directories
+    _owner, repo_name = Coder.parse_github_url(repo_url)
+
+    start_time = datetime.datetime.utcnow()
+
+    local_data_dir = get_local_data_dir()
+    repo_dir = local_data_dir / "repos" / repo_name
+    vector_store_dir = local_data_dir / "vector_stores" / repo_name
+
+    Coder.clone_repo(repo_url, repo_dir)
+    console.print("✅ Repo cloned.")
+
+    console.log("Loading documents")
+    documents = load_documents_from_repo(repo_dir)
+    console.print("✅ Repo loaded.")
+
+    with console.status("Storing the repo in the vector store", spinner=DEFAULT_SPINNER):
+        store_documents(documents, vector_store_dir)
+    elapsed = (datetime.datetime.utcnow() - start_time).total_seconds()
+    console.print(f"✅ Repo loaded and indexed in {elapsed:.1f} seconds.")
+
+
 @cli.command
 @click.option("-c", "--commit", help="The commit hash to review (otherwise look at [un]staged changes).")
 @click.option("-v", "--verbose", count=True)
@@ -478,6 +512,7 @@ def sidekick(request, verbose, response_token_size, files):
         with Live(Markdown(""), auto_refresh=True) as live:
             callback = RichLiveCallbackHandler(live, bot_style)
             llm.callbacks = [callback]  # a fresh callback handler for each question
+
             chain.run({"task": human_input, "context": context})
 
         if request:
@@ -497,6 +532,7 @@ def setup_config():
         configure.callback(openai_api_key=os.getenv("OPENAI_API_KEY"), verbose=0)
         sys.exit(0)
     else:
+        os.environ["OPENAI_API_KEY"] = existing_config["openai_api_key"]
         return existing_config
 
 
diff --git a/aicodebot/coder.py b/aicodebot/coder.py
index c05ab69..546cc72 100644
--- a/aicodebot/coder.py
+++ b/aicodebot/coder.py
@@ -3,7 +3,7 @@
 from langchain.chat_models import ChatOpenAI
 from openai.api_resources import engine
 from pathlib import Path
-import fnmatch, functools, openai, tiktoken
+import fnmatch, functools, openai, re, subprocess, tiktoken
 
 DEFAULT_MAX_TOKENS = 512
 PRECISE_TEMPERATURE = 0.05
@@ -16,6 +16,18 @@ class Coder:
     git, and the local file system.
     """
 
+    @staticmethod
+    def clone_repo(repo_url, repo_dir):
+        """Clone a git repository to a directory, or reset and pull it if the directory already exists."""
+        if Path(repo_dir).exists():
+            logger.info(f"Repo {repo_dir} already exists, updating it instead")
+            # Reset it first to make sure we don't have any local changes
+            subprocess.run(["git", "reset", "--hard"], cwd=repo_dir, check=True, stdout=subprocess.DEVNULL)
+            subprocess.run(["git", "pull"], cwd=repo_dir, check=True)
+        else:
+            logger.info(f"Cloning {repo_url} to {repo_dir}")
+            subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
+
     @classmethod
     def generate_directory_structure(cls, path, ignore_patterns=None, use_gitignore=True, indent=0):
         """Generate a text representation of the directory structure of a path."""
@@ -198,3 +210,24 @@ def git_staged_files():
     @staticmethod
     def git_unstaged_files():
         return exec_and_get_output(["git", "diff", "HEAD", "--name-only"]).splitlines()
+
+    @staticmethod
+    def parse_github_url(repo_url):
+        """
+        Parse a GitHub URL and return the owner and repo name.
+
+        Accepts https (https://github.com/owner/repo) and ssh
+        (git@github.com:owner/repo) URLs, with or without a ".git" suffix
+        or a trailing slash.
+
+        Returns: A tuple containing the owner and repo name.
+        Raises: ValueError if the URL is not a GitHub repository URL.
+        """
+        pattern = r"(?:https://github\.com/|git@github\.com:)([^/]+)/([^/]+?)(?:\.git)?/?$"
+        match = re.match(pattern, repo_url.strip())
+
+        if not match:
+            raise ValueError("URL is not a valid GitHub URL")
+
+        owner, repo = match.groups()
+        return owner, repo
diff --git a/aicodebot/config.py b/aicodebot/config.py
index 940fb0a..298dff2 100644
--- a/aicodebot/config.py
+++ b/aicodebot/config.py
@@ -3,6 +3,20 @@
 import os, yaml
 
 
+def get_local_data_dir():
+    """Return the local data directory, creating it and its subdirectories if needed."""
+    data_dir = Path(os.getenv("AICODEBOT_LOCAL_DATA_DIR", str(Path.home() / ".aicodebot_data")))
+    if not data_dir.exists():
+        logger.debug(f"Creating local data directory {data_dir}")
+
+    # Always make sure the subdirectories exist, even when the data directory
+    # itself was already there (e.g. a custom AICODEBOT_LOCAL_DATA_DIR)
+    for subdir in ("repos", "vector_stores"):
+        (data_dir / subdir).mkdir(parents=True, exist_ok=True)
+
+    return data_dir
+
+
 def get_config_file():
     return Path(os.getenv("AICODEBOT_CONFIG_FILE", str(Path.home() / ".aicodebot.yaml")))
 
diff --git a/aicodebot/learn.py b/aicodebot/learn.py
new file mode 100644
index 0000000..b6d0adf
--- /dev/null
+++ b/aicodebot/learn.py
@@ -0,0 +1,147 @@
+from aicodebot.config import get_local_data_dir
+from aicodebot.helpers import logger
+from git import Repo
+from langchain.document_loaders import GitLoader, NotebookLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter, Language, RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from pathlib import Path
+
+DEFAULT_EXCLUDE = [".csv", ".enex", ".json", ".jsonl"]
+
+
+def load_documents_from_repo(repo_dir, exclude=DEFAULT_EXCLUDE):
+    """Load the files of a local git repo as a list of langchain documents.
+
+    Empty files and files whose type is in `exclude` are skipped, and notebooks
+    are reloaded with NotebookLoader.
+
+    Raises: ValueError if the repo is bare or has no main or master branch.
+    """
+    repo_dir = Path(repo_dir)  # Also accept a plain string path
+    repo = Repo(repo_dir)
+    # Raise instead of assert, so the check is not stripped by `python -O`
+    if repo.bare:
+        raise ValueError(f"Repo {repo_dir} does not appear to be a valid git repository")
+
+    # Check main first, then master, then give up
+    for branch in ["main", "master"]:
+        if branch in repo.heads:
+            default_branch = branch
+            break
+    else:
+        raise ValueError(f"Repo {repo_dir} does not have a main or master branch")
+
+    loader = GitLoader(repo_path=repo_dir, branch=default_branch)
+
+    documents = loader.load()
+    logger.info(f"Loaded {len(documents)} documents from {repo_dir}")
+
+    # Clean up
+    cleaned = []
+    logger.info("Cleaning up documents")
+    for document in documents:
+        content = document.page_content
+        if not content:
+            logger.debug(f"Skipping empty file {document.metadata['file_path']}")
+            continue
+
+        file_type = document.metadata["file_type"].lower()
+        if file_type in exclude:
+            logger.debug(f"Skipping excluded file {document.metadata['file_path']}")
+            continue
+
+        # Reload notebooks
+        if file_type == ".ipynb":
+            logger.debug(f"Reloading notebook {document.metadata['file_path']}")
+            new_document = NotebookLoader(repo_dir / document.metadata["file_path"]).load()[0]
+            # Use the original metadata, because it contains file_type
+            new_document.metadata = document.metadata
+            cleaned.append(new_document)
+        else:
+            cleaned.append(document)
+
+    return cleaned
+
+
+def store_documents(documents, vector_store_dir):
+    """Store documents in a FAISS vector store saved under vector_store_dir.
+
+    If an index already exists there, it is loaded and returned as is, and
+    `documents` is not added to it.
+
+    Raises: ValueError if the documents produce no chunks to store.
+    """
+    vector_store_file = Path(vector_store_dir) / "faiss_index"
+    embeddings = OpenAIEmbeddings()
+    if vector_store_file.exists():
+        logger.info(f"Loading existing vector store {vector_store_file}")
+        return FAISS.load_local(vector_store_file, embeddings)
+
+    logger.info(f"Creating new vector store {vector_store_file}")
+
+    language_extension_map = {
+        ".py": Language.PYTHON,
+        ".ipynb": Language.PYTHON,
+        ".js": Language.JS,
+        ".ts": Language.JS,
+        ".html": Language.HTML,
+        ".md": Language.MARKDOWN,
+        ".mdx": Language.MARKDOWN,
+        ".go": Language.GO,
+        ".java": Language.JAVA,
+        ".c": Language.CPP,
+        ".cpp": Language.CPP,
+        ".php": Language.PHP,
+        ".rb": Language.RUBY,
+        ".xml": Language.HTML,
+    }
+
+    files = 0
+    chunks = []
+    for document in documents:
+        file_type = document.metadata["file_type"].lower()
+        files += 1
+
+        # Clean up
+        # Remove magic text that breaks processing
+        content = document.page_content.replace("<|end" + "of" + "text|>", "")  # noqa: ISC003
+
+        language = language_extension_map.get(file_type)
+        if language is not None:
+            # Use a recursive splitter for code files
+            logger.debug(f"Processing {document.metadata['file_path']} as {language.value} code")
+            # NOTE(review): 50 characters is a very small chunk size - confirm it is intentional
+            splitter = RecursiveCharacterTextSplitter.from_language(language=language, chunk_size=50, chunk_overlap=0)
+        else:
+            # TODO: Check if it's a text file
+            if file_type not in [".txt", ".md", ".yml", ".yaml"]:
+                logger.info(f"Processing {document.metadata['file_path']} as a text file")
+            splitter = CharacterTextSplitter(separator="\n", chunk_size=1_000, chunk_overlap=200)
+
+        # Keep the metadata, so each chunk can be traced back to its file
+        chunks += splitter.create_documents([content], metadatas=[document.metadata])
+
+    if not chunks:
+        # Nothing to embed, so fail here with a clear message
+        raise ValueError(f"No content to store in {vector_store_dir}")
+
+    logger.info(f"Storing {len(chunks)} chunks from {files} files in {vector_store_dir}")
+    vector_store = FAISS.from_documents(chunks, embeddings)
+    vector_store.save_local(vector_store_file)
+    return vector_store
+
+
+def load_learned_repo(repo_name):
+    """Load the vector store of a repo that was stored with the learn command.
+
+    Raises: ValueError if there is no vector store for repo_name.
+    """
+    vector_store_file = get_local_data_dir() / "vector_stores" / repo_name / "faiss_index"
+    if not vector_store_file.exists():
+        raise ValueError(
+            f"Vector store for {repo_name} does not exist. Please run `aicodebot learn --repo-url $githuburl` first."
+        )
+
+    embeddings = OpenAIEmbeddings()
+    return FAISS.load_local(vector_store_file, embeddings)
diff --git a/requirements/requirements.in b/requirements/requirements.in
index 6bb3e16..7888e20 100644
--- a/requirements/requirements.in
+++ b/requirements/requirements.in
@@ -7,6 +7,7 @@
 
 beautifulsoup4 # needed by langchain
 click # command line interface helpers
+faiss-cpu
 GitPython
 langchain
 loguru
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index ef2c016..756a65d 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile
+#    pip-compile requirements.in
 #
 aiohttp==3.8.4
     # via
@@ -15,7 +15,7 @@ async-timeout==4.0.2
 attrs==23.1.0
     # via aiohttp
 beautifulsoup4==4.12.2
-    # via -r requirements/requirements.in
+    # via -r requirements.in
 certifi==2023.5.7
     # via requests
 charset-normalizer==3.1.0
@@ -23,9 +23,11 @@ charset-normalizer==3.1.0
     #   aiohttp
     #   requests
 click==8.1.4
-    # via -r requirements/requirements.in
+    # via -r requirements.in
 dataclasses-json==0.5.8
     # via langchain
+faiss-cpu==1.7.4
+    # via -r requirements.in
 frozenlist==1.3.3
     # via
     #   aiohttp
@@ -33,19 +35,17 @@ frozenlist==1.3.3
 gitdb==4.0.10
     # via gitpython
 gitpython==3.1.32
-    # via -r requirements/requirements.in
-greenlet==2.0.2
-    # via sqlalchemy
+    # via -r requirements.in
 idna==3.4
     # via
     #   requests
     #   yarl
 langchain==0.0.231
-    # via -r requirements/requirements.in
+    # via -r requirements.in
 langchainplus-sdk==0.0.20
     # via langchain
 loguru==0.7.0
-    # via -r requirements/requirements.in
+    # via -r requirements.in
 markdown-it-py==3.0.0
     # via rich
 marshmallow==3.19.0
@@ -69,13 +69,13 @@ numpy==1.25.0
     #   langchain
     #   numexpr
 openai==0.27.8
-    # via -r requirements/requirements.in
+    # via -r requirements.in
 openapi-schema-pydantic==1.2.4
     # via langchain
 packaging==23.1
     # via marshmallow
prompt-toolkit==3.0.39 - # via -r requirements/requirements.in + # via -r requirements.in pydantic==1.10.9 # via # langchain @@ -85,7 +85,7 @@ pygments==2.15.1 # via rich pyyaml==6.0 # via - # -r requirements/requirements.in + # -r requirements.in # langchain regex==2023.6.3 # via tiktoken @@ -96,7 +96,7 @@ requests==2.31.0 # openai # tiktoken rich==13.4.2 - # via -r requirements/requirements.in + # via -r requirements.in smmap==5.0.0 # via gitdb soupsieve==2.4.1 @@ -108,7 +108,7 @@ tenacity==8.2.2 # langchain # langchainplus-sdk tiktoken==0.4.0 - # via -r requirements/requirements.in + # via -r requirements.in tqdm==4.65.0 # via openai typing-extensions==4.6.3 diff --git a/tests/test_coder.py b/tests/test_coder.py index d5221eb..f0dff81 100644 --- a/tests/test_coder.py +++ b/tests/test_coder.py @@ -1,6 +1,6 @@ from aicodebot.coder import Coder from aicodebot.helpers import create_and_write_file -import os +import os, pytest def test_generate_directory_structure(tmp_path): @@ -113,3 +113,18 @@ def test_git_diff_context(temp_git_repo): commit = temp_git_repo.head.commit.hexsha diff = Coder.git_diff_context(commit) assert "renamedfile.txt" in diff + + def test_parse_github_url(): + # Test with https URL + owner, repo = Coder.parse_github_url("https://github.com/owner/repo.git") + assert owner == "owner" + assert repo == "repo" + + # Test with git URL + owner, repo = Coder.parse_github_url("git@github.com:owner/repo.git") + assert owner == "owner" + assert repo == "repo" + + # Test with invalid URL + with pytest.raises(ValueError): + Coder.parse_github_url("not a valid url")