Skip to content

Commit

Permalink
chore: Update Python2To3Migrator to only migrate .py files
Browse files Browse the repository at this point in the history
  • Loading branch information
MS-elug committed Jun 5, 2024
1 parent f6fdaff commit 706f6ad
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 45 deletions.
4 changes: 0 additions & 4 deletions action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@
name: 'TED run'
description: 'Run TED on a given repo'
inputs:
git-repo: # repo to work on
description: 'Git Repository'
required: true
default: ''
git-branch: # branch to work on
description: 'Git Branch'
required: false
Expand Down
11 changes: 4 additions & 7 deletions services/Python2To3Migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def run_generation(self, retriever, llm, output_parser, clone_dir) -> None:
answer = chain.invoke("Give me the list of files that should be modified to migrate from Python 2 to Python 3.")

print("-------------------------------------------------\n")
print(answer)
print(f"🆗: {answer}")
parsed = re.search('```json\n([\\w\\W]*?)\n```', answer)

if parsed is not None:
Expand All @@ -84,14 +84,11 @@ def run_generation(self, retriever, llm, output_parser, clone_dir) -> None:
else:
print("File parsing failure")
else:
print("Answer parsing failure")
print("🆘 Answer parsing failure")

def get_file_extensions(self) -> List[str]:
return [".py", ".md", ".txt"]

def get_file_glob(self) -> str:
return "**/*"

return [".py"]

def get_text_format(self) -> Language:
return Language.PYTHON

Expand Down
4 changes: 0 additions & 4 deletions services/TEDGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ def get_file_extensions(self) -> List[str]:
"""Returns the file extension repo code to take into account."""
pass

def get_file_glob(self) -> str:
"""Returns the glob to match files in repo code."""
pass

def get_text_format(self) -> Language:
"""Returns the text format of the generator."""
pass
Expand Down
3 changes: 0 additions & 3 deletions services/UnitTestsGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,6 @@ class <className><suffix>Test {{
def get_file_extensions(self) -> List[str]:
return [".java"]

def get_file_glob(self) -> str:
return "**/*.java"

def get_text_format(self) -> Language:
return Language.JAVA

Expand Down
81 changes: 58 additions & 23 deletions ted.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import argparse
from uuid import uuid1
from dotenv import load_dotenv
from pprint import pprint
from git import Repo

from helpers.GitHelper import GitHelper
from services.UnitTestsGenerator import UnitTestsGenerator
Expand All @@ -10,7 +12,10 @@
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.document_loaders import GitLoader, DirectoryLoader, TextLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.parsers.txt import TextParser

def main():

Expand Down Expand Up @@ -50,47 +55,77 @@ def main():
)

output_parser = StrOutputParser()

loaders = []
# Check if git_url exists
clone_path = None
if git_url:
print(f"Loaded clones repository from URL: {git_url}")
clone_path="./clone/"
loader = GitLoader(
clone_url=git_url,
repo_path=clone_path,
branch=branch,
file_filter=lambda file_path: filter_files(file_path, generator.get_file_extensions())
)
else:

if(os.path.exists(clone_path)):
print("Repository already cloned. Skip cloning.")
else:
print(f"Clone repository from {git_url} to {clone_path}")
Repo.clone_from(
url=git_url,
single_branch=True,
depth=1,
to_path=clone_path,
branch=branch,
)

if(not clone_path and os.getenv('GITHUB_WORKSPACE')):
clone_path=os.getenv('GITHUB_WORKSPACE')
print(f"Loader uses from github workspace: {clone_path}")
loader = DirectoryLoader(
path=clone_path,
glob=generator.get_file_glob(), # @TODO Find a way to use glob with extensions: "**/*{" +",".join(generator.getFileExtensions()) + "}",
exclude=["**/*.yml", "**/*.json"],
show_progress=True,
loader_cls=TextLoader
)

text_format = generator.get_text_format()

print("Load documents")
docs = loader.load()
if(not clone_path):
print("No git repository provided.")
return

print(f'📂 Load documents from {clone_path}')
loaders.append(GenericLoader.from_filesystem(
clone_path,
glob="*",
#suffixes= generator.get_file_extensions(),
#parser=LanguageParser(generator.get_text_format()),
suffixes= [".py"],
parser=LanguageParser(generator.get_text_format()),
))
loaders.append(GenericLoader.from_filesystem(
clone_path,
glob="*",
suffixes= [".txt", ".md", "Dockerfile"],
parser= TextParser()
))

# Load all documents
docs = []
for loader in loaders:
docs.extend(loader.load())

# Add a new document to docs array
# docs.append(TextLoader("text", "This is a test document"))

# if zero docs stop
if len(docs) == 0:
print("No documents found.")
print("📄 No documents found.")
return

print(f"📄 Found {len(docs)} documents")

for document in docs:
pprint(document)


text_format = generator.get_text_format()
print("Using language splitter {}.".format(text_format))
text_splitter = RecursiveCharacterTextSplitter.from_language(
language=text_format ,chunk_size=2000, chunk_overlap=200
)

print("Split documents")
texts = text_splitter.split_documents(docs)


print(f"📄 Generated {len(texts)} chuncks for {len(docs)} documents")
print("Create embeddings and vector store")
embedding = AzureOpenAIEmbeddings(
# keys and endpoint are read from the .env file
Expand All @@ -102,7 +137,7 @@ def main():
)
retriever = vector_store.as_retriever()

print("😀 Run generation")
print("πŸ” Run generation")
generator.run_generation(retriever, llm, output_parser, clone_path)

if(push and github_repository and branch and github_token):
Expand Down
4 changes: 0 additions & 4 deletions tests/services/test_Generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ def test_get_file_extensions(self):
ted_generator = TEDGenerator()
self.assertIsNone(ted_generator.get_file_extensions())

def test_get_file_glob(self):
ted_generator = TEDGenerator()
self.assertIsNone(ted_generator.get_file_glob())

def test_get_text_format(self):
ted_generator = TEDGenerator()
self.assertIsNone(ted_generator.get_text_format())
Expand Down

0 comments on commit 706f6ad

Please sign in to comment.