Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add GitLoader Component with advanced filtering options #2850

Merged
merged 14 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions src/backend/base/langflow/components/documentloaders/GitLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from pathlib import Path
from typing import List
import re

from langchain_community.document_loaders.git import GitLoader
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.schema import Data


class GitLoaderComponent(Component):
    """Langflow component that loads files from a Git repository as ``Data``.

    Wraps LangChain's ``GitLoader`` and adds optional filtering by file name
    (comma-separated glob patterns, with a leading ``!`` marking an exclusion)
    and by file content (a regular expression). Files that contain a null byte
    in their first 1024 bytes are treated as binary and always skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Return True if the file looks binary.

        A file is considered binary when a null byte appears in its first
        1024 bytes. Binary files must be ignored because the content filter
        performs a text search over file contents.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            True if a null byte is found in the first 1024 bytes.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from the component inputs.

        The loader receives a single combined filter that rejects binary
        files and then applies the optional file-name and content filters.

        File-name patterns are comma separated. A pattern starting with ``!``
        excludes matching files; any other pattern includes them. A file is
        kept when it matches no exclusion pattern and either matches at least
        one inclusion pattern or no inclusion patterns were given at all.

        Returns:
            The configured ``GitLoader`` instance.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        file_filters = []
        if file_filter_patterns:
            include_patterns: List[str] = []
            exclude_patterns: List[str] = []
            for raw_pattern in file_filter_patterns.split(","):
                pattern = raw_pattern.strip()
                if pattern.startswith("!"):
                    pattern = pattern[1:].strip()
                    target = exclude_patterns
                else:
                    target = include_patterns
                # Skip empty entries (e.g. a trailing comma or a lone "!"):
                # Path.match raises ValueError on an empty pattern.
                if pattern:
                    target.append(pattern)

            if include_patterns or exclude_patterns:

                def matches_file_patterns(file_path: Path) -> bool:
                    if any(file_path.match(pattern) for pattern in exclude_patterns):
                        return False
                    # With exclusion-only filters every non-excluded file is kept;
                    # previously several "!" patterns rejected all files.
                    if not include_patterns:
                        return True
                    return any(file_path.match(pattern) for pattern in include_patterns)

                file_filters.append(matches_file_patterns)

        if content_filter_pattern:
            content_regex = re.compile(content_filter_pattern)

            def matches_content(file_path: Path) -> bool:
                # Undecodable bytes are dropped so the regex search never
                # fails on encoding errors.
                with file_path.open("r", encoding="utf-8", errors="ignore") as file:
                    content = file.read()
                return bool(content_regex.search(content))

            file_filters.append(matches_content)

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            if self.is_binary(file_path):
                return False
            return all(f(path) for f in file_filters)

        loader = GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )
        return loader

    def load_documents(self) -> List[Data]:
        """Load the repository files and convert them to ``Data`` objects.

        Also stores the resulting list on ``self.status``.

        Returns:
            One ``Data`` object per document yielded by the loader.
        """
        gitloader = self.build_gitloader()
        documents = list(gitloader.lazy_load())
        data = [Data.from_document(doc) for doc in documents]
        self.status = data
        return data
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .Confluence import ConfluenceComponent
from .GitLoader import GitLoaderComponent

__all__ = ["ConfluenceComponent"]
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
1 change: 1 addition & 0 deletions src/frontend/src/icons/GitLoader/Git.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 22 additions & 0 deletions src/frontend/src/icons/GitLoader/GitLoader.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
const GitLoaderIcon = (props) => (
<svg
xmlns="http://www.w3.org/2000/svg"
width="32"
height="32"
viewBox="0 0 32 32"
fill="none"
{...props}
>
<path
d="M31.349 14.191L17.451.293a1.938 1.938 0 0 0-2.738 0L11.618 3.39l3.47 3.47a2.311 2.311 0 0 1 2.377.554 2.31 2.31 0 0 1 .549 2.392l3.36 3.359a2.31 2.31 0 0 1 2.393.55 2.311 2.311 0 0 1 0 3.27 2.312 2.312 0 0 1-3.271 0 2.309 2.309 0 0 1-.501-2.511l-3.12-3.12V20.24a2.31 2.31 0 0 1 .611 3.701 2.31 2.31 0 0 1-3.27 0 2.31 2.31 0 0 1 0-3.27 2.324 2.324 0 0 1 .759-.509V11.925a2.35 2.35 0 0 1-1.27-3.082L9.747 4.741 1.73 12.758a1.938 1.938 0 0 0 0 2.737L14.628 28.393a1.938 1.938 0 0 0 2.737 0l13.372-13.371a1.938 1.938 0 0 0 0-2.738"
style={{
stroke: "none",
fillRule: "nonzero",
fill: "#f03c2e",
fillOpacity: 1,
}}
/>
</svg>
);

export default GitLoaderIcon;
9 changes: 9 additions & 0 deletions src/frontend/src/icons/GitLoader/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import React, { forwardRef } from "react";
import SvgGitLoader from "./GitLoader";

/**
 * Ref-forwarding wrapper around the GitLoader SVG component.
 *
 * Passes the received ref and all props straight through to SvgGitLoader.
 */
export const GitLoaderIcon = forwardRef<
  SVGSVGElement,
  React.PropsWithChildren<{}>
>((props, ref) => <SvgGitLoader ref={ref} {...props} />);
2 changes: 2 additions & 0 deletions src/frontend/src/utils/styleUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ import { EvernoteIcon } from "../icons/Evernote";
import { FBIcon } from "../icons/FacebookMessenger";
import { FirecrawlIcon } from "../icons/Firecrawl";
import { GitBookIcon } from "../icons/GitBook";
import { GitLoaderIcon } from "../icons/GitLoader";
import { GoogleIcon } from "../icons/Google";
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
import {
Expand Down Expand Up @@ -586,4 +587,5 @@ export const nodeIconsLucide: iconsType = {
Table: Table,
AIML: AIMLIcon,
"AI/ML": AIMLIcon,
GitLoader: GitLoaderIcon,
};
14 changes: 6 additions & 8 deletions src/frontend/tests/end-to-end/promptModalComponent.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,8 @@ test("PromptTemplateComponent", async ({ page }) => {
}

value =
(await page
.locator('//*[@id="textarea_str_edit_prompt1"]')
.inputValue()) ?? "";
(await page.locator('//*[@id="textarea_str_edit_prompt1"]').inputValue()) ??
"";

if (value != "prompt_name_test_123123!@#!@#") {
expect(false).toBeTruthy();
Expand All @@ -126,14 +125,14 @@ test("PromptTemplateComponent", async ({ page }) => {
expect(false).toBeTruthy();
}

await page.getByTestId('textarea_str_edit_prompt1-ExternalLink').click();
await page.getByTestId("textarea_str_edit_prompt1-ExternalLink").click();
await page
.getByTestId("text-area-modal")
.fill("prompt_edit_test_12312312321!@#$");

await page.getByText("Finish Editing", { exact: true }).click();

await page.getByTestId('textarea_str_edit_prompt-ExternalLink').click();
await page.getByTestId("textarea_str_edit_prompt-ExternalLink").click();
await page
.getByTestId("text-area-modal")
.fill("prompt_edit_test_44444444444!@#$");
Expand Down Expand Up @@ -194,9 +193,8 @@ test("PromptTemplateComponent", async ({ page }) => {
}

value =
(await page
.locator('//*[@id="textarea_str_edit_prompt1"]')
.inputValue()) ?? "";
(await page.locator('//*[@id="textarea_str_edit_prompt1"]').inputValue()) ??
"";

if (value != "prompt_edit_test_12312312321!@#$") {
expect(false).toBeTruthy();
Expand Down
Loading