Skip to content

Commit

Permalink
feat: add deep scraper implementation
Browse files Browse the repository at this point in the history
Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com>
  • Loading branch information
VinciGit00 and vedovati-matteo committed Oct 3, 2024
1 parent 17c5145 commit 4b371f4
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

"""
depth_search_graph_openai example
"""
from scrapegraphai.graphs import DepthSearchGraph

graph_config = {
Expand All @@ -19,4 +21,4 @@
)

result = search_graph.run()
print(result)
print(result)
62 changes: 52 additions & 10 deletions scrapegraphai/graphs/depth_search_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,18 @@
from ..utils.save_code_to_file import save_code_to_file
from ..nodes import (
FetchNodeLevelK,
ParseNodeDepthK
ParseNodeDepthK,
DescriptionNode,
RAGNode,
GenerateAnswerNodeKLevel
)

class DepthSearchGraph(AbstractGraph):
"""
CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for
extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
DepthSearchGraph is a scraping pipeline that fetches pages
up to a configurable depth (level K), parses the fetched
documents, generates descriptions for them, indexes the
results in a vector database, and answers the user prompt via RAG.
It requires a user prompt, a source URL, and an output schema.
Attributes:
Expand Down Expand Up @@ -60,7 +65,7 @@ def _create_graph(self) -> BaseGraph:
BaseGraph: A graph instance representing the web scraping workflow.
"""

fetch_node = FetchNodeLevelK(
fetch_node_k = FetchNodeLevelK(
input="url| local_dir",
output=["docs"],
node_config={
Expand All @@ -72,24 +77,61 @@ def _create_graph(self) -> BaseGraph:
"only_inside_links": self.config.get("only_inside_links", False)
}
)
parse_node = ParseNodeDepthK(

parse_node_k = ParseNodeDepthK(
input="docs",
output=["docs"],
node_config={
"verbose": self.config.get("verbose", False)
}
)

description_node = DescriptionNode(
input="docs",
output=["docs"],
node_config={
"llm_model": self.llm_model,
"verbose": self.config.get("verbose", False),
"cache_path": self.config.get("cache_path", False)
}
)

rag_node = RAGNode (
input="docs",
output=["vectorial_db"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.config.get("embedder_model", False),
"verbose": self.config.get("verbose", False),
}
)

generate_answer_k = GenerateAnswerNodeKLevel(
input="vectorial_db",
output=["answer"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.config.get("embedder_model", False),
"verbose": self.config.get("verbose", False),
}

)

return BaseGraph(
nodes=[
fetch_node,
parse_node
fetch_node_k,
parse_node_k,
description_node,
rag_node,
generate_answer_k
],
edges=[
(fetch_node, parse_node),
(fetch_node_k, parse_node_k),
(parse_node_k, description_node),
(description_node, rag_node),
(rag_node, generate_answer_k)
],
entry_point=fetch_node,
entry_point=fetch_node_k,
graph_name=self.__class__.__name__
)

Expand Down
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/description_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,11 @@ def __init__(
input: str,
output: List[str],
node_config: Optional[dict] = None,
node_name: str = "RAG",
node_name: str = "DESCRIPTION",
):
super().__init__(node_name, "node", input, output, 2, node_config)

self.llm_model = node_config["llm_model"]
self.embedder_model = node_config.get("embedder_model", None)
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
Expand Down

0 comments on commit 4b371f4

Please sign in to comment.