feat: add integration with scrape.do
VinciGit00 committed Oct 23, 2024
1 parent 5002c71 commit ae275ec
Showing 2 changed files with 9 additions and 3 deletions.
6 changes: 3 additions & 3 deletions scrapegraphai/nodes/fetch_node.py
@@ -270,10 +270,10 @@ def handle_web_source(self, state, source):
         else:
             loader_kwargs = {}
 
-        if self.node_config is not None:
+        if self.node_config:
             loader_kwargs = self.node_config.get("loader_kwargs", {})
 
-        if self.browser_base is not None:
+        if self.browser_base:
             try:
                 from ..docloaders.browser_base import browser_base_fetch
             except ImportError:
@@ -285,7 +285,7 @@ def handle_web_source(self, state, source):
 
             document = [Document(page_content=content,
                                  metadata={"source": source}) for content in data]
-        elif self.scrape_do is not None:
+        elif self.scrape_do:
             from ..docloaders.scrape_do import scrape_do_fetch
             if (self.scrape_do.get("use_proxy") is None) or \
                 self.scrape_do.get("geoCode") is None or \
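For orientation, a hedged usage sketch of the code path this hunk adds. Only the "scrape_do", "api_key", "use_proxy", and "geoCode" names are grounded in the diff above; the graph_config shape and the SmartScraperGraph entry point are assumed from the library's typical examples and may differ.

# Hypothetical usage sketch: graph_config shape assumed, not taken from
# this commit; only the "scrape_do" keys appear in the diff above.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {"api_key": "YOUR_LLM_KEY", "model": "openai/gpt-4o-mini"},
    "scrape_do": {
        "api_key": "YOUR_SCRAPE_DO_TOKEN",  # forwarded to scrape_do_fetch
        "use_proxy": False,                 # checked by handle_web_source
        "geoCode": None,                    # checked by handle_web_source
    },
}

graph = SmartScraperGraph(
    prompt="Extract the page title",
    source="https://example.com",
    config=graph_config,
)
print(graph.run())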
6 changes: 6 additions & 0 deletions scrapegraphai/nodes/fetch_node_level_k.py
@@ -57,6 +57,7 @@ def __init__(
         self.headless = node_config.get("headless", True) if node_config else True
         self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
         self.browser_base = node_config.get("browser_base", None)
+        self.scrape_do = node_config.get("scrape_do", None)
         self.depth = node_config.get("depth", 1) if node_config else 1
         self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
         self.min_input_len = 1
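A minimal sketch of the node_config this constructor now reads; every key below appears in the hunk above, and only "scrape_do" is new in this commit. The sketch assumes node_config is a dict, since the new line (like the browser_base one) calls .get() on it without the "if node_config else ..." guard its neighbors use.

# Hypothetical node_config for FetchNodeLevelK after this commit; all
# key names come from the constructor lines above.
node_config = {
    "headless": True,            # run the Chromium loader headless
    "loader_kwargs": {},         # extra kwargs forwarded to the loader
    "scrape_do": {               # new: routes fetch_content through scrape.do
        "api_key": "YOUR_SCRAPE_DO_TOKEN",
    },
    "depth": 2,                  # crawl depth for level-k fetching
    "only_inside_links": False,  # whether to follow only same-site links
}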
@@ -115,6 +116,11 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
                 self.browser_base.get("project_id"), [source])
             document = [Document(page_content=content,
                                  metadata={"source": source}) for content in data]
+        elif self.scrape_do:
+            from ..docloaders.scrape_do import scrape_do_fetch
+            data = scrape_do_fetch(self.scrape_do.get("api_key"), source)
+            document = [Document(page_content=data,
+                                 metadata={"source": source})]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
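Both files import scrape_do_fetch from ..docloaders.scrape_do, a module added by this commit but not shown in this diff. As a rough, non-authoritative sketch, a minimal fetcher against scrape.do's public HTTP API (token and url query parameters) could look like the following; the real module's signature and its proxy/geoCode handling may differ.

# Hypothetical sketch of a scrape.do fetcher; not the actual contents
# of scrapegraphai/docloaders/scrape_do.py.
import urllib.parse

import requests

def scrape_do_fetch(token: str, target_url: str) -> str:
    """Fetch target_url through the scrape.do API and return the raw HTML."""
    encoded_url = urllib.parse.quote(target_url, safe="")
    api_url = f"https://api.scrape.do/?token={token}&url={encoded_url}"
    response = requests.get(api_url, timeout=60)
    response.raise_for_status()
    return response.text

Returning a plain string matches the call site in fetch_content above, which wraps the result in a single Document.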
