From ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 23 Oct 2024 12:08:00 +0200
Subject: [PATCH] feat: add integration with scrape.do

---
 scrapegraphai/nodes/fetch_node.py         | 6 +++---
 scrapegraphai/nodes/fetch_node_level_k.py | 6 ++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 4cd549a5..d90864e9 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -270,10 +270,10 @@ def handle_web_source(self, state, source):
         else:
             loader_kwargs = {}
 
-            if self.node_config is not None:
+            if self.node_config:
                 loader_kwargs = self.node_config.get("loader_kwargs", {})
 
-            if self.browser_base is not None:
+            if self.browser_base:
                 try:
                     from ..docloaders.browser_base import browser_base_fetch
                 except ImportError:
@@ -285,7 +285,7 @@ def handle_web_source(self, state, source):
                 document = [Document(page_content=content,
                                      metadata={"source": source}) for content in data]
 
-            elif self.scrape_do is not None:
+            elif self.scrape_do:
                 from ..docloaders.scrape_do import scrape_do_fetch
                 if (self.scrape_do.get("use_proxy") is None) or \
                     self.scrape_do.get("geoCode") is None or \
diff --git a/scrapegraphai/nodes/fetch_node_level_k.py b/scrapegraphai/nodes/fetch_node_level_k.py
index 0f772edf..ce8e4042 100644
--- a/scrapegraphai/nodes/fetch_node_level_k.py
+++ b/scrapegraphai/nodes/fetch_node_level_k.py
@@ -57,6 +57,7 @@ def __init__(
         self.headless = node_config.get("headless", True) if node_config else True
         self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
         self.browser_base = node_config.get("browser_base", None)
+        self.scrape_do = node_config.get("scrape_do", None)
         self.depth = node_config.get("depth", 1) if node_config else 1
         self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
         self.min_input_len = 1
@@ -115,6 +116,11 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
                                       self.browser_base.get("project_id"), [source])
             document = [Document(page_content=content,
                                  metadata={"source": source}) for content in data]
+        elif self.scrape_do:
+            from ..docloaders.scrape_do import scrape_do_fetch
+            data = scrape_do_fetch(self.scrape_do.get("api_key"), source)
+            document = [Document(page_content=data,
+                                 metadata={"source": source})]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
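
Usage note (not part of the patch): a minimal sketch of how the new
scrape_do option might be wired up, assuming the graph forwards a
top-level "scrape_do" dict into node_config the same way it already does
for "browser_base". The key names come from the diff (api_key is read in
fetch_node_level_k.py; use_proxy, geoCode, and super_proxy are checked in
fetch_node.py); the values and the FetchNode call below are illustrative
only.

    from scrapegraphai.nodes import FetchNode

    node_config = {
        "headless": True,
        # scrape.do settings consumed by the patched fetch nodes
        "scrape_do": {
            "api_key": "YOUR_SCRAPE_DO_API_KEY",  # passed to scrape_do_fetch
            "use_proxy": False,                   # checked in fetch_node.py
            "geoCode": None,                      # checked in fetch_node.py
            "super_proxy": False,                 # checked in fetch_node.py
        },
    }

    fetch_node = FetchNode(
        input="url | local_dir",
        output=["doc"],
        node_config=node_config,
    )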