Commit

Merge pull request #787 from ScrapeGraphAI/786-csr-client-side-rendering-web-pages-dont-work

feat: update chromium
VinciGit00 authored Nov 6, 2024
2 parents 65add5f + 38c6dd2 commit a12c83f
Showing 1 changed file with 14 additions and 9 deletions.
scrapegraphai/docloaders/chromium.py (23 changes: 14 additions & 9 deletions)
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")

 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.

     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """

     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.

         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.

         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support

     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
         Yields:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
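
A rough sketch of consuming the loader from the earlier example; with requires_js_support=True, both paths now dispatch to ascrape_with_js_support rather than the backend-named scraper. The metadata key follows the "source URL as metadata" note in the docstring above; the iteration pattern is standard LangChain BaseLoader usage, not something added by this commit:

    import asyncio

    # Synchronous path: lazy_load() drives each URL through the JS-aware
    # scraper, calling asyncio.run() internally per URL.
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))

    # Asynchronous path: alazy_load() scrapes every URL concurrently with
    # asyncio.gather() and then yields the resulting Documents.
    async def consume():
        async for doc in loader.alazy_load():
            print(doc.metadata["source"])

    asyncio.run(consume())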
