Skip to content

Commit

Permalink
refactor(document_loaders): abstract page evaluation logic in Playwri…
Browse files Browse the repository at this point in the history
…ghtURLLoader (#9995)

This PR brings structural updates to `PlaywrightURLLoader`, aiming at
making the code more readable and extensible through the abstraction of
page evaluation logic. These changes also align this implementation with
a similar structure used in LangChain.js.

The key enhancements include:

1. Introduction of 'PlaywrightEvaluator', an abstract base class for all
evaluators.
2. Creation of 'UnstructuredHtmlEvaluator', a concrete class
implementing 'PlaywrightEvaluator', which uses the `unstructured` library
to process the page's HTML content.
3. Extension of 'PlaywrightURLLoader' constructor to optionally accept
an evaluator of the type 'PlaywrightEvaluator'. It defaults to
'UnstructuredHtmlEvaluator' if no evaluator is provided.
4. Refactoring of 'load' and 'aload' methods to use the 'evaluate' and
'evaluate_async' methods of the provided 'PlaywrightEvaluator' for page
content handling.

This update brings flexibility to 'PlaywrightURLLoader' as it can now
utilize different evaluators for page processing depending on the
requirement. The abstraction also improves code maintainability and
readability.

Twitter: @ywkim
  • Loading branch information
baskaryan authored Aug 31, 2023
2 parents 13fef1e + 6da1583 commit 6b5a970
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 35 deletions.
140 changes: 105 additions & 35 deletions libs/langchain/langchain/document_loaders/url_playwright.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,104 @@
"""Loader that uses Playwright to load a page, then uses unstructured to load the html.
"""
import logging
from typing import List, Optional
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.sync_api import Browser, Page, Response


logger = logging.getLogger(__name__)


class PlaywrightEvaluator(ABC):
    """Interface for converting a loaded Playwright page into text.

    An implementation is handed the page, the browser instance, and the
    response object, processes the page as necessary, and returns the
    resulting text. Both a synchronous and an asynchronous entry point
    must be implemented by subclasses.
    """

    @abstractmethod
    def evaluate(
        self,
        page: "Page",
        browser: "Browser",
        response: "Response",
    ) -> str:
        """Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(
        self,
        page: "AsyncPage",
        browser: "AsyncBrowser",
        response: "AsyncResponse",
    ) -> str:
        """Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """


class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Evaluates the page HTML content using the `unstructured` library."""

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Initialize UnstructuredHtmlEvaluator.

        Args:
            remove_selectors: Optional selectors. Visible elements matching
                any of them are removed from the page before its HTML is
                parsed.

        Raises:
            ImportError: If the `unstructured` package is not installed.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as e:
            # Chain the original error so the underlying import failure
            # stays visible in the traceback.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e

        self.remove_selectors = remove_selectors

    @staticmethod
    def _html_to_text(page_source: str) -> str:
        """Partition HTML with `unstructured` and join the element texts."""
        from unstructured.partition.html import partition_html

        return "\n\n".join(str(el) for el in partition_html(text=page_source))

    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Synchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text of the partitioned HTML elements, separated by blank
            lines.
        """
        for selector in self.remove_selectors or []:
            # Locator.all() hands back positional locators that are resolved
            # again on every use. Removing matches front-to-back would shift
            # the later ones and skip elements, so walk the list in reverse.
            for element in reversed(page.locator(selector).all()):
                if element.is_visible():
                    element.evaluate("element => element.remove()")

        return self._html_to_text(page.content())

    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Asynchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text of the partitioned HTML elements, separated by blank
            lines.
        """
        for selector in self.remove_selectors or []:
            # Reverse order for the same reason as in evaluate(): removing an
            # element must not change which element a pending locator hits.
            for element in reversed(await page.locator(selector).all()):
                if await element.is_visible():
                    await element.evaluate("element => element.remove()")

        return self._html_to_text(await page.content())


class PlaywrightURLLoader(BaseLoader):
"""Load `HTML` pages with `Playwright` and parse with `Unstructured`.
Expand All @@ -26,8 +116,9 @@ def __init__(
continue_on_failure: bool = True,
headless: bool = True,
remove_selectors: Optional[List[str]] = None,
evaluator: Optional[PlaywrightEvaluator] = None,
):
"""Load a list of URLs using Playwright and unstructured."""
"""Load a list of URLs using Playwright."""
try:
import playwright # noqa:F401
except ImportError:
Expand All @@ -36,18 +127,17 @@ def __init__(
"`pip install playwright`"
)

try:
import unstructured # noqa:F401
except ImportError:
raise ImportError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)

self.urls = urls
self.continue_on_failure = continue_on_failure
self.headless = headless
self.remove_selectors = remove_selectors

if remove_selectors and evaluator:
raise ValueError(
"`remove_selectors` and `evaluator` cannot be both not None"
)

# Use the provided evaluator, if any, otherwise, use the default.
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)

def load(self) -> List[Document]:
"""Load the specified URLs using Playwright and create Document instances.
Expand All @@ -56,7 +146,6 @@ def load(self) -> List[Document]:
List[Document]: A list of Document instances with loaded content.
"""
from playwright.sync_api import sync_playwright
from unstructured.partition.html import partition_html

docs: List[Document] = list()

Expand All @@ -65,17 +154,8 @@ def load(self) -> List[Document]:
for url in self.urls:
try:
page = browser.new_page()
page.goto(url)

for selector in self.remove_selectors or []:
elements = page.locator(selector).all()
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")

page_source = page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
response = page.goto(url)
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
except Exception as e:
Expand All @@ -96,7 +176,6 @@ async def aload(self) -> List[Document]:
List[Document]: A list of Document instances with loaded content.
"""
from playwright.async_api import async_playwright
from unstructured.partition.html import partition_html

docs: List[Document] = list()

Expand All @@ -105,17 +184,8 @@ async def aload(self) -> List[Document]:
for url in self.urls:
try:
page = await browser.new_page()
await page.goto(url)

for selector in self.remove_selectors or []:
elements = await page.locator(selector).all()
for element in elements:
if await element.is_visible():
await element.evaluate("element => element.remove()")

page_source = await page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
response = await page.goto(url)
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@
"""Tests for the Playwright URL loader"""
from typing import TYPE_CHECKING

import pytest

from langchain.document_loaders import PlaywrightURLLoader
from langchain.document_loaders.url_playwright import PlaywrightEvaluator

if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.sync_api import Browser, Page, Response


class TestEvaluator(PlaywrightEvaluator):
    """Stub evaluator for tests: ignores its inputs and returns fixed text."""

    def evaluate(
        self,
        page: "Page",
        browser: "Browser",
        response: "Response",
    ) -> str:
        """Return the fixed marker text without touching the page."""
        return "test"

    async def evaluate_async(
        self,
        page: "AsyncPage",
        browser: "AsyncBrowser",
        response: "AsyncResponse",
    ) -> str:
        """Return the fixed marker text without touching the page."""
        return "test"


def test_playwright_url_loader() -> None:
Expand Down Expand Up @@ -39,3 +58,32 @@ async def test_playwright_async_url_loader() -> None:
)
docs = await loader.aload()
assert len(docs) > 0


def test_playwright_url_loader_with_custom_evaluator() -> None:
    """Check that the sync loader delegates page content to a custom evaluator."""
    url_loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    documents = url_loader.load()
    # One URL in, one document out, carrying the evaluator's fixed text.
    assert len(documents) == 1
    first_document = documents[0]
    assert first_document.page_content == "test"


@pytest.mark.asyncio
async def test_playwright_async_url_loader_with_custom_evaluator() -> None:
    """Check that the async loader delegates page content to a custom evaluator."""
    url_loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    documents = await url_loader.aload()
    # One URL in, one document out, carrying the evaluator's fixed text.
    assert len(documents) == 1
    first_document = documents[0]
    assert first_document.page_content == "test"

0 comments on commit 6b5a970

Please sign in to comment.