Use playwright instead of requests for browse #96

Merged
78 changes: 78 additions & 0 deletions autogpt/commands/web_playwright.py
@@ -0,0 +1,78 @@
"""Web scraping commands using Playwright"""
try:
from playwright.sync_api import sync_playwright
except ImportError:
print(
"Playwright not installed. Please install it with 'pip install playwright' to use."
)
from bs4 import BeautifulSoup
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from typing import List, Union


def scrape_text(url: str) -> str:
    """Scrape text from a webpage

    Args:
        url (str): The URL to scrape text from

    Returns:
        str: The scraped text
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            html_content = page.content()
            soup = BeautifulSoup(html_content, "html.parser")

            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)

        except Exception as e:
            text = f"Error: {str(e)}"

        finally:
            browser.close()

    return text


def scrape_links(url: str) -> Union[str, List[str]]:
    """Scrape links from a webpage

    Args:
        url (str): The URL to scrape links from

    Returns:
        Union[str, List[str]]: The scraped links
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            html_content = page.content()
            soup = BeautifulSoup(html_content, "html.parser")

            for script in soup(["script", "style"]):
                script.extract()

            hyperlinks = extract_hyperlinks(soup, url)
            formatted_links = format_hyperlinks(hyperlinks)

        except Exception as e:
            formatted_links = f"Error: {str(e)}"

        finally:
            browser.close()

    return formatted_links
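
For reference, a minimal sketch of how these new Playwright-based commands might be exercised locally. The URL is illustrative, and it assumes the Playwright browser binaries have been installed with "playwright install chromium" (not part of this diff):

from autogpt.commands.web_playwright import scrape_links, scrape_text

# Prerequisites (assumed): pip install playwright beautifulsoup4
#                          playwright install chromium
text = scrape_text("https://example.com")    # page text, or "Error: ..." on failure
links = scrape_links("https://example.com")  # list of "link text (absolute url)" strings
print(text[:200])
print(links[:5])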
32 changes: 1 addition & 31 deletions autogpt/commands/web_requests.py
@@ -9,6 +9,7 @@

from autogpt.config import Config
from autogpt.memory import get_memory
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks

CFG = Config()
memory = get_memory(CFG)
@@ -135,37 +136,6 @@ def scrape_text(url: str) -> str:
    return text


def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Args:
        soup (BeautifulSoup): The BeautifulSoup object
        base_url (str): The base URL

    Returns:
        List[Tuple[str, str]]: The extracted hyperlinks
    """
    return [
        (link.text, urljoin(base_url, link["href"]))
        for link in soup.find_all("a", href=True)
    ]


def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
    """Format hyperlinks into a list of strings

    Args:
        hyperlinks (List[Tuple[str, str]]): The hyperlinks to format

    Returns:
        List[str]: The formatted hyperlinks
    """
    formatted_links = []
    for link_text, link_url in hyperlinks:
        formatted_links.append(f"{link_text} ({link_url})")
    return formatted_links


def scrape_links(url: str) -> Union[str, List[str]]:
"""Scrape links from a webpage

31 changes: 4 additions & 27 deletions autogpt/commands/web_selenium.py
@@ -1,5 +1,6 @@
"""Selenium web scraping module."""
from selenium import webdriver
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
import autogpt.processing.text as summary
from bs4 import BeautifulSoup
from selenium.webdriver.remote.webdriver import WebDriver
@@ -33,7 +34,7 @@ def browse_website(url: str, question: str) -> Tuple[str, WebDriver]:
    driver, text = scrape_text_with_selenium(url)
    add_header(driver)
    summary_text = summary.summarize_text(url, text, question, driver)
    links = scrape_links_with_selenium(driver)
    links = scrape_links_with_selenium(driver, url)

    # Limit links to 5
    if len(links) > 5:
@@ -96,7 +97,7 @@ def scrape_text_with_selenium(url: str) -> Tuple[WebDriver, str]:
    return driver, text


def scrape_links_with_selenium(driver: WebDriver) -> List[str]:
def scrape_links_with_selenium(driver: WebDriver, url: str) -> List[str]:
    """Scrape links from a website using selenium

    Args:
@@ -111,7 +112,7 @@ def scrape_links_with_selenium(driver: WebDriver) -> List[str]:
    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup)
    hyperlinks = extract_hyperlinks(soup, url)

    return format_hyperlinks(hyperlinks)

Expand All @@ -128,30 +129,6 @@ def close_browser(driver: WebDriver) -> None:
driver.quit()


def extract_hyperlinks(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Args:
        soup (BeautifulSoup): The BeautifulSoup object to extract the hyperlinks from

    Returns:
        List[Tuple[str, str]]: The hyperlinks extracted from the BeautifulSoup object
    """
    return [(link.text, link["href"]) for link in soup.find_all("a", href=True)]


def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
    """Format hyperlinks to be displayed to the user

    Args:
        hyperlinks (List[Tuple[str, str]]): The hyperlinks to format

    Returns:
        List[str]: The formatted hyperlinks
    """
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]


def add_header(driver: WebDriver) -> None:
    """Add a header to the website

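
The new url parameter matters because the shared extract_hyperlinks helper resolves relative hrefs against a base URL, which the old in-module Selenium helper (deleted above) did not do. A small illustration, with a made-up HTML snippet:

from bs4 import BeautifulSoup
from autogpt.processing.html import extract_hyperlinks

# A relative href resolves to an absolute link once the page URL is supplied:
soup = BeautifulSoup('<a href="/about">About</a>', "html.parser")
print(extract_hyperlinks(soup, "https://example.com/home"))
# [('About', 'https://example.com/about')]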
32 changes: 32 additions & 0 deletions autogpt/processing/html.py
@@ -0,0 +1,32 @@
"""HTML processing functions"""
from requests.compat import urljoin
from typing import List, Tuple
from bs4 import BeautifulSoup


def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> List[Tuple[str, str]]:
"""Extract hyperlinks from a BeautifulSoup object

Args:
soup (BeautifulSoup): The BeautifulSoup object
base_url (str): The base URL

Returns:
List[Tuple[str, str]]: The extracted hyperlinks
"""
return [
(link.text, urljoin(base_url, link["href"]))
for link in soup.find_all("a", href=True)
]


def format_hyperlinks(hyperlinks: List[Tuple[str, str]]) -> List[str]:
"""Format hyperlinks to be displayed to the user

Args:
hyperlinks (List[Tuple[str, str]]): The hyperlinks to format

Returns:
List[str]: The formatted hyperlinks
"""
return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]
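
As a quick sanity check, the consolidated helpers can be run end to end on a hand-written snippet (illustrative only, not part of the diff):

from bs4 import BeautifulSoup
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks

html = '<a href="/docs">Docs</a> <a href="https://example.org">Home</a>'
soup = BeautifulSoup(html, "html.parser")
hyperlinks = extract_hyperlinks(soup, "https://example.com")
print(format_hyperlinks(hyperlinks))
# ['Docs (https://example.com/docs)', 'Home (https://example.org)']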