-
Notifications
You must be signed in to change notification settings - Fork 44.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use playwright instead of requests for browse #96
Changes from 3 commits
ac7fefe
6ea2a97
29c0b54
f203523
ef4e4eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
beautifulsoup4 | ||
colorama==0.4.6 | ||
dirtyjson==1.0. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why? It is unrelated to the Playwright replacements. Separate PR. Even if it was related, delete, not comment. |
||
# dirtyjson==1.0. | ||
openai==0.27.2 | ||
playsound==1.3.0 | ||
python-dotenv==1.0.0 | ||
|
@@ -9,5 +9,5 @@ readability-lxml==0.8.1 | |
requests | ||
tiktoken==0.3.3 | ||
docker | ||
# googlesearch-python | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pinning down a dependency is unrelated. Put to a separate PR. |
||
googlesearch_python==1.1.0 | ||
# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this? |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
from playwright.sync_api import sync_playwright | ||
from bs4 import BeautifulSoup | ||
from config import Config | ||
from llm_utils import create_chat_completion | ||
|
||
cfg = Config() | ||
|
||
def scrape_text(url):
    """Fetch *url* in a headless Chromium page and return its visible text.

    Script and style elements are removed before extraction, and whitespace
    is normalized so each non-empty fragment ends up on its own line. On any
    failure the returned string is "Error: " followed by the exception text.
    The browser is always closed, success or failure.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        try:
            page.goto(url)
            soup = BeautifulSoup(page.content(), "html.parser")

            # Drop elements that never contribute visible text.
            for tag in soup(["script", "style"]):
                tag.extract()

            raw = soup.get_text()
            fragments = (
                piece.strip()
                for line in raw.splitlines()
                for piece in line.strip().split(" ")
            )
            text = "\n".join(piece for piece in fragments if piece)
        except Exception as e:
            text = "Error: " + str(e)
        finally:
            browser.close()

    return text
|
||
|
||
def extract_hyperlinks(soup):
    """Return (link_text, href) pairs for every anchor in *soup* that has an href."""
    return [(anchor.text, anchor["href"]) for anchor in soup.find_all("a", href=True)]
|
||
|
||
def format_hyperlinks(hyperlinks):
    """Render (text, url) pairs as human-readable "text (url)" strings."""
    return [f"{text} ({url})" for text, url in hyperlinks]
|
||
|
||
def scrape_links(url):
    """Render *url* with headless Chromium and return its hyperlinks.

    Returns a list of "text (href)" strings on success, or a single
    "Error: ..." string on failure. The browser is always closed.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        try:
            page.goto(url)
            soup = BeautifulSoup(page.content(), "html.parser")

            # Strip non-visible elements so anchor text stays clean.
            for tag in soup(["script", "style"]):
                tag.extract()

            formatted_links = format_hyperlinks(extract_hyperlinks(soup))
        except Exception as e:
            formatted_links = "Error: " + str(e)
        finally:
            browser.close()

    return formatted_links
|
||
# The rest of the code remains unchanged. | ||
|
||
def split_text(text, max_length=8192):
    """Yield newline-delimited chunks of *text*, each at most *max_length* chars.

    Paragraphs (newline-separated) are greedily packed into chunks; a single
    paragraph longer than *max_length* is yielded on its own, unsplit.

    Fix: the original yielded a spurious empty chunk ("\\n".join([])) when the
    very first paragraph already exceeded max_length; empty accumulator flushes
    are now skipped.
    """
    current_chunk = []
    current_length = 0

    for paragraph in text.split("\n"):
        # +1 accounts for the newline re-inserted by "\n".join().
        if current_length + len(paragraph) + 1 <= max_length:
            current_chunk.append(paragraph)
            current_length += len(paragraph) + 1
        else:
            if current_chunk:  # don't emit an empty leading chunk
                yield "\n".join(current_chunk)
            current_chunk = [paragraph]
            current_length = len(paragraph) + 1

    if current_chunk:
        yield "\n".join(current_chunk)
|
||
|
||
def summarize_text(text, is_website=True): | ||
if text == "": | ||
return "Error: No text to summarize" | ||
|
||
print("Text length: " + str(len(text)) + " characters") | ||
summaries = [] | ||
chunks = list(split_text(text)) | ||
|
||
for i, chunk in enumerate(chunks): | ||
print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks))) | ||
if is_website: | ||
messages = [ | ||
{ | ||
"role": "user", | ||
"content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " + | ||
chunk}, | ||
] | ||
else: | ||
messages = [ | ||
{ | ||
"role": "user", | ||
"content": "Please summarize the following text, focusing on extracting concise and specific information: " + | ||
chunk}, | ||
] | ||
|
||
summary = create_chat_completion( | ||
model=cfg.fast_llm_model, | ||
messages=messages, | ||
max_tokens=300, | ||
) | ||
summaries.append(summary) | ||
print("Summarized " + str(len(chunks)) + " chunks.") | ||
|
||
combined_summary = "\n".join(summaries) | ||
|
||
# Summarize the combined summary | ||
if is_website: | ||
messages = [ | ||
{ | ||
"role": "user", | ||
"content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " + | ||
combined_summary}, | ||
] | ||
else: | ||
messages = [ | ||
{ | ||
"role": "user", | ||
"content": "Please summarize the following text, focusing on extracting concise and specific infomation: " + | ||
combined_summary}, | ||
] | ||
|
||
final_summary = create_chat_completion( | ||
model=cfg.fast_llm_model, | ||
messages=messages, | ||
max_tokens=300, | ||
) | ||
|
||
return final_summary |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
import browse | ||
import browse_playwright as browse | ||
import json | ||
import memory as mem | ||
import datetime | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using JSON instead of Dirty JSON should be either in a separate PR or documented in the description of this PR as to why it is important |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Submit this change as a separate PR. PRs should be atomic, not mixing unrelated changes together.