Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use playwright instead of requests for browse #96

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Submit this change as a separate PR. PRs should be atomic, not mixing unrelated changes together.

Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ scripts/__pycache__/keys.cpython-310.pyc
package-lock.json
*.pyc
scripts/auto_gpt_workspace/*
auto_gpt_workspace/*
*.mpeg
.env
last_run_ai_settings.yaml
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
beautifulsoup4
colorama==0.4.6
dirtyjson==1.0.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? It is unrelated to the Playwright replacement. Separate PR. Even if it were related, delete the line rather than commenting it out.

# dirtyjson==1.0.
openai==0.27.2
playsound==1.3.0
python-dotenv==1.0.0
Expand All @@ -9,5 +9,5 @@ readability-lxml==0.8.1
requests
tiktoken==0.3.3
docker
# googlesearch-python
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pinning down a dependency is unrelated. Put to a separate PR.

googlesearch_python==1.1.0
# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
150 changes: 150 additions & 0 deletions scripts/browse_playwright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from config import Config
from llm_utils import create_chat_completion

cfg = Config()

def scrape_text(url):
    """Fetch *url* in a headless Chromium browser and return its visible text.

    Script and style elements are removed before extraction, and the
    remaining text is normalized to one non-empty phrase per line.  On any
    failure the returned string starts with "Error: " followed by the
    exception message.
    """
    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            soup = BeautifulSoup(page.content(), "html.parser")

            # Drop non-visible content so get_text() returns only page text.
            for hidden in soup(["script", "style"]):
                hidden.extract()

            raw_text = soup.get_text()
            stripped_lines = [ln.strip() for ln in raw_text.splitlines()]
            phrases = [
                phrase.strip()
                for ln in stripped_lines
                for phrase in ln.split(" ")
            ]
            text = "\n".join(ph for ph in phrases if ph)

        except Exception as e:
            text = "Error: " + str(e)

        finally:
            # Always release the browser, even when navigation fails.
            browser.close()

    return text


def extract_hyperlinks(soup):
hyperlinks = []
for link in soup.find_all('a', href=True):
hyperlinks.append((link.text, link['href']))
return hyperlinks


def format_hyperlinks(hyperlinks):
    """Render (text, url) pairs as human-readable "text (url)" strings."""
    return [f"{text} ({url})" for text, url in hyperlinks]


def scrape_links(url):
    """Load *url* in headless Chromium and return its hyperlinks.

    Returns a list of "text (url)" strings on success, or a single
    "Error: ..." string when navigation or parsing fails.
    """
    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        page = browser.new_page()

        try:
            page.goto(url)
            soup = BeautifulSoup(page.content(), "html.parser")

            # Strip script/style so anchor text is not polluted.
            for hidden in soup(["script", "style"]):
                hidden.extract()

            formatted_links = format_hyperlinks(extract_hyperlinks(soup))

        except Exception as e:
            formatted_links = "Error: " + str(e)

        finally:
            # Always release the browser, even when navigation fails.
            browser.close()

    return formatted_links

# The rest of the code remains unchanged.

def split_text(text, max_length=8192):
    """Yield newline-joined chunks of *text* of at most ~max_length characters.

    Paragraphs (newline-separated) are packed greedily; each paragraph
    counts its own length plus one for the joining newline.  A paragraph
    longer than max_length still ends up in a chunk by itself.
    """
    pending = []
    used = 0

    for para in text.split("\n"):
        cost = len(para) + 1  # +1 accounts for the newline separator
        if used + cost > max_length:
            # Current chunk is full: emit it and start a fresh one.
            yield "\n".join(pending)
            pending = []
            used = 0
        pending.append(para)
        used += cost

    if pending:
        yield "\n".join(pending)


def _summary_messages(content, is_website):
    """Build the single-user-message chat payload asking the LLM to summarize *content*."""
    if is_website:
        prompt = (
            "Please summarize the following website text, do not describe the "
            "general website, but instead concisely extract the specific "
            "information this subpage contains.: " + content
        )
    else:
        prompt = (
            "Please summarize the following text, focusing on extracting "
            "concise and specific information: " + content
        )
    return [{"role": "user", "content": prompt}]


def summarize_text(text, is_website=True):
    """Summarize *text* with the fast LLM, chunking to fit the context window.

    The text is split into ~8192-character chunks, each chunk is summarized
    individually, and the per-chunk summaries are then condensed once more
    into a single final summary.

    Args:
        text: The text to summarize.
        is_website: When True, use the website-specific summarization prompt.

    Returns:
        The final summary string, or "Error: No text to summarize" for
        empty input.
    """
    if text == "":
        return "Error: No text to summarize"

    print("Text length: " + str(len(text)) + " characters")
    chunks = list(split_text(text))

    summaries = []
    for i, chunk in enumerate(chunks):
        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
        summary = create_chat_completion(
            model=cfg.fast_llm_model,
            messages=_summary_messages(chunk, is_website),
            max_tokens=300,
        )
        summaries.append(summary)
    print("Summarized " + str(len(chunks)) + " chunks.")

    # Second pass: condense the concatenated per-chunk summaries.
    # (Fixes the "infomation" typo so both passes use the identical prompt.)
    combined_summary = "\n".join(summaries)
    final_summary = create_chat_completion(
        model=cfg.fast_llm_model,
        messages=_summary_messages(combined_summary, is_website),
        max_tokens=300,
    )

    return final_summary
2 changes: 1 addition & 1 deletion scripts/commands.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import browse
import browse_playwright as browse
import json
import memory as mem
import datetime
Expand Down
10 changes: 5 additions & 5 deletions scripts/json_parser.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using JSON instead of Dirty JSON should either go in a separate PR or be documented in this PR's description, explaining why the change is important.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import dirtyjson
import json
from call_ai_function import call_ai_function
from config import Config
cfg = Config()
Expand All @@ -24,7 +24,7 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
"""

try:
return dirtyjson.loads(json_str)
return json.loads(json_str)
except Exception as e:
# Let's do something manually - sometimes GPT responds with something BEFORE the braces:
# "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
Expand All @@ -34,14 +34,14 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
json_str = json_str[brace_index:]
last_brace_index = json_str.rindex("}")
json_str = json_str[:last_brace_index+1]
return dirtyjson.loads(json_str)
return json.loads(json_str)
except Exception as e:
if try_to_fix_with_gpt:
print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
# Now try to fix this up using the ai_functions
ai_fixed_json = fix_json(json_str, json_schema, False)
if ai_fixed_json != "failed":
return dirtyjson.loads(ai_fixed_json)
return json.loads(ai_fixed_json)
else:
print(f"Failed to fix ai output, telling the AI.") # This allows the AI to react to the error message, which usually results in it correcting its ways.
return json_str
Expand All @@ -68,7 +68,7 @@ def fix_json(json_str: str, schema: str, debug=False) -> str:
print(f"Fixed JSON: {result_string}")
print("----------- END OF FIX ATTEMPT ----------------")
try:
return dirtyjson.loads(result_string)
return json.loads(result_string)
except:
# Get the call stack:
# import traceback
Expand Down