Significant-Gravitas · Agusx1211 · Apr 3, 2023 · nponeccop · Apr 10, 2023
@@ -7,6 +7,7 @@ pyyaml==6.0
 readability-lxml==0.8.1
 requests
 tiktoken==0.3.3
+selenium==4.8.3
 docker
 googlesearch-python
 # Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
@@ -1,134 +1,78 @@
-import requests
-from bs4 import BeautifulSoup
-from config import Config
-from llm_utils import create_chat_completion
 
-cfg = Config()
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from time import sleep
 
-def scrape_text(url):
-    response = requests.get(url)
+options = Options()
+options.add_argument('--headless')
 
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
+lastFetched = None
 
-    soup = BeautifulSoup(response.text, "html.parser")
+def fetch_url(url):
+    browser = webdriver.Chrome(options=options)
+    browser.get(url)
 
-    for script in soup(["script", "style"]):
-        script.extract()
+    # Wait for page to load
+    # browser.implicitly_wait(10)
+    sleep(5)
 
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = '\n'.join(chunk for chunk in chunks if chunk)
+    # Use a more targeted XPath expression to select only elements that are likely to have meaningful text content
+    xpath = "//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6 or self::p or self::a or self::li or self::span or self::a or self::button]"
 
-    return text
+    # Find all elements on the page
+    elements = browser.find_elements(By.XPATH, xpath)
 
-
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
-
-
-def format_hyperlinks(hyperlinks):
-    formatted_links = []
-    for link_text, link_url in hyperlinks:
-        formatted_links.append(f"{link_text} ({link_url})")
-    return formatted_links
-
-
-def scrape_links(url):
-    response = requests.get(url)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "error"
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    hyperlinks = extract_hyperlinks(soup)
-
-    return format_hyperlinks(hyperlinks)
-
-
-def split_text(text, max_length=8192):
-    paragraphs = text.split("\n")
-    current_length = 0
-    current_chunk = []
-
-    for paragraph in paragraphs:
-        if current_length + len(paragraph) + 1 <= max_length:
-            current_chunk.append(paragraph)
-            current_length += len(paragraph) + 1
-        else:
-            yield "\n".join(current_chunk)
-            current_chunk = [paragraph]
-            current_length = len(paragraph) + 1
-
-    if current_chunk:
-        yield "\n".join(current_chunk)
-
-
-def summarize_text(text, is_website=True):
-    if text == "":
-        return "Error: No text to summarize"
-
-    print("Text length: " + str(len(text)) + " characters")
-    summaries = []
-    chunks = list(split_text(text))
-
-    for i, chunk in enumerate(chunks):
-        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
-        if is_website:
-            messages = [
-                {
-                    "role": "user",
-                    "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
-                    chunk},
-            ]
+    # Extract the text content from the elements
+    text_content = []
+    for element in elements:
+        # if element is button or link, we must include the URL
+        if element.tag_name == "a" or element.tag_name == "button":
+            text = element.text
+            url = element.get_attribute("href")
+            if text and text != "" and url and url.startswith("http"):
+                text_content.append("(" + text + ")[" + url + "]")
         else:
-            messages = [
-                {
-                    "role": "user",
-                    "content": "Please summarize the following text, focusing on extracting concise and specific information: " +
-                    chunk},
-            ]
-
-        summary = create_chat_completion(
-            model=cfg.fast_llm_model,
-            messages=messages,
-            max_tokens=300,
-        )
-        summaries.append(summary)
-    print("Summarized " + str(len(chunks)) + " chunks.")
-
-    combined_summary = "\n".join(summaries)
-
-    # Summarize the combined summary
-    if is_website:
-        messages = [
-            {
-                "role": "user",
-                "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
-                combined_summary},
-        ]
-    else:
-        messages = [
-            {
-                "role": "user",
-                "content": "Please summarize the following text, focusing on extracting concise and specific infomation: " +
-                combined_summary},
-        ]
-
-    final_summary = create_chat_completion(
-        model=cfg.fast_llm_model,
-        messages=messages,
-        max_tokens=300,
-    )
-
-    return final_summary
+          # Otherwise, just include the text
+          text = element.text
+          if text and (len(text_content) == 0 or text_content[len(text_content) - 1] != text):
+              text_content.append(text)
+
+    # Close browser
+    browser.quit()
+
+    # Build content
+    content = ' '.join(text_content)
+
+    # Store content
+    global lastFetched
+    lastFetched = content
+
+def split_text(text, max_length=2048):
+    """Split ``text`` into chunks of at most ``max_length`` characters.
+
+    A chunk ends at the last space found before ``max_length``; that space is
+    consumed and does not appear in either neighbouring chunk. When there is
+    no space to break on, the text is cut hard at ``max_length`` and every
+    character is kept. The remainder (possibly an empty string) is always
+    appended, so the returned list is never empty.
+
+    Raises:
+        ValueError: if ``max_length`` is smaller than 1, since no progress
+            could be made through the text.
+    """
+    if max_length < 1:
+        raise ValueError("max_length must be at least 1")
+
+    chunks = []
+    while len(text) > max_length:
+        # Find the last space before the max length
+        last_space = text.rfind(" ", 0, max_length)
+        if last_space == -1:
+            # No space to break on: hard cut, and resume exactly at the cut
+            # so the character at index max_length is not lost.
+            chunks.append(text[:max_length])
+            text = text[max_length:]
+        else:
+            # Break on the space and skip over it.
+            chunks.append(text[:last_space])
+            text = text[last_space + 1:]
+    chunks.append(text)
+    return chunks
+
+def has_fetched():
+    """Return True once ``fetch_url`` has stored page content in ``lastFetched``."""
+    # ``lastFetched`` starts as None; identity is the correct test for that sentinel.
+    return lastFetched is not None
+
+def view_page(pageNumber):
+    """Return one page of the most recently fetched content.
+
+    The content stored in ``lastFetched`` is split with ``split_text`` and the
+    chunk at the zero-based index ``pageNumber`` is returned, prefixed with a
+    one-based "Page X of Y:" header.
+
+    Args:
+        pageNumber: zero-based page index; anything ``int()`` accepts
+            (``int()`` raises ValueError for non-numeric strings).
+
+    Returns:
+        The header plus page text, "Page number out of range." for an index
+        outside the available pages, or "No page fetched yet." when nothing
+        has been stored.
+    """
+    # Compare against None so that a fetched-but-empty page is still treated
+    # as fetched, in line with has_fetched().
+    if lastFetched is None:
+        return "No page fetched yet."
+
+    chunks = split_text(lastFetched)
+    index = int(pageNumber)
+    # Valid indexes are 0 .. len(chunks) - 1. An index equal to len(chunks)
+    # would raise IndexError, and a negative one would wrap to the end.
+    if index < 0 or index >= len(chunks):
+        return "Page number out of range."
+
+    header = "Page " + str(index + 1) + " of " + str(len(chunks)) + ":\n"
+    return header + chunks[index]
@@ -76,6 +76,8 @@ def execute_command(command_name, arguments):
             return delete_file(arguments["file"])
         elif command_name == "browse_website":
             return browse_website(arguments["url"])
+        elif command_name == "view_website_page":
+            return view_website_page(arguments["page_number"])
         # TODO: Change these to take in a file rather than pasted code, if
         # non-file is given, return instructions "Input should be a python
         # filepath, write your code to file and try again"
@@ -110,17 +112,15 @@ def google_search(query, num_results=8):
 
 
 def browse_website(url):
-    summary = get_text_summary(url)
-    links = get_hyperlinks(url)
-
-    # Limit links to 5
-    if len(links) > 5:
-        links = links[:5]
-
-    result = f"""Website Content Summary: {summary}\n\nLinks: {links}"""
-
-    return result
+    browse.fetch_url(url)
+    return browse.view_page(0)
 
+def view_website_page(page):
+    """Return the requested page of the last fetched website plus a next-page hint.
+
+    Args:
+        page: zero-based page index, passed through to ``browse.view_page``;
+            anything ``int()`` accepts.
+
+    Returns:
+        The text from ``browse.view_page`` followed by the command to use for
+        the following page, or an instruction to run ``browse_website`` first
+        when ``browse.has_fetched()`` is false.
+    """
+    if not browse.has_fetched():
+        return "No website has been fetched yet. Use the browse_website command to fetch a website."
+
+    res = browse.view_page(page)
+    # Close the quote around the suggested command so the hint is well-formed.
+    return res + "\n To view the next page, use the command 'view_website_page " + str(int(page) + 1) + "'"
 
 def get_text_summary(url):
     text = browse.scrape_text(url)

@@ -2,6 +2,7 @@ CONSTRAINTS:
 
 1. ~4000 word limit for memory. Your memory is short, so immediately save important information to long term memory and code to files.
 2. No user assistance
+3. No images
 
 COMMANDS:
 
@@ -10,19 +11,20 @@ COMMANDS:
 3. Memory Delete: "memory_del", args: "key": "<key>"
 4. Memory Overwrite: "memory_ovr", args: "key": "<key>", "string": "<string>"
 5. Browse Website: "browse_website", args: "url": "<url>"
-6. Start GPT Agent: "start_agent",  args: "name": <name>, "task": "<short_task_desc>", "prompt": "<prompt>"
-7. Message GPT Agent: "message_agent", args: "key": "<key>", "message": "<message>"
-8. List GPT Agents: "list_agents", args: ""
-9. Delete GPT Agent: "delete_agent", args: "key": "<key>"
-10. Write to file: "write_to_file", args: "file": "<file>", "text": "<text>"
-11. Read file: "read_file", args: "file": "<file>"
-12. Append to file: "append_to_file", args: "file": "<file>", "text": "<text>"
-13. Delete file: "delete_file", args: "file": "<file>"
-14. Evaluate Code: "evaluate_code", args: "code": "<full _code_string>"
-15. Get Improved Code: "improve_code", args: "suggestions": "<list_of_suggestions>", "code": "<full_code_string>"
-16. Write Tests: "write_tests", args: "code": "<full_code_string>", "focus": "<list_of_focus_areas>"
-17. Execute Python File: "execute_python_file", args: "file": "<file>"
-18. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
+6. View Website Page: "view_website_page", args: "page_number": "<page_number>"
+7. Start GPT Agent: "start_agent",  args: "name": <name>, "task": "<short_task_desc>", "prompt": "<prompt>"
+8. Message GPT Agent: "message_agent", args: "key": "<key>", "message": "<message>"
+9. List GPT Agents: "list_agents", args: ""
+10. Delete GPT Agent: "delete_agent", args: "key": "<key>"
+11. Write to file: "write_to_file", args: "file": "<file>", "text": "<text>"
+12. Read file: "read_file", args: "file": "<file>"
+13. Append to file: "append_to_file", args: "file": "<file>", "text": "<text>"
+14. Delete file: "delete_file", args: "file": "<file>"
+15. Evaluate Code: "evaluate_code", args: "code": "<full _code_string>"
+16. Get Improved Code: "improve_code", args: "suggestions": "<list_of_suggestions>", "code": "<full_code_string>"
+17. Write Tests: "write_tests", args: "code": "<full_code_string>", "focus": "<list_of_focus_areas>"
+18. Execute Python File: "execute_python_file", args: "file": "<file>"
+19. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
 
 RESOURCES: