Search Google in 2023, and save to long term memory #507

Closed
wants to merge 4 commits
3 changes: 3 additions & 0 deletions requirements.txt
@@ -12,3 +12,6 @@ docker
duckduckgo-search
google-api-python-client #(https://developers.google.com/custom-search/v1/overview)
pinecone-client==2.2.1
+selenium==4.8.3
+googlesearch-python
+# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
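The googlesearch-python dependency added above is what the PR leans on for issuing the actual Google queries. The search command itself is not part of this hunk, so the snippet below is only a rough sketch of how that package is typically called; the google_search wrapper name is illustrative, not code from this PR.

from googlesearch import search

def google_search(query, num_results=8):
    # search() yields result URLs scraped from Google's result pages
    return list(search(query, num_results=num_results))

if __name__ == "__main__":
    for url in google_search("current weather in Berlin"):
        print(url)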
5 changes: 5 additions & 0 deletions scripts/ai_config.py
@@ -1,5 +1,6 @@
import yaml
import data
+import time


class AIConfig:
@@ -39,5 +40,9 @@ def construct_full_prompt(self):
        for i, goal in enumerate(self.ai_goals):
            full_prompt += f"{i+1}. {goal}\n"

+        # Add knowledge cutoff date and current date
+        current_date = time.strftime("%B %d %Y")
+        full_prompt += f"Knowledge Cutoff Date: September 2021\nCurrent Date: {current_date}\n\n"
+
        full_prompt += f"\n\n{data.load_prompt()}"
        return full_prompt
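For context, time.strftime("%B %d %Y") renders the current date as, e.g., "April 15 2023", so the system prompt picks up two extra lines of roughly this shape (the exact date depends on when the agent is started; the snippet is illustrative only):

import time

# Illustrative: what the two added prompt lines expand to at runtime
current_date = time.strftime("%B %d %Y")  # e.g. "April 15 2023"
print(f"Knowledge Cutoff Date: September 2021\nCurrent Date: {current_date}\n")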
175 changes: 69 additions & 106 deletions scripts/browse.py
@@ -1,115 +1,78 @@
-import requests
-from bs4 import BeautifulSoup
-from config import Config
-from llm_utils import create_chat_completion
-
-cfg = Config()
-
-def scrape_text(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-
-    return text
-
-
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
-
-
-def format_hyperlinks(hyperlinks):
-    formatted_links = []
-    for link_text, link_url in hyperlinks:
-        formatted_links.append(f"{link_text} ({link_url})")
-    return formatted_links
-
-
-def scrape_links(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "error"
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    hyperlinks = extract_hyperlinks(soup)
-
-    return format_hyperlinks(hyperlinks)
-
-
-def split_text(text, max_length=8192):
-    paragraphs = text.split("\n")
-    current_length = 0
-    current_chunk = []
-
-    for paragraph in paragraphs:
-        if current_length + len(paragraph) + 1 <= max_length:
-            current_chunk.append(paragraph)
-            current_length += len(paragraph) + 1
-        else:
-            yield "\n".join(current_chunk)
-            current_chunk = [paragraph]
-            current_length = len(paragraph) + 1
-
-    if current_chunk:
-        yield "\n".join(current_chunk)
-
-
-def create_message(chunk, question):
-    return {
-        "role": "user",
-        "content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
-    }
-
-def summarize_text(text, question):
-    if not text:
-        return "Error: No text to summarize"
-
-    text_length = len(text)
-    print(f"Text length: {text_length} characters")
-
-    summaries = []
-    chunks = list(split_text(text))
-
-    for i, chunk in enumerate(chunks):
-        print(f"Summarizing chunk {i + 1} / {len(chunks)}")
-        messages = [create_message(chunk, question)]
-
-        summary = create_chat_completion(
-            model=cfg.fast_llm_model,
-            messages=messages,
-            max_tokens=300,
-        )
-        summaries.append(summary)
-
-    print(f"Summarized {len(chunks)} chunks.")
-
-    combined_summary = "\n".join(summaries)
-    messages = [create_message(combined_summary, question)]
-
-    final_summary = create_chat_completion(
-        model=cfg.fast_llm_model,
-        messages=messages,
-        max_tokens=300,
-    )
-
-    return final_summary
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from time import sleep
+
+options = Options()
+options.add_argument('--headless')
+
+lastFetched = None
+
+def fetch_url(url):
+    browser = webdriver.Chrome(options=options)
+    browser.get(url)
+
+    # Wait for page to load
+    # browser.implicitly_wait(10)
+    sleep(15)
+
+    # Use a more targeted XPath expression to select only elements that are likely to have meaningful text content
+    xpath = "//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6 or self::p or self::a or self::li or self::span or self::a or self::button]"
+
+    # Find all elements on the page
+    elements = browser.find_elements(By.XPATH, xpath)
+
+    # Extract the text content from the elements
+    text_content = []
+    for element in elements:
+        # if element is button or link, we must include the URL
+        if element.tag_name == "a" or element.tag_name == "button":
+            text = element.text
+            url = element.get_attribute("href")
+            if text and text != "" and url and url.startswith("http"):
+                text_content.append("(" + text + ")[" + url + "]")
+        else:
+            # Otherwise, just include the text
+            text = element.text
+            if text and (len(text_content) == 0 or text_content[len(text_content) - 1] != text):
+                text_content.append(text)
+
+    # Close browser
+    browser.quit()
+
+    # Build content
+    content = ' '.join(text_content)
+
+    # Store content
+    global lastFetched
+    lastFetched = content
+
+def split_text(text, max_length=2048):
+    # Split text into chunks of max_length
+    chunks = []
+    while len(text) > max_length:
+        # Find the last space before the max length
+        last_space = text.rfind(" ", 0, max_length)
+        if last_space == -1:
+            # If there is no space, just split at the max length
+            last_space = max_length
+        chunks.append(text[0:last_space])
+        text = text[last_space + 1:]
+    chunks.append(text)
+    return chunks
+
+def has_fetched():
+    return lastFetched != None
+
+def view_page(pageNumber):
+    if lastFetched:
+        chunks = split_text(lastFetched)
+        if int(pageNumber) > len(list(chunks)):
+            return "Page number out of range."
+
+        header = "Page " + str(int(pageNumber) + 1) + " of " + str(len(list(chunks))) + ":\n"
+        return header + list(chunks)[int(pageNumber)]
+    else:
+        return "No page fetched yet."