Search Google in 2023, and save to long term memory #507

Closed
wants to merge 4 commits
3 changes: 3 additions & 0 deletions requirements.txt
@@ -12,3 +12,6 @@ docker
duckduckgo-search
google-api-python-client #(https://developers.google.com/custom-search/v1/overview)
pinecone-client==2.2.1
+selenium==4.8.3
+googlesearch-python
+# Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
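The googlesearch-python dependency added above is what the PR leans on for issuing the actual Google queries. The search command itself is not part of this hunk, so the snippet below is only a rough sketch of how that package is typically called; the google_search wrapper name is illustrative, not code from this PR.

from googlesearch import search

def google_search(query, num_results=8):
    # search() yields result URLs scraped from Google's result pages
    return list(search(query, num_results=num_results))

if __name__ == "__main__":
    for url in google_search("current weather in Berlin"):
        print(url)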
5 changes: 5 additions & 0 deletions scripts/ai_config.py
@@ -1,5 +1,6 @@
import yaml
import data
+import time


class AIConfig:
@@ -39,5 +40,9 @@ def construct_full_prompt(self):
        for i, goal in enumerate(self.ai_goals):
            full_prompt += f"{i+1}. {goal}\n"

+        # Add knowledge cutoff date and current date
+        current_date = time.strftime("%B %d %Y")
+        full_prompt += f"Knowledge Cutoff Date: September 2021\nCurrent Date: {current_date}\n\n"
+
        full_prompt += f"\n\n{data.load_prompt()}"
        return full_prompt
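For context, time.strftime("%B %d %Y") renders the current date as, e.g., "April 15 2023", so the system prompt picks up two extra lines of roughly this shape (the exact date depends on when the agent is started; the snippet is illustrative only):

import time

# Illustrative: what the two added prompt lines expand to at runtime
current_date = time.strftime("%B %d %Y")  # e.g. "April 15 2023"
print(f"Knowledge Cutoff Date: September 2021\nCurrent Date: {current_date}\n")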
175 changes: 69 additions & 106 deletions scripts/browse.py
@@ -1,115 +1,78 @@
-import requests
-from bs4 import BeautifulSoup
-from config import Config
-from llm_utils import create_chat_completion
-
-cfg = Config()
-
-def scrape_text(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-
-    return text
-
-
-def extract_hyperlinks(soup):
-    hyperlinks = []
-    for link in soup.find_all('a', href=True):
-        hyperlinks.append((link.text, link['href']))
-    return hyperlinks
-
-
-def format_hyperlinks(hyperlinks):
-    formatted_links = []
-    for link_text, link_url in hyperlinks:
-        formatted_links.append(f"{link_text} ({link_url})")
-    return formatted_links
-
-
-def scrape_links(url):
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "error"
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    hyperlinks = extract_hyperlinks(soup)
-
-    return format_hyperlinks(hyperlinks)
-
-
-def split_text(text, max_length=8192):
-    paragraphs = text.split("\n")
-    current_length = 0
-    current_chunk = []
-
-    for paragraph in paragraphs:
-        if current_length + len(paragraph) + 1 <= max_length:
-            current_chunk.append(paragraph)
-            current_length += len(paragraph) + 1
-        else:
-            yield "\n".join(current_chunk)
-            current_chunk = [paragraph]
-            current_length = len(paragraph) + 1
-
-    if current_chunk:
-        yield "\n".join(current_chunk)
-
-
-def create_message(chunk, question):
-    return {
-        "role": "user",
-        "content": f"\"\"\"{chunk}\"\"\" Using the above text, please answer the following question: \"{question}\" -- if the question cannot be answered using the text, please summarize the text."
-    }
-
-def summarize_text(text, question):
-    if not text:
-        return "Error: No text to summarize"
-
-    text_length = len(text)
-    print(f"Text length: {text_length} characters")
-
-    summaries = []
-    chunks = list(split_text(text))
-
-    for i, chunk in enumerate(chunks):
-        print(f"Summarizing chunk {i + 1} / {len(chunks)}")
-        messages = [create_message(chunk, question)]
-
-        summary = create_chat_completion(
-            model=cfg.fast_llm_model,
-            messages=messages,
-            max_tokens=300,
-        )
-        summaries.append(summary)
-
-    print(f"Summarized {len(chunks)} chunks.")
-
-    combined_summary = "\n".join(summaries)
-    messages = [create_message(combined_summary, question)]
-
-    final_summary = create_chat_completion(
-        model=cfg.fast_llm_model,
-        messages=messages,
-        max_tokens=300,
-    )
-
-    return final_summary
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from time import sleep
+
+options = Options()
+options.add_argument('--headless')
+
+lastFetched = None
+
+def fetch_url(url):
+    browser = webdriver.Chrome(options=options)
+    browser.get(url)
+
+    # Wait for page to load
+    # browser.implicitly_wait(10)
+    sleep(15)
+
+    # Use a more targeted XPath expression to select only elements that are likely to have meaningful text content
+    xpath = "//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6 or self::p or self::a or self::li or self::span or self::a or self::button]"
+
+    # Find all elements on the page
+    elements = browser.find_elements(By.XPATH, xpath)
+
+    # Extract the text content from the elements
+    text_content = []
+    for element in elements:
+        # if element is button or link, we must include the URL
+        if element.tag_name == "a" or element.tag_name == "button":
+            text = element.text
+            url = element.get_attribute("href")
+            if text and text != "" and url and url.startswith("http"):
+                text_content.append("(" + text + ")[" + url + "]")
+        else:
+            # Otherwise, just include the text
+            text = element.text
+            if text and (len(text_content) == 0 or text_content[len(text_content) - 1] != text):
+                text_content.append(text)
+
+    # Close browser
+    browser.quit()
+
+    # Build content
+    content = ' '.join(text_content)
+
+    # Store content
+    global lastFetched
+    lastFetched = content
+
+def split_text(text, max_length=2048):
+    # Split text into chunks of max_length
+    chunks = []
+    while len(text) > max_length:
+        # Find the last space before the max length
+        last_space = text.rfind(" ", 0, max_length)
+        if last_space == -1:
+            # If there is no space, just split at the max length
+            last_space = max_length
+        chunks.append(text[0:last_space])
+        text = text[last_space + 1:]
+    chunks.append(text)
+    return chunks
+
+def has_fetched():
+    return lastFetched != None
+
+def view_page(pageNumber):
+    if lastFetched:
+        chunks = split_text(lastFetched)
+        if int(pageNumber) > len(list(chunks)):
+            return "Page number out of range."
+
+        header = "Page " + str(int(pageNumber) + 1) + " of " + str(len(list(chunks))) + ":\n"
+        return header + list(chunks)[int(pageNumber)]
+    else:
+        return "No page fetched yet."