browse: (1) apply url validation also to scrape_links(), (2) add unit-tests for scrape_links() #780

Merged
scripts/browse.py (45 changes: 30 additions & 15 deletions)
@@ -5,25 +5,38 @@
 
 cfg = Config()
 
-# Define and check for local file address prefixes
 def check_local_file_access(url):
+    # Define and check for local file address prefixes
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
+def get_validated_response(url, headers=cfg.user_agent_header):
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError('Access to local files is restricted')
+
+        # Most basic check if the URL is valid:
+        if not url.startswith('http://') and not url.startswith('https://'):
+            raise ValueError('Invalid URL format')
+
+        # Make the HTTP request and return the response
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an exception if the response contains an HTTP error status code
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URL format
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
+
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Most basic check if the URL is valid:
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header)
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
@@ -60,11 +73,13 @@ def format_hyperlinks(hyperlinks):
 
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
-        return "error"
+        return "Error: HTTP " + str(response.status_code) + " error"
 
     soup = BeautifulSoup(response.text, "html.parser")
 
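For context, scrape_links() finishes by passing the parsed soup through extract_hyperlinks() and format_hyperlinks(), neither of which is touched by this diff. A minimal sketch of those helpers, inferred from the test assertions below (the actual implementations in scripts/browse.py may differ):

def extract_hyperlinks(soup):
    # Collect (link text, href) pairs for every anchor tag that has an href
    return [(link.text, link.get('href')) for link in soup.find_all('a', href=True)]

def format_hyperlinks(hyperlinks):
    # Render each pair as "text (url)", e.g. "Google (https://www.google.com)"
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]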
tests/test_browse_scrape_links.py (115 changes: 115 additions & 0 deletions)
@@ -0,0 +1,115 @@

# Generated by CodiumAI
from scripts.browse import scrape_links


# Dependencies:
# pip install pytest-mock
import pytest

"""
Code Analysis

Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way.

Inputs:
- url: a string representing the URL to be scraped.

Flow:
1. Validate the URL and send a GET request through get_validated_response(), using the user agent header from the config file.
2. If validation or the request fails, or the response contains an HTTP error status, return an error message.
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.

Outputs:
- A list of formatted hyperlinks.

Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns an error message if any are found.
"""



class TestScrapeLinks:

    # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks.
    def test_valid_url_with_hyperlinks(self):
        # Note: this test performs a live HTTP request
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
    def test_valid_url(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a valid URL
        result = scrape_links("https://www.example.com")

        # Assert that the function returns correctly formatted hyperlinks
        assert result == ["Google (https://www.google.com)"]

    # Tests that the function returns an error message when the request yields an HTTP error status.
    def test_invalid_url(self, mocker):
        # Mock the requests.get() function to return an HTTP error response
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL that responds with a 404
        result = scrape_links("https://www.invalidurl.com")

        # Assert that the function returns an error message
        assert "Error:" in result

    # Tests that the function returns an empty list when the html contains no hyperlinks.
    def test_no_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL whose page contains no hyperlinks
        result = scrape_links("https://www.example.com")

        # Assert that the function returns an empty list
        assert result == []

    # Tests that scrape_links() correctly extracts and formats hyperlinks from a sample HTML containing a few hyperlinks.
    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
            <body>
                <div id="google-link"><a href="https://www.google.com">Google</a></div>
                <div id="github"><a href="https://github.com">GitHub</a></div>
                <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
            </body>
        </html>
        """
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function being tested
        result = scrape_links("https://www.example.com")

        # Assert that the function returns a list of formatted hyperlinks
        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"
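
One path the new validation opens is not exercised above: the local-file restriction. A hypothetical follow-up test (not part of this PR) could cover it without any mocking, since get_validated_response() rejects such URLs before any HTTP request is made:

    # Hypothetical follow-up test (not in this PR): the local-file restriction
    # applied by get_validated_response() should surface through scrape_links().
    def test_local_file_url_is_restricted(self):
        result = scrape_links("file:///etc/passwd")
        assert result == "Error: Access to local files is restricted"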