browse: (1) apply url validation also to scrape_links(), (2) add unit-tests for scrape_links() #780

Merged
scripts/browse.py (45 changes: 30 additions & 15 deletions)
@@ -5,25 +5,38 @@
 
 cfg = Config()
 
-# Define and check for local file address prefixes
 def check_local_file_access(url):
+    # Define and check for local file address prefixes
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
+def get_validated_response(url, headers=cfg.user_agent_header):
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError('Access to local files is restricted')
+
+        # Most basic check if the URL is valid:
+        if not url.startswith('http://') and not url.startswith('https://'):
+            raise ValueError('Invalid URL format')
+
+        # Make the HTTP request and return the response
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an exception if the response contains an HTTP error status code
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URL format
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
+
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Most basic check if the URL is valid:
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header)
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
@@ -60,11 +73,13 @@ def format_hyperlinks(hyperlinks):
 
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
-        return "error"
+        return "Error: HTTP " + str(response.status_code) + " error"
 
     soup = BeautifulSoup(response.text, "html.parser")
 
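For context, scrape_links() finishes by passing the parsed soup through extract_hyperlinks() and format_hyperlinks(), neither of which is touched by this diff. A minimal sketch of those helpers, inferred from the test assertions below (the actual implementations in scripts/browse.py may differ):

def extract_hyperlinks(soup):
    # Collect (link text, href) pairs for every anchor tag that has an href
    return [(link.text, link.get('href')) for link in soup.find_all('a', href=True)]

def format_hyperlinks(hyperlinks):
    # Render each pair as "text (url)", e.g. "Google (https://www.google.com)"
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]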
tests/test_browse_scrape_links.py (115 changes: 115 additions & 0 deletions)
@@ -0,0 +1,115 @@

# Generated by CodiumAI
from scripts.browse import scrape_links


# Dependencies:
# pip install pytest-mock
import pytest

"""
Code Analysis

Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way.

Inputs:
- url: a string representing the URL to be scraped.

Flow:
1. Validate the URL and send a GET request through get_validated_response(), using the user agent header from the config file.
2. If validation or the request fails, or the response contains an HTTP error status, return an error message.
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.

Outputs:
- A list of formatted hyperlinks.

Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns an error message if any are found.
"""



class TestScrapeLinks:

    # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks.
    def test_valid_url_with_hyperlinks(self):
        # Note: this test performs a live HTTP request
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
    def test_valid_url(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a valid URL
        result = scrape_links("https://www.example.com")

        # Assert that the function returns correctly formatted hyperlinks
        assert result == ["Google (https://www.google.com)"]

    # Tests that the function returns an error message when the request yields an HTTP error status.
    def test_invalid_url(self, mocker):
        # Mock the requests.get() function to return an HTTP error response
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL that responds with a 404
        result = scrape_links("https://www.invalidurl.com")

        # Assert that the function returns an error message
        assert "Error:" in result

    # Tests that the function returns an empty list when the html contains no hyperlinks.
    def test_no_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function with a URL whose page contains no hyperlinks
        result = scrape_links("https://www.example.com")

        # Assert that the function returns an empty list
        assert result == []

    # Tests that scrape_links() correctly extracts and formats hyperlinks from a sample HTML containing a few hyperlinks.
    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
            <body>
                <div id="google-link"><a href="https://www.google.com">Google</a></div>
                <div id="github"><a href="https://github.com">GitHub</a></div>
                <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
            </body>
        </html>
        """
        mocker.patch('requests.get', return_value=mock_response)

        # Call the function being tested
        result = scrape_links("https://www.example.com")

        # Assert that the function returns a list of formatted hyperlinks
        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"
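
One path the new validation opens is not exercised above: the local-file restriction. A hypothetical follow-up test (not part of this PR) could cover it without any mocking, since get_validated_response() rejects such URLs before any HTTP request is made:

    # Hypothetical follow-up test (not in this PR): the local-file restriction
    # applied by get_validated_response() should surface through scrape_links().
    def test_local_file_url_is_restricted(self):
        result = scrape_links("file:///etc/passwd")
        assert result == "Error: Access to local files is restricted"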