Hotfix: extract only 'q' element from query string

Occasionally the search results contain links with arguments such as 'dq', which were erroneously matched by the old `'q=' in href` check when attempting to extract the 'q' element from query strings. This change enforces that only links containing '?q=' or '&q=' (i.e. a standalone 'q' arg) will have the element extracted. I also renamed the variable holding the extracted element to just 'q'. Although this seems counterintuitive, it makes a little more sense since this is the element we're extracting. It's a vague url arg name, but it is what it is. Bump version to 0.5.2 for hotfix release.
benbusby · May 29, 2021 · cbe32a0 · cbe32a0
1 parent e1e6e84
commit cbe32a0
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 11 deletions.
diff --git a/app/__init__.py b/app/__init__.py
@@ -22,7 +22,7 @@
 app.no_cookie_ips = []
 app.config['SECRET_KEY'] = os.urandom(32)
 app.config['SESSION_TYPE'] = 'filesystem'
-app.config['VERSION_NUMBER'] = '0.5.1'
+app.config['VERSION_NUMBER'] = '0.5.2'
 app.config['APP_ROOT'] = os.getenv(
     'APP_ROOT',
     os.path.dirname(os.path.abspath(__file__)))

diff --git a/app/filter.py b/app/filter.py
@@ -22,6 +22,21 @@ def strip_blocked_sites(query: str) -> str:
     return query[:query.find('-site:')] if '-site:' in query else query
 
 
+def extract_q(q_str: str, href: str) -> str:
+    """Extracts the 'q' element from a result link. This is typically
+    either the link to a result's website, or a string.
+
+    Args:
+        q_str: The query string portion of the result link to parse
+        href: The full url, checked for a standalone 'q' element
+              ('?q=' or '&q=') before the query string is parsed.
+
+    Returns:
+        str: The 'q' element of the link, or an empty string if the
+             link has no standalone, non-empty 'q' element
+    """
+    if '&q=' not in href and '?q=' not in href:
+        return ''
+
+    # parse_qs drops blank values by default, and '?q=' may also appear
+    # inside another argument's value, so the check above does not
+    # guarantee a 'q' key. Fall back to an empty string rather than
+    # raising a KeyError.
+    return parse_qs(q_str).get('q', [''])[0]
+
+
 class Filter:
     def __init__(self, user_key: str, mobile=False, config=None) -> None:
         if config is None:
@@ -223,20 +238,18 @@ def update_link(self, link: Tag) -> None:
             link['target'] = '_blank'
 
         result_link = urlparse.urlparse(href)
-        query = parse_qs(
-            result_link.query
-        )['q'][0] if 'q=' in href else ''
+        q = extract_q(result_link.query, href)
 
-        if query.startswith('/'):
+        if q.startswith('/'):
             # Internal google links (i.e. mail, maps, etc) should still
             # be forwarded to Google
-            link['href'] = 'https://google.com' + query
+            link['href'] = 'https://google.com' + q
         elif '/search?q=' in href:
             # "li:1" implies the query should be interpreted verbatim,
             # which is accomplished by wrapping the query in double quotes
             if 'li:1' in href:
-                query = '"' + query + '"'
-            new_search = 'search?q=' + self.encrypt_path(query)
+                q = '"' + q + '"'
+            new_search = 'search?q=' + self.encrypt_path(q)
 
             query_params = parse_qs(urlparse.urlparse(href).query)
             for param in VALID_PARAMS:
@@ -247,15 +260,15 @@ def update_link(self, link: Tag) -> None:
             link['href'] = new_search
         elif 'url?q=' in href:
             # Strip unneeded arguments
-            link['href'] = filter_link_args(query)
+            link['href'] = filter_link_args(q)
 
             # Add no-js option
             if self.nojs:
                 append_nojs(link)
         else:
             if href.startswith(MAPS_URL):
                 # Maps links don't work if a site filter is applied
-                link['href'] = MAPS_URL + "?q=" + strip_blocked_sites(query)
+                link['href'] = MAPS_URL + "?q=" + strip_blocked_sites(q)
             else:
                 link['href'] = href
 

diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
     author='Ben Busby',
     author_email='benbusby@protonmail.com',
     name='whoogle-search',
-    version='0.5.1',
+    version='0.5.2',
     include_package_data=True,
     install_requires=requirements,
     description='Self-hosted, ad-free, privacy-respecting metasearch engine',