Merge pull request #605 from Webperf-se/issue-603

Make the HTML test look for deprecated elements
Webperf-se · Aug 22, 2024 · 2e899fe · 2e899fe
2 parents 8e35b4c + 101a420
commit 2e899fe
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 8 deletions.
diff --git a/locales/en/LC_MESSAGES/html_validator_w3c.mo b/locales/en/LC_MESSAGES/html_validator_w3c.mo
diff --git a/locales/en/LC_MESSAGES/html_validator_w3c.po b/locales/en/LC_MESSAGES/html_validator_w3c.po
@@ -40,4 +40,7 @@ msgid "TEXT_REVIEW_RATING_GROUPED"
 msgstr "- Number of grouped error type: {0}"
 
 msgid "TEXT_REVIEW_RATING_ITEMS"
-msgstr "- Number of errors: {0}"
+msgstr "- Number of errors: {0}"
+
+msgid "TEXT_REVIEW_DEPRECATED_ELEMENT"
+msgstr "The use of “{0}” element is deprecated."
diff --git a/locales/gov/LC_MESSAGES/html_validator_w3c.mo b/locales/gov/LC_MESSAGES/html_validator_w3c.mo
diff --git a/locales/gov/LC_MESSAGES/html_validator_w3c.po b/locales/gov/LC_MESSAGES/html_validator_w3c.po
@@ -40,4 +40,7 @@ msgid "TEXT_REVIEW_RATING_GROUPED"
 msgstr "- Number of grouped error type: {0}"
 
 msgid "TEXT_REVIEW_RATING_ITEMS"
-msgstr "- Number of errors: {0}"
+msgstr "- Number of errors: {0}"
+
+msgid "TEXT_REVIEW_DEPRECATED_ELEMENT"
+msgstr "The use of “{0}” element is deprecated."
diff --git a/locales/sv/LC_MESSAGES/html_validator_w3c.mo b/locales/sv/LC_MESSAGES/html_validator_w3c.mo
diff --git a/locales/sv/LC_MESSAGES/html_validator_w3c.po b/locales/sv/LC_MESSAGES/html_validator_w3c.po
@@ -40,4 +40,7 @@ msgid "TEXT_REVIEW_RATING_GROUPED"
 msgstr "  - Antal (grupperade fel): {0}"
 
 msgid "TEXT_REVIEW_RATING_ITEMS"
-msgstr "- Antal fel: {0}"
+msgstr "- Antal fel: {0}"
+
+msgid "TEXT_REVIEW_DEPRECATED_ELEMENT"
+msgstr "Användningen av elementet “{0}” är föråldrad."
diff --git a/tests/html_validator_w3c.py b/tests/html_validator_w3c.py
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
 from datetime import datetime
 import re
+
+from bs4 import BeautifulSoup
 from models import Rating
-from tests.utils import get_friendly_url_name,\
+from tests.utils import get_friendly_url_name, get_http_content,\
     get_translation,\
     set_cache_file
 from tests.w3c_base import calculate_rating, get_data_for_url,\
@@ -11,7 +13,6 @@
 from helpers.setting_helper import get_config
 
 # DEFAULTS
-HTML_REVIEW_GROUP_ERRORS = True
 HTML_START_STRINGS = [
         'Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”',
         'Element “head” is missing a required instance of child element “title”.'
@@ -97,7 +98,7 @@ def handle_html_markup_entry(entry, global_translation, local_translation, resul
     req_url = entry['url']
     name = get_friendly_url_name(global_translation, req_url, entry['index'])
     html = entry['content']
-    errors = get_errors_for_html(req_url, html)
+    errors = get_errors_for_html(req_url, html, local_translation)
     result_dict['errors']['all'].extend(errors)
     result_dict['errors']['html_files'].extend(errors)
     is_first_entry = entry['index'] <= 1
@@ -153,7 +154,7 @@ def create_review_and_rating(
 
             tmp = re.sub(
                 r"(“[^”]+”)", "X", error_message, 0, re.MULTILINE)
-            if HTML_REVIEW_GROUP_ERRORS:
+            if not get_config('general.review.details'):
                 error_message = tmp
 
             if msg_grouped_dict.get(error_message, False):
@@ -197,7 +198,61 @@ def is_start_html_error(error_message):
             return True
     return False
 
-def get_errors_for_html(url, html):
+def get_mdn_web_docs_deprecated_elements():
+    """
+    Returns a sorted list of unique strings, one per deprecated html element
+    found in the 'obsolete_and_deprecated_elements' section of the
+    MDN Web Docs HTML element reference.
+
+    Every string is the opening part of a start tag (a '<' followed by the
+    text found between the angle brackets in the table, for example
+    '<center'), which is the form the substring check against raw markup
+    in get_errors_for_html relies on.
+
+    An empty list is returned when no content was fetched or when the
+    expected header, section or table body is not present in the page.
+    """
+    html = get_http_content(
+        ('https://developer.mozilla.org/'
+         'en-US/docs/Web/HTML/Element'
+         '#obsolete_and_deprecated_elements'))
+
+    # Nothing to parse; an empty document would not contain the header anyway.
+    if not html:
+        return []
+
+    soup = BeautifulSoup(html, 'lxml')
+
+    header = soup.find('h2', id='obsolete_and_deprecated_elements')
+    if header is None:
+        return []
+
+    section = header.parent
+    if section is None:
+        return []
+
+    tbody = section.find('tbody')
+    if tbody is None:
+        return []
+
+    # Matches both escaped (&lt;name&gt;) and literal (<name>) notation.
+    regex = r'(\&lt;|<)(?P<name>[^<>]+)(\&gt;|>)'
+    elements = set()
+
+    # find_all always returns a list of tags, so no None checks are needed.
+    for table_row in tbody.find_all('tr'):
+        first_td = table_row.find('td')
+        if first_td is None:
+            continue
+
+        code = first_td.find('code')
+        if code is None:
+            continue
+
+        # `.string` is None when <code> has more than one child node, and
+        # re.search would then raise TypeError; fall back to the full text.
+        text = code.string
+        if text is None:
+            text = code.get_text()
+
+        matches = re.search(regex, text)
+        if matches:
+            elements.add('<' + matches.group('name'))
+
+    return sorted(elements)
+
+
+# TODO: fetch this lazily (just in time); right now it runs at import time, i.e. every time webperf_core is called.
+html_deprecated_elements = get_mdn_web_docs_deprecated_elements()
+
+
+def get_errors_for_html(url, html, local_translation):
     """
     Caches the HTML content of a URL and retrieves the errors associated with it.
 
@@ -212,4 +267,13 @@ def get_errors_for_html(url, html):
     results = get_errors_for_url(
         'html',
         url)
+
+    for element in html_deprecated_elements:
+        if element not in html:
+            continue
+        results.append({
+                'type': 'error',
+                'message': local_translation('TEXT_REVIEW_DEPRECATED_ELEMENT').format(element.replace('<', ''))
+            })
+
     return results