Skip to content

Commit

Permalink
feat: check "data-*" attributes name restrictions
Browse files Browse the repository at this point in the history
According to HTML:
> A custom data attribute is an attribute in no namespace whose name
> starts with the string "data-", has at least one character after the
> hyphen, is XML-compatible, and contains no ASCII upper alphas.

This commit reports invalid `data-*` attributes as errors with a new
error code, `HTM-061`.

This should be ideally delegated to validator.nu, but we add this check
until we fully integrate the HTML checker.

Close #1107
  • Loading branch information
rdeltour committed Dec 23, 2022
1 parent ab13ef0 commit aa75f9b
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ private void initialize()
severities.put(MessageId.HTM_059, Severity.ERROR);
severities.put(MessageId.HTM_060a, Severity.USAGE);
severities.put(MessageId.HTM_060b, Severity.USAGE);
severities.put(MessageId.HTM_061, Severity.ERROR);

// Media
severities.put(MessageId.MED_001, Severity.SUPPRESSED);
Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/adobe/epubcheck/messages/MessageId.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ public enum MessageId implements Comparable<MessageId>
HTM_059("HTM_059"),
HTM_060a("HTM_060a"),
HTM_060b("HTM_060b"),
HTM_061("HTM_061"),

// Messages associated with media (images, audio and video)
MED_001("MED-001"),
Expand Down
23 changes: 22 additions & 1 deletion src/main/java/com/adobe/epubcheck/xml/HTMLUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;

import net.sf.saxon.om.NameChecker;

/**
* Utilities for HTML-specific logic.
*
Expand Down Expand Up @@ -44,7 +46,7 @@ public static boolean isCustomNamespace(String namespace)
* attributes.
*
* @param name
* the name of an attribute defined in the HTML specification
* the name of an attribute defined in the HTML specification
* @return <code>true</code> iff the attribute value is case-insensitive
*/
public static boolean isCaseInsensitiveAttribute(String namespace, String name)
Expand All @@ -57,6 +59,25 @@ public static boolean isDataAttribute(String namespace, String name)
return namespace.isEmpty() && name.startsWith("data-");
}

/**
 * Checks whether an attribute name is a valid <a href=
 * "https://html.spec.whatwg.org/multipage/dom.html#custom-data-attribute">
 * custom data attribute</a> name, as defined in HTML.
 * <p>
 * The part of the name following the {@code data-} prefix must be non-empty,
 * must be a valid NCName, and must not contain any ASCII uppercase letter
 * ({@code A} to {@code Z}).
 * </p>
 * <p>
 * NOTE(review): the NCName check is applied to the part after the prefix
 * only, not to the whole attribute name, so a name such as {@code data-1}
 * is rejected. HTML words the "XML-compatible" requirement in terms of the
 * attribute name; confirm this is the intended reading.
 * </p>
 *
 * @param name
 *        the data attribute name to test; must be non-null and start with
 *        {@code data-}
 * @return {@code true} if {@code name} is a valid custom data attribute name
 * @throws IllegalArgumentException
 *         if {@code name} is {@code null} or does not start with
 *         {@code data-}
 */
public static boolean isValidDataAttribute(String name)
{
  Preconditions.checkArgument(name != null && name.startsWith("data-"));

  // Only the part after the "data-" prefix is subject to the checks below.
  final String suffix = name.substring("data-".length());
  if (suffix.isEmpty() || !NameChecker.isValidNCName(suffix))
  {
    return false;
  }

  // Reject any ASCII upper alpha.
  for (int i = 0; i < suffix.length(); i++)
  {
    final char c = suffix.charAt(i);
    if (c >= 'A' && c <= 'Z')
    {
      return false;
    }
  }
  return true;
}

private HTMLUtils()
{
// Not instanciable.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ private Attributes preprocessAttributes(String elemNamespace, Attributes atts)
// Remove data-* attributes in both XHTML and SVG
if (HTMLUtils.isDataAttribute(namespace, name))
{
if (!HTMLUtils.isValidDataAttribute(name))
{
context.report.message(MessageId.HTM_061, LocationHandler.location(context, locator),
name);
}
attributes.removeAttribute(i);
}
// Remove custom namespace attributes in XHTML
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ HTM_058=HTML documents must be encoded in UTF-8, but UTF-16 was detected.
HTM_059=Viewport "%1$s" property must not be defined more than once, but found values [%2$s].
HTM_060a=EPUB reading systems must ignore secondary viewport meta elements in fixed-layout documents; viewport declaration "%1$s" will be ignored.
HTM_060b=EPUB reading systems must ignore viewport meta elements in reflowable documents; viewport declaration "%1$s" will be ignored.
HTM_061="%1$s" is not a valid custom data attribute (it must have at least one character after the hyphen, be XML-compatible, and contain no ASCII upper alphas).

#media
MED_003=Picture "img" elements must reference core media type resources, but found resource "%1$s" of type "%2$s".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ Feature: EPUB 3 — Content Documents — XHTML
When checking document 'data-attr-valid.xhtml'
Then no errors or warnings are reported

Scenario: Report invalid `data-*` attributes
When checking document 'data-attr-invalid-error.xhtml'
Then error HTM-061 is reported 3 times
And no other errors or warnings are reported

Scenario: Report invalid elements after a `data-*` attribute
See issue 189 - was allowed by stripping of `data-*` attributes
When checking EPUB 'content-xhtml-data-attr-removal-markup-error'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta charset="utf-8" />
<title>data-* attributes</title>
</head>
<body>
<div data-="">invalid (no character after the hyphen)</div>
<div data--test="">invalid (not an XML name)</div>
<div data-ERR="">invalid (contains upper alphas)</div>
</body>
</html>

0 comments on commit aa75f9b

Please sign in to comment.