Skip to content

Commit

Permalink
feat: better parse URL fragment micro syntaxes
Browse files Browse the repository at this point in the history
This commit introduces a new `URLFragment` class to represent URL fragments.

Fragment strings are parsed into `URLFragment` instances using MIME type-specific logic, implementing some validity checks for a few micro syntaxes
including:

- shorthand bare name IDs
- scheme-based fragments
- media fragments

SVG and HTML/XHTML MIME types are supported.

The parser is tested in the `url-fragment.feature` feature file, in a new `unit-tests` directory.
  • Loading branch information
rdeltour committed Nov 27, 2022
1 parent 2e474e3 commit bec390e
Show file tree
Hide file tree
Showing 9 changed files with 636 additions and 32 deletions.
3 changes: 3 additions & 0 deletions src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ protected boolean checkPackage()

List<OPFItem> items = opfHandler.getItems();
report.info(null, FeatureEnum.ITEMS_COUNT, Integer.toString(items.size()));

// Register package doc and items to the XRefChecker
xrefChecker.registerResource(context.url, context.mimeType);
for (OPFItem item : items)
{
xrefChecker.registerResource(item,
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.util.Iterator;
import java.util.Set;

import org.w3c.epubcheck.url.URLFragment;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.EPUBProfile;
import com.adobe.epubcheck.api.FeatureReport.Feature;
Expand Down Expand Up @@ -387,7 +389,8 @@ private void checkPreviewCollection(ResourceCollection collection)
}
else
{
if (Optional.fromNullable(resource.getURL().fragment()).or("").startsWith("epubcfi("))
URLFragment fragment = URLFragment.parse(resource.getURL());
if (fragment.exists() && "epubcfi".equals(fragment.getScheme()))
{
report.message(MessageId.OPF_076, EPUBLocation.of(context));
}
Expand Down
69 changes: 45 additions & 24 deletions src/main/java/com/adobe/epubcheck/opf/XRefChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;

import org.w3c.epubcheck.constants.MIMEType;
import org.w3c.epubcheck.url.URLFragment;
import org.w3c.epubcheck.url.URLUtils;

import com.adobe.epubcheck.api.EPUBLocation;
Expand Down Expand Up @@ -126,6 +127,7 @@ public static final class Builder
private OPFItem item = null;
private boolean hasItemFallback = false;
private boolean hasImageFallback = false;
public String mimetype;

public Builder url(URL url)
{
Expand All @@ -137,6 +139,13 @@ public Builder item(OPFItem item)
{
this.url = item.getURL();
this.item = item;
this.mimetype = item.getMimeType();
return this;
}

/**
 * Sets the MIME type stored in this builder.
 *
 * @param mimetype
 *          the MIME type string to record
 * @return this builder, so that calls can be chained
 */
public Builder mimetype(String mimetype)
{
this.mimetype = mimetype;
return this;
}

Expand Down Expand Up @@ -231,8 +240,6 @@ public boolean isInSpine()
}
}

private static final Pattern REGEX_SVG_VIEW = Pattern.compile("svgView\\(.*\\)");

private final Map<URL, Resource> resources = new HashMap<URL, Resource>();

private final Set<URL> undeclared = new HashSet<URL>();
Expand Down Expand Up @@ -281,7 +288,7 @@ public Optional<OPFItem> getResource(URL url)
* @param path
* the path to a publication resource
* @return an immutable {@link EnumSet} containing the types of references to
* {@code path}.
* {@code path}.
*/
public Set<Type> getTypes(URL resource)
{
Expand Down Expand Up @@ -413,9 +420,15 @@ public void checkReferences()
private void checkReference(URLReference reference)
{
Resource hostResource = resources.get(reference.location.url);
Resource targetResource = resources.get(reference.targetDoc);

// Retrieve the Resource instance representing the targeted document
// If the resource was not declared in the manifest,
// we build a new Resource object for the data URL.
Resource targetResource = resources.get(reference.targetDoc);
String targetMimetype = (targetResource != null) ? targetResource.getMimeType() : "";

// Parse the URL fragment
URLFragment fragment = URLFragment.parse(reference.url, targetMimetype);

// Check remote resources
if (container.isRemote(reference.url)
Expand Down Expand Up @@ -470,15 +483,18 @@ else if (!undeclared.contains(reference.targetDoc)
return;
}

String mimetype = targetResource.getMimeType();

// Type-specific checks
switch (reference.type)
{
case HYPERLINK:
if ("epubcfi".equals(fragment.getScheme()))
{
break; // EPUB CFI is not supported
}
// if mimeType is null, we should have reported an error already
if (!OPFChecker.isBlessedItemType(mimetype, version)
&& !OPFChecker.isDeprecatedBlessedItemType(mimetype) && !targetResource.hasItemFallback())
if (!OPFChecker.isBlessedItemType(targetMimetype, version)
&& !OPFChecker.isDeprecatedBlessedItemType(targetMimetype)
&& !targetResource.hasItemFallback())
{
report.message(MessageId.RSC_010,
reference.location.context(container.relativize(reference.url)));
Expand All @@ -494,31 +510,35 @@ else if (!undeclared.contains(reference.targetDoc)
case IMAGE:
case PICTURE_SOURCE:
case PICTURE_SOURCE_FOREIGN:
if (reference.url.fragment() != null && !mimetype.equals("image/svg+xml"))
if ("epubcfi".equals(fragment.getScheme()))
{
break; // EPUB CFI is not supported
}
if (fragment.exists() && !MIMEType.SVG.is(targetMimetype))
{
report.message(MessageId.RSC_009,
reference.location.context(container.relativize(reference.url)));
return;
}
// if mimeType is null, we should have reported an error already
if (!OPFChecker.isBlessedImageType(mimetype, version))
if (!OPFChecker.isBlessedImageType(targetMimetype, version))
{
if (version == EPUBVersion.VERSION_3 && reference.type == Type.PICTURE_SOURCE)
{
report.message(MessageId.MED_007, reference.location,
container.relativize(reference.targetDoc), mimetype);
container.relativize(reference.targetDoc), targetMimetype);
return;
}
else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
{
report.message(MessageId.MED_003, reference.location,
container.relativize(reference.targetDoc), mimetype);
container.relativize(reference.targetDoc), targetMimetype);
}
}
break;
case SEARCH_KEY:
// TODO update when we support EPUB CFI
if ((reference.url.fragment() == null || !reference.url.fragment().startsWith("epubcfi("))
if ((!fragment.exists() || !"epubcfi".equals(fragment.getScheme()))
&& !targetResource.isInSpine())
{
report.message(MessageId.RSC_021, reference.location,
Expand All @@ -527,7 +547,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
}
break;
case STYLESHEET:
if (reference.url.fragment() != null)
if (fragment.exists())
{
report.message(MessageId.RSC_013,
reference.location.context(container.relativize(reference.url)));
Expand All @@ -551,7 +571,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
case SVG_CLIP_PATH:
case SVG_PAINT:
case SVG_SYMBOL:
if (reference.url.fragment() == null)
if (!fragment.exists())
{
report.message(MessageId.RSC_015, reference.location.context(reference.url));
return;
Expand All @@ -562,32 +582,32 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
}

// Fragment integrity checks
String fragment = reference.url.fragment();
if (fragment != null && !fragment.isEmpty())
if (fragment.exists() && !fragment.isEmpty())
{
// EPUB CFI
if (fragment.startsWith("epubcfi("))
if ("epubcfi".equals(fragment.getScheme()))
{
// FIXME HOT should warn if in MO
// FIXME epubcfi currently not supported (see issue 150).
return;
}
// Media fragments in Data Navigation Documents
else if (fragment.contains("=") && hostResource != null && hostResource.hasItem()
else if (fragment.isMediaFragment() && hostResource != null && hostResource.hasItem()
&& hostResource.getItem().getProperties()
.contains(PackageVocabs.ITEM_VOCAB.get(PackageVocabs.ITEM_PROPERTIES.DATA_NAV)))
{
// Ignore,
return;
}
// SVG view fragments are ignored
else if (mimetype.equals("image/svg+xml") && REGEX_SVG_VIEW.matcher(fragment).matches())
// Non-ID-based fragments are ignored
else if (fragment.getId().isEmpty())
{
return;
}
// Fragment Identifier (by default)
else if (!container.isRemote(reference.targetDoc))
{
ID anchor = targetResource.ids.get(fragment);
ID anchor = targetResource.ids.get(fragment.getId());
if (anchor == null)
{
report.message(MessageId.RSC_012, reference.location.context(reference.url.toString()));
Expand Down Expand Up @@ -674,7 +694,8 @@ private void checkReadingOrder(Queue<URLReference> references, int lastSpinePosi
}

// check that the fragment is in document order
int targetAnchorPosition = res.getIDPosition(ref.url.fragment());
URLFragment fragment = URLFragment.parse(ref.url, res.getMimeType());
int targetAnchorPosition = res.getIDPosition(fragment.getId());
if (targetAnchorPosition < lastAnchorPosition)
{
String orderContext = LocalizedMessages.getInstance(locale).getSuggestion(MessageId.NAV_011,
Expand Down
5 changes: 0 additions & 5 deletions src/main/java/com/adobe/epubcheck/ops/OPSHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,6 @@ else if (".".equals(href))

// If the URL was not properly parsed, return early
if (url == null) return;
// If the URL is an EPUB CFI, return (not implemented)
if (url.fragment() != null && url.fragment().matches("epubcfi\\(.*\\)"))
{
return; // temp until cfi implemented
}

if ("file".equals(url.scheme()))
{
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/w3c/epubcheck/constants/MIMEType.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,6 @@ public boolean is(String string)

public static MIMEType get(String name)
{
return ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER);
return (name != null) ? ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER) : OTHER;
}
}
Loading

0 comments on commit bec390e

Please sign in to comment.