Skip to content

Commit

Permalink
[BUGFIX] Check existing Content-Type is in <head>
Browse files Browse the repository at this point in the history
If a valid `Content-Type` `<meta>` element is present, DOM conversion will
create a `<head>` element for it even without an explicit `<head>` tag in the
HTML.

However, to be valid, the `<meta>` element must not be in the `<body>` element.
The `<body>` element begins not only at an explicit `<body>` start tag, but
also whenever a start tag for an element which cannot be in the `<head>` is
encountered.  This is now checked.

Fixes #923.
  • Loading branch information
JakeQZ committed Mar 23, 2021
1 parent 1efffbb commit 57aa616
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ This project adheres to [Semantic Versioning](https://semver.org/).
### Removed

### Fixed
- Ignore `http-equiv` `Content-Type` in `<body>`
([#961](https://github.com/MyIntervals/emogrifier/pull/961))
- Allow "Content-Type" in content
([#959](https://github.com/MyIntervals/emogrifier/pull/959))

Expand Down
3 changes: 2 additions & 1 deletion psalm.baseline.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,13 @@
</PossiblyNullReference>
</file>
<file src="src/HtmlProcessor/AbstractHtmlProcessor.php">
<MixedOperand occurrences="6">
<MixedOperand occurrences="7">
<code>static::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER</code>
<code>static::DEFAULT_DOCUMENT_TYPE</code>
<code>static::CONTENT_TYPE_META_TAG</code>
<code>static::CONTENT_TYPE_META_TAG</code>
<code>static::CONTENT_TYPE_META_TAG</code>
<code>static::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER</code>
<code>static::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER</code>
</MixedOperand>
<PossiblyNullPropertyAssignmentValue occurrences="1">
Expand Down
118 changes: 114 additions & 4 deletions src/HtmlProcessor/AbstractHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ abstract class AbstractHtmlProcessor
*/
protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

/**
* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
* for any other element would implicitly start the `<body>` element due to tag omission rules.
*
* This is only a fragment: a non-capturing alternation of tag names, without pattern delimiters, modifiers or any
* boundary after the name. The pattern it is embedded in must therefore supply the character that terminates the tag
* name (e.g. `[\s/>]`) and, if tag names are to be matched case-insensitively, the `i` modifier.
*
* @var string
*/
protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
= '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

/**
* @var \DOMDocument|null
*/
Expand Down Expand Up @@ -271,10 +280,7 @@ private function normalizeDocumentType(string $html): string
*/
private function addContentTypeMetaTag(string $html): string
{
$contentTypeMetaTagMatchCount
= \preg_match('%<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>]%i', $html);
$hasContentTypeMetaTag = \is_int($contentTypeMetaTagMatchCount) && $contentTypeMetaTagMatchCount > 0;
if ($hasContentTypeMetaTag) {
if ($this->hasContentTypeMetaTagInHead($html)) {
return $html;
}

Expand Down Expand Up @@ -302,6 +308,110 @@ private function addContentTypeMetaTag(string $html): string
return $reworkedHtml;
}

/**
* Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
* omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
* encountering a start tag for any element which is permitted only within the `<body>`.
*
* @param string $html
*
* @return bool
*/
private function hasContentTypeMetaTagInHead(string $html): bool
{
    // A commented-out `<meta>` tag is not part of the document, so it must not count as an existing declaration.
    // This includes conditional comments, which the DOM parser treats as ordinary comments.
    try {
        $htmlToSearch = $this->removeHtmlComments($html);
    } catch (\RuntimeException $exception) {
        // Best effort: if the comments cannot be stripped, search the HTML as provided.
        \trigger_error($exception->getMessage());
        $htmlToSearch = $html;
    }

    // Find the first `<meta>` start tag with an `http-equiv` attribute whose (optionally quoted) value is
    // `Content-Type`, recording the offset at which it starts. Only the first match needs checking: if that one is
    // after the end of the `<head>`, so is any later one.
    $matches = [];
    $matchCount = \preg_match(
        '%<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>]%i',
        $htmlToSearch,
        $matches,
        \PREG_OFFSET_CAPTURE
    );
    if (!\is_int($matchCount) || $matchCount === 0) {
        // No such tag at all (or a PCRE failure, which is treated the same way).
        return false;
    }

    /** @psalm-var array<int, array{0:string, 1:int}> $matches */
    $htmlBeforeMetaTag = \substr($htmlToSearch, 0, $matches[0][1]);

    // The tag is only valid if the `<head>` element has not already ended, explicitly or implicitly, before it.
    return !$this->hasEndOfHeadElement($htmlBeforeMetaTag);
}

/**
* Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
* expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
* which is permitted only within the `<body>`.
*
* @param string $html
*
* @return bool
*/
private function hasEndOfHeadElement(string $html): bool
{
    // Look for either a start tag of an element that is not allowed before the `<body>`, or an explicit `</head>` end
    // tag. HTML syntax permits whitespace between an end tag's name and its `>`, so `</head >` is matched as well.
    $headEndMatchCount = \preg_match(
        '%<(?!' . static::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head\\s*+>%i',
        $html
    );
    if (!\is_int($headEndMatchCount) || $headEndMatchCount === 0) {
        return false;
    }

    // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well as in
    // comments. As an optimization, this is only checked for if a potential end of the `<head>` has been found.
    try {
        $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
    } catch (\RuntimeException $exception) {
        // If something unexpected occurs, assume the tag(s) that would end the `<head>` element were within a
        // `<template>` or comment and thus do not end it.
        \trigger_error($exception->getMessage());

        return false;
    }

    if ($htmlWithoutCommentsOrTemplates === $html) {
        // Nothing was stripped, so the match found above is a genuine end of the `<head>`.
        return true;
    }

    // Something was stripped, so the match may have been inside it: test again on what remains.
    return $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
}

/**
* Removes comments from the given HTML, including any unclosed, for which the remainder of the string is removed.
*
* @param string $html
*
* @return string
*
* @throws \RuntimeException
*/
private function removeHtmlComments(string $html): string
{
    // The pattern consumes `<!--`, then everything up to the first `-->`; if there is no `-->`, it runs to the end of
    // the string, so an unclosed comment swallows the remainder.
    $htmlWithoutComments = \preg_replace('/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/', '', $html);
    if (\is_string($htmlWithoutComments)) {
        return $htmlWithoutComments;
    }

    // `preg_replace` only returns a non-string on failure.
    throw new \RuntimeException('Internal PCRE error', 1616521475);
}

/**
* Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
* the string is removed.
*
* @param string $html
*
* @return string
*
* @throws \RuntimeException
*/
private function removeHtmlTemplateElements(string $html): string
{
    // The pattern consumes a `<template>` start tag, then everything up to the first `</template>` end tag; if there
    // is none, it runs to the end of the string. HTML syntax permits whitespace between an end tag's name and its
    // `>`, so `</template >` is recognized as the end tag too, rather than the remainder of the string being
    // discarded.
    $result = \preg_replace(
        '%<template[\\s>][^<]*+(?:<(?!/template\\s*+>)[^<]*+)*+(?:</template\\s*+>|$)%i',
        '',
        $html
    );
    if (!\is_string($result)) {
        // `preg_replace` only returns a non-string on failure.
        throw new \RuntimeException('Internal PCRE error', 1616519652);
    }

    return $result;
}

/**
* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
* self-closing slash.
Expand Down
46 changes: 42 additions & 4 deletions tests/Unit/HtmlProcessor/AbstractHtmlProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -442,12 +442,16 @@ public function provideContentWithoutHeadTag(): array
'META element with Content-Type as a value' => ['<meta name="description" content="Content-Type">'],
'BODY element with Content-Type in text' => ['<body>Content-Type</body>'],
'body content only with Content-Type in text' => ['<p>Content-Type</p>'],
// broken: BODY element containing Content-Type META tag
// broken: body content only with Content-Type META tag
'http-equiv META element within BODY (not allowed)'
=> ['<body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'http-equiv META element after P (implicit BODY, not allowed)'
=> ['<p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'http-equiv META element within P (not allowed)'
=> ['<p><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></p>'],
'viewport META element within P (allowed)'
=> ['<p><meta name="viewport" content="width=device-width, initial-scale=1.0"></p>'],
'http-equiv META element within HEADER (not allowed)'
=> ['<header><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></header>'],
];
}

Expand Down Expand Up @@ -480,13 +484,22 @@ public function provideContentWithHeadTag(): array
'HEAD element with attribute' => ['<head lang="en"></head>'],
'HEAD element and HEADER element' => ['<head></head><header></header>'],
'HEAD element with Content-Type in comment' => ['<head><!-- Content-Type --></head>'],
'HEAD element with Content-Type as META value' => ['<meta name="description" content="Content-Type">'],
'HEAD element with Content-Type as META value'
=> ['<head><meta name="description" content="Content-Type"></head>'],
'with BODY element with Content-Type in text' => ['<head></head><body>Content-Type</body>'],
'with implicit body content with Content-Type in text' => ['<head></head><p>Content-Type</p>'],
'with BODY element containing Content-Type META tag'
=> ['<head></head><body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'with implicit body content with Content-Type META tag'
=> ['<head></head><p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with end tag omitted and BODY element containing Content-Type META tag'
=> ['<head><body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'with end tag omitted and implicit body content with Content-Type META tag'
=> ['<head><p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with Content-Type META tag after end tag'
=> ['<head></head><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with Content-Type META tag after uppercase end tag'
=> ['<HEAD></HEAD><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
];
}

Expand Down Expand Up @@ -718,8 +731,16 @@ public function addsMissingContentTypeMetaTagOnlyOnce(string $html): void

$result = $subject->render();

$headEndPosition = \stripos($result, '</head>');
$resultBeforeHeadEnd = $headEndPosition !== false ? \substr($result, 0, $headEndPosition) : $result;
// PHP DOM does not understand `<header>` element so does not know it would implicitly start `<body>`.
$headerStartPosition = \stripos($resultBeforeHeadEnd, '<header');
if ($headerStartPosition !== false) {
$resultBeforeHeadEnd = \substr($resultBeforeHeadEnd, 0, $headerStartPosition);
}

$numberOfContentTypeMetaTags = \substr_count(
$result,
$resultBeforeHeadEnd,
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
);
self::assertSame(1, $numberOfContentTypeMetaTags);
Expand Down Expand Up @@ -763,6 +784,23 @@ public function provideHtmlAroundContentType(): array
'HEAD element with attribute' => ['<head lang="en">', '</head>'],
'HTML, HEAD, and BODY with HEADER elements'
=> ['<html><head>', '</head><body><header></header></body></html>'],
'HEAD element with comment' => ['<head><!--Test-->', '</head>'],
'HEAD element with commented-out BODY start tag' => ['<head><!--<body>-->', '</head>'],
'HEAD element with BASE element' => ['<head><base href="https://example.com"/>', '</head>'],
'HEAD element with COMMAND element' => ['<head><command type="command"/>', '</head>'],
'HEAD element with LINK element'
=> ['<head><link rel="stylesheet" href="https://example.org/css.css"/>', '</head>'],
'HEAD element with another META element' => ['<head><meta name="title" content="Test"/>', '</head>'],
'HEAD element with NOSCRIPT element'
=> ['<head><noscript><style>p{color:green}</style></noscript>', '</head>'],
'HEAD element with SCRIPT element' => ['<head><script>console.log("Test");</script>', '</head>'],
'HEAD element with STYLE element' => ['<head><style>p{color:green}</style>', '</head>'],
'HEAD element with TEMPLATE element'
=> ['<head><template id="test"><p>Test</p></template></title>', '</head>'],
'HEAD element with TITLE element' => ['<head><title>Test</title>', '</head>'],
'HEAD element with uppercase TEMPLATE element'
=> ['<head><TEMPLATE id="test"><p>Test</p></TEMPLATE></title>', '</head>'],
'HEAD element with uppercase TITLE element' => ['<head><TITLE>Test</TITLE>', '</head>'],
];
}

Expand Down

0 comments on commit 57aa616

Please sign in to comment.