Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUGFIX] Check existing Content-Type is in <head> #961

Merged
merged 4 commits into from
Apr 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ This project adheres to [Semantic Versioning](https://semver.org/).
### Removed

### Fixed
- Ignore `http-equiv` `Content-Type` in `<body>`
([#961](https://github.com/MyIntervals/emogrifier/pull/961))
- Allow "Content-Type" in content
([#959](https://github.com/MyIntervals/emogrifier/pull/959))

Expand Down
125 changes: 121 additions & 4 deletions src/HtmlProcessor/AbstractHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,30 @@ abstract class AbstractHtmlProcessor
*/
protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

/**
* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
* for any other element would implicitly start the `<body>` element due to tag omission rules.
*
* This is a fragment only (a non-capturing alternation without delimiters or modifiers); `hasEndOfHeadElement()`
* embeds it in a negative lookahead within a larger, case-insensitive pattern.
*
* @var string
*/
protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
= '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

/**
* Regular expression pattern to match an HTML comment, including delimiters and modifiers.
*
* The pattern matches from `<!--` up to and including the first `-->`. A comment which is never terminated is
* matched through to the end of the subject string.
*
* @var string
*/
protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

/**
* Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers.
*
* Matching is case-insensitive and runs from the `<template` start tag (which must be followed by whitespace or `>`)
* up to and including the first `</template>`. An element without an end tag is matched through to the end of the
* subject string.
*
* @var string
*/
protected const HTML_TEMPLATE_ELEMENT_PATTERN
= '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

/**
* @var \DOMDocument|null
*/
Expand Down Expand Up @@ -271,10 +295,7 @@ private function normalizeDocumentType(string $html): string
*/
private function addContentTypeMetaTag(string $html): string
{
$contentTypeMetaTagMatchCount
= \preg_match('%<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>]%i', $html);
$hasContentTypeMetaTag = \is_int($contentTypeMetaTagMatchCount) && $contentTypeMetaTagMatchCount > 0;
if ($hasContentTypeMetaTag) {
if ($this->hasContentTypeMetaTagInHead($html)) {
return $html;
}

Expand Down Expand Up @@ -302,6 +323,102 @@ private function addContentTypeMetaTag(string $html): string
return $reworkedHtml;
}

/**
 * Determines whether the given HTML already declares a `Content-Type` via an `http-equiv` `<meta>` element that is
 * located within the `<head>` element.
 *
 * Only the first such `<meta>` tag is considered. The markup preceding it is inspected for anything that ends the
 * `<head>`: per tag omission rules, a start tag for an element permitted only within the `<body>` implicitly ends the
 * `<head>` and starts the `<body>`.
 *
 * @param string $html
 *
 * @return bool `true` if a `Content-Type` `<meta>` tag was found and the `<head>` has not ended before it
 */
private function hasContentTypeMetaTagInHead(string $html): bool
{
    // Lazily capture everything in front of the first `Content-Type` `<meta>` tag; the tag itself is only asserted
    // by the lookahead, so it is not part of the match.
    \preg_match('%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is', $html, $matches);
    if (!isset($matches[0])) {
        // No `Content-Type` `<meta>` tag anywhere in the document.
        return false;
    }

    try {
        return !$this->hasEndOfHeadElement($matches[0]);
    } catch (\RuntimeException $exception) {
        // If something unexpected occurs, report it and assume the `Content-Type` that was found is valid.
        \trigger_error($exception->getMessage());

        return true;
    }
}

/**
 * Determines whether the `<head>` element comes to an end somewhere within the given HTML, either explicitly through
 * a `</head>` end tag or implicitly: per tag omission rules, HTML parsers are expected to end the `<head>` element
 * and start the `<body>` element upon encountering a start tag for any element which is permitted only within the
 * `<body>`.
 *
 * Markup inside comments and `<template>` elements does not count.
 *
 * @param string $html
 *
 * @return bool
 *
 * @throws \RuntimeException if stripping comments or `<template>` elements fails
 */
private function hasEndOfHeadElement(string $html): bool
{
    $candidateMatchCount
        = \preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html);
    if (!\is_int($candidateMatchCount) || $candidateMatchCount <= 0) {
        return false;
    }

    // A candidate was found, but it may sit inside a comment or a `<template>` element, where it would not end the
    // `<head>`. As an optimization, the stripping is only done once a candidate has been found at all.
    $strippedHtml = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
    if ($strippedHtml === $html) {
        // Nothing was stripped, so the candidate is genuine.
        return true;
    }

    // Something was stripped: check again whether a candidate remains in what is left.
    return $this->hasEndOfHeadElement($strippedHtml);
}

/**
 * Strips all comments from the given HTML. An unterminated comment is stripped together with the entire remainder
 * of the string.
 *
 * @param string $html
 *
 * @return string the HTML without comments
 *
 * @throws \RuntimeException if the regular expression replacement does not yield a string
 */
private function removeHtmlComments(string $html): string
{
    $htmlWithoutComments = \preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
    if (\is_string($htmlWithoutComments)) {
        return $htmlWithoutComments;
    }

    throw new \RuntimeException('Internal PCRE error', 1616521475);
}

/**
 * Strips all `<template>` elements from the given HTML. An element lacking its end tag is stripped together with
 * the entire remainder of the string.
 *
 * @param string $html
 *
 * @return string the HTML without `<template>` elements
 *
 * @throws \RuntimeException if the regular expression replacement does not yield a string
 */
private function removeHtmlTemplateElements(string $html): string
{
    $htmlWithoutTemplates = \preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    if (\is_string($htmlWithoutTemplates)) {
        return $htmlWithoutTemplates;
    }

    throw new \RuntimeException('Internal PCRE error', 1616519652);
}

/**
* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
* self-closing slash.
Expand Down
57 changes: 51 additions & 6 deletions tests/Unit/HtmlProcessor/AbstractHtmlProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -435,12 +435,23 @@ public function provideContentWithoutHeadTag(): array
'body content only' => ['<p>Hello</p>'],
'BODY element' => ['<body></body>'],
'HEADER element' => ['<header></header>'],
'META element (implicit HEAD)' => ['<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'http-equiv META element (implicit HEAD)'
=> ['<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'viewport META element (implicit HEAD)'
=> ['<meta name="viewport" content="width=device-width, initial-scale=1.0">'],
'META element with Content-Type as a value' => ['<meta name="description" content="Content-Type">'],
'BODY element with Content-Type in text' => ['<body>Content-Type</body>'],
'body content only with Content-Type in text' => ['<p>Content-Type</p>'],
// broken: BODY element containing Content-Type META tag
// broken: body content only with Content-Type META tag
'http-equiv META element within BODY (not allowed)'
=> ['<body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'http-equiv META element after P (implicit BODY, not allowed)'
=> ['<p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'http-equiv META element within P (not allowed)'
=> ['<p><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></p>'],
'viewport META element within P (allowed)'
=> ['<p><meta name="viewport" content="width=device-width, initial-scale=1.0"></p>'],
'http-equiv META element within HEADER (not allowed)'
=> ['<header><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></header>'],
];
}

Expand All @@ -451,7 +462,7 @@ public function provideContentWithoutHeadTag(): array
*
* @dataProvider provideContentWithoutHeadTag
*/
public function addsMissingHeadTagOnlyOnce(string $html): void
public function addsMissingHeadTagExactlyOnce(string $html): void
{
$subject = TestingHtmlProcessor::fromHtml($html);

Expand All @@ -473,13 +484,22 @@ public function provideContentWithHeadTag(): array
'HEAD element with attribute' => ['<head lang="en"></head>'],
'HEAD element and HEADER element' => ['<head></head><header></header>'],
'HEAD element with Content-Type in comment' => ['<head><!-- Content-Type --></head>'],
'HEAD element with Content-Type as META value' => ['<meta name="description" content="Content-Type">'],
'HEAD element with Content-Type as META value'
=> ['<head><meta name="description" content="Content-Type"></head>'],
'with BODY element with Content-Type in text' => ['<head></head><body>Content-Type</body>'],
'with implicit body content with Content-Type in text' => ['<head></head><p>Content-Type</p>'],
'with BODY element containing Content-Type META tag'
=> ['<head></head><body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'with implicit body content with Content-Type META tag'
=> ['<head></head><p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with end tag omitted and BODY element containing Content-Type META tag'
=> ['<head><body><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></body>'],
'with end tag omitted and implicit body content with Content-Type META tag'
=> ['<head><p>hello</p><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with Content-Type META tag after end tag'
=> ['<head></head><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
'with Content-Type META tag after uppercase end tag'
=> ['<HEAD></HEAD><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'],
];
}

Expand Down Expand Up @@ -711,8 +731,16 @@ public function addsMissingContentTypeMetaTagOnlyOnce(string $html): void

$result = $subject->render();

$headEndPosition = \stripos($result, '</head>');
$resultBeforeHeadEnd = $headEndPosition !== false ? \substr($result, 0, $headEndPosition) : $result;
// PHP DOM does not understand `<header>` element so does not know it would implicitly start `<body>`.
$headerStartPosition = \stripos($resultBeforeHeadEnd, '<header');
if ($headerStartPosition !== false) {
$resultBeforeHeadEnd = \substr($resultBeforeHeadEnd, 0, $headerStartPosition);
}

$numberOfContentTypeMetaTags = \substr_count(
$result,
$resultBeforeHeadEnd,
'<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
);
self::assertSame(1, $numberOfContentTypeMetaTags);
Expand Down Expand Up @@ -756,6 +784,23 @@ public function provideHtmlAroundContentType(): array
'HEAD element with attribute' => ['<head lang="en">', '</head>'],
'HTML, HEAD, and BODY with HEADER elements'
=> ['<html><head>', '</head><body><header></header></body></html>'],
'HEAD element with comment' => ['<head><!--Test-->', '</head>'],
'HEAD element with commented-out BODY start tag' => ['<head><!--<body>-->', '</head>'],
'HEAD element with BASE element' => ['<head><base href="https://example.com"/>', '</head>'],
'HEAD element with COMMAND element' => ['<head><command type="command"/>', '</head>'],
'HEAD element with LINK element'
=> ['<head><link rel="stylesheet" href="https://example.org/css.css"/>', '</head>'],
'HEAD element with another META element' => ['<head><meta name="title" content="Test"/>', '</head>'],
'HEAD element with NOSCRIPT element'
=> ['<head><noscript><style>p{color:green}</style></noscript>', '</head>'],
'HEAD element with SCRIPT element' => ['<head><script>console.log("Test");</script>', '</head>'],
'HEAD element with STYLE element' => ['<head><style>p{color:green}</style>', '</head>'],
'HEAD element with TEMPLATE element'
=> ['<head><template id="test"><p>Test</p></template></title>', '</head>'],
'HEAD element with TITLE element' => ['<head><title>Test</title>', '</head>'],
'HEAD element with uppercase TEMPLATE element'
=> ['<head><TEMPLATE id="test"><p>Test</p></TEMPLATE></title>', '</head>'],
'HEAD element with uppercase TITLE element' => ['<head><TITLE>Test</TITLE>', '</head>'],
];
}

Expand Down