Skip to content

Commit

Permalink
Merge pull request #1304 from capricorn86/1039-can-not-parse-correctl…
Browse files Browse the repository at this point in the history
…y-html-with-nested-ul-and-li-tags

1039 can not parse correctly html with nested ul and li tags
  • Loading branch information
capricorn86 authored Mar 13, 2024
2 parents 4970c69 + 23e9616 commit 713aa3c
Show file tree
Hide file tree
Showing 11 changed files with 815 additions and 200 deletions.
710 changes: 710 additions & 0 deletions packages/happy-dom/src/config/HTMLElementConfig.ts

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Content models that an HTML element config entry can declare.
 *
 * The XML parser reads the `contentModel` of an element's config entry to decide
 * whether already open elements have to be auto-closed when a start tag is found,
 * and how the content following the start tag is read.
 */
enum HTMLElementConfigContentModelEnum {
// After the start tag has been read, the parser switches to its plain text read state
// and treats the content as text until the matching end tag.
rawText = 'rawText',
// The element may not be nested inside itself at any depth. When a start tag is found while
// an element with the same tag name is open, the parser pops open elements up to and
// including that element before creating the new one.
noSelfDescendants = 'noSelfDescendants',
// The element may not be a direct child of itself. When a start tag is found while the
// innermost open element has the same tag name, the parser closes that element first.
// NOTE(review): "Firs" looks like a typo of "First"; the identifier and value are referenced
// by the parser, so renaming needs a coordinated change.
noFirsLevelSelfDescendants = 'noFirsLevelSelfDescendants',
// The element can not have content. The parser closes it as soon as its start tag has been read.
noDescendants = 'noDescendants',
// No restriction is applied by the parser — presumably the default for most elements; confirm
// against HTMLElementConfig.
anyDescendants = 'anyDescendants'
}

export default HTMLElementConfigContentModelEnum;
119 changes: 0 additions & 119 deletions packages/happy-dom/src/config/HTMLElementLocalNameToClass.ts

This file was deleted.

4 changes: 0 additions & 4 deletions packages/happy-dom/src/config/HTMLElementPlainText.ts

This file was deleted.

18 changes: 0 additions & 18 deletions packages/happy-dom/src/config/HTMLElementUnnestable.ts

This file was deleted.

16 changes: 0 additions & 16 deletions packages/happy-dom/src/config/HTMLElementVoid.ts

This file was deleted.

8 changes: 8 additions & 0 deletions packages/happy-dom/src/config/IHTMLElementConfigEntity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import HTMLElementConfigContentModelEnum from './HTMLElementConfigContentModelEnum.js';

/**
 * Shape of a single entry in HTMLElementConfig, which is indexed by element local name.
 */
export default interface IHTMLElementConfigEntity {
// Name of the element class. Document uses it as a property key on the owner window
// to resolve the class when creating an element.
className: string;
// Local name of the element — presumably identical to the key of the entry in
// HTMLElementConfig; confirm against the config file.
localName: string;
// Tag name of the element — presumably the upper case form of the local name; confirm
// against the config file.
tagName: string;
// Content model that the XML parser uses to decide on auto-closing and raw text handling.
contentModel: HTMLElementConfigContentModelEnum;
}
6 changes: 4 additions & 2 deletions packages/happy-dom/src/nodes/document/Document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import DocumentFragment from '../document-fragment/DocumentFragment.js';
import XMLParser from '../../xml-parser/XMLParser.js';
import Event from '../../event/Event.js';
import DOMImplementation from '../../dom-implementation/DOMImplementation.js';
import HTMLElementLocalNameToClass from '../../config/HTMLElementLocalNameToClass.js';
import INodeFilter from '../../tree-walker/INodeFilter.js';
import NamespaceURI from '../../config/NamespaceURI.js';
import DocumentType from '../document-type/DocumentType.js';
Expand Down Expand Up @@ -51,6 +50,7 @@ import ISVGElementTagNameMap from '../../config/ISVGElementTagNameMap.js';
import ISVGElement from '../svg-element/ISVGElement.js';
import IHTMLFormElement from '../html-form-element/IHTMLFormElement.js';
import IHTMLAnchorElement from '../html-anchor-element/IHTMLAnchorElement.js';
import HTMLElementConfig from '../../config/HTMLElementConfig.js';

const PROCESSING_INSTRUCTION_TARGET_REGEXP = /^[a-z][a-z0-9-]+$/;

Expand Down Expand Up @@ -1131,7 +1131,9 @@ export default class Document extends Node implements IDocument {
}

const localName = qualifiedName.toLowerCase();
const elementClass = this[PropertySymbol.ownerWindow][HTMLElementLocalNameToClass[localName]];
const elementClass = HTMLElementConfig[localName]
? this[PropertySymbol.ownerWindow][HTMLElementConfig[localName].className]
: null;

// Known HTML element
if (elementClass) {
Expand Down
77 changes: 38 additions & 39 deletions packages/happy-dom/src/xml-parser/XMLParser.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import IDocument from '../nodes/document/IDocument.js';
import * as PropertySymbol from '../PropertySymbol.js';
import HTMLElementVoid from '../config/HTMLElementVoid.js';
import HTMLElementUnnestable from '../config/HTMLElementUnnestable.js';
import NamespaceURI from '../config/NamespaceURI.js';
import HTMLScriptElement from '../nodes/html-script-element/HTMLScriptElement.js';
import IElement from '../nodes/element/IElement.js';
import HTMLLinkElement from '../nodes/html-link-element/HTMLLinkElement.js';
import HTMLElementPlainText from '../config/HTMLElementPlainText.js';
import IDocumentType from '../nodes/document-type/IDocumentType.js';
import INode from '../nodes/node/INode.js';
import IDocumentFragment from '../nodes/document-fragment/IDocumentFragment.js';
import HTMLElementConfig from '../config/HTMLElementConfig.js';
import * as Entities from 'entities';
import HTMLElementConfigContentModelEnum from '../config/HTMLElementConfigContentModelEnum.js';

/**
* Markup RegExp.
Expand Down Expand Up @@ -58,6 +57,8 @@ const DOCUMENT_TYPE_ATTRIBUTE_REGEXP = /"([^"]+)"/gm;

/**
* XML parser.
*
* @see https://html.spec.whatwg.org/multipage/indices.html
*/
export default class XMLParser {
/**
Expand All @@ -77,12 +78,11 @@ export default class XMLParser {
): IElement | IDocumentFragment | IDocument {
const root = options && options.rootNode ? options.rootNode : document.createDocumentFragment();
const stack: INode[] = [root];
const stackTagNames: string[] = [];
const markupRegexp = new RegExp(MARKUP_REGEXP, 'gm');
const { evaluateScripts = false } = options || {};
const unnestableTagNames: string[] = [];
let currentNode: INode | null = root;
let match: RegExpExecArray;
let plainTextTagName: string | null = null;
let readState: MarkupReadStateEnum = MarkupReadStateEnum.startOrEndTag;
let startTagIndex = 0;
let lastIndex = 0;
Expand All @@ -108,19 +108,31 @@ export default class XMLParser {
// Start tag.
const tagName = match[1].toUpperCase();
const localName = tagName === 'SVG' ? 'svg' : match[1];
const config = HTMLElementConfig[localName];

// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed).
// Therefore we need to auto-close the tag, so that it becomes valid (e.g. "<a></a><a></a>").
const unnestableTagNameIndex = unnestableTagNames.indexOf(tagName);
if (unnestableTagNameIndex !== -1) {
unnestableTagNames.splice(unnestableTagNameIndex, 1);
if (
config?.contentModel ===
HTMLElementConfigContentModelEnum.noFirsLevelSelfDescendants &&
stackTagNames[stackTagNames.length - 1] === tagName
) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
} else if (
config?.contentModel === HTMLElementConfigContentModelEnum.noSelfDescendants &&
stackTagNames.includes(tagName)
) {
while (currentNode !== root) {
if ((<IElement>currentNode)[PropertySymbol.tagName].toUpperCase() === tagName) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
break;
}
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
}
}
Expand All @@ -136,25 +148,18 @@ export default class XMLParser {
currentNode.appendChild(newElement);
currentNode = newElement;
stack.push(currentNode);
stackTagNames.push(tagName);
readState = MarkupReadStateEnum.insideStartTag;
startTagIndex = markupRegexp.lastIndex;
} else if (match[2]) {
// End tag.

if (
match[2].toUpperCase() ===
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
(<IElement>currentNode)[PropertySymbol.tagName]?.toUpperCase()
) {
// Some elements are not allowed to be nested (e.g. "<a><a></a></a>" is not allowed.).
// Therefore we need to auto-close the tag, so that it become valid (e.g. "<a></a><a></a>").
const unnestableTagNameIndex = unnestableTagNames.indexOf(
(<IElement>currentNode)[PropertySymbol.tagName].toUpperCase()
);
if (unnestableTagNameIndex !== -1) {
unnestableTagNames.splice(unnestableTagNameIndex, 1);
}

stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
}
} else if (
Expand Down Expand Up @@ -201,8 +206,6 @@ export default class XMLParser {
case MarkupReadStateEnum.insideStartTag:
// End of start tag
if (match[7] || match[8]) {
// End of start tag.

// Attribute name and value.

const attributeString = xml.substring(startTagIndex, match.index);
Expand Down Expand Up @@ -257,33 +260,27 @@ export default class XMLParser {
// We need to check if the attribute string is read completely.
// The attribute string can potentially contain "/>" or ">".
if (hasAttributeStringEnded) {
const config = HTMLElementConfig[(<IElement>currentNode)[PropertySymbol.localName]];

// Checks if the tag is a self closing tag (ends with "/>") or void element.
// When it is a self closing tag or void element it should be closed immediately.
// Self closing tags are not allowed in the HTML namespace, but the parser should still allow it for void elements.
// Self closing tags are supported in the SVG namespace.
if (
HTMLElementVoid[(<IElement>currentNode)[PropertySymbol.tagName]] ||
config?.contentModel === HTMLElementConfigContentModelEnum.noDescendants ||
// SVG tag is self closing (<svg/>).
(match[7] &&
(<IElement>currentNode)[PropertySymbol.namespaceURI] === NamespaceURI.svg)
) {
stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
readState = MarkupReadStateEnum.startOrEndTag;
} else {
// Plain text elements such as <script> and <style> should only contain text.
plainTextTagName = HTMLElementPlainText[
(<IElement>currentNode)[PropertySymbol.tagName]
]
? (<IElement>currentNode)[PropertySymbol.tagName]
: null;

readState = !!plainTextTagName
? MarkupReadStateEnum.plainTextContent
: MarkupReadStateEnum.startOrEndTag;

if (HTMLElementUnnestable[(<IElement>currentNode)[PropertySymbol.tagName]]) {
unnestableTagNames.push((<IElement>currentNode)[PropertySymbol.tagName]);
}
readState =
config?.contentModel === HTMLElementConfigContentModelEnum.rawText
? MarkupReadStateEnum.plainTextContent
: MarkupReadStateEnum.startOrEndTag;
}

startTagIndex = markupRegexp.lastIndex;
Expand All @@ -292,15 +289,17 @@ export default class XMLParser {

break;
case MarkupReadStateEnum.plainTextContent:
if (match[2] && match[2].toUpperCase() === plainTextTagName) {
const tagName = currentNode[PropertySymbol.tagName];

if (tagName && match[2] && match[2].toUpperCase() === tagName) {
// End of plain text tag.

// Scripts are not allowed to be executed when they are parsed using innerHTML, outerHTML, replaceWith() etc.
// However, they are allowed to be executed when document.write() is used.
// See: https://developer.mozilla.org/en-US/docs/Web/API/HTMLScriptElement
if (plainTextTagName === 'SCRIPT') {
if (tagName === 'SCRIPT') {
(<HTMLScriptElement>currentNode)[PropertySymbol.evaluateScript] = evaluateScripts;
} else if (plainTextTagName === 'LINK') {
} else if (tagName === 'LINK') {
// An assumption that the same rule should be applied for the HTMLLinkElement is made here.
(<HTMLLinkElement>currentNode)[PropertySymbol.evaluateCSS] = evaluateScripts;
}
Expand All @@ -313,8 +312,8 @@ export default class XMLParser {
);

stack.pop();
stackTagNames.pop();
currentNode = stack[stack.length - 1] || root;
plainTextTagName = null;
readState = MarkupReadStateEnum.startOrEndTag;
}

Expand Down
Loading

0 comments on commit 713aa3c

Please sign in to comment.