Skip to content

Commit

Permalink
try to improve the performance a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
rbri committed Oct 27, 2024
1 parent 60b71ff commit ecac98f
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 19 deletions.
52 changes: 35 additions & 17 deletions src/main/java/org/htmlunit/cyberneko/HTMLTagBalancer.java
Original file line number Diff line number Diff line change
Expand Up @@ -785,14 +785,15 @@ else if (!fSeenRootElement && !fDocumentFragment) {
}
else {
if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) {
final int depth = getParentDepth(element.parent, element.bounds);
final int depth = getParentDepth(element);
if (depth == -1) { // no parent found
final String pname = modifyName(preferedParent.name, fNamesElems);
final QName qname = createQName(pname);
if (fReportErrors) {
final String ename = elem.getRawname();
fErrorReporter.reportWarning("HTML2004", new Object[]{ename, pname});
}

final QName qname = createQName(pname);
final boolean parentCreated = forceStartElement(qname, new XMLAttributesImpl(), synthesizedAugs());
if (!parentCreated) {
if (!isForcedCreation) {
Expand Down Expand Up @@ -1004,43 +1005,58 @@ public void characters(final XMLString text, final Augmentations augs) throws XN
return;
}

// is this text whitespace?
final boolean whitespace = text.isWhitespace();
if (!fDocumentFragment) {
// handle bare characters
if (!fSeenRootElement) {
forceStartBody();
}

if (whitespace && (fElementStack.top < 2 || endElementsBuffer_.size() == 1)) {
// ignore spaces directly within <html>
return;
// isWhitespace() can be an expensive operation if the text starts with
// a long run of whitespace;
// therefore we call it as late as possible - this leads to slightly
// strange code, but it is worth the price
int whitespace = -1;
if (fElementStack.top < 2 || endElementsBuffer_.size() == 1) {
whitespace = text.isWhitespace() ? 1: 0;
if (whitespace == 1) {
// ignore spaces directly within <html>
return;
}
}

// handle character content in head
// NOTE: This frequently happens when the document looks like:
// <title>Title</title>
// And here's some text.
else if (!whitespace) {
if (text.length() > 0) {
final Info info = fElementStack.peek();
if (info.element.code == HTMLElements.HEAD || info.element.code == HTMLElements.HTML) {
if (fReportErrors) {
final String hname = modifyName("head", fNamesElems);
final String bname = modifyName("body", fNamesElems);
fErrorReporter.reportWarning("HTML2009", new Object[]{hname, bname});
if (whitespace == 0) {
if (fReportErrors) {
final String hname = modifyName("head", fNamesElems);
final String bname = modifyName("body", fNamesElems);
fErrorReporter.reportWarning("HTML2009", new Object[]{hname, bname});
}
forceStartBody();
}
else if (whitespace == -1 && !text.isWhitespace()) {
if (fReportErrors) {
final String hname = modifyName("head", fNamesElems);
final String bname = modifyName("body", fNamesElems);
fErrorReporter.reportWarning("HTML2009", new Object[]{hname, bname});
}
forceStartBody();
}
forceStartBody();
}
}
}

fSeenCharacters = fSeenCharacters || !whitespace;
fSeenCharacters = fSeenCharacters || text.length() > 0;

// call handler
if (documentHandler_ != null) {
documentHandler_.characters(text, augs);
}

}

/** End element. */
Expand Down Expand Up @@ -1259,8 +1275,10 @@ protected final int getElementDepth(final HTMLElements.Element element) {
* @param element The element whose {@code parent} array and
*        {@code bounds} code are read.
*/
protected int getParentDepth(final HTMLElements.Element[] parents, final short bounds) {
if (parents != null) {
protected int getParentDepth(final Element element) {
final HTMLElements.Element[] parents = element.parent;
if (parents != null && parents.length > 0) {
final short bounds = element.bounds;
for (int i = fElementStack.top - 1; i >= 0; i--) {
final Info info = fElementStack.data[i];
if (info.element.code == bounds) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,9 @@ public XMLString trimToContent(final String startMarker, final String endMarker)
* @return true if we have only whitespace, false otherwise
*/
public boolean isWhitespace() {
for (int i = 0; i < length_; i++) {
// there is a good chance that any whitespace is at the beginning;
// therefore it makes sense to start at the end so we can return early
for (int i = length_ - 1; i >= 0; i--) {
if (!Character.isWhitespace(data_[i])) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1165,22 +1165,52 @@ public void isWhitespace() {
{
final XMLString a = new XMLString("");
assertTrue(a.isWhitespace());
assertTrue(a.isWhitespace());
}
{
final XMLString a = new XMLString(" ");
assertTrue(a.isWhitespace());
assertTrue(a.isWhitespace());
}
{
final XMLString a = new XMLString("a");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString(" a \n");
final XMLString a = new XMLString("a ");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString("a ");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString(" a");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString(" a");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString("a \n");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString(" \na");
assertFalse(a.isWhitespace());
assertFalse(a.isWhitespace());
}
{
final XMLString a = new XMLString(" \n");
assertTrue(a.isWhitespace());
assertTrue(a.isWhitespace());
}
}

Expand Down

0 comments on commit ecac98f

Please sign in to comment.