http: simplify checkIsHttpToken() #17399

Trott · 2017-11-30T13:22:24Z

Replace code optimized for older versions of V8 with more
straightforward code in checkIsHttpToken().

Checklist

make -j4 test (UNIX), or vcbuild test (Windows) passes
commit message follows commit guidelines

Affected core subsystem(s)

http

Trott · 2017-11-30T13:23:16Z

Benchmark results:

                                                                 improvement confidence      p.value
 http/check_is_http_token.js n=1000000 key=":"                      -67.14 %        *** 4.065668e-34
 http/check_is_http_token.js n=1000000 key=":alternate-protocol"    -54.61 %        *** 5.705110e-28
 http/check_is_http_token.js n=1000000 key="((((())))"              -66.34 %        *** 2.636118e-33
 http/check_is_http_token.js n=1000000 key="@@"                     -67.24 %        *** 2.329542e-28
 http/check_is_http_token.js n=1000000 key="Accept-Ranges"          146.79 %        *** 2.298935e-45
 http/check_is_http_token.js n=1000000 key="alt-svc"                 -3.79 %        *** 3.200379e-04
 http/check_is_http_token.js n=1000000 key="alternate-protocol:"    150.80 %        *** 1.859951e-41
 http/check_is_http_token.js n=1000000 key="alternate-protocol"     239.27 %        *** 2.887460e-42
 http/check_is_http_token.js n=1000000 key="Cache-Control"          148.05 %        *** 1.588821e-41
 http/check_is_http_token.js n=1000000 key="Connection"              38.53 %        *** 9.847708e-29
 http/check_is_http_token.js n=1000000 key="Content-Encoding"       205.91 %        *** 1.416987e-37
 http/check_is_http_token.js n=1000000 key="content-length"         157.29 %        *** 1.049744e-37
 http/check_is_http_token.js n=1000000 key="Content-Location"       203.87 %        *** 1.005885e-38
 http/check_is_http_token.js n=1000000 key="content-type"            60.62 %        *** 1.051226e-40
 http/check_is_http_token.js n=1000000 key="Content-Type"            61.86 %        *** 1.993077e-30
 http/check_is_http_token.js n=1000000 key="date"                   -58.40 %        *** 4.130459e-34
 http/check_is_http_token.js n=1000000 key="ETag"                   -58.21 %        *** 2.484249e-43
 http/check_is_http_token.js n=1000000 key="Expires"                 -1.77 %            8.884764e-02
 http/check_is_http_token.js n=1000000 key="Keep-Alive"              38.08 %        *** 3.190244e-35
 http/check_is_http_token.js n=1000000 key="Last-Modified"          150.14 %        *** 4.677389e-55
 http/check_is_http_token.js n=1000000 key="location"                12.19 %        *** 1.952609e-14
 http/check_is_http_token.js n=1000000 key="server"                 -18.56 %        *** 6.947429e-20
 http/check_is_http_token.js n=1000000 key="Server"                 -18.91 %        *** 1.729007e-19
 http/check_is_http_token.js n=1000000 key="status"                 -19.17 %        *** 4.218003e-27
 http/check_is_http_token.js n=1000000 key="TCN"                    -61.80 %        *** 1.219346e-23
 http/check_is_http_token.js n=1000000 key="Transfer-Encoding"      223.55 %        *** 1.359534e-40
 http/check_is_http_token.js n=1000000 key="Vary"                   -59.13 %        *** 3.830565e-45
 http/check_is_http_token.js n=1000000 key="version"                 -6.03 %        *** 2.214569e-04
 http/check_is_http_token.js n=1000000 key="x-frame-options"        164.26 %        *** 1.243066e-30
 http/check_is_http_token.js n=1000000 key="x-xss-protection"       204.88 %        *** 1.000083e-35
 http/check_is_http_token.js n=1000000 key="中文呢"                 -31.09 %        *** 6.800282e-23

Trott · 2017-11-30T13:30:28Z

Benchmark summary seems to be:

Longer valid tokens perform much better with this change.
Shorter valid tokens take a perf hit with this change.
Invalid tokens where the invalid character appears early on in the string perform better with the existing code, but you get the opposite result if it appears late in the string.

I'm tempted to log arguments sent to checkIsHttpToken() in a real world app to see what kinds of strings are processed and how often.

Trott · 2017-11-30T13:32:33Z

@nodejs/v8

cjihrig

LGTM for readability.

mscdex · 2017-11-30T21:36:57Z

I think I'd prefer a hybrid + expanded loop unrolling solution. The expanded unrolling would cover the most common http headers (based on headers listed on Wikipedia for example and also the list we use in _http_incoming.js when converting headers to lowercase) and the regexp would be used for larger, less common header names.

We could dynamically generate the function to avoid the lengthy function source code; it performs the same as the inline version:

// Matches a complete HTTP token: one or more characters, each of which is an
// ASCII letter, an ASCII digit, or one of ! # $ % & ' * + - . ^ _ ` | ~
const tokenRegExp = /^[\^_`a-zA-Z\-0-9!#$%&'*+.|~]+$/;

// Lookup table indexed by character code (0 - 255). An entry is 1 when the
// character may appear in an HTTP token and 0 when it may not. The entries
// are computed from tokenRegExp, so the table and the regexp always agree.
const validTokens = Array.from(
  { length: 256 },
  (unused, code) => (tokenRegExp.test(String.fromCharCode(code)) ? 1 : 0)
);
/**
 * Verifies that the given val is a valid HTTP token per the rules defined in
 * RFC 7230. See https://tools.ietf.org/html/rfc7230#section-3.2.6
 *
 * Values of up to 19 characters are checked with hand-unrolled lookups in the
 * validTokens table; longer values are handed to tokenRegExp.
 *
 * @param {string} val - Candidate token, e.g. a header name.
 * @returns {boolean} true when every character of val is a token character
 *   and val is not empty, false otherwise.
 */
function checkIsHttpToken(val) {
  const len = val.length;

  // Longer names take the regexp path instead of the unrolled checks.
  if (len > 19) return tokenRegExp.test(val);

  // For an empty string charCodeAt(0) is NaN, which has no table entry, so
  // the very first lookup rejects it.
  if (validTokens[val.charCodeAt(0)] !== 1) return false;
  if (len === 1) return true;
  if (validTokens[val.charCodeAt(1)] !== 1) return false;
  if (len === 2) return true;
  if (validTokens[val.charCodeAt(2)] !== 1) return false;
  if (len === 3) return true;
  if (validTokens[val.charCodeAt(3)] !== 1) return false;
  if (len === 4) return true;
  if (validTokens[val.charCodeAt(4)] !== 1) return false;
  if (len === 5) return true;
  if (validTokens[val.charCodeAt(5)] !== 1) return false;
  if (len === 6) return true;
  if (validTokens[val.charCodeAt(6)] !== 1) return false;
  if (len === 7) return true;
  if (validTokens[val.charCodeAt(7)] !== 1) return false;
  if (len === 8) return true;
  if (validTokens[val.charCodeAt(8)] !== 1) return false;
  if (len === 9) return true;
  if (validTokens[val.charCodeAt(9)] !== 1) return false;
  if (len === 10) return true;
  if (validTokens[val.charCodeAt(10)] !== 1) return false;
  if (len === 11) return true;
  if (validTokens[val.charCodeAt(11)] !== 1) return false;
  if (len === 12) return true;
  if (validTokens[val.charCodeAt(12)] !== 1) return false;
  if (len === 13) return true;
  if (validTokens[val.charCodeAt(13)] !== 1) return false;
  if (len === 14) return true;
  if (validTokens[val.charCodeAt(14)] !== 1) return false;
  if (len === 15) return true;
  if (validTokens[val.charCodeAt(15)] !== 1) return false;
  if (len === 16) return true;
  if (validTokens[val.charCodeAt(16)] !== 1) return false;
  if (len === 17) return true;
  if (validTokens[val.charCodeAt(17)] !== 1) return false;
  if (len === 18) return true;

  // Only a 19-character value reaches this point; its last character decides.
  return validTokens[val.charCodeAt(18)] === 1;
}

Benchmarking the various solutions with 1e8 iterations, I see these results for various headers:

	master	this PR	hybrid+expanded unroll
If-Unmodified-Since (19 chars)	10.7s	5.6s	3.4s
Content-Type (12 chars)	6.1s	4.3s	2.4s
Date (4 chars)	0.96s	3.0s	1.1s

ofrobots · 2017-11-30T22:12:11Z

It would be good to get some real world benchmarking data; but lacking that, my preference would be for readability.

apapirovski · 2017-12-01T01:01:52Z

Ignore my — now deleted — post re: for loop, testing on the wrong V8 version... It seems like the version @mscdex proposed is currently the best (albeit ugly) it gets.

Trott · 2017-12-01T01:11:21Z

@mscdex or anyone else: Do you have an explanation as to why your benchmarking shows the hybrid+expanded unroll faster for a 19-character value than this PR? I'm mystified as to how that could be and TBH it's making me look at those timings with a bit of side-eye....

mscdex · 2017-12-01T01:30:32Z

@Trott because a regexp (or just a loop for that matter) has a lot more overhead than a series of if statements? That was why I had unrolled the loop a bit originally, although it was not much because of Crankshaft's inlining requirements.

Trott · 2017-12-01T02:26:35Z

> @Trott because a regexp (or just a loop for that matter) has a lot more overhead than a series of if statements? That was why I had unrolled the loop a bit originally, although it was not much because of Crankshaft's inlining requirements.

@mscdex But for a 19-character string, your unrolled version uses a regexp too. So how is it faster for a 19-character string? Something isn't right...

apapirovski · 2017-12-01T02:47:10Z

@Trott it's > 19 so that test string still uses the unrolled checks.

Trott · 2017-12-01T03:37:32Z

> @Trott it's > 19 so that test string still uses the unrolled checks.

Ah! Off-by-one error in my brain.

psmarshall · 2017-12-03T10:29:12Z

@Trott It looks like the regexp code is faster at processing 8 or more characters. By 'processing' I mean how many characters it actually looks at, not how long the input string is, e.g. it bails out on the first character of ":alternate-protocol" and so is slower than the handwritten JS code. It looks like every string under 7 characters in length regresses.

My guess is that that is caused by the overhead of calling to the regexp builtin or some setup/initialization/allocation that we have to do for each match. I'm not an expert on regexps in v8 though.

The manual loop unrolling is concerning - this should definitely not be necessary to do by hand.

I like this code much better just based on readability. Parsing time will also be lower which is nice. Inlining decisions are more complicated than just the pure length of the function now - I think we have a higher budget for extremely small functions, so this could potentially help there too.

One more thing to think about - is this function even run as optimized code on a server? i.e. is it actually called enough times with reasonably stable input types. I don't have any intuition there. The microbenchmarks probably aren't stressing this code in the same way a real server would.

I'd suggest the following:

Log a v8 issue for the regexp being slower for < 7 characters. It could just be the overhead of calling to a builtin and we might not be able to do much about it - but worth confirming that at least, because it looks like that is stopping people from writing more idiomatic code.
Log a v8 issue for the manual loop unrolling/iteration peeling being faster. I've seen this elsewhere in node code so maybe it is a wider issue. Ideally the optimizing compiler should be better at doing this than humans, so it is definitely worth looking into as well.

apapirovski · 2017-12-08T18:51:32Z

I'm pretty in favour of landing this very soon (with the caveat we don't use it for v8.x or lower). @mscdex would you still like this to use unrolled checks? Could you make your request/objection more explicit if so, as otherwise this will end up landing eventually given the 3 approvals (incl 2 from TSC).

mscdex · 2017-12-08T22:32:59Z

@apapirovski to be honest a lot of the performance suggestions I make are too much for most people, so just take the suggestions/benchmark results I posted as some food for thought.

apapirovski · 2017-12-09T12:48:06Z

CI: https://ci.nodejs.org/job/node-test-pull-request/11996/

BridgeAR · 2017-12-10T03:49:09Z

lib/_http_common.js

-/**
- * Verifies that the given val is a valid HTTP token
- * per the rules defined in RFC 7230
- * See https://tools.ietf.org/html/rfc7230#section-3.2.6


I would like to keep this comment. When looking through the code it is good to have a reference handy.

@BridgeAR Restored.

Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken().

apapirovski · 2017-12-10T21:51:32Z

Landed in 9f55eac 🎉

Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken(). PR-URL: #17399 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Anna Henningsen <anna@addaleax.net> Reviewed-By: Anatoli Papirovski <apapirovski@mac.com> Reviewed-By: Timothy Gu <timothygu99@gmail.com>

Trott · 2017-12-10T22:27:25Z

Any volunteers to open the two issues for V8 as suggested by @psmarshall? I'll do it if no one else does, but I feel like someone who better understands V8 and benchmarking would do a better job... @nodejs/v8

bmeurer · 2017-12-11T03:55:49Z

Great job! 👍

Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken(). PR-URL: #17399 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Anna Henningsen <anna@addaleax.net> Reviewed-By: Anatoli Papirovski <apapirovski@mac.com> Reviewed-By: Timothy Gu <timothygu99@gmail.com>

mathiasbynens · 2017-12-12T20:00:07Z

> Any volunteers to open the two issues for V8 as suggested by @psmarshall? I'll do it if no one else does, but I feel like someone who better understands V8 and benchmarking would do a better job…

@Trott Please go ahead and file those issues. You’ve got this! 👍

Trott · 2017-12-12T23:24:01Z

V8 issues opened at https://bugs.chromium.org/p/v8/issues/detail?id=7200 and https://bugs.chromium.org/p/v8/issues/detail?id=7201.

In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data.

In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: nodejs#18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>

In the spirit of [17399](#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: #18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>

In the spirit of [17399](nodejs#17399), we can also simplify checkInvalidHeaderChar to use regex matching instead of a loop. This makes it faster on long matches and slower on short matches or non-matches. This change also includes some sample data from an AcmeAir benchmark run, as a rough proxy for real-world data. PR-URL: nodejs#18381 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Benedikt Meurer <benedikt.meurer@gmail.com> Reviewed-By: Tiancheng "Timothy" Gu <timothygu99@gmail.com>

nodejs-github-bot added the http Issues or PRs related to the http subsystem. label Nov 30, 2017

cjihrig approved these changes Nov 30, 2017

View reviewed changes

addaleax approved these changes Nov 30, 2017

View reviewed changes

apapirovski approved these changes Nov 30, 2017

View reviewed changes

MylesBorins force-pushed the master branch from b7405ab to 7f086dd Compare December 8, 2017 16:37

apapirovski added dont-land-on-v4.x labels Dec 8, 2017

TimothyGu approved these changes Dec 8, 2017

View reviewed changes

BridgeAR reviewed Dec 10, 2017

View reviewed changes

http: simplify checkIsHttpToken()

989c634

Replace code optimized for older versions of V8 with more straightforward code in checkIsHttpToken().

Trott force-pushed the istoken branch from 3b1223c to 989c634 Compare December 10, 2017 03:59

apapirovski closed this Dec 10, 2017

MylesBorins mentioned this pull request Dec 12, 2017

v9.3.0 proposal #17631

Merged

sethbrenith mentioned this pull request Jan 25, 2018

http: simplify checkInvalidHeaderChar #18381

Closed

3 tasks

Trott deleted the istoken branch January 13, 2022 22:48

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

http: simplify checkIsHttpToken() #17399

http: simplify checkIsHttpToken() #17399

Trott commented Nov 30, 2017

Trott commented Nov 30, 2017

Trott commented Nov 30, 2017

Trott commented Nov 30, 2017

cjihrig left a comment

mscdex commented Nov 30, 2017 •

edited

Loading

ofrobots commented Nov 30, 2017

apapirovski commented Dec 1, 2017

Trott commented Dec 1, 2017

mscdex commented Dec 1, 2017

Trott commented Dec 1, 2017

apapirovski commented Dec 1, 2017

Trott commented Dec 1, 2017

psmarshall commented Dec 3, 2017

apapirovski commented Dec 8, 2017 •

edited

Loading

mscdex commented Dec 8, 2017 •

edited

Loading

apapirovski commented Dec 9, 2017

BridgeAR Dec 10, 2017

Trott Dec 10, 2017

apapirovski commented Dec 10, 2017

Trott commented Dec 10, 2017

bmeurer commented Dec 11, 2017

mathiasbynens commented Dec 12, 2017 •

edited by Trott

Loading

Trott commented Dec 12, 2017

http: simplify checkIsHttpToken() #17399

http: simplify checkIsHttpToken() #17399

Conversation

Trott commented Nov 30, 2017

Checklist

Affected core subsystem(s)

Trott commented Nov 30, 2017

Trott commented Nov 30, 2017

Trott commented Nov 30, 2017

cjihrig left a comment

Choose a reason for hiding this comment

mscdex commented Nov 30, 2017 • edited Loading

ofrobots commented Nov 30, 2017

apapirovski commented Dec 1, 2017

Trott commented Dec 1, 2017

mscdex commented Dec 1, 2017

Trott commented Dec 1, 2017

apapirovski commented Dec 1, 2017

Trott commented Dec 1, 2017

psmarshall commented Dec 3, 2017

apapirovski commented Dec 8, 2017 • edited Loading

mscdex commented Dec 8, 2017 • edited Loading

apapirovski commented Dec 9, 2017

BridgeAR Dec 10, 2017

Choose a reason for hiding this comment

Trott Dec 10, 2017

Choose a reason for hiding this comment

apapirovski commented Dec 10, 2017

Trott commented Dec 10, 2017

bmeurer commented Dec 11, 2017

mathiasbynens commented Dec 12, 2017 • edited by Trott Loading

Trott commented Dec 12, 2017

mscdex commented Nov 30, 2017 •

edited

Loading

apapirovski commented Dec 8, 2017 •

edited

Loading

mscdex commented Dec 8, 2017 •

edited

Loading

mathiasbynens commented Dec 12, 2017 •

edited by Trott

Loading