-
-
Notifications
You must be signed in to change notification settings - Fork 799
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of writing raw UTF-8 encoded byte arrays #1349
base: 2.19
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -647,11 +647,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException | |
_flushBuffer(); | ||
} | ||
_outputBuffer[_outputTail++] = _quoteChar; | ||
|
||
// When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into | ||
// using the byte value. | ||
final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits(); | ||
|
||
// One or multiple segments? | ||
if (len <= _outputMaxContiguous) { | ||
_writeUTF8Segment(text, offset, len); | ||
_writeUTF8Segment(text, offset, len, extendedOutputEscapes); | ||
} else { | ||
_writeUTF8Segments(text, offset, len); | ||
_writeUTF8Segments(text, offset, len, extendedOutputEscapes); | ||
} | ||
if (_outputTail >= _outputEnd) { | ||
_flushBuffer(); | ||
|
@@ -1846,28 +1851,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in | |
* to fit in the output buffer after escaping; as such, we just need to | ||
* chunk writes. | ||
*/ | ||
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen) | ||
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes) | ||
throws IOException, JsonGenerationException | ||
{ | ||
do { | ||
int len = Math.min(_outputMaxContiguous, totalLen); | ||
_writeUTF8Segment(utf8, offset, len); | ||
_writeUTF8Segment(utf8, offset, len, extendedOutputEscapes); | ||
offset += len; | ||
totalLen -= len; | ||
} while (totalLen > 0); | ||
} | ||
|
||
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len) | ||
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes) | ||
throws IOException, JsonGenerationException | ||
{ | ||
// fast loop to see if escaping is needed; don't copy, just look | ||
final int[] escCodes = _outputEscapes; | ||
|
||
for (int ptr = offset, end = offset + len; ptr < end; ) { | ||
// 28-Feb-2011, tatu: escape codes just cover 7-bit range, so: | ||
int ch = utf8[ptr++]; | ||
if ((ch >= 0) && escCodes[ch] != 0) { | ||
_writeUTF8Segment2(utf8, offset, len); | ||
int ch = utf8[ptr++] & 0xFF; | ||
if (extendedOutputEscapes[ch] != 0) { | ||
_writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes); | ||
return; | ||
} | ||
} | ||
|
@@ -1880,7 +1883,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le | |
_outputTail += len; | ||
} | ||
|
||
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) | ||
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes) | ||
throws IOException, JsonGenerationException | ||
{ | ||
int outputPtr = _outputTail; | ||
|
@@ -1892,17 +1895,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) | |
} | ||
|
||
final byte[] outputBuffer = _outputBuffer; | ||
final int[] escCodes = _outputEscapes; | ||
len += offset; // so 'len' becomes 'end' | ||
|
||
while (offset < len) { | ||
byte b = utf8[offset++]; | ||
int ch = b; | ||
if (ch < 0 || escCodes[ch] == 0) { | ||
int ch = b & 0xFF; | ||
int escape = extendedOutputEscapes[ch]; | ||
if (escape == 0) { | ||
outputBuffer[outputPtr++] = b; | ||
continue; | ||
} | ||
int escape = escCodes[ch]; | ||
if (escape > 0) { // 2-char escape, fine | ||
outputBuffer[outputPtr++] = BYTE_BACKSLASH; | ||
outputBuffer[outputPtr++] = (byte) escape; | ||
|
@@ -1914,6 +1916,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len) | |
_outputTail = outputPtr; | ||
} | ||
|
||
private int[] _extendOutputEscapesTo8Bits() { | ||
final int[] escapes = _outputEscapes; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok: I am fine with the idea, but will propose one change: instead of overwriting The reason is just that ideally There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The benefit of overwriting the field is that it will automatically be recreated if it is reset to a 7-bit wide LUT. Storing the 8-bit table in a separate field might get out of date, especially since |
||
if (escapes.length >= 0xFF) { | ||
return escapes; | ||
} | ||
|
||
final int[] extended = new int[0xFF]; | ||
System.arraycopy(escapes, 0, extended, 0, escapes.length); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is |
||
_outputEscapes = extended; | ||
return extended; | ||
} | ||
|
||
/* | ||
/********************************************************** | ||
/* Internal methods, low-level writing, base64 encoded | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
package perf; | ||
|
||
import com.fasterxml.jackson.core.JsonFactory; | ||
import com.fasterxml.jackson.core.JsonGenerator; | ||
import com.fasterxml.jackson.core.io.CharTypes; | ||
|
||
import java.io.ByteArrayOutputStream; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
/** | ||
* Benchmarks the performance of writing UTF-8 encoded bytes, in particular the difference between using a 7-bit wide | ||
* lookup table for escapes, versus a full 8-bit wide table. The latter is beneficial when processing encoded UTF-8 | ||
* bytes, as the byte itself can directly be used as table index instead of needing an additional branch. | ||
* <p> | ||
* This benchmark implements the escaping UTF-8 write loops using both 7-bit and 8-bit tables to show their respective | ||
* differences, as well as testing {@link JsonGenerator#writeUTF8String} for benchmarking the production implementation. | ||
* | ||
* @see <a href="https://github.com/FasterXML/jackson-core/pull/1349">Github PR</a> | ||
*/ | ||
public class ManualUtf8WriteTest | ||
{ | ||
private String test(byte[] utf8) throws Exception | ||
{ | ||
final byte[] OUTPUT = new byte[utf8.length * 2]; | ||
ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream(utf8.length * 2); | ||
JsonGenerator generator = new JsonFactory().createGenerator(OUTPUT_STREAM); | ||
|
||
// Let's try to guestimate suitable size, N megs of output | ||
final int REPS = (int) ((double) (80 * 1000 * 1000) / (double) utf8.length); | ||
System.out.printf("%d bytes to scan, will do %d repetitions\n", | ||
utf8.length, REPS); | ||
|
||
int i = 0; | ||
int roundsDone = 0; | ||
final int TYPES = 3; | ||
final int WARMUP_ROUNDS = 5; | ||
final int ROUNDS = WARMUP_ROUNDS + 10; | ||
|
||
final long[] times = new long[TYPES]; | ||
|
||
while (i < ROUNDS * TYPES) { | ||
int round = i++ % TYPES; | ||
|
||
String msg; | ||
|
||
long msecs; | ||
switch (round) { | ||
case 0: | ||
msg = "Write UTF-8 [7-bit escaping table]"; | ||
msecs = writeUtf8_7BitEscapingTable(REPS, utf8, OUTPUT); | ||
break; | ||
case 1: | ||
msg = "Write UTF-8 [8-bit escaping table]"; | ||
msecs = writeUtf8_8BitEscapingTable(REPS, utf8, OUTPUT); | ||
break; | ||
case 2: | ||
msg = "JsonGenerator.writeUTF8String "; | ||
msecs = writeUtf8_JsonGenerator(REPS, utf8, OUTPUT_STREAM, generator); | ||
break; | ||
default: | ||
throw new Error(); | ||
} | ||
// skip first 5 rounds to let results stabilize | ||
if (roundsDone >= WARMUP_ROUNDS) { | ||
times[round] += msecs; | ||
} | ||
|
||
System.out.printf("Test '%s' -> %3d msecs\n", msg, msecs); | ||
if (round == TYPES - 1) { | ||
++roundsDone; | ||
if ((roundsDone % 3) == 0) { | ||
System.out.println("[GC]"); | ||
Thread.sleep(100L); | ||
System.gc(); | ||
Thread.sleep(100L); | ||
} | ||
System.out.println(); | ||
} | ||
} | ||
double den = roundsDone - WARMUP_ROUNDS; | ||
|
||
return String.format("(7-bit, 8-bit, JsonGenerator): %5.1f / %5.1f / %5.1f msecs", | ||
times[0] / den, times[1] / den, times[2] / den); | ||
} | ||
|
||
private final long writeUtf8_7BitEscapingTable(int REPS, byte[] input, byte[] output) | ||
{ | ||
long start = System.currentTimeMillis(); | ||
int[] outputEscapes = CharTypes.get7BitOutputEscapes(); | ||
|
||
while (--REPS >= 0) { | ||
int inOffset = 0; | ||
int outOffset = 0; | ||
int len = input.length; | ||
|
||
while (inOffset < len) { | ||
byte b = input[inOffset++]; | ||
int ch = b; | ||
if (ch < 0 || outputEscapes[ch] == 0) { | ||
output[outOffset++] = b; | ||
continue; | ||
} | ||
int escape = outputEscapes[ch]; | ||
if (escape > 0) { | ||
output[outOffset++] = (byte) '\\'; | ||
output[outOffset++] = (byte) escape; | ||
} else { | ||
throw new UnsupportedOperationException("ctrl character escapes are not covered in test"); | ||
} | ||
} | ||
} | ||
long time = System.currentTimeMillis() - start; | ||
return time; | ||
} | ||
|
||
private final long writeUtf8_8BitEscapingTable(int REPS, byte[] input, byte[] output) | ||
{ | ||
long start = System.currentTimeMillis(); | ||
|
||
int[] outputEscapes = CharTypes.get7BitOutputEscapes(); | ||
int[] extendedOutputEscapes = new int[0xFF]; | ||
System.arraycopy(outputEscapes, 0, extendedOutputEscapes, 0, outputEscapes.length); | ||
|
||
while (--REPS >= 0) { | ||
int inOffset = 0; | ||
int outOffset = 0; | ||
int len = input.length; | ||
|
||
while (inOffset < len) { | ||
byte b = input[inOffset++]; | ||
int ch = b & 0xFF; | ||
int escape = extendedOutputEscapes[ch]; | ||
if (escape == 0) { | ||
output[outOffset++] = b; | ||
continue; | ||
} | ||
if (escape > 0) { | ||
output[outOffset++] = (byte) '\\'; | ||
output[outOffset++] = (byte) escape; | ||
} else { | ||
throw new UnsupportedOperationException("ctrl character escapes are not covered in test"); | ||
} | ||
} | ||
} | ||
|
||
long time = System.currentTimeMillis() - start; | ||
return time; | ||
} | ||
|
||
private final long writeUtf8_JsonGenerator(int REPS, byte[] input, ByteArrayOutputStream output, JsonGenerator generator) throws IOException { | ||
long start = System.currentTimeMillis(); | ||
|
||
while (--REPS >= 0) { | ||
output.reset(); | ||
generator.writeUTF8String(input, 0, input.length); | ||
generator.flush(); | ||
} | ||
|
||
long time = System.currentTimeMillis() - start; | ||
return time; | ||
} | ||
|
||
public static void main(String[] args) throws Exception | ||
{ | ||
if (args.length != 0) { | ||
System.err.println("Usage: java ..."); | ||
System.exit(1); | ||
} | ||
|
||
final int[] LENGTHS = new int[]{8, 16, 32, 256, 512, 1024, 1024 * 8}; | ||
final String[] ESCAPE_VARIANTS = new String[] {"none", "start", "end"}; | ||
final List<String> results = new ArrayList<String>(); | ||
for (int length : LENGTHS){ | ||
final byte[] buffer = new byte[length]; | ||
|
||
for (int j = 0; j < ESCAPE_VARIANTS.length; j++) { | ||
Arrays.fill(buffer, (byte) 'a'); | ||
|
||
if (j == 1) { | ||
buffer[0] = '"'; | ||
} else if (j == 2) { | ||
buffer[buffer.length - 1] = '"'; | ||
} | ||
|
||
String LABEL = String.format("Length %4d, %5s escape", length, ESCAPE_VARIANTS[j]); | ||
|
||
System.out.printf("Starting %s %n", LABEL); | ||
String result = new ManualUtf8WriteTest().test(buffer); | ||
System.out.printf("Finished %s %n", LABEL); | ||
System.out.println("================================================================================"); | ||
|
||
results.add(String.format("%s: %s", LABEL, result)); | ||
} | ||
} | ||
|
||
for (String result : results) { | ||
System.out.println(result); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok... this is the (only) part I find problematic. Having to dynamically change
_outputEscapes
seems problematic, although I understand why it is being done. Since it is something that may be changed by a call to JsonGenerator.setCharacterEscapes(),
modifications cannot be done in the constructor. But: I have an idea for somewhat bigger changes that would make it possible to eagerly ensure
_outputEscapes
is 256 elements long. Will add a separate comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, scratch that. Only now realized this is limited to
writeUTF8String
, not all escaping. So dynamically copying + changing is actually reasonable since it's not always needed etc.