Skip to content

Commit

Permalink
Refactor TSDB doc_values util to allow introducing a new codec
Browse files Browse the repository at this point in the history
  • Loading branch information
dnhatn committed Oct 17, 2024
1 parent d3fcead commit 631c259
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public abstract class AbstractDocValuesForUtilBenchmark {
protected final int blockSize;

public AbstractDocValuesForUtilBenchmark() {
this.forUtil = new DocValuesForUtil();
this.forUtil = new DocValuesForUtil(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
this.blockSize = ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;
import org.elasticsearch.index.codec.tsdb.DocValuesForUtil;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
Expand Down Expand Up @@ -44,7 +43,7 @@ public void setupInvocation(int bitsPerValue) {

@Override
public void benchmark(int bitsPerValue, Blackhole bh) throws IOException {
DocValuesForUtil.decode(bitsPerValue, this.dataInput, this.output);
forUtil.decode(bitsPerValue, this.dataInput, this.output);
bh.consume(this.output);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ public class DocValuesForUtil {
private static final int BITS_IN_FIVE_BYTES = 5 * Byte.SIZE;
private static final int BITS_IN_SIX_BYTES = 6 * Byte.SIZE;
private static final int BITS_IN_SEVEN_BYTES = 7 * Byte.SIZE;
private static final int blockSize = ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
private final int blockSize;
private final byte[] encoded = new byte[1024];

public DocValuesForUtil() {}
public DocValuesForUtil(int numericBlockSize) {
this.blockSize = numericBlockSize;
}

public static int roundBits(int bitsPerValue) {
if (bitsPerValue > 24 && bitsPerValue <= 32) {
Expand Down Expand Up @@ -67,7 +69,7 @@ private void encodeFiveSixOrSevenBytesPerValue(long[] in, int bitsPerValue, fina
out.writeBytes(this.encoded, bytesPerValue * in.length);
}

public static void decode(int bitsPerValue, final DataInput in, long[] out) throws IOException {
public void decode(int bitsPerValue, final DataInput in, long[] out) throws IOException {
if (bitsPerValue <= 24) {
ForUtil.decode(bitsPerValue, in, out);
} else if (bitsPerValue <= 32) {
Expand All @@ -81,7 +83,7 @@ public static void decode(int bitsPerValue, final DataInput in, long[] out) thro
}
}

private static void decodeFiveSixOrSevenBytesPerValue(int bitsPerValue, final DataInput in, long[] out) throws IOException {
private void decodeFiveSixOrSevenBytesPerValue(int bitsPerValue, final DataInput in, long[] out) throws IOException {
// NOTE: we expect multibyte values to be written "least significant byte" first
int bytesPerValue = bitsPerValue / Byte.SIZE;
long mask = (1L << bitsPerValue) - 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ private long[] writeField(FieldInfo field, DocValuesProducer valuesProducer, lon
if (maxOrd != 1) {
final long[] buffer = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
int bufferSize = 0;
final ES87TSDBDocValuesEncoder encoder = new ES87TSDBDocValuesEncoder();
final TSDBDocValuesEncoder encoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
values = valuesProducer.getSortedNumeric(field);
final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@ public long longValue() {

private final int maxDoc = ES87TSDBDocValuesProducer.this.maxDoc;
private int doc = -1;
private final ES87TSDBDocValuesEncoder decoder = new ES87TSDBDocValuesEncoder();
private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
private long currentBlockIndex = -1;
private final long[] currentBlock = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];

Expand Down Expand Up @@ -1030,7 +1030,7 @@ public long longValue() throws IOException {
);
return new NumericDocValues() {

private final ES87TSDBDocValuesEncoder decoder = new ES87TSDBDocValuesEncoder();
private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
private long currentBlockIndex = -1;
private final long[] currentBlock = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];

Expand Down Expand Up @@ -1092,7 +1092,7 @@ private NumericValues getValues(NumericEntry entry, final long maxOrd) throws IO
final int bitsPerOrd = maxOrd >= 0 ? PackedInts.bitsRequired(maxOrd - 1) : -1;
return new NumericValues() {

private final ES87TSDBDocValuesEncoder decoder = new ES87TSDBDocValuesEncoder();
private final TSDBDocValuesEncoder decoder = new TSDBDocValuesEncoder(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
private long currentBlockIndex = -1;
private final long[] currentBlock = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
* </li>
* </ul>
*
* Notice that encoding and decoding are written in a nested way, for instance {@link ES87TSDBDocValuesEncoder#deltaEncode} calling
* {@link ES87TSDBDocValuesEncoder#removeOffset} and so on. This allows us to easily introduce new encoding schemes or remove existing
* Notice that encoding and decoding are written in a nested way, for instance {@link TSDBDocValuesEncoder#deltaEncode} calling
* {@link TSDBDocValuesEncoder#removeOffset} and so on. This allows us to easily introduce new encoding schemes or remove existing
* (non-effective) encoding schemes in a backward-compatible way.
*
* A token is used as a bitmask to represent which encoding is applied and allows us to detect the applied encoding scheme at decoding time.
Expand All @@ -54,11 +54,13 @@
*
* Of course, decoding follows the opposite order with respect to encoding.
*/
public class ES87TSDBDocValuesEncoder {
public class TSDBDocValuesEncoder {
private final DocValuesForUtil forUtil;
private final int numericBlockSize;

public ES87TSDBDocValuesEncoder() {
this.forUtil = new DocValuesForUtil();
public TSDBDocValuesEncoder(int numericBlockSize) {
this.forUtil = new DocValuesForUtil(numericBlockSize);
this.numericBlockSize = numericBlockSize;
}

/**
Expand All @@ -68,7 +70,7 @@ public ES87TSDBDocValuesEncoder() {
private void deltaEncode(int token, int tokenBits, long[] in, DataOutput out) throws IOException {
int gts = 0;
int lts = 0;
for (int i = 1; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
for (int i = 1; i < numericBlockSize; ++i) {
if (in[i] > in[i - 1]) {
gts++;
} else if (in[i] < in[i - 1]) {
Expand All @@ -79,7 +81,7 @@ private void deltaEncode(int token, int tokenBits, long[] in, DataOutput out) th
final boolean doDeltaCompression = (gts == 0 && lts >= 2) || (lts == 0 && gts >= 2);
long first = 0;
if (doDeltaCompression) {
for (int i = ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE - 1; i > 0; --i) {
for (int i = numericBlockSize - 1; i > 0; --i) {
in[i] -= in[i - 1];
}
// Avoid setting in[0] to 0 in case there is a minimum interval between
Expand Down Expand Up @@ -115,7 +117,7 @@ private void removeOffset(int token, int tokenBits, long[] in, DataOutput out) t
}

if (min != 0) {
for (int i = 0; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
for (int i = 0; i < numericBlockSize; ++i) {
in[i] -= min;
}
token = (token << 1) | 0x01;
Expand Down Expand Up @@ -143,7 +145,7 @@ private void gcdEncode(int token, int tokenBits, long[] in, DataOutput out) thro
}
final boolean doGcdCompression = Long.compareUnsigned(gcd, 1) > 0;
if (doGcdCompression) {
for (int i = 0; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
for (int i = 0; i < numericBlockSize; ++i) {
in[i] /= gcd;
}
token = (token << 1) | 0x01;
Expand Down Expand Up @@ -174,7 +176,7 @@ private void forEncode(int token, int tokenBits, long[] in, DataOutput out) thro
* Encode the given longs using a combination of delta-coding, GCD factorization and bit packing.
*/
void encode(long[] in, DataOutput out) throws IOException {
assert in.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
assert in.length == numericBlockSize;

deltaEncode(0, 0, in, out);
}
Expand All @@ -192,7 +194,7 @@ void encode(long[] in, DataOutput out) throws IOException {
* </ul>
*/
void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOException {
assert in.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
assert in.length == numericBlockSize;
int numRuns = 1;
long firstValue = in[0];
long previousValue = firstValue;
Expand Down Expand Up @@ -259,7 +261,7 @@ void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOExceptio
}

void decodeOrdinals(DataInput in, long[] out, int bitsPerOrd) throws IOException {
assert out.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE : out.length;
assert out.length == numericBlockSize : out.length;

long v1 = in.readVLong();
int encoding = Long.numberOfTrailingZeros(~v1);
Expand All @@ -275,7 +277,7 @@ void decodeOrdinals(DataInput in, long[] out, int bitsPerOrd) throws IOException
Arrays.fill(out, runLen, out.length, v2);
} else if (encoding == 2) {
// bit-packed
DocValuesForUtil.decode(bitsPerOrd, in, out);
forUtil.decode(bitsPerOrd, in, out);
} else if (encoding == 3) {
// cycle encoding
int cycleLength = (int) v1;
Expand All @@ -293,13 +295,13 @@ void decodeOrdinals(DataInput in, long[] out, int bitsPerOrd) throws IOException

/** Decode longs that have been encoded with {@link #encode}. */
void decode(DataInput in, long[] out) throws IOException {
assert out.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE : out.length;
assert out.length == numericBlockSize : out.length;

final int token = in.readVInt();
final int bitsPerValue = token >>> 3;

if (bitsPerValue != 0) {
DocValuesForUtil.decode(bitsPerValue, in, out);
forUtil.decode(bitsPerValue, in, out);
} else {
Arrays.fill(out, 0L);
}
Expand Down Expand Up @@ -330,21 +332,21 @@ void decode(DataInput in, long[] out) throws IOException {
}

// this loop should auto-vectorize
private static void mul(long[] arr, long m) {
for (int i = 0; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
private void mul(long[] arr, long m) {
for (int i = 0; i < numericBlockSize; ++i) {
arr[i] *= m;
}
}

// this loop should auto-vectorize
private static void add(long[] arr, long min) {
for (int i = 0; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
private void add(long[] arr, long min) {
for (int i = 0; i < numericBlockSize; ++i) {
arr[i] += min;
}
}

private static void deltaDecode(long[] arr) {
for (int i = 1; i < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++i) {
private void deltaDecode(long[] arr) {
for (int i = 1; i < numericBlockSize; ++i) {
arr[i] += arr[i - 1];
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,18 @@
import java.util.Random;

public class DocValuesForUtilTests extends LuceneTestCase {
int NUMERIC_BLOCK_SIZE = 1 << 7;

public void testEncodeDecode() throws IOException {
final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000);
final long[] values = new long[iterations * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
final long[] values = new long[iterations * NUMERIC_BLOCK_SIZE];
final int[] bpvs = new int[iterations];

for (int i = 0; i < iterations; ++i) {
final int bpv = TestUtil.nextInt(random(), 1, 64);
bpvs[i] = DocValuesForUtil.roundBits(bpv);
for (int j = 0; j < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++j) {
values[i * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE + j] = bpv == 64
for (int j = 0; j < NUMERIC_BLOCK_SIZE; ++j) {
values[i * NUMERIC_BLOCK_SIZE + j] = bpv == 64
? random().nextLong()
: TestUtil.nextLong(random(), 0, PackedInts.maxValue(bpv));
}
Expand All @@ -53,12 +54,12 @@ public void testEncodeDecode() throws IOException {
{
// encode
IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
final DocValuesForUtil forUtil = new DocValuesForUtil();
final DocValuesForUtil forUtil = new DocValuesForUtil(NUMERIC_BLOCK_SIZE);

for (int i = 0; i < iterations; ++i) {
long[] source = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
for (int j = 0; j < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++j) {
source[j] = values[i * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE + j];
long[] source = new long[NUMERIC_BLOCK_SIZE];
for (int j = 0; j < NUMERIC_BLOCK_SIZE; ++j) {
source[j] = values[i * NUMERIC_BLOCK_SIZE + j];
}
out.writeByte((byte) bpvs[i]);
forUtil.encode(source, bpvs[i], out);
Expand All @@ -70,16 +71,17 @@ public void testEncodeDecode() throws IOException {
{
// decode
IndexInput in = d.openInput("test.bin", IOContext.READONCE);
final long[] restored = new long[ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE];
final DocValuesForUtil forUtil = new DocValuesForUtil(NUMERIC_BLOCK_SIZE);
final long[] restored = new long[NUMERIC_BLOCK_SIZE];
for (int i = 0; i < iterations; ++i) {
final int bitsPerValue = in.readByte();
DocValuesForUtil.decode(bitsPerValue, in, restored);
forUtil.decode(bitsPerValue, in, restored);
assertArrayEquals(
Arrays.toString(restored),
ArrayUtil.copyOfSubArray(
values,
i * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE,
(i + 1) * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE
i * NUMERIC_BLOCK_SIZE,
(i + 1) * NUMERIC_BLOCK_SIZE
),
restored
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@

public class ES87TSDBDocValuesEncoderTests extends LuceneTestCase {

private final ES87TSDBDocValuesEncoder encoder;
private final TSDBDocValuesEncoder encoder;
private final int blockSize = ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;

public ES87TSDBDocValuesEncoderTests() {
this.encoder = new ES87TSDBDocValuesEncoder();
this.encoder = new TSDBDocValuesEncoder(blockSize);
}

public void testRandomValues() throws IOException {
Expand Down

0 comments on commit 631c259

Please sign in to comment.