From a83a0edcab2d9d1c5f43b3d0dba4d4ba66f30dbf Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 31 Jan 2019 16:55:33 -0500 Subject: [PATCH] Only create final MatrixStatsResults on final reduction MatrixStatsResults is the "final" result object, and runs an additional computation in it's ctor to calculate covariance, etc. This means it should only run on the final reduction instead of on every reduce. --- .../matrix/stats/InternalMatrixStats.java | 11 +++-- .../stats/MatrixStatsAggregatorTests.java | 44 ++++++++++++++++++- .../matrix/stats/MultiPassStats.java | 24 ++++++++++ 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/modules/aggs-matrix-stats/src/main/java/org/elasticsearch/search/aggregations/matrix/stats/InternalMatrixStats.java b/modules/aggs-matrix-stats/src/main/java/org/elasticsearch/search/aggregations/matrix/stats/InternalMatrixStats.java index 8293844f4b6fa..7dcdff426711b 100644 --- a/modules/aggs-matrix-stats/src/main/java/org/elasticsearch/search/aggregations/matrix/stats/InternalMatrixStats.java +++ b/modules/aggs-matrix-stats/src/main/java/org/elasticsearch/search/aggregations/matrix/stats/InternalMatrixStats.java @@ -244,12 +244,15 @@ public InternalAggregation doReduce(List aggregations, Redu } RunningStats runningStats = new RunningStats(); - for (int i=0; i < aggs.size(); ++i) { - runningStats.merge(((InternalMatrixStats) aggs.get(i)).stats); + for (InternalAggregation agg : aggs) { + runningStats.merge(((InternalMatrixStats) agg).stats); } - MatrixStatsResults results = new MatrixStatsResults(runningStats); - return new InternalMatrixStats(name, results.getDocCount(), runningStats, results, pipelineAggregators(), getMetaData()); + if (reduceContext.isFinalReduce()) { + MatrixStatsResults results = new MatrixStatsResults(runningStats); + return new InternalMatrixStats(name, results.getDocCount(), runningStats, results, pipelineAggregators(), getMetaData()); + } + return new InternalMatrixStats(name, runningStats.docCount, runningStats, null, pipelineAggregators(), getMetaData()); } @Override diff --git a/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MatrixStatsAggregatorTests.java b/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MatrixStatsAggregatorTests.java index 0512f3d5db3b6..44082b16defb6 100644 --- a/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MatrixStatsAggregatorTests.java +++ b/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MatrixStatsAggregatorTests.java @@ -58,7 +58,6 @@ public void testNoData() throws Exception { } } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/37587") public void testTwoFields() throws Exception { String fieldA = "a"; MappedFieldType ftA = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); @@ -89,8 +88,49 @@ public void testTwoFields() throws Exception { IndexSearcher searcher = new IndexSearcher(reader); MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg") .fields(Arrays.asList(fieldA, fieldB)); - InternalMatrixStats stats = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, ftA, ftB); + InternalMatrixStats stats = search(searcher, new MatchAllDocsQuery(), aggBuilder, ftA, ftB); + // Since `search` doesn't do any reduction, and the InternalMatrixStats object will have a null `MatrixStatsResults` + // object. That is created during the final reduction, which also does a final round of computations + // So we have to create a MatrixStatsResults object here manually so that the final `compute()` is called multiPassStats.assertNearlyEqual(new MatrixStatsResults(stats.getStats())); + } + } + } + + public void testTwoFieldsReduce() throws Exception { + String fieldA = "a"; + MappedFieldType ftA = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); + ftA.setName(fieldA); + String fieldB = "b"; + MappedFieldType ftB = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); + ftB.setName(fieldB); + + try (Directory directory = newDirectory(); + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + + int numDocs = scaledRandomIntBetween(8192, 16384); + Double[] fieldAValues = new Double[numDocs]; + Double[] fieldBValues = new Double[numDocs]; + for (int docId = 0; docId < numDocs; docId++) { + Document document = new Document(); + fieldAValues[docId] = randomDouble(); + document.add(new SortedNumericDocValuesField(fieldA, NumericUtils.doubleToSortableLong(fieldAValues[docId]))); + + fieldBValues[docId] = randomDouble(); + document.add(new SortedNumericDocValuesField(fieldB, NumericUtils.doubleToSortableLong(fieldBValues[docId]))); + indexWriter.addDocument(document); + } + + MultiPassStats multiPassStats = new MultiPassStats(fieldA, fieldB); + multiPassStats.computeStats(Arrays.asList(fieldAValues), Arrays.asList(fieldBValues)); + try (IndexReader reader = indexWriter.getReader()) { + IndexSearcher searcher = new IndexSearcher(reader); + MatrixStatsAggregationBuilder aggBuilder = new MatrixStatsAggregationBuilder("my_agg") + .fields(Arrays.asList(fieldA, fieldB)); + InternalMatrixStats stats = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, ftA, ftB); + // Unlike testTwoFields, `searchAndReduce` will execute reductions so the `MatrixStatsResults` object + // will be populated and fully computed. We should use that value directly to test against + multiPassStats.assertNearlyEqual(stats); assertTrue(MatrixAggregationInspectionHelper.hasValue(stats)); } } diff --git a/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MultiPassStats.java b/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MultiPassStats.java index b5a348f45eb54..cd4ee3ee849ee 100644 --- a/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MultiPassStats.java +++ b/modules/aggs-matrix-stats/src/test/java/org/elasticsearch/search/aggregations/matrix/stats/MultiPassStats.java @@ -136,6 +136,30 @@ void assertNearlyEqual(MatrixStatsResults stats) { assertTrue(nearlyEqual(correlations.get(fieldBKey).get(fieldAKey), stats.getCorrelation(fieldBKey, fieldAKey), 1e-7)); } + void assertNearlyEqual(InternalMatrixStats stats) { + assertEquals(count, stats.getDocCount()); + assertEquals(count, stats.getFieldCount(fieldAKey)); + assertEquals(count, stats.getFieldCount(fieldBKey)); + // means + assertTrue(nearlyEqual(means.get(fieldAKey), stats.getMean(fieldAKey), 1e-7)); + assertTrue(nearlyEqual(means.get(fieldBKey), stats.getMean(fieldBKey), 1e-7)); + // variances + assertTrue(nearlyEqual(variances.get(fieldAKey), stats.getVariance(fieldAKey), 1e-7)); + assertTrue(nearlyEqual(variances.get(fieldBKey), stats.getVariance(fieldBKey), 1e-7)); + // skewness (multi-pass is more susceptible to round-off error so we need to slightly relax the tolerance) + assertTrue(nearlyEqual(skewness.get(fieldAKey), stats.getSkewness(fieldAKey), 1e-4)); + assertTrue(nearlyEqual(skewness.get(fieldBKey), stats.getSkewness(fieldBKey), 1e-4)); + // kurtosis (multi-pass is more susceptible to round-off error so we need to slightly relax the tolerance) + assertTrue(nearlyEqual(kurtosis.get(fieldAKey), stats.getKurtosis(fieldAKey), 1e-4)); + assertTrue(nearlyEqual(kurtosis.get(fieldBKey), stats.getKurtosis(fieldBKey), 1e-4)); + // covariances + assertTrue(nearlyEqual(covariances.get(fieldAKey).get(fieldBKey),stats.getCovariance(fieldAKey, fieldBKey), 1e-7)); + assertTrue(nearlyEqual(covariances.get(fieldBKey).get(fieldAKey),stats.getCovariance(fieldBKey, fieldAKey), 1e-7)); + // correlation + assertTrue(nearlyEqual(correlations.get(fieldAKey).get(fieldBKey), stats.getCorrelation(fieldAKey, fieldBKey), 1e-7)); + assertTrue(nearlyEqual(correlations.get(fieldBKey).get(fieldAKey), stats.getCorrelation(fieldBKey, fieldAKey), 1e-7)); + } + private static boolean nearlyEqual(double a, double b, double epsilon) { final double absA = Math.abs(a); final double absB = Math.abs(b);