From 49cb72cfde3f9bed5b1dc0232a41ff6317efdb7a Mon Sep 17 00:00:00 2001 From: Fabio Massimo Ercoli Date: Fri, 5 Jul 2024 13:36:07 +0200 Subject: [PATCH] HSEARCH-5133 Implement Lucene count distinct aggregations --- .../collector/impl/CountDistinct.java | 37 ++++++++ .../impl/CountDistinctCollector.java | 67 +++++++++++++ .../impl/CountDistinctCollectorFactory.java | 32 +++++++ .../impl/CountDistinctCollectorManager.java | 41 ++++++++ .../LuceneMetricNumericLongAggregation.java | 93 +++++++++++++++++++ ...uceneNumericIndexFieldTypeOptionsStep.java | 3 + 6 files changed, 273 insertions(+) create mode 100644 backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinct.java create mode 100644 backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollector.java create mode 100644 backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorFactory.java create mode 100644 backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorManager.java create mode 100644 backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/aggregation/impl/LuceneMetricNumericLongAggregation.java diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinct.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinct.java new file mode 100644 index 00000000000..19aaa331197 --- /dev/null +++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinct.java @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright Red Hat Inc. 
and Hibernate Authors + */ +package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl; + +import java.util.BitSet; +import java.util.HashSet; + +/** + *

+ * The algorithm to collect distinct elements is inspired by {@code org.apache.lucene.facet.LongValueFacetCounts}
+ * of the Apache Lucene project.
+ * <p>
+ * Values in the range {@code [0, DENSE_LIMIT)} are recorded in a {@link BitSet}; every other value
+ * (negative, or greater than or equal to {@code DENSE_LIMIT}) is recorded in a {@link HashSet}.
+ * Each value therefore lands in exactly one of the two structures, and the distinct count is
+ * the sum of their sizes.
+ */
+public class CountDistinct {
+
+    /**
+     * Exclusive upper bound of the values tracked by the bit set.
+     * Using a fixed constant rather than {@code BitSet#size()} guarantees that two instances always
+     * route a given value to the same structure, which {@link #merge(CountDistinct)} relies on.
+     */
+    private static final int DENSE_LIMIT = 1024;
+
+    private final BitSet counts = new BitSet( DENSE_LIMIT );
+    private final HashSet<Long> hashCounts = new HashSet<>();
+
+    /**
+     * Records {@code value}; recording the same value again does not change the result.
+     *
+     * @param value the value to record
+     */
+    public void increment(long value) {
+        if ( value >= 0 && value < DENSE_LIMIT ) {
+            // Safe narrowing: the value is known to be in [0, DENSE_LIMIT).
+            counts.set( (int) value );
+        }
+        else {
+            hashCounts.add( value );
+        }
+    }
+
+    /**
+     * @return the number of distinct values recorded so far
+     */
+    public long result() {
+        // Widen before adding so the sum of the two int sizes cannot overflow.
+        return (long) counts.cardinality() + hashCounts.size();
+    }
+
+    /**
+     * Adds every value recorded by {@code other} to this instance.
+     *
+     * @param other the instance whose recorded values are merged into this one
+     */
+    public void merge(CountDistinct other) {
+        counts.or( other.counts );
+        hashCounts.addAll( other.hashCounts );
+    }
+}
diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollector.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollector.java
new file mode 100644
index 00000000000..8f0cef514bf
--- /dev/null
+++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollector.java
@@ -0,0 +1,67 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright Red Hat Inc.
and Hibernate Authors
+ */
+package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;
+
+import java.io.IOException;
+
+import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.LongMultiValues;
+import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.LongMultiValuesSource;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.search.ScoreMode;
+
+/**
+ * A {@link Collector} that records, in a {@link CountDistinct}, every value that the given
+ * {@link LongMultiValuesSource} yields for the documents it collects.
+ */
+public class CountDistinctCollector implements Collector {
+
+    private final LongMultiValuesSource valueSource;
+    private final CountDistinct counter = new CountDistinct();
+
+    /**
+     * @param valueSource the source the values of each collected document are read from
+     */
+    public CountDistinctCollector(LongMultiValuesSource valueSource) {
+        this.valueSource = valueSource;
+    }
+
+    /**
+     * @return the number of distinct values recorded by this collector
+     */
+    public long count() {
+        return counter.result();
+    }
+
+    /**
+     * Adds the values recorded by {@code collector} to the ones recorded by this collector.
+     *
+     * @param collector the collector whose recorded values are merged into this one
+     */
+    public void merge(CountDistinctCollector collector) {
+        CountDistinct otherCounter = collector.counter;
+        counter.merge( otherCounter );
+    }
+
+    @Override
+    public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+        LongMultiValues leafValues = valueSource.getValues( context );
+        return new CountDistinctLeafCollector( leafValues );
+    }
+
+    @Override
+    public ScoreMode scoreMode() {
+        return ScoreMode.COMPLETE_NO_SCORES;
+    }
+
+    /**
+     * Leaf-level collector; it writes into the counter of the enclosing {@link CountDistinctCollector}.
+     */
+    public class CountDistinctLeafCollector implements LeafCollector {
+        private final LongMultiValues values;
+
+        public CountDistinctLeafCollector(LongMultiValues values) {
+            this.values = values;
+        }
+
+        @Override
+        public void collect(int doc) throws IOException {
+            // Nothing to record when the values cannot be positioned on this document.
+            if ( !values.advanceExact( doc ) ) {
+                return;
+            }
+            while ( values.hasNextValue() ) {
+                counter.increment( values.nextValue() );
+            }
+        }
+
+        @Override
+        public void setScorer(Scorable scorer) {
+            // no-op: the scorer is never used by this collector
+        }
+    }
+}
diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorFactory.java
b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorFactory.java
new file mode 100644
index 00000000000..2aeceb69745
--- /dev/null
+++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorFactory.java
@@ -0,0 +1,32 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright Red Hat Inc. and Hibernate Authors
+ */
+package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;
+
+import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorExecutionContext;
+import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorFactory;
+import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
+import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;
+
+/**
+ * Creates {@link CountDistinctCollectorManager} instances that all read from the same
+ * {@link JoiningLongMultiValuesSource}, and exposes the {@link CollectorKey} it was constructed with.
+ * <p>
+ * NOTE(review): {@code CollectorFactory} and {@code CollectorKey} appear here without type arguments;
+ * confirm against the real source file whether they are parameterized there.
+ */
+public class CountDistinctCollectorFactory
+        implements CollectorFactory {
+
+    private final JoiningLongMultiValuesSource source;
+    private final CollectorKey key;
+
+    /**
+     * @param source the values source handed to every collector manager created by this factory
+     * @param key the key returned by {@link #getCollectorKey()}
+     */
+    public CountDistinctCollectorFactory(JoiningLongMultiValuesSource source, CollectorKey key) {
+        this.source = source;
+        this.key = key;
+    }
+
+    /**
+     * Returns a new manager on each call; {@code context} is not used.
+     */
+    @Override
+    public CountDistinctCollectorManager createCollectorManager(CollectorExecutionContext context) {
+        return new CountDistinctCollectorManager( source );
+    }
+
+    @Override
+    public CollectorKey getCollectorKey() {
+        return key;
+    }
+}
diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorManager.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorManager.java
new file mode 100644
index 00000000000..df6c4c90049
--- /dev/null
+++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/lowlevel/aggregation/collector/impl/CountDistinctCollectorManager.java
@@ -0,0 +1,41 @@
+/*
+ *
SPDX-License-Identifier: Apache-2.0
+ * Copyright Red Hat Inc. and Hibernate Authors
+ */
+package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;
+
+import org.apache.lucene.search.CollectorManager;
+
+/**
+ * A {@link CollectorManager} that creates {@link CountDistinctCollector}s over a single
+ * {@link JoiningLongMultiValuesSource} and reduces them to the overall number of distinct values.
+ */
+public class CountDistinctCollectorManager implements CollectorManager<CountDistinctCollector, Long> {
+
+    private final JoiningLongMultiValuesSource source;
+
+    /**
+     * @param source the values source handed to every collector created by this manager
+     */
+    public CountDistinctCollectorManager(JoiningLongMultiValuesSource source) {
+        this.source = source;
+    }
+
+    @Override
+    public CountDistinctCollector newCollector() {
+        return new CountDistinctCollector( source );
+    }
+
+    /**
+     * Merges the values recorded by all the given collectors and counts the distinct ones.
+     *
+     * @param collectors the collectors to combine; may be empty
+     * @return the number of distinct values across all collectors, {@code 0} when there are none
+     */
+    @Override
+    public Long reduce(Collection<CountDistinctCollector> collectors) throws IOException {
+        // Fold into a fresh accumulator: no copy of the input is needed, none of the
+        // caller's collectors is modified, and an empty input naturally yields 0.
+        CountDistinctCollector merged = new CountDistinctCollector( source );
+        for ( CountDistinctCollector collector : collectors ) {
+            merged.merge( collector );
+        }
+        return merged.count();
+    }
+}
diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/aggregation/impl/LuceneMetricNumericLongAggregation.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/aggregation/impl/LuceneMetricNumericLongAggregation.java
new file mode 100644
index 00000000000..1651e4bfcac
--- /dev/null
+++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/aggregation/impl/LuceneMetricNumericLongAggregation.java
@@ -0,0 +1,93 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright Red Hat Inc.
and Hibernate Authors
+ */
+package org.hibernate.search.backend.lucene.types.aggregation.impl;
+
+import java.util.Set;
+
+import org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl.CountDistinctCollector;
+import org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl.CountDistinctCollectorFactory;
+import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
+import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;
+import org.hibernate.search.backend.lucene.search.aggregation.impl.AggregationExtractContext;
+import org.hibernate.search.backend.lucene.search.aggregation.impl.AggregationRequestContext;
+import org.hibernate.search.backend.lucene.search.common.impl.AbstractLuceneCodecAwareSearchQueryElementFactory;
+import org.hibernate.search.backend.lucene.search.common.impl.LuceneSearchIndexScope;
+import org.hibernate.search.backend.lucene.search.common.impl.LuceneSearchIndexValueFieldContext;
+import org.hibernate.search.backend.lucene.types.codec.impl.AbstractLuceneNumericFieldCodec;
+import org.hibernate.search.engine.search.aggregation.spi.FieldMetricAggregationBuilder;
+
+/**
+ * Metric aggregation on a numeric field whose result is a {@code Long}.
+ * <p>
+ * The only operation handled explicitly is {@code "cardinality"}, for which a
+ * {@link CountDistinctCollectorFactory} is registered when the aggregation is requested.
+ * <p>
+ * NOTE(review): several declarations in this class appear without type arguments (and the
+ * {@code Factory} header below uses {@code F} without declaring it); confirm against the real
+ * source file, as the type arguments may have been lost in this copy.
+ */
+public class LuceneMetricNumericLongAggregation extends AbstractLuceneNestableAggregation {
+
+    private final Set indexNames;
+    private final String absoluteFieldPath;
+    private final String operation;
+    private final CollectorKey collectorKey;
+
+    /**
+     * Copies the index names, the absolute field path and the operation from the builder,
+     * and creates a new collector key for this aggregation instance.
+     */
+    LuceneMetricNumericLongAggregation(Builder builder) {
+        super( builder );
+        this.indexNames = builder.scope.hibernateSearchIndexNames();
+        this.absoluteFieldPath = builder.field.absolutePath();
+        this.operation = builder.operation;
+        this.collectorKey = CollectorKey.create();
+    }
+
+    /**
+     * Builds the values source for the field and, for the {@code "cardinality"} operation,
+     * requires a count-distinct collector under this aggregation's collector key.
+     * An extractor is returned in every case.
+     */
+    @Override
+    public Extractor request(AggregationRequestContext context) {
+        JoiningLongMultiValuesSource source = JoiningLongMultiValuesSource.fromField(
+                absoluteFieldPath, createNestedDocsProvider( context )
+        );
+        // NOTE(review): for any other operation no collector is required, yet the extractor
+        // below still looks up collectorKey; what getFacets returns then is not visible here.
+        if ( "cardinality".equals( operation ) ) {
+            context.requireCollector( new CountDistinctCollectorFactory( source, collectorKey ) );
+        }
+        return new LuceneNumericMetricLongAggregationExtraction();
+    }
+
+    @Override
+    public Set indexNames() {
+        return indexNames;
+    }
+
+    /**
+     * Extractor that returns whatever the extract context holds for this aggregation's collector key.
+     */
+    private class LuceneNumericMetricLongAggregationExtraction implements Extractor {
+        @Override
+        public Long extract(AggregationExtractContext context) {
+            return context.getFacets( collectorKey );
+        }
+    }
+
+    /**
+     * Query element factory that creates {@link Builder}s bound to a fixed operation name.
+     */
+    public static class Factory
+            extends AbstractLuceneCodecAwareSearchQueryElementFactory,
+            F,
+            AbstractLuceneNumericFieldCodec> {
+
+        private final String operation;
+
+        /**
+         * @param codec the field codec, passed to the superclass
+         * @param operation the operation name given to every builder this factory creates
+         */
+        public Factory(AbstractLuceneNumericFieldCodec codec, String operation) {
+            super( codec );
+            this.operation = operation;
+        }
+
+        @Override
+        public FieldMetricAggregationBuilder create(LuceneSearchIndexScope scope,
+                LuceneSearchIndexValueFieldContext field) {
+            return new Builder( scope, field, operation );
+        }
+    }
+
+    /**
+     * Builder that carries the operation name into the aggregation it builds.
+     */
+    private static class Builder extends AbstractBuilder implements FieldMetricAggregationBuilder {
+        private final String operation;
+
+        public Builder(LuceneSearchIndexScope scope,
+                LuceneSearchIndexValueFieldContext field,
+                String operation) {
+            super( scope, field );
+            this.operation = operation;
+        }
+
+        @Override
+        public LuceneMetricNumericLongAggregation build() {
+            return new LuceneMetricNumericLongAggregation( this );
+        }
+    }
+}
diff --git a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/dsl/impl/AbstractLuceneNumericIndexFieldTypeOptionsStep.java b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/dsl/impl/AbstractLuceneNumericIndexFieldTypeOptionsStep.java
index f2fdd9f2cb8..4e6f11dec51 100644
--- a/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/dsl/impl/AbstractLuceneNumericIndexFieldTypeOptionsStep.java
+++ b/backend/lucene/src/main/java/org/hibernate/search/backend/lucene/types/dsl/impl/AbstractLuceneNumericIndexFieldTypeOptionsStep.java
@@ -7,6 +7,7 @@ import
org.hibernate.search.backend.lucene.search.predicate.impl.LucenePredicateTypeKeys; import org.hibernate.search.backend.lucene.search.projection.impl.LuceneFieldProjection; import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneMetricNumericFieldAggregation; +import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneMetricNumericLongAggregation; import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneNumericRangeAggregation; import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneNumericTermsAggregation; import org.hibernate.search.backend.lucene.types.codec.impl.AbstractLuceneNumericFieldCodec; @@ -88,6 +89,8 @@ public LuceneIndexValueFieldType toIndexFieldType() { builder.queryElementFactory( AggregationTypeKeys.RANGE, new LuceneNumericRangeAggregation.Factory<>( codec ) ); builder.queryElementFactory( AggregationTypeKeys.SUM, new LuceneMetricNumericFieldAggregation.Factory<>( codec, "sum" ) ); + builder.queryElementFactory( AggregationTypeKeys.COUNT_DISTINCT, + new LuceneMetricNumericLongAggregation.Factory<>( codec, "cardinality" ) ); } return builder.build();