Skip to content

Commit

Permalink
HSEARCH-5133 Implement Lucene count distinct aggregations
Browse files Browse the repository at this point in the history
  • Loading branch information
fax4ever committed Jul 5, 2024
1 parent 6113c0a commit 49cb72c
Show file tree
Hide file tree
Showing 6 changed files with 273 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright Red Hat Inc. and Hibernate Authors
*/
package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;

import java.util.BitSet;
import java.util.HashSet;

/**
 * Accumulates {@code long} values and reports how many distinct values were seen.
 * <p>
 * Values in the range {@code [0, DENSE_RANGE)} are tracked as bits of a {@link BitSet};
 * every other value (negative, or greater than or equal to {@code DENSE_RANGE})
 * is tracked in a {@link HashSet}.
 * <p>
 * The algorithm to collect distinct elements is inspired by {@code org.apache.lucene.facet.LongValueFacetCounts}
 * of the <a href="https://lucene.apache.org/">Apache Lucene project</a>.
 */
public class CountDistinct {

	/**
	 * Exclusive upper bound of the values tracked in the bit set.
	 * A fixed constant (rather than {@code BitSet#size()}, which reflects the allocated storage)
	 * guarantees that every instance splits values the same way, which {@link #merge(CountDistinct)} relies on.
	 */
	private static final int DENSE_RANGE = 1024;

	private final BitSet counts = new BitSet( DENSE_RANGE );
	private final HashSet<Long> hashCounts = new HashSet<>();

	/**
	 * Records one occurrence of the given value; recording the same value again has no further effect.
	 *
	 * @param value The value to record.
	 */
	public void increment(long value) {
		if ( value >= 0 && value < DENSE_RANGE ) {
			// The range check above makes the narrowing cast lossless.
			counts.set( (int) value );
		}
		else {
			hashCounts.add( value );
		}
	}

	/**
	 * @return The number of distinct values recorded so far, including values merged from other instances.
	 */
	public long result() {
		// Widen before adding so that the sum of the two int sizes cannot overflow.
		return (long) counts.cardinality() + hashCounts.size();
	}

	/**
	 * Adds all the values recorded by {@code other} to this instance; {@code other} is left unchanged.
	 *
	 * @param other The instance whose values should be added to this one.
	 */
	public void merge(CountDistinct other) {
		counts.or( other.counts );
		hashCounts.addAll( other.hashCounts );
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright Red Hat Inc. and Hibernate Authors
*/
package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;

import java.io.IOException;

import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.LongMultiValues;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.LongMultiValuesSource;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;

/**
 * A {@link Collector} that feeds the values read from a {@link LongMultiValuesSource}
 * into a {@link CountDistinct} counter, for each document passed to its leaf collectors.
 */
public class CountDistinctCollector implements Collector {

	private final CountDistinct counter = new CountDistinct();
	private final LongMultiValuesSource valueSource;

	/**
	 * @param valueSource The source the per-leaf values are obtained from.
	 */
	public CountDistinctCollector(LongMultiValuesSource valueSource) {
		this.valueSource = valueSource;
	}

	@Override
	public ScoreMode scoreMode() {
		return ScoreMode.COMPLETE_NO_SCORES;
	}

	@Override
	public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
		LongMultiValues leafValues = valueSource.getValues( context );
		return new CountDistinctLeafCollector( leafValues );
	}

	/**
	 * Adds everything counted by the given collector to this collector's counter.
	 *
	 * @param collector The collector whose counter is merged into this one.
	 */
	public void merge(CountDistinctCollector collector) {
		this.counter.merge( collector.counter );
	}

	/**
	 * @return The current result of this collector's counter.
	 */
	public long count() {
		return this.counter.result();
	}

	/**
	 * Leaf-level collector; it writes into the counter of the enclosing {@link CountDistinctCollector}.
	 */
	public class CountDistinctLeafCollector implements LeafCollector {
		private final LongMultiValues values;

		public CountDistinctLeafCollector(LongMultiValues values) {
			this.values = values;
		}

		@Override
		public void setScorer(Scorable scorer) {
			// no-op by default
		}

		@Override
		public void collect(int doc) throws IOException {
			if ( !values.advanceExact( doc ) ) {
				// Nothing to read for this document.
				return;
			}
			while ( values.hasNextValue() ) {
				counter.increment( values.nextValue() );
			}
		}
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright Red Hat Inc. and Hibernate Authors
*/
package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;

import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorExecutionContext;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorFactory;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;

/**
 * A {@link CollectorFactory} that creates {@link CountDistinctCollectorManager} instances
 * for a given values source, and exposes the {@link CollectorKey} it was created with.
 */
public class CountDistinctCollectorFactory
		implements CollectorFactory<CountDistinctCollector, Long, CountDistinctCollectorManager> {

	private final CollectorKey<CountDistinctCollector, Long> key;
	private final JoiningLongMultiValuesSource source;

	/**
	 * @param source The values source handed to every collector manager created by this factory.
	 * @param key The key returned by {@link #getCollectorKey()}.
	 */
	public CountDistinctCollectorFactory(JoiningLongMultiValuesSource source, CollectorKey<CountDistinctCollector, Long> key) {
		this.key = key;
		this.source = source;
	}

	@Override
	public CollectorKey<CountDistinctCollector, Long> getCollectorKey() {
		return this.key;
	}

	@Override
	public CountDistinctCollectorManager createCollectorManager(CollectorExecutionContext context) {
		// The execution context is not needed: the manager only depends on the values source.
		return new CountDistinctCollectorManager( this.source );
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright Red Hat Inc. and Hibernate Authors
*/
package org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;

import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;

import org.apache.lucene.search.CollectorManager;

/**
 * A {@link CollectorManager} that creates {@link CountDistinctCollector}s over a single values source
 * and reduces them to one distinct count.
 */
public class CountDistinctCollectorManager implements CollectorManager<CountDistinctCollector, Long> {

	private final JoiningLongMultiValuesSource source;

	/**
	 * @param source The values source passed to every collector created by this manager.
	 */
	public CountDistinctCollectorManager(JoiningLongMultiValuesSource source) {
		this.source = source;
	}

	@Override
	public CountDistinctCollector newCollector() {
		return new CountDistinctCollector( this.source );
	}

	/**
	 * Merges all the given collectors into the last one and returns its count.
	 *
	 * @param collectors The collectors to reduce.
	 * @return {@code 0} when there is no collector, otherwise the count of the merged collector.
	 */
	@Override
	public Long reduce(Collection<CountDistinctCollector> collectors) throws IOException {
		if ( collectors.isEmpty() ) {
			return 0L;
		}

		// Work on a copy so that the caller's collection is left untouched.
		LinkedList<CountDistinctCollector> remaining = new LinkedList<>( collectors );
		CountDistinctCollector target = remaining.removeLast();
		remaining.forEach( target::merge );
		return target.count();
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright Red Hat Inc. and Hibernate Authors
*/
package org.hibernate.search.backend.lucene.types.aggregation.impl;

import java.util.Set;

import org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl.CountDistinctCollector;
import org.hibernate.search.backend.lucene.lowlevel.aggregation.collector.impl.CountDistinctCollectorFactory;
import org.hibernate.search.backend.lucene.lowlevel.collector.impl.CollectorKey;
import org.hibernate.search.backend.lucene.lowlevel.docvalues.impl.JoiningLongMultiValuesSource;
import org.hibernate.search.backend.lucene.search.aggregation.impl.AggregationExtractContext;
import org.hibernate.search.backend.lucene.search.aggregation.impl.AggregationRequestContext;
import org.hibernate.search.backend.lucene.search.common.impl.AbstractLuceneCodecAwareSearchQueryElementFactory;
import org.hibernate.search.backend.lucene.search.common.impl.LuceneSearchIndexScope;
import org.hibernate.search.backend.lucene.search.common.impl.LuceneSearchIndexValueFieldContext;
import org.hibernate.search.backend.lucene.types.codec.impl.AbstractLuceneNumericFieldCodec;
import org.hibernate.search.engine.search.aggregation.spi.FieldMetricAggregationBuilder;

/**
 * A metric aggregation on a numeric field that produces a {@code Long} result.
 * <p>
 * The only operation acted upon in {@link #request(AggregationRequestContext)} is {@code "cardinality"},
 * which is backed by a {@link CountDistinctCollector}.
 */
public class LuceneMetricNumericLongAggregation extends AbstractLuceneNestableAggregation<Long> {

	private final Set<String> indexNames;
	private final String absoluteFieldPath;
	private final String operation;
	private final CollectorKey<CountDistinctCollector, Long> collectorKey;

	LuceneMetricNumericLongAggregation(Builder builder) {
		super( builder );
		this.indexNames = builder.scope.hibernateSearchIndexNames();
		this.absoluteFieldPath = builder.field.absolutePath();
		this.operation = builder.operation;
		// A key created per aggregation instance; the same key is used to request and to extract.
		this.collectorKey = CollectorKey.create();
	}

	@Override
	public Set<String> indexNames() {
		return this.indexNames;
	}

	@Override
	public Extractor<Long> request(AggregationRequestContext context) {
		JoiningLongMultiValuesSource valuesSource = JoiningLongMultiValuesSource.fromField(
				absoluteFieldPath, createNestedDocsProvider( context )
		);
		// NOTE(review): for any other operation no collector is requested,
		// yet the returned extractor still reads from collectorKey.
		boolean countDistinct = "cardinality".equals( operation );
		if ( countDistinct ) {
			CountDistinctCollectorFactory collectorFactory = new CountDistinctCollectorFactory( valuesSource, collectorKey );
			context.requireCollector( collectorFactory );
		}
		return new LuceneNumericMetricLongAggregationExtraction();
	}

	/**
	 * Extractor that returns whatever the extraction context holds for this aggregation's collector key.
	 */
	private class LuceneNumericMetricLongAggregationExtraction implements Extractor<Long> {
		@Override
		public Long extract(AggregationExtractContext context) {
			Long collected = context.getFacets( collectorKey );
			return collected;
		}
	}

	/**
	 * Factory that creates aggregation builders bound to a fixed operation name.
	 *
	 * @param <F> The type of field values.
	 */
	public static class Factory<F>
			extends AbstractLuceneCodecAwareSearchQueryElementFactory<FieldMetricAggregationBuilder<Long>,
					F,
					AbstractLuceneNumericFieldCodec<F, ?>> {

		private final String operation;

		public Factory(AbstractLuceneNumericFieldCodec<F, ?> codec, String operation) {
			super( codec );
			this.operation = operation;
		}

		@Override
		public FieldMetricAggregationBuilder<Long> create(LuceneSearchIndexScope<?> scope,
				LuceneSearchIndexValueFieldContext<F> field) {
			Builder aggregationBuilder = new Builder( scope, field, this.operation );
			return aggregationBuilder;
		}
	}

	/**
	 * Builder carrying the scope, the field and the operation name to the aggregation constructor.
	 */
	private static class Builder extends AbstractBuilder<Long> implements FieldMetricAggregationBuilder<Long> {
		private final String operation;

		public Builder(LuceneSearchIndexScope<?> scope,
				LuceneSearchIndexValueFieldContext<?> field,
				String operation) {
			super( scope, field );
			this.operation = operation;
		}

		@Override
		public LuceneMetricNumericLongAggregation build() {
			return new LuceneMetricNumericLongAggregation( this );
		}
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.hibernate.search.backend.lucene.search.predicate.impl.LucenePredicateTypeKeys;
import org.hibernate.search.backend.lucene.search.projection.impl.LuceneFieldProjection;
import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneMetricNumericFieldAggregation;
import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneMetricNumericLongAggregation;
import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneNumericRangeAggregation;
import org.hibernate.search.backend.lucene.types.aggregation.impl.LuceneNumericTermsAggregation;
import org.hibernate.search.backend.lucene.types.codec.impl.AbstractLuceneNumericFieldCodec;
Expand Down Expand Up @@ -88,6 +89,8 @@ public LuceneIndexValueFieldType<F> toIndexFieldType() {
builder.queryElementFactory( AggregationTypeKeys.RANGE, new LuceneNumericRangeAggregation.Factory<>( codec ) );
builder.queryElementFactory( AggregationTypeKeys.SUM,
new LuceneMetricNumericFieldAggregation.Factory<>( codec, "sum" ) );
builder.queryElementFactory( AggregationTypeKeys.COUNT_DISTINCT,
new LuceneMetricNumericLongAggregation.Factory<>( codec, "cardinality" ) );
}

return builder.build();
Expand Down

0 comments on commit 49cb72c

Please sign in to comment.