Implement BloomFilter query rewrite (without pushdown optimization) #248
Changes from all commits: 6f2aceb, cf3ff2a, d7565f5, 1cc7cf1, c22c7d1, ce21393
@@ -0,0 +1,125 @@

/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.flint.spark.skipping.bloomfilter

import java.io.ByteArrayInputStream

import org.opensearch.flint.core.field.bloomfilter.classic.ClassicBloomFilter

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{BinaryComparison, Expression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql.types._

/**
 * Bloom filter function that returns the membership check result for values of `valueExpression`
 * in the bloom filter represented by `bloomFilterExpression`.
 *
 * @param bloomFilterExpression
 *   binary expression that represents bloom filter data
 * @param valueExpression
 *   Long value expression to be tested
 */
case class BloomFilterMightContain(bloomFilterExpression: Expression, valueExpression: Expression)
    extends BinaryComparison {

  override def nullable: Boolean = true

  override def left: Expression = bloomFilterExpression

  override def right: Expression = valueExpression

  override def prettyName: String = "bloom_filter_might_contain"

  override def dataType: DataType = BooleanType

  override def symbol: String = "BLOOM_FILTER_MIGHT_CONTAIN"

  override def checkInputDataTypes(): TypeCheckResult = {
    (left.dataType, right.dataType) match {
      case (BinaryType, NullType) | (NullType, LongType) | (NullType, NullType) |
          (BinaryType, LongType) =>
        TypeCheckResult.TypeCheckSuccess
      case _ =>
        TypeCheckResult.TypeCheckFailure(s"""
             | Input to function $prettyName should be Binary expression followed by a Long value,
             | but it's [${left.dataType.catalogString}, ${right.dataType.catalogString}].
             | """.stripMargin)
    }
  }

  override protected def withNewChildrenInternal(
      newBloomFilterExpression: Expression,
      newValueExpression: Expression): BloomFilterMightContain =
    copy(bloomFilterExpression = newBloomFilterExpression, valueExpression = newValueExpression)

  override def eval(input: InternalRow): Any = {
    val value = valueExpression.eval(input)
    if (value == null) {
      null
[Review comment] Why does the eval result become null here? Should bloomFilter.test(null) return false?

[Reply] Following Spark SQL NULL semantics, NULL is ignored in BloomFilterAgg. So NULL is returned for …

[Reply] As I understand, what's discussed here will happen only if … I did some test and found out that …
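To make the semantics discussed above concrete, here is a minimal sketch (not part of this diff) that exercises the NULL path directly; it relies only on the BloomFilterMightContain class shown above:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.{BinaryType, LongType}

// With a NULL test value, eval() short-circuits before the bloom filter bytes
// are ever read, so the result is NULL (unknown) rather than false, in line
// with SQL three-valued logic.
val expr = BloomFilterMightContain(
  Literal.create(null, BinaryType),
  Literal.create(null, LongType))
assert(expr.eval(InternalRow.empty) == null)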
    } else {
      val bytes = bloomFilterExpression.eval(input).asInstanceOf[Array[Byte]]
      val bloomFilter = ClassicBloomFilter.readFrom(new ByteArrayInputStream(bytes))
      bloomFilter.mightContain(value.asInstanceOf[Long])
    }
  }

  /**
   * Generate expression code for Spark codegen execution. Sample result code:
   * ```
   * boolean filter_isNull_0 = true;
   * boolean filter_value_0 = false;
   * if (!right_isNull) {
   *   filter_isNull_0 = false;
   *   filter_value_0 =
   *     org.opensearch.flint.core.field.bloomfilter.classic.ClassicBloomFilter.readFrom(
   *       new java.io.ByteArrayInputStream(left_value)
   *     ).mightContain(right_value);
   * }
   * ```
   */
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val leftGen = left.genCode(ctx)
    val rightGen = right.genCode(ctx)
    val bloomFilterEncoder = classOf[ClassicBloomFilter].getCanonicalName.stripSuffix("$")
    val bf = s"$bloomFilterEncoder.readFrom(new java.io.ByteArrayInputStream(${leftGen.value}))"
    val result = s"$bf.mightContain(${rightGen.value})"
    val resultCode =
      s"""
         |if (!(${rightGen.isNull})) {
         |  ${leftGen.code}
         |  ${ev.isNull} = false;
         |  ${ev.value} = $result;
         |}
       """.stripMargin
    ev.copy(code = code"""
      ${rightGen.code}
      boolean ${ev.isNull} = true;
      boolean ${ev.value} = false;
      $resultCode""")
  }
}

object BloomFilterMightContain {

  /**
   * Generate bloom filter might contain function given the bloom filter column and value.
   *
   * @param colName
   *   column name
   * @param value
   *   value
   * @return
   *   bloom filter might contain expression
   */
  def bloom_filter_might_contain(colName: String, value: Any): Column = {
    new Column(BloomFilterMightContain(col(colName).expr, lit(value).expr))
  }
}
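For orientation, a hedged usage sketch of the helper above; the SparkSession setup, table name, and column name here are hypothetical, and only bloom_filter_might_contain comes from this diff:

import org.apache.spark.sql.{DataFrame, SparkSession}

import org.opensearch.flint.spark.skipping.bloomfilter.BloomFilterMightContain.bloom_filter_might_contain

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Hypothetical skipping index: one row per source file, with a binary column
// "client_ip_bloom" holding serialized ClassicBloomFilter bytes.
val skippingIndex: DataFrame = spark.table("flint_skipping_index")

// Keep only files whose filter *might* contain the value; files pruned here are
// guaranteed not to contain it, since bloom filters have no false negatives.
val candidateFiles =
  skippingIndex.filter(bloom_filter_might_contain("client_ip_bloom", 12345L))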
@@ -0,0 +1,45 @@

/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.flint.spark.skipping.bloomfilter

import org.apache.spark.FlintSuite
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.{BinaryType, DoubleType, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class BloomFilterMightContainSuite extends FlintSuite {

  test("checkInputDataTypes should succeed for valid input types") {
    val binaryExpression = Literal(Array[Byte](1, 2, 3), BinaryType)
    val longExpression = Literal(42L, LongType)

    val bloomFilterMightContain = BloomFilterMightContain(binaryExpression, longExpression)
    assert(bloomFilterMightContain.checkInputDataTypes() == TypeCheckSuccess)
  }

  test("checkInputDataTypes should succeed for valid input types with nulls") {
    val binaryExpression = Literal.create(null, BinaryType)
    val longExpression = Literal.create(null, LongType)

    val bloomFilterMightContain = BloomFilterMightContain(binaryExpression, longExpression)
    assert(bloomFilterMightContain.checkInputDataTypes() == TypeCheckSuccess)
  }

  test("checkInputDataTypes should fail for invalid input types") {
    val stringExpression = Literal(UTF8String.fromString("invalid"), StringType)
    val doubleExpression = Literal(3.14, DoubleType)

    val bloomFilterMightContain = BloomFilterMightContain(stringExpression, doubleExpression)
    val expectedErrorMsg =
      s"""
         | Input to function bloom_filter_might_contain should be Binary expression followed by a Long value,
         | but it's [${stringExpression.dataType.catalogString}, ${doubleExpression.dataType.catalogString}].
         | """.stripMargin

    assert(bloomFilterMightContain.checkInputDataTypes() == TypeCheckFailure(expectedErrorMsg))
  }
}
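The suite covers type checking only; an eval round-trip test might look like the sketch below. It is not part of this diff, and the ClassicBloomFilter constructor, put, and writeTo calls are assumptions inferred from the readFrom/mightContain usage in eval(), so verify them against the actual API before relying on this:

import java.io.ByteArrayOutputStream

import org.opensearch.flint.core.field.bloomfilter.classic.ClassicBloomFilter

import org.apache.spark.sql.catalyst.InternalRow

// Reuses Literal, BinaryType, and LongType from the suite's imports above.
test("eval should report an inserted value as possibly present") {
  val filter = new ClassicBloomFilter(1000, 0.01) // assumed (expectedNumItems, fpp) constructor
  filter.put(42L) // assumed API
  val out = new ByteArrayOutputStream()
  filter.writeTo(out) // assumed API

  val expr = BloomFilterMightContain(
    Literal(out.toByteArray, BinaryType),
    Literal(42L, LongType))
  assert(expr.eval(InternalRow.empty) == true)
}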
[Review comment] Need a try-catch here, as Spark codegen doesn't allow checked exceptions.
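One plausible shape for that fix, as a sketch only: a drop-in replacement for the resultCode string in doGenCode above that wraps the checked java.io.IOException thrown by ClassicBloomFilter.readFrom in an unchecked exception (the merged code may differ):

// Wrap the generated membership check so the checked IOException from readFrom()
// never escapes Spark's generated method, which declares no checked exceptions.
val resultCode =
  s"""
     |if (!(${rightGen.isNull})) {
     |  ${leftGen.code}
     |  ${ev.isNull} = false;
     |  try {
     |    ${ev.value} = $result;
     |  } catch (java.io.IOException e) {
     |    throw new RuntimeException(e);
     |  }
     |}
   """.stripMargin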