Update the offset when we filter out all values of a Row column in Se…

…lectiveStructColumnReader (#6575) Summary: Pull Request resolved: #6575 SelectiveStructColumnReader has logic to exit early if none of the values pass a pushed down filter. Unfortunately, we're missing the updates to the offsets when we do this. This results in us either trying to re- skip the values in the children when we read the next batch or reading to far ahed in the nulls buffer for the Row which leads to attempting to read off the end of the buffer when we get to the end of the stripe. The fix is just to make sure we update the offsets before returning. Reviewed By: Yuhta Differential Revision: D49285600 fbshipit-source-id: ed370b3702ad771c2885ab71f481fcfd1ee39084
facebookincubator · Sep 15, 2023 · 02ea91a · 02ea91a
1 parent 812280c
commit 02ea91a
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 0 deletions.
diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp
@@ -125,6 +125,8 @@ void SelectiveStructColumnReaderBase::read(
         activeRows, kind == velox::common::FilterKind::kIsNull, false);
     if (outputRows_.empty()) {
       recordParentNullsInChildren(offset, rows);
+      lazyVectorReadOffset_ = offset;
+      readOffset_ = offset + rows.back() + 1;
       return;
     }
     activeRows = outputRows_;

diff --git a/velox/dwio/dwrf/test/ReaderTest.cpp b/velox/dwio/dwrf/test/ReaderTest.cpp
@@ -2489,3 +2489,68 @@ TEST(TestReader, readFlatMapsWithNullMaps) {
     }
   }
 }
+
+TEST(TestReader, readStructWithWholeBatchFiltered) {
+  // Test reading a struct with a pushdown filter that filters out all rows for
+  // a certain batch.
+  auto* pool = defaultPool.get();
+  VectorMaker maker(pool);
+
+  auto rowType = ROW({"a"}, {BIGINT()});
+  const vector_size_t vectorSize = 20;
+  const vector_size_t batchSize = 10;
+  std::vector<VectorPtr> children{maker.flatVector<int64_t>(
+      vectorSize,
+      folly::identity,
+      // In the first batch, the parent Rows will all be null.
+      [&](auto i) { return i < batchSize; })};
+
+  BufferPtr nulls = AlignedBuffer::allocate<bool>(vectorSize, pool);
+  uint64_t* rawNulls = nulls->asMutable<uint64_t>();
+  memset(rawNulls, bits::kNotNullByte, nulls->capacity());
+  // Mark the Row as null in the first batch.
+  for (int i = 0; i < batchSize; ++i) {
+    bits::setNull(rawNulls, i, true);
+  }
+
+  auto c0 =
+      std::make_shared<RowVector>(pool, rowType, nulls, vectorSize, children);
+  auto row = maker.rowVector({"c0"}, {c0});
+
+  auto [writer, reader] = createWriterReader({row}, *pool);
+
+  auto schema = asRowType(row->type());
+  auto spec = std::make_shared<common::ScanSpec>("<root>");
+  spec->addAllChildFields(*schema);
+  // Create a filter that will filter out all rows in the first batch.
+  spec->childByName("c0")->setFilter(std::make_unique<common::IsNotNull>());
+  RowReaderOptions rowReaderOpts;
+  rowReaderOpts.setScanSpec(spec);
+
+  auto rowReader = reader->createRowReader(rowReaderOpts);
+  VectorPtr batch = BaseVector::create(schema, 0, pool);
+
+  ASSERT_TRUE(rowReader->next(batchSize, batch));
+  // Confirm that all rows were filtered out.
+  ASSERT_EQ(batch->size(), 0);
+  ASSERT_TRUE(rowReader->next(batchSize, batch));
+  // None of the rows should be filtered out in the second batch.
+  ASSERT_EQ(batch->size(), 10);
+  // Validate that we successfully read the batch.
+  auto rowVector = batch->as<RowVector>();
+  auto resultRows = rowVector->childAt(0)->loadedVector()->as<RowVector>();
+  ASSERT_EQ(resultRows->size(), batchSize);
+
+  for (int i = 0; i < batchSize; ++i) {
+    ASSERT_FALSE(resultRows->isNullAt(i));
+  }
+
+  auto resultValues =
+      resultRows->childAt(0)->loadedVector()->as<FlatVector<int64_t>>();
+  ASSERT_EQ(resultValues->size(), batchSize);
+
+  for (int i = 0; i < batchSize; ++i) {
+    ASSERT_FALSE(resultValues->isNullAt(i));
+    ASSERT_EQ(resultValues->valueAt(i), i + 10);
+  }
+}