PARQUET-2430: Add parquet joiner v2 #1335

Merged (67 commits) on Sep 19, 2024

Changes shown below are from 22 of the 67 commits.

Commits
f5144b2
add initial ParquetJoiner implementation
Jan 28, 2024
01a08dd
add initial ParquetJoiner implementation
Feb 1, 2024
28c987c
Merge remote-tracking branch 'origin/master' into add-parquet-joiner
Feb 12, 2024
7ae3505
refactor ParquetJoiner implementation
Feb 17, 2024
05eb22a
extend the main test for multiple files on the right
Feb 20, 2024
6bb950d
extend the main test for multiple files on the right
Feb 22, 2024
87b923c
Merge branch 'master' into add-parquet-joiner
Feb 22, 2024
f9536c3
converge join logic, create a draft of options and rewriter
Feb 23, 2024
d7f11d9
move ParquetJoinTest logic to ParquetRewriterTest
Feb 27, 2024
e8e7ffe
improve Parquet stitching test
Mar 1, 2024
3ee946c
remove custom ParquetRewriter constructor
Mar 6, 2024
fd409c4
remove custom ParquetRewriter constructor
Mar 6, 2024
5a98219
refactor ParquetRewriter
Mar 12, 2024
7b2fd1a
apply spotless and address PR comments
Mar 14, 2024
8da8291
move extra column writing into processBlocksFromReader
Mar 15, 2024
68e41ba
add getInputFiles back
Mar 16, 2024
98b9b23
Merge remote-tracking branch 'fork/master' into add-parquet-joiner
Mar 16, 2024
6d2c222
fix extra ParquetRewriter constructor so tests can pass
Mar 16, 2024
883e935
remove not needed TODOs
Mar 20, 2024
8ef36b5
address PR comments
Mar 24, 2024
79cc2b8
Merge remote-tracking branch 'origin/master' into add-parquet-joiner
Apr 11, 2024
0bbf72f
rename inputFilesR to inputFilesToJoin
Apr 11, 2024
ca53bff
rename inputFilesR to inputFilesToJoinColumns
Apr 11, 2024
1e7998a
add getParquetInputFiles listing to the rewrite start logging
Apr 11, 2024
2ee9b40
redesign file joiner in ParquetRewriter
Apr 28, 2024
fc32dfd
Merge remote-tracking branch 'origin/master' into add-parquet-joiner-v2
Apr 28, 2024
db52c85
redesign file joiner in ParquetRewriter
Apr 28, 2024
9057e91
redesign file joiner in ParquetRewriter
Apr 28, 2024
5b055c0
redesign file joiner in ParquetRewriter
Apr 28, 2024
b70f88f
uncomment some code
Apr 28, 2024
270126b
fix ParquetRewriter joiner test
May 4, 2024
008cb40
Merge remote-tracking branch 'refs/remotes/origin/master' into add-pa…
Jul 25, 2024
0dc1793
add initial ParquetJoiner implementation
Jul 25, 2024
a53d108
add initial ParquetJoiner implementation
Jul 31, 2024
4da0b85
typo
Aug 6, 2024
c5c7b38
typo
Aug 6, 2024
92c95db
typo
Aug 6, 2024
86f7a4c
typo
Aug 6, 2024
73a4af4
docs
Aug 6, 2024
18feef4
typo
Aug 7, 2024
b24bffa
add getExtraMetadata()
Aug 7, 2024
21a5926
extract ensureRowCount()
Aug 7, 2024
c521a95
typo
Aug 7, 2024
1ea6755
typo
Aug 7, 2024
f2e01a2
add logging into getSchema()
Aug 7, 2024
d393125
typo
Aug 7, 2024
64d3bb2
add closing of input files readers
Aug 7, 2024
f50666a
fix RewriteOptions builder for inputFilesToJoin
Aug 7, 2024
d306336
Merge remote-tracking branch 'refs/remotes/origin/master' into add-pa…
Aug 7, 2024
ae9589d
fix ParquetRewriter constructor
Aug 7, 2024
9157960
extend tests for ParquetRewriter
Aug 14, 2024
3b722e4
spotless
Aug 14, 2024
bdba14c
refactor ParquetRewriterTest
Aug 15, 2024
a89eba6
add tests into ParquetRewriterTest
Aug 16, 2024
57432ee
Merge remote-tracking branch 'origin/master' into add-parquet-joiner-v2
Aug 26, 2024
f674bcf
extend tests in ParquetRewriterTest for joiner part
Aug 26, 2024
8514f39
add testMergeFilesToJoinWithDifferentRowCount test into ParquetRewrit…
Aug 27, 2024
0aaf963
Merge remote-tracking branch 'origin/master' into add-parquet-joiner-v2
Aug 29, 2024
4340c42
add testOneInputFileManyInputFilesToJoin with and without JoinColumns…
Aug 29, 2024
e475648
Merge remote-tracking branch 'origin/master' into add-parquet-joiner-v2
Aug 31, 2024
bb42979
add encrypt validation into ParquetRewriterTest's testOneInputFileMan…
Aug 31, 2024
5b97a4c
refactor ParquetRewriter slightly to address PR comments
Sep 8, 2024
27ba73b
add javadoc to ParquetRewriter
Sep 9, 2024
07f1e74
add javadoc to ParquetRewriter
Sep 10, 2024
e96c022
fix javadoc in ParquetRewriter to comply with Maven javadoc plugin
Sep 13, 2024
d1c1d76
fix javadoc in ParquetRewriter to comply with Maven javadoc plugin
Sep 13, 2024
9de20d7
fix javadoc in ParquetRewriter to comply with Maven javadoc plugin
Sep 13, 2024
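
Before the diff, a minimal, hedged sketch of how the joiner introduced by this PR might be driven. The builder calls addInputFilesToJoin and overwriteInputWithJoinColumns are assumptions inferred from the fields and getters visible in the diff (inputFilesToJoin, overwriteInputWithJoinColumns) and may be named differently in the merged RewriteOptions API; ignoreJoinFilesMetadata(boolean), the ParquetRewriter(RewriteOptions) constructor, and processBlocks() do appear in the diff itself.

```java
// Hedged usage sketch, not an authoritative example.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.rewrite.ParquetRewriter;
import org.apache.parquet.hadoop.rewrite.RewriteOptions;

import java.util.Collections;

public class ParquetJoinerSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    RewriteOptions options = new RewriteOptions.Builder(
            conf,
            new Path("left.parquet"),            // main input: defines row-group row counts
            new Path("joined-output.parquet"))   // stitched output
        .addInputFilesToJoin(Collections.singletonList(   // hypothetical name: right-side files whose
            new Path("right.parquet")))                   // columns are stitched in per row group
        .overwriteInputWithJoinColumns(true)     // hypothetical name: right side wins on a name clash
        .ignoreJoinFilesMetadata(true)           // from the diff: drop key-value metadata of join files
        .build();

    // ParquetRewriter aligns blocks by row count (see ensureRowCount in the diff) and
    // writes each output column chunk either from the main file or from the join file.
    try (ParquetRewriter rewriter = new ParquetRewriter(options)) {
      rewriter.processBlocks();
    }
  }
}
```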
@@ -102,7 +102,7 @@ public class ParquetRewriter implements Closeable {
private Map<ColumnPath, MaskMode> maskColumns = null;
private Set<ColumnPath> encryptColumns = null;
private boolean encryptMode = false;
private final Map<String, String> extraMetaData = new HashMap<>();
private final Map<String, String> extraMetaData;
// Writer to rewrite the input files
private final ParquetFileWriter writer;
// Number of blocks written which is used to keep track of the actual row group ordinal
@@ -125,6 +125,7 @@ public ParquetRewriter(RewriteOptions options) throws IOException {
inputFilesToJoin.addAll(getFileReaders(options.getParquetInputFilesToJoin(), conf));
ensureSameSchema(inputFiles);
ensureSameSchema(inputFilesToJoin);
ensureRowCount();
LOG.info(
"Start rewriting {} input file(s) {} to {}",
inputFiles.size() + inputFilesToJoin.size(),
@@ -134,37 +135,7 @@ public ParquetRewriter(RewriteOptions options) throws IOException {

this.outSchema = getSchema();
this.outSchema = pruneColumnsInSchema(outSchema, options.getPruneColumns());

List<TransParquetFileReader> allFiles;
if (options.getIgnoreJoinFilesMetadata()) {
allFiles = new ArrayList<>(inputFiles);
} else {
allFiles = Stream.concat(inputFiles.stream(), inputFilesToJoin.stream())
.collect(Collectors.toList());
}
extraMetaData.put(
ORIGINAL_CREATED_BY_KEY,
allFiles.stream()
.map(x -> x.getFooter().getFileMetaData().getCreatedBy())
.collect(Collectors.toSet())
.stream()
.reduce((a, b) -> a + "\n" + b)
.orElse(""));
allFiles.forEach(x -> extraMetaData.putAll(x.getFileMetaData().getKeyValueMetaData()));

if (!inputFilesToJoin.isEmpty()) {
List<Long> blocksRowCountsL = inputFiles.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
List<Long> blocksRowCountsR = inputFilesToJoin.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
if (!blocksRowCountsL.equals(blocksRowCountsR)) {
throw new IllegalArgumentException(
"The number of rows in each block must match! Left blocks row counts: " + blocksRowCountsL
+ ", right blocks row counts" + blocksRowCountsR + ".");
}
}
this.extraMetaData = getExtraMetadata(options);

if (options.getMaskColumns() != null) {
this.maskColumns = new HashMap<>();
@@ -192,6 +163,36 @@ public ParquetRewriter(RewriteOptions options) throws IOException {
writer.start();
}

// Ctor for legacy CompressionConverter and ColumnMasker
public ParquetRewriter(
TransParquetFileReader reader,
ParquetFileWriter writer,
ParquetMetadata meta,
MessageType outSchema,
String originalCreatedBy,
CompressionCodecName codecName,
List<String> maskColumns,
MaskMode maskMode) {
this.writer = writer;
this.outSchema = outSchema;
this.newCodecName = codecName;
extraMetaData = new HashMap<>(meta.getFileMetaData().getKeyValueMetaData());
extraMetaData.put(
ORIGINAL_CREATED_BY_KEY,
originalCreatedBy != null
? originalCreatedBy
: meta.getFileMetaData().getCreatedBy());
if (maskColumns != null && maskMode != null) {
this.maskColumns = new HashMap<>();
for (String col : maskColumns) {
this.maskColumns.put(ColumnPath.fromDotString(col), maskMode);
}
}
this.inputFiles.add(reader);
this.indexCacheStrategy = IndexCache.CacheStrategy.NONE;
this.overwriteInputWithJoinColumns = false;
}

private MessageType getSchema() {
MessageType schemaMain = inputFiles.peek().getFooter().getFileMetaData().getSchema();
if (inputFilesToJoin.isEmpty()) {
@@ -206,39 +207,53 @@ private MessageType getSchema() {
.getSchema()
.getFields()
.forEach(x -> {
if (!fieldNames.containsKey(x.getName()) || overwriteInputWithJoinColumns) {
if (!fieldNames.containsKey(x.getName())) {
LOG.info("Column {} is added to the output from inputFilesToJoin side", x.getName());
fieldNames.put(x.getName(), x);
} else if (overwriteInputWithJoinColumns) {
LOG.info("Column {} in inputFiles is overwritten by inputFilesToJoin side", x.getName());
fieldNames.put(x.getName(), x);
}
});
return new MessageType(schemaMain.getName(), new ArrayList<>(fieldNames.values()));
}
}

// Ctor for legacy CompressionConverter and ColumnMasker
public ParquetRewriter(
TransParquetFileReader reader,
ParquetFileWriter writer,
ParquetMetadata meta,
MessageType outSchema,
String originalCreatedBy,
CompressionCodecName codecName,
List<String> maskColumns,
MaskMode maskMode) {
this.writer = writer;
this.outSchema = outSchema;
this.newCodecName = codecName;
originalCreatedBy = originalCreatedBy == null ? meta.getFileMetaData().getCreatedBy() : originalCreatedBy;
extraMetaData.putAll(meta.getFileMetaData().getKeyValueMetaData());
extraMetaData.put(ORIGINAL_CREATED_BY_KEY, originalCreatedBy);
if (maskColumns != null && maskMode != null) {
this.maskColumns = new HashMap<>();
for (String col : maskColumns) {
this.maskColumns.put(ColumnPath.fromDotString(col), maskMode);
private Map<String, String> getExtraMetadata(RewriteOptions options) {
List<TransParquetFileReader> allFiles;
if (options.getIgnoreJoinFilesMetadata()) {
allFiles = new ArrayList<>(inputFiles);
} else {
allFiles = Stream.concat(inputFiles.stream(), inputFilesToJoin.stream())
.collect(Collectors.toList());
}
Map<String, String> result = new HashMap<>();
result.put(
ORIGINAL_CREATED_BY_KEY,
allFiles.stream()
.map(x -> x.getFooter().getFileMetaData().getCreatedBy())
.collect(Collectors.toSet())
.stream()
.reduce((a, b) -> a + "\n" + b)
.orElse(""));
allFiles.forEach(x -> result.putAll(x.getFileMetaData().getKeyValueMetaData()));
return result;
}

private void ensureRowCount() {
if (!inputFilesToJoin.isEmpty()) {
List<Long> blocksRowCountsL = inputFiles.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
List<Long> blocksRowCountsR = inputFilesToJoin.stream()
.flatMap(x -> x.getFooter().getBlocks().stream().map(BlockMetaData::getRowCount))
.collect(Collectors.toList());
if (!blocksRowCountsL.equals(blocksRowCountsR)) {
throw new IllegalArgumentException(
"The number of rows in each block must match! Left blocks row counts: " + blocksRowCountsL
+ ", right blocks row counts" + blocksRowCountsR + ".");
}
}
this.inputFiles.add(reader);
this.indexCacheStrategy = IndexCache.CacheStrategy.NONE;
this.overwriteInputWithJoinColumns = false;
}

private Queue<TransParquetFileReader> getFileReaders(List<InputFile> inputFiles, ParquetConfiguration conf) {
@@ -282,9 +297,9 @@ public void close() throws IOException {
}

public void processBlocks() throws IOException {
TransParquetFileReader readerJoin = inputFilesToJoin.peek();
IndexCache indexCacheJoin = null;
int blockIdxJoin = -1;
TransParquetFileReader readerToJoin = null;
IndexCache indexCacheToJoin = null;
int blockIdxToJoin = 0;
List<ColumnDescriptor> outColumns = outSchema.getColumns();

while (!inputFiles.isEmpty()) {
@@ -303,36 +318,42 @@ public void processBlocks() throws IOException {
Map<ColumnPath, ColumnChunkMetaData> pathToChunk =
blockMetaData.getColumns().stream().collect(Collectors.toMap(x -> x.getPath(), x -> x));

if (readerJoin != null
&& (blockIdxJoin == -1
|| ++blockIdxJoin
== readerJoin.getFooter().getBlocks().size())) {
blockIdxJoin = 0;
readerJoin = inputFilesToJoin.poll();
Set<ColumnPath> columnPathsJoin = readerJoin.getFileMetaData().getSchema().getColumns().stream()
.map(x -> ColumnPath.get(x.getPath()))
.collect(Collectors.toSet());
if (indexCacheJoin != null) {
indexCacheJoin.clean();
if (!inputFilesToJoin.isEmpty()) {
if (readerToJoin == null
|| ++blockIdxToJoin
== readerToJoin.getFooter().getBlocks().size()) {
if (readerToJoin != null) readerToJoin.close();
blockIdxToJoin = 0;
readerToJoin = inputFilesToJoin.poll();
Set<ColumnPath> columnPathsToJoin =
readerToJoin.getFileMetaData().getSchema().getColumns().stream()
.map(x -> ColumnPath.get(x.getPath()))
.collect(Collectors.toSet());
if (indexCacheToJoin != null) {
indexCacheToJoin.clean();
}
indexCacheToJoin = IndexCache.create(readerToJoin, columnPathsToJoin, indexCacheStrategy, true);
indexCacheToJoin.setBlockMetadata(
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin));
} else {
blockIdxToJoin++;
indexCacheToJoin.setBlockMetadata(
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin));
}
indexCacheJoin = IndexCache.create(readerJoin, columnPathsJoin, indexCacheStrategy, true);
indexCacheJoin.setBlockMetadata(
readerJoin.getFooter().getBlocks().get(blockIdxJoin));
} else {
blockIdxJoin++;
}

for (int outColumnIdx = 0; outColumnIdx < outColumns.size(); outColumnIdx++) {
ColumnPath colPath =
ColumnPath.get(outColumns.get(outColumnIdx).getPath());
if (readerJoin != null) {
Optional<ColumnChunkMetaData> chunkJoin =
readerJoin.getFooter().getBlocks().get(blockIdxJoin).getColumns().stream()
if (readerToJoin != null) {
Optional<ColumnChunkMetaData> chunkToJoin =
readerToJoin.getFooter().getBlocks().get(blockIdxToJoin).getColumns().stream()
.filter(x -> x.getPath().equals(colPath))
.findFirst();
if (chunkJoin.isPresent()
if (chunkToJoin.isPresent()
&& (overwriteInputWithJoinColumns || !columnPaths.contains(colPath))) {
processBlock(readerJoin, blockIdxJoin, outColumnIdx, indexCacheJoin, chunkJoin.get());
processBlock(
readerToJoin, blockIdxToJoin, outColumnIdx, indexCacheToJoin, chunkToJoin.get());
} else {
processBlock(reader, blockIdx, outColumnIdx, indexCache, pathToChunk.get(colPath));
}
@@ -348,7 +369,9 @@

indexCache.clean();
LOG.info("Finish rewriting input file: {}", reader.getFile());
reader.close();
}
if (readerToJoin != null) readerToJoin.close();
}

private void processBlock(
@@ -466,9 +466,6 @@ public Builder ignoreJoinFilesMetadata(boolean ignoreJoinFilesMetadata) {
public RewriteOptions build() {
Preconditions.checkArgument(inputFiles != null && !inputFiles.isEmpty(), "Input file is required");
Preconditions.checkArgument(outputFile != null, "Output file is required");
Preconditions.checkArgument(
inputFilesToJoin == null || !inputFiles.isEmpty(),
"Input files to join must be non-empty list or it can be left unset, it can't be an empty list");

if (pruneColumns != null) {
if (maskColumns != null) {
@@ -501,7 +498,7 @@ public RewriteOptions build() {
return new RewriteOptions(
conf,
inputFiles,
inputFilesToJoin,
(inputFilesToJoin != null ? inputFilesToJoin : new ArrayList<>()),
outputFile,
pruneColumns,
newCodecName,