
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop.rewrite;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Preconditions;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.IndexCache;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
/**
* A set of options to create a {@link ParquetRewriter}. See {@link RewriteOptions.Builder} for options description.
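*
* A minimal usage sketch; the paths, pruned column, and codec choice below are illustrative only:
* <pre>{@code
* Configuration conf = new Configuration();
* RewriteOptions options = new RewriteOptions.Builder(
*         conf, new Path("/tmp/input.parquet"), new Path("/tmp/output.parquet"))
*     .transform(CompressionCodecName.ZSTD)
*     .prune(Collections.singletonList("debug_payload"))
*     .build();
* ParquetRewriter rewriter = new ParquetRewriter(options); // throws IOException; handling omitted
* rewriter.processBlocks();
* rewriter.close();
* }</pre>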
*/
public class RewriteOptions {
private final ParquetConfiguration conf;
private final List<InputFile> inputFiles;
private final List<InputFile> inputFilesToJoin;
private final OutputFile outputFile;
private final List<String> pruneColumns;
private final CompressionCodecName newCodecName;
private final Map<String, MaskMode> maskColumns;
private final Map<String, String> renameColumns;
private final List<String> encryptColumns;
private final FileEncryptionProperties fileEncryptionProperties;
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean overwriteInputWithJoinColumns;
private final boolean ignoreJoinFilesMetadata;
private RewriteOptions(
ParquetConfiguration conf,
List<InputFile> inputFiles,
List<InputFile> inputFilesToJoin,
OutputFile outputFile,
List<String> pruneColumns,
CompressionCodecName newCodecName,
Map<String, MaskMode> maskColumns,
Map<String, String> renameColumns,
List<String> encryptColumns,
FileEncryptionProperties fileEncryptionProperties,
IndexCache.CacheStrategy indexCacheStrategy,
boolean overwriteInputWithJoinColumns,
boolean ignoreJoinFilesMetadata) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = inputFilesToJoin;
this.outputFile = outputFile;
this.pruneColumns = pruneColumns;
this.newCodecName = newCodecName;
this.maskColumns = maskColumns;
this.renameColumns = renameColumns;
this.encryptColumns = encryptColumns;
this.fileEncryptionProperties = fileEncryptionProperties;
this.indexCacheStrategy = indexCacheStrategy;
this.overwriteInputWithJoinColumns = overwriteInputWithJoinColumns;
this.ignoreJoinFilesMetadata = ignoreJoinFilesMetadata;
}
/**
* Gets the {@link Configuration} part of the rewrite options.
*
* @return the associated {@link Configuration}
*/
public Configuration getConf() {
return ConfigurationUtil.createHadoopConfiguration(conf);
}
/**
* Gets the {@link ParquetConfiguration} part of the rewrite options.
*
* @return the associated {@link ParquetConfiguration}
*/
public ParquetConfiguration getParquetConfiguration() {
return conf;
}
/**
* Gets the input {@link Path}s for the rewrite if they exist for all input files,
* otherwise throws a {@link RuntimeException}.
*
* @return a {@link List} of the associated input {@link Path}s
*/
public List<Path> getInputFiles() {
return inputFiles.stream()
.map(f -> {
if (f instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) f;
return new Path(hadoopOutputFile.getPath());
} else if (f instanceof HadoopInputFile) {
HadoopInputFile hadoopInputFile = (HadoopInputFile) f;
return hadoopInputFile.getPath();
} else {
throw new RuntimeException("The input files do not all have an associated Hadoop Path.");
}
})
.collect(Collectors.toList());
}
/**
* Gets the input {@link Path}s for the rewrite if they exist for all input files to join,
* otherwise throws a {@link RuntimeException}.
*
* @return a {@link List} of the associated input {@link Path}s to join
*/
public List<Path> getInputFilesToJoin() {
return inputFilesToJoin.stream()
.map(f -> {
if (f instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) f;
return new Path(hadoopOutputFile.getPath());
} else if (f instanceof HadoopInputFile) {
HadoopInputFile hadoopInputFile = (HadoopInputFile) f;
return hadoopInputFile.getPath();
} else {
throw new RuntimeException(
"The input files to join do not all have an associated Hadoop Path.");
}
})
.collect(Collectors.toList());
}
/**
* Gets the {@link InputFile}s for the rewrite.
*
* @return a {@link List} of the associated {@link InputFile}s
*/
public List<InputFile> getParquetInputFiles() {
return inputFiles;
}
/**
* Gets the {@link InputFile}s to be joined during the rewrite.
*
* @return a {@link List} of the associated {@link InputFile}s to join
*/
public List<InputFile> getParquetInputFilesToJoin() {
return inputFilesToJoin;
}
/**
* Gets the output {@link Path} for the rewrite if it exists, otherwise throws a {@link RuntimeException}.
*
* @return the associated {@link Path} if it exists
*/
public Path getOutputFile() {
if (outputFile instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) outputFile;
return new Path(hadoopOutputFile.getPath());
} else {
throw new RuntimeException("The output file does not have an associated Hadoop Path.");
}
}
/**
* Get the {@link OutputFile} for the rewrite.
*
* @return the associated {@link OutputFile}
*/
public OutputFile getParquetOutputFile() {
return outputFile;
}
public List<String> getPruneColumns() {
return pruneColumns;
}
public CompressionCodecName getNewCodecName() {
return newCodecName;
}
public Map<String, MaskMode> getMaskColumns() {
return maskColumns;
}
public Map<String, String> getRenameColumns() {
return renameColumns;
}
public List<String> getEncryptColumns() {
return encryptColumns;
}
public FileEncryptionProperties getFileEncryptionProperties() {
return fileEncryptionProperties;
}
public IndexCache.CacheStrategy getIndexCacheStrategy() {
return indexCacheStrategy;
}
public boolean getOverwriteInputWithJoinColumns() {
return overwriteInputWithJoinColumns;
}
public boolean getIgnoreJoinFilesMetadata() {
return ignoreJoinFilesMetadata;
}
/** Builder for {@link RewriteOptions}, which is used to construct a {@link ParquetRewriter}. */
public static class Builder {
private final ParquetConfiguration conf;
private final List<InputFile> inputFiles;
private final List<InputFile> inputFilesToJoin;
private final OutputFile outputFile;
private List<String> pruneColumns;
private CompressionCodecName newCodecName;
private Map<String, MaskMode> maskColumns;
private Map<String, String> renameColumns;
private List<String> encryptColumns;
private FileEncryptionProperties fileEncryptionProperties;
private IndexCache.CacheStrategy indexCacheStrategy = IndexCache.CacheStrategy.NONE;
private boolean overwriteInputWithJoinColumns = false;
private boolean ignoreJoinFilesMetadata = false;
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file path to read from
* @param inputFileToJoin input join file path to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, Path inputFile, Path inputFileToJoin, Path outputFile) {
this(
new HadoopParquetConfiguration(conf),
HadoopInputFile.fromPathUnchecked(inputFile, conf),
HadoopInputFile.fromPathUnchecked(inputFileToJoin, conf),
HadoopOutputFile.fromPathUnchecked(outputFile, conf));
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file path to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, Path inputFile, Path outputFile) {
this(
new HadoopParquetConfiguration(conf),
HadoopInputFile.fromPathUnchecked(inputFile, conf),
HadoopOutputFile.fromPathUnchecked(outputFile, conf));
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file to read from
* @param outputFile output file to rewrite to
*/
public Builder(ParquetConfiguration conf, InputFile inputFile, OutputFile outputFile) {
this(conf, Collections.singletonList(inputFile), null, outputFile);
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file to read from
* @param inputFileToJoin input join file to read from
* @param outputFile output file to rewrite to
*/
public Builder(
ParquetConfiguration conf, InputFile inputFile, InputFile inputFileToJoin, OutputFile outputFile) {
this(conf, Collections.singletonList(inputFile), Collections.singletonList(inputFileToJoin), outputFile);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that if merging more than one file, the schema of all files must be the same.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
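*
* A hedged sketch of merging several files into one (paths are illustrative):
* <pre>{@code
* List<Path> inputs = Arrays.asList(new Path("/tmp/part-0.parquet"), new Path("/tmp/part-1.parquet"));
* RewriteOptions options = new RewriteOptions.Builder(conf, inputs, new Path("/tmp/merged.parquet")).build();
* }</pre>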
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, List<Path> inputFiles, Path outputFile) {
this.conf = new HadoopParquetConfiguration(conf);
this.inputFiles = new ArrayList<>(inputFiles.size());
for (Path inputFile : inputFiles) {
this.inputFiles.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.inputFilesToJoin = new ArrayList<>();
this.outputFile = HadoopOutputFile.fromPathUnchecked(outputFile, conf);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that if merging more than one file, the schema of all files must be the same.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(ParquetConfiguration conf, List<InputFile> inputFiles, OutputFile outputFile) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = new ArrayList<>();
this.outputFile = outputFile;
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that the schema of all files within each file group (inputFiles and inputFilesToJoin)
* must be the same, while the schemas of the two groups may differ from each other.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
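*
* A hedged sketch of a join rewrite; the paths and flag value are illustrative:
* <pre>{@code
* RewriteOptions options = new RewriteOptions.Builder(
*         conf,
*         Arrays.asList(new Path("/tmp/main-0.parquet")),   // main input group
*         Arrays.asList(new Path("/tmp/extra-0.parquet")),  // group whose columns are joined in
*         new Path("/tmp/joined.parquet"))
*     .overwriteInputWithJoinColumns(true) // join columns win on name clashes
*     .build();
* }</pre>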
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param inputFilesToJoin list of input join file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, List<Path> inputFiles, List<Path> inputFilesToJoin, Path outputFile) {
this.conf = new HadoopParquetConfiguration(conf);
this.inputFiles = new ArrayList<>(inputFiles.size());
for (Path inputFile : inputFiles) {
this.inputFiles.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.inputFilesToJoin = new ArrayList<>(inputFilesToJoin.size());
for (Path inputFile : inputFilesToJoin) {
this.inputFilesToJoin.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.outputFile = HadoopOutputFile.fromPathUnchecked(outputFile, conf);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that the schema of all files within each file group (inputFiles and inputFilesToJoin)
* must be the same, while the schemas of the two groups may differ from each other.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
*
* See {@link ParquetRewriter} for more details.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param inputFilesToJoin list of input join file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(
ParquetConfiguration conf,
List<InputFile> inputFiles,
List<InputFile> inputFilesToJoin,
OutputFile outputFile) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = inputFilesToJoin;
this.outputFile = outputFile;
}
/**
* Set the columns to prune.
*
* By default, all columns are kept.
*
* @param columns list of columns to prune
* @return self
*/
public Builder prune(List<String> columns) {
this.pruneColumns = columns;
return this;
}
/**
* Set the compression codec to use for the output file.
*
* By default, the codec is the same as the input file.
*
* @param newCodecName compression codec to use
* @return self
*/
public Builder transform(CompressionCodecName newCodecName) {
this.newCodecName = newCodecName;
return this;
}
/**
* Set the columns to mask.
*
* By default, no columns are masked.
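*
* A hedged example that nullifies a hypothetical "ssn" column:
* <pre>{@code
* Map<String, MaskMode> masks = new HashMap<>();
* masks.put("ssn", MaskMode.NULLIFY);
* builder.mask(masks);
* }</pre>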
*
* @param maskColumns map of columns to mask to the masking mode
* @return self
*/
public Builder mask(Map<String, MaskMode> maskColumns) {
this.maskColumns = maskColumns;
return this;
}
/**
* Set the columns to be renamed.
*
* Note that nested columns can't be renamed; for a GroupType column, only the top-level column can be renamed.
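*
* A hedged example; the column names are illustrative:
* <pre>{@code
* builder.renameColumns(Collections.singletonMap("user_id", "customer_id"));
* }</pre>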
*
* @param renameColumns map where keys are original names and values are new names
* @return self
*/
public Builder renameColumns(Map<String, String> renameColumns) {
this.renameColumns = renameColumns;
return this;
}
/**
* Set the columns to encrypt.
*
* By default, no columns are encrypted.
*
* @param encryptColumns list of columns to encrypt
* @return self
*/
public Builder encrypt(List<String> encryptColumns) {
this.encryptColumns = encryptColumns;
return this;
}
/**
* Set the encryption properties to use for the output file.
*
* This is required if the list of columns to encrypt is not empty.
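*
* A hedged sketch; constructing the {@link FileEncryptionProperties} (e.g. from a footer key) is out of
* scope here, and the column names are illustrative:
* <pre>{@code
* builder.encrypt(Arrays.asList("ssn", "email"))
*        .encryptionProperties(fileEncryptionProperties);
* }</pre>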
*
* @param fileEncryptionProperties encryption properties to use
* @return self
*/
public Builder encryptionProperties(FileEncryptionProperties fileEncryptionProperties) {
this.fileEncryptionProperties = fileEncryptionProperties;
return this;
}
/**
* Add an input file to read from.
*
* @param path input file path to read from
* @return self
*/
public Builder addInputFile(Path path) {
this.inputFiles.add(
HadoopInputFile.fromPathUnchecked(path, ConfigurationUtil.createHadoopConfiguration(conf)));
return this;
}
/**
* Add an input join file to read from.
*
* @param path input join file path to read from
* @return self
*/
public Builder addInputFileToJoinColumns(Path path) {
this.inputFilesToJoin.add(
HadoopInputFile.fromPathUnchecked(path, ConfigurationUtil.createHadoopConfiguration(conf)));
return this;
}
/**
* Add an input file to read from.
*
* @param inputFile input file to read from
* @return self
*/
public Builder addInputFile(InputFile inputFile) {
this.inputFiles.add(inputFile);
return this;
}
/**
* Add an input file to join.
*
* @param fileToJoin input file to join
* @return self
*/
public Builder addInputFilesToJoin(InputFile fileToJoin) {
this.inputFilesToJoin.add(fileToJoin);
return this;
}
/**
* Set the index (ColumnIndex, OffsetIndex and BloomFilter) cache strategy.
*
* The PREFETCH_BLOCK strategy can reduce random seeks while rewriting; the default is NONE.
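*
* For example:
* <pre>{@code
* builder.indexCacheStrategy(IndexCache.CacheStrategy.PREFETCH_BLOCK);
* }</pre>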
*
* @param cacheStrategy the index cache strategy, supports: {@link IndexCache.CacheStrategy#NONE} or
* {@link IndexCache.CacheStrategy#PREFETCH_BLOCK}
* @return self
*/
public Builder indexCacheStrategy(IndexCache.CacheStrategy cacheStrategy) {
this.indexCacheStrategy = cacheStrategy;
return this;
}
/**
* Set whether columns from join files should overwrite columns with the same name in the main input files.
*
* By default, join file columns do not overwrite the main input file columns.
*
* @param overwriteInputWithJoinColumns whether columns from join files should overwrite columns
* from the main input files
* @return self
*/
public Builder overwriteInputWithJoinColumns(boolean overwriteInputWithJoinColumns) {
this.overwriteInputWithJoinColumns = overwriteInputWithJoinColumns;
return this;
}
/**
* Set whether metadata from join files should be ignored.
*
* By default, metadata is not ignored.
*
* @param ignoreJoinFilesMetadata whether metadata from join files should be ignored
* @return self
*/
public Builder ignoreJoinFilesMetadata(boolean ignoreJoinFilesMetadata) {
this.ignoreJoinFilesMetadata = ignoreJoinFilesMetadata;
return this;
}
/**
* Build the RewriteOptions.
*
* @return a RewriteOptions
*/
public RewriteOptions build() {
checkPreconditions();
return new RewriteOptions(
conf,
inputFiles,
(inputFilesToJoin != null ? inputFilesToJoin : new ArrayList<>()),
outputFile,
pruneColumns,
newCodecName,
maskColumns,
renameColumns == null
? new HashMap<>()
: renameColumns.entrySet().stream()
.collect(Collectors.toMap(x -> x.getKey().trim(), x -> x.getValue()
.trim())),
encryptColumns,
fileEncryptionProperties,
indexCacheStrategy,
overwriteInputWithJoinColumns,
ignoreJoinFilesMetadata);
}
private void checkPreconditions() {
Preconditions.checkArgument(inputFiles != null && !inputFiles.isEmpty(), "Input file is required");
Preconditions.checkArgument(outputFile != null, "Output file is required");
if (pruneColumns != null) {
if (maskColumns != null) {
for (String pruneColumn : pruneColumns) {
Preconditions.checkArgument(
!maskColumns.containsKey(pruneColumn), "Cannot prune and mask same column");
}
}
if (encryptColumns != null) {
for (String pruneColumn : pruneColumns) {
Preconditions.checkArgument(
!encryptColumns.contains(pruneColumn), "Cannot prune and encrypt same column");
}
}
}
if (renameColumns != null) {
Set<String> nullifiedColumns = maskColumns == null
? new HashSet<>()
: maskColumns.entrySet().stream()
.filter(x -> x.getValue() == MaskMode.NULLIFY)
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
renameColumns.forEach((colSrc, colDst) -> {
Preconditions.checkArgument(
colSrc != null && !colSrc.trim().isEmpty(), "Renamed column source name can't be empty");
Preconditions.checkArgument(
colDst != null && !colDst.trim().isEmpty(), "Renamed column target name can't be empty");
Preconditions.checkArgument(
!nullifiedColumns.contains(colSrc), "Cannot nullify and rename the same column");
Preconditions.checkArgument(
!colSrc.contains(".") && !colDst.contains("."),
"Renamed column can't be nested, in case of GroupType column only a top level column can be renamed");
});
}
if (encryptColumns != null && !encryptColumns.isEmpty()) {
Preconditions.checkArgument(
fileEncryptionProperties != null,
"FileEncryptionProperties is required when encrypting columns");
}
if (fileEncryptionProperties != null) {
Preconditions.checkArgument(
encryptColumns != null && !encryptColumns.isEmpty(),
"Encrypt columns is required when FileEncryptionProperties is set");
}
}
}
}