All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.hadoop.rewrite.RewriteOptions Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop.rewrite;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Preconditions;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * A set of options to create a ParquetRewriter.
 *
 * <p>Instances are created through {@link Builder}, which validates the combination of
 * options in {@link Builder#build()} before constructing the immutable-by-reference holder.
 */
public class RewriteOptions {

  final Configuration conf;
  final List<Path> inputFiles;
  final Path outputFile;
  final List<String> pruneColumns;
  final CompressionCodecName newCodecName;
  // NOTE(review): the value type was lost in this copy of the file; MaskMode (same package)
  // is reconstructed from the "masking mode" wording of Builder#mask -- confirm.
  final Map<String, MaskMode> maskColumns;
  final List<String> encryptColumns;
  final FileEncryptionProperties fileEncryptionProperties;

  private RewriteOptions(Configuration conf,
                         List<Path> inputFiles,
                         Path outputFile,
                         List<String> pruneColumns,
                         CompressionCodecName newCodecName,
                         Map<String, MaskMode> maskColumns,
                         List<String> encryptColumns,
                         FileEncryptionProperties fileEncryptionProperties) {
    this.conf = conf;
    this.inputFiles = inputFiles;
    this.outputFile = outputFile;
    this.pruneColumns = pruneColumns;
    this.newCodecName = newCodecName;
    this.maskColumns = maskColumns;
    this.encryptColumns = encryptColumns;
    this.fileEncryptionProperties = fileEncryptionProperties;
  }

  /**
   * @return the configuration passed to the builder
   */
  public Configuration getConf() {
    return conf;
  }

  /**
   * @return the input file paths to read from
   */
  public List<Path> getInputFiles() {
    return inputFiles;
  }

  /**
   * @return the output file path to rewrite to
   */
  public Path getOutputFile() {
    return outputFile;
  }

  /**
   * @return the columns to prune, or {@code null} if none were set
   */
  public List<String> getPruneColumns() {
    return pruneColumns;
  }

  /**
   * @return the compression codec for the output file, or {@code null} if none was set
   */
  public CompressionCodecName getNewCodecName() {
    return newCodecName;
  }

  /**
   * @return the map of columns to mask to their masking mode, or {@code null} if none was set
   */
  public Map<String, MaskMode> getMaskColumns() {
    return maskColumns;
  }

  /**
   * @return the columns to encrypt, or {@code null} if none were set
   */
  public List<String> getEncryptColumns() {
    return encryptColumns;
  }

  /**
   * @return the encryption properties for the output file, or {@code null} if none were set
   */
  public FileEncryptionProperties getFileEncryptionProperties() {
    return fileEncryptionProperties;
  }

  /**
   * Builder to create a {@link RewriteOptions}.
   */
  public static class Builder {
    private Configuration conf;
    private List<Path> inputFiles;
    private Path outputFile;
    private List<String> pruneColumns;
    private CompressionCodecName newCodecName;
    private Map<String, MaskMode> maskColumns;
    private List<String> encryptColumns;
    private FileEncryptionProperties fileEncryptionProperties;

    /**
     * Create a builder to create a RewriteOptions.
     *
     * @param conf       configuration for reading from input files and writing to output file
     * @param inputFile  input file path to read from
     * @param outputFile output file path to rewrite to
     */
    public Builder(Configuration conf, Path inputFile, Path outputFile) {
      this.conf = conf;
      // Arrays.asList returns a fixed-size list; copy it into a growable list so that
      // addInputFile() works on a builder created with a single input file.
      this.inputFiles = new ArrayList<>(Arrays.asList(inputFile));
      this.outputFile = outputFile;
    }

    /**
     * Create a builder to create a RewriteOptions.
     * <p>
     * Please note that if merging more than one file, the schema of all files must be the same.
     * Otherwise, the rewrite will fail.
     * <p>
     * The rewrite will keep original row groups from all input files. This may not be optimal
     * if row groups are very small and will not solve small file problems. Instead, it will
     * make it worse to have a large file footer in the output file.
     * TODO: support rewrite by record to break the original row groups into reasonable ones.
     *
     * @param conf       configuration for reading from input files and writing to output file
     * @param inputFiles list of input file paths to read from
     * @param outputFile output file path to rewrite to
     */
    public Builder(Configuration conf, List<Path> inputFiles, Path outputFile) {
      this.conf = conf;
      // Keep a builder-owned, growable copy: addInputFile() must neither mutate the caller's
      // list nor fail when the caller passes an immutable one. A null list becomes an empty
      // list, which build() still rejects with "Input file is required".
      this.inputFiles = inputFiles == null ? new ArrayList<>() : new ArrayList<>(inputFiles);
      this.outputFile = outputFile;
    }

    /**
     * Set the columns to prune.
     * <p>
     * By default, all columns are kept.
     *
     * @param columns list of columns to prune
     * @return self
     */
    public Builder prune(List<String> columns) {
      this.pruneColumns = columns;
      return this;
    }

    /**
     * Set the compression codec to use for the output file.
     * <p>
     * By default, the codec is the same as the input file.
     *
     * @param newCodecName compression codec to use
     * @return self
     */
    public Builder transform(CompressionCodecName newCodecName) {
      this.newCodecName = newCodecName;
      return this;
    }

    /**
     * Set the columns to mask.
     * <p>
     * By default, no columns are masked.
     *
     * @param maskColumns map of columns to mask to the masking mode
     * @return self
     */
    public Builder mask(Map<String, MaskMode> maskColumns) {
      this.maskColumns = maskColumns;
      return this;
    }

    /**
     * Set the columns to encrypt.
     * <p>
     * By default, no columns are encrypted.
     *
     * @param encryptColumns list of columns to encrypt
     * @return self
     */
    public Builder encrypt(List<String> encryptColumns) {
      this.encryptColumns = encryptColumns;
      return this;
    }

    /**
     * Set the encryption properties to use for the output file.
     * <p>
     * This is required if encrypting columns are not empty.
     *
     * @param fileEncryptionProperties encryption properties to use
     * @return self
     */
    public Builder encryptionProperties(FileEncryptionProperties fileEncryptionProperties) {
      this.fileEncryptionProperties = fileEncryptionProperties;
      return this;
    }

    /**
     * Add an input file to read from.
     *
     * @param path input file path to read from
     * @return self
     */
    public Builder addInputFile(Path path) {
      this.inputFiles.add(path);
      return this;
    }

    /**
     * Build the RewriteOptions.
     * <p>
     * Validates that at least one input file and an output file are set, that no column is
     * both pruned and masked or both pruned and encrypted, and that encrypt columns and
     * file encryption properties are either both provided or both absent.
     *
     * @return a RewriteOptions
     */
    public RewriteOptions build() {
      Preconditions.checkArgument(inputFiles != null && !inputFiles.isEmpty(), "Input file is required");
      Preconditions.checkArgument(outputFile != null, "Output file is required");

      if (pruneColumns != null) {
        // A pruned column no longer exists in the output, so it cannot also be masked.
        if (maskColumns != null) {
          for (String pruneColumn : pruneColumns) {
            Preconditions.checkArgument(!maskColumns.containsKey(pruneColumn),
                "Cannot prune and mask same column");
          }
        }

        // Likewise, a pruned column cannot also be encrypted.
        if (encryptColumns != null) {
          for (String pruneColumn : pruneColumns) {
            Preconditions.checkArgument(!encryptColumns.contains(pruneColumn),
                "Cannot prune and encrypt same column");
          }
        }
      }

      // Encrypt columns and encryption properties must be supplied together.
      if (encryptColumns != null && !encryptColumns.isEmpty()) {
        Preconditions.checkArgument(fileEncryptionProperties != null,
            "FileEncryptionProperties is required when encrypting columns");
      }

      if (fileEncryptionProperties != null) {
        Preconditions.checkArgument(encryptColumns != null && !encryptColumns.isEmpty(),
            "Encrypt columns is required when FileEncryptionProperties is set");
      }

      return new RewriteOptions(conf,
          inputFiles,
          outputFile,
          pruneColumns,
          newCodecName,
          maskColumns,
          encryptColumns,
          fileEncryptionProperties);
    }
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy