
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop.rewrite;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Preconditions;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.IndexCache;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
/**
* A set of options to create a {@link ParquetRewriter}. See {@link RewriteOptions.Builder} for options description.
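*
* A minimal usage sketch; the paths, pruned column, and codec choice below are illustrative only:
* <pre>{@code
* Configuration conf = new Configuration();
* RewriteOptions options = new RewriteOptions.Builder(
*         conf, new Path("/tmp/input.parquet"), new Path("/tmp/output.parquet"))
*     .transform(CompressionCodecName.ZSTD)
*     .prune(Collections.singletonList("debug_payload"))
*     .build();
* ParquetRewriter rewriter = new ParquetRewriter(options); // throws IOException; handling omitted
* rewriter.processBlocks();
* rewriter.close();
* }</pre>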
*/
public class RewriteOptions {
private final ParquetConfiguration conf;
private final List<InputFile> inputFiles;
private final List<InputFile> inputFilesToJoin;
private final OutputFile outputFile;
private final List<String> pruneColumns;
private final CompressionCodecName newCodecName;
private final Map<String, MaskMode> maskColumns;
private final Map<String, String> renameColumns;
private final List<String> encryptColumns;
private final FileEncryptionProperties fileEncryptionProperties;
private final IndexCache.CacheStrategy indexCacheStrategy;
private final boolean overwriteInputWithJoinColumns;
private final boolean ignoreJoinFilesMetadata;
private RewriteOptions(
ParquetConfiguration conf,
List<InputFile> inputFiles,
List<InputFile> inputFilesToJoin,
OutputFile outputFile,
List<String> pruneColumns,
CompressionCodecName newCodecName,
Map<String, MaskMode> maskColumns,
Map<String, String> renameColumns,
List<String> encryptColumns,
FileEncryptionProperties fileEncryptionProperties,
IndexCache.CacheStrategy indexCacheStrategy,
boolean overwriteInputWithJoinColumns,
boolean ignoreJoinFilesMetadata) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = inputFilesToJoin;
this.outputFile = outputFile;
this.pruneColumns = pruneColumns;
this.newCodecName = newCodecName;
this.maskColumns = maskColumns;
this.renameColumns = renameColumns;
this.encryptColumns = encryptColumns;
this.fileEncryptionProperties = fileEncryptionProperties;
this.indexCacheStrategy = indexCacheStrategy;
this.overwriteInputWithJoinColumns = overwriteInputWithJoinColumns;
this.ignoreJoinFilesMetadata = ignoreJoinFilesMetadata;
}
/**
* Gets the {@link Configuration} part of the rewrite options.
*
* @return the associated {@link Configuration}
*/
public Configuration getConf() {
return ConfigurationUtil.createHadoopConfiguration(conf);
}
/**
* Gets the {@link ParquetConfiguration} part of the rewrite options.
*
* @return the associated {@link ParquetConfiguration}
*/
public ParquetConfiguration getParquetConfiguration() {
return conf;
}
/**
* Gets the input {@link Path}s for the rewrite if they exist for all input files,
* otherwise throws a {@link RuntimeException}.
*
* @return a {@link List} of the associated input {@link Path}s
*/
public List<Path> getInputFiles() {
return inputFiles.stream()
.map(f -> {
if (f instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) f;
return new Path(hadoopOutputFile.getPath());
} else if (f instanceof HadoopInputFile) {
HadoopInputFile hadoopInputFile = (HadoopInputFile) f;
return hadoopInputFile.getPath();
} else {
throw new RuntimeException("The input files do not all have an associated Hadoop Path.");
}
})
.collect(Collectors.toList());
}
/**
* Gets the input {@link Path}s for the rewrite if they exist for all input files to join,
* otherwise throws a {@link RuntimeException}.
*
* @return a {@link List} of the associated input {@link Path}s to join
*/
public List<Path> getInputFilesToJoin() {
return inputFilesToJoin.stream()
.map(f -> {
if (f instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) f;
return new Path(hadoopOutputFile.getPath());
} else if (f instanceof HadoopInputFile) {
HadoopInputFile hadoopInputFile = (HadoopInputFile) f;
return hadoopInputFile.getPath();
} else {
throw new RuntimeException(
"The input files to join do not all have an associated Hadoop Path.");
}
})
.collect(Collectors.toList());
}
/**
* Gets the {@link InputFile}s for the rewrite.
*
* @return a {@link List} of the associated {@link InputFile}s
*/
public List<InputFile> getParquetInputFiles() {
return inputFiles;
}
/**
* Gets the {@link InputFile}s to be joined during the rewrite.
*
* @return a {@link List} of the associated {@link InputFile}s to join
*/
public List<InputFile> getParquetInputFilesToJoin() {
return inputFilesToJoin;
}
/**
* Gets the output {@link Path} for the rewrite if it exists, otherwise throws a {@link RuntimeException}.
*
* @return the associated {@link Path} if it exists
*/
public Path getOutputFile() {
if (outputFile instanceof HadoopOutputFile) {
HadoopOutputFile hadoopOutputFile = (HadoopOutputFile) outputFile;
return new Path(hadoopOutputFile.getPath());
} else {
throw new RuntimeException("The output file does not have an associated Hadoop Path.");
}
}
/**
* Get the {@link OutputFile} for the rewrite.
*
* @return the associated {@link OutputFile}
*/
public OutputFile getParquetOutputFile() {
return outputFile;
}
public List<String> getPruneColumns() {
return pruneColumns;
}
public CompressionCodecName getNewCodecName() {
return newCodecName;
}
public Map<String, MaskMode> getMaskColumns() {
return maskColumns;
}
public Map<String, String> getRenameColumns() {
return renameColumns;
}
public List<String> getEncryptColumns() {
return encryptColumns;
}
public FileEncryptionProperties getFileEncryptionProperties() {
return fileEncryptionProperties;
}
public IndexCache.CacheStrategy getIndexCacheStrategy() {
return indexCacheStrategy;
}
public boolean getOverwriteInputWithJoinColumns() {
return overwriteInputWithJoinColumns;
}
public boolean getIgnoreJoinFilesMetadata() {
return ignoreJoinFilesMetadata;
}
/** Builder for {@link RewriteOptions}, which is used to construct a {@link ParquetRewriter}. */
public static class Builder {
private final ParquetConfiguration conf;
private final List<InputFile> inputFiles;
private final List<InputFile> inputFilesToJoin;
private final OutputFile outputFile;
private List<String> pruneColumns;
private CompressionCodecName newCodecName;
private Map<String, MaskMode> maskColumns;
private Map<String, String> renameColumns;
private List<String> encryptColumns;
private FileEncryptionProperties fileEncryptionProperties;
private IndexCache.CacheStrategy indexCacheStrategy = IndexCache.CacheStrategy.NONE;
private boolean overwriteInputWithJoinColumns = false;
private boolean ignoreJoinFilesMetadata = false;
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file path to read from
* @param inputFileToJoin input join file path to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, Path inputFile, Path inputFileToJoin, Path outputFile) {
this(
new HadoopParquetConfiguration(conf),
HadoopInputFile.fromPathUnchecked(inputFile, conf),
HadoopInputFile.fromPathUnchecked(inputFileToJoin, conf),
HadoopOutputFile.fromPathUnchecked(outputFile, conf));
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file path to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, Path inputFile, Path outputFile) {
this(
new HadoopParquetConfiguration(conf),
HadoopInputFile.fromPathUnchecked(inputFile, conf),
HadoopOutputFile.fromPathUnchecked(outputFile, conf));
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file to read from
* @param outputFile output file to rewrite to
*/
public Builder(ParquetConfiguration conf, InputFile inputFile, OutputFile outputFile) {
this(conf, Collections.singletonList(inputFile), null, outputFile);
}
/**
* Create a builder to create a RewriteOptions.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFile input file to read from
* @param inputFileToJoin input join file to read from
* @param outputFile output file to rewrite to
*/
public Builder(
ParquetConfiguration conf, InputFile inputFile, InputFile inputFileToJoin, OutputFile outputFile) {
this(conf, Collections.singletonList(inputFile), Collections.singletonList(inputFileToJoin), outputFile);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that if merging more than one file, the schema of all files must be the same.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
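*
* A hedged sketch of merging several files into one (paths are illustrative):
* <pre>{@code
* List<Path> inputs = Arrays.asList(new Path("/tmp/part-0.parquet"), new Path("/tmp/part-1.parquet"));
* RewriteOptions options = new RewriteOptions.Builder(conf, inputs, new Path("/tmp/merged.parquet")).build();
* }</pre>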
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, List<Path> inputFiles, Path outputFile) {
this.conf = new HadoopParquetConfiguration(conf);
this.inputFiles = new ArrayList<>(inputFiles.size());
for (Path inputFile : inputFiles) {
this.inputFiles.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.inputFilesToJoin = new ArrayList<>();
this.outputFile = HadoopOutputFile.fromPathUnchecked(outputFile, conf);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that if merging more than one file, the schema of all files must be the same.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(ParquetConfiguration conf, List<InputFile> inputFiles, OutputFile outputFile) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = new ArrayList<>();
this.outputFile = outputFile;
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that the schema of all files within each file group (inputFiles and inputFilesToJoin)
* must be the same, while the schemas of the two groups may differ from each other.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
* TODO: support rewrite by record to break the original row groups into reasonable ones.
*
* See {@link ParquetRewriter} for more details.
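*
* A hedged sketch of a join rewrite; the paths and flag value are illustrative:
* <pre>{@code
* RewriteOptions options = new RewriteOptions.Builder(
*         conf,
*         Arrays.asList(new Path("/tmp/main-0.parquet")),   // main input group
*         Arrays.asList(new Path("/tmp/extra-0.parquet")),  // group whose columns are joined in
*         new Path("/tmp/joined.parquet"))
*     .overwriteInputWithJoinColumns(true) // join columns win on name clashes
*     .build();
* }</pre>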
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param inputFilesToJoin list of input join file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(Configuration conf, List<Path> inputFiles, List<Path> inputFilesToJoin, Path outputFile) {
this.conf = new HadoopParquetConfiguration(conf);
this.inputFiles = new ArrayList<>(inputFiles.size());
for (Path inputFile : inputFiles) {
this.inputFiles.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.inputFilesToJoin = new ArrayList<>(inputFilesToJoin.size());
for (Path inputFile : inputFilesToJoin) {
this.inputFilesToJoin.add(HadoopInputFile.fromPathUnchecked(inputFile, conf));
}
this.outputFile = HadoopOutputFile.fromPathUnchecked(outputFile, conf);
}
/**
* Create a builder to create a RewriteOptions.
*
* Please note that the schema of all files within each file group (inputFiles and inputFilesToJoin)
* must be the same, while the schemas of the two groups may differ from each other.
* Otherwise, the rewrite will fail.
*
* The rewrite will keep the original row groups from all input files. This may not be optimal
* when row groups are very small, and it will not solve the small-file problem; it may even
* make things worse by producing a large footer in the output file.
*
* See {@link ParquetRewriter} for more details.
*
* @param conf configuration for reading from input files and writing to output file
* @param inputFiles list of input file paths to read from
* @param inputFilesToJoin list of input join file paths to read from
* @param outputFile output file path to rewrite to
*/
public Builder(
ParquetConfiguration conf,
List<InputFile> inputFiles,
List<InputFile> inputFilesToJoin,
OutputFile outputFile) {
this.conf = conf;
this.inputFiles = inputFiles;
this.inputFilesToJoin = inputFilesToJoin;
this.outputFile = outputFile;
}
/**
* Set the columns to prune.
*
* By default, all columns are kept.
*
* @param columns list of columns to prune
* @return self
*/
public Builder prune(List<String> columns) {
this.pruneColumns = columns;
return this;
}
/**
* Set the compression codec to use for the output file.
*
* By default, the codec is the same as the input file.
*
* @param newCodecName compression codec to use
* @return self
*/
public Builder transform(CompressionCodecName newCodecName) {
this.newCodecName = newCodecName;
return this;
}
/**
* Set the columns to mask.
*
* By default, no columns are masked.
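*
* A hedged example that nullifies a hypothetical "ssn" column:
* <pre>{@code
* Map<String, MaskMode> masks = new HashMap<>();
* masks.put("ssn", MaskMode.NULLIFY);
* builder.mask(masks);
* }</pre>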
*
* @param maskColumns map of columns to mask to the masking mode
* @return self
*/
public Builder mask(Map<String, MaskMode> maskColumns) {
this.maskColumns = maskColumns;
return this;
}
/**
* Set the columns to be renamed.
*
* Note that nested columns can't be renamed; for a GroupType column, only the top-level column can be renamed.
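*
* A hedged example; the column names are illustrative:
* <pre>{@code
* builder.renameColumns(Collections.singletonMap("user_id", "customer_id"));
* }</pre>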
*
* @param renameColumns map where keys are original names and values are new names
* @return self
*/
public Builder renameColumns(Map<String, String> renameColumns) {
this.renameColumns = renameColumns;
return this;
}
/**
* Set the columns to encrypt.
*
* By default, no columns are encrypted.
*
* @param encryptColumns list of columns to encrypt
* @return self
*/
public Builder encrypt(List<String> encryptColumns) {
this.encryptColumns = encryptColumns;
return this;
}
/**
* Set the encryption properties to use for the output file.
*
* This is required if the list of columns to encrypt is not empty.
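*
* A hedged sketch; constructing the {@link FileEncryptionProperties} (e.g. from a footer key) is out of
* scope here, and the column names are illustrative:
* <pre>{@code
* builder.encrypt(Arrays.asList("ssn", "email"))
*        .encryptionProperties(fileEncryptionProperties);
* }</pre>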
*
* @param fileEncryptionProperties encryption properties to use
* @return self
*/
public Builder encryptionProperties(FileEncryptionProperties fileEncryptionProperties) {
this.fileEncryptionProperties = fileEncryptionProperties;
return this;
}
/**
* Add an input file to read from.
*
* @param path input file path to read from
* @return self
*/
public Builder addInputFile(Path path) {
this.inputFiles.add(
HadoopInputFile.fromPathUnchecked(path, ConfigurationUtil.createHadoopConfiguration(conf)));
return this;
}
/**
* Add an input join file to read from.
*
* @param path input join file path to read from
* @return self
*/
public Builder addInputFileToJoinColumns(Path path) {
this.inputFilesToJoin.add(
HadoopInputFile.fromPathUnchecked(path, ConfigurationUtil.createHadoopConfiguration(conf)));
return this;
}
/**
* Add an input file to read from.
*
* @param inputFile input file to read from
* @return self
*/
public Builder addInputFile(InputFile inputFile) {
this.inputFiles.add(inputFile);
return this;
}
/**
* Add an input file to join.
*
* @param fileToJoin input file to join
* @return self
*/
public Builder addInputFilesToJoin(InputFile fileToJoin) {
this.inputFilesToJoin.add(fileToJoin);
return this;
}
/**
* Set the index (ColumnIndex, OffsetIndex and BloomFilter) cache strategy.
*
* The PREFETCH_BLOCK strategy can reduce random seeks while rewriting; the default is NONE.
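*
* For example:
* <pre>{@code
* builder.indexCacheStrategy(IndexCache.CacheStrategy.PREFETCH_BLOCK);
* }</pre>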
*
* @param cacheStrategy the index cache strategy, supports: {@link IndexCache.CacheStrategy#NONE} or
* {@link IndexCache.CacheStrategy#PREFETCH_BLOCK}
* @return self
*/
public Builder indexCacheStrategy(IndexCache.CacheStrategy cacheStrategy) {
this.indexCacheStrategy = cacheStrategy;
return this;
}
/**
* Set whether columns from join files should overwrite columns with the same name in the main input files.
*
* By default, join file columns do not overwrite the main input file columns.
*
* @param overwriteInputWithJoinColumns whether columns from join files should overwrite columns
* from the main input files
* @return self
*/
public Builder overwriteInputWithJoinColumns(boolean overwriteInputWithJoinColumns) {
this.overwriteInputWithJoinColumns = overwriteInputWithJoinColumns;
return this;
}
/**
* Set whether metadata from join files should be ignored.
*
* By default, metadata is not ignored.
*
* @param ignoreJoinFilesMetadata whether metadata from join files should be ignored
* @return self
*/
public Builder ignoreJoinFilesMetadata(boolean ignoreJoinFilesMetadata) {
this.ignoreJoinFilesMetadata = ignoreJoinFilesMetadata;
return this;
}
/**
* Build the RewriteOptions.
*
* @return a RewriteOptions
*/
public RewriteOptions build() {
checkPreconditions();
return new RewriteOptions(
conf,
inputFiles,
(inputFilesToJoin != null ? inputFilesToJoin : new ArrayList<>()),
outputFile,
pruneColumns,
newCodecName,
maskColumns,
renameColumns == null
? new HashMap<>()
: renameColumns.entrySet().stream()
.collect(Collectors.toMap(x -> x.getKey().trim(), x -> x.getValue()
.trim())),
encryptColumns,
fileEncryptionProperties,
indexCacheStrategy,
overwriteInputWithJoinColumns,
ignoreJoinFilesMetadata);
}
private void checkPreconditions() {
Preconditions.checkArgument(inputFiles != null && !inputFiles.isEmpty(), "Input file is required");
Preconditions.checkArgument(outputFile != null, "Output file is required");
if (pruneColumns != null) {
if (maskColumns != null) {
for (String pruneColumn : pruneColumns) {
Preconditions.checkArgument(
!maskColumns.containsKey(pruneColumn), "Cannot prune and mask same column");
}
}
if (encryptColumns != null) {
for (String pruneColumn : pruneColumns) {
Preconditions.checkArgument(
!encryptColumns.contains(pruneColumn), "Cannot prune and encrypt same column");
}
}
}
if (renameColumns != null) {
Set<String> nullifiedColumns = maskColumns == null
? new HashSet<>()
: maskColumns.entrySet().stream()
.filter(x -> x.getValue() == MaskMode.NULLIFY)
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
renameColumns.forEach((colSrc, colDst) -> {
Preconditions.checkArgument(
colSrc != null && !colSrc.trim().isEmpty(), "Renamed column source name can't be empty");
Preconditions.checkArgument(
colDst != null && !colDst.trim().isEmpty(), "Renamed column target name can't be empty");
Preconditions.checkArgument(
!nullifiedColumns.contains(colSrc), "Cannot nullify and rename the same column");
Preconditions.checkArgument(
!colSrc.contains(".") && !colDst.contains("."),
"Renamed column can't be nested, in case of GroupType column only a top level column can be renamed");
});
}
if (encryptColumns != null && !encryptColumns.isEmpty()) {
Preconditions.checkArgument(
fileEncryptionProperties != null,
"FileEncryptionProperties is required when encrypting columns");
}
if (fileEncryptionProperties != null) {
Preconditions.checkArgument(
encryptColumns != null && !encryptColumns.isEmpty(),
"Encrypt columns is required when FileEncryptionProperties is set");
}
}
}
}