// org.apache.parquet.hadoop.ParquetOutputFormat (Maven / Gradle / Ivy)
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop;
import static org.apache.parquet.column.ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.util.ContextUtil.getConfiguration;
import java.io.IOException;
import java.util.Locale;
import java.util.Objects;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.crypto.EncryptionPropertiesFactory;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.ParquetFileWriter.Mode;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext;
import org.apache.parquet.hadoop.codec.CodecConfig;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* OutputFormat to write to a Parquet file
*
* It requires a {@link WriteSupport} to convert the actual records to the underlying format.
* It requires the schema of the incoming records. (provided by the write support)
* It allows storing extra metadata in the footer (for example: for schema compatibility purpose when converting from a different schema language).
*
* The format configuration settings in the job configuration:
*
* # The block size is the size of a row group being buffered in memory
* # this limits the memory usage when writing
* # Larger values will improve the IO when reading but consume more memory when writing
* parquet.block.size=134217728 # in bytes, default = 128 * 1024 * 1024
*
* # The page size is for compression. When reading, each page can be decompressed independently.
* # A block is composed of pages. The page is the smallest unit that must be read fully to access a single record.
* # If this value is too small, the compression will deteriorate
* parquet.page.size=1048576 # in bytes, default = 1 * 1024 * 1024
*
* # There is one dictionary page per column per row group when dictionary encoding is used.
* # The dictionary page size works like the page size but for dictionary
* parquet.dictionary.page.size=1048576 # in bytes, default = 1 * 1024 * 1024
*
* # The compression algorithm used to compress pages
* parquet.compression=UNCOMPRESSED # one of: UNCOMPRESSED, SNAPPY, GZIP, LZO. Default: UNCOMPRESSED. Supersedes mapred.output.compress*
*
* # The write support class to convert the records written to the OutputFormat into the events accepted by the record consumer
* # Usually provided by a specific ParquetOutputFormat subclass
* parquet.write.support.class= # fully qualified name
*
* # To enable/disable dictionary encoding
* parquet.enable.dictionary=true # false to disable dictionary encoding
*
* # To enable/disable summary metadata aggregation at the end of a MR job
* # The default is true (enabled)
* parquet.enable.summary-metadata=true # false to disable summary aggregation
*
* # Maximum size (in bytes) allowed as padding to align row groups
* # This is also the minimum size of a row group. Default: 8388608
* parquet.writer.max-padding=8388608 # 8 MB
*
*
* If parquet.compression is not set, the following properties are checked (FileOutputFormat behavior).
* Note that we explicitely disallow custom Codecs
*
* mapred.output.compress=true
* mapred.output.compression.codec=org.apache.hadoop.io.compress.SomeCodec # the codec must be one of Snappy, GZip or LZO
*
*
* if none of those is set the data is uncompressed.
*
* @param the type of the materialized records
*/
public class ParquetOutputFormat extends FileOutputFormat {
private static final Logger LOG = LoggerFactory.getLogger(ParquetOutputFormat.class);
public static enum JobSummaryLevel {
/**
* Write no summary files
*/
NONE,
/**
* Write both summary file with row group info and summary file without
* (both _metadata and _common_metadata)
*/
ALL,
/**
* Write only the summary file without the row group info
* (_common_metadata only)
*/
COMMON_ONLY
}
/**
* An alias for JOB_SUMMARY_LEVEL, where true means ALL and false means NONE
*/
@Deprecated
public static final String ENABLE_JOB_SUMMARY = "parquet.enable.summary-metadata";
/**
* Must be one of the values in {@link JobSummaryLevel} (case insensitive)
*/
public static final String JOB_SUMMARY_LEVEL = "parquet.summary.metadata.level";
public static final String BLOCK_SIZE = "parquet.block.size";
public static final String PAGE_SIZE = "parquet.page.size";
public static final String COMPRESSION = "parquet.compression";
public static final String WRITE_SUPPORT_CLASS = "parquet.write.support.class";
public static final String DICTIONARY_PAGE_SIZE = "parquet.dictionary.page.size";
public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary";
public static final String VALIDATION = "parquet.validation";
public static final String WRITER_VERSION = "parquet.writer.version";
public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio";
public static final String MIN_MEMORY_ALLOCATION = "parquet.memory.min.chunk.size";
public static final String MAX_PADDING_BYTES = "parquet.writer.max-padding";
public static final String MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.min";
public static final String MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK = "parquet.page.size.row.check.max";
public static final String ESTIMATE_PAGE_SIZE_CHECK = "parquet.page.size.check.estimate";
public static final String COLUMN_INDEX_TRUNCATE_LENGTH = "parquet.columnindex.truncate.length";
public static final String STATISTICS_TRUNCATE_LENGTH = "parquet.statistics.truncate.length";
public static final String BLOOM_FILTER_ENABLED = "parquet.bloom.filter.enabled";
public static final String BLOOM_FILTER_EXPECTED_NDV = "parquet.bloom.filter.expected.ndv";
public static final String BLOOM_FILTER_MAX_BYTES = "parquet.bloom.filter.max.bytes";
public static final String PAGE_ROW_COUNT_LIMIT = "parquet.page.row.count.limit";
public static final String PAGE_WRITE_CHECKSUM_ENABLED = "parquet.page.write-checksum.enabled";
public static JobSummaryLevel getJobSummaryLevel(Configuration conf) {
String level = conf.get(JOB_SUMMARY_LEVEL);
String deprecatedFlag = conf.get(ENABLE_JOB_SUMMARY);
if (deprecatedFlag != null) {
LOG.warn("Setting " + ENABLE_JOB_SUMMARY + " is deprecated, please use " + JOB_SUMMARY_LEVEL);
}
if (level != null && deprecatedFlag != null) {
LOG.warn("Both " + JOB_SUMMARY_LEVEL + " and " + ENABLE_JOB_SUMMARY + " are set! " + ENABLE_JOB_SUMMARY + " will be ignored.");
}
if (level != null) {
return JobSummaryLevel.valueOf(level.toUpperCase());
}
if (deprecatedFlag != null) {
return Boolean.valueOf(deprecatedFlag) ? JobSummaryLevel.ALL : JobSummaryLevel.NONE;
}
return JobSummaryLevel.ALL;
}
public static void setWriteSupportClass(Job job, Class> writeSupportClass) {
getConfiguration(job).set(WRITE_SUPPORT_CLASS, writeSupportClass.getName());
}
public static void setWriteSupportClass(JobConf job, Class> writeSupportClass) {
job.set(WRITE_SUPPORT_CLASS, writeSupportClass.getName());
}
public static Class> getWriteSupportClass(Configuration configuration) {
final String className = configuration.get(WRITE_SUPPORT_CLASS);
if (className == null) {
return null;
}
final Class> writeSupportClass = ConfigurationUtil.getClassFromConfig(configuration, WRITE_SUPPORT_CLASS, WriteSupport.class);
return writeSupportClass;
}
public static void setBlockSize(Job job, int blockSize) {
getConfiguration(job).setInt(BLOCK_SIZE, blockSize);
}
public static void setPageSize(Job job, int pageSize) {
getConfiguration(job).setInt(PAGE_SIZE, pageSize);
}
public static void setDictionaryPageSize(Job job, int pageSize) {
getConfiguration(job).setInt(DICTIONARY_PAGE_SIZE, pageSize);
}
public static void setCompression(Job job, CompressionCodecName compression) {
getConfiguration(job).set(COMPRESSION, compression.name());
}
public static void setEnableDictionary(Job job, boolean enableDictionary) {
getConfiguration(job).setBoolean(ENABLE_DICTIONARY, enableDictionary);
}
public static boolean getEnableDictionary(JobContext jobContext) {
return getEnableDictionary(getConfiguration(jobContext));
}
public static int getBloomFilterMaxBytes(Configuration conf) {
return conf.getInt(BLOOM_FILTER_MAX_BYTES,
ParquetProperties.DEFAULT_MAX_BLOOM_FILTER_BYTES);
}
public static boolean getBloomFilterEnabled(Configuration conf) {
return conf.getBoolean(BLOOM_FILTER_ENABLED, DEFAULT_BLOOM_FILTER_ENABLED);
}
public static int getBlockSize(JobContext jobContext) {
return getBlockSize(getConfiguration(jobContext));
}
public static int getPageSize(JobContext jobContext) {
return getPageSize(getConfiguration(jobContext));
}
public static int getDictionaryPageSize(JobContext jobContext) {
return getDictionaryPageSize(getConfiguration(jobContext));
}
public static CompressionCodecName getCompression(JobContext jobContext) {
return getCompression(getConfiguration(jobContext));
}
public static boolean isCompressionSet(JobContext jobContext) {
return isCompressionSet(getConfiguration(jobContext));
}
public static void setValidation(JobContext jobContext, boolean validating) {
setValidation(getConfiguration(jobContext), validating);
}
public static boolean getValidation(JobContext jobContext) {
return getValidation(getConfiguration(jobContext));
}
public static boolean getEnableDictionary(Configuration configuration) {
return configuration.getBoolean(
ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED);
}
public static int getMinRowCountForPageSizeCheck(Configuration configuration) {
return configuration.getInt(MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK,
ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK);
}
public static int getMaxRowCountForPageSizeCheck(Configuration configuration) {
return configuration.getInt(MAX_ROW_COUNT_FOR_PAGE_SIZE_CHECK,
ParquetProperties.DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK);
}
public static boolean getEstimatePageSizeCheck(Configuration configuration) {
return configuration.getBoolean(ESTIMATE_PAGE_SIZE_CHECK,
ParquetProperties.DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK);
}
@Deprecated
public static int getBlockSize(Configuration configuration) {
return configuration.getInt(BLOCK_SIZE, DEFAULT_BLOCK_SIZE);
}
public static long getLongBlockSize(Configuration configuration) {
return configuration.getLong(BLOCK_SIZE, DEFAULT_BLOCK_SIZE);
}
public static int getPageSize(Configuration configuration) {
return configuration.getInt(PAGE_SIZE, ParquetProperties.DEFAULT_PAGE_SIZE);
}
public static int getDictionaryPageSize(Configuration configuration) {
return configuration.getInt(
DICTIONARY_PAGE_SIZE, ParquetProperties.DEFAULT_DICTIONARY_PAGE_SIZE);
}
public static WriterVersion getWriterVersion(Configuration configuration) {
String writerVersion = configuration.get(
WRITER_VERSION, ParquetProperties.DEFAULT_WRITER_VERSION.toString());
return WriterVersion.fromString(writerVersion);
}
public static CompressionCodecName getCompression(Configuration configuration) {
return CodecConfig.getParquetCompressionCodec(configuration);
}
public static boolean isCompressionSet(Configuration configuration) {
return CodecConfig.isParquetCompressionSet(configuration);
}
public static void setValidation(Configuration configuration, boolean validating) {
configuration.setBoolean(VALIDATION, validating);
}
public static boolean getValidation(Configuration configuration) {
return configuration.getBoolean(VALIDATION, false);
}
private CompressionCodecName getCodec(TaskAttemptContext taskAttemptContext) {
return CodecConfig.from(taskAttemptContext).getCodec();
}
public static void setMaxPaddingSize(JobContext jobContext, int maxPaddingSize) {
setMaxPaddingSize(getConfiguration(jobContext), maxPaddingSize);
}
public static void setMaxPaddingSize(Configuration conf, int maxPaddingSize) {
conf.setInt(MAX_PADDING_BYTES, maxPaddingSize);
}
private static int getMaxPaddingSize(Configuration conf) {
return conf.getInt(MAX_PADDING_BYTES, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
}
public static void setColumnIndexTruncateLength(JobContext jobContext, int length) {
setColumnIndexTruncateLength(getConfiguration(jobContext), length);
}
public static void setColumnIndexTruncateLength(Configuration conf, int length) {
conf.setInt(COLUMN_INDEX_TRUNCATE_LENGTH, length);
}
private static int getColumnIndexTruncateLength(Configuration conf) {
return conf.getInt(COLUMN_INDEX_TRUNCATE_LENGTH, ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH);
}
public static void setStatisticsTruncateLength(JobContext jobContext, int length) {
setStatisticsTruncateLength(getConfiguration(jobContext), length);
}
private static void setStatisticsTruncateLength(Configuration conf, int length) {
conf.setInt(STATISTICS_TRUNCATE_LENGTH, length);
}
private static int getStatisticsTruncateLength(Configuration conf) {
return conf.getInt(STATISTICS_TRUNCATE_LENGTH, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
public static void setPageRowCountLimit(JobContext jobContext, int rowCount) {
setPageRowCountLimit(getConfiguration(jobContext), rowCount);
}
public static void setPageRowCountLimit(Configuration conf, int rowCount) {
conf.setInt(PAGE_ROW_COUNT_LIMIT, rowCount);
}
private static int getPageRowCountLimit(Configuration conf) {
return conf.getInt(PAGE_ROW_COUNT_LIMIT, ParquetProperties.DEFAULT_PAGE_ROW_COUNT_LIMIT);
}
public static void setPageWriteChecksumEnabled(JobContext jobContext, boolean val) {
setPageWriteChecksumEnabled(getConfiguration(jobContext), val);
}
public static void setPageWriteChecksumEnabled(Configuration conf, boolean val) {
conf.setBoolean(PAGE_WRITE_CHECKSUM_ENABLED, val);
}
public static boolean getPageWriteChecksumEnabled(Configuration conf) {
return conf.getBoolean(PAGE_WRITE_CHECKSUM_ENABLED, ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
}
private WriteSupport writeSupport;
private ParquetOutputCommitter committer;
/**
* constructor used when this OutputFormat in wrapped in another one (In Pig for example)
*
* @param writeSupport the class used to convert the incoming records
* @param the Java write support type
*/
public > ParquetOutputFormat(S writeSupport) {
this.writeSupport = writeSupport;
}
/**
* used when directly using the output format and configuring the write support implementation
* using parquet.write.support.class
*
* @param the Java write support type
*/
public > ParquetOutputFormat() {
}
/**
* {@inheritDoc}
*/
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
return getRecordWriter(taskAttemptContext, Mode.CREATE);
}
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext, Mode mode)
throws IOException, InterruptedException {
final Configuration conf = getConfiguration(taskAttemptContext);
CompressionCodecName codec = getCodec(taskAttemptContext);
String extension = codec.getExtension() + ".parquet";
Path file = getDefaultWorkFile(taskAttemptContext, extension);
return getRecordWriter(conf, file, codec, mode);
}
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext, Path file)
throws IOException, InterruptedException {
return getRecordWriter(taskAttemptContext, file, Mode.CREATE);
}
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext, Path file, Mode mode)
throws IOException, InterruptedException {
return getRecordWriter(getConfiguration(taskAttemptContext), file, getCodec(taskAttemptContext), mode);
}
public RecordWriter getRecordWriter(Configuration conf, Path file, CompressionCodecName codec)
throws IOException, InterruptedException {
return getRecordWriter(conf, file, codec, Mode.CREATE);
}
public RecordWriter getRecordWriter(Configuration conf, Path file, CompressionCodecName codec, Mode mode)
throws IOException, InterruptedException {
final WriteSupport writeSupport = getWriteSupport(conf);
ParquetProperties.Builder propsBuilder = ParquetProperties.builder()
.withPageSize(getPageSize(conf))
.withDictionaryPageSize(getDictionaryPageSize(conf))
.withDictionaryEncoding(getEnableDictionary(conf))
.withWriterVersion(getWriterVersion(conf))
.estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
.withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
.withMaxRowCountForPageSizeCheck(getMaxRowCountForPageSizeCheck(conf))
.withColumnIndexTruncateLength(getColumnIndexTruncateLength(conf))
.withStatisticsTruncateLength(getStatisticsTruncateLength(conf))
.withMaxBloomFilterBytes(getBloomFilterMaxBytes(conf))
.withBloomFilterEnabled(getBloomFilterEnabled(conf))
.withPageRowCountLimit(getPageRowCountLimit(conf))
.withPageWriteChecksumEnabled(getPageWriteChecksumEnabled(conf));
new ColumnConfigParser()
.withColumnConfig(ENABLE_DICTIONARY, key -> conf.getBoolean(key, false), propsBuilder::withDictionaryEncoding)
.withColumnConfig(BLOOM_FILTER_ENABLED, key -> conf.getBoolean(key, false),
propsBuilder::withBloomFilterEnabled)
.withColumnConfig(BLOOM_FILTER_EXPECTED_NDV, key -> conf.getLong(key, -1L), propsBuilder::withBloomFilterNDV)
.parseConfig(conf);
ParquetProperties props = propsBuilder.build();
long blockSize = getLongBlockSize(conf);
int maxPaddingSize = getMaxPaddingSize(conf);
boolean validating = getValidation(conf);
if (LOG.isInfoEnabled()) {
LOG.info("Parquet block size to {}", blockSize);
LOG.info("Validation is {}", (validating ? "on" : "off"));
LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
LOG.info("Parquet properties are:\n{}", props);
}
WriteContext fileWriteContext = writeSupport.init(conf);
FileEncryptionProperties encryptionProperties = createEncryptionProperties(conf, file, fileWriteContext);
ParquetFileWriter w = new ParquetFileWriter(HadoopOutputFile.fromPath(file, conf),
fileWriteContext.getSchema(), mode, blockSize, maxPaddingSize, props.getColumnIndexTruncateLength(),
props.getStatisticsTruncateLength(), props.getPageWriteChecksumEnabled(), encryptionProperties);
w.start();
float maxLoad = conf.getFloat(ParquetOutputFormat.MEMORY_POOL_RATIO,
MemoryManager.DEFAULT_MEMORY_POOL_RATIO);
long minAllocation = conf.getLong(ParquetOutputFormat.MIN_MEMORY_ALLOCATION,
MemoryManager.DEFAULT_MIN_MEMORY_ALLOCATION);
synchronized (ParquetOutputFormat.class) {
if (memoryManager == null) {
memoryManager = new MemoryManager(maxLoad, minAllocation);
}
}
if (memoryManager.getMemoryPoolRatio() != maxLoad) {
LOG.warn("The configuration " + MEMORY_POOL_RATIO + " has been set. It should not " +
"be reset by the new value: " + maxLoad);
}
return new ParquetRecordWriter(
w,
writeSupport,
fileWriteContext.getSchema(),
fileWriteContext.getExtraMetaData(),
blockSize,
codec,
validating,
props,
memoryManager,
conf);
}
/**
* @param configuration to find the configuration for the write support class
* @return the configured write support
*/
@SuppressWarnings("unchecked")
public WriteSupport getWriteSupport(Configuration configuration){
if (writeSupport != null) return writeSupport;
Class> writeSupportClass = getWriteSupportClass(configuration);
try {
return (WriteSupport) Objects
.requireNonNull(writeSupportClass, "writeSupportClass cannot be null")
.newInstance();
} catch (InstantiationException | IllegalAccessException e) {
throw new BadConfigurationException("could not instantiate write support class: " + writeSupportClass, e);
}
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException {
if (committer == null) {
Path output = getOutputPath(context);
committer = new ParquetOutputCommitter(output, context);
}
return committer;
}
/**
* This memory manager is for all the real writers (InternalParquetRecordWriter) in one task.
*/
private static MemoryManager memoryManager;
public synchronized static MemoryManager getMemoryManager() {
return memoryManager;
}
public static FileEncryptionProperties createEncryptionProperties(Configuration fileHadoopConfig, Path tempFilePath,
WriteContext fileWriteContext) {
EncryptionPropertiesFactory cryptoFactory = EncryptionPropertiesFactory.loadFactory(fileHadoopConfig);
if (null == cryptoFactory) {
return null;
}
return cryptoFactory.getFileEncryptionProperties(fileHadoopConfig, tempFilePath, fileWriteContext);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy