/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.HadoopShims;
import org.apache.orc.impl.HadoopShimsFactory;
import org.apache.orc.impl.KeyProvider;
import org.apache.orc.impl.MemoryManagerImpl;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;
import org.apache.orc.impl.WriterInternal;
import org.apache.orc.impl.writer.WriterImplV2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/* This file is based on source code from the ORC Project (http://orc.apache.org/), licensed by the Apache
* Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
* additional information regarding copyright ownership. */
/**
* Contains factory methods to read or write ORC files.
*
* NOTE: This file was copied and modified to support zstd-jni. That feature is only available in
* ORC 2.0, but ORC 2.0 requires JDK 17 and we still need to support JDK 8.
*/
public class OrcFile {
private static final Logger LOG = LoggerFactory.getLogger(OrcFile.class);
public static final String MAGIC = "ORC";
/**
* Create a version number for the ORC file format, so that we can add non-forward compatible
* changes in the future. To make it easier for users to understand the version numbers, we use
* the Hive release number that first wrote that version of ORC files.
*
* Thus, if you add new encodings or other non-forward compatible changes to ORC files, which
* prevent the old reader from reading the new format, you should change these variables to
* reflect the next Hive release number. Non-forward compatible changes should never be added in
* patch releases.
*
* Do not make any changes that break backwards compatibility, which would prevent the new
* reader from reading ORC files generated by any released version of Hive.
*/
public enum Version {
V_0_11("0.11", 0, 11),
V_0_12("0.12", 0, 12),
/**
* Do not use this format except for testing. It will not be compatible with other versions
* of the software. While we iterate on the ORC 2.0 format, we will make incompatible format
* changes under this version without providing any forward or backward compatibility.
*
* When 2.0 is released, this version identifier will be completely removed.
*/
UNSTABLE_PRE_2_0("UNSTABLE-PRE-2.0", 1, 9999),
/** The generic identifier for all unknown versions. */
FUTURE("future", Integer.MAX_VALUE, Integer.MAX_VALUE);
public static final Version CURRENT = V_0_12;
private final String name;
private final int major;
private final int minor;
Version(String name, int major, int minor) {
this.name = name;
this.major = major;
this.minor = minor;
}
public static Version byName(String name) {
for (Version version : values()) {
if (version.name.equals(name)) {
return version;
}
}
throw new IllegalArgumentException("Unknown ORC version " + name);
}
/** Get the human readable name for the version. */
public String getName() {
return name;
}
/** Get the major version number. */
public int getMajor() {
return major;
}
/** Get the minor version number. */
public int getMinor() {
return minor;
}
}
/** WriterImplementation Enum. */
public enum WriterImplementation {
/** ORC_JAVA. */
ORC_JAVA(0), // ORC Java writer
/** ORC_CPP. */
ORC_CPP(1), // ORC C++ writer
/** PRESTO. */
PRESTO(2), // Presto writer
/** SCRITCHLEY_GO. */
SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc
/** TRINO. */
TRINO(4), // Trino writer
/** CUDF. */
CUDF(5), // CUDF writer
/** UNKNOWN. */
UNKNOWN(Integer.MAX_VALUE);
private final int id;
WriterImplementation(int id) {
this.id = id;
}
public int getId() {
return id;
}
public static WriterImplementation from(int id) {
WriterImplementation[] values = values();
if (id >= 0 && id < values.length - 1) {
return values[id];
}
return UNKNOWN;
}
}
/**
* Records the version of the writer in terms of which bugs have been fixed. When you fix bugs
* in the writer (or make substantial changes) that don't change the file format, add a new
* version here instead of Version.
*
* The ids are assigned sequentially from 6 per WriterImplementation so that readers that
* predate ORC-202 treat the other writers correctly.
*/
public enum WriterVersion {
// Java ORC Writer.
ORIGINAL(WriterImplementation.ORC_JAVA, 0),
/** Fixed stripe/file maximum statistics and string statistics to use utf8 for min/max. */
HIVE_8732(WriterImplementation.ORC_JAVA, 1),
HIVE_4243(WriterImplementation.ORC_JAVA, 2), // use real column names from Hive tables
HIVE_12055(WriterImplementation.ORC_JAVA, 3), // vectorized writer
HIVE_13083(WriterImplementation.ORC_JAVA, 4), // decimals write present stream correctly
ORC_101(WriterImplementation.ORC_JAVA, 5), // bloom filters use utf8
ORC_135(WriterImplementation.ORC_JAVA, 6), // timestamp stats use utc
ORC_517(WriterImplementation.ORC_JAVA, 7), // decimal64 min/max are fixed
ORC_203(WriterImplementation.ORC_JAVA, 8), // trim long strings & record they were trimmed
ORC_14(WriterImplementation.ORC_JAVA, 9), // column encryption added
// C++ ORC Writer
ORC_CPP_ORIGINAL(WriterImplementation.ORC_CPP, 6),
// Presto Writer
PRESTO_ORIGINAL(WriterImplementation.PRESTO, 6),
// Scritchley Go Writer
SCRITCHLEY_GO_ORIGINAL(WriterImplementation.SCRITCHLEY_GO, 6),
// Trino Writer
TRINO_ORIGINAL(WriterImplementation.TRINO, 6),
// CUDF Writer
CUDF_ORIGINAL(WriterImplementation.CUDF, 6),
// Don't use any magic numbers here except for the below:
FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer
private final int id;
private final WriterImplementation writer;
public WriterImplementation getWriterImplementation() {
return writer;
}
public int getId() {
return id;
}
WriterVersion(WriterImplementation writer, int id) {
this.writer = writer;
this.id = id;
}
private static final WriterVersion[][] values =
new WriterVersion[WriterImplementation.values().length][];
static {
for (WriterVersion v : WriterVersion.values()) {
WriterImplementation writer = v.writer;
if (writer != WriterImplementation.UNKNOWN) {
if (values[writer.id] == null) {
values[writer.id] = new WriterVersion[WriterVersion.values().length];
}
if (values[writer.id][v.id] != null) {
throw new IllegalArgumentException("Duplicate WriterVersion id " + v);
}
values[writer.id][v.id] = v;
}
}
}
/**
* Convert the integer from OrcProto.PostScript.writerVersion to the enumeration with
* unknown versions being mapped to FUTURE.
*
* @param writer the writer implementation
* @param val the serialized writer version
* @return the corresponding enumeration value
*/
public static WriterVersion from(WriterImplementation writer, int val) {
if (writer == WriterImplementation.UNKNOWN) {
return FUTURE;
}
if (writer != WriterImplementation.ORC_JAVA && val < 6) {
throw new IllegalArgumentException(
"ORC File with illegal version " + val + " for writer " + writer);
}
WriterVersion[] versions = values[writer.id];
if (val < 0 || versions.length <= val) {
return FUTURE;
}
WriterVersion result = versions[val];
return result == null ? FUTURE : result;
}
/**
* Does this file include the given fix or come from a different writer?
*
* @param fix the required fix
* @return true if the required fix is present
*/
public boolean includes(WriterVersion fix) {
return writer != fix.writer || id >= fix.id;
}
}
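/*
* A reader-side sketch of how WriterVersion is typically consulted. The path is hypothetical;
* the point is that includes() answers "does this file contain the given fix, or does it come
* from a different writer implementation?".
*
*   Reader reader = OrcFile.createReader(new Path("/data/example.orc"),
*       OrcFile.readerOptions(new Configuration()));
*   // true if written by a Java writer at or after ORC-135 (UTC timestamp statistics),
*   // or by a non-Java writer implementation
*   boolean utcStats = reader.getWriterVersion().includes(OrcFile.WriterVersion.ORC_135);
*/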
/** The WriterVersion for this version of the software. */
public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_14;
/** EncodingStrategy Enum. */
public enum EncodingStrategy {
/** SPEED. */
SPEED,
/** COMPRESSION. */
COMPRESSION
}
/** CompressionStrategy Enum. */
public enum CompressionStrategy {
/** SPEED. */
SPEED,
/** COMPRESSION. */
COMPRESSION
}
// unused
protected OrcFile() {}
/** Orc ReaderOptions. */
public static class ReaderOptions {
private final Configuration conf;
private FileSystem filesystem;
private long maxLength = Long.MAX_VALUE;
private OrcTail orcTail;
private KeyProvider keyProvider;
// TODO: We can generalize the FileMetadata interface. Make OrcTail implement FileMetadata
// and remove this class altogether. Both footer caching and llap caching just need OrcTail.
// For now, keeping this around to avoid complex surgery.
private FileMetadata fileMetadata;
private boolean useUTCTimestamp;
private boolean useProlepticGregorian;
public ReaderOptions(Configuration conf) {
this.conf = conf;
this.useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
}
public ReaderOptions filesystem(FileSystem fs) {
this.filesystem = fs;
return this;
}
public ReaderOptions maxLength(long val) {
maxLength = val;
return this;
}
public ReaderOptions orcTail(OrcTail tail) {
this.orcTail = tail;
return this;
}
/**
* Set the KeyProvider to override the default for getting keys.
*
* @param provider the KeyProvider to use for getting keys
* @return this
*/
public ReaderOptions setKeyProvider(KeyProvider provider) {
this.keyProvider = provider;
return this;
}
/**
* Should the reader convert dates and times to the proleptic Gregorian calendar?
*
* @param newValue should it use the proleptic Gregorian calendar?
* @return this
*/
public ReaderOptions convertToProlepticGregorian(boolean newValue) {
this.useProlepticGregorian = newValue;
return this;
}
public Configuration getConfiguration() {
return conf;
}
public FileSystem getFilesystem() {
return filesystem;
}
public long getMaxLength() {
return maxLength;
}
public OrcTail getOrcTail() {
return orcTail;
}
public KeyProvider getKeyProvider() {
return keyProvider;
}
/** @deprecated Use {@link #orcTail(OrcTail)} instead. */
public ReaderOptions fileMetadata(final FileMetadata metadata) {
fileMetadata = metadata;
return this;
}
public FileMetadata getFileMetadata() {
return fileMetadata;
}
public ReaderOptions useUTCTimestamp(boolean value) {
useUTCTimestamp = value;
return this;
}
public boolean getUseUTCTimestamp() {
return useUTCTimestamp;
}
public boolean getConvertToProlepticGregorian() {
return useProlepticGregorian;
}
}
public static ReaderOptions readerOptions(Configuration conf) {
return new ReaderOptions(conf);
}
public static Reader createReader(Path path, ReaderOptions options) throws IOException {
return new ReaderImpl(path, options);
}
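/*
* A minimal reading sketch using the two factory methods above. The path is hypothetical;
* VectorizedRowBatch comes from org.apache.hadoop.hive.ql.exec.vector and RecordReader from
* org.apache.orc.
*
*   Configuration conf = new Configuration();
*   Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
*       OrcFile.readerOptions(conf));
*   TypeDescription schema = reader.getSchema();
*   VectorizedRowBatch batch = schema.createRowBatch();
*   RecordReader rows = reader.rows();
*   while (rows.nextBatch(batch)) {
*       // batch.size rows are populated in batch.cols
*   }
*   rows.close();
*/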
/** WriterContext. */
public interface WriterContext {
Writer getWriter();
}
/** WriterCallback. */
public interface WriterCallback {
void preStripeWrite(WriterContext context) throws IOException;
void preFooterWrite(WriterContext context) throws IOException;
}
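/*
* A minimal WriterCallback sketch: the anonymous class below simply counts stripes as they are
* flushed; the counter and its use are illustrative only. It is registered through
* WriterOptions.callback().
*
*   OrcFile.WriterCallback stripeCounter = new OrcFile.WriterCallback() {
*       private int stripes = 0;
*
*       @Override
*       public void preStripeWrite(OrcFile.WriterContext context) {
*           stripes++;
*       }
*
*       @Override
*       public void preFooterWrite(OrcFile.WriterContext context) {
*           // stripes now holds the total number of stripes written to the file
*       }
*   };
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(new Configuration()).callback(stripeCounter);
*/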
/** BloomFilterVersion. */
public enum BloomFilterVersion {
// Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
// both old and new readers.
ORIGINAL("original"),
// Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
// See ORC-101
UTF8("utf8");
private final String id;
BloomFilterVersion(String id) {
this.id = id;
}
@Override
public String toString() {
return id;
}
public static BloomFilterVersion fromString(String s) {
for (BloomFilterVersion version : values()) {
if (version.id.equals(s)) {
return version;
}
}
throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
}
}
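/*
* Sketch of wiring bloom filters into the writer options; the column names and false positive
* rate are illustrative. UTF8 writes only the BLOOM_FILTER_UTF8 streams (see ORC-101), while
* ORIGINAL also keeps the legacy streams so that old readers can still use them.
*
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(new Configuration())
*       .bloomFilterColumns("name,id")
*       .bloomFilterFpp(0.01)
*       .bloomFilterVersion(OrcFile.BloomFilterVersion.UTF8);
*/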
/** ZstdCompressOptions. */
public static class ZstdCompressOptions {
private int compressionZstdLevel;
private int compressionZstdWindowLog;
public int getCompressionZstdLevel() {
return compressionZstdLevel;
}
public void setCompressionZstdLevel(int compressionZstdLevel) {
this.compressionZstdLevel = compressionZstdLevel;
}
public int getCompressionZstdWindowLog() {
return compressionZstdWindowLog;
}
public void setCompressionZstdWindowLog(int compressionZstdWindowLog) {
this.compressionZstdWindowLog = compressionZstdWindowLog;
}
}
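/*
* Sketch of how the zstd-jni settings reach the writer: WriterOptions below reads them from
* OrcConf and carries them in ZstdCompressOptions. The level and window log values are
* arbitrary examples.
*
*   Configuration conf = new Configuration();
*   conf.setInt(OrcConf.COMPRESSION_ZSTD_LEVEL.getAttribute(), 3);
*   conf.setInt(OrcConf.COMPRESSION_ZSTD_WINDOWLOG.getAttribute(), 23);
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(conf).compress(CompressionKind.ZSTD);
*   int level = opts.getZstdCompressOptions().getCompressionZstdLevel(); // 3
*/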
/** Options for creating ORC file writers. */
public static class WriterOptions implements Cloneable {
private final Configuration configuration;
private FileSystem fileSystemValue = null;
private TypeDescription schema = null;
private long stripeSizeValue;
private long stripeRowCountValue;
private long blockSizeValue;
private boolean buildIndex;
private int rowIndexStrideValue;
private int bufferSizeValue;
private boolean enforceBufferSize = false;
private boolean blockPaddingValue;
private CompressionKind compressValue;
private MemoryManager memoryManagerValue;
private Version versionValue;
private WriterCallback callback;
private EncodingStrategy encodingStrategy;
private CompressionStrategy compressionStrategy;
private ZstdCompressOptions zstdCompressOptions;
private double paddingTolerance;
private String bloomFilterColumns;
private double bloomFilterFpp;
private BloomFilterVersion bloomFilterVersion;
private PhysicalWriter physicalWriter;
private WriterVersion writerVersion = CURRENT_WRITER;
private boolean useUTCTimestamp;
private boolean overwrite;
private boolean writeVariableLengthBlocks;
private HadoopShims shims;
private String directEncodingColumns;
private String encryption;
private String masks;
private KeyProvider provider;
private boolean useProlepticGregorian;
private Map<String, HadoopShims.KeyMetadata> keyOverrides = new HashMap<>();
protected WriterOptions(Properties tableProperties, Configuration conf) {
configuration = conf;
memoryManagerValue = getStaticMemoryManager(conf);
overwrite = OrcConf.OVERWRITE_OUTPUT_FILE.getBoolean(tableProperties, conf);
stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
stripeRowCountValue = OrcConf.STRIPE_ROW_COUNT.getLong(tableProperties, conf);
blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
buildIndex = OrcConf.ENABLE_INDEXES.getBoolean(tableProperties, conf);
rowIndexStrideValue = (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties, conf);
blockPaddingValue = OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
compressValue =
CompressionKind.valueOf(
OrcConf.COMPRESS.getString(tableProperties, conf).toUpperCase());
enforceBufferSize =
OrcConf.ENFORCE_COMPRESSION_BUFFER_SIZE.getBoolean(tableProperties, conf);
String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties, conf);
versionValue = Version.byName(versionName);
String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties, conf);
encodingStrategy = EncodingStrategy.valueOf(enString);
String compString = OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf);
compressionStrategy = CompressionStrategy.valueOf(compString);
zstdCompressOptions = new ZstdCompressOptions();
zstdCompressOptions.setCompressionZstdLevel(
OrcConf.COMPRESSION_ZSTD_LEVEL.getInt(tableProperties, conf));
zstdCompressOptions.setCompressionZstdWindowLog(
OrcConf.COMPRESSION_ZSTD_WINDOWLOG.getInt(tableProperties, conf));
paddingTolerance = OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);
bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties, conf);
bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, conf);
bloomFilterVersion =
BloomFilterVersion.fromString(
OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties, conf));
shims = HadoopShimsFactory.get();
writeVariableLengthBlocks =
OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties, conf);
directEncodingColumns =
OrcConf.DIRECT_ENCODING_COLUMNS.getString(tableProperties, conf);
useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
}
/** @return a SHALLOW clone */
@Override
public WriterOptions clone() {
try {
return (WriterOptions) super.clone();
} catch (CloneNotSupportedException ex) {
throw new AssertionError("Expected super.clone() to work");
}
}
/**
* Provide the filesystem for the path, if the client has it available. If it is not
* provided, it will be found from the path.
*/
public WriterOptions fileSystem(FileSystem value) {
fileSystemValue = value;
return this;
}
/**
* If the output file already exists, should it be overwritten? If this is not set, the write
* operation will fail if the file already exists.
*/
public WriterOptions overwrite(boolean value) {
overwrite = value;
return this;
}
/**
* Set the stripe size for the file. The writer stores the contents of the stripe in memory
* until this memory limit is reached, at which point the stripe is flushed to the HDFS file
* and the next stripe is started.
*/
public WriterOptions stripeSize(long value) {
stripeSizeValue = value;
return this;
}
/**
* Set the file system block size for the file. For optimal performance, set the block size
* to a multiple of the stripe size.
*/
public WriterOptions blockSize(long value) {
blockSizeValue = value;
return this;
}
/**
* Set the distance between entries in the row index. The minimum value is 1000 to prevent
* the index from overwhelming the data. If the stride is set to 0, no indexes will be
* included in the file.
*/
public WriterOptions rowIndexStride(int value) {
rowIndexStrideValue = value;
return this;
}
/**
* The size of the memory buffers used for compressing and storing the stripe in memory.
* NOTE: The ORC writer may choose a smaller buffer size based on stripe size and the number
* of columns for efficient stripe writing and memory utilization. To force the writer to use
* the requested buffer size, call enforceBufferSize().
*/
public WriterOptions bufferSize(int value) {
bufferSizeValue = value;
return this;
}
/**
* Force the writer to use the requested buffer size instead of estimating it based on stripe
* size and the number of columns. See the bufferSize() method for more info. Default: false.
*/
public WriterOptions enforceBufferSize() {
enforceBufferSize = true;
return this;
}
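/*
* Example of the bufferSize()/enforceBufferSize() interplay described above; the 256 KB value
* is arbitrary. Without enforceBufferSize(), the writer may shrink the requested buffer based
* on stripe size and column count.
*
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(new Configuration())
*       .bufferSize(256 * 1024)
*       .enforceBufferSize();
*/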
/**
* Sets whether the HDFS blocks are padded to prevent stripes from straddling blocks.
* Padding improves locality and thus the speed of reading, but costs space.
*/
public WriterOptions blockPadding(boolean value) {
blockPaddingValue = value;
return this;
}
/** Sets the encoding strategy that is used to encode the data. */
public WriterOptions encodingStrategy(EncodingStrategy strategy) {
encodingStrategy = strategy;
return this;
}
/** Sets the tolerance for block padding as a percentage of stripe size. */
public WriterOptions paddingTolerance(double value) {
paddingTolerance = value;
return this;
}
/** Comma-separated list of column names for which bloom filters are to be created. */
public WriterOptions bloomFilterColumns(String columns) {
bloomFilterColumns = columns;
return this;
}
/**
* Specify the false positive probability for the bloom filters.
*
* @param fpp - false positive probability
* @return this
*/
public WriterOptions bloomFilterFpp(double fpp) {
bloomFilterFpp = fpp;
return this;
}
/** Sets the generic compression that is used to compress the data. */
public WriterOptions compress(CompressionKind value) {
compressValue = value;
return this;
}
/**
* Set the schema for the file. This is a required parameter.
*
* @param schema the schema for the file.
* @return this
*/
public WriterOptions setSchema(TypeDescription schema) {
this.schema = schema;
return this;
}
/** Sets the version of the file that will be written. */
public WriterOptions version(Version value) {
versionValue = value;
return this;
}
/**
* Add a listener for when the stripe and file are about to be closed.
*
* @param callback the object to be called when the stripe is closed
* @return this
*/
public WriterOptions callback(WriterCallback callback) {
this.callback = callback;
return this;
}
/** Set the version of the bloom filters to write. */
public WriterOptions bloomFilterVersion(BloomFilterVersion version) {
this.bloomFilterVersion = version;
return this;
}
/**
* Change the physical writer of the ORC file.
*
* SHOULD ONLY BE USED BY LLAP.
*
* @param writer the writer to control the layout and persistence
* @return this
*/
public WriterOptions physicalWriter(PhysicalWriter writer) {
this.physicalWriter = writer;
return this;
}
/** A public option to set the memory manager. */
public WriterOptions memory(MemoryManager value) {
memoryManagerValue = value;
return this;
}
/**
* Should the ORC file writer use HDFS variable length blocks, if they are available?
*
* @param value the new value
* @return this
*/
public WriterOptions writeVariableLengthBlocks(boolean value) {
writeVariableLengthBlocks = value;
return this;
}
/**
* Set the HadoopShims to use. This is only for testing.
*
* @param value the new value
* @return this
*/
public WriterOptions setShims(HadoopShims value) {
this.shims = value;
return this;
}
/**
* Manually set the writer version. This is an internal API.
*
* @param version the version to write
* @return this
*/
protected WriterOptions writerVersion(WriterVersion version) {
if (version == WriterVersion.FUTURE) {
throw new IllegalArgumentException("Can't write a future version.");
}
this.writerVersion = version;
return this;
}
/**
* Manually set the writer time zone to UTC. If not set, the system time zone is
* assumed.
*/
public WriterOptions useUTCTimestamp(boolean value) {
useUTCTimestamp = value;
return this;
}
/**
* Set the comma-separated list of columns that should be direct encoded.
*
* @param value the value to set
* @return this
*/
public WriterOptions directEncodingColumns(String value) {
directEncodingColumns = value;
return this;
}
/**
* Encrypt a set of columns with a key.
*
* Format of the string is a key-list:
*
* - key-list = key (';' key-list)?
* - key = key-name ':' field-list
* - field-list = field-name ( ',' field-list )?
* - field-name = number | field-part ('.' field-name)?
* - field-part = quoted string | simple name
*
* @param value a key-list of which columns to encrypt
* @return this
*/
public WriterOptions encrypt(String value) {
encryption = value;
return this;
}
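/*
* Example key-list following the grammar above; the key and field names are hypothetical. Here
* a single master key named "pii" encrypts the columns ssn and contact.email.
*
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(new Configuration())
*       .encrypt("pii:ssn,contact.email");
*/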
/**
* Set the masks for the unencrypted data.
*
* Format of the string is a mask-list:
*
* - mask-list = mask (';' mask-list)?
* - mask = mask-name (',' parameter)* ':' field-list
* - field-list = field-name ( ',' field-list )?
* - field-name = number | field-part ('.' field-name)?
* - field-part = quoted string | simple name
*
* @param value a list of the masks and column names
* @return this
*/
public WriterOptions masks(String value) {
masks = value;
return this;
}
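/*
* Example mask-list following the grammar above; "nullify" and "redact" are assumed to be the
* standard mask names, and the field names are hypothetical. The masks determine what readers
* without access to the encryption keys see in place of the protected columns.
*
*   OrcFile.WriterOptions opts = OrcFile.writerOptions(new Configuration())
*       .encrypt("pii:ssn,contact.email")
*       .masks("nullify:ssn;redact:contact.email");
*/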
/**
* For users that need to override the current version of a key, this method allows them to
* define the version and algorithm for a given key.
*
* This will mostly be used for ORC file merging where the writer has to use the same
* version of the key that the original files used.
*
* @param keyName the key name
* @param version the version of the key to use
* @param algorithm the algorithm for the given key version
* @return this
*/
public WriterOptions setKeyVersion(
String keyName, int version, EncryptionAlgorithm algorithm) {
HadoopShims.KeyMetadata meta = new HadoopShims.KeyMetadata(keyName, version, algorithm);
keyOverrides.put(keyName, meta);
return this;
}
/**
* Set the key provider for column encryption.
*
* @param provider the object that holds the master secrets
* @return this
*/
public WriterOptions setKeyProvider(KeyProvider provider) {
this.provider = provider;
return this;
}
/**
* Should the writer use the proleptic Gregorian calendar for times and dates?
*
* @param newValue true if we should use the proleptic calendar
* @return this
*/
public WriterOptions setProlepticGregorian(boolean newValue) {
this.useProlepticGregorian = newValue;
return this;
}
public KeyProvider getKeyProvider() {
return provider;
}
public boolean getBlockPadding() {
return blockPaddingValue;
}
public long getBlockSize() {
return blockSizeValue;
}
public String getBloomFilterColumns() {
return bloomFilterColumns;
}
public boolean getOverwrite() {
return overwrite;
}
public FileSystem getFileSystem() {
return fileSystemValue;
}
public Configuration getConfiguration() {
return configuration;
}
public TypeDescription getSchema() {
return schema;
}
public long getStripeSize() {
return stripeSizeValue;
}
public long getStripeRowCountValue() {
return stripeRowCountValue;
}
public CompressionKind getCompress() {
return compressValue;
}
public WriterCallback getCallback() {
return callback;
}
public Version getVersion() {
return versionValue;
}
public MemoryManager getMemoryManager() {
return memoryManagerValue;
}
public int getBufferSize() {
return bufferSizeValue;
}
public boolean isEnforceBufferSize() {
return enforceBufferSize;
}
public int getRowIndexStride() {
return rowIndexStrideValue;
}
public boolean isBuildIndex() {
return buildIndex;
}
public CompressionStrategy getCompressionStrategy() {
return compressionStrategy;
}
public EncodingStrategy getEncodingStrategy() {
return encodingStrategy;
}
public ZstdCompressOptions getZstdCompressOptions() {
return zstdCompressOptions;
}
public double getPaddingTolerance() {
return paddingTolerance;
}
public double getBloomFilterFpp() {
return bloomFilterFpp;
}
public BloomFilterVersion getBloomFilterVersion() {
return bloomFilterVersion;
}
public PhysicalWriter getPhysicalWriter() {
return physicalWriter;
}
public WriterVersion getWriterVersion() {
return writerVersion;
}
public boolean getWriteVariableLengthBlocks() {
return writeVariableLengthBlocks;
}
public HadoopShims getHadoopShims() {
return shims;
}
public boolean getUseUTCTimestamp() {
return useUTCTimestamp;
}
public String getDirectEncodingColumns() {
return directEncodingColumns;
}
public String getEncryption() {
return encryption;
}
public String getMasks() {
return masks;
}
public Map<String, HadoopShims.KeyMetadata> getKeyOverrides() {
return keyOverrides;
}
public boolean getProlepticGregorian() {
return useProlepticGregorian;
}
}
/**
* Create a set of writer options based on a configuration.
*
* @param conf the configuration to use for values
* @return A WriterOptions object that can be modified
*/
public static WriterOptions writerOptions(Configuration conf) {
return new WriterOptions(null, conf);
}
/**
* Create a set of write options based on a set of table properties and configuration.
*
* @param tableProperties the properties of the table
* @param conf the configuration of the query
* @return a WriterOptions object that can be modified
*/
public static WriterOptions writerOptions(Properties tableProperties, Configuration conf) {
return new WriterOptions(tableProperties, conf);
}
private static MemoryManager memoryManager = null;
private static synchronized MemoryManager getStaticMemoryManager(Configuration conf) {
if (memoryManager == null) {
memoryManager = new MemoryManagerImpl(conf);
}
return memoryManager;
}
/**
* Create an ORC file writer. This is the public interface for creating writers going forward
* and new options will only be added to this method.
*
* @param path filename to write to
* @param opts the options
* @return a new ORC file writer
* @throws IOException
*/
public static Writer createWriter(Path path, WriterOptions opts) throws IOException {
FileSystem fs =
opts.getFileSystem() == null
? path.getFileSystem(opts.getConfiguration())
: opts.getFileSystem();
switch (opts.getVersion()) {
case V_0_11:
case V_0_12:
return new WriterImpl(fs, path, opts);
case UNSTABLE_PRE_2_0:
return new WriterImplV2(fs, path, opts);
default:
throw new IllegalArgumentException("Unknown version " + opts.getVersion());
}
}
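/*
* End-to-end writing sketch using the options and factory above. The path, schema, and row
* values are illustrative; VectorizedRowBatch, LongColumnVector, and BytesColumnVector come
* from org.apache.hadoop.hive.ql.exec.vector.
*
*   Configuration conf = new Configuration();
*   TypeDescription schema = TypeDescription.fromString("struct<x:bigint,y:string>");
*   Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
*       OrcFile.writerOptions(conf).setSchema(schema));
*   VectorizedRowBatch batch = schema.createRowBatch();
*   LongColumnVector x = (LongColumnVector) batch.cols[0];
*   BytesColumnVector y = (BytesColumnVector) batch.cols[1];
*   int row = batch.size++;
*   x.vector[row] = 42L;
*   y.setVal(row, "hello".getBytes(StandardCharsets.UTF_8));
*   writer.addRowBatch(batch);
*   writer.close();
*/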
/**
* Do we understand the version in the reader?
*
* @param path the path of the file
* @param reader the ORC file reader
* @return is the version understood by this writer?
*/
static boolean understandFormat(Path path, Reader reader) {
if (reader.getFileVersion() == Version.FUTURE) {
LOG.info("Can't merge {} because it has a future version.", path);
return false;
}
if (reader.getWriterVersion() == WriterVersion.FUTURE) {
LOG.info("Can't merge {} because it has a future writerVersion.", path);
return false;
}
return true;
}
private static boolean sameKeys(EncryptionKey[] first, EncryptionKey[] next) {
if (first.length != next.length) {
return false;
}
for (int k = 0; k < first.length; ++k) {
if (!first[k].getKeyName().equals(next[k].getKeyName())
|| first[k].getKeyVersion() != next[k].getKeyVersion()
|| first[k].getAlgorithm() != next[k].getAlgorithm()) {
return false;
}
}
return true;
}
private static boolean sameMasks(DataMaskDescription[] first, DataMaskDescription[] next) {
if (first.length != next.length) {
return false;
}
for (int k = 0; k < first.length; ++k) {
if (!first[k].getName().equals(next[k].getName())) {
return false;
}
String[] firstParam = first[k].getParameters();
String[] nextParam = next[k].getParameters();
if (firstParam.length != nextParam.length) {
return false;
}
for (int p = 0; p < firstParam.length; ++p) {
if (!firstParam[p].equals(nextParam[p])) {
return false;
}
}
TypeDescription[] firstRoots = first[k].getColumns();
TypeDescription[] nextRoots = next[k].getColumns();
if (firstRoots.length != nextRoots.length) {
return false;
}
for (int r = 0; r < firstRoots.length; ++r) {
if (firstRoots[r].getId() != nextRoots[r].getId()) {
return false;
}
}
}
return true;
}
private static boolean sameVariants(EncryptionVariant[] first, EncryptionVariant[] next) {
if (first.length != next.length) {
return false;
}
for (int k = 0; k < first.length; ++k) {
if ((first[k].getKeyDescription() == null) != (next[k].getKeyDescription() == null)
|| !first[k].getKeyDescription()
.getKeyName()
.equals(next[k].getKeyDescription().getKeyName())
|| first[k].getRoot().getId() != next[k].getRoot().getId()) {
return false;
}
}
return true;
}
/**
* Is the new reader compatible with the file that is being written?
*
* @param firstReader the first reader that others must match
* @param userMetadata the user metadata
* @param path the new path name for warning messages
* @param reader the new reader
* @return is the reader compatible with the previous ones?
*/
static boolean readerIsCompatible(
Reader firstReader, Map<String, ByteBuffer> userMetadata, Path path, Reader reader) {
// now we have to check compatibility
TypeDescription schema = firstReader.getSchema();
if (!reader.getSchema().equals(schema)) {
LOG.info(
"Can't merge {} because of different schemas {} vs {}",
path,
reader.getSchema(),
schema);
return false;
}
CompressionKind compression = firstReader.getCompressionKind();
if (reader.getCompressionKind() != compression) {
LOG.info(
"Can't merge {} because of different compression {} vs {}",
path,
reader.getCompressionKind(),
compression);
return false;
}
Version fileVersion = firstReader.getFileVersion();
if (reader.getFileVersion() != fileVersion) {
LOG.info(
"Can't merge {} because of different file versions {} vs {}",
path,
reader.getFileVersion(),
fileVersion);
return false;
}
WriterVersion writerVersion = firstReader.getWriterVersion();
if (reader.getWriterVersion() != writerVersion) {
LOG.info(
"Can't merge {} because of different writer versions {} vs {}",
path,
reader.getWriterVersion(),
writerVersion);
return false;
}
int rowIndexStride = firstReader.getRowIndexStride();
if (reader.getRowIndexStride() != rowIndexStride) {
LOG.info(
"Can't merge {} because of different row index strides {} vs {}",
path,
reader.getRowIndexStride(),
rowIndexStride);
return false;
}
for (String key : reader.getMetadataKeys()) {
ByteBuffer currentValue = userMetadata.get(key);
if (currentValue != null) {
ByteBuffer newValue = reader.getMetadataValue(key);
if (!newValue.equals(currentValue)) {
LOG.info("Can't merge {} because of different user metadata {}", path, key);
return false;
}
}
}
if (!sameKeys(firstReader.getColumnEncryptionKeys(), reader.getColumnEncryptionKeys())) {
LOG.info("Can't merge {} because it has different encryption keys", path);
return false;
}
if (!sameMasks(firstReader.getDataMasks(), reader.getDataMasks())) {
LOG.info("Can't merge {} because it has different encryption masks", path);
return false;
}
if (!sameVariants(firstReader.getEncryptionVariants(), reader.getEncryptionVariants())) {
LOG.info("Can't merge {} because it has different encryption variants", path);
return false;
}
if (firstReader.writerUsedProlepticGregorian() != reader.writerUsedProlepticGregorian()) {
LOG.info("Can't merge {} because it uses a different calendar", path);
return false;
}
return true;
}
static void mergeMetadata(Map<String, ByteBuffer> metadata, Reader reader) {
for (String key : reader.getMetadataKeys()) {
metadata.put(key, reader.getMetadataValue(key));
}
}
/**
* Merges multiple ORC files that all have the same schema to produce a single ORC file. The
* merge will reject files that aren't compatible with the merged file so the output list may be
* shorter than the input list. The stripes are copied as serialized byte buffers. The user
* metadata are merged and files that disagree on the value associated with a key will be
* rejected.
*
* @param outputPath the output file
* @param options the options to write with, although options related to the input files'
* encodings are overridden
* @param inputFiles the list of files to merge
* @return the list of files that were successfully merged
* @throws IOException
*/
public static List<Path> mergeFiles(
Path outputPath, WriterOptions options, List<Path> inputFiles) throws IOException {
Writer output = null;
final Configuration conf = options.getConfiguration();
KeyProvider keyProvider = options.getKeyProvider();
try {
byte[] buffer = new byte[0];
Reader firstFile = null;
List<Path> result = new ArrayList<>(inputFiles.size());
Map<String, ByteBuffer> userMetadata = new HashMap<>();
int bufferSize = 0;
for (Path input : inputFiles) {
FileSystem fs = input.getFileSystem(conf);
Reader reader =
createReader(
input,
readerOptions(options.getConfiguration())
.filesystem(fs)
.setKeyProvider(keyProvider));
if (!understandFormat(input, reader)) {
continue;
} else if (firstFile == null) {
// if this is the first file that we are including, grab the values
firstFile = reader;
bufferSize = reader.getCompressionSize();
CompressionKind compression = reader.getCompressionKind();
options.bufferSize(bufferSize)
.version(reader.getFileVersion())
.writerVersion(reader.getWriterVersion())
.compress(compression)
.rowIndexStride(reader.getRowIndexStride())
.setSchema(reader.getSchema());
if (compression != CompressionKind.NONE) {
options.enforceBufferSize().bufferSize(bufferSize);
}
mergeMetadata(userMetadata, reader);
// ensure that the merged file uses the same key versions
for (EncryptionKey key : reader.getColumnEncryptionKeys()) {
options.setKeyVersion(
key.getKeyName(), key.getKeyVersion(), key.getAlgorithm());
}
output = createWriter(outputPath, options);
} else if (!readerIsCompatible(firstFile, userMetadata, input, reader)) {
continue;
} else {
mergeMetadata(userMetadata, reader);
if (bufferSize < reader.getCompressionSize()) {
bufferSize = reader.getCompressionSize();
((WriterInternal) output).increaseCompressionSize(bufferSize);
}
}
EncryptionVariant[] variants = reader.getEncryptionVariants();
List<StripeStatistics>[] completeList = new List[variants.length + 1];
for (int v = 0; v < variants.length; ++v) {
completeList[v] = reader.getVariantStripeStatistics(variants[v]);
}
completeList[completeList.length - 1] = reader.getVariantStripeStatistics(null);
StripeStatistics[] stripeStats = new StripeStatistics[completeList.length];
try (FSDataInputStream inputStream = ((ReaderImpl) reader).takeFile()) {
result.add(input);
for (StripeInformation stripe : reader.getStripes()) {
int length = (int) stripe.getLength();
if (buffer.length < length) {
buffer = new byte[length];
}
long offset = stripe.getOffset();
inputStream.readFully(offset, buffer, 0, length);
int stripeId = (int) stripe.getStripeId();
for (int v = 0; v < completeList.length; ++v) {
stripeStats[v] = completeList[v].get(stripeId);
}
output.appendStripe(buffer, 0, length, stripe, stripeStats);
}
}
}
if (output != null) {
for (Map.Entry<String, ByteBuffer> entry : userMetadata.entrySet()) {
output.addUserMetadata(entry.getKey(), entry.getValue());
}
output.close();
}
return result;
} catch (Throwable t) {
if (output != null) {
try {
output.close();
} catch (Throwable ignore) {
// PASS
}
try {
FileSystem fs =
options.getFileSystem() == null
? outputPath.getFileSystem(conf)
: options.getFileSystem();
fs.delete(outputPath, false);
} catch (Throwable ignore) {
// PASS
}
}
throw new IOException("Problem merging files into " + outputPath, t);
}
}
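/*
* Merging sketch: the input and output paths are hypothetical. Files whose schema, compression,
* file/writer version, row index stride, or encryption setup differ from the first accepted
* file are skipped, so the returned list may be shorter than the input list.
*
*   Configuration conf = new Configuration();
*   List<Path> inputs = Arrays.asList(new Path("/data/part-0.orc"), new Path("/data/part-1.orc"));
*   List<Path> merged = OrcFile.mergeFiles(new Path("/data/merged.orc"),
*       OrcFile.writerOptions(conf), inputs);
*/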
}