/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.orc.impl.HadoopShims;
import org.apache.orc.impl.HadoopShimsFactory;
import org.apache.orc.impl.KeyProvider;
import org.apache.orc.impl.MemoryManagerImpl;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.WriterImpl;
import org.apache.orc.impl.WriterInternal;
import org.apache.orc.impl.writer.WriterImplV2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

/* This file is based on source code from the ORC Project (http://orc.apache.org/), licensed by the Apache
 * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. */

/**
 * Contains factory methods to read or write ORC files.
 *
 * <p>NOTE: The file was copied and modified to support zstd-jni. This feature is only supported in
 * ORC 2.0, but 2.0 only supports JDK17. We need to support JDK8.
 */
public class OrcFile {
    private static final Logger LOG = LoggerFactory.getLogger(OrcFile.class);
    public static final String MAGIC = "ORC";

    /**
     * Create a version number for the ORC file format, so that we can add non-forward compatible
     * changes in the future. To make it easier for users to understand the version numbers, we use
     * the Hive release number that first wrote that version of ORC files.
     *
     * <p>Thus, if you add new encodings or other non-forward compatible changes to ORC files, which
     * prevent the old reader from reading the new format, you should change these variables to
     * reflect the next Hive release number. Non-forward compatible changes should never be added in
     * patch releases.
     *
     * <p>Do not make any changes that break backwards compatibility, which would prevent the new
     * reader from reading ORC files generated by any released version of Hive.
     */
    public enum Version {
        V_0_11("0.11", 0, 11),
        V_0_12("0.12", 0, 12),

        /**
         * Do not use this format except for testing. It will not be compatible with other versions
         * of the software. While we iterate on the ORC 2.0 format, we will make incompatible format
         * changes under this version without providing any forward or backward compatibility.
         *
         * <p>When 2.0 is released, this version identifier will be completely removed.
         */
        UNSTABLE_PRE_2_0("UNSTABLE-PRE-2.0", 1, 9999),

        /** The generic identifier for all unknown versions. */
        FUTURE("future", Integer.MAX_VALUE, Integer.MAX_VALUE);

        public static final Version CURRENT = V_0_12;

        private final String name;
        private final int major;
        private final int minor;

        Version(String name, int major, int minor) {
            this.name = name;
            this.major = major;
            this.minor = minor;
        }

        public static Version byName(String name) {
            for (Version version : values()) {
                if (version.name.equals(name)) {
                    return version;
                }
            }
            throw new IllegalArgumentException("Unknown ORC version " + name);
        }

        /** Get the human readable name for the version. */
        public String getName() {
            return name;
        }

        /** Get the major version number. */
        public int getMajor() {
            return major;
        }

        /** Get the minor version number. */
        public int getMinor() {
            return minor;
        }
    }

    /** WriterImplementation Enum. */
    public enum WriterImplementation {
        /** ORC_JAVA. */
        ORC_JAVA(0), // ORC Java writer
        /** ORC_CPP. */
        ORC_CPP(1), // ORC C++ writer
        /** PRESTO. */
        PRESTO(2), // Presto writer
        /** SCRITCHLEY_GO. */
        SCRITCHLEY_GO(3), // Go writer from https://github.com/scritchley/orc
        /** TRINO. */
        TRINO(4), // Trino writer
        /** CUDF. */
        CUDF(5), // CUDF writer
        /** UNKNOWN. */
        UNKNOWN(Integer.MAX_VALUE);

        private final int id;

        WriterImplementation(int id) {
            this.id = id;
        }

        public int getId() {
            return id;
        }

        public static WriterImplementation from(int id) {
            WriterImplementation[] values = values();
            if (id >= 0 && id < values.length - 1) {
                return values[id];
            }
            return UNKNOWN;
        }
    }
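    // Illustrative sketch (not part of the original source): how the version helpers in this
    // class are typically consulted. The values shown follow directly from the enum definitions
    // in this file.
    //
    //     Version fileFormat = Version.byName("0.12");                             // -> V_0_12
    //     WriterVersion wv = WriterVersion.from(WriterImplementation.ORC_JAVA, 5); // -> ORC_101
    //     boolean utf8BloomFilters = wv.includes(WriterVersion.ORC_101);           // true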

    /**
     * Records the version of the writer in terms of which bugs have been fixed. When you fix bugs
     * in the writer (or make substantial changes) that don't change the file format, add a new
     * version here instead of Version.
     *
     * <p>The ids are assigned sequentially from 6 per WriterImplementation so that readers that
     * predate ORC-202 treat the other writers correctly.
     */
    public enum WriterVersion {
        // Java ORC Writer.
        ORIGINAL(WriterImplementation.ORC_JAVA, 0),
        HIVE_8732(WriterImplementation.ORC_JAVA, 1),
        /** fixed stripe/file maximum statistics and string statistics to use utf8 for min/max. */
        HIVE_4243(WriterImplementation.ORC_JAVA, 2), // use real column names from Hive tables
        HIVE_12055(WriterImplementation.ORC_JAVA, 3), // vectorized writer
        HIVE_13083(WriterImplementation.ORC_JAVA, 4), // decimals write present stream correctly
        ORC_101(WriterImplementation.ORC_JAVA, 5), // bloom filters use utf8
        ORC_135(WriterImplementation.ORC_JAVA, 6), // timestamp stats use utc
        ORC_517(WriterImplementation.ORC_JAVA, 7), // decimal64 min/max are fixed
        ORC_203(WriterImplementation.ORC_JAVA, 8), // trim long strings & record they were trimmed
        ORC_14(WriterImplementation.ORC_JAVA, 9), // column encryption added

        // C++ ORC Writer
        ORC_CPP_ORIGINAL(WriterImplementation.ORC_CPP, 6),

        // Presto Writer
        PRESTO_ORIGINAL(WriterImplementation.PRESTO, 6),

        // Scritchley Go Writer
        SCRITCHLEY_GO_ORIGINAL(WriterImplementation.SCRITCHLEY_GO, 6),

        // Trino Writer
        TRINO_ORIGINAL(WriterImplementation.TRINO, 6),

        // CUDF Writer
        CUDF_ORIGINAL(WriterImplementation.CUDF, 6),

        // Don't use any magic numbers here except for the below:
        FUTURE(WriterImplementation.UNKNOWN, Integer.MAX_VALUE); // a version from a future writer

        private final int id;
        private final WriterImplementation writer;

        public WriterImplementation getWriterImplementation() {
            return writer;
        }

        public int getId() {
            return id;
        }

        WriterVersion(WriterImplementation writer, int id) {
            this.writer = writer;
            this.id = id;
        }

        private static final WriterVersion[][] values =
                new WriterVersion[WriterImplementation.values().length][];

        static {
            for (WriterVersion v : WriterVersion.values()) {
                WriterImplementation writer = v.writer;
                if (writer != WriterImplementation.UNKNOWN) {
                    if (values[writer.id] == null) {
                        values[writer.id] = new WriterVersion[WriterVersion.values().length];
                    }
                    if (values[writer.id][v.id] != null) {
                        throw new IllegalArgumentException("Duplicate WriterVersion id " + v);
                    }
                    values[writer.id][v.id] = v;
                }
            }
        }

        /**
         * Convert the integer from OrcProto.PostScript.writerVersion to the enumeration with
         * unknown versions being mapped to FUTURE.
         *
         * @param writer the writer implementation
         * @param val the serialized writer version
         * @return the corresponding enumeration value
         */
        public static WriterVersion from(WriterImplementation writer, int val) {
            if (writer == WriterImplementation.UNKNOWN) {
                return FUTURE;
            }
            if (writer != WriterImplementation.ORC_JAVA && val < 6) {
                throw new IllegalArgumentException(
                        "ORC File with illegal version " + val + " for writer " + writer);
            }
            WriterVersion[] versions = values[writer.id];
            if (val < 0 || versions.length <= val) {
                return FUTURE;
            }
            WriterVersion result = versions[val];
            return result == null ? FUTURE : result;
        }

        /**
         * Does this file include the given fix or come from a different writer?
         *
         * @param fix the required fix
         * @return true if the required fix is present
         */
        public boolean includes(WriterVersion fix) {
            return writer != fix.writer || id >= fix.id;
        }
    }

    /** The WriterVersion for this version of the software. */
    public static final WriterVersion CURRENT_WRITER = WriterVersion.ORC_14;

    /** EncodingStrategy Enum. */
    public enum EncodingStrategy {
        /** SPEED. */
        SPEED,
        /** COMPRESSION. */
        COMPRESSION
    }

    /** CompressionStrategy Enum. */
    public enum CompressionStrategy {
        /** SPEED. */
        SPEED,
        /** COMPRESSION. */
        COMPRESSION
    }

    // unused
    protected OrcFile() {}

    /** Orc ReaderOptions. */
    public static class ReaderOptions {
        private final Configuration conf;
        private FileSystem filesystem;
        private long maxLength = Long.MAX_VALUE;
        private OrcTail orcTail;
        private KeyProvider keyProvider;
        // TODO: We can generalize FileMetadata interface. Make OrcTail implement FileMetadata
        // interface
        // and remove this class altogether. Both footer caching and llap caching just needs
        // OrcTail.
        // For now keeping this around to avoid complex surgery
        private FileMetadata fileMetadata;
        private boolean useUTCTimestamp;
        private boolean useProlepticGregorian;

        public ReaderOptions(Configuration conf) {
            this.conf = conf;
            this.useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf);
        }

        public ReaderOptions filesystem(FileSystem fs) {
            this.filesystem = fs;
            return this;
        }

        public ReaderOptions maxLength(long val) {
            maxLength = val;
            return this;
        }

        public ReaderOptions orcTail(OrcTail tail) {
            this.orcTail = tail;
            return this;
        }

        /**
         * Set the KeyProvider to override the default for getting keys.
         *
         * @param provider
         * @return
         */
        public ReaderOptions setKeyProvider(KeyProvider provider) {
            this.keyProvider = provider;
            return this;
        }

        /**
         * Should the reader convert dates and times to the proleptic Gregorian calendar?
         *
         * @param newValue should it use the proleptic Gregorian calendar?
         * @return this
         */
        public ReaderOptions convertToProlepticGregorian(boolean newValue) {
            this.useProlepticGregorian = newValue;
            return this;
        }

        public Configuration getConfiguration() {
            return conf;
        }

        public FileSystem getFilesystem() {
            return filesystem;
        }

        public long getMaxLength() {
            return maxLength;
        }

        public OrcTail getOrcTail() {
            return orcTail;
        }

        public KeyProvider getKeyProvider() {
            return keyProvider;
        }

        /** @deprecated Use {@link #orcTail(OrcTail)} instead. */
        public ReaderOptions fileMetadata(final FileMetadata metadata) {
            fileMetadata = metadata;
            return this;
        }

        public FileMetadata getFileMetadata() {
            return fileMetadata;
        }

        public ReaderOptions useUTCTimestamp(boolean value) {
            useUTCTimestamp = value;
            return this;
        }

        public boolean getUseUTCTimestamp() {
            return useUTCTimestamp;
        }

        public boolean getConvertToProlepticGregorian() {
            return useProlepticGregorian;
        }
    }

    public static ReaderOptions readerOptions(Configuration conf) {
        return new ReaderOptions(conf);
    }

    public static Reader createReader(Path path, ReaderOptions options) throws IOException {
        return new ReaderImpl(path, options);
    }

    /** WriterContext. */
    public interface WriterContext {
        Writer getWriter();
    }

    /** WriterCallback. */
    public interface WriterCallback {
        void preStripeWrite(WriterContext context) throws IOException;

        void preFooterWrite(WriterContext context) throws IOException;
    }

    /** BloomFilterVersion. */
    public enum BloomFilterVersion {
        // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
        // both old and new readers.
        ORIGINAL("original"),
        // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
        // See ORC-101
        UTF8("utf8");

        private final String id;

        BloomFilterVersion(String id) {
            this.id = id;
        }

        @Override
        public String toString() {
            return id;
        }

        public static BloomFilterVersion fromString(String s) {
            for (BloomFilterVersion version : values()) {
                if (version.id.equals(s)) {
                    return version;
                }
            }
            throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
        }
    }

    /** ZstdCompressOptions.
*/ public static class ZstdCompressOptions { private int compressionZstdLevel; private int compressionZstdWindowLog; public int getCompressionZstdLevel() { return compressionZstdLevel; } public void setCompressionZstdLevel(int compressionZstdLevel) { this.compressionZstdLevel = compressionZstdLevel; } public int getCompressionZstdWindowLog() { return compressionZstdWindowLog; } public void setCompressionZstdWindowLog(int compressionZstdWindowLog) { this.compressionZstdWindowLog = compressionZstdWindowLog; } } /** Options for creating ORC file writers. */ public static class WriterOptions implements Cloneable { private final Configuration configuration; private FileSystem fileSystemValue = null; private TypeDescription schema = null; private long stripeSizeValue; private long stripeRowCountValue; private long blockSizeValue; private boolean buildIndex; private int rowIndexStrideValue; private int bufferSizeValue; private boolean enforceBufferSize = false; private boolean blockPaddingValue; private CompressionKind compressValue; private MemoryManager memoryManagerValue; private Version versionValue; private WriterCallback callback; private EncodingStrategy encodingStrategy; private CompressionStrategy compressionStrategy; private ZstdCompressOptions zstdCompressOptions; private double paddingTolerance; private String bloomFilterColumns; private double bloomFilterFpp; private BloomFilterVersion bloomFilterVersion; private PhysicalWriter physicalWriter; private WriterVersion writerVersion = CURRENT_WRITER; private boolean useUTCTimestamp; private boolean overwrite; private boolean writeVariableLengthBlocks; private HadoopShims shims; private String directEncodingColumns; private String encryption; private String masks; private KeyProvider provider; private boolean useProlepticGregorian; private Map keyOverrides = new HashMap<>(); protected WriterOptions(Properties tableProperties, Configuration conf) { configuration = conf; memoryManagerValue = getStaticMemoryManager(conf); overwrite = OrcConf.OVERWRITE_OUTPUT_FILE.getBoolean(tableProperties, conf); stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf); stripeRowCountValue = OrcConf.STRIPE_ROW_COUNT.getLong(tableProperties, conf); blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf); buildIndex = OrcConf.ENABLE_INDEXES.getBoolean(tableProperties, conf); rowIndexStrideValue = (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf); bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties, conf); blockPaddingValue = OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf); compressValue = CompressionKind.valueOf( OrcConf.COMPRESS.getString(tableProperties, conf).toUpperCase()); enforceBufferSize = OrcConf.ENFORCE_COMPRESSION_BUFFER_SIZE.getBoolean(tableProperties, conf); String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties, conf); versionValue = Version.byName(versionName); String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties, conf); encodingStrategy = EncodingStrategy.valueOf(enString); String compString = OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf); compressionStrategy = CompressionStrategy.valueOf(compString); zstdCompressOptions = new ZstdCompressOptions(); zstdCompressOptions.setCompressionZstdLevel( OrcConf.COMPRESSION_ZSTD_LEVEL.getInt(tableProperties, conf)); zstdCompressOptions.setCompressionZstdWindowLog( OrcConf.COMPRESSION_ZSTD_WINDOWLOG.getInt(tableProperties, conf)); paddingTolerance = 
OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf); bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties, conf); bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, conf); bloomFilterVersion = BloomFilterVersion.fromString( OrcConf.BLOOM_FILTER_WRITE_VERSION.getString(tableProperties, conf)); shims = HadoopShimsFactory.get(); writeVariableLengthBlocks = OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties, conf); directEncodingColumns = OrcConf.DIRECT_ENCODING_COLUMNS.getString(tableProperties, conf); useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf); } /** @return a SHALLOW clone */ @Override public WriterOptions clone() { try { return (WriterOptions) super.clone(); } catch (CloneNotSupportedException ex) { throw new AssertionError("Expected super.clone() to work"); } } /** * Provide the filesystem for the path, if the client has it available. If it is not * provided, it will be found from the path. */ public WriterOptions fileSystem(FileSystem value) { fileSystemValue = value; return this; } /** * If the output file already exists, should it be overwritten? If it is not provided, write * operation will fail if the file already exists. */ public WriterOptions overwrite(boolean value) { overwrite = value; return this; } /** * Set the stripe size for the file. The writer stores the contents of the stripe in memory * until this memory limit is reached and the stripe is flushed to the HDFS file and the * next stripe started. */ public WriterOptions stripeSize(long value) { stripeSizeValue = value; return this; } /** * Set the file system block size for the file. For optimal performance, set the block size * to be multiple factors of stripe size. */ public WriterOptions blockSize(long value) { blockSizeValue = value; return this; } /** * Set the distance between entries in the row index. The minimum value is 1000 to prevent * the index from overwhelming the data. If the stride is set to 0, no indexes will be * included in the file. */ public WriterOptions rowIndexStride(int value) { rowIndexStrideValue = value; return this; } /** * The size of the memory buffers used for compressing and storing the stripe in memory. * NOTE: ORC writer may choose to use smaller buffer size based on stripe size and number of * columns for efficient stripe writing and memory utilization. To enforce writer to use the * requested buffer size use enforceBufferSize(). */ public WriterOptions bufferSize(int value) { bufferSizeValue = value; return this; } /** * Enforce writer to use requested buffer size instead of estimating buffer size based on * stripe size and number of columns. See bufferSize() method for more info. Default: false */ public WriterOptions enforceBufferSize() { enforceBufferSize = true; return this; } /** * Sets whether the HDFS blocks are padded to prevent stripes from straddling blocks. * Padding improves locality and thus the speed of reading, but costs space. */ public WriterOptions blockPadding(boolean value) { blockPaddingValue = value; return this; } /** Sets the encoding strategy that is used to encode the data. */ public WriterOptions encodingStrategy(EncodingStrategy strategy) { encodingStrategy = strategy; return this; } /** Sets the tolerance for block padding as a percentage of stripe size. */ public WriterOptions paddingTolerance(double value) { paddingTolerance = value; return this; } /** Comma separated values of column names for which bloom filter is to be created. 
*/ public WriterOptions bloomFilterColumns(String columns) { bloomFilterColumns = columns; return this; } /** * Specify the false positive probability for bloom filter. * * @param fpp - false positive probability * @return this */ public WriterOptions bloomFilterFpp(double fpp) { bloomFilterFpp = fpp; return this; } /** Sets the generic compression that is used to compress the data. */ public WriterOptions compress(CompressionKind value) { compressValue = value; return this; } /** * Set the schema for the file. This is a required parameter. * * @param schema the schema for the file. * @return this */ public WriterOptions setSchema(TypeDescription schema) { this.schema = schema; return this; } /** Sets the version of the file that will be written. */ public WriterOptions version(Version value) { versionValue = value; return this; } /** * Add a listener for when the stripe and file are about to be closed. * * @param callback the object to be called when the stripe is closed * @return this */ public WriterOptions callback(WriterCallback callback) { this.callback = callback; return this; } /** Set the version of the bloom filters to write. */ public WriterOptions bloomFilterVersion(BloomFilterVersion version) { this.bloomFilterVersion = version; return this; } /** * Change the physical writer of the ORC file. * *

SHOULD ONLY BE USED BY LLAP. * * @param writer the writer to control the layout and persistence * @return this */ public WriterOptions physicalWriter(PhysicalWriter writer) { this.physicalWriter = writer; return this; } /** A public option to set the memory manager. */ public WriterOptions memory(MemoryManager value) { memoryManagerValue = value; return this; } /** * Should the ORC file writer use HDFS variable length blocks, if they are available? * * @param value the new value * @return this */ public WriterOptions writeVariableLengthBlocks(boolean value) { writeVariableLengthBlocks = value; return this; } /** * Set the HadoopShims to use. This is only for testing. * * @param value the new value * @return this */ public WriterOptions setShims(HadoopShims value) { this.shims = value; return this; } /** * Manually set the writer version. This is an internal API. * * @param version the version to write * @return this */ protected WriterOptions writerVersion(WriterVersion version) { if (version == WriterVersion.FUTURE) { throw new IllegalArgumentException("Can't write a future version."); } this.writerVersion = version; return this; } /** * Manually set the time zone for the writer to utc. If not defined, system time zone is * assumed. */ public WriterOptions useUTCTimestamp(boolean value) { useUTCTimestamp = value; return this; } /** * Set the comma-separated list of columns that should be direct encoded. * * @param value the value to set * @return this */ public WriterOptions directEncodingColumns(String value) { directEncodingColumns = value; return this; } /** * Encrypt a set of columns with a key. * *

         * <p>Format of the string is a key-list.
         *
         * <ul>
         *   <li>key-list = key (';' key-list)?
         *   <li>key = key-name ':' field-list
         *   <li>field-list = field-name ( ',' field-list )?
         *   <li>field-name = number | field-part ('.' field-name)?
         *   <li>field-part = quoted string | simple name
         * </ul>
         *
         * @param value a key-list of which columns to encrypt
         * @return this
         */
        public WriterOptions encrypt(String value) {
            encryption = value;
            return this;
        }
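        // Illustrative sketch (not part of the original source): one way the key-list grammar
        // above can be used. The key names ("pii", "top_secret") and field names are made up
        // for the example.
        //
        //     // encrypt columns "ssn" and "email" with key "pii", and the nested field
        //     // "contact.phone" with key "top_secret"
        //     writerOptions.encrypt("pii:ssn,email;top_secret:contact.phone");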

        /**
         * Set the masks for the unencrypted data.
         *
         * <p>Format of the string is a mask-list.
         *
         * <ul>
         *   <li>mask-list = mask (';' mask-list)?
         *   <li>mask = mask-name (',' parameter)* ':' field-list
         *   <li>field-list = field-name ( ',' field-list )?
         *   <li>field-name = number | field-part ('.' field-name)?
         *   <li>field-part = quoted string | simple name
         * </ul>
         *
         * @param value a list of the masks and column names
         * @return this
         */
        public WriterOptions masks(String value) {
            masks = value;
            return this;
        }

        /**
         * For users that need to override the current version of a key, this method allows them to
         * define the version and algorithm for a given key.
         *
         * <p>

This will mostly be used for ORC file merging where the writer has to use the same * version of the key that the original files used. * * @param keyName the key name * @param version the version of the key to use * @param algorithm the algorithm for the given key version * @return this */ public WriterOptions setKeyVersion( String keyName, int version, EncryptionAlgorithm algorithm) { HadoopShims.KeyMetadata meta = new HadoopShims.KeyMetadata(keyName, version, algorithm); keyOverrides.put(keyName, meta); return this; } /** * Set the key provider for column encryption. * * @param provider the object that holds the master secrets * @return this */ public WriterOptions setKeyProvider(KeyProvider provider) { this.provider = provider; return this; } /** * Should the writer use the proleptic Gregorian calendar for times and dates. * * @param newValue true if we should use the proleptic calendar * @return this */ public WriterOptions setProlepticGregorian(boolean newValue) { this.useProlepticGregorian = newValue; return this; } public KeyProvider getKeyProvider() { return provider; } public boolean getBlockPadding() { return blockPaddingValue; } public long getBlockSize() { return blockSizeValue; } public String getBloomFilterColumns() { return bloomFilterColumns; } public boolean getOverwrite() { return overwrite; } public FileSystem getFileSystem() { return fileSystemValue; } public Configuration getConfiguration() { return configuration; } public TypeDescription getSchema() { return schema; } public long getStripeSize() { return stripeSizeValue; } public long getStripeRowCountValue() { return stripeRowCountValue; } public CompressionKind getCompress() { return compressValue; } public WriterCallback getCallback() { return callback; } public Version getVersion() { return versionValue; } public MemoryManager getMemoryManager() { return memoryManagerValue; } public int getBufferSize() { return bufferSizeValue; } public boolean isEnforceBufferSize() { return enforceBufferSize; } public int getRowIndexStride() { return rowIndexStrideValue; } public boolean isBuildIndex() { return buildIndex; } public CompressionStrategy getCompressionStrategy() { return compressionStrategy; } public EncodingStrategy getEncodingStrategy() { return encodingStrategy; } public ZstdCompressOptions getZstdCompressOptions() { return zstdCompressOptions; } public double getPaddingTolerance() { return paddingTolerance; } public double getBloomFilterFpp() { return bloomFilterFpp; } public BloomFilterVersion getBloomFilterVersion() { return bloomFilterVersion; } public PhysicalWriter getPhysicalWriter() { return physicalWriter; } public WriterVersion getWriterVersion() { return writerVersion; } public boolean getWriteVariableLengthBlocks() { return writeVariableLengthBlocks; } public HadoopShims getHadoopShims() { return shims; } public boolean getUseUTCTimestamp() { return useUTCTimestamp; } public String getDirectEncodingColumns() { return directEncodingColumns; } public String getEncryption() { return encryption; } public String getMasks() { return masks; } public Map getKeyOverrides() { return keyOverrides; } public boolean getProlepticGregorian() { return useProlepticGregorian; } } /** * Create a set of writer options based on a configuration. 
* * @param conf the configuration to use for values * @return A WriterOptions object that can be modified */ public static WriterOptions writerOptions(Configuration conf) { return new WriterOptions(null, conf); } /** * Create a set of write options based on a set of table properties and configuration. * * @param tableProperties the properties of the table * @param conf the configuration of the query * @return a WriterOptions object that can be modified */ public static WriterOptions writerOptions(Properties tableProperties, Configuration conf) { return new WriterOptions(tableProperties, conf); } private static MemoryManager memoryManager = null; private static synchronized MemoryManager getStaticMemoryManager(Configuration conf) { if (memoryManager == null) { memoryManager = new MemoryManagerImpl(conf); } return memoryManager; } /** * Create an ORC file writer. This is the public interface for creating writers going forward * and new options will only be added to this method. * * @param path filename to write to * @param opts the options * @return a new ORC file writer * @throws IOException */ public static Writer createWriter(Path path, WriterOptions opts) throws IOException { FileSystem fs = opts.getFileSystem() == null ? path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem(); switch (opts.getVersion()) { case V_0_11: case V_0_12: return new WriterImpl(fs, path, opts); case UNSTABLE_PRE_2_0: return new WriterImplV2(fs, path, opts); default: throw new IllegalArgumentException("Unknown version " + opts.getVersion()); } } /** * Do we understand the version in the reader? * * @param path the path of the file * @param reader the ORC file reader * @return is the version understood by this writer? */ static boolean understandFormat(Path path, Reader reader) { if (reader.getFileVersion() == Version.FUTURE) { LOG.info("Can't merge {} because it has a future version.", path); return false; } if (reader.getWriterVersion() == WriterVersion.FUTURE) { LOG.info("Can't merge {} because it has a future writerVersion.", path); return false; } return true; } private static boolean sameKeys(EncryptionKey[] first, EncryptionKey[] next) { if (first.length != next.length) { return false; } for (int k = 0; k < first.length; ++k) { if (!first[k].getKeyName().equals(next[k].getKeyName()) || first[k].getKeyVersion() != next[k].getKeyVersion() || first[k].getAlgorithm() != next[k].getAlgorithm()) { return false; } } return true; } private static boolean sameMasks(DataMaskDescription[] first, DataMaskDescription[] next) { if (first.length != next.length) { return false; } for (int k = 0; k < first.length; ++k) { if (!first[k].getName().equals(next[k].getName())) { return false; } String[] firstParam = first[k].getParameters(); String[] nextParam = next[k].getParameters(); if (firstParam.length != nextParam.length) { return false; } for (int p = 0; p < firstParam.length; ++p) { if (!firstParam[p].equals(nextParam[p])) { return false; } } TypeDescription[] firstRoots = first[k].getColumns(); TypeDescription[] nextRoots = next[k].getColumns(); if (firstRoots.length != nextRoots.length) { return false; } for (int r = 0; r < firstRoots.length; ++r) { if (firstRoots[r].getId() != nextRoots[r].getId()) { return false; } } } return true; } private static boolean sameVariants(EncryptionVariant[] first, EncryptionVariant[] next) { if (first.length != next.length) { return false; } for (int k = 0; k < first.length; ++k) { if ((first[k].getKeyDescription() == null) != (next[k].getKeyDescription() == null) || 
!first[k].getKeyDescription() .getKeyName() .equals(next[k].getKeyDescription().getKeyName()) || first[k].getRoot().getId() != next[k].getRoot().getId()) { return false; } } return true; } /** * Is the new reader compatible with the file that is being written? * * @param firstReader the first reader that others must match * @param userMetadata the user metadata * @param path the new path name for warning messages * @param reader the new reader * @return is the reader compatible with the previous ones? */ static boolean readerIsCompatible( Reader firstReader, Map userMetadata, Path path, Reader reader) { // now we have to check compatibility TypeDescription schema = firstReader.getSchema(); if (!reader.getSchema().equals(schema)) { LOG.info( "Can't merge {} because of different schemas {} vs {}", path, reader.getSchema(), schema); return false; } CompressionKind compression = firstReader.getCompressionKind(); if (reader.getCompressionKind() != compression) { LOG.info( "Can't merge {} because of different compression {} vs {}", path, reader.getCompressionKind(), compression); return false; } Version fileVersion = firstReader.getFileVersion(); if (reader.getFileVersion() != fileVersion) { LOG.info( "Can't merge {} because of different file versions {} vs {}", path, reader.getFileVersion(), fileVersion); return false; } WriterVersion writerVersion = firstReader.getWriterVersion(); if (reader.getWriterVersion() != writerVersion) { LOG.info( "Can't merge {} because of different writer versions {} vs {}", path, reader.getFileVersion(), fileVersion); return false; } int rowIndexStride = firstReader.getRowIndexStride(); if (reader.getRowIndexStride() != rowIndexStride) { LOG.info( "Can't merge {} because of different row index strides {} vs {}", path, reader.getRowIndexStride(), rowIndexStride); return false; } for (String key : reader.getMetadataKeys()) { ByteBuffer currentValue = userMetadata.get(key); if (currentValue != null) { ByteBuffer newValue = reader.getMetadataValue(key); if (!newValue.equals(currentValue)) { LOG.info("Can't merge {} because of different user metadata {}", path, key); return false; } } } if (!sameKeys(firstReader.getColumnEncryptionKeys(), reader.getColumnEncryptionKeys())) { LOG.info("Can't merge {} because it has different encryption keys", path); return false; } if (!sameMasks(firstReader.getDataMasks(), reader.getDataMasks())) { LOG.info("Can't merge {} because it has different encryption masks", path); return false; } if (!sameVariants(firstReader.getEncryptionVariants(), reader.getEncryptionVariants())) { LOG.info("Can't merge {} because it has different encryption variants", path); return false; } if (firstReader.writerUsedProlepticGregorian() != reader.writerUsedProlepticGregorian()) { LOG.info("Can't merge {} because it uses a different calendar", path); return false; } return true; } static void mergeMetadata(Map metadata, Reader reader) { for (String key : reader.getMetadataKeys()) { metadata.put(key, reader.getMetadataValue(key)); } } /** * Merges multiple ORC files that all have the same schema to produce a single ORC file. The * merge will reject files that aren't compatible with the merged file so the output list may be * shorter than the input list. The stripes are copied as serialized byte buffers. The user * metadata are merged and files that disagree on the value associated with a key will be * rejected. 
* * @param outputPath the output file * @param options the options for writing with although the options related to the input files' * encodings are overridden * @param inputFiles the list of files to merge * @return the list of files that were successfully merged * @throws IOException */ public static List mergeFiles( Path outputPath, WriterOptions options, List inputFiles) throws IOException { Writer output = null; final Configuration conf = options.getConfiguration(); KeyProvider keyProvider = options.getKeyProvider(); try { byte[] buffer = new byte[0]; Reader firstFile = null; List result = new ArrayList<>(inputFiles.size()); Map userMetadata = new HashMap<>(); int bufferSize = 0; for (Path input : inputFiles) { FileSystem fs = input.getFileSystem(conf); Reader reader = createReader( input, readerOptions(options.getConfiguration()) .filesystem(fs) .setKeyProvider(keyProvider)); if (!understandFormat(input, reader)) { continue; } else if (firstFile == null) { // if this is the first file that we are including, grab the values firstFile = reader; bufferSize = reader.getCompressionSize(); CompressionKind compression = reader.getCompressionKind(); options.bufferSize(bufferSize) .version(reader.getFileVersion()) .writerVersion(reader.getWriterVersion()) .compress(compression) .rowIndexStride(reader.getRowIndexStride()) .setSchema(reader.getSchema()); if (compression != CompressionKind.NONE) { options.enforceBufferSize().bufferSize(bufferSize); } mergeMetadata(userMetadata, reader); // ensure that the merged file uses the same key versions for (EncryptionKey key : reader.getColumnEncryptionKeys()) { options.setKeyVersion( key.getKeyName(), key.getKeyVersion(), key.getAlgorithm()); } output = createWriter(outputPath, options); } else if (!readerIsCompatible(firstFile, userMetadata, input, reader)) { continue; } else { mergeMetadata(userMetadata, reader); if (bufferSize < reader.getCompressionSize()) { bufferSize = reader.getCompressionSize(); ((WriterInternal) output).increaseCompressionSize(bufferSize); } } EncryptionVariant[] variants = reader.getEncryptionVariants(); List[] completeList = new List[variants.length + 1]; for (int v = 0; v < variants.length; ++v) { completeList[v] = reader.getVariantStripeStatistics(variants[v]); } completeList[completeList.length - 1] = reader.getVariantStripeStatistics(null); StripeStatistics[] stripeStats = new StripeStatistics[completeList.length]; try (FSDataInputStream inputStream = ((ReaderImpl) reader).takeFile()) { result.add(input); for (StripeInformation stripe : reader.getStripes()) { int length = (int) stripe.getLength(); if (buffer.length < length) { buffer = new byte[length]; } long offset = stripe.getOffset(); inputStream.readFully(offset, buffer, 0, length); int stripeId = (int) stripe.getStripeId(); for (int v = 0; v < completeList.length; ++v) { stripeStats[v] = completeList[v].get(stripeId); } output.appendStripe(buffer, 0, length, stripe, stripeStats); } } } if (output != null) { for (Map.Entry entry : userMetadata.entrySet()) { output.addUserMetadata(entry.getKey(), entry.getValue()); } output.close(); } return result; } catch (Throwable t) { if (output != null) { try { output.close(); } catch (Throwable ignore) { // PASS } try { FileSystem fs = options.getFileSystem() == null ? outputPath.getFileSystem(conf) : options.getFileSystem(); fs.delete(outputPath, false); } catch (Throwable ignore) { // PASS } } throw new IOException("Problem merging files into " + outputPath, t); } } }
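For orientation, a minimal usage sketch of the factory methods above (writerOptions/createWriter and readerOptions/createReader). It assumes the standard ORC core and storage-api vector classes; the file path, schema, and wrapper class name are made up for illustration and are not part of the original source.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

class OrcFileUsageSketch {
    static void writeAndRead() throws java.io.IOException {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.fromString("struct<x:bigint,y:string>");

        // Create a writer through the factory; the schema is required, the rest are tuning knobs.
        Writer writer = OrcFile.createWriter(
                new Path("/tmp/example.orc"),
                OrcFile.writerOptions(conf)
                        .setSchema(schema)
                        .compress(CompressionKind.ZSTD)
                        .stripeSize(64L * 1024 * 1024));

        // Fill a vectorized batch and hand it to the writer.
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        BytesColumnVector y = (BytesColumnVector) batch.cols[1];
        for (int r = 0; r < 10; ++r) {
            int row = batch.size++;
            x.vector[row] = r;
            y.setVal(row, ("row-" + r).getBytes(StandardCharsets.UTF_8));
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        if (batch.size != 0) {
            writer.addRowBatch(batch);
        }
        writer.close();

        // Re-open the same file through the reader factory.
        Reader reader = OrcFile.createReader(
                new Path("/tmp/example.orc"), OrcFile.readerOptions(conf));
        System.out.println("rows written: " + reader.getNumberOfRows());
    }
}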