
org.apache.orc.OrcConf

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/* This file is based on source code from the ORC Project (http://orc.apache.org/), licensed by the Apache
 * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. */

/**
 * Define the configuration properties that Orc understands.
 *
 *
 * NOTE: The file was copied and modified to support zstd-jni. This feature is only supported in
 * ORC 2.0, but 2.0 only supports JDK17. We need to support JDK8.
 */
public enum OrcConf {
  STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size",
      64L * 1024 * 1024,
      "Define the default ORC stripe size, in bytes."),
  STRIPE_ROW_COUNT("orc.stripe.row.count", "orc.stripe.row.count",
      Integer.MAX_VALUE,
      "This value limit the row count in one stripe. \n" +
      "The number of stripe rows can be controlled at \n" +
      "(0, \"orc.stripe.row.count\" + max(batchSize, \"orc.rows.between.memory.checks\"))"),
  BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size",
      256L * 1024 * 1024,
      "Define the default file system block size for ORC files."),
  ENABLE_INDEXES("orc.create.index", "orc.create.index", true,
      "Should the ORC writer create indexes as part of the file."),
  ROW_INDEX_STRIDE("orc.row.index.stride", "hive.exec.orc.default.row.index.stride", 10000,
      "Define the default ORC index stride in number of rows. (Stride is the\n" +
      " number of rows an index entry represents.)"),
  BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size", 256 * 1024,
      "Define the default ORC buffer size, in bytes."),
  BASE_DELTA_RATIO("orc.base.delta.ratio", "hive.exec.orc.base.delta.ratio", 8,
      "The ratio of base writer and delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."),
  BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding", true,
      "Define whether stripes should be padded to the HDFS block boundaries."),
  COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
      "Define the default compression codec for ORC file"),
  WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
      "Define the version of the file to write. Possible values are 0.11 and\n" +
      " 0.12. If this parameter is not defined, ORC will use the run\n" +
      " length encoding (RLE) introduced in Hive 0.12."),
  ENFORCE_COMPRESSION_BUFFER_SIZE("orc.buffer.size.enforce", "hive.exec.orc.buffer.size.enforce",
      false,
      "Defines whether to enforce ORC compression buffer size."),
  ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy", "SPEED",
      "Define the encoding strategy to use while writing data. Changing this\n" +
      "will only affect the light weight encoding for integers. This\n" +
      "flag will not change the compression level of higher level\n" +
      "compression codec (like ZLIB)."),
  COMPRESSION_STRATEGY("orc.compression.strategy", "hive.exec.orc.compression.strategy", "SPEED",
      "Define the compression strategy to use while writing data.\n" +
      "This changes the compression level of higher level compression\n" +
      "codec (like ZLIB)."),
  COMPRESSION_ZSTD_LEVEL("orc.compression.zstd.level", "hive.exec.orc.compression.zstd.level", 3,
      "Define the compression level to use with ZStandard codec " +
      "while writing data. The valid range is 1~22"),
  COMPRESSION_ZSTD_WINDOWLOG("orc.compression.zstd.windowlog",
      "hive.exec.orc.compression.zstd.windowlog", 0,
      "Set the maximum allowed back-reference distance for " +
      "ZStandard codec, expressed as power of 2."),
  BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance", "hive.exec.orc.block.padding.tolerance",
      0.05,
      "Define the tolerance for block padding as a decimal fraction of\n" +
      "stripe size (for example, the default value 0.05 is 5% of the\n" +
      "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" +
      "blocks, the default block padding tolerance of 5% will\n" +
      "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" +
      "In that case, if the available size within the block is more than\n" +
      "3.2Mb, a new smaller stripe will be inserted to fit within that\n" +
      "space. This will make sure that no stripe written will block\n" +
      " boundaries and cause remote reads within a node local task."),
  BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.01,
      "Define the default false positive probability for bloom filters."),
  USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false,
      "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
  SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data", false,
      "If ORC reader encounters corrupt data, this value will be used to\n" +
      "determine whether to skip the corrupt data or throw exception.\n" +
      "The default behavior is to throw exception."),
  TOLERATE_MISSING_SCHEMA("orc.tolerate.missing.schema", "hive.exec.orc.tolerate.missing.schema",
      true,
      "Writers earlier than HIVE-4243 may have inaccurate schema metadata.\n" +
      "This setting will enable best effort schema evolution rather\n" +
      "than rejecting mismatched schemas"),
  MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5,
      "Maximum fraction of heap that can be used by ORC file writers"),
  DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold",
      "hive.exec.orc.dictionary.key.size.threshold", 0.8,
      "If the number of distinct keys in a dictionary is greater than this\n" +
      "fraction of the total number of non-null rows, turn off \n" +
      "dictionary encoding. Use 1 to always use dictionary encoding."),
  ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check",
      "hive.orc.row.index.stride.dictionary.check", true,
      "If enabled dictionary check will happen after first row index stride\n" +
      "(default 10000 rows) else dictionary check will happen before\n" +
      "writing first stripe. In both cases, the decision to use\n" +
      "dictionary or not will be retained thereafter."),
  DICTIONARY_IMPL("orc.dictionary.implementation", "orc.dictionary.implementation", "rbtree",
      "the implementation for the dictionary used for string-type column encoding.\n" +
      "The choices are:\n" +
      " rbtree - use red-black tree as the implementation for the dictionary.\n" +
      " hash - use hash table as the implementation for the dictionary."),
  BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns", "",
      "List of columns to create bloom filters for when writing."),
  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version", "orc.bloom.filter.write.version",
      OrcFile.BloomFilterVersion.UTF8.toString(),
      "Which version of the bloom filters should we write.\n" +
      "The choices are:\n" +
      " original - writes two versions of the bloom filters for use by\n" +
      " both old and new readers.\n" +
      " utf8 - writes just the new bloom filters."),
  IGNORE_NON_UTF8_BLOOM_FILTERS("orc.bloom.filter.ignore.non-utf8",
      "orc.bloom.filter.ignore.non-utf8", false,
      "Should the reader ignore the obsolete non-UTF8 bloom filters."),
  MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE,
      "The maximum size of the file to read for finding the file tail. This\n" +
      "is primarily used for streaming ingest to read intermediate\n" +
      "footers while the file is still open"),
  MAPRED_INPUT_SCHEMA("orc.mapred.input.schema", null, null,
      "The schema that the user desires to read. The values are\n" +
      "interpreted using TypeDescription.fromString."),
  MAPRED_SHUFFLE_KEY_SCHEMA("orc.mapred.map.output.key.schema", null, null,
      "The schema of the MapReduce shuffle key. The values are\n" +
      "interpreted using TypeDescription.fromString."),
  MAPRED_SHUFFLE_VALUE_SCHEMA("orc.mapred.map.output.value.schema", null, null,
      "The schema of the MapReduce shuffle value. The values are\n" +
      "interpreted using TypeDescription.fromString."),
  MAPRED_OUTPUT_SCHEMA("orc.mapred.output.schema", null, null,
      "The schema that the user desires to write. The values are\n" +
      "interpreted using TypeDescription.fromString."),
  INCLUDE_COLUMNS("orc.include.columns", "hive.io.file.readcolumn.ids", null,
      "The list of comma separated column ids that should be read with 0\n" +
      "being the first column, 1 being the next, and so on. ."),
  KRYO_SARG("orc.kryo.sarg", "orc.kryo.sarg", null,
      "The kryo and base64 encoded SearchArgument for predicate pushdown."),
  KRYO_SARG_BUFFER("orc.kryo.sarg.buffer", null, 8192,
      "The kryo buffer size for SearchArgument for predicate pushdown."),
  SARG_COLUMNS("orc.sarg.column.names", "orc.sarg.column.names", null,
      "The list of column names for the SearchArgument."),
  FORCE_POSITIONAL_EVOLUTION("orc.force.positional.evolution", "orc.force.positional.evolution",
      false,
      "Require schema evolution to match the top level columns using position\n" +
      "rather than column names. This provides backwards compatibility with\n" +
      "Hive 2.1."),
  FORCE_POSITIONAL_EVOLUTION_LEVEL("orc.force.positional.evolution.level",
      "orc.force.positional.evolution.level", 1,
      "Require schema evolution to match the the defined no. of level columns using position\n" +
      "rather than column names. This provides backwards compatibility with Hive 2.1."),
  ROWS_BETWEEN_CHECKS("orc.rows.between.memory.checks", "orc.rows.between.memory.checks", 5000,
      "How often should MemoryManager check the memory sizes? Measured in rows\n" +
      "added to all of the writers. Valid range is [1,10000] and is primarily meant for" +
      "testing. Setting this too low may negatively affect performance." +
      " Use orc.stripe.row.count instead if the value larger than orc.stripe.row.count."),
  OVERWRITE_OUTPUT_FILE("orc.overwrite.output.file", "orc.overwrite.output.file", false,
      "A boolean flag to enable overwriting of the output file if it already exists.\n"),
  IS_SCHEMA_EVOLUTION_CASE_SENSITIVE("orc.schema.evolution.case.sensitive",
      "orc.schema.evolution.case.sensitive", true,
      "A boolean flag to determine if the comparision of field names " +
      "in schema evolution is case sensitive .\n"),
  ALLOW_SARG_TO_FILTER("orc.sarg.to.filter", "orc.sarg.to.filter", false,
      "A boolean flag to determine if a SArg is allowed to become a filter"),
  READER_USE_SELECTED("orc.filter.use.selected", "orc.filter.use.selected", false,
      "A boolean flag to determine if the selected vector is supported by\n" +
      "the reading application. If false, the output of the ORC reader " +
      "must have the filter\n" +
      "reapplied to avoid using unset values in the unselected rows.\n" +
      "If unsure please leave this as false."),
  ALLOW_PLUGIN_FILTER("orc.filter.plugin", "orc.filter.plugin", false,
      "Enables the use of plugin filters during read. The plugin filters " +
      "are discovered against the service " +
      "org.apache.orc.filter.PluginFilterService, if multiple filters are " +
      "determined, they are combined using AND. The order of application is " +
      "non-deterministic and the filter functionality should not depend on the " +
      "order of application."),
  WRITE_VARIABLE_LENGTH_BLOCKS("orc.write.variable.length.blocks", null, false,
      "A boolean flag as to whether the ORC writer should write variable length\n" +
      "HDFS blocks."),
  DIRECT_ENCODING_COLUMNS("orc.column.encoding.direct", "orc.column.encoding.direct", "",
      "Comma-separated list of columns for which dictionary encoding is to be skipped."),
  // some JVM doesn't allow array creation of size Integer.MAX_VALUE, so chunk size is slightly
  // less than max int
  ORC_MAX_DISK_RANGE_CHUNK_LIMIT("orc.max.disk.range.chunk.limit",
      "hive.exec.orc.max.disk.range.chunk.limit", Integer.MAX_VALUE - 1024,
      "When reading stripes >2GB, specify max limit for the chunk size."),
  ORC_MIN_DISK_SEEK_SIZE("orc.min.disk.seek.size", "orc.min.disk.seek.size", 0,
      "When determining contiguous reads, gaps within this size are " +
      "read contiguously and not seeked. Default value of zero disables this " +
      "optimization"),
  ORC_MIN_DISK_SEEK_SIZE_TOLERANCE("orc.min.disk.seek.size.tolerance",
      "orc.min.disk.seek.size.tolerance", 0.00,
      "Define the tolerance for for extra bytes read as a result of " +
      "orc.min.disk.seek.size. If the " +
      "(bytesRead - bytesNeeded) / bytesNeeded is greater than this " +
      "threshold then extra work is performed to drop the extra bytes from " +
      "memory after the read."),
  ENCRYPTION("orc.encrypt", "orc.encrypt", null,
      "The list of keys and columns to encrypt with"),
  DATA_MASK("orc.mask", "orc.mask", null,
      "The masks to apply to the encrypted columns"),
  KEY_PROVIDER("orc.key.provider", "orc.key.provider", "hadoop",
      "The kind of KeyProvider to use for encryption."),
  PROLEPTIC_GREGORIAN("orc.proleptic.gregorian", "orc.proleptic.gregorian", false,
      "Should we read and write dates & times using the proleptic Gregorian calendar\n" +
      "instead of the hybrid Julian Gregorian? Hive before 3.1 and Spark before 3.0\n" +
      "used hybrid."),
  PROLEPTIC_GREGORIAN_DEFAULT("orc.proleptic.gregorian.default", "orc.proleptic.gregorian.default",
      false,
      "This value controls whether pre-ORC 27 files are using the hybrid or proleptic\n" +
      "calendar. Only Hive 3.1 and the C++ library wrote using the proleptic, so hybrid\n" +
      "is the default."),
  ROW_BATCH_SIZE("orc.row.batch.size", "orc.row.batch.size", 1024,
      "The number of rows to include in a orc vectorized reader batch. " +
      "The value should be carefully chosen to minimize overhead and avoid OOMs in reading data."),
  ROW_BATCH_CHILD_LIMIT("orc.row.child.limit", "orc.row.child.limit", 1024 * 32,
      "The maximum number of child elements to buffer before " +
      "the ORC row writer writes the batch to the file.");

  private final String attribute;
  private final String hiveConfName;
  private final Object defaultValue;
  private final String description;

  OrcConf(String attribute, String hiveConfName, Object defaultValue, String description) {
    this.attribute = attribute;
    this.hiveConfName = hiveConfName;
    this.defaultValue = defaultValue;
    this.description = description;
  }

  public String getAttribute() {
    return attribute;
  }

  public String getHiveConfName() {
    return hiveConfName;
  }

  public Object getDefaultValue() {
    return defaultValue;
  }

  public String getDescription() {
    return description;
  }

  private String lookupValue(Properties tbl, Configuration conf) {
    String result = null;
    if (tbl != null) {
      result = tbl.getProperty(attribute);
    }
    if (result == null && conf != null) {
      result = conf.get(attribute);
      if (result == null && hiveConfName != null) {
        result = conf.get(hiveConfName);
      }
    }
    return result;
  }

  public int getInt(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Integer.parseInt(value);
    }
    return ((Number) defaultValue).intValue();
  }

  public int getInt(Configuration conf) {
    return getInt(null, conf);
  }

  /**
   * @deprecated Use {@link #getInt(Configuration)} instead. This method was incorrectly added and
   *     shouldn't be used anymore.
   */
  @Deprecated
  public void getInt(Configuration conf, int value) {
    // noop
  }

  public void setInt(Configuration conf, int value) {
    conf.setInt(attribute, value);
  }

  public long getLong(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Long.parseLong(value);
    }
    return ((Number) defaultValue).longValue();
  }

  public long getLong(Configuration conf) {
    return getLong(null, conf);
  }

  public void setLong(Configuration conf, long value) {
    conf.setLong(attribute, value);
  }

  public String getString(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    return value == null ? (String) defaultValue : value;
  }

  public String getString(Configuration conf) {
    return getString(null, conf);
  }

  public List<String> getStringAsList(Configuration conf) {
    String value = getString(null, conf);
    List<String> confList = new ArrayList<>();
    if (StringUtils.isEmpty(value)) {
      return confList;
    }
    for (String str : value.split(",")) {
      String trimStr = StringUtils.trim(str);
      if (StringUtils.isNotEmpty(trimStr)) {
        confList.add(trimStr);
      }
    }
    return confList;
  }

  public void setString(Configuration conf, String value) {
    conf.set(attribute, value);
  }

  public boolean getBoolean(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Boolean.parseBoolean(value);
    }
    return (Boolean) defaultValue;
  }

  public boolean getBoolean(Configuration conf) {
    return getBoolean(null, conf);
  }

  public void setBoolean(Configuration conf, boolean value) {
    conf.setBoolean(attribute, value);
  }

  public double getDouble(Properties tbl, Configuration conf) {
    String value = lookupValue(tbl, conf);
    if (value != null) {
      return Double.parseDouble(value);
    }
    return ((Number) defaultValue).doubleValue();
  }

  public double getDouble(Configuration conf) {
    return getDouble(null, conf);
  }

  public void setDouble(Configuration conf, double value) {
    conf.setDouble(attribute, value);
  }
}




