/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/* This file is based on source code from the ORC Project (http://orc.apache.org/), licensed by the Apache
* Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
* additional information regarding copyright ownership. */
/**
* Define the configuration properties that ORC understands.
*
* NOTE: This file was copied and modified to support zstd-jni. That feature is only available in
* ORC 2.0, but ORC 2.0 requires JDK 17 and we still need to support JDK 8.
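*
* <p>A minimal usage sketch (assuming a Hadoop {@code Configuration} instance is at hand):
* <pre>{@code
*   Configuration conf = new Configuration();
*   OrcConf.COMPRESS.setString(conf, "ZSTD");              // sets "orc.compress"
*   long stripeSize = OrcConf.STRIPE_SIZE.getLong(conf);   // 64 MiB unless overridden
*   boolean padding = OrcConf.BLOCK_PADDING.getBoolean(conf);
* }</pre>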
*/
public enum OrcConf {
STRIPE_SIZE(
"orc.stripe.size",
"hive.exec.orc.default.stripe.size",
64L * 1024 * 1024,
"Define the default ORC stripe size, in bytes."),
STRIPE_ROW_COUNT(
"orc.stripe.row.count",
"orc.stripe.row.count",
Integer.MAX_VALUE,
"This value limit the row count in one stripe. \n"
+ "The number of stripe rows can be controlled at \n"
+ "(0, \"orc.stripe.row.count\" + max(batchSize, \"orc.rows.between.memory.checks\"))"),
BLOCK_SIZE(
"orc.block.size",
"hive.exec.orc.default.block.size",
256L * 1024 * 1024,
"Define the default file system block size for ORC files."),
ENABLE_INDEXES(
"orc.create.index",
"orc.create.index",
true,
"Should the ORC writer create indexes as part of the file."),
ROW_INDEX_STRIDE(
"orc.row.index.stride",
"hive.exec.orc.default.row.index.stride",
10000,
"Define the default ORC index stride in number of rows. (Stride is the\n"
+ " number of rows an index entry represents.)"),
BUFFER_SIZE(
"orc.compress.size",
"hive.exec.orc.default.buffer.size",
256 * 1024,
"Define the default ORC buffer size, in bytes."),
BASE_DELTA_RATIO(
"orc.base.delta.ratio",
"hive.exec.orc.base.delta.ratio",
8,
"The ratio of base writer and delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."),
BLOCK_PADDING(
"orc.block.padding",
"hive.exec.orc.default.block.padding",
true,
"Define whether stripes should be padded to the HDFS block boundaries."),
COMPRESS(
"orc.compress",
"hive.exec.orc.default.compress",
"ZLIB",
"Define the default compression codec for ORC file"),
WRITE_FORMAT(
"orc.write.format",
"hive.exec.orc.write.format",
"0.12",
"Define the version of the file to write. Possible values are 0.11 and\n"
+ " 0.12. If this parameter is not defined, ORC will use the run\n"
+ " length encoding (RLE) introduced in Hive 0.12."),
ENFORCE_COMPRESSION_BUFFER_SIZE(
"orc.buffer.size.enforce",
"hive.exec.orc.buffer.size.enforce",
false,
"Defines whether to enforce ORC compression buffer size."),
ENCODING_STRATEGY(
"orc.encoding.strategy",
"hive.exec.orc.encoding.strategy",
"SPEED",
"Define the encoding strategy to use while writing data. Changing this\n"
+ "will only affect the light weight encoding for integers. This\n"
+ "flag will not change the compression level of higher level\n"
+ "compression codec (like ZLIB)."),
COMPRESSION_STRATEGY(
"orc.compression.strategy",
"hive.exec.orc.compression.strategy",
"SPEED",
"Define the compression strategy to use while writing data.\n"
+ "This changes the compression level of higher level compression\n"
+ "codec (like ZLIB)."),
COMPRESSION_ZSTD_LEVEL(
"orc.compression.zstd.level",
"hive.exec.orc.compression.zstd.level",
3,
"Define the compression level to use with ZStandard codec "
+ "while writing data. The valid range is 1~22"),
COMPRESSION_ZSTD_WINDOWLOG(
"orc.compression.zstd.windowlog",
"hive.exec.orc.compression.zstd.windowlog",
0,
"Set the maximum allowed back-reference distance for "
+ "ZStandard codec, expressed as power of 2."),
BLOCK_PADDING_TOLERANCE(
"orc.block.padding.tolerance",
"hive.exec.orc.block.padding.tolerance",
0.05,
"Define the tolerance for block padding as a decimal fraction of\n"
+ "stripe size (for example, the default value 0.05 is 5% of the\n"
+ "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n"
+ "blocks, the default block padding tolerance of 5% will\n"
+ "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n"
+ "In that case, if the available size within the block is more than\n"
+ "3.2Mb, a new smaller stripe will be inserted to fit within that\n"
+ "space. This will make sure that no stripe written will block\n"
+ " boundaries and cause remote reads within a node local task."),
BLOOM_FILTER_FPP(
"orc.bloom.filter.fpp",
"orc.default.bloom.fpp",
0.01,
"Define the default false positive probability for bloom filters."),
USE_ZEROCOPY(
"orc.use.zerocopy",
"hive.exec.orc.zerocopy",
false,
"Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
SKIP_CORRUPT_DATA(
"orc.skip.corrupt.data",
"hive.exec.orc.skip.corrupt.data",
false,
"If ORC reader encounters corrupt data, this value will be used to\n"
+ "determine whether to skip the corrupt data or throw exception.\n"
+ "The default behavior is to throw exception."),
TOLERATE_MISSING_SCHEMA(
"orc.tolerate.missing.schema",
"hive.exec.orc.tolerate.missing.schema",
true,
"Writers earlier than HIVE-4243 may have inaccurate schema metadata.\n"
+ "This setting will enable best effort schema evolution rather\n"
+ "than rejecting mismatched schemas"),
MEMORY_POOL(
"orc.memory.pool",
"hive.exec.orc.memory.pool",
0.5,
"Maximum fraction of heap that can be used by ORC file writers"),
DICTIONARY_KEY_SIZE_THRESHOLD(
"orc.dictionary.key.threshold",
"hive.exec.orc.dictionary.key.size.threshold",
0.8,
"If the number of distinct keys in a dictionary is greater than this\n"
+ "fraction of the total number of non-null rows, turn off \n"
+ "dictionary encoding. Use 1 to always use dictionary encoding."),
ROW_INDEX_STRIDE_DICTIONARY_CHECK(
"orc.dictionary.early.check",
"hive.orc.row.index.stride.dictionary.check",
true,
"If enabled dictionary check will happen after first row index stride\n"
+ "(default 10000 rows) else dictionary check will happen before\n"
+ "writing first stripe. In both cases, the decision to use\n"
+ "dictionary or not will be retained thereafter."),
DICTIONARY_IMPL(
"orc.dictionary.implementation",
"orc.dictionary.implementation",
"rbtree",
"the implementation for the dictionary used for string-type column encoding.\n"
+ "The choices are:\n"
+ " rbtree - use red-black tree as the implementation for the dictionary.\n"
+ " hash - use hash table as the implementation for the dictionary."),
BLOOM_FILTER_COLUMNS(
"orc.bloom.filter.columns",
"orc.bloom.filter.columns",
"",
"List of columns to create bloom filters for when writing."),
BLOOM_FILTER_WRITE_VERSION(
"orc.bloom.filter.write.version",
"orc.bloom.filter.write.version",
OrcFile.BloomFilterVersion.UTF8.toString(),
"Which version of the bloom filters should we write.\n"
+ "The choices are:\n"
+ " original - writes two versions of the bloom filters for use by\n"
+ " both old and new readers.\n"
+ " utf8 - writes just the new bloom filters."),
IGNORE_NON_UTF8_BLOOM_FILTERS(
"orc.bloom.filter.ignore.non-utf8",
"orc.bloom.filter.ignore.non-utf8",
false,
"Should the reader ignore the obsolete non-UTF8 bloom filters."),
MAX_FILE_LENGTH(
"orc.max.file.length",
"orc.max.file.length",
Long.MAX_VALUE,
"The maximum size of the file to read for finding the file tail. This\n"
+ "is primarily used for streaming ingest to read intermediate\n"
+ "footers while the file is still open"),
MAPRED_INPUT_SCHEMA(
"orc.mapred.input.schema",
null,
null,
"The schema that the user desires to read. The values are\n"
+ "interpreted using TypeDescription.fromString."),
MAPRED_SHUFFLE_KEY_SCHEMA(
"orc.mapred.map.output.key.schema",
null,
null,
"The schema of the MapReduce shuffle key. The values are\n"
+ "interpreted using TypeDescription.fromString."),
MAPRED_SHUFFLE_VALUE_SCHEMA(
"orc.mapred.map.output.value.schema",
null,
null,
"The schema of the MapReduce shuffle value. The values are\n"
+ "interpreted using TypeDescription.fromString."),
MAPRED_OUTPUT_SCHEMA(
"orc.mapred.output.schema",
null,
null,
"The schema that the user desires to write. The values are\n"
+ "interpreted using TypeDescription.fromString."),
INCLUDE_COLUMNS(
"orc.include.columns",
"hive.io.file.readcolumn.ids",
null,
"The list of comma separated column ids that should be read with 0\n"
+ "being the first column, 1 being the next, and so on. ."),
KRYO_SARG(
"orc.kryo.sarg",
"orc.kryo.sarg",
null,
"The kryo and base64 encoded SearchArgument for predicate pushdown."),
KRYO_SARG_BUFFER(
"orc.kryo.sarg.buffer",
null,
8192,
"The kryo buffer size for SearchArgument for predicate pushdown."),
SARG_COLUMNS(
"orc.sarg.column.names",
"orc.sarg.column.names",
null,
"The list of column names for the SearchArgument."),
FORCE_POSITIONAL_EVOLUTION(
"orc.force.positional.evolution",
"orc.force.positional.evolution",
false,
"Require schema evolution to match the top level columns using position\n"
+ "rather than column names. This provides backwards compatibility with\n"
+ "Hive 2.1."),
FORCE_POSITIONAL_EVOLUTION_LEVEL(
"orc.force.positional.evolution.level",
"orc.force.positional.evolution.level",
1,
"Require schema evolution to match the the defined no. of level columns using position\n"
+ "rather than column names. This provides backwards compatibility with Hive 2.1."),
ROWS_BETWEEN_CHECKS(
"orc.rows.between.memory.checks",
"orc.rows.between.memory.checks",
5000,
"How often should MemoryManager check the memory sizes? Measured in rows\n"
+ "added to all of the writers. Valid range is [1,10000] and is primarily meant for"
+ "testing. Setting this too low may negatively affect performance."
+ " Use orc.stripe.row.count instead if the value larger than orc.stripe.row.count."),
OVERWRITE_OUTPUT_FILE(
"orc.overwrite.output.file",
"orc.overwrite.output.file",
false,
"A boolean flag to enable overwriting of the output file if it already exists.\n"),
IS_SCHEMA_EVOLUTION_CASE_SENSITIVE(
"orc.schema.evolution.case.sensitive",
"orc.schema.evolution.case.sensitive",
true,
"A boolean flag to determine if the comparision of field names "
+ "in schema evolution is case sensitive .\n"),
ALLOW_SARG_TO_FILTER(
"orc.sarg.to.filter",
"orc.sarg.to.filter",
false,
"A boolean flag to determine if a SArg is allowed to become a filter"),
READER_USE_SELECTED(
"orc.filter.use.selected",
"orc.filter.use.selected",
false,
"A boolean flag to determine if the selected vector is supported by\n"
+ "the reading application. If false, the output of the ORC reader "
+ "must have the filter\n"
+ "reapplied to avoid using unset values in the unselected rows.\n"
+ "If unsure please leave this as false."),
ALLOW_PLUGIN_FILTER(
"orc.filter.plugin",
"orc.filter.plugin",
false,
"Enables the use of plugin filters during read. The plugin filters "
+ "are discovered against the service "
+ "org.apache.orc.filter.PluginFilterService, if multiple filters are "
+ "determined, they are combined using AND. The order of application is "
+ "non-deterministic and the filter functionality should not depend on the "
+ "order of application."),
WRITE_VARIABLE_LENGTH_BLOCKS(
"orc.write.variable.length.blocks",
null,
false,
"A boolean flag as to whether the ORC writer should write variable length\n"
+ "HDFS blocks."),
DIRECT_ENCODING_COLUMNS(
"orc.column.encoding.direct",
"orc.column.encoding.direct",
"",
"Comma-separated list of columns for which dictionary encoding is to be skipped."),
// Some JVMs don't allow array creation of size Integer.MAX_VALUE, so the chunk size is
// slightly less than max int.
ORC_MAX_DISK_RANGE_CHUNK_LIMIT(
"orc.max.disk.range.chunk.limit",
"hive.exec.orc.max.disk.range.chunk.limit",
Integer.MAX_VALUE - 1024,
"When reading stripes >2GB, specify max limit for the chunk size."),
ORC_MIN_DISK_SEEK_SIZE(
"orc.min.disk.seek.size",
"orc.min.disk.seek.size",
0,
"When determining contiguous reads, gaps within this size are "
+ "read contiguously and not seeked. Default value of zero disables this "
+ "optimization"),
ORC_MIN_DISK_SEEK_SIZE_TOLERANCE(
"orc.min.disk.seek.size.tolerance",
"orc.min.disk.seek.size.tolerance",
0.00,
"Define the tolerance for for extra bytes read as a result of "
+ "orc.min.disk.seek.size. If the "
+ "(bytesRead - bytesNeeded) / bytesNeeded is greater than this "
+ "threshold then extra work is performed to drop the extra bytes from "
+ "memory after the read."),
ENCRYPTION("orc.encrypt", "orc.encrypt", null, "The list of keys and columns to encrypt with"),
DATA_MASK("orc.mask", "orc.mask", null, "The masks to apply to the encrypted columns"),
KEY_PROVIDER(
"orc.key.provider",
"orc.key.provider",
"hadoop",
"The kind of KeyProvider to use for encryption."),
PROLEPTIC_GREGORIAN(
"orc.proleptic.gregorian",
"orc.proleptic.gregorian",
false,
"Should we read and write dates & times using the proleptic Gregorian calendar\n"
+ "instead of the hybrid Julian Gregorian? Hive before 3.1 and Spark before 3.0\n"
+ "used hybrid."),
PROLEPTIC_GREGORIAN_DEFAULT(
"orc.proleptic.gregorian.default",
"orc.proleptic.gregorian.default",
false,
"This value controls whether pre-ORC 27 files are using the hybrid or proleptic\n"
+ "calendar. Only Hive 3.1 and the C++ library wrote using the proleptic, so hybrid\n"
+ "is the default."),
ROW_BATCH_SIZE(
"orc.row.batch.size",
"orc.row.batch.size",
1024,
"The number of rows to include in a orc vectorized reader batch. "
+ "The value should be carefully chosen to minimize overhead and avoid OOMs in reading data."),
ROW_BATCH_CHILD_LIMIT(
"orc.row.child.limit",
"orc.row.child.limit",
1024 * 32,
"The maximum number of child elements to buffer before "
+ "the ORC row writer writes the batch to the file.");
private final String attribute;
private final String hiveConfName;
private final Object defaultValue;
private final String description;
OrcConf(String attribute, String hiveConfName, Object defaultValue, String description) {
this.attribute = attribute;
this.hiveConfName = hiveConfName;
this.defaultValue = defaultValue;
this.description = description;
}
public String getAttribute() {
return attribute;
}
public String getHiveConfName() {
return hiveConfName;
}
public Object getDefaultValue() {
return defaultValue;
}
public String getDescription() {
return description;
}
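/**
 * Resolve the raw string value for this setting. The lookup order is: the table
 * {@link Properties} (if provided), then the ORC attribute name in the
 * {@link Configuration}, and finally the legacy Hive configuration name.
 * Returns {@code null} when none of them define the property.
 */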
private String lookupValue(Properties tbl, Configuration conf) {
String result = null;
if (tbl != null) {
result = tbl.getProperty(attribute);
}
if (result == null && conf != null) {
result = conf.get(attribute);
if (result == null && hiveConfName != null) {
result = conf.get(hiveConfName);
}
}
return result;
}
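// The typed accessors below share one pattern: parse the resolved string when it is
// present, otherwise fall back to this constant's default value.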
public int getInt(Properties tbl, Configuration conf) {
String value = lookupValue(tbl, conf);
if (value != null) {
return Integer.parseInt(value);
}
return ((Number) defaultValue).intValue();
}
public int getInt(Configuration conf) {
return getInt(null, conf);
}
/**
* @deprecated Use {@link #getInt(Configuration)} instead. This method was incorrectly added and
* shouldn't be used anymore.
*/
@Deprecated
public void getInt(Configuration conf, int value) {
// noop
}
public void setInt(Configuration conf, int value) {
conf.setInt(attribute, value);
}
public long getLong(Properties tbl, Configuration conf) {
String value = lookupValue(tbl, conf);
if (value != null) {
return Long.parseLong(value);
}
return ((Number) defaultValue).longValue();
}
public long getLong(Configuration conf) {
return getLong(null, conf);
}
public void setLong(Configuration conf, long value) {
conf.setLong(attribute, value);
}
public String getString(Properties tbl, Configuration conf) {
String value = lookupValue(tbl, conf);
return value == null ? (String) defaultValue : value;
}
public String getString(Configuration conf) {
return getString(null, conf);
}
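/**
 * Splits the resolved value on commas, trims each entry, and drops empty entries,
 * so " a, b ,," yields ["a", "b"]. Returns an empty list when the property is
 * unset or blank.
 */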
public List<String> getStringAsList(Configuration conf) {
String value = getString(null, conf);
List<String> confList = new ArrayList<>();
if (StringUtils.isEmpty(value)) {
return confList;
}
for (String str : value.split(",")) {
String trimStr = StringUtils.trim(str);
if (StringUtils.isNotEmpty(trimStr)) {
confList.add(trimStr);
}
}
return confList;
}
public void setString(Configuration conf, String value) {
conf.set(attribute, value);
}
public boolean getBoolean(Properties tbl, Configuration conf) {
String value = lookupValue(tbl, conf);
if (value != null) {
return Boolean.parseBoolean(value);
}
return (Boolean) defaultValue;
}
public boolean getBoolean(Configuration conf) {
return getBoolean(null, conf);
}
public void setBoolean(Configuration conf, boolean value) {
conf.setBoolean(attribute, value);
}
public double getDouble(Properties tbl, Configuration conf) {
String value = lookupValue(tbl, conf);
if (value != null) {
return Double.parseDouble(value);
}
return ((Number) defaultValue).doubleValue();
}
public double getDouble(Configuration conf) {
return getDouble(null, conf);
}
public void setDouble(Configuration conf, double value) {
conf.setDouble(attribute, value);
}
}