org.apache.hadoop.hive.serde2.ColumnProjectionUtils Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.HiveStringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
/**
 * Static helpers that record in a {@link Configuration} which columns (by id, by name and by
 * nested path) a reader should project, and that read those settings back.
 *
 * <p>All state lives in the supplied {@link Configuration}; this class holds none of its own
 * and cannot be instantiated.
 */
public final class ColumnProjectionUtils {
  public static final Logger LOG = LoggerFactory.getLogger(ColumnProjectionUtils.class);

  /** Key holding the comma-separated list of column ids to read. */
  public static final String READ_COLUMN_IDS_CONF_STR = "hive.io.file.readcolumn.ids";

  /**
   * The nested column path is the string from the root to the leaf, e.g. for
   * {@code c:struct<a:string,b:string>} the path of column {@code a} is {@code c.a} and the
   * path of column {@code b} is {@code c.b}.
   */
  public static final String READ_NESTED_COLUMN_PATH_CONF_STR =
      "hive.io.file.readNestedColumn.paths";

  /** Key holding the boolean "read every column" flag. */
  public static final String READ_ALL_COLUMNS = "hive.io.file.read.all.columns";

  /** Key holding the delimited list of column names to read. */
  public static final String READ_COLUMN_NAMES_CONF_STR = "hive.io.file.readcolumn.names";

  /** Key holding the boolean "fetch virtual columns" flag. */
  public static final String FETCH_VIRTUAL_COLUMNS_CONF_STR = "hive.io.file.fetch.virtual.columns";

  private static final String READ_COLUMN_IDS_CONF_STR_DEFAULT = "";
  private static final String READ_COLUMN_NAMES_CONF_STR_DEFAULT = "";
  private static final String READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT = "";
  private static final boolean READ_ALL_COLUMNS_DEFAULT = true;
  private static final Joiner CSV_JOINER = Joiner.on(",").skipNulls();

  /**
   * Job config key for an ORC TypeDescription.toString().
   * If set it will be favoured by ORC record readers over Hive schema literals such as
   * IOConstants.SCHEMA_EVOLUTION_COLUMNS or IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES.
   */
  public static final String ORC_SCHEMA_STRING = "hive.orc.schema.string";

  /**
   * @param conf configuration to update
   * @deprecated for backwards compatibility with &lt;= 0.12, use {@link #setReadAllColumns}
   */
  @Deprecated
  public static void setFullyReadColumns(Configuration conf) {
    setReadAllColumns(conf);
  }

  /**
   * @param conf configuration to update
   * @param ids column ids that replace any previously configured ids
   * @deprecated for backwards compatibility with &lt;= 0.12, use {@link #setReadAllColumns}
   *     and {@link #appendReadColumns(Configuration, List)}
   */
  @Deprecated
  @VisibleForTesting
  public static void setReadColumnIDs(Configuration conf, List<Integer> ids) {
    setReadColumnIDConf(conf, READ_COLUMN_IDS_CONF_STR_DEFAULT);
    appendReadColumns(conf, ids);
  }

  /**
   * @param conf configuration to update
   * @param ids column ids to add to the configured ids
   * @deprecated for backwards compatibility with &lt;= 0.12, use
   *     {@link #appendReadColumns(Configuration, List)}
   */
  @Deprecated
  public static void appendReadColumnIDs(Configuration conf, List<Integer> ids) {
    appendReadColumns(conf, ids);
  }

  /**
   * Sets the READ_ALL_COLUMNS flag and removes any previously set column ids.
   * Column names and nested column paths are not touched by this method.
   *
   * @param conf configuration to update
   */
  public static void setReadAllColumns(Configuration conf) {
    conf.setBoolean(READ_ALL_COLUMNS, true);
    setReadColumnIDConf(conf, READ_COLUMN_IDS_CONF_STR_DEFAULT);
  }

  /**
   * Returns the READ_ALL_COLUMNS flag; {@code true} when the flag has never been set.
   *
   * @param conf configuration to read
   * @return whether all columns should be read
   */
  public static boolean isReadAllColumns(Configuration conf) {
    return conf.getBoolean(READ_ALL_COLUMNS, READ_ALL_COLUMNS_DEFAULT);
  }

  /**
   * Sets the READ_ALL_COLUMNS flag to false and overwrites column ids
   * with the provided list.
   *
   * @param conf configuration to update
   * @param ids column ids that replace any previously configured ids
   */
  public static void setReadColumns(Configuration conf, List<Integer> ids) {
    setReadColumnIDConf(conf, READ_COLUMN_IDS_CONF_STR_DEFAULT);
    appendReadColumns(conf, ids);
  }

  /**
   * Appends read columns' ids (start from zero). Once a column
   * is included in the list, an underlying record reader of a columnar file format
   * (e.g. RCFile and ORC) can know what columns are needed.
   *
   * <p>The new ids are passed ahead of the previously configured value when the two are
   * joined. The READ_ALL_COLUMNS flag is always set to false, even for an empty list.
   *
   * @param conf configuration to update
   * @param ids column ids to add; must not be null
   */
  public static void appendReadColumns(Configuration conf, List<Integer> ids) {
    String id = toReadColumnIDString(ids);
    String old = conf.get(READ_COLUMN_IDS_CONF_STR, null);
    // old may be null here; joinIgnoringEmpty is relied upon to skip it.
    String newConfStr = HiveStringUtils.joinIgnoringEmpty(new String[] {id, old}, StringUtils.COMMA);
    setReadColumnIDConf(conf, newConfStr);
    // Set READ_ALL_COLUMNS to false
    conf.setBoolean(READ_ALL_COLUMNS, false);
  }

  /**
   * Appends read nested column's paths. Once a read nested column path
   * is included in the list, an underlying record reader of a columnar file format
   * (e.g. Parquet and ORC) can know what columns are needed.
   *
   * <p>A null or empty list leaves the configuration untouched. New paths are placed ahead of
   * the previously configured ones, and the stored value is lower-cased.
   *
   * @param conf configuration to update
   * @param paths nested column paths to add; may be null or empty
   */
  public static void appendNestedColumnPaths(
      Configuration conf,
      List<String> paths) {
    if (paths == null || paths.isEmpty()) {
      return;
    }
    String pathsStr = String.join(StringUtils.COMMA_STR, paths);
    String old = conf.get(READ_NESTED_COLUMN_PATH_CONF_STR, null);
    String newConfStr = pathsStr;
    if (old != null && !old.isEmpty()) {
      newConfStr = newConfStr + StringUtils.COMMA_STR + old;
    }
    setReadNestedColumnPathConf(conf, newConfStr);
  }

  /**
   * This method appends read column information to configuration to use for PPD. It is
   * currently called with information from TSOP. Names come from TSOP input RowSchema, and
   * IDs are the indexes inside the schema (which PPD assumes correspond to indexes inside the
   * files to PPD in; something that would be invalid in many cases of schema evolution).
   *
   * <p>A size mismatch between {@code ids} and {@code names} is only logged as a warning;
   * both lists are still appended.
   *
   * @param conf Config to set values to.
   * @param ids Column ids.
   * @param names Column names.
   * @param groupPaths Nested column paths; may be null or empty.
   * @param fetchVirtualCols Value stored under {@link #FETCH_VIRTUAL_COLUMNS_CONF_STR}.
   */
  public static void appendReadColumns(
      Configuration conf, List<Integer> ids, List<String> names, List<String> groupPaths,
      boolean fetchVirtualCols) {
    if (ids.size() != names.size()) {
      LOG.warn("Read column counts do not match: {} ids, {} names", ids.size(), names.size());
    }
    appendReadColumns(conf, ids);
    appendReadColumnNames(conf, names);
    appendNestedColumnPaths(conf, groupPaths);
    conf.setBoolean(FETCH_VIRTUAL_COLUMNS_CONF_STR, fetchVirtualCols);
  }

  /**
   * Appends the comma-joined ids and names (null elements skipped) to the given buffers.
   *
   * <p>No separator is written between a buffer's existing content and the appended text, so
   * callers that accumulate into a non-empty buffer must add one themselves.
   *
   * @param readColumnsBuffer buffer receiving the joined ids
   * @param readColumnNamesBuffer buffer receiving the joined names
   * @param ids column ids to append
   * @param names column names to append
   */
  public static void appendReadColumns(
      StringBuilder readColumnsBuffer, StringBuilder readColumnNamesBuffer, List<Integer> ids,
      List<String> names) {
    CSV_JOINER.appendTo(readColumnsBuffer, ids);
    CSV_JOINER.appendTo(readColumnNamesBuffer, names);
  }

  /**
   * Returns the list of column ids (start from zero) which is set in the given
   * parameter conf. Duplicates are removed; the first occurrence of each id keeps its position.
   *
   * @param conf configuration to read
   * @return a new mutable list of distinct column ids, empty when none are configured
   * @throws NumberFormatException if a configured entry is not a valid integer
   */
  public static List<Integer> getReadColumnIDs(Configuration conf) {
    String skips = conf.get(READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS_CONF_STR_DEFAULT);
    String[] list = StringUtils.split(skips);
    // The configured value may contain duplicates; an insertion-ordered set drops them in a
    // single pass while keeping first-seen order.
    Set<Integer> uniqueIds = new LinkedHashSet<>(list.length);
    for (String element : list) {
      uniqueIds.add(Integer.parseInt(element));
    }
    // NOTE: some code uses this list to correlate with column names, and yet these lists may
    // contain duplicates, which this call will remove and getReadColumnNames won't. Callers
    // that need positional correspondence with the names must not rely on this method.
    return new ArrayList<>(uniqueIds);
  }

  /**
   * Returns the configured nested column paths as a set.
   *
   * @param conf configuration to read
   * @return a new mutable set of nested column paths, empty when none are configured
   */
  public static Set<String> getNestedColumnPaths(Configuration conf) {
    String skips =
        conf.get(READ_NESTED_COLUMN_PATH_CONF_STR, READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT);
    return new HashSet<>(Arrays.asList(StringUtils.split(skips)));
  }

  /**
   * Returns the configured read column names, split on the column name delimiter.
   *
   * @param conf configuration to read
   * @return the column names, or an empty array when none are configured
   */
  public static String[] getReadColumnNames(Configuration conf) {
    String colNames = conf.get(READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES_CONF_STR_DEFAULT);
    if (colNames != null && !colNames.isEmpty()) {
      // String.split takes a regex, while the names are joined with the delimiter as a literal;
      // quote it so delimiters such as "|" or "." split back into the same names.
      return colNames.split(Pattern.quote(getColumnNameDelimiter(conf)));
    }
    return new String[] {};
  }

  /** Returns the configured column name delimiter, falling back to SerDeUtils.COMMA. */
  private static String getColumnNameDelimiter(Configuration conf) {
    return conf.get(serdeConstants.COLUMN_NAME_DELIMITER, String.valueOf(SerDeUtils.COMMA));
  }

  /** Stores the column id string, normalising a blank value to the default. */
  private static void setReadColumnIDConf(Configuration conf, String id) {
    if (id.trim().isEmpty()) {
      conf.set(READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS_CONF_STR_DEFAULT);
    } else {
      conf.set(READ_COLUMN_IDS_CONF_STR, id);
    }
  }

  /** Stores the lower-cased nested path string, normalising a blank value to the default. */
  private static void setReadNestedColumnPathConf(
      Configuration conf,
      String nestedColumnPaths) {
    // Locale.ROOT keeps the result independent of the JVM default locale (for example, a
    // Turkish default locale would otherwise map 'I' to a dotless 'i').
    nestedColumnPaths = nestedColumnPaths.toLowerCase(Locale.ROOT);
    if (nestedColumnPaths.trim().isEmpty()) {
      conf.set(READ_NESTED_COLUMN_PATH_CONF_STR, READ_NESTED_COLUMN_PATH_CONF_STR_DEFAULT);
    } else {
      conf.set(READ_NESTED_COLUMN_PATH_CONF_STR, nestedColumnPaths);
    }
  }

  /**
   * Appends the given column names after any previously configured names.
   *
   * <p>NOTE(review): names are appended after the old value, whereas column ids and nested
   * paths are placed before it; confirm that no caller relies on matching positions.
   */
  private static void appendReadColumnNames(Configuration conf, List<String> cols) {
    String old = conf.get(READ_COLUMN_NAMES_CONF_STR, "");
    String delim = getColumnNameDelimiter(conf);
    String result = String.join(delim, cols);
    if (!old.isEmpty()) {
      result = old + delim + result;
    }
    conf.set(READ_COLUMN_NAMES_CONF_STR, result);
  }

  /** Joins the ids with commas; returns an empty string for an empty list. */
  private static String toReadColumnIDString(List<Integer> ids) {
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < ids.size(); i++) {
      if (i > 0) {
        joined.append(StringUtils.COMMA_STR);
      }
      joined.append(ids.get(i));
    }
    return joined.toString();
  }

  private ColumnProjectionUtils() {
    // prevent instantiation
  }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy