org.apache.hadoop.hive.ql.io.HiveFileFormatUtils (hive-apache: shaded version of Apache Hive for Presto)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Shell;
import org.apache.hive.common.util.ReflectionUtil;
/**
* A utility class for various Hive file format tasks.
* registerOutputFormatSubstitute(Class, Class) and getOutputFormatSubstitute(Class)
* are provided for backward compatibility; they return the newer
* HiveOutputFormat substitute for the older OutputFormat classes.
*
*/
public final class HiveFileFormatUtils {
static {
outputFormatSubstituteMap =
new ConcurrentHashMap<Class<?>, Class<? extends OutputFormat>>();
HiveFileFormatUtils.registerOutputFormatSubstitute(
IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class);
HiveFileFormatUtils.registerOutputFormatSubstitute(
SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class);
}
@SuppressWarnings("unchecked")
private static Map<Class<?>, Class<? extends OutputFormat>>
outputFormatSubstituteMap;
/**
* register a substitute.
*
* @param origin
* the class that needs to be substituted
* @param substitute
*/
@SuppressWarnings("unchecked")
public static void registerOutputFormatSubstitute(Class<?> origin,
Class<? extends HiveOutputFormat> substitute) {
outputFormatSubstituteMap.put(origin, substitute);
}
/**
* get an OutputFormat's substitute HiveOutputFormat.
*/
@SuppressWarnings("unchecked")
public static Class<? extends OutputFormat> getOutputFormatSubstitute(
Class<?> origin) {
if (origin == null || HiveOutputFormat.class.isAssignableFrom(origin)) {
return (Class<? extends OutputFormat>) origin; // hive native
}
Class<? extends OutputFormat> substitute = outputFormatSubstituteMap.get(origin);
if (substitute != null) {
return substitute; // substituted
}
return (Class<? extends OutputFormat>) origin;
}
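/*
 * Illustrative sketch (assumed usage, not part of the upstream source): how the
 * substitution above resolves legacy OutputFormat classes registered in the static block.
 *
 *   // Legacy formats map to their Hive wrappers:
 *   Class<? extends OutputFormat> substituted =
 *       HiveFileFormatUtils.getOutputFormatSubstitute(IgnoreKeyTextOutputFormat.class);
 *   // substituted == HiveIgnoreKeyTextOutputFormat.class
 *
 *   // Classes that already implement HiveOutputFormat are returned unchanged:
 *   Class<? extends OutputFormat> nativeFmt =
 *       HiveFileFormatUtils.getOutputFormatSubstitute(HiveSequenceFileOutputFormat.class);
 *   // nativeFmt == HiveSequenceFileOutputFormat.class
 */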
/**
* get the final output path of a given FileOutputFormat.
*
* @param parent
* parent dir of the expected final output path
* @param jc
* job configuration
* @deprecated
*/
@Deprecated
public static Path getOutputFormatFinalPath(Path parent, String taskId, JobConf jc,
HiveOutputFormat<?, ?> hiveOutputFormat, boolean isCompressed,
Path defaultFinalPath) throws IOException {
if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
return new Path(parent, taskId
+ Utilities.getFileExtension(jc, isCompressed));
}
return defaultFinalPath;
}
static {
inputFormatCheckerMap =
new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
HiveFileFormatUtils.registerInputFormatChecker(
SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class,
RCFileInputFormat.class);
inputFormatCheckerInstanceCache =
new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
}
@SuppressWarnings("unchecked")
private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
private static Map<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
/**
* register an InputFormatChecker for a given InputFormat.
*
* @param format
* the InputFormat class the checker validates
* @param checker
*/
@SuppressWarnings("unchecked")
public static synchronized void registerInputFormatChecker(
Class<? extends InputFormat> format,
Class<? extends InputFormatChecker> checker) {
inputFormatCheckerMap.put(format, checker);
}
/**
* get an InputFormatChecker for a file format.
*/
public static synchronized Class<? extends InputFormatChecker> getInputFormatChecker(
Class<?> inputFormat) {
Class<? extends InputFormatChecker> result = inputFormatCheckerMap
.get(inputFormat);
return result;
}
/**
* checks if files are in same format as the given input format.
*/
@SuppressWarnings("unchecked")
public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
Class<? extends InputFormat> inputFormatCls, ArrayList<FileStatus> files)
throws HiveException {
if (files.size() > 0) {
Class<? extends InputFormatChecker> checkerCls = getInputFormatChecker(inputFormatCls);
if (checkerCls == null
&& inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
// we have a text input format here; a file's content cannot tell us whether it
// is text, so all we can do is test whether any other registered file format
// accepts it. If no other format claims the file, we treat it as a text file,
// even though it may not be one.
return checkTextInputFormat(fs, conf, files);
}
if (checkerCls != null) {
InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache
.get(checkerCls);
try {
if (checkerInstance == null) {
checkerInstance = checkerCls.newInstance();
inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
}
return checkerInstance.validateInput(fs, conf, files);
} catch (Exception e) {
throw new HiveException(e);
}
}
return true;
}
return false;
}
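/*
 * Illustrative sketch (assumed usage; the warehouse path below is hypothetical):
 * validating that a partition's files really are RCFile before reading them with
 * RCFileInputFormat.
 *
 *   HiveConf hiveConf = new HiveConf();
 *   FileSystem fs = FileSystem.get(hiveConf);
 *   ArrayList<FileStatus> files = new ArrayList<FileStatus>(
 *       java.util.Arrays.asList(fs.listStatus(new Path("/warehouse/db/tbl/part=1"))));
 *   // Delegates to the checker registered for RCFileInputFormat in the static block above
 *   // (RCFileInputFormat implements InputFormatChecker itself).
 *   boolean matches = HiveFileFormatUtils.checkInputFormat(fs, hiveConf, RCFileInputFormat.class, files);
 */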
@SuppressWarnings("unchecked")
private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
ArrayList<FileStatus> files) throws HiveException {
Set<Class<? extends InputFormat>> inputFormatter = inputFormatCheckerMap
.keySet();
for (Class<? extends InputFormat> reg : inputFormatter) {
boolean result = checkInputFormat(fs, conf, reg, files);
if (result) {
return false;
}
}
return true;
}
public static RecordWriter getHiveRecordWriter(JobConf jc,
TableDesc tableInfo, Class<? extends Writable> outputClass,
FileSinkDesc conf, Path outPath, Reporter reporter) throws HiveException {
HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
try {
boolean isCompressed = conf.getCompressed();
JobConf jc_output = jc;
if (isCompressed) {
jc_output = new JobConf(jc);
String codecStr = conf.getCompressCodec();
if (codecStr != null && !codecStr.trim().equals("")) {
Class<? extends CompressionCodec> codec =
(Class<? extends CompressionCodec>) JavaUtils.loadClass(codecStr);
FileOutputFormat.setOutputCompressorClass(jc_output, codec);
}
String type = conf.getCompressType();
if (type != null && !type.trim().equals("")) {
CompressionType style = CompressionType.valueOf(type);
SequenceFileOutputFormat.setOutputCompressionType(jc, style);
}
}
return getRecordWriter(jc_output, hiveOutputFormat, outputClass,
isCompressed, tableInfo.getProperties(), outPath, reporter);
} catch (Exception e) {
throw new HiveException(e);
}
}
public static RecordWriter getRecordWriter(JobConf jc,
OutputFormat<?, ?> outputFormat,
Class<? extends Writable> valueClass, boolean isCompressed,
Properties tableProp, Path outPath, Reporter reporter
) throws IOException, HiveException {
if (!(outputFormat instanceof HiveOutputFormat)) {
outputFormat = new HivePassThroughOutputFormat(outputFormat);
}
return ((HiveOutputFormat)outputFormat).getHiveRecordWriter(
jc, outPath, valueClass, isCompressed, tableProp, reporter);
}
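/*
 * Illustrative sketch (assumed usage; in practice the TableDesc and FileSinkDesc come
 * from the compiled plan and the output path is chosen by the FileSinkOperator, so the
 * values below are hypothetical):
 *
 *   JobConf job = new JobConf();
 *   TableDesc tableDesc = ...;    // describes the target table's output format and serde
 *   FileSinkDesc sinkDesc = ...;  // carries the compression codec/type settings read above
 *   RecordWriter writer = HiveFileFormatUtils.getHiveRecordWriter(
 *       job, tableDesc, org.apache.hadoop.io.Text.class, sinkDesc,
 *       new Path("/tmp/out/000000_0"), Reporter.NULL);
 *   writer.write(new org.apache.hadoop.io.Text("row"));
 *   writer.close(false);          // false = not aborting
 */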
public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf, TableDesc tableDesc)
throws HiveException {
return getHiveOutputFormat(conf, tableDesc.getOutputFileFormatClass());
}
public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf, PartitionDesc partDesc)
throws HiveException {
return getHiveOutputFormat(conf, partDesc.getOutputFileFormatClass());
}
private static HiveOutputFormat<?, ?> getHiveOutputFormat(
Configuration conf, Class<? extends OutputFormat> outputClass) throws HiveException {
OutputFormat<?, ?> outputFormat = ReflectionUtil.newInstance(outputClass, conf);
if (!(outputFormat instanceof HiveOutputFormat)) {
outputFormat = new HivePassThroughOutputFormat(outputFormat);
}
return (HiveOutputFormat<?, ?>) outputFormat;
}
public static RecordUpdater getAcidRecordUpdater(JobConf jc, TableDesc tableInfo, int bucket,
FileSinkDesc conf, Path outPath,
ObjectInspector inspector,
Reporter reporter, int rowIdColNum)
throws HiveException, IOException {
HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
AcidOutputFormat<?, ?> acidOutputFormat = null;
if (hiveOutputFormat instanceof AcidOutputFormat) {
acidOutputFormat = (AcidOutputFormat)hiveOutputFormat;
} else {
throw new HiveException("Unable to create RecordUpdater for HiveOutputFormat that does not " +
"implement AcidOutputFormat");
}
// TODO not 100% sure about this. This call doesn't set the compression type in the conf
// file the way getHiveRecordWriter does, as ORC appears to read the value for itself. Not
// sure if this is correct or not.
return getRecordUpdater(jc, acidOutputFormat, conf.getCompressed(), conf.getTransactionId(),
bucket, inspector, tableInfo.getProperties(), outPath, reporter, rowIdColNum);
}
private static RecordUpdater getRecordUpdater(JobConf jc,
AcidOutputFormat<?, ?> acidOutputFormat,
boolean isCompressed,
long txnId,
int bucket,
ObjectInspector inspector,
Properties tableProp,
Path outPath,
Reporter reporter,
int rowIdColNum) throws IOException {
return acidOutputFormat.getRecordUpdater(outPath, new AcidOutputFormat.Options(jc)
.isCompressed(isCompressed)
.tableProperties(tableProp)
.reporter(reporter)
.writingBase(false)
.minimumTransactionId(txnId)
.maximumTransactionId(txnId)
.bucket(bucket)
.inspector(inspector)
.recordIdColumn(rowIdColNum));
}
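/*
 * Illustrative sketch (assumed values) of the AcidOutputFormat.Options builder used above;
 * a real caller goes through getAcidRecordUpdater with plan-provided descriptors, and the
 * transaction id, bucket, inspector and record-id column shown here are hypothetical:
 *
 *   AcidOutputFormat.Options options = new AcidOutputFormat.Options(jobConf)
 *       .isCompressed(true)
 *       .writingBase(false)            // delta file, not a base file
 *       .minimumTransactionId(100L)    // single-transaction delta: min == max
 *       .maximumTransactionId(100L)
 *       .bucket(0)
 *       .inspector(rowInspector)
 *       .recordIdColumn(-1);           // assumed: -1 when rows carry no record-id column
 */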
public static PartitionDesc getPartitionDescFromPathRecursively(
Map pathToPartitionInfo, Path dir,
Map