// org.apache.hadoop.hive.ql.exec.FetchOperator
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.AnnotationUtils;
import org.apache.hive.common.util.ReflectionUtil;
import com.google.common.collect.Iterators;
/**
 * Row-fetching engine used by FetchTask: iterates over table/partition
 * directories, obtains input splits and record readers for them, and
 * deserializes rows (optionally pushing each row into an operator tree).
 **/
public class FetchOperator implements Serializable {
  static final Log LOG = LogFactory.getLog(FetchOperator.class.getName());
  static final LogHelper console = new LogHelper(LOG);
  /** JobConf key under which setFetchOperatorContext publishes the full input-dir list. */
  public static final String FETCH_OPERATOR_DIRECTORY_LIST =
      "hive.complete.dir.list";
  // NOTE(review): generic type parameters appear to have been stripped from this
  // file during extraction; the raw List/Iterator/Map declarations below likely
  // carried element types in the original source (e.g. vcCols : List<VirtualColumn>,
  // per the for-each in initialize()).
  private FetchWork work;                      // the fetch plan being executed
  private Operator operator; // operator tree for processing row further (optional)
  private final boolean hasVC;                 // true when vcCols is non-null and non-empty
  private final boolean isStatReader;          // true when the plan has no table descriptor
  private final boolean isPartitioned;         // true for partitioned, non-stat reads
  private final boolean isNonNativeTable;      // true for storage-handler-backed tables
  private StructObjectInspector vcsOI;         // inspector over the virtual columns
  private final List vcCols;                   // virtual columns to populate (may be null)
  private ExecMapperContext context;           // set only when VCs or split sampling are in play
  private transient Deserializer tableSerDe;
  private transient StructObjectInspector tableOI;
  private transient StructObjectInspector partKeyOI;     // inspector for partition key columns
  private transient StructObjectInspector convertedOI;   // table-level OI rows are converted to
  private transient Iterator iterPath;         // remaining input directories
  private transient Iterator iterPartDesc;     // partition descs parallel to iterPath
  private transient Iterator iterSplits = Iterators.emptyIterator(); // splits of current path
  private transient Path currPath;             // directory currently being read
  private transient PartitionDesc currDesc;    // descriptor for currPath
  private transient Deserializer currSerDe;    // serde for the current partition/table
  // NOTE(review): field name breaks lowerCamelCase convention; kept because other
  // methods in this file reference it.
  private transient Converter ObjectConverter;
  private transient RecordReader currRecReader;
  private transient JobConf job;
  private transient WritableComparable key;    // reusable key buffer for currRecReader
  private transient Writable value;            // reusable value buffer for currRecReader
  private transient Object[] vcValues;         // current virtual-column values
  private transient int headerCount;           // header rows consumed so far (reset per reader)
  private transient int footerCount;           // footer rows tracked so far (reset per reader)
  private transient FooterBuffer footerBuffer;
  private transient StructObjectInspector outputOI;      // inspector describing emitted rows
  private transient Object[] row;              // reusable output row: [data][, partValues][, vcValues]
  /**
   * Convenience constructor: no downstream operator tree and no virtual columns.
   *
   * @param work the fetch plan describing what to read
   * @param job  job configuration used for file-system and input-format access
   * @throws HiveException if initialization fails
   */
  public FetchOperator(FetchWork work, JobConf job) throws HiveException {
    this(work, job, null, null);
  }
  /**
   * Creates a FetchOperator, optionally wired to an operator tree that will
   * receive each fetched row, and optionally exposing virtual columns.
   *
   * @param work     the fetch plan (table/partition dirs, serde info, sampling)
   * @param job      job configuration used for FS access and record readers
   * @param operator optional operator tree to push rows into (may be null)
   * @param vcCols   virtual columns to populate (may be null or empty)
   * @throws HiveException if inspector/iterator initialization fails
   */
  public FetchOperator(FetchWork work, JobConf job, Operator operator,
      List vcCols) throws HiveException {
    this.job = job;
    this.work = work;
    this.operator = operator;
    this.vcCols = vcCols;
    this.hasVC = vcCols != null && !vcCols.isEmpty();
    // A stat reader has no table descriptor; its rows come from precomputed stats.
    this.isStatReader = work.getTblDesc() == null;
    this.isPartitioned = !isStatReader && work.isPartitioned();
    this.isNonNativeTable = !isStatReader && work.getTblDesc().isNonNative();
    initialize();
  }
private void initialize() throws HiveException {
if (isStatReader) {
outputOI = work.getStatRowOI();
return;
}
if (hasVC) {
List names = new ArrayList(vcCols.size());
List inspectors = new ArrayList(vcCols.size());
for (VirtualColumn vc : vcCols) {
inspectors.add(vc.getObjectInspector());
names.add(vc.getName());
}
vcsOI = ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);
vcValues = new Object[vcCols.size()];
}
if (hasVC && isPartitioned) {
row = new Object[3];
} else if (hasVC || isPartitioned) {
row = new Object[2];
} else {
row = new Object[1];
}
if (isPartitioned) {
iterPath = work.getPartDir().iterator();
iterPartDesc = work.getPartDesc().iterator();
} else {
iterPath = Arrays.asList(work.getTblDir()).iterator();
iterPartDesc = Iterators.cycle(new PartitionDesc(work.getTblDesc(), null));
}
outputOI = setupOutputObjectInspector();
context = setupExecContext(operator, work.getPathLists());
}
private ExecMapperContext setupExecContext(Operator operator, List paths) {
ExecMapperContext context = null;
if (hasVC || work.getSplitSample() != null) {
context = new ExecMapperContext(job);
if (operator != null) {
operator.passExecContext(context);
}
}
setFetchOperatorContext(job, paths);
return context;
}
  /** @return the fetch plan this operator is executing */
  public FetchWork getWork() {
    return work;
  }
  /**
   * Replaces the fetch plan.
   * NOTE(review): this does not re-run initialize(), so iterators and
   * inspectors derived from the old plan are kept — confirm callers only
   * invoke this before fetching starts.
   */
  public void setWork(FetchWork work) {
    this.work = work;
  }
/**
* A cache of InputFormat instances.
*/
private static final Map inputFormats = new HashMap();
@SuppressWarnings("unchecked")
static InputFormat getInputFormatFromCache(Class inputFormatClass,
JobConf conf) throws IOException {
if (Configurable.class.isAssignableFrom(inputFormatClass) ||
JobConfigurable.class.isAssignableFrom(inputFormatClass)) {
return ReflectionUtil.newInstance(inputFormatClass, conf);
}
InputFormat format = inputFormats.get(inputFormatClass.getName());
if (format == null) {
try {
format = ReflectionUtil.newInstance(inputFormatClass, conf);
inputFormats.put(inputFormatClass.getName(), format);
} catch (Exception e) {
throw new IOException("Cannot create an instance of InputFormat class "
+ inputFormatClass.getName() + " as specified in mapredWork!", e);
}
}
return format;
}
private StructObjectInspector getPartitionKeyOI(TableDesc tableDesc) throws Exception {
String pcols = tableDesc.getProperties().getProperty(
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
String pcolTypes = tableDesc.getProperties().getProperty(
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
String[] partKeys = pcols.trim().split("/");
String[] partKeyTypes = pcolTypes.trim().split(":");
ObjectInspector[] inspectors = new ObjectInspector[partKeys.length];
for (int i = 0; i < partKeys.length; i++) {
inspectors[i] = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
}
return ObjectInspectorFactory.getStandardStructObjectInspector(
Arrays.asList(partKeys), Arrays.asList(inspectors));
}
private Object[] createPartValue(PartitionDesc partDesc, StructObjectInspector partOI) {
Map partSpec = partDesc.getPartSpec();
List fields = partOI.getAllStructFieldRefs();
Object[] partValues = new Object[fields.size()];
for (int i = 0; i < partValues.length; i++) {
StructField field = fields.get(i);
String value = partSpec.get(field.getFieldName());
ObjectInspector oi = field.getFieldObjectInspector();
partValues[i] = ObjectInspectorConverters.getConverter(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi).convert(value);
}
return partValues;
}
private boolean getNextPath() throws Exception {
while (iterPath.hasNext()) {
currPath = iterPath.next();
currDesc = iterPartDesc.next();
if (isNonNativeTable) {
return true;
}
FileSystem fs = currPath.getFileSystem(job);
if (fs.exists(currPath)) {
for (FileStatus fStat : listStatusUnderPath(fs, currPath)) {
if (fStat.getLen() > 0) {
return true;
}
}
}
}
return false;
}
/**
* Set context for this fetch operator in to the jobconf.
* This helps InputFormats make decisions based on the scope of the complete
* operation.
* @param conf the configuration to modify
* @param paths the list of input directories
*/
static void setFetchOperatorContext(JobConf conf, List paths) {
if (paths != null) {
StringBuilder buff = new StringBuilder();
for (Path path : paths) {
if (buff.length() > 0) {
buff.append('\t');
}
buff.append(StringEscapeUtils.escapeJava(path.toString()));
}
conf.set(FETCH_OPERATOR_DIRECTORY_LIST, buff.toString());
}
}
  /**
   * Returns a record reader for the next split, first advancing to the next
   * path/partition (and refreshing the serde, converter, and partition values)
   * whenever the current split iterator is exhausted.
   *
   * @return the new current record reader, or null when all input is consumed
   * @throws Exception if split enumeration or reader creation fails
   */
  private RecordReader getRecordReader() throws Exception {
    if (!iterSplits.hasNext()) {
      FetchInputFormatSplit[] splits = getNextSplits();
      if (splits == null) {
        // No more non-empty paths: fetching is finished.
        return null;
      }
      if (!isPartitioned || convertedOI == null) {
        currSerDe = tableSerDe;
        ObjectConverter = null;
      } else {
        // Partition schema may differ from the table schema: deserialize with
        // the partition serde where needed and convert rows to the table OI.
        currSerDe = needConversion(currDesc) ? currDesc.getDeserializer(job) : tableSerDe;
        ObjectInspector inputOI = currSerDe.getObjectInspector();
        ObjectConverter = ObjectInspectorConverters.getConverter(inputOI, convertedOI);
      }
      if (isPartitioned) {
        // Slot 1 of the reusable output row carries the partition key values.
        row[1] = createPartValue(currDesc, partKeyOI);
      }
      iterSplits = Arrays.asList(splits).iterator();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Creating fetchTask with deserializer typeinfo: "
            + currSerDe.getObjectInspector().getTypeName());
        LOG.debug("deserializer properties:\ntable properties: " +
            currDesc.getTableDesc().getProperties() + "\npartition properties: " +
            currDesc.getProperties());
      }
    }
    final FetchInputFormatSplit target = iterSplits.next();
    @SuppressWarnings("unchecked")
    final RecordReader reader = target.getRecordReader(job);
    if (hasVC || work.getSplitSample() != null) {
      // Wrap the reader so sampling can stop early once the shrunk length
      // computed by splitSampling() has been consumed.
      currRecReader = new HiveRecordReader(reader, job) {
        @Override
        public boolean doNext(WritableComparable key, Writable value) throws IOException {
          // if current pos is larger than shrinkedLength which is calculated for
          // each split by table sampling, stop fetching any more (early exit)
          if (target.shrinkedLength > 0 &&
              context.getIoCxt().getCurrentBlockStart() > target.shrinkedLength) {
            return false;
          }
          return super.doNext(key, value);
        }
      };
      ((HiveContextAwareRecordReader)currRecReader).
          initIOContext(target, job, target.inputFormat.getClass(), reader);
    } else {
      currRecReader = reader;
    }
    key = currRecReader.createKey();
    value = currRecReader.createValue();
    // Reset header/footer bookkeeping for the fresh reader.
    headerCount = footerCount = 0;
    return currRecReader;
  }
protected FetchInputFormatSplit[] getNextSplits() throws Exception {
while (getNextPath()) {
// not using FileInputFormat.setInputPaths() here because it forces a connection to the
// default file system - which may or may not be online during pure metadata operations
job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));
// Fetch operator is not vectorized and as such turn vectorization flag off so that
// non-vectorized record reader is created below.
HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
Class formatter = currDesc.getInputFileFormatClass();
Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
InputFormat inputFormat = getInputFormatFromCache(formatter, job);
InputSplit[] splits = inputFormat.getSplits(job, 1);
FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
for (int i = 0; i < splits.length; i++) {
inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
}
if (work.getSplitSample() != null) {
inputSplits = splitSampling(work.getSplitSample(), inputSplits);
}
if (inputSplits.length > 0) {
return inputSplits;
}
}
return null;
}
private FetchInputFormatSplit[] splitSampling(SplitSample splitSample,
FetchInputFormatSplit[] splits) {
long totalSize = 0;
for (FetchInputFormatSplit split: splits) {
totalSize += split.getLength();
}
List result = new ArrayList(splits.length);
long targetSize = splitSample.getTargetSize(totalSize);
int startIndex = splitSample.getSeedNum() % splits.length;
long size = 0;
for (int i = 0; i < splits.length; i++) {
FetchInputFormatSplit split = splits[(startIndex + i) % splits.length];
result.add(split);
long splitgLength = split.getLength();
if (size + splitgLength >= targetSize) {
if (size + splitgLength > targetSize) {
split.shrinkedLength = targetSize - size;
}
break;
}
size += splitgLength;
}
return result.toArray(new FetchInputFormatSplit[result.size()]);
}
/**
* Get the next row and push down it to operator tree.
* Currently only used by FetchTask.
**/
public boolean pushRow() throws IOException, HiveException {
if (work.getRowsComputedUsingStats() != null) {
for (List