// org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.FooterBuffer;
import org.apache.hadoop.hive.ql.io.IOContext.Comparison;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
/** This class prepares an IOContext, and provides the ability to perform a binary search on the
* data. The binary search can be used by setting the value of inputFormatSorted in the
* MapreduceWork to true, but it should only be used if the data is going to a FilterOperator,
* which filters by comparing a value in the data with a constant, using one of the comparisons
* =, <, >, <=, >=. If the RecordReader's underlying format is an RCFile, this object can perform
* a binary search to find the block to begin reading from, and stop reading once it can be
* determined no other entries will match the filter.
*/
public abstract class HiveContextAwareRecordReader<K, V> implements RecordReader<K, V> {

  private static final Log LOG = LogFactory.getLog(HiveContextAwareRecordReader.class.getName());

  /** Set by initIOContext(); next() throws an IOException until initialization is done. */
  private boolean initDone = false;

  /** Current binary-search window (byte offsets) within the split. */
  private long rangeStart;
  private long rangeEnd;
  /** End offset of the split; a sync position at/after this ends the binary search. */
  private long splitEnd;
  /** Sync position of the previous probe; hitting it again means the search has converged. */
  private long previousPosition = -1;
  /** True while the sorted (binary) search mode has been, or is, in use. */
  private boolean wasUsingSortedSearch = false;
  /** Class name of the filter's comparison UDF, once learned from the IOContext. */
  private String genericUDFClassName = null;
  /** Comparisons that signal no later row can pass the filter, ending the linear scan. */
  private final List<Comparison> stopComparisons = new ArrayList<Comparison>();
  /**
   * Lazily-cached path-to-partition mapping from the map work, used to look up the
   * table's header/footer row counts. Kept raw because the key type of
   * {@code Utilities.getMapWork(conf).getPathToPartitionInfo()} is declared elsewhere.
   */
  private Map pathToPartitionInfo;

  protected RecordReader recordReader;
  protected JobConf jobConf;
  protected boolean isSorted = false;

  public HiveContextAwareRecordReader(JobConf conf) throws IOException {
    this(null, conf);
  }

  public HiveContextAwareRecordReader(RecordReader recordReader) {
    this.recordReader = recordReader;
  }

  public HiveContextAwareRecordReader(RecordReader recordReader, JobConf conf)
      throws IOException {
    this.recordReader = recordReader;
    this.jobConf = conf;
  }

  public void setRecordReader(RecordReader recordReader) {
    this.recordReader = recordReader;
  }

  /**
   * Close this {@link InputSplit} to future operations.
   *
   * @throws IOException
   */
  public abstract void doClose() throws IOException;

  /** The IOContext for the current input file; reset to null on close(). */
  private IOContext ioCxtRef = null;

  @Override
  public void close() throws IOException {
    doClose();
    initDone = false;
    ioCxtRef = null;
  }

  /**
   * Reads the next record via doNext() after refreshing the IOContext, and publishes
   * the record's ROW__ID to the IOContext when the underlying reader is ACID-aware.
   *
   * @throws IOException if initIOContext() has not run, or the underlying read fails
   *         (in which case the IOContext is flagged as having seen an IO exception)
   */
  @Override
  public boolean next(K key, V value) throws IOException {
    if (!initDone) {
      throw new IOException("Hive IOContext is not inited.");
    }

    updateIOContext();
    try {
      boolean retVal = doNext(key, value);
      if (retVal) {
        if (key instanceof RecordIdentifier) {
          // supports AcidInputFormat readers that use the KEY to pass ROW__ID info
          ioCxtRef.setRecordIdentifier((RecordIdentifier) key);
        } else if (recordReader instanceof AcidInputFormat.AcidRecordReader) {
          // supports AcidInputFormat readers that do not use the KEY to pass ROW__ID info
          ioCxtRef.setRecordIdentifier(
              ((AcidInputFormat.AcidRecordReader) recordReader).getRecordIdentifier());
        }
      }
      return retVal;
    } catch (IOException e) {
      ioCxtRef.setIOExceptions(true);
      throw e;
    }
  }

  /**
   * Refreshes the IOContext's block/row bookkeeping from the reader's current position.
   * For non-block-pointer formats the position itself is the block start and the row
   * counter stays 0; for block-pointer formats the row counter advances and the block
   * boundaries roll forward whenever the reader moves past the recorded next block.
   */
  protected void updateIOContext()
      throws IOException {
    long pointerPos = this.getPos();
    if (!ioCxtRef.isBlockPointer()) {
      ioCxtRef.setCurrentBlockStart(pointerPos);
      ioCxtRef.setCurrentRow(0);
      return;
    }

    ioCxtRef.setCurrentRow(ioCxtRef.getCurrentRow() + 1);

    if (ioCxtRef.getNextBlockStart() == -1) {
      // First record of the file: seed the next-block marker.
      ioCxtRef.setNextBlockStart(pointerPos);
      ioCxtRef.setCurrentRow(0);
    }
    if (pointerPos != ioCxtRef.getNextBlockStart()) {
      // the reader pointer has moved to the end of next block, or the end of
      // current record.
      ioCxtRef.setCurrentRow(0);

      if (ioCxtRef.getCurrentBlockStart() == ioCxtRef.getNextBlockStart()) {
        ioCxtRef.setCurrentRow(1);
      }

      ioCxtRef.setCurrentBlockStart(ioCxtRef.getNextBlockStart());
      ioCxtRef.setNextBlockStart(pointerPos);
    }
  }

  public IOContext getIOContext() {
    return IOContext.get(jobConf);
  }

  /** Records the file's starting block and path in the IOContext and marks init done. */
  private void initIOContext(long startPos, boolean isBlockPointer,
      Path inputPath) {
    ioCxtRef = this.getIOContext();
    ioCxtRef.setCurrentBlockStart(startPos);
    ioCxtRef.setBlockPointer(isBlockPointer);
    ioCxtRef.setInputPath(inputPath);
    LOG.info("Processing file " + inputPath);
    initDone = true;
  }

  public void initIOContext(FileSplit split, JobConf job,
      Class inputFormatClass) throws IOException {
    this.initIOContext(split, job, inputFormatClass, null);
  }

  /**
   * Initializes the IOContext for the given split: determines whether the format uses
   * block pointers and where the first block after the split start begins, then sets
   * up the sorted-search properties.
   */
  public void initIOContext(FileSplit split, JobConf job,
      Class inputFormatClass, RecordReader recordReader) throws IOException {
    boolean blockPointer = false;
    long blockStart = -1;
    FileSplit fileSplit = split;
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(job);
    if (inputFormatClass.getName().contains("SequenceFile")) {
      // try/finally so the reader is closed even if sync()/getPosition() throws.
      SequenceFile.Reader in = new SequenceFile.Reader(fs, path, job);
      try {
        blockPointer = in.isBlockCompressed();
        in.sync(fileSplit.getStart());
        blockStart = in.getPosition();
      } finally {
        in.close();
      }
    } else if (recordReader instanceof RCFileRecordReader) {
      blockPointer = true;
      blockStart = ((RCFileRecordReader) recordReader).getStart();
    } else if (inputFormatClass.getName().contains("RCFile")) {
      blockPointer = true;
      RCFile.Reader in = new RCFile.Reader(fs, path, job);
      try {
        in.sync(fileSplit.getStart());
        blockStart = in.getPosition();
      } finally {
        in.close();
      }
    }
    this.jobConf = job;
    this.initIOContext(blockStart, blockPointer, path.makeQualified(fs));

    this.initIOContextSortedProps(split, recordReader, job);
  }

  /**
   * Configures the sorted-search (binary search) state for this split. Binary search
   * is enabled only when "hive.input.format.sorted" is set, the reader is an
   * RCFileRecordReader, and the split end is known (non-zero).
   */
  public void initIOContextSortedProps(FileSplit split, RecordReader recordReader, JobConf job) {
    this.jobConf = job;

    this.getIOContext().resetSortingValues();
    this.isSorted = jobConf.getBoolean("hive.input.format.sorted", false);

    this.rangeStart = split.getStart();
    this.rangeEnd = split.getStart() + split.getLength();
    this.splitEnd = rangeEnd;
    if (recordReader instanceof RCFileRecordReader && rangeEnd != 0 && this.isSorted) {
      // Binary search only works if we know the size of the split, and the recordReader is an
      // RCFileRecordReader
      this.getIOContext().setUseSorted(true);
      this.getIOContext().setBinarySearching(true);
      this.wasUsingSortedSearch = true;
    } else {
      // Use the default methods for next in the child class
      this.isSorted = false;
    }
  }

  @Override
  public float getProgress() throws IOException {
    if (this.getIOContext().isBinarySearching()) {
      // Progress is meaningless while the binary search is jumping around the file.
      return 0;
    } else {
      return recordReader.getProgress();
    }
  }

  /** Buffers the last footerCount rows so they can be dropped at end of file. */
  private FooterBuffer footerBuffer = null;
  /** Header/footer row counts from the table properties; 0 when absent. */
  private int headerCount = 0;
  private int footerCount = 0;

  /**
   * Reads the next record. When sorted search is active this first advances the binary
   * search (or ends it and falls back to a linear scan); it also skips header rows at
   * the start of a file and buffers footer rows so they are never emitted.
   */
  public boolean doNext(K key, V value) throws IOException {
    if (this.isSorted) {
      if (this.getIOContext().shouldEndBinarySearch() ||
          (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) {
        beginLinearSearch();
        this.wasUsingSortedSearch = false;
        this.getIOContext().setEndBinarySearch(false);
      }

      if (this.getIOContext().useSorted()) {
        if (this.genericUDFClassName == null &&
            this.getIOContext().getGenericUDFClassName() != null) {
          setGenericUDFClassName(this.getIOContext().getGenericUDFClassName());
        }

        if (this.getIOContext().isBinarySearching()) {
          // Proceed with a binary search
          if (this.getIOContext().getComparison() != null) {
            switch (this.getIOContext().getComparison()) {
              case GREATER:
              case EQUAL:
                // Indexes have only one entry per value, could go linear from here, if we want to
                // use this for any sorted table, we'll need to continue the search
                rangeEnd = previousPosition;
                break;
              case LESS:
                rangeStart = previousPosition;
                break;
              default:
                break;
            }
          }

          long position = (rangeStart + rangeEnd) / 2;
          sync(position);

          long newPosition = getSyncedPosition();
          // If the newPosition is the same as the previousPosition, we've reached the end of the
          // binary search, if the new position at least as big as the size of the split, any
          // matching rows must be in the final block, so we can end the binary search.
          if (newPosition == previousPosition || newPosition >= splitEnd) {
            this.getIOContext().setBinarySearching(false);
            sync(rangeStart);
          }
          previousPosition = newPosition;
        } else if (foundAllTargets()) {
          // Found all possible rows which will not be filtered
          return false;
        }
      }
    }
    try {
      /**
       * When start reading new file, check header, footer rows.
       * If file contains header, skip header lines before reading the records.
       * If file contains footer, used a FooterBuffer to remove footer lines
       * at the end of the table file.
       **/
      if (this.ioCxtRef.getCurrentBlockStart() == 0) {

        // Check if the table file has header to skip.
        Path filePath = this.ioCxtRef.getInputPath();
        PartitionDesc part = null;
        try {
          if (pathToPartitionInfo == null) {
            pathToPartitionInfo = Utilities
                .getMapWork(jobConf).getPathToPartitionInfo();
          }
          part = HiveFileFormatUtils
              .getPartitionDescFromPathRecursively(pathToPartitionInfo,
                  filePath, IOPrepareCache.get().getPartitionDescMap());
        } catch (AssertionError ae) {
          LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath()
              + " because " + ae.getMessage());
          part = null;
        } catch (Exception e) {
          LOG.info("Cannot get partition description from " + this.ioCxtRef.getInputPath()
              + " because " + e.getMessage());
          part = null;
        }
        TableDesc table = (part == null) ? null : part.getTableDesc();
        if (table != null) {
          headerCount = Utilities.getHeaderCount(table);
          footerCount = Utilities.getFooterCount(table, jobConf);
        }

        // If input contains header, skip header.
        if (!Utilities.skipHeader(recordReader, headerCount, (WritableComparable) key,
            (Writable) value)) {
          return false;
        }
        if (footerCount > 0) {
          footerBuffer = new FooterBuffer();
          if (!footerBuffer.initializeBuffer(jobConf, recordReader, footerCount,
              (WritableComparable) key, (Writable) value)) {
            return false;
          }
        }
      }
      if (footerBuffer == null) {
        // Table files don't have footer rows.
        return recordReader.next(key, value);
      } else {
        return footerBuffer.updateBuffer(jobConf, recordReader, (WritableComparable) key,
            (Writable) value);
      }
    } catch (Exception e) {
      return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf);
    }
  }

  /** Moves the RCFile reader to the sync point at/after position and clears its buffer. */
  private void sync(long position) throws IOException {
    ((RCFileRecordReader) recordReader).sync(position);
    ((RCFileRecordReader) recordReader).resetBuffer();
  }

  private long getSyncedPosition() throws IOException {
    return recordReader.getPos();
  }

  /**
   * This uses the name of the generic UDF being used by the filter to determine whether we should
   * perform a binary search, and what the comparisons we should use to signal the end of the
   * linear scan are.
   * @param genericUDFClassName
   * @throws IOException
   */
  private void setGenericUDFClassName(String genericUDFClassName) throws IOException {
    this.genericUDFClassName = genericUDFClassName;
    if (genericUDFClassName.equals(GenericUDFOPEqual.class.getName())) {
      stopComparisons.add(Comparison.GREATER);
    } else if (genericUDFClassName.equals(GenericUDFOPLessThan.class.getName())) {
      stopComparisons.add(Comparison.EQUAL);
      stopComparisons.add(Comparison.GREATER);
      // For < the matching rows start at the beginning, so binary search is pointless.
      if (this.getIOContext().isBinarySearching()) {
        beginLinearSearch();
      }
    } else if (genericUDFClassName.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
      stopComparisons.add(Comparison.GREATER);
      // Same as above: <= matches from the beginning of the data.
      if (this.getIOContext().isBinarySearching()) {
        beginLinearSearch();
      }
    } else if (genericUDFClassName.equals(GenericUDFOPGreaterThan.class.getName()) ||
        genericUDFClassName.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
      // Do nothing
    } else {
      // This is an unsupported operator
      LOG.debug(genericUDFClassName + " is not the name of a supported class. " +
          "Continuing linearly.");
      if (this.getIOContext().isBinarySearching()) {
        beginLinearSearch();
      }
    }
  }

  /**
   * This should be called after the binary search is finished and before the linear scan begins
   * @throws IOException
   */
  private void beginLinearSearch() throws IOException {
    sync(rangeStart);
    this.getIOContext().setBinarySearching(false);
    this.wasUsingSortedSearch = false;
  }

  /**
   * Returns true if the current comparison is in the list of stop comparisons, i.e. we've found
   * all records which won't be filtered
   * @return true if the current comparison is found
   */
  public boolean foundAllTargets() {
    if (this.getIOContext().getComparison() == null ||
        !stopComparisons.contains(this.getIOContext().getComparison())) {
      return false;
    }

    return true;
  }
}