/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_ZEROCOPY;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.DiskRange;
import org.apache.hadoop.hive.common.DiskRangeList;
import org.apache.hadoop.hive.common.DiskRangeList.DiskRangeListCreateHelper;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO;
import org.apache.hadoop.hive.ql.io.orc.RecordReaderUtils.ByteBufferAllocatorPool;
import org.apache.hadoop.hive.ql.io.orc.TreeReaderFactory.TreeReader;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.shims.HadoopShims.ZeroCopyReaderShim;
import org.apache.hadoop.io.Text;
/**
 * Reads rows out of the stripes of a single ORC file, optionally filtering
 * row groups with a search argument (predicate pushdown).
 */
class RecordReaderImpl implements RecordReader {
  static final Log LOG = LogFactory.getLog(RecordReaderImpl.class);
  private static final boolean isLogDebugEnabled = LOG.isDebugEnabled();
  private final Path path;
  private final FSDataInputStream file;
  // number of rows that precede the first stripe this reader covers
  private final long firstRow;
  // only the stripes that overlap [offset, maxOffset) from the options
  private final List<StripeInformation> stripes =
      new ArrayList<StripeInformation>();
  private OrcProto.StripeFooter stripeFooter;
  private final long totalRowCount;
  private final CompressionCodec codec;
  private final List<OrcProto.Type> types;
  private final int bufferSize;
  private final boolean[] included;
  private final long rowIndexStride;
  // position of the next row within the current stripe
  private long rowInStripe = 0;
  // index into this.stripes; -1 means no stripe has been opened yet
  private int currentStripe = -1;
  private long rowBaseInStripe = 0;
  private long rowCountInStripe = 0;
  private final Map<StreamName, InStream> streams =
      new HashMap<StreamName, InStream>();
  DiskRangeList bufferChunks = null;
  private final TreeReader reader;
  private final OrcProto.RowIndex[] indexes;
  private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
  private final SargApplier sargApp;
  // an array about which row groups aren't skipped
  private boolean[] includedRowGroups = null;
  private final Configuration conf;
  private final MetadataReader metadata;
  private final ByteBufferAllocatorPool pool = new ByteBufferAllocatorPool();
  private final ZeroCopyReaderShim zcr;
/**
 * Pairs the row group indexes with the bloom filter indexes read for the
 * columns of a stripe. Array slots are per column id; entries may be null
 * for columns whose index streams were not read.
 */
public final static class Index {
  OrcProto.RowIndex[] rowGroupIndex;
  OrcProto.BloomFilterIndex[] bloomFilterIndex;

  Index(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) {
    this.rowGroupIndex = rgIndex;
    this.bloomFilterIndex = bfIndex;
  }

  public OrcProto.RowIndex[] getRowGroupIndex() {
    return rowGroupIndex;
  }

  public OrcProto.BloomFilterIndex[] getBloomFilterIndex() {
    return bloomFilterIndex;
  }

  public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) {
    this.rowGroupIndex = rowGroupIndex;
  }

  /** Mutator added for symmetry with {@link #setRowGroupIndex}. */
  public void setBloomFilterIndex(OrcProto.BloomFilterIndex[] bloomFilterIndex) {
    this.bloomFilterIndex = bloomFilterIndex;
  }
}
/**
 * Given a list of column names, find the given column and return the index.
 *
 * @param columnNames the list of potential column names
 * @param columnName the column name to look for
 * @param rootColumn offset the result with the rootColumn
 * @return the column number or -1 if the column wasn't found
 */
static int findColumns(String[] columnNames,
                       String columnName,
                       int rootColumn) {
  int position = 0;
  for (String candidate : columnNames) {
    if (columnName.equals(candidate)) {
      return rootColumn + position;
    }
    ++position;
  }
  // not present among the candidates
  return -1;
}
/**
 * Find the mapping from predicate leaves to columns.
 *
 * @param sargLeaves the search argument leaves that we need to map
 * @param columnNames the names of the columns
 * @param rootColumn the offset of the top level row, which offsets the
 *                   result
 * @return an array mapping the sarg leaves to concrete column numbers;
 *         a leaf whose column is not found maps to -1
 */
public static int[] mapSargColumns(List<PredicateLeaf> sargLeaves,
                                   String[] columnNames,
                                   int rootColumn) {
  int[] result = new int[sargLeaves.size()];
  Arrays.fill(result, -1);
  for (int i = 0; i < result.length; ++i) {
    String colName = sargLeaves.get(i).getColumnName();
    result[i] = findColumns(columnNames, colName, rootColumn);
  }
  return result;
}
/**
 * Creates a reader over the subset of the file's stripes selected by the
 * options' byte range. Opens the file, builds the tree reader for the
 * included columns, sets up optional predicate pushdown and zero-copy
 * reads, then positions on the first row.
 *
 * @param stripes all stripes of the file, in file order
 * @param fileSystem file system used to open {@code path}
 * @param path the ORC file to read
 * @param options read options (byte range, included columns, search argument)
 * @param types the file's type tree from the footer
 * @param codec compression codec of the file, or null if uncompressed
 * @param bufferSize compression buffer size
 * @param strideRate row index stride (0 means no row indexes)
 * @param conf Hadoop configuration; see note below about null
 * @throws IOException if the file cannot be opened or read
 */
protected RecordReaderImpl(List stripes,
FileSystem fileSystem,
Path path,
Reader.Options options,
List types,
CompressionCodec codec,
int bufferSize,
long strideRate,
Configuration conf
) throws IOException {
this.path = path;
this.file = fileSystem.open(path);
this.codec = codec;
this.types = types;
this.bufferSize = bufferSize;
this.included = options.getInclude();
this.conf = conf;
this.rowIndexStride = strideRate;
this.metadata = new MetadataReader(file, codec, bufferSize, types.size());
// Predicate pushdown only works when row indexes exist (strideRate != 0).
SearchArgument sarg = options.getSearchArgument();
if (sarg != null && strideRate != 0) {
sargApp = new SargApplier(
sarg, options.getColumnNames(), strideRate, types, included.length);
} else {
sargApp = null;
}
// Select the stripes overlapping [offset, maxOffset); count the rows that
// come before the first selected stripe so row numbers can be made global.
long rows = 0;
long skippedRows = 0;
long offset = options.getOffset();
long maxOffset = options.getMaxOffset();
for(StripeInformation stripe: stripes) {
long stripeStart = stripe.getOffset();
if (offset > stripeStart) {
skippedRows += stripe.getNumberOfRows();
} else if (stripeStart < maxOffset) {
this.stripes.add(stripe);
rows += stripe.getNumberOfRows();
}
}
final boolean zeroCopy = (conf != null)
&& (HiveConf.getBoolVar(conf, HIVE_ORC_ZEROCOPY));
zcr = zeroCopy ? RecordReaderUtils.createZeroCopyShim(file, codec, pool) : null;
firstRow = skippedRows;
totalRowCount = rows;
// NOTE(review): the zero-copy check above guards against a null conf, but
// this call dereferences conf unconditionally — confirm conf is never null.
boolean skipCorrupt = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_ORC_SKIP_CORRUPT_DATA);
reader = RecordReaderFactory.createTreeReader(0, conf, types, included, skipCorrupt);
indexes = new OrcProto.RowIndex[types.size()];
bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
// position on the first row (loads the first stripe if there is one)
advanceToNextRow(reader, 0L, true);
}
/**
 * A PositionProvider backed by a single row index entry; hands out the
 * recorded stream positions one at a time, in order.
 */
public static final class PositionProviderImpl implements PositionProvider {
  private final OrcProto.RowIndexEntry entry;
  private int index;

  public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
    this(entry, 0);
  }

  public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
    this.entry = entry;
    this.index = startPos;
  }

  @Override
  public long getNext() {
    // consume the current position and advance
    long position = entry.getPositions(index);
    index += 1;
    return position;
  }
}
/**
 * Reads the footer of the given stripe via the metadata reader.
 *
 * @param stripe the stripe whose footer should be read
 * @return the deserialized stripe footer
 * @throws IOException if the footer cannot be read
 */
OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
return metadata.readStripeFooter(stripe);
}
/**
 * Relative position of a point with respect to a [min, max] statistics
 * range, as computed by {@code compareToRange}.
 */
static enum Location {
BEFORE, MIN, MIDDLE, MAX, AFTER
}
/**
 * Given a point and min and max, determine if the point is before, at the
 * min, in the middle, at the max, or after the range.
 *
 * @param point the point to test
 * @param min the minimum point
 * @param max the maximum point
 * @param <T> the type of the comparison
 * @return the location of the point
 */
static <T> Location compareToRange(Comparable<T> point, T min, T max) {
  int minCompare = point.compareTo(min);
  if (minCompare < 0) {
    return Location.BEFORE;
  } else if (minCompare == 0) {
    return Location.MIN;
  }
  int maxCompare = point.compareTo(max);
  if (maxCompare > 0) {
    return Location.AFTER;
  } else if (maxCompare == 0) {
    return Location.MAX;
  }
  return Location.MIDDLE;
}
/**
 * Get the maximum value out of an index entry.
 *
 * @param index the index entry
 * @return the object for the maximum value or null if there isn't one
 */
static Object getMax(ColumnStatistics index) {
  if (index instanceof IntegerColumnStatistics) {
    return ((IntegerColumnStatistics) index).getMaximum();
  }
  if (index instanceof DoubleColumnStatistics) {
    return ((DoubleColumnStatistics) index).getMaximum();
  }
  if (index instanceof StringColumnStatistics) {
    return ((StringColumnStatistics) index).getMaximum();
  }
  if (index instanceof DateColumnStatistics) {
    return ((DateColumnStatistics) index).getMaximum();
  }
  if (index instanceof DecimalColumnStatistics) {
    return ((DecimalColumnStatistics) index).getMaximum();
  }
  if (index instanceof TimestampColumnStatistics) {
    return ((TimestampColumnStatistics) index).getMaximum();
  }
  if (index instanceof BooleanColumnStatistics) {
    // any true value present means the maximum boolean is TRUE
    return ((BooleanColumnStatistics) index).getTrueCount() != 0
        ? Boolean.TRUE : Boolean.FALSE;
  }
  return null;
}
/**
 * Get the minimum value out of an index entry.
 *
 * @param index the index entry
 * @return the object for the minimum value or null if there isn't one
 */
static Object getMin(ColumnStatistics index) {
  if (index instanceof IntegerColumnStatistics) {
    return ((IntegerColumnStatistics) index).getMinimum();
  }
  if (index instanceof DoubleColumnStatistics) {
    return ((DoubleColumnStatistics) index).getMinimum();
  }
  if (index instanceof StringColumnStatistics) {
    return ((StringColumnStatistics) index).getMinimum();
  }
  if (index instanceof DateColumnStatistics) {
    return ((DateColumnStatistics) index).getMinimum();
  }
  if (index instanceof DecimalColumnStatistics) {
    return ((DecimalColumnStatistics) index).getMinimum();
  }
  if (index instanceof TimestampColumnStatistics) {
    return ((TimestampColumnStatistics) index).getMinimum();
  }
  if (index instanceof BooleanColumnStatistics) {
    // any false value present means the minimum boolean is FALSE
    return ((BooleanColumnStatistics) index).getFalseCount() != 0
        ? Boolean.FALSE : Boolean.TRUE;
  }
  return null;
}
/**
 * Evaluate a predicate with respect to the statistics from the column
 * that is referenced in the predicate.
 *
 * @param statsProto the statistics for the column mentioned in the predicate
 * @param predicate the leaf predicate we need to evaluate
 * @param bloomFilter the serialized bloom filter for the column, or null
 * @return the set of truth values that may be returned for the given
 *         predicate
 */
static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
    PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
  ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
  BloomFilterIO bf = (bloomFilter == null) ? null : new BloomFilterIO(bloomFilter);
  return evaluatePredicateRange(predicate, getMin(cs), getMax(cs),
      cs.hasNull(), bf);
}
/**
 * Evaluate a predicate with respect to the statistics from the column
 * that is referenced in the predicate.
 *
 * @param stats the statistics for the column mentioned in the predicate
 * @param predicate the leaf predicate we need to evaluate
 * @param bloomFilter the bloom filter for the column, or null
 * @return the set of truth values that may be returned for the given
 *         predicate
 */
static TruthValue evaluatePredicate(ColumnStatistics stats,
    PredicateLeaf predicate, BloomFilterIO bloomFilter) {
  return evaluatePredicateRange(predicate, getMin(stats), getMax(stats),
      stats.hasNull(), bloomFilter);
}
/**
 * Evaluates a predicate against a column's [min, max] range and optional
 * bloom filter, returning the set of truth values the predicate may take
 * for rows in that range. Conversion failures are treated as "can't prune"
 * rather than errors.
 *
 * @param predicate the leaf predicate to evaluate
 * @param min the column minimum, or null if the column had no values
 * @param max the column maximum, or null if the column had no values
 * @param hasNull whether the column contains nulls
 * @param bloomFilter the column's bloom filter, or null if unavailable
 * @return the possible truth values for the predicate
 */
static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
Object max, boolean hasNull, BloomFilterIO bloomFilter) {
// if we didn't have any values, everything must have been null
if (min == null) {
if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
return TruthValue.YES;
} else {
return TruthValue.NULL;
}
}
TruthValue result;
try {
// Predicate object and stats objects are converted to the type of the predicate object.
Object baseObj = predicate.getLiteral();
Object minValue = getBaseObjectForComparison(predicate.getType(), min);
Object maxValue = getBaseObjectForComparison(predicate.getType(), max);
Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj);
result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull);
// only consult the bloom filter if the min/max check couldn't already
// rule the row group out
if (bloomFilter != null && result != TruthValue.NO_NULL && result != TruthValue.NO) {
result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull);
}
// in case failed conversion, return the default YES_NO_NULL truth value
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Exception when evaluating predicate. Skipping ORC PPD." +
" Exception: " + ExceptionUtils.getStackTrace(e));
}
// deliberately swallow: an unconvertible predicate must not fail the
// query — fall back to "maybe", so no row group is pruned
if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) {
result = TruthValue.YES_NO;
} else {
result = TruthValue.YES_NO_NULL;
}
}
return result;
}
private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj,
Object minValue,
Object maxValue,
boolean hasNull) {
Location loc;
switch (predicate.getOperator()) {
case NULL_SAFE_EQUALS:
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (loc == Location.BEFORE || loc == Location.AFTER) {
return TruthValue.NO;
} else {
return TruthValue.YES_NO;
}
case EQUALS:
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (minValue.equals(maxValue) && loc == Location.MIN) {
return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
} else if (loc == Location.BEFORE || loc == Location.AFTER) {
return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
} else {
return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
}
case LESS_THAN:
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (loc == Location.AFTER) {
return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
} else if (loc == Location.BEFORE || loc == Location.MIN) {
return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
} else {
return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
}
case LESS_THAN_EQUALS:
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (loc == Location.AFTER || loc == Location.MAX) {
return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
} else if (loc == Location.BEFORE) {
return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
} else {
return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
}
case IN:
if (minValue.equals(maxValue)) {
// for a single value, look through to see if that value is in the
// set
for (Object arg : predicate.getLiteralList()) {
predObj = getBaseObjectForComparison(predicate.getType(), arg);
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (loc == Location.MIN) {
return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
}
}
return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
} else {
// are all of the values outside of the range?
for (Object arg : predicate.getLiteralList()) {
predObj = getBaseObjectForComparison(predicate.getType(), arg);
loc = compareToRange((Comparable) predObj, minValue, maxValue);
if (loc == Location.MIN || loc == Location.MIDDLE ||
loc == Location.MAX) {
return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
}
}
return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
}
case BETWEEN:
List