/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.JobConf;
/**
* Table Scan Operator. If the data is coming from the map-reduce framework, just
* forward it. This operator is also needed as part of local work when data is not
* being read through the map-reduce framework.
**/
public class TableScanOperator extends Operator<TableScanDesc> implements
Serializable, VectorizationContextRegion {
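// Context sketch (not taken from this file): the table scan is the leaf of the operator
// tree, so a query such as "SELECT key FROM src WHERE key > 10" typically shows up in
// EXPLAIN as TableScan -> Filter Operator -> Select Operator -> File Output Operator,
// with this operator forwarding each row handed to it by the framework into that chain.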
private static final long serialVersionUID = 1L;
private VectorizationContext taskVectorizationContext;
protected transient JobConf jc;
private transient boolean inputFileChanged = false;
private TableDesc tableDesc;
private transient Stat currentStat;
private transient Map<String, Stat> stats;
private transient int rowLimit = -1;
private transient int currCount = 0;
// insideView tells whether this TableScan is inside a view or not.
private transient boolean insideView;
private transient boolean vectorized;
private String defaultPartitionName;
/**
* These values are saved during MapWork, FetchWork, etc. preparation and later added to the
* JobConf of each task (see the usage sketch after the accessors below).
*/
private String schemaEvolutionColumns;
private String schemaEvolutionColumnsTypes;
public TableDesc getTableDescSkewJoin() {
return tableDesc;
}
public void setTableDescSkewJoin(TableDesc tableDesc) {
this.tableDesc = tableDesc;
}
public void setSchemaEvolution(String schemaEvolutionColumns, String schemaEvolutionColumnsTypes) {
this.schemaEvolutionColumns = schemaEvolutionColumns;
this.schemaEvolutionColumnsTypes = schemaEvolutionColumnsTypes;
}
public String getSchemaEvolutionColumns() {
return schemaEvolutionColumns;
}
public String getSchemaEvolutionColumnsTypes() {
return schemaEvolutionColumnsTypes;
}
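// Usage sketch (hypothetical; the JobConf key names below are assumptions, not taken from
// this file): during MapWork/FetchWork preparation the planner would do something like
//   tso.setSchemaEvolution("id,name,ts", "int,string,timestamp");
// and later, when each task's JobConf is built, push the saved values down, e.g.
//   jobConf.set("schema.evolution.columns", tso.getSchemaEvolutionColumns());
//   jobConf.set("schema.evolution.columns.types", tso.getSchemaEvolutionColumnsTypes());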
/**
* Other than gathering statistics for the ANALYZE command, the table scan operator
* does nothing special beyond forwarding the row, since the table data is always read
* by the mapper as part of the map-reduce framework. If that assumption ever stops
* being true, i.e. table data is no longer read only by the mapper, this operator
* will be enhanced to read the table itself.
**/
@Override
public void process(Object row, int tag) throws HiveException {
if (rowLimit >= 0) {
if (checkSetDone(row, tag)) {
return;
}
}
if (conf != null && conf.isGatherStats()) {
gatherStats(row);
}
forward(row, inputObjInspectors[tag], vectorized);
}
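// Note (not from this file): conf.isGatherStats() is typically true when the query is an
// "ANALYZE TABLE ... COMPUTE STATISTICS" statement, in which case each row is counted into
// the per-partition Stat objects by gatherStats() before being forwarded downstream.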
private boolean checkSetDone(Object row, int tag) {
if (row instanceof VectorizedRowBatch) {
// We need to check with 'instanceof' instead of just checking the
// 'vectorized' flag, because the row can be a VectorizedRowBatch when
// the FetchOptimizer kicks in even if the operator pipeline is not
// vectorized
VectorizedRowBatch batch = (VectorizedRowBatch) row;
if (currCount >= rowLimit) {
setDone(true);
return true;
}
if (currCount + batch.size > rowLimit) {
batch.size = rowLimit - currCount;
}
currCount += batch.size;
} else if (currCount++ >= rowLimit) {
setDone(true);
return true;
}
return false;
}
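// Worked example of the limit handling above (illustrative): with rowLimit = 10 and
// 1024-row vectorized batches, the first call shrinks batch.size to 10 and advances
// currCount to 10; the next call sees currCount >= rowLimit, calls setDone(true) and
// returns true, so no further rows are forwarded. The non-vectorized branch applies the
// same cutoff one row at a time via currCount++.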
// Change the table partition for collecting stats
@Override
public void cleanUpInputFileChangedOp() throws HiveException {
inputFileChanged = true;
// If the file name to bucket number mapping is maintained, store the bucket number
// in the execution context. This is needed for the following scenario:
// insert overwrite table T1 select * from T2;
// where T1 and T2 are sorted/bucketed by the same keys into the same number of buckets
// Although one mapper per file is used (BucketizedHiveInputFormat), it is possible that
// any mapper can pick up any file (depending on the size of the files). The bucket number
// corresponding to the input file is stored to name the output bucket file appropriately.
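// Illustrative example (the file names are assumptions): given a mapping such as
// {"000000_0" -> 0, "000001_0" -> 1}, a mapper that happens to read 000001_0 records
// fileId "1", so its output bucket file is named with the same bucket number.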
Map<String, Integer> bucketNameMapping =
(conf != null) ? conf.getBucketFileNameMapping() : null;
if ((bucketNameMapping != null) && (!bucketNameMapping.isEmpty())) {
Path currentInputPath = getExecContext().getCurrentInputPath();
getExecContext().setFileId(Integer.toString(bucketNameMapping.get(
currentInputPath.getName())));
}
}
private void gatherStats(Object row) {
// first row/call or a new partition
if ((currentStat == null) || inputFileChanged) {
String partitionSpecs;
inputFileChanged = false;
if (conf.getPartColumns() == null || conf.getPartColumns().size() == 0) {
partitionSpecs = ""; // non-partitioned
} else {
// Figure out the partition spec from the input.
// This is only done once for the first row (when stat == null)
// since all rows in the same mapper should be from the same partition.
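// Illustrative example (the path is an assumption): an input path ending in
// .../ds=2008-04-08/hr=11/000000_0 yields the partition spec "ds=2008-04-08/hr=11",
// and row counts for that partition are accumulated under that key in the stats map.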
List