/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.physical;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
import org.apache.hadoop.hive.ql.parse.SemanticException;

/*
* Convert tasks involving JOIN into MAPJOIN.
* If hive.auto.convert.join is set to true, tasks that involve a join are converted.
* Consider the query:
* select .... from T1 join T2 on T1.key = T2.key join T3 on T1.key = T3.key
*
* There is a map-reduce task which performs a 3-way join (T1, T2, T3).
* This task would be converted to a conditional task with 4 children:
* a. Mapjoin considering T1 as the big table
* b. Mapjoin considering T2 as the big table
* c. Mapjoin considering T3 as the big table
* d. Map-reduce join (the original task).
*
* Note that the sizes of all the inputs may not be available at compile time, so the branch to
* execute is chosen from the above at runtime, based on the actual input sizes.
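*
* For instance, this conditional-task conversion is driven by the flag named above, shown here
* as a session-level setting purely for illustration:
*
*   set hive.auto.convert.join = true;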
*
* However, if hive.auto.convert.join.noconditionaltask is set to true, and the total size of any
* n-1 of the tables is smaller than hive.auto.convert.join.noconditionaltask.size, then a mapjoin
* task is created directly instead of the conditional task. For the query above, if the combined
* size of T1 and T2 is below the threshold, the task is converted to a mapjoin task with T3 as
* the big table.
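*
* As an illustration, the two settings named above might be configured as follows (the size is a
* byte threshold, and the value shown is only an example, not a recommendation):
*
*   set hive.auto.convert.join.noconditionaltask = true;
*   set hive.auto.convert.join.noconditionaltask.size = 10000000;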
*
* In this case, further optimization is performed by merging 2 consecutive map-only jobs.
* Consider the query:
* select ... from T1 join T2 on T1.key1 = T2.key1 join T3 on T1.key2 = T3.key2
*
* Initially, the plan would consist of 2 map-reduce jobs: one to perform the join of T1 and T2,
* followed by another to join the result with T3. After the optimization, both of these tasks
* would be converted to map-only tasks, and the 2 map-only jobs would then be merged into a
* single map-only job. As a followup (HIVE-3952), it would also be possible to merge a map-only
* task with a map-reduce task.
* Consider the query:
* select T1.key2, count(*) from T1 join T2 on T1.key1 = T2.key1 group by T1.key2;
* Initially, the plan would consist of 2 map-reduce jobs: one to perform the join of T1 and T2,
* followed by another to perform the group-by on the result. After the optimization, the join
* task would be converted to a map-only task. After HIVE-3952, that map-only task would be
* merged with the map-reduce task to create a single map-reduce task.
*/
public class CommonJoinResolver implements PhysicalPlanResolver {

  @Override
  public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
    // create the dispatcher and the task-graph walker
    Dispatcher disp = new CommonJoinTaskDispatcher(pctx);
    TaskGraphWalker ogw = new TaskGraphWalker(disp);

    // seed the walk with all the root tasks
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.rootTasks);

    // walk the task tree; the dispatcher converts eligible common-join tasks to mapjoin tasks
    ogw.startWalking(topNodes, null);
    return pctx;
  }
}
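
/*
 * For illustration only: a PhysicalPlanResolver such as this one is normally run as part of a
 * chain of resolvers over the same PhysicalContext during physical optimization. The loop below
 * is a simplified, hypothetical sketch of that pattern (not the actual optimizer code; the
 * variable names are invented):
 *
 *   PhysicalContext pctx = ...;  // produced earlier in compilation
 *   List<PhysicalPlanResolver> resolvers = new ArrayList<PhysicalPlanResolver>();
 *   resolvers.add(new CommonJoinResolver());
 *   for (PhysicalPlanResolver resolver : resolvers) {
 *     pctx = resolver.resolve(pctx);  // each resolver may rewrite the task tree
 *   }
 */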