org.apache.hadoop.hive.ql.exec.spark.SparkUtilities (hive-apache: shaded version of Apache Hive for Presto)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.spark;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import io.prestosql.hive.$internal.com.google.common.base.Preconditions;
import io.prestosql.hive.$internal.org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.spark.session.SparkSession;
import org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManager;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext;
import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hive.spark.client.SparkClientUtilities;
import org.apache.spark.SparkConf;
/**
* Contains utility methods used as part of Spark tasks.
*/
public class SparkUtilities {
public static HiveKey copyHiveKey(HiveKey key) {
HiveKey copy = new HiveKey();
copy.setDistKeyLength(key.getDistKeyLength());
copy.setHashCode(key.hashCode());
copy.set(key);
return copy;
}
public static BytesWritable copyBytesWritable(BytesWritable bw) {
BytesWritable copy = new BytesWritable();
copy.set(bw);
return copy;
}
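// Illustrative usage (a sketch; "pair" is a hypothetical record handed out by a shuffle iterator,
// which may reuse the underlying Writable objects, hence the defensive copies):
//   HiveKey keyCopy = SparkUtilities.copyHiveKey(pair._1());
//   BytesWritable valueCopy = SparkUtilities.copyBytesWritable(pair._2());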
/**
* Uploads a local file to HDFS.
*
* @param source the URI of the local file to upload
* @param conf the Hive configuration used to resolve the destination file system
* @return the URI of the uploaded file on HDFS
* @throws IOException if the file cannot be copied
*/
public static URI uploadToHDFS(URI source, HiveConf conf) throws IOException {
Path localFile = new Path(source.getPath());
Path remoteFile = new Path(SessionState.get().getSparkSession().getHDFSSessionDir(),
getFileName(source));
FileSystem fileSystem = FileSystem.get(remoteFile.toUri(), conf);
// Overwrite if the remote file already exists. Whether the file can be added
// on executors is up to Spark, i.e. spark.files.overwrite
fileSystem.copyFromLocalFile(false, true, localFile, remoteFile);
Path fullPath = fileSystem.getFileStatus(remoteFile).getPath();
return fullPath.toUri();
}
// checks if a resource has to be uploaded to HDFS for yarn-cluster mode
public static boolean needUploadToHDFS(URI source, SparkConf sparkConf) {
String master = sparkConf.get("spark.master");
String deployMode = sparkConf.contains("spark.submit.deployMode") ?
sparkConf.get("spark.submit.deployMode") : null;
return SparkClientUtilities.isYarnClusterMode(master, deployMode) &&
!(source.getScheme().equals("hdfs") || source.getScheme().equals("viewfs"));
}
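// Illustrative usage (a sketch with a hypothetical caller; "source" is the URI of a local resource):
//   URI resolved = needUploadToHDFS(source, sparkConf) ? uploadToHDFS(source, hiveConf) : source;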
private static String getFileName(URI uri) {
if (uri == null) {
return null;
}
String name = FilenameUtils.getName(uri.getPath());
return name;
}
public static boolean isDedicatedCluster(Configuration conf) {
String master = conf.get("spark.master");
return SparkClientUtilities.isYarnMaster(master) || SparkClientUtilities.isLocalMaster(master);
}
public static SparkSession getSparkSession(HiveConf conf,
SparkSessionManager sparkSessionManager) throws HiveException {
SparkSession sparkSession = SessionState.get().getSparkSession();
HiveConf sessionConf = SessionState.get().getConf();
// Spark configurations were updated: close the existing session.
// In case of async queries, or when confOverlay is not empty,
// sessionConf and conf are different objects.
if (sessionConf.getSparkConfigUpdated() || conf.getSparkConfigUpdated()) {
sparkSessionManager.closeSession(sparkSession);
sparkSession = null;
conf.setSparkConfigUpdated(false);
sessionConf.setSparkConfigUpdated(false);
}
sparkSession = sparkSessionManager.getSession(sparkSession, conf, true);
SessionState.get().setSparkSession(sparkSession);
return sparkSession;
}
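// Illustrative usage (a sketch, assuming the SparkSessionManagerImpl singleton from this package):
//   SparkSessionManager manager = SparkSessionManagerImpl.getInstance();
//   SparkSession session = SparkUtilities.getSparkSession(conf, manager);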
/**
* Generate a temporary path for dynamic partition pruning in the Spark branch.
* TODO: no longer need this if we use accumulator!
* @param basePath the parent directory for pruning output
* @param id the unique ID of the target work
* @return the temporary path for this pruning branch
*/
public static Path generateTmpPathForPartitionPruning(Path basePath, String id) {
return new Path(basePath, id);
}
/**
* Return the ID for this BaseWork, in String form.
* @param work the input BaseWork
* @return the unique ID for this BaseWork
*/
public static String getWorkId(BaseWork work) {
String workName = work.getName();
return workName.substring(workName.indexOf(" ") + 1);
}
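// For example, a BaseWork named "Map 1" yields the work ID "1".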
public static SparkTask createSparkTask(HiveConf conf) {
return (SparkTask) TaskFactory.get(
new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID)));
}
public static SparkTask createSparkTask(SparkWork work, HiveConf conf) {
return (SparkTask) TaskFactory.get(work);
}
/**
* Recursively find all operators under root that are of class clazz, or a subclass of clazz, and
* put them in result.
* @param result all operators under root that are of class clazz
* @param root the root operator under which all operators will be examined
* @param clazz the class to collect. Must NOT be null.
*/
public static void collectOp(Collection<Operator<?>> result, Operator<?> root, Class<?> clazz) {
Preconditions.checkArgument(clazz != null, "AssertionError: clazz should not be null");
if (root == null) {
return;
}
if (clazz.isAssignableFrom(root.getClass())) {
result.add(root);
}
for (Operator<?> child : root.getChildOperators()) {
collectOp(result, child, clazz);
}
}
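// Illustrative usage (a sketch; collects every TableScanOperator reachable from a hypothetical "root"):
//   Collection<Operator<?>> scans = new ArrayList<>();
//   SparkUtilities.collectOp(scans, root, TableScanOperator.class);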
/**
* Collect operators of type T starting from root. Matching operators will be put into result.
* Set seen can be used to skip search in certain branches.
*/
public static <T extends Operator<?>> void collectOp(Operator<?> root, Class<T> cls,
Collection<T> result, Set<Operator<?>> seen) {
if (seen.contains(root)) {
return;
}
Deque<Operator<?>> deque = new ArrayDeque<>();
deque.add(root);
while (!deque.isEmpty()) {
Operator<?> op = deque.remove();
seen.add(op);
if (cls.isInstance(op)) {
result.add((T) op);
}
if (op.getChildOperators() != null) {
for (Operator<?> child : op.getChildOperators()) {
if (!seen.contains(child)) {
deque.add(child);
}
}
}
}
}
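// Illustrative usage (a sketch; the shared "seen" set lets repeated calls skip already-visited branches):
//   Set<SparkPartitionPruningSinkOperator> sinks = new HashSet<>();
//   Set<Operator<?>> seen = new HashSet<>();
//   SparkUtilities.collectOp(root, SparkPartitionPruningSinkOperator.class, sinks, seen);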
/**
* Remove currTask from the child list of each of its parent tasks, and
* remove currTask from the parent list of each of its child tasks.
* @param currTask the SparkTask to detach
*/
public static void removeEmptySparkTask(SparkTask currTask) {
//remove currTask from parentTasks
ArrayList<Task> parTasks = new ArrayList<Task>();
parTasks.addAll(currTask.getParentTasks());
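// Work on a copied snapshot so that removing dependencies below does not mutate the list being iterated.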
Object[] parTaskArr = parTasks.toArray();
for (Object parTask : parTaskArr) {
((Task) parTask).removeDependentTask(currTask);
}
//remove currTask from childTasks
currTask.removeFromChildrenTasks();
}
// Find if there's any DPP sink branch of the branchingOP that is equivalent
// to the branch represented by the list.
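// The candidate branch is walked in lockstep with the list: every operator on the branch must be
// logically equal to (but not identical with) the corresponding operator in the list, and only the
// last operator may fan out. If the whole list matches, the branch's final operator is the reusable sink.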
public static SparkPartitionPruningSinkOperator findReusableDPPSink(
Operator<? extends OperatorDesc> branchingOP, List<Operator<? extends OperatorDesc>> list) {
for (Operator<? extends OperatorDesc> other : branchingOP.getChildOperators()) {
int i;
for (i = 0; i < list.size(); i++) {
if (other == list.get(i) || !other.logicalEquals(list.get(i))) {
break;
}
if (i != list.size() - 1) {
if (other.getChildOperators() == null || other.getChildOperators().size() != 1) {
break;
}
other = other.getChildOperators().get(0);
}
}
if (i == list.size()) {
return (SparkPartitionPruningSinkOperator) other;
}
}
return null;
}
/**
* For DPP sinks with a common join, we'll split the tree, and whatever is above the branching
* operator is computed multiple times. Therefore it may not be good for performance to support
* nested DPP sinks, i.e. one DPP sink depending on other DPP sinks.
* The following is an example:
*
*            TS         TS
*             |          |
*            ...        FIL
*             |          | \
*            RS         RS  SEL
*              \        /    |
*     TS         JOIN       GBY
*      |        /    \       |
*     RS      RS     SEL    DPP2
*       \     /       |
*        JOIN        GBY
*                     |
*                    DPP1
*
* where DPP1 depends on DPP2.
*
* To avoid such cases, we'll visit all the branching operators. If a branching operator has any
* DPP branches with a common join further down in its sub-tree, such branches will be removed.
* In the above example, the branch of DPP1 will be removed.
*/
public static void removeNestedDPP(OptimizeSparkProcContext procContext) {
Set<SparkPartitionPruningSinkOperator> allDPPs = new HashSet<>();
Set<Operator<?>> seen = new HashSet<>();
// collect all DPP sinks
for (TableScanOperator root : procContext.getParseContext().getTopOps().values()) {
SparkUtilities.collectOp(root, SparkPartitionPruningSinkOperator.class, allDPPs, seen);
}
// collect all branching operators
Set<Operator<?>> branchingOps = new HashSet<>();
for (SparkPartitionPruningSinkOperator dpp : allDPPs) {
branchingOps.add(dpp.getBranchingOp());
}
// remember the branching ops we have visited
Set<Operator<?>> visited = new HashSet<>();
for (Operator<?> branchingOp : branchingOps) {
if (!visited.contains(branchingOp)) {
visited.add(branchingOp);
seen.clear();
Set<SparkPartitionPruningSinkOperator> nestedDPPs = new HashSet<>();
for (Operator<?> branch : branchingOp.getChildOperators()) {
if (!isDirectDPPBranch(branch)) {
SparkUtilities.collectOp(branch, SparkPartitionPruningSinkOperator.class, nestedDPPs,
seen);
}
}
for (SparkPartitionPruningSinkOperator nestedDPP : nestedDPPs) {
visited.add(nestedDPP.getBranchingOp());
// if a DPP sink is with map join, the tree won't be split, so we don't have to remove it
if (!nestedDPP.isWithMapjoin()) {
OperatorUtils.removeBranch(nestedDPP);
}
}
}
}
}
// Returns whether op roots a branch of the pattern "SEL - GBY - DPP"
public static boolean isDirectDPPBranch(Operator<?> op) {
if (op instanceof SelectOperator && op.getChildOperators() != null
&& op.getChildOperators().size() == 1) {
op = op.getChildOperators().get(0);
if (op instanceof GroupByOperator && op.getChildOperators() != null
&& op.getChildOperators().size() == 1) {
op = op.getChildOperators().get(0);
return op instanceof SparkPartitionPruningSinkOperator;
}
}
return false;
}
public static String reverseDNSLookupURL(String url) throws UnknownHostException {
// Run a reverse DNS lookup on the URL
URI uri = URI.create(url);
InetAddress address = InetAddress.getByName(uri.getHost());
return uri.getScheme() + "://" + address.getCanonicalHostName() + ":" + uri.getPort();
}
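// Illustrative example (hypothetical addresses): "hdfs://10.0.0.5:8020" may become
// "hdfs://namenode.example.com:8020", depending on what the reverse DNS lookup returns.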
}