/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.plan;

import java.io.Serializable;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;

import com.google.common.base.Preconditions;

/**
 * This class encapsulates all the work objects that can be executed
 * in a single Spark job. Currently it's basically a tree with MapWork at the
 * roots and ReduceWork at all other nodes.
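 *
 * <p>An illustrative usage sketch (the work objects and the edge property below
 * are hypothetical placeholders; only the SparkWork API calls are from this
 * class):
 * <pre>{@code
 *   SparkWork sparkWork = new SparkWork("Spark");
 *   sparkWork.add(mapWork);
 *   sparkWork.add(reduceWork);
 *   sparkWork.connect(mapWork, reduceWork, shuffleEdgeProp);
 *   List<BaseWork> sorted = sparkWork.getAllWork(); // topological order
 * }</pre>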
 */
@SuppressWarnings("serial")
@Explain(displayName = "Spark")
public class SparkWork extends AbstractOperatorDesc {
  private static int counter;
  private final String name;

  private final Set<BaseWork> roots = new LinkedHashSet<BaseWork>();
  private final Set<BaseWork> leaves = new LinkedHashSet<BaseWork>();

  protected final Map<BaseWork, List<BaseWork>> workGraph =
      new HashMap<BaseWork, List<BaseWork>>();
  protected final Map<BaseWork, List<BaseWork>> invertedWorkGraph =
      new HashMap<BaseWork, List<BaseWork>>();
  protected final Map<Pair<BaseWork, BaseWork>, SparkEdgeProperty> edgeProperties =
      new HashMap<Pair<BaseWork, BaseWork>, SparkEdgeProperty>();

  private Map<String, List<String>> requiredCounterPrefix;

  private Map<BaseWork, BaseWork> cloneToWork;

  public SparkWork(String name) {
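    // The static counter makes each DAG name unique within the process;
    // e.g. a name of "Spark" yields "Spark:1", "Spark:2", and so on.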
    this.name = name + ":" + (++counter);
    cloneToWork = new HashMap<BaseWork, BaseWork>();
  }


  @Explain(displayName = "DagName")
  public String getName() {
    return name;
  }

  /**
   * @return a map of "vertex name" to BaseWork
   */
  @Explain(displayName = "Vertices")
  public Map<String, BaseWork> getWorkMap() {
    Map<String, BaseWork> result = new LinkedHashMap<String, BaseWork>();
    for (BaseWork w: getAllWork()) {
      result.put(w.getName(), w);
    }
    return result;
  }

  /**
   * @return a topologically sorted list of BaseWork
   */
  public List<BaseWork> getAllWork() {

    List<BaseWork> result = new LinkedList<BaseWork>();
    Set<BaseWork> seen = new HashSet<BaseWork>();

    for (BaseWork leaf: leaves) {
      // make sure all leaves are visited at least once
      visit(leaf, seen, result);
    }

    return result;
  }

  public Collection<BaseWork> getAllWorkUnsorted() {
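    // Note: this is a live view of the graph's vertex set, not a copy.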
    return workGraph.keySet();
  }

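  // visit walks depth-first towards the parents of the given node and appends
  // the node only after all of its parents have been added; this is what makes
  // the list returned by getAllWork() topologically sorted (parents first).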
  private void visit(BaseWork child, Set<BaseWork> seen, List<BaseWork> result) {
    if (seen.contains(child)) {
      // don't visit multiple times
      return;
    }

    seen.add(child);

    for (BaseWork parent: getParents(child)) {
      if (!seen.contains(parent)) {
        visit(parent, seen, result);
      }
    }

    result.add(child);
  }

  /**
   * Add all nodes in the collection without any connections.
   */
  public void addAll(Collection<BaseWork> c) {
    for (BaseWork w: c) {
      this.add(w);
    }
  }

  /**
   * Add all nodes in the collection without any connections.
   */
  public void addAll(BaseWork[] bws) {
    for (BaseWork w: bws) {
      this.add(w);
    }
  }

  /**
   * Whether the specified BaseWork is a vertex in this graph
   * @param w the BaseWork to check
   * @return whether specified BaseWork is in this graph
   */
  public boolean contains(BaseWork w) {
    return workGraph.containsKey(w);
  }

  /**
   * add creates a new node in the graph without any connections
   */
  public void add(BaseWork w) {
    if (workGraph.containsKey(w)) {
      return;
    }
    workGraph.put(w, new LinkedList<BaseWork>());
    invertedWorkGraph.put(w, new LinkedList<BaseWork>());
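    // a vertex with no edges yet is both a root and a leaf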
    roots.add(w);
    leaves.add(w);
  }

  /**
   * disconnect removes an edge between a and b. Both a and
   * b have to be in the graph. If there is no matching edge
   * no change happens.
   */
  public void disconnect(BaseWork a, BaseWork b) {
    workGraph.get(a).remove(b);
    invertedWorkGraph.get(b).remove(a);
    if (getParents(b).isEmpty()) {
      roots.add(b);
    }
    if (getChildren(a).isEmpty()) {
      leaves.add(a);
    }
    edgeProperties.remove(new ImmutablePair<BaseWork, BaseWork>(a, b));
  }

  /**
   * getRoots returns all nodes that do not have a parent.
   */
  public Set<BaseWork> getRoots() {
    return new LinkedHashSet<BaseWork>(roots);
  }

  /**
   * getLeaves returns all nodes that do not have a child.
   */
  public Set<BaseWork> getLeaves() {
    return new LinkedHashSet<BaseWork>(leaves);
  }

  public void setRequiredCounterPrefix(Map<String, List<String>> requiredCounterPrefix) {
    this.requiredCounterPrefix = requiredCounterPrefix;
  }

  public Map<String, List<String>> getRequiredCounterPrefix() {
    return requiredCounterPrefix;
  }

  /**
   * getParents returns all the nodes with edges leading into work
   */
  public List<BaseWork> getParents(BaseWork work) {
    Preconditions.checkArgument(invertedWorkGraph.containsKey(work),
        "AssertionError: expected invertedWorkGraph.containsKey(work) to be true");
    Preconditions.checkArgument(invertedWorkGraph.get(work) != null,
        "AssertionError: expected invertedWorkGraph.get(work) to be not null");
    return new LinkedList<BaseWork>(invertedWorkGraph.get(work));
  }

  /**
   * getChildren returns all the nodes with edges leading out of work
   */
  public List<BaseWork> getChildren(BaseWork work) {
    Preconditions.checkArgument(workGraph.containsKey(work),
        "AssertionError: expected workGraph.containsKey(work) to be true");
    Preconditions.checkArgument(workGraph.get(work) != null,
        "AssertionError: expected workGraph.get(work) to be not null");
    return new LinkedList<BaseWork>(workGraph.get(work));
  }

  /**
   * remove removes a node from the graph and removes all edges with
   * work as start or end point. No change to the graph if the node
   * doesn't exist.
   */
  public void remove(BaseWork work) {
    if (!workGraph.containsKey(work)) {
      return;
    }

    List<BaseWork> children = getChildren(work);
    List<BaseWork> parents = getParents(work);

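    // detach 'work' from both adjacency maps; any child left without parents
    // becomes a root, and any parent left without children becomes a leaf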
    for (BaseWork w: children) {
      edgeProperties.remove(new ImmutablePair<BaseWork, BaseWork>(work, w));
      invertedWorkGraph.get(w).remove(work);
      if (invertedWorkGraph.get(w).size() == 0) {
        roots.add(w);
      }
    }

    for (BaseWork w: parents) {
      edgeProperties.remove(new ImmutablePair<BaseWork, BaseWork>(w, work));
      workGraph.get(w).remove(work);
      if (workGraph.get(w).size() == 0) {
        leaves.add(w);
      }
    }

    roots.remove(work);
    leaves.remove(work);

    workGraph.remove(work);
    invertedWorkGraph.remove(work);
  }

  /**
   * returns the edge type connecting work a and b
   */
  public SparkEdgeProperty getEdgeProperty(BaseWork a, BaseWork b) {
    return edgeProperties.get(new ImmutablePair<BaseWork, BaseWork>(a, b));
  }

  /**
   * connect adds an edge between a and b. Both nodes have
   * to be added prior to calling connect.
   * @param a the source (parent) work of the edge
   * @param b the target (child) work of the edge
   * @param edgeProp the properties of the new edge
   */
  public void connect(BaseWork a, BaseWork b, SparkEdgeProperty edgeProp) {
    workGraph.get(a).add(b);
    invertedWorkGraph.get(b).add(a);
    roots.remove(b);
    leaves.remove(a);
    ImmutablePair<BaseWork, BaseWork> workPair = new ImmutablePair<BaseWork, BaseWork>(a, b);
    edgeProperties.put(workPair, edgeProp);
  }

  /*
   * Dependency is a helper class used for EXPLAIN output.
   */
  public class Dependency implements Serializable, Comparable<Dependency> {
    public BaseWork w;
    public SparkEdgeProperty prop;

    @Explain(displayName = "Name")
    public String getName() {
      return w.getName();
    }

    @Explain(displayName = "Shuffle Type")
    public String getShuffleType() {
      return prop.getShuffleType();
    }

    @Explain(displayName = "Number of Partitions")
    public String getNumPartitions() {
      return Integer.toString(prop.getNumPartitions());
    }

    @Override
    public int compareTo(Dependency o) {
      int compare = getName().compareTo(o.getName());
      if (compare == 0) {
        compare = getShuffleType().compareTo(o.getShuffleType());
      }
      return compare;
    }
  }

  /**
   * Task names usually sort in natural order, which matches the topological
   * order in most cases. With Spark, however, some tasks may be converted and
   * given new names, so the natural order can differ from the topological
   * order. This class ensures that tasks are sorted deterministically by
   * topological order.
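   * For example, given a recorded dependency chain "Reducer 10" -> "Reducer 2"
   * -> "Map 1" (illustrative names), "Reducer 10" sorts after "Reducer 2" even
   * though it precedes it in natural string order.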
   */
  private static class ComparableName implements Comparable<ComparableName> {
    private final Map<String, String> dependencies;
    private final String name;

    ComparableName(Map<String, String> dependencies, String name) {
      this.dependencies = dependencies;
      this.name = name;
    }

    /**
     * Check if task n1 depends on task n2
     */
    boolean dependsOn(String n1, String n2) {
      for (String p = dependencies.get(n1); p != null; p = dependencies.get(p)) {
        if (p.equals(n2)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Get the depth of task n, i.e. the number of ancestors in its recorded dependency chain
     */
    int getDepth(String n) {
      int depth = 0;
      for (String p = dependencies.get(n); p != null; p = dependencies.get(p)) {
        depth++;
      }
      return depth;
    }

    @Override
    public int compareTo(ComparableName o) {
      if (dependsOn(name, o.name)) {
        // this depends on o
        return 1;
      }
      if (dependsOn(o.name, name)) {
        // o depends on this
        return -1;
      }
      // No dependency, check depth
      int d1 = getDepth(name);
      int d2 = getDepth(o.name);
      if (d1 == d2) {
        // Same depth, using natural order
        return name.compareTo(o.name);
      }
      // The deeper one sorts later, i.e. farther from the top
      return d1 > d2 ? 1 : -1;
    }

    @Override
    public String toString() {
      return name;
    }
  }

  @Explain(displayName = "Edges")
  public Map<ComparableName, List<Dependency>> getDependencyMap() {
    Map<String, String> allDependencies = new HashMap<String, String>();
    Map<ComparableName, List<Dependency>> result =
      new LinkedHashMap<ComparableName, List<Dependency>>();
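    // allDependencies records, for each work name, the name of one of its
    // parents; ComparableName follows these links so the keys of the returned
    // map are ordered by dependency rather than by plain natural order.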
    for (BaseWork baseWork : getAllWork()) {
      if (invertedWorkGraph.get(baseWork) != null && invertedWorkGraph.get(baseWork).size() > 0) {
        List<Dependency> dependencies = new LinkedList<Dependency>();
        for (BaseWork d : invertedWorkGraph.get(baseWork)) {
          allDependencies.put(baseWork.getName(), d.getName());
          Dependency dependency = new Dependency();
          dependency.w = d;
          dependency.prop = getEdgeProperty(d, baseWork);
          dependencies.add(dependency);
        }
        if (!dependencies.isEmpty()) {
          Collections.sort(dependencies);
          result.put(new ComparableName(allDependencies,
            baseWork.getName()), dependencies);
        }
      }
    }
    return result;
  }

  /**
   * @return all reduce works of this spark work, in sorted order.
   */
  public List<ReduceWork> getAllReduceWork() {
    List<ReduceWork> result = new ArrayList<ReduceWork>();
    for (BaseWork work : getAllWork()) {
      if (work instanceof ReduceWork) {
        result.add((ReduceWork) work);
      }
    }
    return result;
  }

  public Map<BaseWork, BaseWork> getCloneToWork() {
    return cloneToWork;
  }

  public void setCloneToWork(Map<BaseWork, BaseWork> cloneToWork) {
    this.cloneToWork = cloneToWork;
  }
}



