All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin Maven / Gradle / Ivy

Go to download

Hive is a data warehouse infrastructure built on top of Hadoop; see http://wiki.apache.org/hadoop/Hive

There is a newer version: 0.11.0-shark-0.9.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.plan;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Task;

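/**
 * ConditionalResolverCommonJoin.
 *
 * Chooses between running the common join task and running one of the
 * per-alias map join tasks, based on the input sizes of the join aliases.
 */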
public class ConditionalResolverCommonJoin implements ConditionalResolver, Serializable {
  private static final long serialVersionUID = 1L;

  /**
   * ConditionalResolverSkewJoinCtx.
   *
   */
  public static class ConditionalResolverCommonJoinCtx implements Serializable {
    private static final long serialVersionUID = 1L;

    private HashMap> aliasToTask;
    HashMap> pathToAliases;
    HashMap aliasToKnownSize;
    private Task commonJoinTask;

    private String localTmpDir;
    private String hdfsTmpDir;

    public ConditionalResolverCommonJoinCtx() {
    }

    public HashMap> getAliasToTask() {
      return aliasToTask;
    }

    public void setAliasToTask(HashMap> aliasToTask) {
      this.aliasToTask = aliasToTask;
    }

    public Task getCommonJoinTask() {
      return commonJoinTask;
    }

    public void setCommonJoinTask(Task commonJoinTask) {
      this.commonJoinTask = commonJoinTask;
    }

    public HashMap getAliasToKnownSize() {
      return aliasToKnownSize;
    }

    public void setAliasToKnownSize(HashMap aliasToKnownSize) {
      this.aliasToKnownSize = aliasToKnownSize;
    }

    public HashMap> getPathToAliases() {
      return pathToAliases;
    }

    public void setPathToAliases(HashMap> pathToAliases) {
      this.pathToAliases = pathToAliases;
    }

    public String getLocalTmpDir() {
      return localTmpDir;
    }

    public void setLocalTmpDir(String localTmpDir) {
      this.localTmpDir = localTmpDir;
    }

    public String getHdfsTmpDir() {
      return hdfsTmpDir;
    }

    public void setHdfsTmpDir(String hdfsTmpDir) {
      this.hdfsTmpDir = hdfsTmpDir;
    }
  }

  /**
   * Creates a resolver. The constructor body is empty; everything the
   * resolver needs is passed to getTasks.
   */
  public ConditionalResolverCommonJoin() {
  }

  @Override
  public List> getTasks(HiveConf conf, Object objCtx) {
    ConditionalResolverCommonJoinCtx ctx = (ConditionalResolverCommonJoinCtx) objCtx;
    List> resTsks = new ArrayList>();

    // get aliasToPath and pass it to the heuristic
    HashMap> pathToAliases = ctx.getPathToAliases();
    HashMap aliasToKnownSize = ctx.getAliasToKnownSize();
    String bigTableAlias = this.resolveMapJoinTask(pathToAliases, ctx
        .getAliasToTask(), aliasToKnownSize, ctx.getHdfsTmpDir(), ctx
        .getLocalTmpDir(), conf);

    if (bigTableAlias == null) {
      // run common join task
      resTsks.add(ctx.getCommonJoinTask());
    } else {
      // run the map join task
      Task task = ctx.getAliasToTask().get(bigTableAlias);
      //set task tag
      if(task.getTaskTag() == Task.CONVERTED_LOCAL_MAPJOIN) {
        task.getBackupTask().setTaskTag(Task.BACKUP_COMMON_JOIN);
      }
      resTsks.add(task);

    }

    return resTsks;
  }

  /**
   * Pairs a join alias with its accumulated input size. The natural ordering
   * is ascending by size.
   */
  class AliasFileSizePair implements Comparable<AliasFileSizePair> {
    String alias;
    long size;

    AliasFileSizePair(String alias, long size) {
      super();
      this.alias = alias;
      this.size = size;
    }

    /**
     * Orders pairs by size.
     *
     * @param o the pair to compare with; a null argument sorts before this pair
     * @return a negative value, zero or a positive value when this size is
     *         smaller than, equal to or larger than the other size
     */
    @Override
    public int compareTo(AliasFileSizePair o) {
      if (o == null) {
        return 1;
      }
      // Compare the longs directly: casting the difference to int drops the
      // high bits (sizes 2^32 apart would compare as equal) and the
      // subtraction itself can overflow.
      if (size < o.size) {
        return -1;
      }
      return size > o.size ? 1 : 0;
    }
  }

  private String resolveMapJoinTask(
      HashMap> pathToAliases,
      HashMap> aliasToTask,
      HashMap aliasToKnownSize, String hdfsTmpDir,
      String localTmpDir, HiveConf conf) {

    String bigTableFileAlias = null;
    long smallTablesFileSizeSum = 0;

    Map aliasToFileSizeMap = new HashMap();
    for (Map.Entry entry : aliasToKnownSize.entrySet()) {
      String alias = entry.getKey();
      AliasFileSizePair pair = new AliasFileSizePair(alias, entry.getValue());
      aliasToFileSizeMap.put(alias, pair);
    }

    try {
      // need to compute the input size at runtime, and select the biggest as
      // the big table.
      for (Map.Entry> oneEntry : pathToAliases
          .entrySet()) {
        String p = oneEntry.getKey();
        // this path is intermediate data
        if (p.startsWith(hdfsTmpDir) || p.startsWith(localTmpDir)) {
          ArrayList aliasArray = oneEntry.getValue();
          if (aliasArray.size() <= 0) {
            continue;
          }
          Path path = new Path(p);
          FileSystem fs = path.getFileSystem(conf);
          long fileSize = fs.getContentSummary(path).getLength();
          for (String alias : aliasArray) {
            AliasFileSizePair pair = aliasToFileSizeMap.get(alias);
            if (pair == null) {
              pair = new AliasFileSizePair(alias, 0);
              aliasToFileSizeMap.put(alias, pair);
            }
            pair.size += fileSize;
          }
        }
      }
      // generate file size to alias mapping; but not set file size as key,
      // because different file may have the same file size.

      List aliasFileSizeList = new ArrayList(
          aliasToFileSizeMap.values());

      Collections.sort(aliasFileSizeList);
      // iterating through this list from the end to beginning, trying to find
      // the big table for mapjoin
      int idx = aliasFileSizeList.size() - 1;
      boolean bigAliasFound = false;
      while (idx >= 0) {
        AliasFileSizePair pair = aliasFileSizeList.get(idx);
        String alias = pair.alias;
        long size = pair.size;
        idx--;
        if (!bigAliasFound && aliasToTask.get(alias) != null) {
          // got the big table
          bigAliasFound = true;
          bigTableFileAlias = alias;
          continue;
        }
        smallTablesFileSizeSum += size;
      }

      // compare with threshold
      long threshold = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
      if (smallTablesFileSizeSum <= threshold) {
        return bigTableFileAlias;
      } else {
        return null;
      }
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy