/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.io.Serializable;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.exec.BucketMatcher;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
/**
 * Context for bucket map join (or SMB join). This was formerly an inner
 * class of MapredLocalWork.
 */
public class BucketMapJoinContext implements Serializable {
  private static final long serialVersionUID = 1L;
  // table alias (small) --> input file name (big) --> target file names (small)
  private Map<String, Map<String, List<String>>> aliasBucketFileNameMapping;
  private String mapJoinBigTableAlias;
  private Class<? extends BucketMatcher> bucketMatcherClass;
  // summary of aliasBucketFileNameMapping for test results;
  // full paths are replaced with base file names
  private transient Map<String, Map<String, List<String>>> aliasBucketBaseFileNameMapping;
  // input file name (big) to bucket number
  private Map<String, Integer> bucketFileNameMapping;
  // partition spec string to input file names (big)
  private Map<String, List<String>> bigTablePartSpecToFileMapping;
  // inverse of bigTablePartSpecToFileMapping, populated lazily at runtime
  private transient Map<String, String> inputToPartSpecMapping;
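  // Illustrative shape of the mappings (hypothetical aliases and paths):
  //   aliasBucketFileNameMapping:
  //     "small" -> { "/wh/big/bucket_0" -> ["/wh/small/bucket_0"],
  //                  "/wh/big/bucket_1" -> ["/wh/small/bucket_1"] }
  //   bucketFileNameMapping:
  //     "/wh/big/bucket_0" -> 0, "/wh/big/bucket_1" -> 1
  //   bigTablePartSpecToFileMapping (partitioned big table only):
  //     "ds=1" -> ["/wh/big/ds=1/bucket_0", "/wh/big/ds=1/bucket_1"]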
public BucketMapJoinContext() {}
public BucketMapJoinContext(MapJoinDesc clone) {
this.mapJoinBigTableAlias = clone.getBigTableAlias();
this.aliasBucketFileNameMapping = clone.getAliasBucketFileNameMapping();
this.bucketFileNameMapping = clone.getBigTableBucketNumMapping();
this.bigTablePartSpecToFileMapping = clone.getBigTablePartSpecToFileMapping();
}
public void setMapJoinBigTableAlias(String bigTableAlias) {
this.mapJoinBigTableAlias = bigTableAlias;
}
  public void deriveBucketMapJoinMapping() {
    if (aliasBucketFileNameMapping != null) {
      aliasBucketBaseFileNameMapping =
          new LinkedHashMap<String, Map<String, List<String>>>();
      for (Map.Entry<String, Map<String, List<String>>> aliasToMappings
          : aliasBucketFileNameMapping.entrySet()) {
        String tableAlias = aliasToMappings.getKey();
        Map<String, List<String>> fullPathMappings = aliasToMappings.getValue();
        Map<String, List<String>> baseFileNameMapping =
            new LinkedHashMap<String, List<String>>();
        for (Map.Entry<String, List<String>> inputToBuckets
            : fullPathMappings.entrySet()) {
          // For a given big-table input file and its bucket file path list,
          // keep only the base file names (strip the directory part)
          // and store the new list in the new mapping.
          String inputPath = inputToBuckets.getKey();
          List<String> bucketPaths = inputToBuckets.getValue();
          List<String> bucketBaseFileNames = new ArrayList<String>(bucketPaths.size());
          for (String bucketFName : bucketPaths) {
            bucketBaseFileNames.add(getBaseFileName(bucketFName));
          }
          baseFileNameMapping.put(getBaseFileName(inputPath), bucketBaseFileNames);
        }
        aliasBucketBaseFileNameMapping.put(tableAlias, baseFileNameMapping);
      }
    }
  }
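  // Illustrative effect (hypothetical paths): deriveBucketMapJoinMapping()
  // summarizes a full-path mapping such as
  //   "small" -> { "hdfs://nn/wh/big/ds=1/bucket_0" -> ["hdfs://nn/wh/small/bucket_0"] }
  // into the base-file-name form used for test output:
  //   "small" -> { "ds=1/bucket_0" -> ["bucket_0"] }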
private static final Pattern partPattern = Pattern.compile("^[^=]+=[^=]+$");
  // extracts the partition-spec-plus-file-name suffix from a full path
private String getBaseFileName(String string) {
try {
Path path = new Path(string);
Path cursor = path.getParent();
while (partPattern.matcher(cursor.getName()).matches()) {
cursor = cursor.getParent();
}
return cursor.toUri().relativize(path.toUri()).getPath();
    } catch (Exception ex) {
      // This could be due to either a URI syntax error or a Path constructor
      // illegal argument; we don't really care which one it is.
      return string;
    }
}
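  // Illustrative examples (hypothetical paths):
  //   getBaseFileName("hdfs://nn/wh/t/ds=2008-04-08/hr=11/bucket_0")
  //     returns "ds=2008-04-08/hr=11/bucket_0"  (partition dirs are kept)
  //   getBaseFileName("hdfs://nn/wh/t/bucket_0")
  //     returns "bucket_0"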
public String getMapJoinBigTableAlias() {
return mapJoinBigTableAlias;
}
  public Class<? extends BucketMatcher> getBucketMatcherClass() {
    return bucketMatcherClass;
  }
  public void setBucketMatcherClass(
      Class<? extends BucketMatcher> bucketMatcherClass) {
    this.bucketMatcherClass = bucketMatcherClass;
  }
@Explain(displayName = "Alias Bucket File Name Mapping", explainLevels = { Level.EXTENDED })
  public Map<String, Map<String, List<String>>> getAliasBucketFileNameMapping() {
return aliasBucketFileNameMapping;
}
  public void setAliasBucketFileNameMapping(
      Map<String, Map<String, List<String>>> aliasBucketFileNameMapping) {
this.aliasBucketFileNameMapping = aliasBucketFileNameMapping;
}
@Override
public String toString() {
if (aliasBucketFileNameMapping != null) {
return "Mapping:" + aliasBucketFileNameMapping.toString();
} else {
return "";
}
}
@Explain(displayName = "Alias Bucket Base File Name Mapping", explainLevels = { Level.EXTENDED })
  public Map<String, Map<String, List<String>>> getAliasBucketBaseFileNameMapping() {
return aliasBucketBaseFileNameMapping;
}
  public void setAliasBucketBaseFileNameMapping(
      Map<String, Map<String, List<String>>> aliasBucketBaseFileNameMapping) {
this.aliasBucketBaseFileNameMapping = aliasBucketBaseFileNameMapping;
}
@Explain(displayName = "Alias Bucket Output File Name Mapping", explainLevels = { Level.EXTENDED })
  public Map<String, Integer> getBucketFileNameMapping() {
return bucketFileNameMapping;
}
  public void setBucketFileNameMapping(Map<String, Integer> bucketFileNameMapping) {
this.bucketFileNameMapping = bucketFileNameMapping;
}
  public Map<String, List<String>> getBigTablePartSpecToFileMapping() {
return bigTablePartSpecToFileMapping;
}
  public void setBigTablePartSpecToFileMapping(
      Map<String, List<String>> bigTablePartSpecToFileMapping) {
this.bigTablePartSpecToFileMapping = bigTablePartSpecToFileMapping;
}
  /**
   * Given a small table input file, finds the big table input file
   * it maps to, choosing the one with the smallest bucket number.
   */
public String getMappingBigFile(String alias, String smallFile) {
    HashSet<String> bigFiles = new HashSet<String>();
    Map<String, List<String>> mapping = aliasBucketFileNameMapping.get(alias);
    for (Map.Entry<String, List<String>> entry : mapping.entrySet()) {
if (entry.getValue().contains(smallFile)) {
bigFiles.add(entry.getKey());
}
}
// There could be several big table input files
// mapping to the same small input file.
// Find that one with the lowest bucket id.
int bucketId = Integer.MAX_VALUE;
String bigFile = null;
for (String f: bigFiles) {
int id = bucketFileNameMapping.get(f);
if (id < bucketId) {
bucketId = id;
bigFile = f;
}
}
return bigFile;
}
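  // Illustrative sketch (hypothetical data): if small-table file "/wh/s/b0"
  // appears in the lists of big-table files "/wh/b/f2" (bucket 2) and
  // "/wh/b/f0" (bucket 0), then getMappingBigFile("small", "/wh/s/b0")
  // returns "/wh/b/f0", the match with the lowest bucket id.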
  // Returns the fileId for SMB join, which forms part of the result file name;
  // needed to avoid file name conflicts when the big table is partitioned.
public String createFileId(String inputPath) {
String bucketNum = String.valueOf(bucketFileNameMapping.get(inputPath));
if (bigTablePartSpecToFileMapping != null) {
      // bigTablePartSpecToFileMapping is non-null only when the big table is partitioned
return prependPartSpec(inputPath, bucketNum);
}
return bucketNum;
}
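  // Illustrative sketch (hypothetical data): with
  //   bucketFileNameMapping = { "/wh/big/ds=1/bucket_0" -> 0 } and
  //   bigTablePartSpecToFileMapping = { "ds=1" -> ["/wh/big/ds=1/bucket_0"] },
  // createFileId("/wh/big/ds=1/bucket_0") returns
  //   "(" + FileUtils.escapePathName("ds=1") + ")" + "0",
  // so buckets with the same number in different partitions get distinct ids.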
  // Returns the name of the hash table file created by HASHTABLESINK and read by MAPJOIN.
public String createFileName(String inputPath, String fileName) {
if (bigTablePartSpecToFileMapping != null) {
      // bigTablePartSpecToFileMapping is non-null only when the big table is partitioned
return prependPartSpec(inputPath, fileName);
}
return fileName;
}
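  // Analogous sketch (hypothetical names): for the same partitioned setup,
  // createFileName("/wh/big/ds=1/bucket_0", "HashTable-0") returns the escaped
  // partition spec in parentheses followed by "HashTable-0"; for an
  // unpartitioned big table it returns "HashTable-0" unchanged.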
// prepends partition spec of input path to candidate file name
private String prependPartSpec(String inputPath, String fileName) {
    // lazily build and cache the inverse mapping on first use
    Map<String, String> mapping = inputToPartSpecMapping == null ?
        inputToPartSpecMapping = revert(bigTablePartSpecToFileMapping) : inputToPartSpecMapping;
String partSpec = mapping.get(URI.create(inputPath).getPath());
return partSpec == null || partSpec.isEmpty() ? fileName :
"(" + FileUtils.escapePathName(partSpec) + ")" + fileName;
}
  // inverts bigTablePartSpecToFileMapping into an input-path -> partition-spec map
  private Map<String, String> revert(Map<String, List<String>> mapping) {
    Map<String, String> converted = new HashMap<String, String>();
    for (Map.Entry<String, List<String>> entry : mapping.entrySet()) {
String partSpec = entry.getKey();
for (String file : entry.getValue()) {
converted.put(URI.create(file).getPath(), partSpec);
}
}
return converted;
}
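  // Illustrative sketch (hypothetical data):
  //   revert({ "ds=1" -> ["hdfs://nn/wh/big/ds=1/bucket_0"] })
  //     returns { "/wh/big/ds=1/bucket_0" -> "ds=1" }
  // URI.create(file).getPath() drops the scheme and authority, so lookups in
  // prependPartSpec match regardless of how the input path was qualified.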
}