// org.apache.hadoop.hive.ql.plan.BucketMapJoinContext Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.io.Serializable;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.exec.BucketMatcher;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
/**
* was inner class of MapreLocalWork. context for bucket mapjoin (or smb join)
*/
public class BucketMapJoinContext implements Serializable {
private static final long serialVersionUID = 1L;
// table alias (small) --> input file name (big) --> target file names (small)
private Map>> aliasBucketFileNameMapping;
private String mapJoinBigTableAlias;
private Class extends BucketMatcher> bucketMatcherClass;
// summary of aliasBucketFileNameMapping for test result
// full paths are replaced with base filenames
private transient Map>> aliasBucketBaseFileNameMapping;
// input file name (big) to bucket number
private Map bucketFileNameMapping;
// partition spec string to input file names (big)
private Map> bigTablePartSpecToFileMapping;
// inverse of partSpecToFileMapping, populated at runtime
private transient Map inputToPartSpecMapping;
public BucketMapJoinContext() {}
public BucketMapJoinContext(MapJoinDesc clone) {
this.mapJoinBigTableAlias = clone.getBigTableAlias();
this.aliasBucketFileNameMapping = clone.getAliasBucketFileNameMapping();
this.bucketFileNameMapping = clone.getBigTableBucketNumMapping();
this.bigTablePartSpecToFileMapping = clone.getBigTablePartSpecToFileMapping();
}
public void setMapJoinBigTableAlias(String bigTableAlias) {
this.mapJoinBigTableAlias = bigTableAlias;
}
public void deriveBucketMapJoinMapping() {
if (aliasBucketFileNameMapping != null) {
aliasBucketBaseFileNameMapping = new LinkedHashMap>>();
for (Map.Entry>> aliasToMappins
: aliasBucketFileNameMapping.entrySet()) {
String tableAlias = aliasToMappins.getKey();
Map> fullPathMappings = aliasToMappins.getValue();
Map> baseFileNameMapping = new LinkedHashMap>();
for (Map.Entry> inputToBuckets : fullPathMappings.entrySet()) {
// For a given table and its bucket full file path list,
// only keep the base file name (remove file path etc).
// And put the new list into the new mapping.
String inputPath = inputToBuckets.getKey();
List bucketPaths = inputToBuckets.getValue();
List bucketBaseFileNames = new ArrayList(bucketPaths.size());
//for each bucket file, only keep its base files and store into a new list.
for (String bucketFName : bucketPaths) {
bucketBaseFileNames.add(getBaseFileName(bucketFName));
}
//put the new mapping
baseFileNameMapping.put(getBaseFileName(inputPath), bucketBaseFileNames);
}
aliasBucketBaseFileNameMapping.put(tableAlias, baseFileNameMapping);
}
}
}
private static final Pattern partPattern = Pattern.compile("^[^=]+=[^=]+$");
// extract partition spec to file name part from path
private String getBaseFileName(String string) {
try {
Path path = new Path(string);
Path cursor = path.getParent();
while (partPattern.matcher(cursor.getName()).matches()) {
cursor = cursor.getParent();
}
return cursor.toUri().relativize(path.toUri()).getPath();
} catch (Exception ex) {
// This could be due to either URI syntax error or File constructor
// illegal arg; we don't really care which one it is.
return string;
}
}
public String getMapJoinBigTableAlias() {
return mapJoinBigTableAlias;
}
public Class extends BucketMatcher> getBucketMatcherClass() {
return bucketMatcherClass;
}
public void setBucketMatcherClass(
Class extends BucketMatcher> bucketMatcherClass) {
this.bucketMatcherClass = bucketMatcherClass;
}
@Explain(displayName = "Alias Bucket File Name Mapping", explainLevels = { Level.EXTENDED })
public Map>> getAliasBucketFileNameMapping() {
return aliasBucketFileNameMapping;
}
public void setAliasBucketFileNameMapping(
Map>> aliasBucketFileNameMapping) {
this.aliasBucketFileNameMapping = aliasBucketFileNameMapping;
}
@Override
public String toString() {
if (aliasBucketFileNameMapping != null) {
return "Mapping:" + aliasBucketFileNameMapping.toString();
} else {
return "";
}
}
@Explain(displayName = "Alias Bucket Base File Name Mapping", explainLevels = { Level.EXTENDED })
public Map>> getAliasBucketBaseFileNameMapping() {
return aliasBucketBaseFileNameMapping;
}
public void setAliasBucketBaseFileNameMapping(
Map>> aliasBucketBaseFileNameMapping) {
this.aliasBucketBaseFileNameMapping = aliasBucketBaseFileNameMapping;
}
@Explain(displayName = "Alias Bucket Output File Name Mapping", explainLevels = { Level.EXTENDED })
public Map getBucketFileNameMapping() {
return bucketFileNameMapping;
}
public void setBucketFileNameMapping(Map bucketFileNameMapping) {
this.bucketFileNameMapping = bucketFileNameMapping;
}
public Map> getBigTablePartSpecToFileMapping() {
return bigTablePartSpecToFileMapping;
}
public void setBigTablePartSpecToFileMapping(
Map> bigTablePartSpecToFileMapping) {
this.bigTablePartSpecToFileMapping = bigTablePartSpecToFileMapping;
}
/**
* Given a small table input file, find the mapping
* big table input file with the smallest bucket number.
*/
public String getMappingBigFile(String alias, String smallFile) {
HashSet bigFiles = new HashSet();
Map> mapping = aliasBucketFileNameMapping.get(alias);
for (Map.Entry> entry: mapping.entrySet()) {
if (entry.getValue().contains(smallFile)) {
bigFiles.add(entry.getKey());
}
}
// There could be several big table input files
// mapping to the same small input file.
// Find that one with the lowest bucket id.
int bucketId = Integer.MAX_VALUE;
String bigFile = null;
for (String f: bigFiles) {
int id = bucketFileNameMapping.get(f);
if (id < bucketId) {
bucketId = id;
bigFile = f;
}
}
return bigFile;
}
// returns fileId for SMBJoin, which consists part of result file name
// needed to avoid file name conflict when big table is partitioned
public String createFileId(String inputPath) {
String bucketNum = String.valueOf(bucketFileNameMapping.get(inputPath));
if (bigTablePartSpecToFileMapping != null) {
// partSpecToFileMapping is null if big table is partitioned
return prependPartSpec(inputPath, bucketNum);
}
return bucketNum;
}
// returns name of hashfile made by HASHTABLESINK which is read by MAPJOIN
public String createFileName(String inputPath, String fileName) {
if (bigTablePartSpecToFileMapping != null) {
// partSpecToFileMapping is null if big table is partitioned
return prependPartSpec(inputPath, fileName);
}
return fileName;
}
// prepends partition spec of input path to candidate file name
private String prependPartSpec(String inputPath, String fileName) {
Map mapping = inputToPartSpecMapping == null ?
inputToPartSpecMapping = revert(bigTablePartSpecToFileMapping) : inputToPartSpecMapping;
String partSpec = mapping.get(URI.create(inputPath).getPath());
return partSpec == null || partSpec.isEmpty() ? fileName :
"(" + FileUtils.escapePathName(partSpec) + ")" + fileName;
}
// revert partSpecToFileMapping to inputToPartSpecMapping
private Map revert(Map> mapping) {
Map converted = new HashMap();
for (Map.Entry> entry : mapping.entrySet()) {
String partSpec = entry.getKey();
for (String file : entry.getValue()) {
converted.put(URI.create(file).getPath(), partSpec);
}
}
return converted;
}
}