org.apache.hadoop.hive.ql.exec.SkewJoinHandler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-exec
There is a newer version: 4.0.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * At runtime in Join, we output big keys in one table into one corresponding
 * directories, and all same keys in other tables into different dirs(one for
 * each table). The directories will look like:
 * 
 * 
 * dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys which
 * is big in T1),dir-T3-keys(containing keys which is big in T1), ...
 * 

 * dir-T1-keys(containing keys which is big in T2), dir-T2-bigkeys(containing
 * big keys in T2),dir-T3-keys(containing keys which is big in T2), ...
 * 

 * dir-T1-keys(containing keys which is big in T3), dir-T2-keys(containing big
 * keys in T3),dir-T3-bigkeys(containing keys which is big in T3), ... .....
 * 
 *
 * 
 * For each skew key, we first write all values to a local tmp file. At the time
 * of ending the current group, the local tmp file will be uploaded to hdfs.
 * Right now, we use one file per skew key.
 *
 * 

 * For more info, please see https://issues.apache.org/jira/browse/HIVE-964.
 *
 */
public class SkewJoinHandler {

  protected static final Logger LOG = LoggerFactory.getLogger(SkewJoinHandler.class
      .getName());

  public int currBigKeyTag = -1;

  private int rowNumber = 0;
  private int currTag = -1;

  private int skewKeyDefinition = -1;
  private Map skewKeysTableObjectInspector = null;
  private Map tblSerializers = null;
  private Map tblDesc = null;

  private Map bigKeysExistingMap = null;

  private LongWritable skewjoinFollowupJobs;

  private final boolean noOuterJoin;
  Configuration hconf = null;
  List