hivemall.hcatalog.mapreduce.HCatBaseOutputFormat

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hive.hcatalog.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.hcatalog.common.ErrorType;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public abstract class HCatBaseOutputFormat extends OutputFormat<WritableComparable<?>, HCatRecord> {

//  static final private Log LOG = LogFactory.getLog(HCatBaseOutputFormat.class);

  /**
   * @see org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat#getTableSchema(org.apache.hadoop.conf.Configuration)
   * @deprecated Use {@link #getTableSchema(org.apache.hadoop.conf.Configuration)}
   */
  public static HCatSchema getTableSchema(JobContext context) throws IOException {
    return getTableSchema(context.getConfiguration());
  }

  /**
   * Gets the table schema for the table specified in the HCatOutputFormat.setOutput
   * call for the passed configuration.
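   *
   * A minimal usage sketch (the job name and database/table names are
   * illustrative, not part of this class):
   * <pre>{@code
   * Job job = Job.getInstance(new Configuration(), "example");
   * HCatOutputFormat.setOutput(job,
   *     OutputJobInfo.create("mydb", "mytable", null));
   * HCatSchema schema = HCatBaseOutputFormat.getTableSchema(job.getConfiguration());
   * }</pre>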
   * @param conf the Configuration object
   * @return the table schema
   * @throws IOException if HCatOutputFormat.setOutput has not been called for the passed configuration
   */
  public static HCatSchema getTableSchema(Configuration conf) throws IOException {
    OutputJobInfo jobInfo = getJobInfo(conf);
    return jobInfo.getTableInfo().getDataColumns();
  }

  /**
   * Check for validity of the output-specification for the job.
   * @param context information about the job
   * @throws IOException when output should not be attempted
   */
  @Override
  public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    getOutputFormat(context).checkOutputSpecs(context);
  }

  /**
   * Gets the output format instance.
   * @param context the job context
   * @return the output format instance
   * @throws IOException
   */
  protected OutputFormat<WritableComparable<?>, HCatRecord> getOutputFormat(JobContext context)
    throws IOException {
    OutputJobInfo jobInfo = getJobInfo(context);
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), 
        jobInfo.getTableInfo().getStorerInfo());
    // Always configure storage handler with jobproperties/jobconf before calling any methods on it
    configureOutputStorageHandler(context);
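    // FosterStorageHandler stands in for tables that have no real storage
    // handler (plain file-backed input/output formats), so those are wrapped in
    // the file-based container; genuine storage handlers get the default container.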
    if (storageHandler instanceof FosterStorageHandler) {
      return new FileOutputFormatContainer(ReflectionUtils.newInstance(
          storageHandler.getOutputFormatClass(), context.getConfiguration()));
    } else {
      return new DefaultOutputFormatContainer(ReflectionUtils.newInstance(
          storageHandler.getOutputFormatClass(), context.getConfiguration()));
    }
  }

  /**
   * @see org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat#getJobInfo(org.apache.hadoop.conf.Configuration)
   * @deprecated use {@link #getJobInfo(org.apache.hadoop.conf.Configuration)}
   */
  public static OutputJobInfo getJobInfo(JobContext jobContext) throws IOException {
    return getJobInfo(jobContext.getConfiguration());
  }

  /**
   * Gets the OutputJobInfo object by reading the Configuration and deserializing
   * the string. If the OutputJobInfo is not present in the configuration, throws an
   * exception, since that means HCatOutputFormat.setOutput has not been called.
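   *
   * For illustration, calling this before HCatOutputFormat.setOutput fails:
   * <pre>{@code
   * Configuration conf = new Configuration();
   * // throws HCatException with ErrorType.ERROR_NOT_INITIALIZED
   * OutputJobInfo info = HCatBaseOutputFormat.getJobInfo(conf);
   * }</pre>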
   * @param conf the job Configuration object
   * @return the OutputJobInfo object
   * @throws IOException the IO exception
   */
  public static OutputJobInfo getJobInfo(Configuration conf) throws IOException {
    String jobString = conf.get(HCatConstants.HCAT_KEY_OUTPUT_INFO);
    if (jobString == null) {
      throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED);
    }

    return (OutputJobInfo) HCatUtil.deserialize(jobString);
  }

  /**
   * Configure the output storage handler
   * @param jobContext the job context
   * @throws IOException
   */
  @SuppressWarnings("unchecked")
  static void configureOutputStorageHandler(
    JobContext jobContext) throws IOException {
    configureOutputStorageHandler(jobContext, (List<String>) null);
  }

  /**
   * Configure the output storage handler, allowing specification of missing dynamic partition values.
   * @param jobContext the job context
   * @param dynamicPartVals the dynamic partition values, in the same order as the dynamic partition keys
   * @throws IOException the IO exception
   */
  @SuppressWarnings("unchecked")
  static void configureOutputStorageHandler(
    JobContext jobContext, List<String> dynamicPartVals) throws IOException {
    Configuration conf = jobContext.getConfiguration();
    try {
      OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(conf.get(HCatConstants.HCAT_KEY_OUTPUT_INFO));
      HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(jobContext.getConfiguration(),jobInfo.getTableInfo().getStorerInfo());

      Map<String, String> partitionValues = jobInfo.getPartitionValues();
      String location = jobInfo.getLocation();

      if (dynamicPartVals != null) {
        // dynamic part vals specified
        List<String> dynamicPartKeys = jobInfo.getDynamicPartitioningKeys();
        if (dynamicPartVals.size() != dynamicPartKeys.size()) {
          throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
            "Unable to configure dynamic partitioning for storage handler, mismatch between"
              + " number of partition values obtained[" + dynamicPartVals.size()
              + "] and number of partition values required[" + dynamicPartKeys.size() + "]");
        }
        for (int i = 0; i < dynamicPartKeys.size(); i++) {
          partitionValues.put(dynamicPartKeys.get(i), dynamicPartVals.get(i));
        }
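        // For illustration: with dynamicPartKeys ["ds", "region"] and
        // dynamicPartVals ["20240101", "us"] (hypothetical values), the loop
        // above extends partitionValues with {ds=20240101, region=us}.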

//            // re-home location, now that we know the rest of the partvals
//            Table table = jobInfo.getTableInfo().getTable();
//
//            List<String> partitionCols = new ArrayList<String>();
//            for(FieldSchema schema : table.getPartitionKeys()) {
//              partitionCols.add(schema.getName());
//            }
        jobInfo.setPartitionValues(partitionValues);
      }

      HCatUtil.configureOutputStorageHandler(storageHandler, conf, jobInfo);
    } catch (Exception e) {
      if (e instanceof HCatException) {
        throw (HCatException) e;
      } else {
        throw new HCatException(ErrorType.ERROR_INIT_STORAGE_HANDLER, e);
      }
    }
  }

  /**
   * Configure the output storage handler, allowing specification of a full
   * partition spec from which the dynamic partition values are picked.
   * @param context the job context
   * @param jobInfo the output job info
   * @param fullPartSpec the full partition spec, mapping partition keys to values
   * @throws IOException the IO exception
   */
  protected static void configureOutputStorageHandler(
    JobContext context, OutputJobInfo jobInfo,
    Map<String, String> fullPartSpec) throws IOException {
    List<String> dynamicPartKeys = jobInfo.getDynamicPartitioningKeys();
    if ((dynamicPartKeys == null) || (dynamicPartKeys.isEmpty())) {
      configureOutputStorageHandler(context, (List<String>) null);
    } else {
      List<String> dynKeyVals = new ArrayList<String>();
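      // For illustration: with fullPartSpec {ds=20240101, region=us} and dynamic
      // keys ["region"] (hypothetical values), only "us" ends up in dynKeyVals.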
      for (String dynamicPartKey : dynamicPartKeys) {
        dynKeyVals.add(fullPartSpec.get(dynamicPartKey));
      }
      configureOutputStorageHandler(context, dynKeyVals);
    }
  }


  protected static void setPartDetails(OutputJobInfo jobInfo, final HCatSchema schema,
                     Map<String, String> partMap) throws HCatException, IOException {
    List<Integer> posOfPartCols = new ArrayList<Integer>();
    List<Integer> posOfDynPartCols = new ArrayList<Integer>();

    // If partition columns occur in the data, we want to remove them.
    // So, find the positions of partition columns in the schema provided by the user.
    // We also need to update the output schema with these deletions.

    // Note that output storage handlers never see partition columns in the data
    // or the schema.

    HCatSchema schemaWithoutParts = new HCatSchema(schema.getFields());
    for (String partKey : partMap.keySet()) {
      Integer idx;
      if ((idx = schema.getPosition(partKey)) != null) {
        posOfPartCols.add(idx);
        schemaWithoutParts.remove(schema.get(partKey));
      }
    }
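    // For illustration: with a user schema (a, b, ds) and a static partMap
    // {ds=20240101} (hypothetical names), posOfPartCols becomes [2] and
    // schemaWithoutParts becomes (a, b).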

    // Also, if dynamic partitioning is being used, we want to set the appropriate
    // list of columns for the columns to be dynamically specified. These are
    // partition keys too, so they also need to be removed from the output schema
    // and from partcols.

    if (jobInfo.isDynamicPartitioningUsed()) {
      for (String partKey : jobInfo.getDynamicPartitioningKeys()) {
        Integer idx;
        if ((idx = schema.getPosition(partKey)) != null) {
          posOfPartCols.add(idx);
          posOfDynPartCols.add(idx);
          schemaWithoutParts.remove(schema.get(partKey));
        }
      }
    }

    HCatUtil.validatePartitionSchema(
      new Table(jobInfo.getTableInfo().getTable()), schemaWithoutParts);
    jobInfo.setPosOfPartCols(posOfPartCols);
    jobInfo.setPosOfDynPartCols(posOfDynPartCols);
    jobInfo.setOutputSchema(schemaWithoutParts);
  }
}