All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.gobblin.hive.HiveSerDeWrapper Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.hive;

import java.io.IOException;

import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.mapred.TextInputFormat;

import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import org.apache.gobblin.annotation.Alpha;
import org.apache.gobblin.configuration.State;


/**
 * A wrapper around {@link SerDe} that bundles input format, output format and file extension with a {@link SerDe},
 * and provides additional functionalities.
 *
 * @author Ziyang Liu
 */
@Alpha
@SuppressWarnings("deprecation")
public class HiveSerDeWrapper {

  private static final String SERDE_SERIALIZER_PREFIX = "serde.serializer.";
  private static final String SERDE_DESERIALIZER_PREFIX = "serde.deserializer.";

  public static final String SERDE_SERIALIZER_TYPE = SERDE_SERIALIZER_PREFIX + "type";
  public static final String SERDE_SERIALIZER_INPUT_FORMAT_TYPE = SERDE_SERIALIZER_PREFIX + "input.format.type";
  public static final String SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE = SERDE_SERIALIZER_PREFIX + "output.format.type";

  public static final String SERDE_DESERIALIZER_TYPE = SERDE_DESERIALIZER_PREFIX + "type";
  public static final String SERDE_DESERIALIZER_INPUT_FORMAT_TYPE = SERDE_DESERIALIZER_PREFIX + "input.format.type";
  public static final String SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE = SERDE_DESERIALIZER_PREFIX + "output.format.type";

  public enum BuiltInHiveSerDe {

    AVRO(AvroSerDe.class.getName(), AvroContainerInputFormat.class.getName(),
        AvroContainerOutputFormat.class.getName()),
    ORC(OrcSerde.class.getName(), OrcInputFormat.class.getName(), OrcOutputFormat.class.getName()),
    PARQUET(ParquetHiveSerDe.class.getName(), MapredParquetInputFormat.class.getName(),
        MapredParquetOutputFormat.class.getName()),
    TEXTFILE(LazySimpleSerDe.class.getName(), TextInputFormat.class.getName(),
        HiveIgnoreKeyTextOutputFormat.class.getName());

    private final String serDeClassName;
    private final String inputFormatClassName;
    private final String outputFormatClassName;

    private BuiltInHiveSerDe(String serDeClassName, String inputFormatClassName, String outputFormatClassName) {
      this.serDeClassName = serDeClassName;
      this.inputFormatClassName = inputFormatClassName;
      this.outputFormatClassName = outputFormatClassName;
    }

    @Override
    public String toString() {
      return this.serDeClassName;
    }
  }

  private Optional serDe = Optional.absent();
  private final String serDeClassName;
  private final String inputFormatClassName;
  private final String outputFormatClassName;

  private HiveSerDeWrapper(BuiltInHiveSerDe hiveSerDe) {
    this(hiveSerDe.serDeClassName, hiveSerDe.inputFormatClassName, hiveSerDe.outputFormatClassName);
  }

  private HiveSerDeWrapper(String serDeClassName, String inputFormatClassName, String outputFormatClassName) {
    this.serDeClassName = serDeClassName;
    this.inputFormatClassName = inputFormatClassName;
    this.outputFormatClassName = outputFormatClassName;
  }

  /**
   * Get the {@link SerDe} instance associated with this {@link HiveSerDeWrapper}.
   * This method performs lazy initialization.
   */
  public SerDe getSerDe() throws IOException {
    if (!this.serDe.isPresent()) {
      try {
        this.serDe = Optional.of(SerDe.class.cast(Class.forName(this.serDeClassName).newInstance()));
      } catch (Throwable t) {
        throw new IOException("Failed to instantiate SerDe " + this.serDeClassName, t);
      }
    }
    return this.serDe.get();
  }

  /**
   * Get the input format class name associated with this {@link HiveSerDeWrapper}.
   */
  public String getInputFormatClassName() {
    return this.inputFormatClassName;
  }

  /**
   * Get the output format class name associated with this {@link HiveSerDeWrapper}.
   */
  public String getOutputFormatClassName() {
    return this.outputFormatClassName;
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper}.
   *
   * @param serDeType The SerDe type. This should be one of the available {@link HiveSerDeWrapper.BuiltInHiveSerDe}s.
   */
  public static HiveSerDeWrapper get(String serDeType) {
    return get(serDeType, Optional. absent(), Optional. absent());
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper}.
   *
   * @param serDeType The SerDe type. If serDeType is one of the available {@link HiveSerDeWrapper.BuiltInHiveSerDe},
   * the other three parameters are not used. Otherwise, serDeType should be the class name of a {@link SerDe},
   * and the other three parameters must be present.
   */
  public static HiveSerDeWrapper get(String serDeType, Optional inputFormatClassName,
      Optional outputFormatClassName) {
    Optional hiveSerDe = Enums.getIfPresent(BuiltInHiveSerDe.class, serDeType.toUpperCase());
    if (hiveSerDe.isPresent()) {
      return new HiveSerDeWrapper(hiveSerDe.get());
    }
    Preconditions.checkArgument(inputFormatClassName.isPresent(),
        "Missing input format class name for SerDe " + serDeType);
    Preconditions.checkArgument(outputFormatClassName.isPresent(),
        "Missing output format class name for SerDe " + serDeType);
    return new HiveSerDeWrapper(serDeType, inputFormatClassName.get(), outputFormatClassName.get());
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper} from a {@link State}.
   *
   * @param state The state should contain property {@link #SERDE_SERIALIZER_TYPE}, and optionally contain properties
   * {@link #SERDE_SERIALIZER_INPUT_FORMAT_TYPE}, {@link #SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE} and
   */
  public static HiveSerDeWrapper getSerializer(State state) {
    Preconditions.checkArgument(state.contains(SERDE_SERIALIZER_TYPE),
        "Missing required property " + SERDE_SERIALIZER_TYPE);
    return get(state.getProp(SERDE_SERIALIZER_TYPE),
        Optional.fromNullable(state.getProp(SERDE_SERIALIZER_INPUT_FORMAT_TYPE)),
        Optional.fromNullable(state.getProp(SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE)));
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper} from a {@link State}.
   *
   * @param state The state should contain property {@link #SERDE_DESERIALIZER_TYPE}, and optionally contain properties
   * {@link #SERDE_DESERIALIZER_INPUT_FORMAT_TYPE}, {@link #SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE} and
   */
  public static HiveSerDeWrapper getDeserializer(State state) {
    Preconditions.checkArgument(state.contains(SERDE_DESERIALIZER_TYPE),
        "Missing required property " + SERDE_DESERIALIZER_TYPE);
    return get(state.getProp(SERDE_DESERIALIZER_TYPE),
        Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_INPUT_FORMAT_TYPE)),
        Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE)));
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy