// gobblin.hive.HiveRegistrationUnit
/*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package gobblin.hive;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * A class that represents a Hive table or partition.
 *
 * @author Ziyang Liu
 */
@Getter
@Alpha
public class HiveRegistrationUnit {
  protected final String dbName;
  protected final String tableName;
  protected final List columns = Lists.newArrayList();
  protected final State props = new State();
  protected final State storageProps = new State();
  protected final State serDeProps = new State();
  protected final Optional serDeManager;
  /**
   * Table or Partition properties
   */
  protected Optional createTime;
  protected Optional lastAccessTime;
  /**
   * Storage properties
   */
  protected Optional location;
  protected Optional inputFormat;
  protected Optional outputFormat;
  protected Optional isCompressed;
  protected Optional numBuckets;
  protected Optional> bucketColumns;
  protected Optional isStoredAsSubDirs;
  /**
   * SerDe properties
   */
  protected Optional serDeType;
  HiveRegistrationUnit(Builder> builder) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(builder.dbName));
    Preconditions.checkArgument(!Strings.isNullOrEmpty(builder.tableName));
    this.dbName = builder.dbName;
    this.tableName = builder.tableName;
    this.columns.addAll(builder.columns);
    this.props.addAll(builder.props);
    this.storageProps.addAll(builder.storageProps);
    this.serDeProps.addAll(builder.serDeProps);
    this.serDeManager = builder.serDeManager;
    populateTablePartitionFields(this.props);
    populateStorageFields(this.storageProps);
    populateSerDeFields(this.serDeProps);
  }
  @SuppressWarnings("serial")
  protected void populateTablePartitionFields(State state) {
    this.createTime = populateField(state, HiveConstants.CREATE_TIME, new TypeToken() {});
    this.lastAccessTime = populateField(state, HiveConstants.LAST_ACCESS_TIME, new TypeToken() {});
  }
  @SuppressWarnings({ "serial" })
  protected void populateStorageFields(State state) {
    this.location = populateField(state, HiveConstants.LOCATION, new TypeToken() {});
    this.inputFormat = populateField(state, HiveConstants.INPUT_FORMAT, new TypeToken() {});
    this.outputFormat = populateField(state, HiveConstants.OUTPUT_FORMAT, new TypeToken() {});
    this.isCompressed = populateField(state, HiveConstants.COMPRESSED, new TypeToken() {});
    this.numBuckets = populateField(state, HiveConstants.NUM_BUCKETS, new TypeToken() {});
    this.bucketColumns = populateField(state, HiveConstants.BUCKET_COLUMNS, new TypeToken>() {});
    this.isStoredAsSubDirs = populateField(state, HiveConstants.STORED_AS_SUB_DIRS, new TypeToken() {});
  }
  @SuppressWarnings("serial")
  protected void populateSerDeFields(State state) {
    this.serDeType = populateField(state, HiveConstants.SERDE_TYPE, new TypeToken() {});
  }
  @SuppressWarnings({ "serial", "unchecked" })
  protected static  Optional populateField(State state, String key, TypeToken token) {
    if (state.contains(key)) {
      Optional fieldValue;
      if (new TypeToken() {}.isAssignableFrom(token)) {
        fieldValue = (Optional) Optional.of(state.getPropAsBoolean(key));
      } else if (new TypeToken() {}.isAssignableFrom(token)) {
        fieldValue = (Optional) Optional.of(state.getPropAsInt(key));
      } else if (new TypeToken() {}.isAssignableFrom(token)) {
        fieldValue = (Optional) Optional.of(state.getPropAsLong(key));
      } else if (new TypeToken>() {}.isAssignableFrom(token)) {
        fieldValue = (Optional) Optional.of(state.getPropAsList(key));
      } else {
        fieldValue = (Optional) Optional.of(state.getProp(key));
      }
      state.removeProp(key);
      return fieldValue;
    }
    return Optional. absent();
  }
  /**
   * Set the columns for a table or partition.
   *
   * 
   *   Columns does not need to be set for a table if the table's serde already provides the schema,
   *   such as Avro tables. Columns does not need to be set for a partition if they are the same as
   *   the table's columns.
   * 
   * @param columns
   */
  public void setColumns(List columns) {
    this.columns.clear();
    this.columns.addAll(columns);
  }
  /**
   * Set a table/partition parameter.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   *   {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setProp(String key, Object value) {
    this.props.setProp(key, value);
    updateTablePartitionFields(this.props, key, value);
  }
  /**
   * Set a storage parameter for a table/partition.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   *   {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setStorageProp(String key, Object value) {
    this.storageProps.setProp(key, value);
    updateStorageFields(this.storageProps, key, value);
  }
  /**
   * Set a serde parameter for a table/partition.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   *   {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setSerDeProp(String key, Object value) {
    this.serDeProps.setProp(key, value);
    updateSerDeFields(this.serDeProps, key, value);
  }
  /**
   * Set table/partition parameters.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProps(State)}, {@link #setStorageProps(State)} and
   *   {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setProps(State props) {
    for (String propKey : props.getPropertyNames()) {
      setProp(propKey, props.getProp(propKey));
    }
  }
  /**
   * Set storage parameters for a table/partition.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProps(State)}, {@link #setStorageProps(State)} and
   *   {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setStorageProps(State storageProps) {
    for (String propKey : storageProps.getPropertyNames()) {
      setStorageProp(propKey, storageProps.getProp(propKey));
    }
  }
  /**
   * Set serde parameters for a table/partition.
   *
   * 
   *   When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally use
   *   {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   *   which distinguishes between table/partition parameters, storage descriptor parameters, and serde parameters,
   *   one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   *   {@link #setProps(State)}, {@link #setStorageProps(State)} and
   *   {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   *   distinguished since all parameters will be passed via TBLPROPERTIES.
   * 
   */
  public void setSerDeProps(State serdeProps) {
    for (String propKey : serdeProps.getPropertyNames()) {
      setSerDeProp(propKey, serdeProps.getProp(propKey));
    }
  }
  protected void updateTablePartitionFields(State state, String key, Object value) {
    boolean isExistingField = true;
    switch (key) {
      case HiveConstants.CREATE_TIME:
        this.createTime = Optional.of((Long) value);
        break;
      case HiveConstants.LAST_ACCESS_TIME:
        this.createTime = Optional.of((Long) value);
        break;
      default:
        isExistingField = false;
    }
    if (isExistingField) {
      state.removeProp(key);
    }
  }
  protected void updateStorageFields(State state, String key, Object value) {
    boolean isExistingField = true;
    switch (key) {
      case HiveConstants.LOCATION:
        this.location = Optional.of((String) value);
        break;
      case HiveConstants.INPUT_FORMAT:
        this.inputFormat = Optional.of((String) value);
        break;
      case HiveConstants.OUTPUT_FORMAT:
        this.outputFormat = Optional.of((String) value);
        break;
      case HiveConstants.COMPRESSED:
        this.isCompressed = Optional.of((Boolean) value);
        break;
      case HiveConstants.NUM_BUCKETS:
        this.numBuckets = Optional.of((Integer) value);
        break;
      case HiveConstants.BUCKET_COLUMNS:
        this.bucketColumns = Optional.of(Splitter.on(',').omitEmptyStrings().trimResults().splitToList((String) value));
        break;
      case HiveConstants.STORED_AS_SUB_DIRS:
        this.isStoredAsSubDirs = Optional.of((Boolean) value);
        break;
      default:
        isExistingField = false;
    }
    if (isExistingField) {
      state.removeProp(key);
    }
  }
  protected void updateSerDeFields(State state, String key, Object value) {
    boolean isExistingField = true;
    switch (key) {
      case HiveConstants.SERDE_TYPE:
        this.serDeType = Optional.of((String) value);
        break;
      default:
        isExistingField = false;
    }
    if (isExistingField) {
      state.removeProp(key);
    }
  }
  /**
   * Set serde properties for a table/partition using the table/partition's {@link HiveSerDeManager}.
   *
   * 
   *   Requires that the {@link HiveSerDeManager} of the table/partition must be specified in
   *   {@link Builder#withSerdeManaager(HiveSerDeManager)}, and the table/partition's location must be specified
   *   either in {@link #setLocation(String)} or via {@link HiveConstants#LOCATION}.
   * 
   */
  public void setSerDeProps(Path path) throws IOException {
    this.serDeManager.get().addSerDeProperties(path, this);
  }
  /**
   * Set serde properties for a table/partition using another table/partition's serde properties.
   *
   * 
   *   A benefit of doing this is to avoid obtaining the schema multiple times when creating a table and a partition
   *   with the same schema, or creating several tables and partitions with the same schema. After the first
   *   table/partition is created, one can use the same SerDe properties to create the other tables/partitions.
   * 
   */
  public void setSerDeProps(HiveRegistrationUnit other) throws IOException {
    this.serDeManager.get().addSerDeProperties(other, this);
  }
  public void setCreateTime(long createTime) {
    this.createTime = Optional.of(createTime);
  }
  public void setLastAccessTime(long lastAccessTime) {
    this.lastAccessTime = Optional.of(lastAccessTime);
  }
  public void setLocation(String location) {
    this.location = Optional.of(location);
  }
  public void setInputFormat(String inputFormat) {
    this.inputFormat = Optional.of(inputFormat);
  }
  public void setOutputFormat(String outputFormat) {
    this.outputFormat = Optional.of(outputFormat);
  }
  public void setCompressed(boolean isCompressed) {
    this.isCompressed = Optional.of(isCompressed);
  }
  public void setNumBuckets(int numBuckets) {
    this.numBuckets = Optional.of(numBuckets);
  }
  public void setBucketColumns(List bucketColumns) {
    this.bucketColumns = Optional.> of(ImmutableList. copyOf(bucketColumns));
  }
  public void setStoredAsSubDirs(boolean isStoredAsSubDirs) {
    this.isStoredAsSubDirs = Optional.of(isStoredAsSubDirs);
  }
  public void setSerDeType(String serDeType) {
    this.serDeType = Optional.of(serDeType);
  }
  static abstract class Builder> {
    private String dbName;
    private String tableName;
    private List columns = Lists.newArrayList();
    private State props = new State();
    private State storageProps = new State();
    private State serDeProps = new State();
    private Optional serDeManager = Optional.absent();
    @SuppressWarnings("unchecked")
    public T withDbName(String dbName) {
      this.dbName = dbName;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withTableName(String tableName) {
      this.tableName = tableName;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withColumns(List columns) {
      this.columns = columns;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withProps(State props) {
      this.props = props;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withStorageProps(State storageProps) {
      this.storageProps = storageProps;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withSerdeProps(State serDeProps) {
      this.serDeProps = serDeProps;
      return (T) this;
    }
    @SuppressWarnings("unchecked")
    public T withSerdeManaager(HiveSerDeManager serDeManager) {
      this.serDeManager = Optional.of(serDeManager);
      return (T) this;
    }
    public abstract HiveRegistrationUnit build();
  }
  @AllArgsConstructor
  @Getter
  public static class Column {
    private final String name;
    private final String type;
    private final String comment;
  }
}