/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.hive.policy;

import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.TableType;

import com.codahale.metrics.Timer;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;

import org.apache.gobblin.annotation.Alpha;
import org.apache.gobblin.config.client.ConfigClient;
import org.apache.gobblin.config.client.api.VersionStabilityPolicy;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.hive.HivePartition;
import org.apache.gobblin.hive.HiveRegProps;
import org.apache.gobblin.hive.HiveRegister;
import org.apache.gobblin.hive.HiveSerDeManager;
import org.apache.gobblin.hive.HiveTable;
import org.apache.gobblin.hive.metastore.HiveMetaStoreUtils;
import org.apache.gobblin.hive.spec.HiveSpec;
import org.apache.gobblin.hive.spec.SimpleHiveSpec;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;
import org.apache.gobblin.source.extractor.extract.kafka.ConfigStoreUtils;
import org.apache.gobblin.source.extractor.extract.kafka.KafkaSource;


/**
 * A base implementation of {@link HiveRegistrationPolicy}. It obtains database name from
 * property {@link #HIVE_DATABASE_NAME} or {@link #HIVE_DATABASE_REGEX} (group 1), obtains
 * table name from property {@link #HIVE_TABLE_NAME} or {@link #HIVE_TABLE_REGEX} (group 1),
 * and builds a {@link SimpleHiveSpec}.
 *
 * @author Ziyang Liu
 */
@Alpha
public class HiveRegistrationPolicyBase implements HiveRegistrationPolicy {

  public static final String HIVE_DATABASE_NAME = "hive.database.name";
  public static final String ADDITIONAL_HIVE_DATABASE_NAMES = "additional.hive.database.names";
  public static final String HIVE_DATABASE_REGEX = "hive.database.regex";
  public static final String HIVE_DATABASE_NAME_PREFIX = "hive.database.name.prefix";
  public static final String HIVE_DATABASE_NAME_SUFFIX = "hive.database.name.suffix";
  public static final String HIVE_TABLE_NAME = "hive.table.name";
  public static final String ADDITIONAL_HIVE_TABLE_NAMES = "additional.hive.table.names";
  public static final String HIVE_TABLE_REGEX = "hive.table.regex";
  public static final String HIVE_TABLE_NAME_PREFIX = "hive.table.name.prefix";
  public static final String HIVE_TABLE_NAME_SUFFIX = "hive.table.name.suffix";
  public static final String HIVE_SANITIZE_INVALID_NAMES = "hive.sanitize.invalid.names";
  public static final String HIVE_FS_URI = "hive.registration.fs.uri";
  // {@value PRIMARY_TABLE_TOKEN} if present in {@value ADDITIONAL_HIVE_TABLE_NAMES} or dbPrefix.{@value HIVE_TABLE_NAME}
  // .. will be replaced by the table name determined via {@link #getTableName(Path)}
  public static final String PRIMARY_TABLE_TOKEN = "$PRIMARY_TABLE";
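  // Shared client for the Gobblin config store; used in getTableNames(Optional, Path) to look up
  // topic-scoped overrides for the additional Hive table names.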
  protected static final ConfigClient configClient =
      org.apache.gobblin.config.client.ConfigClient.createConfigClient(VersionStabilityPolicy.WEAK_LOCAL_STABILITY);

  /**
   * A valid db or table name should start with an alphanumeric character, and contain only
   * alphanumeric characters and '_'.
   */
  private static final Pattern VALID_DB_TABLE_NAME_PATTERN_1 = Pattern.compile("[a-z0-9][a-z0-9_]*");

  /**
   * A valid db or table name should contain at least one letter or '_' (i.e., should not be numbers only).
   */
  private static final Pattern VALID_DB_TABLE_NAME_PATTERN_2 = Pattern.compile(".*[a-z_].*");
  public static final String CONFIG_FOR_TOPIC_TIMER = "configForTopicTimer";

  protected final HiveRegProps props;
  protected final FileSystem fs;
  protected final boolean sanitizeNameAllowed;
  protected final Optional<Pattern> dbNamePattern;
  protected final Optional<Pattern> tableNamePattern;
  protected final String dbNamePrefix;
  protected final String dbNameSuffix;
  protected final String tableNamePrefix;
  protected final String tableNameSuffix;

  protected final MetricContext metricContext;

  public HiveRegistrationPolicyBase(State props) throws IOException {
    Preconditions.checkNotNull(props);
    this.props = new HiveRegProps(props);
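    // Register against the file system named by hive.registration.fs.uri when provided;
    // otherwise fall back to the default file system from the Hadoop configuration.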
    if (props.contains(HiveRegistrationPolicyBase.HIVE_FS_URI)) {
      this.fs = FileSystem.get(URI.create(props.getProp(HiveRegistrationPolicyBase.HIVE_FS_URI)), new Configuration());
    } else {
      this.fs = FileSystem.get(new Configuration());
    }
    this.sanitizeNameAllowed = props.getPropAsBoolean(HIVE_SANITIZE_INVALID_NAMES, true);
    this.dbNamePattern = props.contains(HIVE_DATABASE_REGEX)
        ? Optional.of(Pattern.compile(props.getProp(HIVE_DATABASE_REGEX))) : Optional.<Pattern> absent();
    this.tableNamePattern = props.contains(HIVE_TABLE_REGEX)
        ? Optional.of(Pattern.compile(props.getProp(HIVE_TABLE_REGEX))) : Optional.<Pattern> absent();
    this.dbNamePrefix = props.getProp(HIVE_DATABASE_NAME_PREFIX, StringUtils.EMPTY);
    this.dbNameSuffix = props.getProp(HIVE_DATABASE_NAME_SUFFIX, StringUtils.EMPTY);
    this.tableNamePrefix = props.getProp(HIVE_TABLE_NAME_PREFIX, StringUtils.EMPTY);
    this.tableNameSuffix = props.getProp(HIVE_TABLE_NAME_SUFFIX, StringUtils.EMPTY);

    this.metricContext = Instrumented.getMetricContext(props, HiveRegister.class);
  }

  /**
   * This method first tries to obtain the database name from {@link #HIVE_DATABASE_NAME}.
   * If this property is not specified, it then tries to obtain the database name using
   * the first group of {@link #HIVE_DATABASE_REGEX}.
   *
   */
  protected Optional<String> getDatabaseName(Path path) {
    if (!this.props.contains(HIVE_DATABASE_NAME) && !this.props.contains(HIVE_DATABASE_REGEX)) {
      return Optional.<String> absent();
    }

    return Optional.<String> of(
        this.dbNamePrefix + getDatabaseOrTableName(path, HIVE_DATABASE_NAME, HIVE_DATABASE_REGEX, this.dbNamePattern)
            + this.dbNameSuffix);
  }
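  // Hypothetical example (property values are illustrative, not taken from this file): with
  //   hive.database.regex = .*/databases/([^/]+)/.*
  // and hive.database.name.prefix = prod_, a path like /databases/tracking/events resolves to
  // database "prod_tracking": capture group 1 of the regex, lower-cased and validated, wrapped in
  // the configured prefix/suffix.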

  /**
   * Obtain Hive database names. The returned {@link Iterable} contains the database name returned by
   * {@link #getDatabaseName(Path)} (if present) plus additional database names specified in
   * {@link #ADDITIONAL_HIVE_DATABASE_NAMES}.
   *
   */
  protected Iterable<String> getDatabaseNames(Path path) {
    List<String> databaseNames = Lists.newArrayList();

    Optional<String> databaseName;
    if ((databaseName = getDatabaseName(path)).isPresent()) {
      databaseNames.add(databaseName.get());
    }

    if (!Strings.isNullOrEmpty(this.props.getProp(ADDITIONAL_HIVE_DATABASE_NAMES))) {
      for (String additionalDbName : this.props.getPropAsList(ADDITIONAL_HIVE_DATABASE_NAMES)) {
        databaseNames.add(this.dbNamePrefix + additionalDbName + this.dbNameSuffix);
      }
    }

    Preconditions.checkState(!databaseNames.isEmpty(), "Hive database name not specified");
    return databaseNames;
  }
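  // Hypothetical example: additional.hive.database.names=backup,staging adds databases "backup"
  // and "staging" (each wrapped in any configured database name prefix/suffix) alongside the
  // primary database name, when one is resolved.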

  /**
   * This method first tries to obtain the table name from {@link #HIVE_TABLE_NAME}.
   * If this property is not specified, it then tries to obtain the table name using
   * the first group of {@link #HIVE_TABLE_REGEX}.
   */
  protected Optional<String> getTableName(Path path) {
    if (!this.props.contains(HIVE_TABLE_NAME) && !this.props.contains(HIVE_TABLE_REGEX)) {
      return Optional.<String> absent();
    }
    return Optional.<String> of(
        this.tableNamePrefix + getDatabaseOrTableName(path, HIVE_TABLE_NAME, HIVE_TABLE_REGEX, this.tableNamePattern)
            + this.tableNameSuffix);
  }

  /***
   * Obtain Hive table names.
   *
   * The returned {@link Iterable} contains:
   *  1. Table name returned by {@link #getTableName(Path)}
   *  2. Table names specified by additional.hive.table.names
   *
   * In table names above, the {@value PRIMARY_TABLE_TOKEN} if present is also replaced by the
   * table name obtained via {@link #getTableName(Path)}.
   *
   * @param path Path for the table on filesystem.
   * @return Table names to register.
   */
  protected Iterable<String> getTableNames(Path path) {
    List<String> tableNames = getTableNames(Optional.<String> absent(), path);

    Preconditions.checkState(!tableNames.isEmpty(), "Hive table name not specified");
    return tableNames;
  }

  /***
   * Obtain Hive table names filtered by dbPrefix (if present).
   *
   * The returned {@link List} contains:
   *  A. If dbPrefix is absent:
   *    1. Table name returned by {@link #getTableName(Path)}
   *    2. Table names specified by additional.hive.table.names
   *  B. If dbPrefix is present:
   *    1. Table names specified by dbPrefix.hive.table.name
   *
   * In table names above, the {@value PRIMARY_TABLE_TOKEN} if present is also replaced by the
   * table name obtained via {@link #getTableName(Path)}.
   *
   * @param dbPrefix Prefix to the property hive.table.name, used to obtain table names only
   *                         for the specified db. Eg. If dbPrefix is db, then
   *                         db.hive.table.name is the resolved property name.
   * @param path Path for the table on filesystem.
   * @return Table names to register.
   */
  protected List<String> getTableNames(Optional<String> dbPrefix, Path path) {
    List<String> tableNames = Lists.newArrayList();

    Optional<String> primaryTableName;
    if ((primaryTableName = getTableName(path)).isPresent() && !dbPrefix.isPresent()) {
      tableNames.add(primaryTableName.get());
    }
    Optional<Config> configForTopic = Optional.absent();
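    // When a primary table name is resolved, consult the config store for topic-scoped config;
    // the lookup is timed under the "configForTopicTimer" metric. The extra table names are then
    // read from "<dbPrefix>.hive.table.name" when a database filter is given, otherwise from
    // "additional.hive.table.names".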
    if (primaryTableName.isPresent()) {
      Timer.Context context = this.metricContext.timer(CONFIG_FOR_TOPIC_TIMER).time();
      configForTopic =
          ConfigStoreUtils.getConfigForTopic(this.props.getProperties(), KafkaSource.TOPIC_NAME, this.configClient);
      context.close();
    }

    String additionalNamesProp;
    if (dbPrefix.isPresent()) {
      additionalNamesProp = String.format("%s.%s", dbPrefix.get(), HIVE_TABLE_NAME);
    } else {
      additionalNamesProp = ADDITIONAL_HIVE_TABLE_NAMES;
    }

    if (configForTopic.isPresent() && configForTopic.get().hasPath(additionalNamesProp)) {
      for (String additionalTableName : Splitter.on(",")
          .trimResults()
          .splitToList(configForTopic.get().getString(additionalNamesProp))) {
        String resolvedTableName =
            StringUtils.replace(additionalTableName, PRIMARY_TABLE_TOKEN, primaryTableName.get());
        tableNames.add(this.tableNamePrefix + resolvedTableName + this.tableNameSuffix);
      }
    } else if (!Strings.isNullOrEmpty(this.props.getProp(additionalNamesProp))) {
      for (String additionalTableName : this.props.getPropAsList(additionalNamesProp)) {
        String resolvedTableName =
            primaryTableName.isPresent() ? StringUtils.replace(additionalTableName, PRIMARY_TABLE_TOKEN,
                primaryTableName.get()) : additionalTableName;
        tableNames.add(this.tableNamePrefix + resolvedTableName + this.tableNameSuffix);
      }
    }

    return tableNames;
  }
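  // Hypothetical example (values are illustrative): with hive.table.name=events and
  //   additional.hive.table.names=$PRIMARY_TABLE_backup,$PRIMARY_TABLE_staging
  // and no table name prefix/suffix configured, the resolved names are "events", "events_backup"
  // and "events_staging", since $PRIMARY_TABLE is replaced by the primary table name.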

  protected String getDatabaseOrTableName(Path path, String nameKey, String regexKey, Optional<Pattern> pattern) {
    String name;
    if (this.props.contains(nameKey)) {
      name = this.props.getProp(nameKey);
    } else if (pattern.isPresent()) {
      Matcher matcher = pattern.get().matcher(path.toString());
      if (matcher.matches() && matcher.groupCount() >= 1) {
        name = matcher.group(1);
      } else {
        throw new IllegalStateException("No group match found for regexKey " + regexKey+" with regexp "+ pattern.get().toString() +" on path "+path);
      }
    } else {
      throw new IllegalStateException("Missing required property " + nameKey + " or " + regexKey);
    }

    return sanitizeAndValidateName(name);
  }

  protected String sanitizeAndValidateName(String name) {
    name = name.toLowerCase();

    if (this.sanitizeNameAllowed && !isNameValid(name)) {
      name = sanitizeName(name);
    }

    if (isNameValid(name)) {
      return name;
    }
    throw new IllegalStateException(name + " is not a valid Hive database or table name");
  }
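  // Hypothetical example: "My-Topic.Events" is lower-cased to "my-topic.events" and, because
  // sanitization is enabled by default, rewritten to "my_topic_events"; a purely numeric name
  // such as "2021" is still rejected because it contains no letter or '_'.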

  /**
   * A base implementation for creating {@link HiveTable}s given a {@link Path}.
   *
   *
   * This method returns a list of {@link HiveTable}s that contains one table per db name
   * (returned by {@link #getDatabaseNames(Path)}) and table name (returned by {@link #getTableNames(Path)}).
   *
   * @param path a {@link Path} used to create the {@link HiveTable}.
   * @return a list of {@link HiveTable}s for the given {@link Path}.
   * @throws IOException
   */
  protected List<HiveTable> getTables(Path path) throws IOException {
    List<HiveTable> tables = Lists.newArrayList();

    for (String databaseName : getDatabaseNames(path)) {
      // Get tables to register ONLY for this Hive database (specified via prefix filter in properties)
      boolean foundTablesViaDbFilter = false;
      for (String tableName : getTableNames(Optional.of(databaseName), path)) {
        tables.add(getTable(path, databaseName, tableName));
        foundTablesViaDbFilter = true;
      }
      // If no tables found via db filter, get tables to register in all Hive databases and add them for this database
      if (!foundTablesViaDbFilter) {
        for (String tableName : getTableNames(path)) {
          tables.add(getTable(path, databaseName, tableName));
        }
      }
    }
    return tables;
  }

  /**
   * A base implementation for creating a non bucketed, external {@link HiveTable} for a {@link Path}.
   *
   * @param path a {@link Path} used to create the {@link HiveTable}.
   * @param dbName the database name for the created {@link HiveTable}.
   * @param tableName the table name for the created {@link HiveTable}.
   * @return a {@link HiveTable} for the given {@link Path}.
   * @throws IOException
   */
  protected HiveTable getTable(Path path, String dbName, String tableName) throws IOException {
    HiveTable table = new HiveTable.Builder().withDbName(dbName).withTableName(tableName)
        .withSerdeManaager(HiveSerDeManager.get(this.props)).build();

    table.setLocation(this.fs.makeQualified(getTableLocation(path)).toString());
    table.setSerDeProps(path);

    // Setting table-level props.
    State tableProps = new State(this.props.getTablePartitionProps());
    if (this.props.getRuntimeTableProps().isPresent()) {
      tableProps.setProp(HiveMetaStoreUtils.RUNTIME_PROPS, this.props.getRuntimeTableProps().get());
    }
    table.setProps(tableProps);

    table.setStorageProps(this.props.getStorageProps());
    table.setSerDeProps(this.props.getSerdeProps());
    table.setNumBuckets(-1);
    table.setBucketColumns(Lists.<String> newArrayList());
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    return table;
  }

  protected Optional<HivePartition> getPartition(Path path, HiveTable table) throws IOException {
    return Optional.<HivePartition> absent();
  }

  protected Path getTableLocation(Path path) {
    return path;
  }

  /**
   * Determine whether a database or table name is valid.
   *
   * A name is valid if and only if: it starts with an alphanumeric character, contains only alphanumeric characters
   * and '_', and is NOT composed of numbers only.
   */
  protected static boolean isNameValid(String name) {
    Preconditions.checkNotNull(name);
    name = name.toLowerCase();
    return VALID_DB_TABLE_NAME_PATTERN_1.matcher(name).matches()
        && VALID_DB_TABLE_NAME_PATTERN_2.matcher(name).matches();
  }

  /**
   * Attempt to sanitize an invalid database or table name by replacing characters that are not alphanumeric
   * or '_' with '_'.
   */
  protected static String sanitizeName(String name) {
    return name.replaceAll("[^a-zA-Z0-9_]", "_");
  }

  @Override
  public Collection<HiveSpec> getHiveSpecs(Path path) throws IOException {
    List<HiveSpec> specs = Lists.newArrayList();
    for (HiveTable table : getTables(path)) {
      specs.add(new SimpleHiveSpec.Builder<>(path).withTable(table).withPartition(getPartition(path, table)).build());
    }
    return specs;
  }

  /**
   * Get a {@link HiveRegistrationPolicy} from a {@link State} object.
   *
   * @param props A {@link State} object that contains property, {@link #HIVE_REGISTRATION_POLICY},
   * which is the class name of the desired policy. This policy class must have a constructor that
   * takes a {@link State} object.
   */
  public static HiveRegistrationPolicy getPolicy(State props) {
    Preconditions.checkArgument(props.contains(ConfigurationKeys.HIVE_REGISTRATION_POLICY));
    String policyType = props.getProp(ConfigurationKeys.HIVE_REGISTRATION_POLICY);
    try {
      return (HiveRegistrationPolicy) ConstructorUtils.invokeConstructor(Class.forName(policyType), props);
    } catch (ReflectiveOperationException e) {
      throw new RuntimeException(
          "Unable to instantiate " + HiveRegistrationPolicy.class.getSimpleName() + " with type " + policyType, e);
    }
  }
}
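// Illustrative usage sketch (property values are hypothetical, not taken from this file):
//   State state = new State();
//   state.setProp(ConfigurationKeys.HIVE_REGISTRATION_POLICY, HiveRegistrationPolicyBase.class.getName());
//   state.setProp(HiveRegistrationPolicyBase.HIVE_DATABASE_NAME, "tracking");
//   state.setProp(HiveRegistrationPolicyBase.HIVE_TABLE_NAME, "events");
//   HiveRegistrationPolicy policy = HiveRegistrationPolicyBase.getPolicy(state);
//   Collection<HiveSpec> specs = policy.getHiveSpecs(new Path("/data/tracking/events"));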



