
org.apache.gobblin.compliance.HivePartitionFinder (Maven / Gradle / Ivy)

A distributed data integration framework for streaming and batch data ecosystems.

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.compliance;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.thrift.TException;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder;
import org.apache.gobblin.dataset.DatasetsFinder;
import org.apache.gobblin.hive.HiveMetastoreClientPool;
import org.apache.gobblin.util.AutoReturnableObject;
import org.apache.gobblin.util.WriterUtils;
import org.apache.gobblin.util.reflection.GobblinConstructorUtils;


/**
 * A finder class to find {@link HivePartitionDataset}s.
 *
 * @author adsharma
 */
@Slf4j
public class HivePartitionFinder implements DatasetsFinder<HivePartitionDataset> {
  protected List<HiveDataset> hiveDatasets;
  protected State state;
  private static final Splitter AT_SPLITTER = Splitter.on("@").omitEmptyStrings().trimResults();
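  // Shared metastore client pool, lazily initialized in findDataset() under the lock below.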
  private static Optional<HiveMetastoreClientPool> pool = Optional.absent();
  private static final Object lock = new Object();

  public HivePartitionFinder(State state)
      throws IOException {
    this.state = new State(state);
    this.hiveDatasets = getHiveDatasets(WriterUtils.getWriterFs(this.state), this.state);
  }

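  /**
   * Finds Hive datasets by translating the compliance whitelist property into the
   * {@link HiveDatasetFinder} whitelist.
   */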
  private static List<HiveDataset> getHiveDatasets(FileSystem fs, State state)
      throws IOException {
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCE_DATASET_WHITELIST),
        "Missing required property " + ComplianceConfigurationKeys.COMPLIANCE_DATASET_WHITELIST);
    Properties prop = new Properties();
    prop.setProperty(ComplianceConfigurationKeys.HIVE_DATASET_WHITELIST,
        state.getProp(ComplianceConfigurationKeys.COMPLIANCE_DATASET_WHITELIST));
    HiveDatasetFinder finder = new HiveDatasetFinder(fs, prop);
    return finder.findDatasets();
  }

  /**
   * Finds all datasets matching the whitelist, excluding the backup, trash and staging tables.
   */
  @Override
  public List<HivePartitionDataset> findDatasets()
      throws IOException {
    List<HivePartitionDataset> list = new ArrayList<>();
    for (HiveDataset hiveDataset : this.hiveDatasets) {
      for (Partition partition : hiveDataset.getPartitionsFromDataset()) {
        list.add(new HivePartitionDataset(partition));
      }
    }
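    // Filter the discovered partitions through the configured selection policy,
    // instantiated reflectively from the configured class name below.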
    String selectionPolicyString = this.state.getProp(ComplianceConfigurationKeys.DATASET_SELECTION_POLICY_CLASS,
        ComplianceConfigurationKeys.DEFAULT_DATASET_SELECTION_POLICY_CLASS);
    Policy selectionPolicy =
        GobblinConstructorUtils.invokeConstructor(Policy.class, selectionPolicyString);
    return selectionPolicy.selectedList(list);
  }

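  /**
   * Finds a single {@link HivePartitionDataset} by its complete partition name.
   * The expected format, inferred from {@code AT_SPLITTER}, is dbName@tableName@partitionName.
   */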
  public static HivePartitionDataset findDataset(String completePartitionName, State prop)
      throws IOException {
    synchronized (lock) {
      List<String> partitionList = AT_SPLITTER.splitToList(completePartitionName);
      Preconditions.checkArgument(partitionList.size() == 3, "Invalid partition name");
      if (!pool.isPresent()) {
        pool = Optional.of(HiveMetastoreClientPool.get(prop.getProperties(),
            Optional.fromNullable(prop.getProperties().getProperty(HiveDatasetFinder.HIVE_METASTORE_URI_KEY))));
      }
      try (AutoReturnableObject<IMetaStoreClient> client = pool.get().getClient()) {
        Table table = new Table(client.get().getTable(partitionList.get(0), partitionList.get(1)));
        Partition partition = new Partition(table,
            client.get().getPartition(partitionList.get(0), partitionList.get(1), partitionList.get(2)));
        return new HivePartitionDataset(partition);
      } catch (TException | HiveException e) {
        throw new IOException(e);
      }
    }
  }

  @Override
  public Path commonDatasetRoot() {
    // Finding a common dataset root is not supported by this finder.
    throw new NotImplementedException();
  }
}
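
For orientation, here is a minimal usage sketch; it is not part of the source above. It assumes a reachable Hive metastore, and the whitelist value "mydb.mytable" and the partition name "mydb@mytable@datepartition=2017-01-01" are placeholders; the dbName@tableName@partitionName format is inferred from AT_SPLITTER.

import java.util.List;

import org.apache.gobblin.compliance.ComplianceConfigurationKeys;
import org.apache.gobblin.compliance.HivePartitionDataset;
import org.apache.gobblin.compliance.HivePartitionFinder;
import org.apache.gobblin.configuration.State;

public class HivePartitionFinderExample {
  public static void main(String[] args) throws Exception {
    State state = new State();
    // Required by the finder (see the Preconditions check in getHiveDatasets);
    // the value is a placeholder table pattern.
    state.setProp(ComplianceConfigurationKeys.COMPLIANCE_DATASET_WHITELIST, "mydb.mytable");

    // Discover all whitelisted partitions (requires a reachable Hive metastore).
    HivePartitionFinder finder = new HivePartitionFinder(state);
    List<HivePartitionDataset> datasets = finder.findDatasets();
    System.out.println("Found " + datasets.size() + " partitions");

    // Look up one partition by its complete name (placeholder names).
    HivePartitionDataset single =
        HivePartitionFinder.findDataset("mydb@mytable@datepartition=2017-01-01", state);
    System.out.println("Partition URN: " + single.datasetURN());
  }
}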