org.apache.gobblin.compliance.HivePartitionVersionFinder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gobblin-compliance Show documentation
A distributed data integration framework for streaming and batch data ecosystems.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.compliance;

import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.compliance.retention.ComplianceRetentionJob;
import org.apache.gobblin.compliance.retention.HivePartitionRetentionVersion;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.dataset.Dataset;
import org.apache.gobblin.util.AutoReturnableObject;


/**
 * A version finder class to find {@link HivePartitionVersion}s.
 *
 * <p>Candidate version tables are taken from {@link ComplianceRetentionJob#tableNamesList}. A table is
 * considered when its complete name contains at least one of the configured patterns; each of its
 * partitions whose name equals the dataset name (ignoring case) is returned as a
 * {@link HivePartitionRetentionVersion}.
 *
 * <p>Every call to {@link #findDatasetVersions(Dataset)} builds a fresh result list, so results of one
 * call never leak into another call on the same finder.
 *
 * @author adsharma
 */
@Slf4j
public class HivePartitionVersionFinder
    implements org.apache.gobblin.data.management.version.finder.VersionFinder<HivePartitionVersion> {
  protected final FileSystem fs;
  protected final State state;
  /** Substrings a complete table name must contain for the table to be treated as a version table. */
  protected List<String> patterns;
  /** Owner of the dataset most recently passed to {@link #findDatasetVersions(Dataset)}; not read in this class. */
  private Optional<String> owner = Optional.absent();
  /** Shared by all finder instances; the metastore scan in {@link #collectVersions(String)} runs under it. */
  private static final Object lock = new Object();
  /** Splits a complete table name of the form {@code first@second} into its two parts. */
  private static final Splitter At_SPLITTER = Splitter.on("@").omitEmptyStrings().trimResults();

  /**
   * @param fs file system kept for subclasses; not used by the lookup performed in this class
   * @param state job state; a copy is stored, so later changes to the argument are not seen by this finder
   * @param patterns substrings used to select version tables by name; must be non-empty by the time
   *        {@link #findDatasetVersions(Dataset)} is called with a {@link HivePartitionDataset}
   */
  public HivePartitionVersionFinder(FileSystem fs, State state, List<String> patterns) {
    this.fs = fs;
    this.state = new State(state);
    this.patterns = patterns;
  }

  @Override
  public Class<HivePartitionVersion> versionClass() {
    return HivePartitionVersion.class;
  }

  /**
   * Will find all the versions of the {@link HivePartitionDataset}.
   *
   * For a dataset with table name table1, corresponding versions table will be
   * table1_backup_timestamp or table1_staging_timestamp or table1_trash_timestamp
   *
   * Based on pattern, a type of version will be selected eg. backup or trash or staging
   *
   * A version table that yields no partitions (or whose partitions cannot be fetched) simply
   * contributes no versions.
   *
   * @param dataset the dataset to find versions for
   * @return a new mutable collection of versions; empty when {@code dataset} is not a
   *         {@link HivePartitionDataset}
   * @throws IllegalArgumentException if no patterns are configured or the
   *         {@link ComplianceConfigurationKeys#HIVE_VERSIONS_WHITELIST} property is missing
   * @throws IOException if the lookup fails or the calling thread is interrupted
   */
  @Override
  public Collection<HivePartitionVersion> findDatasetVersions(Dataset dataset)
      throws IOException {
    List<HivePartitionVersion> versions = new ArrayList<>();
    if (!(dataset instanceof HivePartitionDataset)) {
      return versions;
    }
    HivePartitionDataset hivePartitionDataset = (HivePartitionDataset) dataset;
    this.owner = hivePartitionDataset.getOwner();
    Preconditions.checkArgument(!this.patterns.isEmpty(),
        "No patterns to find versions for the dataset " + dataset.datasetURN());

    versions
        .addAll(findVersions(hivePartitionDataset.getName(), hivePartitionDataset.datasetURN()));
    return versions;
  }

  /**
   * Validates the configuration, collects the versions for {@code name} and logs how many were found.
   *
   * @param name partition name the versions must match
   * @param urn dataset URN, used only for the log message
   */
  private List<HivePartitionVersion> findVersions(String name, String urn)
      throws IOException {
    Preconditions.checkArgument(this.state.contains(ComplianceConfigurationKeys.HIVE_VERSIONS_WHITELIST),
        "Missing required property " + ComplianceConfigurationKeys.HIVE_VERSIONS_WHITELIST);

    List<HivePartitionVersion> found = collectVersions(name);
    log.info("Found " + found.size() + " versions for the dataset " + urn);
    return found;
  }

  /**
   * Appends a {@link HivePartitionRetentionVersion} to {@code versions} for every partition whose name
   * equals {@code name}, ignoring case.
   */
  private void addPartitionsToVersions(List<HivePartitionVersion> versions, String name,
      List<Partition> partitions)
      throws IOException {
    for (Partition partition : partitions) {
      if (partition.getName().equalsIgnoreCase(name)) {
        versions.add(new HivePartitionRetentionVersion(partition));
      }
    }
  }

  /**
   * Scans {@link ComplianceRetentionJob#tableNamesList} as the login user and returns the versions
   * matching {@code name}.
   *
   * @param name partition name the versions must match
   * @return a new list owned by the caller
   * @throws IOException if the scan fails, or if the thread is interrupted (the interrupt flag is
   *         restored before the exception is thrown)
   */
  private List<HivePartitionVersion> collectVersions(final String name)
      throws IOException {
    final List<HivePartitionVersion> found = new ArrayList<>();
    try {
      UserGroupInformation loginUser = UserGroupInformation.getLoginUser();
      loginUser.doAs(new PrivilegedExceptionAction<Void>() {
        @Override
        public Void run()
            throws IOException {
          synchronized (lock) {
            for (String tableName : ComplianceRetentionJob.tableNamesList) {
              // NOTE(review): a table name containing several patterns is queried and added once per
              // matching pattern; this mirrors the existing behavior - confirm whether that is intended.
              for (String pattern : patterns) {
                if (tableName.contains(pattern)) {
                  addPartitionsToVersions(found, name, getPartitions(tableName));
                }
              }
            }
          }
          return null;
        }
      });
    } catch (InterruptedException e) {
      // Restore the interrupt flag so callers further up the stack can still observe the interruption.
      Thread.currentThread().interrupt();
      throw new IOException(e);
    }
    return found;
  }

  /**
   * Fetches the partitions of the table identified by {@code completeTableName}.
   *
   * <p>This is best effort: an unparsable name or any failure while talking to the metastore is logged
   * as a warning and results in an empty list rather than an exception.
   *
   * @param completeTableName table name made of exactly two non-empty parts separated by {@code @}
   * @return the partitions reported for the table, or an immutable empty list on failure
   */
  private static List<Partition> getPartitions(String completeTableName) {
    List<String> tableList = At_SPLITTER.splitToList(completeTableName);
    if (tableList.size() != 2) {
      log.warn("Invalid table name " + completeTableName);
      return Collections.emptyList();
    }
    // FileSystem.newInstance always creates an uncached instance, so it is closed here together with
    // the metastore client once the partitions have been read.
    try (AutoReturnableObject<IMetaStoreClient> client = ComplianceRetentionJob.pool.getClient();
        FileSystem datasetFs = FileSystem.newInstance(new Configuration())) {
      Table table = client.get().getTable(tableList.get(0), tableList.get(1));
      HiveDataset dataset = new HiveDataset(datasetFs, ComplianceRetentionJob.pool,
          new org.apache.hadoop.hive.ql.metadata.Table(table), new Properties());
      return dataset.getPartitionsFromDataset();
    } catch (IOException | TException e) {
      // Pass the exception as well so the stack trace is not lost.
      log.warn("Unable to get Partitions for table " + completeTableName + " " + e.getMessage(), e);
    }
    return Collections.emptyList();
  }
}