/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compliance.retention;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.thrift.TException;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;

import edu.umd.cs.findbugs.annotations.SuppressWarnings;

import javax.annotation.Nullable;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.compliance.ComplianceConfigurationKeys;
import org.apache.gobblin.compliance.ComplianceEvents;
import org.apache.gobblin.compliance.ComplianceJob;
import org.apache.gobblin.compliance.HiveProxyQueryExecutor;
import org.apache.gobblin.compliance.purger.HivePurgerQueryTemplate;
import org.apache.gobblin.compliance.utils.ProxyUtils;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder;
import org.apache.gobblin.data.management.retention.dataset.CleanableDataset;
import org.apache.gobblin.dataset.Dataset;
import org.apache.gobblin.dataset.DatasetsFinder;
import org.apache.gobblin.hive.HiveMetastoreClientPool;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.reflection.GobblinConstructorUtils;

import static org.apache.gobblin.compliance.ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS;

/**
* A {@link ComplianceJob} that enforces retention requirements: it drops empty
* staging, backup, and trash tables and cleans every {@link CleanableDataset}
* produced by the configured dataset finder.
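*
* A minimal usage sketch (the finder class name below is hypothetical and
* deployment-specific; it must implement {@link DatasetsFinder}):
* <pre>{@code
* Properties props = new Properties();
* props.setProperty(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS,
*     "com.example.RetentionDatasetFinder"); // hypothetical finder class
* ComplianceRetentionJob job = new ComplianceRetentionJob(props);
* try {
*   job.run();
* } finally {
*   job.close(); // waits for all clean tasks, then shuts down the executor
* }
* }</pre>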
*/
@Slf4j
@SuppressWarnings
public class ComplianceRetentionJob extends ComplianceJob {
public static final List<String> tableNamesList = new ArrayList<>();
public static HiveMetastoreClientPool pool;
private List<HiveDataset> tablesToDrop = new ArrayList<>();
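
/**
* Creates the shared Hive metastore client pool, initializes the configured
* dataset finder, and cancels outstanding proxy tokens via {@link ProxyUtils}.
*/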
public ComplianceRetentionJob(Properties properties) {
super(properties);
try {
ComplianceRetentionJob.pool = HiveMetastoreClientPool
.get(properties, Optional.fromNullable(properties.getProperty(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
initDatasetFinder(properties);
ProxyUtils.cancelTokens(new State(properties));
} catch (InterruptedException | TException | IOException e) {
Throwables.propagate(e);
}
}
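
/**
* Instantiates the dataset finder named by GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS and
* scans all Hive datasets: tables with partitions are recorded in {@link #tableNamesList},
* while empty trash, backup, and staging tables are queued for dropping when
* the SHOULD_DROP_EMPTY_TABLES property is set.
*/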
public void initDatasetFinder(Properties properties)
throws IOException {
Preconditions.checkArgument(properties.containsKey(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS),
"Missing required propety " + GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
String finderClass = properties.getProperty(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
this.finder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, finderClass, new State(properties));
Iterator<HiveDataset> datasetsIterator =
new HiveDatasetFinder(FileSystem.newInstance(new Configuration()), properties).getDatasetsIterator();
while (datasetsIterator.hasNext()) {
// Keep tables that have partitions; empty trash/backup/staging tables may be queued for dropping below
HiveDataset hiveDataset = datasetsIterator.next();
List<Partition> partitionsFromDataset = hiveDataset.getPartitionsFromDataset();
String completeTableName = hiveDataset.getTable().getCompleteName();
if (!partitionsFromDataset.isEmpty()) {
tableNamesList.add(completeTableName);
continue;
}
if (!Boolean.parseBoolean(properties.getProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES,
ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES))) {
continue;
}
if (completeTableName.contains(ComplianceConfigurationKeys.TRASH) || completeTableName
.contains(ComplianceConfigurationKeys.BACKUP) || completeTableName
.contains(ComplianceConfigurationKeys.STAGING)) {
this.tablesToDrop.add(hiveDataset);
}
}
}
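
/**
* Drops the queued empty tables, then submits one asynchronous clean task per dataset
* returned by the finder. A {@link CountDownLatch} tracks completion so that
* {@link #close()} can block until every task has finished; failures are recorded
* and reported via the event submitter.
*/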
public void run()
throws IOException {
// Dropping empty tables
for (HiveDataset dataset : this.tablesToDrop) {
log.info("Dropping table: " + dataset.getTable().getCompleteName());
executeDropTableQuery(dataset, this.properties);
}
Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
List<Dataset> datasets = this.finder.findDatasets();
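// One latch count per dataset; both the success and failure callbacks count down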
this.finishCleanSignal = Optional.of(new CountDownLatch(datasets.size()));
for (final Dataset dataset : datasets) {
ListenableFuture<Void> future = this.service.submit(new Callable<Void>() {
@Override
public Void call()
throws Exception {
if (dataset instanceof CleanableDataset) {
((CleanableDataset) dataset).clean();
} else {
log.warn("Not an instance of " + CleanableDataset.class + "; dataset won't be cleaned: " + dataset.datasetURN());
}
return null;
}
});
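// On failure, record the throwable and emit a FAILED retention event; the latch still counts down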
Futures.addCallback(future, new FutureCallback<Void>() {
@Override
public void onSuccess(@Nullable Void result) {
ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
log.info("Successfully cleaned: " + dataset.datasetURN());
}
@Override
public void onFailure(Throwable t) {
ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
log.warn("Exception caught when cleaning " + dataset.datasetURN() + ".", t);
ComplianceRetentionJob.this.throwables.add(t);
ComplianceRetentionJob.this.eventSubmitter.submit(ComplianceEvents.Retention.FAILED_EVENT_NAME, ImmutableMap
.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t),
ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
}
});
}
}
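
/**
* Waits for all clean tasks to finish, rethrows if any of them failed, and
* shuts down the executor service and other resources.
*/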
@Override
public void close()
throws IOException {
try {
if (this.finishCleanSignal.isPresent()) {
this.finishCleanSignal.get().await();
}
if (!this.throwables.isEmpty()) {
for (Throwable t : this.throwables) {
log.error("Failed clean due to ", t);
}
throw new RuntimeException("Retention failed for one or more datasets");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Not all datasets finished retention", e);
} finally {
ExecutorsUtils.shutdownExecutorService(this.service, Optional.of(log));
this.closer.close();
}
}
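
/**
* Drops the given Hive table by running the purger's drop-table query through a
* {@link HiveProxyQueryExecutor}, proxying as the table owner when one is set.
*/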
private static void executeDropTableQuery(HiveDataset hiveDataset, Properties properties)
throws IOException {
String dbName = hiveDataset.getTable().getDbName();
String tableName = hiveDataset.getTable().getTableName();
Optional<String> datasetOwner = Optional.fromNullable(hiveDataset.getTable().getOwner());
try (HiveProxyQueryExecutor hiveProxyQueryExecutor = ProxyUtils
.getQueryExecutor(new State(properties), datasetOwner)) {
hiveProxyQueryExecutor.executeQuery(HivePurgerQueryTemplate.getDropTableQuery(dbName, tableName), datasetOwner);
} catch (SQLException e) {
throw new IOException(e);
}
}
/**
* Generates retention metrics for the instrumentation of this class.
*/
@Override
protected void regenerateMetrics() {
// TODO
}
}