org.apache.gobblin.compliance.purger.HivePurgerPublisher

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.compliance.purger;

import java.security.PrivilegedExceptionAction;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;

import com.google.common.base.Splitter;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.compliance.ComplianceConfigurationKeys;
import org.apache.gobblin.compliance.ComplianceEvents;
import org.apache.gobblin.compliance.HivePartitionDataset;
import org.apache.gobblin.compliance.utils.DatasetUtils;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;
import org.apache.gobblin.metrics.event.EventSubmitter;
import org.apache.gobblin.publisher.DataPublisher;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.util.HostUtils;


/**
 * A {@link DataPublisher} that moves each SUCCESSFUL {@link WorkUnitState} to COMMITTED
 * and marks all other work units FAILED, submitting a purger event in either case.
 *
 * @author adsharma
 */
@Slf4j
public class HivePurgerPublisher extends DataPublisher {
  protected MetricContext metricContext;
  protected EventSubmitter eventSubmitter;
  public HiveMetaStoreClient client;

  public HivePurgerPublisher(State state) throws Exception {
    super(state);
    this.metricContext = Instrumented.getMetricContext(state, this.getClass());
    this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, ComplianceEvents.NAMESPACE).build();

    initHiveMetastoreClient();
  }

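  /**
   * Creates the {@link HiveMetaStoreClient}. When a superuser keytab is configured, the
   * client is created inside a Kerberos login for that superuser; otherwise it is created
   * as the current user.
   */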
  public void initHiveMetastoreClient() throws Exception {
    if (this.state.contains(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION)) {
      String superUser = this.state.getProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SUPER_USER);
      String realm = this.state.getProp(ConfigurationKeys.KERBEROS_REALM);
      String keytabLocation = this.state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION);
      log.info("Establishing MetastoreClient connection using " + keytabLocation);

      UserGroupInformation.loginUserFromKeytab(HostUtils.getPrincipalUsingHostname(superUser, realm), keytabLocation);
      UserGroupInformation loginUser = UserGroupInformation.getLoginUser();
      loginUser.doAs(new PrivilegedExceptionAction<Void>() {
        @Override
        public Void run() throws TException {
          HivePurgerPublisher.this.client = new HiveMetaStoreClient(new HiveConf());
          return null;
        }
      });
    } else {
      HivePurgerPublisher.this.client = new HiveMetaStoreClient(new HiveConf());
    }
  }

  @Override
  public void initialize() {
  }

  @Override
  public void publishData(Collection<? extends WorkUnitState> states) {
    for (WorkUnitState state : states) {
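      // A SUCCESSFUL work unit means its partition was purged; promote it to COMMITTED,
      // otherwise mark it FAILED. An event is submitted in both cases.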
      if (state.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
        state.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
        submitEvent(state, ComplianceEvents.Purger.WORKUNIT_COMMITTED);
      } else {
        state.setWorkingState(WorkUnitState.WorkingState.FAILED);
        submitEvent(state, ComplianceEvents.Purger.WORKUNIT_FAILED);
      }
    }
  }

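  /**
   * Submits a purger event with the given name, carrying records/bytes read and written
   * for the Hive partition referenced by the work unit.
   */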
  private void submitEvent(WorkUnitState state, String name) {
    WorkUnit workUnit = state.getWorkunit();
    Map<String, String> metadata = new HashMap<>();
    String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD,
        getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE),
            workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));

    String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
    // Partition names are expected in the form dbName@tableName@partitionName.
    Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
    List<String> namesList = atSplitter.splitToList(partitionNameProp);
    if (namesList.size() != 3) {
      log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
      return;
    }

    String dbName = namesList.get(0);
    String tableName = namesList.get(1);
    String partitionName = namesList.get(2);
    org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
    Partition qlPartition = null;
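    // Resolve the purged partition through the metastore so its post-purge statistics can be read.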
    try {
      Table table = new Table(this.client.getTable(dbName, tableName));
      apiPartition = this.client.getPartition(dbName, tableName, partitionName);
      qlPartition = new Partition(table, apiPartition);
    } catch (Exception e) {
      log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "'", e);
      return;
    }

    HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);

    String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.NUM_ROWS,
        ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);

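    // Rows purged = rows read from the original partition minus rows remaining after the purge.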
    String recordsPurged = Long.toString((Long.parseLong(recordsRead) - Long.parseLong(recordsWritten)));
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
    String rawDataSizeWritten = DatasetUtils.getProperty(hivePartitionDataset,
        ComplianceConfigurationKeys.RAW_DATA_SIZE, ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE);
    String totalSizeWritten = DatasetUtils.getProperty(hivePartitionDataset,
        ComplianceConfigurationKeys.TOTAL_SIZE, ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN,
        getDataSize(rawDataSizeWritten, totalSizeWritten));

    metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
    metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
    metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
    metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);

    this.eventSubmitter.submit(name, metadata);
  }

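  /**
   * Returns {@code totalDataSize} when it is positive; otherwise falls back to {@code rawDataSize}.
   */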
  private String getDataSize(String rawDataSize, String totalDataSize) {
    long rawDataSizeVal = Long.parseLong(rawDataSize);
    long totalDataSizeVal = Long.parseLong(totalDataSize);
    long dataSize = totalDataSizeVal;
    if (totalDataSizeVal <= 0) {
      dataSize = rawDataSizeVal;
    }
    return Long.toString(dataSize);
  }

  @Override
  public void publishMetadata(Collection<? extends WorkUnitState> states) {
  }

  @Override
  public void close() {
  }

  public static class DatasetMetrics {
    public static final String DATABASE_NAME = "HiveDatabaseName";
    public static final String TABLE_NAME = "HiveTableName";
    public static final String PARTITION_NAME = "HivePartitionName";
    public static final String RECORDS_PURGED = "RecordsPurged";
  }
}
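
Usage note (illustrative, not part of the source): a Gobblin job selects its publisher through the
standard data.publisher.type property, so a purge job would reference this class roughly as below;
any Kerberos-related properties are the ones read in initHiveMetastoreClient() above.

    # minimal sketch of the relevant job property (other job settings omitted)
    data.publisher.type=org.apache.gobblin.compliance.purger.HivePurgerPublisher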