org.apache.gobblin.compaction.hive.SerialCompactor Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compaction.hive;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import java.util.UUID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.io.Closer;
import org.apache.gobblin.compaction.Compactor;
import org.apache.gobblin.util.HiveJdbcConnector;
/**
* An implementation of compactor. This class assumes that the snapshot table
* and the delta tables are taken one after another (hence the name
* "SerialCompactor").
*/
public class SerialCompactor implements Compactor {
private static final Logger LOG = LoggerFactory.getLogger(SerialCompactor.class);
private static final String HIVE_DB_NAME = "hive.db.name";
private static final String HIVE_QUEUE_NAME = "hive.queue.name";
private static final String HIVE_USE_MAPJOIN = "hive.use.mapjoin";
private static final String HIVE_MAPJOIN_SMALLTABLE_FILESIZE = "hive.mapjoin.smalltable.filesize";
private static final String HIVE_AUTO_CONVERT_JOIN = "hive.auto.convert.join";
private static final String HIVE_INPUT_SPLIT_SIZE = "hive.input.split.size";
private static final String MAPRED_MIN_SPLIT_SIZE = "mapred.min.split.size";
private static final String MAPREDUCE_JOB_REDUCES = "mapreduce.job.reduces";
private static final String MAPREDUCE_JOB_NUM_REDUCERS = "mapreduce.job.num.reducers";
private static final String MAPREDUCE_JOB_QUEUENAME = "mapreduce.job.queuename";
private final AvroExternalTable snapshot;
private final List deltas;
private final String outputTableName;
private final String outputDataLocationInHdfs;
private final AvroExternalTable latestTable;
private final String jobId;
private HiveJdbcConnector conn;
public static class Builder {
private AvroExternalTable snapshot;
private List deltas;
private String outputTableName;
private String outputDataLocationInHdfs;
public Builder withSnapshot(AvroExternalTable snapshot) {
this.snapshot = snapshot;
return this;
}
public Builder withDeltas(List deltas) {
Preconditions.checkArgument(deltas.size() >= 1, "Number of delta tables should be at least 1");
this.deltas = deltas;
return this;
}
public Builder withOutputTableName(String outputTableName) {
this.outputTableName = outputTableName;
return this;
}
public Builder withOutputDataLocationInHdfs(String outputDataLocationInHdfs) {
this.outputDataLocationInHdfs = outputDataLocationInHdfs;
return this;
}
public SerialCompactor build() {
return new SerialCompactor(this);
}
}
private SerialCompactor(SerialCompactor.Builder builder) {
this.snapshot = builder.snapshot;
this.deltas = builder.deltas;
this.outputTableName = builder.outputTableName;
this.outputDataLocationInHdfs = builder.outputDataLocationInHdfs;
this.latestTable = this.deltas.get(this.deltas.size() - 1);
this.jobId = UUID.randomUUID().toString().replaceAll("-", "_");
}
@Override
public void compact() throws IOException {
checkSchemaCompatibility();
Closer closer = Closer.create();
try {
this.conn = closer.register(HiveJdbcConnector.newConnectorWithProps(CompactionRunner.properties));
setHiveParameters();
createTables();
HiveTable mergedDelta = mergeDeltas();
HiveManagedTable notUpdated = getNotUpdatedRecords(this.snapshot, mergedDelta);
unionNotUpdatedRecordsAndDeltas(notUpdated, mergedDelta);
} catch (SQLException e) {
LOG.error("SQLException during compaction: " + e.getMessage());
throw new RuntimeException(e);
} catch (IOException e) {
LOG.error("IOException during compaction: " + e.getMessage());
throw new RuntimeException(e);
} catch (RuntimeException e) {
LOG.error("Runtime Exception during compaction: " + e.getMessage());
throw e;
} finally {
try {
deleteTmpFiles();
} finally {
closer.close();
}
}
}
private void checkSchemaCompatibility() {
for (int i = 0; i < this.deltas.size(); i++) {
if (!this.snapshot.hasSamePrimaryKey(this.deltas.get(i))) {
String message = "Schema incompatible: the snapshot table and delta table #" + (i + 1)
+ " do not have the same primary key.";
LOG.error(message);
throw new RuntimeException(message);
}
}
}
private void setHiveParameters() throws SQLException {
setHiveQueueName();
setHiveDbName();
setHiveMapjoin();
setHiveInputSplitSize();
setNumberOfReducers();
}
private void setHiveQueueName() throws SQLException {
this.conn.executeStatements("set " + MAPREDUCE_JOB_QUEUENAME + "="
+ CompactionRunner.jobProperties.getProperty(HIVE_QUEUE_NAME, "default"));
}
private void setHiveDbName() throws SQLException {
this.conn.executeStatements("use " + CompactionRunner.jobProperties.getProperty(HIVE_DB_NAME, "default"));
}
private void setHiveMapjoin() throws SQLException {
boolean useMapjoin = Boolean.parseBoolean(CompactionRunner.jobProperties.getProperty(HIVE_USE_MAPJOIN, "false"));
boolean smallTableSizeSpecified = CompactionRunner.jobProperties.containsKey(HIVE_MAPJOIN_SMALLTABLE_FILESIZE);
if (useMapjoin && smallTableSizeSpecified) {
this.conn.executeStatements("set " + HIVE_AUTO_CONVERT_JOIN + "=true");
this.conn.executeStatements("set " + HIVE_MAPJOIN_SMALLTABLE_FILESIZE + "="
+ CompactionRunner.jobProperties.getProperty(HIVE_MAPJOIN_SMALLTABLE_FILESIZE));
}
}
private void setHiveInputSplitSize() throws SQLException {
boolean splitSizeSpecified = CompactionRunner.jobProperties.containsKey(HIVE_INPUT_SPLIT_SIZE);
if (splitSizeSpecified) {
this.conn.executeStatements(
"set " + MAPRED_MIN_SPLIT_SIZE + "=" + CompactionRunner.jobProperties.getProperty(HIVE_INPUT_SPLIT_SIZE));
}
}
private void setNumberOfReducers() throws SQLException {
boolean numOfReducersSpecified = CompactionRunner.jobProperties.containsKey(MAPREDUCE_JOB_NUM_REDUCERS);
if (numOfReducersSpecified) {
this.conn.executeStatements("set " + MAPREDUCE_JOB_REDUCES + "="
+ CompactionRunner.jobProperties.getProperty(MAPREDUCE_JOB_NUM_REDUCERS));
}
}
private void createTables() throws SQLException {
this.snapshot.createTable(this.conn, this.jobId);
for (AvroExternalTable delta : this.deltas) {
delta.createTable(this.conn, this.jobId);
}
}
private HiveTable mergeDeltas() throws SQLException {
if (this.deltas.size() == 1) {
LOG.info("Only one delta table: no need to merge delta");
return this.deltas.get(0);
}
HiveManagedTable mergedDelta =
new HiveManagedTable.Builder().withName("merged_delta").withAttributes(this.deltas.get(0).getAttributes())
.withPrimaryKeys(this.deltas.get(0).getPrimaryKeys()).build();
mergedDelta.createTable(this.conn, this.jobId);
insertFirstDeltaIntoMergedDelta(mergedDelta);
this.deltas.get(0).dropTable(this.conn, this.jobId);
for (int i = 1; i < this.deltas.size(); i++) {
mergedDelta = mergeTwoDeltas(mergedDelta, this.deltas.get(i));
LOG.info("Merged the first " + (i + 1) + " delta tables");
this.deltas.get(i).dropTable(this.conn, this.jobId);
}
return mergedDelta;
}
private void insertFirstDeltaIntoMergedDelta(HiveManagedTable mergedDelta) throws SQLException {
String insertStmt = "INSERT OVERWRITE TABLE " + mergedDelta.getNameWithJobId(this.jobId) + " SELECT * FROM "
+ this.deltas.get(0).getNameWithJobId(this.jobId);
this.conn.executeStatements(insertStmt);
}
private HiveManagedTable mergeTwoDeltas(HiveManagedTable mergedDelta, AvroExternalTable nextDelta)
throws SQLException {
HiveManagedTable notUpdated = getNotUpdatedRecords(mergedDelta, nextDelta);
HiveTable notUpdatedWithNewSchema = notUpdated.addNewColumnsInSchema(this.conn, this.latestTable, this.jobId);
HiveTable nextDeltaWithNewSchema = nextDelta.addNewColumnsInSchema(this.conn, this.latestTable, this.jobId);
mergedDelta = new HiveManagedTable.Builder().withName(mergedDelta.getName())
.withAttributes(this.latestTable.getAttributes()).withPrimaryKeys(this.latestTable.getPrimaryKeys()).build();
mergedDelta.createTable(this.conn, this.jobId);
String unionStmt = "INSERT OVERWRITE TABLE " + mergedDelta.getNameWithJobId(this.jobId) + " SELECT "
+ getAttributesInNewSchema() + " FROM " + notUpdatedWithNewSchema.getNameWithJobId(this.jobId) + " UNION ALL "
+ "SELECT " + getAttributesInNewSchema() + " FROM " + nextDeltaWithNewSchema.getNameWithJobId(this.jobId);
this.conn.executeStatements(unionStmt);
nextDelta.dropTable(this.conn, this.jobId);
return mergedDelta;
}
private String getAttributesInNewSchema() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < this.latestTable.getAttributes().size(); i++) {
sb.append(this.latestTable.getAttributes().get(i).name());
if (i < this.latestTable.getAttributes().size() - 1) {
sb.append(", ");
}
}
return sb.toString();
}
private HiveManagedTable getNotUpdatedRecords(HiveTable oldTable, HiveTable newTable) throws SQLException {
LOG.info("Getting records in table " + oldTable.getNameWithJobId(this.jobId) + " but not in table "
+ newTable.getNameWithJobId(this.jobId));
HiveManagedTable notUpdated = new HiveManagedTable.Builder().withName("not_updated")
.withPrimaryKeys(oldTable.getPrimaryKeys()).withAttributes(oldTable.getAttributes()).build();
notUpdated.createTable(this.conn, this.jobId);
String leftOuterJoinStmt = "INSERT OVERWRITE TABLE " + notUpdated.getNameWithJobId(this.jobId) + " SELECT "
+ oldTable.getNameWithJobId(this.jobId) + ".* FROM " + oldTable.getNameWithJobId(this.jobId)
+ " LEFT OUTER JOIN " + newTable.getNameWithJobId(this.jobId) + " ON " + getJoinCondition(oldTable, newTable)
+ " WHERE " + getKeyIsNullPredicate(newTable);
this.conn.executeStatements(leftOuterJoinStmt);
oldTable.dropTable(this.conn, this.jobId);
return notUpdated;
}
private String getJoinCondition(HiveTable firstTable, HiveTable secondTable) {
if (!firstTable.getPrimaryKeys().equals(secondTable.getPrimaryKeys())) {
throw new RuntimeException("The primary keys of table " + firstTable.getName() + " and table "
+ secondTable.getName() + " are different");
}
boolean addAnd = false;
StringBuilder sb = new StringBuilder();
for (String keyAttribute : firstTable.getPrimaryKeys()) {
if (addAnd) {
sb.append(" AND ");
}
sb.append(firstTable.getNameWithJobId(this.jobId) + "." + keyAttribute + " = "
+ secondTable.getNameWithJobId(this.jobId) + "." + keyAttribute);
addAnd = true;
}
return sb.toString();
}
private String getKeyIsNullPredicate(HiveTable table) {
boolean addAnd = false;
StringBuilder sb = new StringBuilder();
for (String keyAttribute : table.getPrimaryKeys()) {
if (addAnd) {
sb.append(" AND ");
}
sb.append(table.getNameWithJobId(this.jobId) + "." + keyAttribute + " IS NULL");
addAnd = true;
}
return sb.toString();
}
private AvroExternalTable unionNotUpdatedRecordsAndDeltas(HiveManagedTable notUpdated, HiveTable mergedDelta)
throws IOException, SQLException {
LOG.info("Taking union of table " + notUpdated.getNameWithJobId(this.jobId)
+ "(records in snapshot but not in delta) and table " + mergedDelta.getNameWithJobId(this.jobId)
+ "(merged delta)");
HiveTable notUpdatedWithNewSchema = notUpdated.addNewColumnsInSchema(this.conn, this.latestTable, this.jobId);
HiveTable mergedDeltaWithNewSchema = mergedDelta.addNewColumnsInSchema(this.conn, this.latestTable, this.jobId);
AvroExternalTable outputTable = new AvroExternalTable.Builder().withName(this.outputTableName)
.withPrimaryKeys(this.latestTable.getPrimaryKeys())
.withSchemaLocation(this.latestTable.getSchemaLocationInHdfs()).withDataLocation(this.outputDataLocationInHdfs)
.build();
outputTable.createTable(this.conn, this.jobId);
String unionStmt = "INSERT OVERWRITE TABLE " + outputTable.getNameWithJobId(this.jobId) + " SELECT "
+ getAttributesInNewSchema() + " FROM " + notUpdatedWithNewSchema.getNameWithJobId(this.jobId) + " UNION ALL "
+ "SELECT " + getAttributesInNewSchema() + " FROM " + mergedDeltaWithNewSchema.getNameWithJobId(this.jobId);
this.conn.executeStatements(unionStmt);
notUpdatedWithNewSchema.dropTable(this.conn, this.jobId);
mergedDeltaWithNewSchema.dropTable(this.conn, this.jobId);
return outputTable;
}
private void deleteTmpFiles() throws IllegalArgumentException, IOException {
this.snapshot.deleteTmpFilesIfNeeded();
for (AvroExternalTable delta : this.deltas) {
delta.deleteTmpFilesIfNeeded();
}
}
@Override
public void cancel() throws IOException {
// Do nothing
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy