// org.apache.gobblin.compaction.hive.CompactionRunner (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compaction.hive;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationConverter;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.gobblin.compaction.CliOptions;
import org.apache.gobblin.compaction.mapreduce.MRCompactionRunner;
/**
 * Runs Hive compaction jobs described by the configuration files in a directory.
 *
 * <p>The directory is taken from the {@code compaction.config.dir} property. Every regular,
 * non-hidden file in it is loaded as one job configuration, compacted with a
 * {@link SerialCompactor}, and its wall-clock duration (in seconds) is appended to the timing
 * file named by {@code timing.file} (default {@code time.txt}).
 *
 * <p>Per-job property keys read by this class:
 * <ul>
 *   <li>{@code snapshot.name}, {@code snapshot.pkey}, {@code snapshot.datalocation},
 *       {@code snapshot.schemalocation}, {@code snapshot.copydata},
 *       {@code snapshot.dataformat.extension.name}</li>
 *   <li>{@code delta.N.*} with the same suffixes, for N = 1, 2, ... until
 *       {@code delta.N.datalocation} is missing</li>
 *   <li>{@code output.name}, {@code output.datalocation}</li>
 * </ul>
 *
 * <p>State is kept in static fields, so this class is written for a single-threaded,
 * one-job-at-a-time run.
 */
public class CompactionRunner {
  private static final Logger LOG = LoggerFactory.getLogger(CompactionRunner.class);

  // Runner-level property keys.
  private static final String COMPACTION_CONFIG_DIR = "compaction.config.dir";
  private static final String TIMING_FILE = "timing.file";
  private static final String TIMING_FILE_DEFAULT = "time.txt";

  // Table-type prefixes used to build per-job property keys.
  private static final String SNAPSHOT = "snapshot";
  private static final String DELTA = "delta";

  // Suffixes appended to a table-type prefix.
  private static final String NAME = ".name";
  private static final String PKEY = ".pkey";
  private static final String DATALOCATION = ".datalocation";
  private static final String SCHEMALOCATION = ".schemalocation";
  private static final String COPYDATA = ".copydata";
  private static final String COPYDATA_DEFAULT = "false";
  private static final String DATAFORMAT_EXTENSION_NAME = ".dataformat.extension.name";
  private static final String OUTPUT = "output";

  /** Runner-level properties, populated from the command line in {@link #main(String[])}. */
  static Properties properties = new Properties();

  /** Properties of the job currently being compacted; replaced for each config file. */
  static Properties jobProperties = new Properties();

  /**
   * Entry point: compacts every job configuration found in the configured directory and records
   * how long each one took.
   *
   * <p>Exits the JVM with status 1 if the config directory property is missing, or if the
   * directory cannot be listed or contains no entries.
   *
   * @param args command-line arguments, handed to {@link CliOptions#parseArgs}
   * @throws IOException if the timing file cannot be opened or a compaction fails with an I/O error
   * @throws ConfigurationException if a job configuration file cannot be loaded
   */
  public static void main(String[] args) throws IOException, ConfigurationException {
    // NOTE(review): MRCompactionRunner.class is passed here rather than CompactionRunner.class;
    // kept as-is because CliOptions is not visible from this file - confirm this is intended.
    properties = CliOptions.parseArgs(MRCompactionRunner.class, args);

    String configDirPath = properties.getProperty(COMPACTION_CONFIG_DIR);
    if (configDirPath == null) {
      // new File(null) would throw a bare NullPointerException; report the missing key instead.
      System.err.println("Missing required property " + COMPACTION_CONFIG_DIR);
      System.exit(1);
    }

    File compactionConfigDir = new File(configDirPath);
    File[] listOfFiles = compactionConfigDir.listFiles();
    if (listOfFiles == null || listOfFiles.length == 0) {
      System.err.println("No compaction configuration files found under " + compactionConfigDir);
      System.exit(1);
    }

    int numOfJobs = 0;
    for (File file : listOfFiles) {
      if (isJobConfigFile(file)) {
        numOfJobs++;
      }
    }
    LOG.info("Found {} compaction tasks.", numOfJobs);

    try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
        new FileOutputStream(properties.getProperty(TIMING_FILE, TIMING_FILE_DEFAULT)), Charset.forName("UTF-8")))) {
      for (File file : listOfFiles) {
        if (!isJobConfigFile(file)) {
          continue;
        }
        Configuration jobConfig = new PropertiesConfiguration(file.getAbsolutePath());
        jobProperties = ConfigurationConverter.getProperties(jobConfig);

        long startTime = System.nanoTime();
        compact();
        long elapsedNanos = System.nanoTime() - startTime;

        // Divide as a double: TimeUnit.NANOSECONDS.toSeconds() truncates to whole seconds,
        // which would make every sub-second job report 0.000000.
        double seconds = elapsedNanos / (double) TimeUnit.SECONDS.toNanos(1);
        pw.printf("%s: %f%n", file.getAbsolutePath(), seconds);
      }
    }
  }

  /** Returns whether {@code file} is a regular file whose name does not start with a dot. */
  private static boolean isJobConfigFile(File file) {
    return file.isFile() && !file.getName().startsWith(".");
  }

  /**
   * Builds a {@link SerialCompactor} from the current {@link #jobProperties} and runs it.
   *
   * @throws IOException propagated from table construction or the compaction itself
   */
  private static void compact() throws IOException {
    SerialCompactor sc = new SerialCompactor.Builder()
        .withSnapshot(buildSnapshotTable())
        .withDeltas(buildDeltaTables())
        .withOutputTableName(jobProperties.getProperty(OUTPUT + NAME))
        .withOutputDataLocationInHdfs(jobProperties.getProperty(OUTPUT + DATALOCATION))
        .build();
    sc.compact();
  }

  /** Builds the snapshot table from the {@code snapshot.*} job properties. */
  private static AvroExternalTable buildSnapshotTable() throws IOException {
    return buildAvroExternalTable(SNAPSHOT);
  }

  /**
   * Builds the delta tables from {@code delta.1.*}, {@code delta.2.*}, ... job properties.
   *
   * @return the delta tables in index order; empty if {@code delta.1.datalocation} is not set
   */
  private static List<AvroExternalTable> buildDeltaTables() throws IOException {
    List<AvroExternalTable> deltas = new ArrayList<>();
    // Delta indices are 1-based and must be contiguous: the first missing
    // "delta.<i>.datalocation" ends the scan.
    for (int i = 1;; i++) {
      String deltai = DELTA + "." + i;
      if (jobProperties.getProperty(deltai + DATALOCATION) == null) {
        break;
      }
      deltas.add(buildAvroExternalTable(deltai));
    }
    return deltas;
  }

  /**
   * Builds one {@link AvroExternalTable} from the job properties sharing the given prefix.
   *
   * @param tableType property-key prefix, e.g. {@code snapshot} or {@code delta.1}
   * @return the table built from {@code <tableType>.name}, {@code .pkey},
   *     {@code .schemalocation} and {@code .datalocation}
   * @throws IOException propagated from the table builder
   */
  private static AvroExternalTable buildAvroExternalTable(String tableType) throws IOException {
    AvroExternalTable.Builder builder =
        new AvroExternalTable.Builder().withName(jobProperties.getProperty(tableType + NAME, ""))
            .withPrimaryKeys(jobProperties.getProperty(tableType + PKEY))
            .withSchemaLocation(jobProperties.getProperty(tableType + SCHEMALOCATION, ""))
            .withDataLocation(jobProperties.getProperty(tableType + DATALOCATION));
    // Only when "<tableType>.copydata" is true is the builder asked to move the data to a
    // temporary HDFS directory, passing the configured data-format extension name.
    if (Boolean.parseBoolean(jobProperties.getProperty(tableType + COPYDATA, COPYDATA_DEFAULT))) {
      builder = builder.withMoveDataToTmpHdfsDir(jobProperties.getProperty(tableType + DATAFORMAT_EXTENSION_NAME, ""));
    }
    return builder.build();
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy