/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compaction.source;
import java.io.IOException;
import java.net.URI;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTimeUtils;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.gobblin.compaction.mapreduce.MRCompactionTaskFactory;
import org.apache.gobblin.compaction.mapreduce.MRCompactor;
import org.apache.gobblin.compaction.suite.CompactionSuite;
import org.apache.gobblin.compaction.suite.CompactionSuiteUtils;
import org.apache.gobblin.compaction.verify.CompactionVerifier;
import org.apache.gobblin.config.ConfigBuilder;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.data.management.dataset.DatasetUtils;
import org.apache.gobblin.data.management.dataset.DefaultFileSystemGlobFinder;
import org.apache.gobblin.data.management.dataset.SimpleDatasetRequest;
import org.apache.gobblin.data.management.dataset.SimpleDatasetRequestor;
import org.apache.gobblin.dataset.Dataset;
import org.apache.gobblin.dataset.DatasetsFinder;
import org.apache.gobblin.runtime.JobState;
import org.apache.gobblin.runtime.task.FailedTask;
import org.apache.gobblin.runtime.task.TaskUtils;
import org.apache.gobblin.source.Source;
import org.apache.gobblin.source.WorkUnitStreamSource;
import org.apache.gobblin.source.extractor.Extractor;
import org.apache.gobblin.source.workunit.BasicWorkUnitStream;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.source.workunit.WorkUnitStream;
import org.apache.gobblin.util.ClassAliasResolver;
import org.apache.gobblin.util.Either;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.HadoopUtils;
import org.apache.gobblin.util.executors.IteratorExecutor;
import org.apache.gobblin.util.reflection.GobblinConstructorUtils;
import org.apache.gobblin.util.request_allocation.GreedyAllocator;
import org.apache.gobblin.util.request_allocation.HierarchicalAllocator;
import org.apache.gobblin.util.request_allocation.HierarchicalPrioritizer;
import org.apache.gobblin.util.request_allocation.RequestAllocator;
import org.apache.gobblin.util.request_allocation.RequestAllocatorConfig;
import org.apache.gobblin.util.request_allocation.RequestAllocatorUtils;
import org.apache.gobblin.util.request_allocation.ResourceEstimator;
import org.apache.gobblin.util.request_allocation.ResourcePool;
/**
 * A compaction source derived from {@link Source} which uses {@link DefaultFileSystemGlobFinder} to find all
 * {@link Dataset}s. {@link CompactionSuite#getDatasetsFinderVerifiers()} is used to guarantee that a given dataset
 * has passed all verifications. Each verified dataset is then serialized into a {@link WorkUnit} by
 * {@link CompactionSuite#save(Dataset, State)}. A minimal job-wiring sketch is included as a comment after
 * {@link #getWorkunitStream(SourceState)}.
 */
@Slf4j
public class CompactionSource implements WorkUnitStreamSource<String, String> {
public static final String COMPACTION_INIT_TIME = "compaction.init.time";
private CompactionSuite suite;
private Path tmpJobDir;
private FileSystem fs;
  private RequestAllocator<SimpleDatasetRequest> allocator;
@Override
  public List<WorkUnit> getWorkunits(SourceState state) {
throw new UnsupportedOperationException("Please use getWorkunitStream");
}
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
try {
fs = getSourceFileSystem(state);
state.setProp(COMPACTION_INIT_TIME, DateTimeUtils.currentTimeMillis());
suite = CompactionSuiteUtils.getCompactionSuiteFactory(state).createSuite(state);
initRequestAllocator(state);
initJobDir(state);
copyJarDependencies(state);
DatasetsFinder finder = DatasetUtils.instantiateDatasetFinder(state.getProperties(),
getSourceFileSystem(state),
DefaultFileSystemGlobFinder.class.getName());
      List<Dataset> datasets = finder.findDatasets();
CompactionWorkUnitIterator workUnitIterator = new CompactionWorkUnitIterator ();
// Spawn a single thread to create work units
new Thread(new SingleWorkUnitGeneratorService (state, prioritize(datasets, state), workUnitIterator), "SingleWorkUnitGeneratorService").start();
return new BasicWorkUnitStream.Builder (workUnitIterator).build();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
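  // A minimal job-wiring sketch, not an authoritative configuration: of the literal keys below, only
  // "source.class" and "source.filebased.fs.uri" are known key strings; the remaining settings are shown
  // by the constants this class reads, since their exact key strings are not part of this file.
  //
  //   source.class=org.apache.gobblin.compaction.source.CompactionSource
  //   source.filebased.fs.uri=hdfs://namenode:8020
  //   <MRCompactor.COMPACTION_TMP_DEST_DIR>=/tmp/compaction               (temporary job dir, see initJobDir)
  //   <ConfigurationKeys.COMPACTION_PRIORITIZER_ALIAS>=<comparator alias> (optional, see initRequestAllocator)
  //   <CompactionVerifier.COMPACTION_VERIFICATION_THREADS>=5              (verification parallelism, see SingleWorkUnitGeneratorService)
  //
  // The CompactionSuite itself is resolved via CompactionSuiteUtils.getCompactionSuiteFactory(state).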
  /**
   * A work unit generator service that does the following:
   * 1) Converts the dataset iterator into an iterator of verification callables; each callable runs one verification procedure.
   * 2) Uses {@link IteratorExecutor} to execute the callable iterator.
   * 3) Collects all datasets that failed in step 2) and retries them until the timeout is reached; once it is,
   *    failed work units are deliberately created for the remaining datasets.
   * A standalone sketch of the {@link IteratorExecutor} pattern appears in a comment after this class.
   */
private class SingleWorkUnitGeneratorService implements Runnable {
private SourceState state;
    private List<Dataset> datasets;
    private CompactionWorkUnitIterator workUnitIterator;
    private IteratorExecutor<VerifiedDataset> executor;
    public SingleWorkUnitGeneratorService (SourceState state, List<Dataset> datasets, CompactionWorkUnitIterator workUnitIterator) {
this.state = state;
this.datasets = datasets;
this.workUnitIterator = workUnitIterator;
}
public void run () {
try {
Stopwatch stopwatch = Stopwatch.createStarted();
int threads = this.state.getPropAsInt(CompactionVerifier.COMPACTION_VERIFICATION_THREADS, 5);
long timeOutInMinute = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_TIMEOUT_MINUTES, 30);
long iterationCountLimit = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_ITERATION_COUNT_LIMIT, Integer.MAX_VALUE);
long iteration = 0;
        Map<String, String> failedReasonMap = null;
while (datasets.size() > 0 && iteration++ < iterationCountLimit) {
          Iterator<Callable<VerifiedDataset>> verifierIterator =
              Iterators.transform (datasets.iterator(), new Function<Dataset, Callable<VerifiedDataset>>() {
                @Override
                public Callable<VerifiedDataset> apply(Dataset dataset) {
                  return new DatasetVerifier (dataset, workUnitIterator, suite.getDatasetsFinderVerifiers());
                }
              });
          executor = new IteratorExecutor<>(verifierIterator, threads,
              ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("Verifier-compaction-dataset-pool-%d")));
          List<Dataset> failedDatasets = Lists.newArrayList();
          failedReasonMap = Maps.newHashMap();
          List<Either<VerifiedDataset, ExecutionException>> futures = executor.executeAndGetResults();
          for (Either<VerifiedDataset, ExecutionException> either : futures) {
if (either instanceof Either.Right) {
              ExecutionException exc = ((Either.Right<VerifiedDataset, ExecutionException>) either).getRight();
DatasetVerificationException dve = (DatasetVerificationException) exc.getCause();
failedDatasets.add(dve.dataset);
failedReasonMap.put(dve.dataset.getUrn(), ExceptionUtils.getFullStackTrace(dve.cause));
} else {
              VerifiedDataset vd = ((Either.Left<VerifiedDataset, ExecutionException>) either).getLeft();
if (!vd.verifiedResult.allVerificationPassed) {
if (vd.verifiedResult.shouldRetry) {
log.debug ("Dataset {} verification has failure but should retry", vd.dataset.datasetURN());
failedDatasets.add(vd.dataset);
failedReasonMap.put(vd.dataset.getUrn(), vd.verifiedResult.failedReason);
} else {
log.debug ("Dataset {} verification has failure but no need to retry", vd.dataset.datasetURN());
}
}
}
}
this.datasets = prioritize(failedDatasets, state);
if (stopwatch.elapsed(TimeUnit.MINUTES) > timeOutInMinute) {
break;
}
}
if (this.datasets.size() > 0) {
for (Dataset dataset: datasets) {
log.info ("{} is timed out and give up the verification, adding a failed task", dataset.datasetURN());
// create failed task for these failed datasets
this.workUnitIterator.addWorkUnit (createWorkUnitForFailure(dataset, failedReasonMap.get(dataset.getUrn())));
}
}
this.workUnitIterator.done();
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
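  // The verification loop above is built on IteratorExecutor. A minimal standalone sketch of that pattern,
  // mirroring only the calls already used in this class (a hypothetical String payload stands in for
  // VerifiedDataset):
  //
  //   Iterator<Callable<String>> tasks = ...;   // one callable per item to verify
  //   IteratorExecutor<String> exec = new IteratorExecutor<>(tasks, threads,
  //       ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("example-pool-%d")));
  //   for (Either<String, ExecutionException> r : exec.executeAndGetResults()) {
  //     if (r instanceof Either.Right) {
  //       // failure: ((Either.Right<String, ExecutionException>) r).getRight() carries the exception
  //     } else {
  //       // success: ((Either.Left<String, ExecutionException>) r).getLeft() carries the result
  //     }
  //   }

  /**
   * Instantiate the {@link RequestAllocator} used by {@link #prioritize(List, State)}: a {@link GreedyAllocator}
   * when no prioritizer is configured, otherwise an allocator driven by the comparator bound to
   * {@link ConfigurationKeys#COMPACTION_PRIORITIZER_ALIAS}.
   */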
private void initRequestAllocator (State state) {
try {
      ResourceEstimator estimator = GobblinConstructorUtils.invokeLongestConstructor(
          new ClassAliasResolver<>(ResourceEstimator.class).resolveClass(state.getProp(ConfigurationKeys.COMPACTION_ESTIMATOR,
              SimpleDatasetRequest.SimpleDatasetCountEstimator.class.getName())));
      RequestAllocatorConfig.Builder<SimpleDatasetRequest> configBuilder =
          RequestAllocatorConfig.builder(estimator).allowParallelization(1).withLimitedScopeConfig(ConfigBuilder.create()
              .loadProps(state.getProperties(), ConfigurationKeys.COMPACTION_PRIORITIZATION_PREFIX).build());
if (!state.contains(ConfigurationKeys.COMPACTION_PRIORITIZER_ALIAS)) {
allocator = new GreedyAllocator<>(configBuilder.build());
return;
}
      Comparator<SimpleDatasetRequest> prioritizer = GobblinConstructorUtils.invokeLongestConstructor(
          new ClassAliasResolver<>(Comparator.class).resolveClass(state.getProp(ConfigurationKeys.COMPACTION_PRIORITIZER_ALIAS)), state);
configBuilder.withPrioritizer(prioritizer);
if (prioritizer instanceof HierarchicalPrioritizer) {
allocator = new HierarchicalAllocator.Factory().createRequestAllocator(configBuilder.build());
} else {
allocator = RequestAllocatorUtils.inferFromConfig(configBuilder.build());
}
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException("Cannot initialize allocator", e);
}
}
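  /**
   * Order and cap the candidate datasets: each dataset is wrapped in a {@link SimpleDatasetRequestor} and handed to
   * the configured {@link RequestAllocator}, with the total count bounded by a {@link ResourcePool} built from
   * {@link MRCompactor#COMPACTION_DATASETS_MAX_COUNT}. Returns the selected datasets.
   */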
  private List<Dataset> prioritize (List<Dataset> datasets, State state) {
double maxPool = state.getPropAsDouble(MRCompactor.COMPACTION_DATASETS_MAX_COUNT, MRCompactor.DEFUALT_COMPACTION_DATASETS_MAX_COUNT);
ResourcePool pool = ResourcePool.builder().maxResource(SimpleDatasetRequest.SIMPLE_DATASET_COUNT_DIMENSION, maxPool).build();
    Iterator<Dataset> newList = Iterators.transform(
this.allocator.allocateRequests(datasets.stream().map(SimpleDatasetRequestor::new).iterator(), pool), (input) -> input.getDataset());
return Lists.newArrayList(newList);
}
private static class DatasetVerificationException extends Exception {
private Dataset dataset;
private Throwable cause;
public DatasetVerificationException (Dataset dataset, Throwable cause) {
super ("Dataset:" + dataset.datasetURN() + " Exception:" + cause);
this.dataset = dataset;
this.cause = cause;
}
}
@AllArgsConstructor
private static class VerifiedDataset {
private Dataset dataset;
private VerifiedResult verifiedResult;
}
@AllArgsConstructor
private static class VerifiedResult {
private boolean allVerificationPassed;
private boolean shouldRetry;
private String failedReason;
}
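  /**
   * A {@link Callable} that runs all configured {@link CompactionVerifier}s against a single {@link Dataset}.
   * On success it enqueues a compaction {@link WorkUnit} on the shared {@link CompactionWorkUnitIterator}; any
   * unexpected error is rethrown as a {@link DatasetVerificationException} so the caller can retry the dataset.
   */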
@AllArgsConstructor
  private class DatasetVerifier implements Callable<VerifiedDataset> {
    private Dataset dataset;
    private CompactionWorkUnitIterator workUnitIterator;
    private List<CompactionVerifier> verifiers;
    /**
     * {@link VerifiedDataset} wraps the original {@link Dataset} so that, if verification fails, we can still get the
     * original dataset back and restart the entire verification process against the failed datasets.
     */
public VerifiedDataset call () throws DatasetVerificationException {
try {
VerifiedResult result = this.verify(dataset);
if (result.allVerificationPassed) {
this.workUnitIterator.addWorkUnit (createWorkUnit(dataset));
}
return new VerifiedDataset(dataset, result);
} catch (Exception e) {
throw new DatasetVerificationException(dataset, e);
}
}
public VerifiedResult verify (Dataset dataset) throws Exception {
boolean verificationPassed = true;
boolean shouldRetry = true;
String failedReason = "";
if (verifiers != null) {
for (CompactionVerifier verifier : verifiers) {
CompactionVerifier.Result rst = verifier.verify (dataset);
if (!rst.isSuccessful()) {
verificationPassed = false;
failedReason = rst.getFailureReason();
          // Not every verification needs to be retried. If a verifier that is not retriable
          // fails, we simply skip this dataset.
if (!verifier.isRetriable()) {
shouldRetry = false;
break;
}
}
}
}
return new VerifiedResult(verificationPassed, shouldRetry, failedReason);
}
}
/**
* Iterator that provides {@link WorkUnit}s for all verified {@link Dataset}s
*/
  private static class CompactionWorkUnitIterator implements Iterator<WorkUnit> {
    private LinkedBlockingDeque<WorkUnit> workUnits;
private WorkUnit last;
private AtomicBoolean isDone;
/**
* Constructor
*/
public CompactionWorkUnitIterator () {
this.workUnits = new LinkedBlockingDeque<>();
this.isDone = new AtomicBoolean(false);
this.last = null;
}
    /**
     * Check if any {@link WorkUnit} is available. The producer is {@link SingleWorkUnitGeneratorService}.
     * @return true when a new {@link WorkUnit} is available;
     *         false once {@link CompactionWorkUnitIterator#done()} has been invoked and the queue is drained
     */
public boolean hasNext () {
try {
while (true) {
if (last != null) return true;
if (this.isDone.get() && this.workUnits.isEmpty()) return false;
this.last = this.workUnits.poll(1, TimeUnit.SECONDS);
}
} catch (InterruptedException e) {
log.error(e.toString());
return false;
}
}
/**
* Stops the iteration so that {@link CompactionWorkUnitIterator#hasNext()} returns false
*/
public void done () {
this.isDone.set(true);
}
    /**
     * Obtain the next available {@link WorkUnit}.
     * The method first checks whether any work unit is available by calling {@link CompactionWorkUnitIterator#hasNext()}.
     * Because {@link CompactionWorkUnitIterator#hasNext()} is a blocking call, this method may also block.
     */
public WorkUnit next () {
if (hasNext()) {
if (last != null) {
WorkUnit tmp = last;
last = null;
return tmp;
} else {
throw new IllegalStateException("last variable cannot be empty");
}
}
throw new NoSuchElementException("work units queue has been exhausted");
}
public void remove() {
throw new UnsupportedOperationException("No remove supported on " + this.getClass().getName());
}
protected void addWorkUnit (WorkUnit wu) {
this.workUnits.add(wu);
}
}
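  // The producer/consumer handshake above, as a standalone sketch (illustrative only; in this class the
  // producer is SingleWorkUnitGeneratorService and the consumer is the WorkUnitStream returned by
  // getWorkunitStream):
  //
  //   CompactionWorkUnitIterator it = new CompactionWorkUnitIterator();
  //   new Thread(() -> {
  //     it.addWorkUnit(new WorkUnit());   // producer enqueues a work unit per verified dataset
  //     it.done();                        // lets hasNext() return false once the queue drains
  //   }).start();
  //   while (it.hasNext()) {              // may block while the producer is still running
  //     WorkUnit wu = it.next();
  //   }

  /**
   * Create a compaction {@link WorkUnit} for a verified {@link Dataset}: bind it to {@link MRCompactionTaskFactory}
   * and let {@link CompactionSuite#save(Dataset, State)} serialize the dataset into it.
   */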
protected WorkUnit createWorkUnit (Dataset dataset) throws IOException {
WorkUnit workUnit = new WorkUnit();
TaskUtils.setTaskFactoryClass(workUnit, MRCompactionTaskFactory.class);
suite.save (dataset, workUnit);
return workUnit;
}
protected WorkUnit createWorkUnitForFailure (Dataset dataset) throws IOException {
WorkUnit workUnit = new FailedTask.FailedWorkUnit();
TaskUtils.setTaskFactoryClass(workUnit, CompactionFailedTask.CompactionFailedTaskFactory.class);
suite.save (dataset, workUnit);
return workUnit;
}
protected WorkUnit createWorkUnitForFailure (Dataset dataset, String reason) throws IOException {
WorkUnit workUnit = new FailedTask.FailedWorkUnit();
workUnit.setProp(CompactionVerifier.COMPACTION_VERIFICATION_FAIL_REASON, reason);
TaskUtils.setTaskFactoryClass(workUnit, CompactionFailedTask.CompactionFailedTaskFactory.class);
suite.save (dataset, workUnit);
return workUnit;
}
@Override
  public Extractor<String, String> getExtractor (WorkUnitState state) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void shutdown (SourceState state) {
try {
      boolean f = fs.delete(this.tmpJobDir, true);
      log.info("Job dir {} is removed with status {}", this.tmpJobDir, f);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
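  /**
   * Build the source {@link FileSystem} from {@link ConfigurationKeys#SOURCE_FILEBASED_FS_URI} (defaulting to the
   * local file system) and optionally wrap it with Gobblin's throttled file system.
   */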
public static FileSystem getSourceFileSystem(State state)
throws IOException {
Configuration conf = HadoopUtils.getConfFromState(state);
String uri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
return HadoopUtils.getOptionallyThrottledFileSystem(FileSystem.get(URI.create(uri), conf), state);
}
/**
* Create a temporary job directory based on job id or (if not available) UUID
*/
private void initJobDir (SourceState state) throws IOException {
String tmpBase = state.getProp(MRCompactor.COMPACTION_TMP_DEST_DIR, MRCompactor.DEFAULT_COMPACTION_TMP_DEST_DIR);
String jobId;
if (state instanceof JobState) {
jobId = ((JobState) state).getJobId();
} else {
jobId = UUID.randomUUID().toString();
}
this.tmpJobDir = new Path (tmpBase, jobId);
this.fs.mkdirs(this.tmpJobDir);
state.setProp (MRCompactor.COMPACTION_JOB_DIR, tmpJobDir.toString());
log.info ("Job dir is created under {}", this.tmpJobDir);
}
/**
* Copy dependent jars to a temporary job directory on HDFS
*/
private void copyJarDependencies (State state) throws IOException {
if (this.tmpJobDir == null) {
throw new RuntimeException("Job directory is not created");
}
if (!state.contains(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
return;
}
// create sub-dir to save jar files
LocalFileSystem lfs = FileSystem.getLocal(HadoopUtils.getConfFromState(state));
Path tmpJarFileDir = new Path(this.tmpJobDir, MRCompactor.COMPACTION_JAR_SUBDIR);
this.fs.mkdirs(tmpJarFileDir);
state.setProp (MRCompactor.COMPACTION_JARS, tmpJarFileDir.toString());
// copy jar files to hdfs
for (String jarFile : state.getPropAsList(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
for (FileStatus status : lfs.globStatus(new Path(jarFile))) {
Path tmpJarFile = new Path(this.fs.makeQualified(tmpJarFileDir), status.getPath().getName());
this.fs.copyFromLocalFile(status.getPath(), tmpJarFile);
log.info(String.format("%s will be added to classpath", tmpJarFile));
}
}
}
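  // Note on copyJarDependencies: ConfigurationKeys.JOB_JAR_FILES_KEY is read with getPropAsList and each entry
  // is expanded with globStatus, so the property is expected to be a comma-separated list of local jar paths or
  // globs. Illustrative value only; the literal key string ("job.jars") is an assumption:
  //
  //   job.jars=/opt/gobblin/lib/my-compaction-udfs.jar,/opt/gobblin/lib/extra-*.jar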
}