org.datacleaner.spark.functions.AnalyzerResultReduceFunction Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.spark.functions;
import java.util.Arrays;
import org.apache.spark.api.java.function.Function2;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.AnalyzerResultReducer;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.InjectionManager;
import org.datacleaner.descriptors.ComponentDescriptor;
import org.datacleaner.descriptors.Descriptors;
import org.datacleaner.descriptors.ResultDescriptor;
import org.datacleaner.job.ComponentJob;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.spark.NamedAnalyzerResult;
import org.datacleaner.spark.SparkJobContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class AnalyzerResultReduceFunction
implements Function2 {
private static final long serialVersionUID = 1L;
private static final Logger logger = LoggerFactory.getLogger(AnalyzerResultReduceFunction.class);
private final SparkJobContext _sparkJobContext;
public AnalyzerResultReduceFunction(final SparkJobContext sparkJobContext) {
_sparkJobContext = sparkJobContext;
}
@Override
public NamedAnalyzerResult call(final NamedAnalyzerResult namedAnalyzerResult1,
final NamedAnalyzerResult namedAnalyzerResult2) throws Exception {
assert namedAnalyzerResult1.getName().equals(namedAnalyzerResult2.getName());
final String key = namedAnalyzerResult1.getName();
final ComponentJob componentJob = _sparkJobContext.getComponentByKey(key);
final AnalyzerResult analyzerResult1 = namedAnalyzerResult1.getAnalyzerResult();
final AnalyzerResult analyzerResult2 = namedAnalyzerResult2.getAnalyzerResult();
logger.info("Reducing results with key '{}' of types: {} and {}", key, analyzerResult1.getClass(),
analyzerResult2.getClass());
final ResultDescriptor rd = getResultDescriptor(componentJob, analyzerResult1);
final Class extends AnalyzerResultReducer>> resultReducerClass = rd.getResultReducerClass();
if (resultReducerClass == null) {
throw new IllegalStateException("The result type (" + analyzerResult1 + ") is not distributable!");
}
final AnalyzerResultReducer reducer = initializeReducer(resultReducerClass);
final AnalyzerResult reducedAnalyzerResult = reducer.reduce(Arrays.asList(analyzerResult1, analyzerResult2));
return new NamedAnalyzerResult(key, reducedAnalyzerResult);
}
private AnalyzerResultReducer initializeReducer(
final Class extends AnalyzerResultReducer>> resultReducerClass) {
final DataCleanerConfiguration configuration = _sparkJobContext.getConfiguration();
final InjectionManager injectionManager = configuration.getEnvironment().getInjectionManagerFactory()
.getInjectionManager(configuration, _sparkJobContext.getAnalysisJob());
final LifeCycleHelper lifeCycleHelper = new LifeCycleHelper(injectionManager, false);
final ComponentDescriptor extends AnalyzerResultReducer>> reducerDescriptor =
Descriptors.ofComponent(resultReducerClass);
@SuppressWarnings("unchecked") final AnalyzerResultReducer reducer =
(AnalyzerResultReducer) reducerDescriptor.newInstance();
lifeCycleHelper.assignProvidedProperties(reducerDescriptor, reducer);
lifeCycleHelper.initialize(reducerDescriptor, reducer);
return reducer;
}
protected ResultDescriptor getResultDescriptor(final ComponentJob componentJob,
final AnalyzerResult analyzerResult) {
final ComponentDescriptor> descriptor = componentJob.getDescriptor();
if (descriptor instanceof ResultDescriptor) {
return (ResultDescriptor) descriptor;
}
// slightly more expensive, but potentially also better / more specific!
return Descriptors.ofResult(analyzerResult);
}
}