org.datacleaner.spark.functions.AnalyzerResultReduceFunction
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark.functions;

import java.util.Arrays;

import org.apache.spark.api.java.function.Function2;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.AnalyzerResultReducer;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.InjectionManager;
import org.datacleaner.descriptors.ComponentDescriptor;
import org.datacleaner.descriptors.Descriptors;
import org.datacleaner.descriptors.ResultDescriptor;
import org.datacleaner.job.ComponentJob;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.spark.NamedAnalyzerResult;
import org.datacleaner.spark.SparkJobContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

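/**
 * Spark {@link Function2} that reduces two partial {@link AnalyzerResult}s belonging to the same
 * component (identified by the {@link NamedAnalyzerResult} key) into a single result, using the
 * {@link AnalyzerResultReducer} declared by the result's descriptor.
 */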
public final class AnalyzerResultReduceFunction
        implements Function2<NamedAnalyzerResult, NamedAnalyzerResult, NamedAnalyzerResult> {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory.getLogger(AnalyzerResultReduceFunction.class);

    private final SparkJobContext _sparkJobContext;

    public AnalyzerResultReduceFunction(final SparkJobContext sparkJobContext) {
        _sparkJobContext = sparkJobContext;
    }

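    /**
     * Invoked by Spark's reduce step with two partial results that share the same key; returns the
     * merged result under that key.
     */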
    @Override
    public NamedAnalyzerResult call(final NamedAnalyzerResult namedAnalyzerResult1,
            final NamedAnalyzerResult namedAnalyzerResult2) throws Exception {

        assert namedAnalyzerResult1.getName().equals(namedAnalyzerResult2.getName());

        final String key = namedAnalyzerResult1.getName();

        final ComponentJob componentJob = _sparkJobContext.getComponentByKey(key);

        final AnalyzerResult analyzerResult1 = namedAnalyzerResult1.getAnalyzerResult();
        final AnalyzerResult analyzerResult2 = namedAnalyzerResult2.getAnalyzerResult();

        logger.info("Reducing results with key '{}' of types: {} and {}", key, analyzerResult1.getClass(),
                analyzerResult2.getClass());

        final ResultDescriptor rd = getResultDescriptor(componentJob, analyzerResult1);
        final Class<? extends AnalyzerResultReducer<?>> resultReducerClass = rd.getResultReducerClass();

        if (resultReducerClass == null) {
            throw new IllegalStateException("The result type (" + analyzerResult1 + ") is not distributable!");
        }

        final AnalyzerResultReducer<AnalyzerResult> reducer = initializeReducer(resultReducerClass);

        final AnalyzerResult reducedAnalyzerResult = reducer.reduce(Arrays.asList(analyzerResult1, analyzerResult2));

        return new NamedAnalyzerResult(key, reducedAnalyzerResult);
    }

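    /**
     * Instantiates the reducer via its component descriptor and runs the regular DataCleaner
     * lifecycle steps (provided-property assignment and initialization) before it is used.
     */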
    private AnalyzerResultReducer<AnalyzerResult> initializeReducer(
            final Class<? extends AnalyzerResultReducer<?>> resultReducerClass) {

        final DataCleanerConfiguration configuration = _sparkJobContext.getConfiguration();
        final InjectionManager injectionManager = configuration.getEnvironment().getInjectionManagerFactory()
                .getInjectionManager(configuration, _sparkJobContext.getAnalysisJob());
        final LifeCycleHelper lifeCycleHelper = new LifeCycleHelper(injectionManager, false);

        final ComponentDescriptor<? extends AnalyzerResultReducer<?>> reducerDescriptor =
                Descriptors.ofComponent(resultReducerClass);

        @SuppressWarnings("unchecked")
        final AnalyzerResultReducer<AnalyzerResult> reducer =
                (AnalyzerResultReducer<AnalyzerResult>) reducerDescriptor.newInstance();

        lifeCycleHelper.assignProvidedProperties(reducerDescriptor, reducer);
        lifeCycleHelper.initialize(reducerDescriptor, reducer);

        return reducer;
    }

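    /**
     * Resolves the {@link ResultDescriptor} for the given result, preferring the component's own
     * descriptor when it implements {@link ResultDescriptor}.
     */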
    protected ResultDescriptor getResultDescriptor(final ComponentJob componentJob,
            final AnalyzerResult analyzerResult) {
        final ComponentDescriptor<?> descriptor = componentJob.getDescriptor();
        if (descriptor instanceof ResultDescriptor) {
            return (ResultDescriptor) descriptor;
        }
        // slightly more expensive, but potentially also better / more specific!
        return Descriptors.ofResult(analyzerResult);
    }
}
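
For orientation, below is a minimal sketch of how a function like this can be plugged into a Spark reduce step. The pair-RDD setup, the helper class and the method name are illustrative assumptions, not part of this file; only AnalyzerResultReduceFunction, NamedAnalyzerResult and SparkJobContext come from the DataCleaner Spark module.

import org.apache.spark.api.java.JavaPairRDD;

import org.datacleaner.spark.NamedAnalyzerResult;
import org.datacleaner.spark.SparkJobContext;
import org.datacleaner.spark.functions.AnalyzerResultReduceFunction;

public class AnalyzerResultReduceSketch {

    /**
     * Hypothetical helper: given partial results keyed by component name, collapse them to a
     * single NamedAnalyzerResult per key. reduceByKey repeatedly applies
     * AnalyzerResultReduceFunction.call(...) to pairs of values sharing the same key.
     */
    public static JavaPairRDD<String, NamedAnalyzerResult> reducePartialResults(
            final JavaPairRDD<String, NamedAnalyzerResult> partialResults,
            final SparkJobContext sparkJobContext) {
        return partialResults.reduceByKey(new AnalyzerResultReduceFunction(sparkJobContext));
    }
}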



