// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.datacleaner.util.SourceColumnFinder (Maven / Gradle / Ivy)

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Free Software Foundation, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.ExpressionBasedInputColumn;
import org.datacleaner.api.InputColumn;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.ComponentRequirement;
import org.datacleaner.job.FilterOutcome;
import org.datacleaner.job.HasComponentRequirement;
import org.datacleaner.job.HasFilterOutcomes;
import org.datacleaner.job.InputColumnSinkJob;
import org.datacleaner.job.InputColumnSourceJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.SourceColumns;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helper class for traversing dependencies between virtual and physical columns.
 *
 * For performance reasons this class stores found sources in an internal cache. As there is no mechanism to
 * invalidate or refresh this cache, instances of this class should not be assigned to fields of other
 * classes.
 */
public class SourceColumnFinder {

    private static final String LOG_MESSAGE_RECURSIVE_TRAVERSAL =
            "Ending traversal of object graph because the same originating objects are appearing recursively";

    private static final Logger logger = LoggerFactory.getLogger(SourceColumnFinder.class);
    private final Map, Set> originatingColumnsOfInputColumnCache = new HashMap<>();
    private final Map> originatingColumnsOfSourceCache = new HashMap<>();
    private Set _inputColumnSinks = new HashSet<>();
    private Set _inputColumnSources = new LinkedHashSet<>();
    private Set _outcomeSources = new HashSet<>();
    private Set _outcomeSinks = new HashSet<>();

    private void addSources(final Object... sources) {
        for (final Object source : sources) {
            if (source instanceof InputColumnSinkJob) {
                _inputColumnSinks.add((InputColumnSinkJob) source);
            }
            if (source instanceof InputColumnSourceJob) {
                _inputColumnSources.add((InputColumnSourceJob) source);
            }
            if (source instanceof HasFilterOutcomes) {
                _outcomeSources.add((HasFilterOutcomes) source);
            }
            if (source instanceof HasComponentRequirement) {
                _outcomeSinks.add((HasComponentRequirement) source);
            }
        }
    }

    private void addSources(final Collection sources) {
        addSources(sources.toArray());
    }

    public void addSources(final AnalysisJobBuilder job) {
        addSources(new SourceColumns(job.getSourceColumns()));
        addSources(job.getFilterComponentBuilders());
        addSources(job.getTransformerComponentBuilders());
        addSources(job.getAnalyzerComponentBuilders());
    }

    public void addSources(final AnalysisJob job) {
        addSources(new SourceColumns(job.getSourceColumns()));
        addSources(job.getFilterJobs());
        addSources(job.getTransformerJobs());
        addSources(job.getAnalyzerJobs());
    }

    public List> findInputColumns(final Class dataType) {
        final List> result = new ArrayList<>();
        for (final InputColumnSourceJob source : _inputColumnSources) {
            final InputColumn[] outputColumns = source.getOutput();
            for (final InputColumn col : outputColumns) {
                final Class columnDataType = col.getDataType();
                if (dataType == null || columnDataType == null) {
                    result.add(col);
                } else {
                    if (ReflectionUtils.is(columnDataType, dataType)) {
                        result.add(col);
                    }
                }
            }
        }

        return result;
    }

    /**
     * Finds all source jobs/components for a particular job/component. This
     * method uses {@link Object} as types because input and output can be quite
     * polymorphic. Typically {@link InputColumnSinkJob},
     * {@link InputColumnSourceJob}, {@link HasComponentRequirement} and
     * {@link OutcomeSourceJob} implementations are used.
     *
     * @param job
     *            typically some {@link InputColumnSinkJob}
     * @return a list of jobs/components that are a source of this job.
     */
    public Set findAllSourceJobs(final Object job) {
        final Set result = new HashSet<>();
        findAllSourceJobs(job, result);
        return result;
    }

    private void findAllSourceJobs(final Object job, final Set result) {
        if (job == null) {
            return;
        }

        if (job instanceof InputColumnSinkJob) {
            final InputColumn[] inputColumns = ((InputColumnSinkJob) job).getInput();
            for (final InputColumn inputColumn : inputColumns) {
                final InputColumnSourceJob source = findInputColumnSource(inputColumn);
                if (source != null) {
                    final boolean added = result.add(source);
                    if (added) {
                        findAllSourceJobs(source, result);
                    }
                }
            }
        }

        if (job instanceof HasComponentRequirement) {
            final HasComponentRequirement hasComponentRequirement = (HasComponentRequirement) job;
            final ComponentRequirement requirement = hasComponentRequirement.getComponentRequirement();
            findAllSourceJobs(requirement, result);
        }

        if (job instanceof ComponentRequirement) {
            final Collection requirements = getProcessingDependencies((ComponentRequirement) job);
            for (final FilterOutcome outcome : requirements) {
                final HasFilterOutcomes source = findOutcomeSource(outcome);
                if (source != null) {
                    final boolean added = result.add(source);
                    if (added) {
                        findAllSourceJobs(source, result);
                    }
                }
            }
        }
    }

    public InputColumnSourceJob findInputColumnSource(final InputColumn inputColumn) {
        if (inputColumn instanceof ExpressionBasedInputColumn) {
            return null;
        }
        for (final InputColumnSourceJob source : _inputColumnSources) {
            final InputColumn[] output = source.getOutput();
            for (final InputColumn column : output) {
                if (inputColumn.equals(column)) {
                    return source;
                }
            }
        }
        return null;
    }

    public HasFilterOutcomes findOutcomeSource(final FilterOutcome requirement) {
        for (final HasFilterOutcomes source : _outcomeSources) {
            final Collection outcomes = source.getFilterOutcomes();
            for (final FilterOutcome outcome : outcomes) {
                if (requirement.equals(outcome)) {
                    return source;
                }
            }
        }
        return null;
    }

    public Set findOriginatingColumns(final FilterOutcome requirement) {
        final HasFilterOutcomes source = findOutcomeSource(requirement);

        return findOriginatingColumnsOfSource(source);
    }

    public Table findOriginatingTable(final FilterOutcome requirement) {
        return findOriginatingTable(requirement, new HashSet<>());
    }

    private Table findOriginatingTable(final FilterOutcome requirement, final Set resolvedSet) {
        final HasFilterOutcomes source = findOutcomeSource(requirement);
        if (!resolvedSet.add(source)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }
        return findOriginatingTableOfSource(source, resolvedSet);
    }

    public Table findOriginatingTable(final InputColumn inputColumn) {
        return findOriginatingTable(inputColumn, new HashSet<>());
    }

    private Table findOriginatingTable(final InputColumn inputColumn, final Set resolvedSet) {
        if (!resolvedSet.add(inputColumn)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }

        if (inputColumn == null) {
            logger.warn("InputColumn was null, no originating table found");
            return null;
        }
        if (inputColumn.isPhysicalColumn()) {
            return inputColumn.getPhysicalColumn().getTable();
        }

        final InputColumnSourceJob inputColumnSource = findInputColumnSource(inputColumn);
        if (!resolvedSet.add(inputColumnSource)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }

        return findOriginatingTableOfSource(inputColumnSource, resolvedSet);
    }

    private Table findOriginatingTableOfSource(final Object source, final Set resolvedSet) {
        final Set result = new TreeSet<>();
        if (source instanceof InputColumnSinkJob) {
            final InputColumn[] input = ((InputColumnSinkJob) source).getInput();
            if (input != null) {
                for (final InputColumn col : input) {
                    if (col == null) {
                        logger.warn("InputColumn sink had a null-column element!");
                    } else {
                        final Table table = findOriginatingTable(col, resolvedSet);
                        if (table != null) {
                            result.add(table);
                        }
                    }
                }
            }
        }
        if (source instanceof HasComponentRequirement) {
            final HasComponentRequirement hasComponentRequirement = (HasComponentRequirement) source;
            final ComponentRequirement componentRequirement = hasComponentRequirement.getComponentRequirement();
            final Collection requirements = getProcessingDependencies(componentRequirement);
            for (final FilterOutcome outcome : requirements) {
                final Table table = findOriginatingTable(outcome, resolvedSet);
                if (table != null) {
                    result.add(table);
                }
            }
        }

        if (result.isEmpty()) {
            return null;
        }
        if (result.size() == 1) {
            return result.iterator().next();
        }
        final StringBuilder sb = new StringBuilder();
        for (final Table table : result) {
            if (sb.length() != 0) {
                sb.append(", ");
            }
            sb.append(table.getName());
        }
        throw new IllegalStateException("Multiple originating tables (" + sb + ") found for source: " + source);
    }

    private Set findOriginatingColumnsOfInputColumn(final InputColumn inputColumn) {
        final Set cachedOriginatingColumns = originatingColumnsOfInputColumnCache.get(inputColumn);
        if (cachedOriginatingColumns != null) {
            return cachedOriginatingColumns;
        }

        final Set originatingColumns = new HashSet<>();

        if (inputColumn != null) {
            if (inputColumn.isPhysicalColumn()) {
                originatingColumns.add(inputColumn.getPhysicalColumn());
            } else {
                final InputColumnSourceJob source = findInputColumnSource(inputColumn);

                originatingColumns.addAll(findOriginatingColumnsOfSource(source));
            }
        }

        originatingColumnsOfInputColumnCache.put(inputColumn, originatingColumns);

        return originatingColumns;
    }

    private Set findOriginatingColumnsOfOutcome(final FilterOutcome requirement) {
        final HasFilterOutcomes source = findOutcomeSource(requirement);
        return findOriginatingColumnsOfSource(source);
    }

    private Set findOriginatingColumnsOfSource(final Object source) {
        final Set cachedOriginatingColumns = originatingColumnsOfSourceCache.get(source);
        if (cachedOriginatingColumns != null) {
            return cachedOriginatingColumns;
        }

        final Set originatingColumns = new HashSet<>();

        if (source != null) {
            if (source instanceof InputColumnSinkJob) {
                final InputColumn[] input = ((InputColumnSinkJob) source).getInput();
                if (input != null) {

                    for (final InputColumn inputColumn : input) {
                        originatingColumns.addAll(findOriginatingColumnsOfInputColumn(inputColumn));
                    }
                }
            }
            if (source instanceof HasComponentRequirement) {
                final HasComponentRequirement hasComponentRequirement = (HasComponentRequirement) source;
                final ComponentRequirement componentRequirement = hasComponentRequirement.getComponentRequirement();
                final Collection requirements = getProcessingDependencies(componentRequirement);
                for (final FilterOutcome outcome : requirements) {
                    originatingColumns.addAll(findOriginatingColumnsOfOutcome(outcome));
                }
            }
        }

        originatingColumnsOfSourceCache.put(source, originatingColumns);

        return originatingColumns;
    }

    private Collection getProcessingDependencies(final ComponentRequirement componentRequirement) {
        if (componentRequirement == null) {
            return Collections.emptyList();
        }
        final Collection processingDependencies = componentRequirement.getProcessingDependencies();
        if (processingDependencies == null) {
            return Collections.emptyList();
        }
        return processingDependencies;
    }

    public Set findOriginatingColumns(final InputColumn inputColumn) {
        // TODO: Detect cyclic dependencies between transformers (A depends on
        // B, B depends on A)

        return findOriginatingColumnsOfInputColumn(inputColumn);
    }
}