All downloads are free. The search and download functionality uses the official Maven repository.

org.datacleaner.user.QuickAnalysisStrategy Maven / Gradle / Ivy

There is a newer version: 6.0.0
Show newest version
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Free Software Foundation, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.user;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.InputColumn;
import org.datacleaner.beans.BooleanAnalyzer;
import org.datacleaner.beans.DateAndTimeAnalyzer;
import org.datacleaner.beans.NumberAnalyzer;
import org.datacleaner.beans.StringAnalyzer;
import org.datacleaner.beans.stringpattern.PatternFinderAnalyzer;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzer;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.util.ReflectionUtils;

/**
 * Defines the strategy and rules for doing quick analysis.
 *
 * @see QuickAnalysisActionListener
 */
public class QuickAnalysisStrategy implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final String USER_PREFERENCES_NAMESPACE = "datacleaner.quickanalysis.strategy";

    private final int columnsPerAnalyzer;
    private final boolean includeValueDistribution;
    private final boolean includePatternFinder;

    public QuickAnalysisStrategy() {
        this(5, false, false);
    }

    public QuickAnalysisStrategy(final int columnsPerAnalyzer, final boolean includeValueDistribution,
            final boolean includePatternFinder) {
        this.columnsPerAnalyzer = columnsPerAnalyzer;
        this.includeValueDistribution = includeValueDistribution;
        this.includePatternFinder = includePatternFinder;
    }

    /**
     * Saves a {@link QuickAnalysisStrategy} to a {@link UserPreferences}
     * object.
     *
     * @param strategy
     * @param userPreferences
     */
    public static void saveToUserPreferences(final QuickAnalysisStrategy strategy,
            final UserPreferences userPreferences) {
        final Map properties = userPreferences.getAdditionalProperties();

        properties.put(USER_PREFERENCES_NAMESPACE + ".columnsPerAnalyzer", "" + strategy.columnsPerAnalyzer);
        properties
                .put(USER_PREFERENCES_NAMESPACE + ".includeValueDistribution", "" + strategy.includeValueDistribution);
        properties.put(USER_PREFERENCES_NAMESPACE + ".includePatternFinder", "" + strategy.includePatternFinder);
    }

    /**
     * Loads {@link QuickAnalysisStrategy} from a {@link UserPreferences}
     * object.
     *
     * @param userPreferences
     * @return
     */
    public static QuickAnalysisStrategy loadFromUserPreferences(final UserPreferences userPreferences) {
        final Map properties = userPreferences.getAdditionalProperties();

        final int columnsPerAnalyzer =
                MapUtils.getIntValue(properties, USER_PREFERENCES_NAMESPACE + ".columnsPerAnalyzer", 5);
        final boolean includeValueDistribution =
                MapUtils.getBooleanValue(properties, USER_PREFERENCES_NAMESPACE + ".includeValueDistribution", false);
        final boolean includePatternFinder =
                MapUtils.getBooleanValue(properties, USER_PREFERENCES_NAMESPACE + ".includePatternFinder", false);

        return new QuickAnalysisStrategy(columnsPerAnalyzer, includeValueDistribution, includePatternFinder);
    }

    public boolean isIncludePatternFinder() {
        return includePatternFinder;
    }

    public boolean isIncludeValueDistribution() {
        return includeValueDistribution;
    }

    public int getColumnsPerAnalyzer() {
        return columnsPerAnalyzer;
    }

    public void configureAnalysisJobBuilder(final AnalysisJobBuilder ajb) {
        final List> booleanColumns = new ArrayList<>();
        final List> stringColumns = new ArrayList<>();
        final List> numberColumns = new ArrayList<>();
        final List> dateTimeColumns = new ArrayList<>();

        for (final InputColumn inputColumn : ajb.getSourceColumns()) {
            final Class dataType = inputColumn.getDataType();
            if (ReflectionUtils.isBoolean(dataType)) {
                booleanColumns.add(inputColumn);
            } else if (ReflectionUtils.isNumber(dataType)) {
                numberColumns.add(inputColumn);
            } else if (ReflectionUtils.isDate(dataType)) {
                dateTimeColumns.add(inputColumn);
            } else if (ReflectionUtils.isString(dataType)) {
                stringColumns.add(inputColumn);
            }
        }

        if (!booleanColumns.isEmpty()) {
            // boolean analyzer contains combination matrices, so all columns
            // are added to a single analyzer job.
            ajb.addAnalyzer(BooleanAnalyzer.class).addInputColumns(booleanColumns);
        }
        if (!numberColumns.isEmpty()) {
            createAnalyzers(ajb, NumberAnalyzer.class, numberColumns);
        }
        if (!dateTimeColumns.isEmpty()) {
            createAnalyzers(ajb, DateAndTimeAnalyzer.class, dateTimeColumns);
        }
        if (!stringColumns.isEmpty()) {
            createAnalyzers(ajb, StringAnalyzer.class, stringColumns);
        }
    }

    /**
     * Registers analyzers and up to 4 columns per analyzer. This restriction is
     * to ensure that results will be nicely readable. A table might contain
     * hundreds of columns.
     *
     * @param ajb
     * @param analyzerClass
     * @param columns
     */
    private void createAnalyzers(final AnalysisJobBuilder ajb, final Class> analyzerClass,
            final List> columns) {
        final int columnsPerAnalyzer = getColumnsPerAnalyzer();

        AnalyzerComponentBuilder analyzerJobBuilder = ajb.addAnalyzer(analyzerClass);
        int columnCount = 0;
        for (final InputColumn inputColumn : columns) {
            if (columnCount == columnsPerAnalyzer) {
                analyzerJobBuilder = ajb.addAnalyzer(analyzerClass);
                columnCount = 0;
            }
            analyzerJobBuilder.addInputColumn(inputColumn);

            if (isIncludeValueDistribution()) {
                ajb.addAnalyzer(ValueDistributionAnalyzer.class).addInputColumn(inputColumn);
            }
            if (inputColumn.getDataType() == String.class && isIncludePatternFinder()) {
                ajb.addAnalyzer(PatternFinderAnalyzer.class).addInputColumn(inputColumn);
            }
            columnCount++;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy