All downloads are free. The search and download functionality uses the official Maven repository.

org.datacleaner.beans.DateAndTimeAnalyzer Maven / Gradle / Ivy

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans;

import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import javax.inject.Inject;
import javax.inject.Named;

import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;
import org.datacleaner.components.categories.DateAndTimeCategory;
import org.datacleaner.result.AnnotatedRowsResult;
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.joda.time.LocalDate;
import org.joda.time.LocalTime;

@Named("Date/time analyzer")
@Description("Records a variety of interesting measures for date or time based data. Which are the highest/lowest values? How is the year distribution of dates? Are there null values?")
@Concurrent(true)
@Categorized(DateAndTimeCategory.class)
public class DateAndTimeAnalyzer implements Analyzer {

    public static final String MEASURE_LOWEST_TIME = "Lowest time";
    public static final String MEASURE_HIGHEST_TIME = "Highest time";
    public static final String MEASURE_LOWEST_DATE = "Lowest date";
    public static final String MEASURE_HIGHEST_DATE = "Highest date";
    public static final String MEASURE_NULL_COUNT = "Null count";
    public static final String MEASURE_ROW_COUNT = "Row count";
    public static final String DIMENSION_MEASURE = "Measure";
    public static final String DIMENSION_COLUMN = "Column";

    public static final String MEASURE_MEAN = "Mean";
    public static final String MEASURE_MEDIAN = "Median";
    public static final String MEASURE_PERCENTILE25 = "25th percentile";
    public static final String MEASURE_PERCENTILE75 = "75th percentile";
    public static final String MEASURE_KURTOSIS = "Kurtosis";
    public static final String MEASURE_SKEWNESS = "Skewness";

    private Map, DateAndTimeAnalyzerColumnDelegate> _delegates = new HashMap, DateAndTimeAnalyzerColumnDelegate>();

    @Inject
    @Configured(order = 1)
    InputColumn[] _columns;

    @Inject
    @Configured(order = 10)
    @Description("Gather so-called descriptive statistics, including median, skewness, kurtosis and percentiles, which have a larger memory-footprint.")
    boolean descriptiveStatistics = false;

    @Inject
    @Provided
    RowAnnotationFactory _annotationFactory;

    @Initialize
    public void init() {
        for (InputColumn col : _columns) {
            final DateAndTimeAnalyzerColumnDelegate delegate = new DateAndTimeAnalyzerColumnDelegate(
                    descriptiveStatistics, _annotationFactory);
            _delegates.put(col, delegate);
        }
    }

    @Override
    public void run(InputRow row, int distinctCount) {
        for (InputColumn col : _columns) {
            Date value = row.getValue(col);
            DateAndTimeAnalyzerColumnDelegate delegate = _delegates.get(col);
            delegate.run(value, row, distinctCount);
        }
    }

    @Override
    public DateAndTimeAnalyzerResult getResult() {
        CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURE);
        measureDimension.addCategory(MEASURE_ROW_COUNT);
        measureDimension.addCategory(MEASURE_NULL_COUNT);
        measureDimension.addCategory(MEASURE_HIGHEST_DATE);
        measureDimension.addCategory(MEASURE_LOWEST_DATE);
        measureDimension.addCategory(MEASURE_HIGHEST_TIME);
        measureDimension.addCategory(MEASURE_LOWEST_TIME);
        measureDimension.addCategory(MEASURE_MEAN);

        if (descriptiveStatistics) {
            measureDimension.addCategory(MEASURE_MEDIAN);
            measureDimension.addCategory(MEASURE_PERCENTILE25);
            measureDimension.addCategory(MEASURE_PERCENTILE75);
            measureDimension.addCategory(MEASURE_SKEWNESS);
            measureDimension.addCategory(MEASURE_KURTOSIS);
        }

        CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN);
        for (InputColumn column : _columns) {
            columnDimension.addCategory(column.getName());
        }

        final Crosstab crosstab = new Crosstab(Serializable.class, columnDimension,
                measureDimension);
        final CrosstabNavigator nav = crosstab.navigate();
        for (InputColumn column : _columns) {
            final DateAndTimeAnalyzerColumnDelegate delegate = _delegates.get(column);

            nav.where(columnDimension, column.getName());

            nav.where(measureDimension, MEASURE_ROW_COUNT).put(delegate.getNumRows());

            final int numNull = delegate.getNumNull();
            nav.where(measureDimension, MEASURE_NULL_COUNT).put(numNull);
            if (numNull > 0) {
                nav.attach(new AnnotatedRowsResult(delegate.getNullAnnotation(), _annotationFactory, column));
            }

            final LocalDate maxDate = delegate.getMaxDate();
            nav.where(measureDimension, MEASURE_HIGHEST_DATE).put(toString(maxDate));
            RowAnnotation annotation = delegate.getMaxDateAnnotation();
            if (annotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
            }

            final LocalDate minDate = delegate.getMinDate();
            nav.where(measureDimension, MEASURE_LOWEST_DATE).put(toString(minDate));
            annotation = delegate.getMinDateAnnotation();
            if (annotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
            }

            final LocalTime maxTime = delegate.getMaxTime();
            nav.where(measureDimension, MEASURE_HIGHEST_TIME).put(toString(maxTime));
            annotation = delegate.getMaxTimeAnnotation();
            if (annotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
            }

            final LocalTime minTime = delegate.getMinTime();
            nav.where(measureDimension, MEASURE_LOWEST_TIME).put(toString(minTime));
            annotation = delegate.getMinTimeAnnotation();
            if (annotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
            }

            final Date mean = delegate.getMean();
            nav.where(measureDimension, MEASURE_MEAN).put(toString(mean));

            if (descriptiveStatistics) {
                final Date median = delegate.getMedian();
                nav.where(measureDimension, MEASURE_MEDIAN).put(toString(median));

                final Date percentile25 = delegate.getPercentile25();
                nav.where(measureDimension, MEASURE_PERCENTILE25).put(toString(percentile25));

                final Date percentile75 = delegate.getPercentile75();
                nav.where(measureDimension, MEASURE_PERCENTILE75).put(toString(percentile75));

                final Number kurtosis = delegate.getKurtosis();
                nav.where(measureDimension, MEASURE_KURTOSIS).put(kurtosis);

                final Number skewness = delegate.getSkewness();
                nav.where(measureDimension, MEASURE_SKEWNESS).put(skewness);
            }
        }

        return new DateAndTimeAnalyzerResult(crosstab);
    }

    private String toString(Object obj) {
        if (obj == null) {
            return null;
        }
        if (obj instanceof Date) {
            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm");
            return format.format((Date)obj);
        }
        return obj.toString();
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy