// org.datacleaner.components.fillpattern.FillPatternAnalyzer (Maven / Gradle / Ivy)
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.components.fillpattern;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.inject.Inject;
import javax.inject.Named;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.MappedProperty;
import org.datacleaner.api.Provided;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.util.LabelUtils;
@Named("Fill pattern")
@Description("Determines the patterns of filling specific fields")
public class FillPatternAnalyzer implements Analyzer {
/** Group key used for every row when no group column is configured. */
private static final String GROUP_NAME_SINGLE = "__single_group__";
// NOTE(review): not referenced in the visible part of this file; judging by the
// name it is a size cut-off for computing results in parallel - confirm at the usage site.
private static final int PARALLEL_RESULT_CALCULATION_THRESHOLD = 10;
/** Display name of the optional "group column" configuration property. */
public static final String PROPERTY_GROUP_COLUMN = "Group column";
/** Display name of the "inspected columns" configuration property. */
public static final String PROPERTY_INSPECTED_COLUMNS = "Inspected columns";
/** Display name of the "inspection types" configuration property. */
public static final String PROPERTY_INSPECTION_TYPES = "Inspection types";
// NOTE(review): this label is an empty string and is not used in the visible code.
// An angle-bracketed value may have been lost when the file was extracted -
// confirm the intended text against the upstream source before relying on it.
public static final String FILLED_LABEL = "";
@Inject
@Configured(order = 1, value = PROPERTY_GROUP_COLUMN, required = false)
InputColumn groupColumn;
@Inject
@Configured(order = 2, value = PROPERTY_INSPECTED_COLUMNS)
InputColumn>[] inspectedColumns;
@Inject
@Configured(order = 3, value = PROPERTY_INSPECTION_TYPES)
@MappedProperty(PROPERTY_INSPECTED_COLUMNS)
InspectionType[] inspectionTypes;
@Inject
@Provided
RowAnnotationFactory rowAnnotationFactory;
private final ConcurrentMap _buildersByGroup =
new ConcurrentHashMap();
public void run(InputRow row, int count) {
final String group;
if (groupColumn == null) {
group = GROUP_NAME_SINGLE;
} else {
group = row.getValue(groupColumn);
}
final FillPatternsBuilder fillPatternsBuilder = getOrCreateFillPatternsBuilder(group);
final List