org.datacleaner.beans.standardize.CountryStandardizationTransformer Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.standardize;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.HasAnalyzerResult;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Transformer;
import org.datacleaner.components.categories.ImproveSuperCategory;
import org.datacleaner.components.categories.LocationCategory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.util.LabelUtils;
/**
 * Transformer that reads a country value from a configured input column and
 * emits it in a single, configurable format (2-letter ISO code, 3-letter ISO
 * code or country name).
 *
 * <p>
 * Input values are resolved with {@link Country#find(String)}. When no country
 * is found the unrecognized counter is incremented and the optional
 * {@link #defaultCountry} is used instead. Every processed row is annotated
 * with a {@link RowAnnotation} keyed by the produced output value (or by
 * {@link LabelUtils#UNEXPECTED_LABEL} when no output value could be produced),
 * and those annotations are handed to the {@link CountryStandardizationResult}
 * returned by {@link #getResult()}.
 * </p>
 */
@Named("Country standardizer")
@Description("Allows you to standardize the country names and codes used throughout your database")
@Categorized(superCategory = ImproveSuperCategory.class, value = LocationCategory.class)
public class CountryStandardizationTransformer implements Transformer,
        HasAnalyzerResult<CountryStandardizationResult> {

    /**
     * The representations in which a standardized country can be written to
     * the output column.
     */
    public enum OutputFormat implements HasName {
        ISO2("2-letter ISO code"), ISO3("3-letter ISO code"), NAME("Country name");

        private final String _name;

        OutputFormat(final String name) {
            _name = name;
        }

        /**
         * @return the human readable label given to this format
         */
        @Override
        public String getName() {
            return _name;
        }
    }

    public static final String PROPERTY_COUNTRY_COLUMN = "Country column";
    public static final String PROPERTY_OUTPUT_FORMAT = "Output format";
    public static final String PROPERTY_DEFAULT_COUNTRY = "Default country";

    /**
     * Row annotations keyed by the output value they were recorded for. Rows
     * without an output value are recorded under
     * {@link LabelUtils#UNEXPECTED_LABEL}. Writes performed by
     * {@link #transform(InputRow)} are guarded by this instance's monitor.
     */
    public final Map<String, RowAnnotation> countryCountMap = new HashMap<>();

    @Configured(PROPERTY_COUNTRY_COLUMN)
    @Description("A column containing potentially unstandardized country names, codes, abbreviations.")
    InputColumn<String> countryColumn;

    @Configured(PROPERTY_OUTPUT_FORMAT)
    @Description("The output format of the transformation.")
    OutputFormat outputFormat = OutputFormat.ISO2;

    @Configured(value = PROPERTY_DEFAULT_COUNTRY, required = false)
    @Description("Country to return if input value is missing or not recognized.")
    Country defaultCountry = null;

    @Provided
    @Inject
    RowAnnotationFactory _rowAnnotationFactory;

    /**
     * Number of rows for which {@link Country#find(String)} returned
     * {@code null}; rows that fell back to {@link #defaultCountry} are
     * included in this count.
     */
    AtomicInteger _unrecognizedCountries = new AtomicInteger(0);

    /**
     * Declares a single {@link String} output column named after the input
     * column with the suffix {@code " (standardized)"}.
     */
    @Override
    public OutputColumns getOutputColumns() {
        return new OutputColumns(String.class, countryColumn.getName() + " (standardized)");
    }

    /**
     * Standardizes the country value of one row and annotates the row with the
     * annotation that belongs to the produced output value.
     *
     * @param inputRow
     *            the row to read the country column from
     * @return a one-element array holding the standardized value, or
     *         {@code null} in that element when no country was found and no
     *         default country is configured
     * @throws IllegalStateException
     *             if {@link #outputFormat} is not one of the handled formats
     */
    @Override
    public String[] transform(final InputRow inputRow) {
        final String value = inputRow.getValue(countryColumn);

        Country country = Country.find(value);
        if (country == null) {
            // counted as unrecognized even when a default country takes over
            _unrecognizedCountries.incrementAndGet();
            country = defaultCountry;
        }

        final String countryName = (country == null) ? null : format(country);

        // rows without an output value are grouped under a fixed label so
        // that the annotation map is never keyed by null
        final String annotationKey = (countryName == null) ? LabelUtils.UNEXPECTED_LABEL : countryName;

        _rowAnnotationFactory.annotate(inputRow, 1, getOrCreateAnnotation(annotationKey));

        return new String[] { countryName };
    }

    /**
     * Renders a country according to the configured {@link #outputFormat}.
     */
    private String format(final Country country) {
        switch (outputFormat) {
        case ISO2:
            return country.getTwoLetterISOCode();
        case ISO3:
            return country.getThreeLetterISOCode();
        case NAME:
            return country.getCountryName();
        default:
            throw new IllegalStateException("Unexpected output format: " + outputFormat);
        }
    }

    /**
     * Returns the annotation registered for the given key, creating and
     * registering a new one on first use.
     */
    private RowAnnotation getOrCreateAnnotation(final String key) {
        // A plain HashMap guarded by this monitor is used here; the original
        // author's note on this block was that ConcurrentHashMap does not
        // support null keys.
        synchronized (this) {
            RowAnnotation annotation = countryCountMap.get(key);
            if (annotation == null) {
                annotation = _rowAnnotationFactory.createAnnotation();
                countryCountMap.put(key, annotation);
            }
            return annotation;
        }
    }

    /**
     * Builds the result from the annotation factory, the per-value annotation
     * map and the current unrecognized-country count.
     */
    @Override
    public CountryStandardizationResult getResult() {
        return new CountryStandardizationResult(_rowAnnotationFactory, countryCountMap,
                _unrecognizedCountries.intValue());
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy