All downloads are free. Search and download functionality uses the official Maven repository.

org.datacleaner.beans.standardize.CountryStandardizationTransformer Maven / Gradle / Ivy

There is a newer version: 6.0.0
Show newest version
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans.standardize;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import javax.inject.Inject;
import javax.inject.Named;

import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.HasAnalyzerResult;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Transformer;
import org.datacleaner.components.categories.ImproveSuperCategory;
import org.datacleaner.components.categories.LocationCategory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.util.LabelUtils;

/**
 * Transformer that standardizes the value of a configured country column into
 * one of the representations described by {@link OutputFormat}.
 *
 * <p>
 * Besides producing the standardized value, every transformed row is annotated
 * under its output value (or under {@link LabelUtils#UNEXPECTED_LABEL} when no
 * country could be determined), and the number of inputs that
 * {@code Country.find} could not resolve is counted. Both are exposed through
 * {@link #getResult()}.
 * </p>
 */
@Named("Country standardizer")
@Description("Allows you to standardize the country names and codes used throughout your database")
@Categorized(superCategory = ImproveSuperCategory.class, value = LocationCategory.class)
public class CountryStandardizationTransformer
        implements Transformer, HasAnalyzerResult<CountryStandardizationResult> {

    /**
     * The representation in which a recognized country is written to the
     * output column.
     */
    public enum OutputFormat implements HasName {

        ISO2("2-letter ISO code"), ISO3("3-letter ISO code"), NAME("Country name");

        private final String _name;

        OutputFormat(final String name) {
            _name = name;
        }

        /**
         * @return the human readable label of this output format
         */
        @Override
        public String getName() {
            return _name;
        }

    }

    public static final String PROPERTY_COUNTRY_COLUMN = "Country column";
    public static final String PROPERTY_OUTPUT_FORMAT = "Output format";
    public static final String PROPERTY_DEFAULT_COUNTRY = "Default country";

    /**
     * Row annotations keyed by the standardized output value; rows without a
     * resolvable country are filed under {@link LabelUtils#UNEXPECTED_LABEL}.
     * Writes happen while holding the monitor of this transformer instance.
     */
    public final Map<String, RowAnnotation> countryCountMap = new HashMap<>();

    @Configured(PROPERTY_COUNTRY_COLUMN)
    @Description("A column containing potentially unstandardized country names, codes, abbreviations.")
    InputColumn<String> countryColumn;

    @Configured(PROPERTY_OUTPUT_FORMAT)
    @Description("The output format of the transformation.")
    OutputFormat outputFormat = OutputFormat.ISO2;

    @Configured(value = PROPERTY_DEFAULT_COUNTRY, required = false)
    @Description("Country to return if input value is missing or not recognized.")
    Country defaultCountry = null;

    @Provided
    @Inject
    RowAnnotationFactory _rowAnnotationFactory;

    /** Number of input values for which {@code Country.find} returned null. */
    AtomicInteger _unrecognizedCountries = new AtomicInteger(0);

    /**
     * Declares a single {@code String} output column named after the input
     * column with the suffix {@code " (standardized)"}.
     */
    @Override
    public OutputColumns getOutputColumns() {
        return new OutputColumns(String.class, countryColumn.getName() + " (standardized)");
    }

    /**
     * Standardizes the country value of a single row.
     *
     * <p>
     * When the value is not recognized the unrecognized counter is incremented
     * and the configured default country (possibly {@code null}) is used
     * instead. The row is then annotated under the resulting output value.
     * </p>
     *
     * @param inputRow
     *            the row holding the value of the configured country column
     * @return a single-element array containing the standardized value, or
     *         {@code null} in that element when no country could be determined
     * @throws IllegalStateException
     *             if the configured output format is not one of the known
     *             constants
     */
    @Override
    public String[] transform(final InputRow inputRow) {
        final String value = inputRow.getValue(countryColumn);
        Country country = Country.find(value);

        if (country == null) {
            _unrecognizedCountries.incrementAndGet();
            country = defaultCountry;
        }

        final String countryName = country == null ? null : format(country);

        // The map key must identify the "no country" bucket as well, so a
        // missing output value is replaced by the shared unexpected label.
        final String annotationKey = countryName == null ? LabelUtils.UNEXPECTED_LABEL : countryName;

        _rowAnnotationFactory.annotate(inputRow, 1, annotationFor(annotationKey));

        return new String[] { countryName };
    }

    /**
     * Renders the given country according to the configured output format.
     */
    private String format(final Country country) {
        switch (outputFormat) {
        case ISO2:
            return country.getTwoLetterISOCode();
        case ISO3:
            return country.getThreeLetterISOCode();
        case NAME:
            return country.getCountryName();
        default:
            throw new IllegalStateException("Unexpected output format: " + outputFormat);
        }
    }

    /**
     * Returns the annotation registered for the given key, creating and
     * registering a new one on first use.
     */
    private RowAnnotation annotationFor(final String key) {
        // The lookup and the conditional insert form one check-then-act
        // sequence on a plain HashMap, so both are done under the same lock.
        // NOTE(review): presumably transform() can be invoked from several
        // threads at once - confirm against the framework's threading model.
        synchronized (this) {
            RowAnnotation annotation = countryCountMap.get(key);
            if (annotation == null) {
                annotation = _rowAnnotationFactory.createAnnotation();
                countryCountMap.put(key, annotation);
            }
            return annotation;
        }
    }

    /**
     * Builds the result from the annotation factory, the annotations gathered
     * per output value and the current count of unrecognized inputs.
     *
     * <p>
     * The annotation map is handed over as-is, not as a copy.
     * </p>
     */
    @Override
    public CountryStandardizationResult getResult() {
        return new CountryStandardizationResult(_rowAnnotationFactory, countryCountMap,
                _unrecognizedCountries.intValue());
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy