All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.datacleaner.beans.stringpattern.PatternFinder Maven / Gradle / Ivy

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans.stringpattern;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

/***
 * A string pattern finder. This component can consume rows and produce string
 * patterns. It does not contain the actual logic to store/persist the rows, but
 * has callback methods so that it's rather easy to implement this on your own.
 * 
 * 
 * 
 * @param 
 *            the type representing the row. Enables the user of the class to
 *            use his own row type, such as InputRow, String[] or even just
 *            Object.
 */
public abstract class PatternFinder {

    private final ConcurrentHashMap> _patterns;
    private final TokenizerConfiguration _configuration;
    private final Tokenizer _tokenizer;

    public PatternFinder(Tokenizer tokenizer, TokenizerConfiguration configuration) {
        _configuration = configuration;
        _tokenizer = tokenizer;
        _patterns = new ConcurrentHashMap<>();
    }

    public PatternFinder(TokenizerConfiguration configuration) {
        this(new DefaultTokenizer(configuration), configuration);
    }

    /**
     * This method should be invoked by the user of the PatternFinder. Invoke it
     * for each value in your dataset. Repeated values are handled correctly but
     * if available it is more effecient to handle only the distinct values and
     * their corresponding distinct counts.
     * 
     * @param row
     *            the row containing the value
     * @param value
     *            the string value to be tokenized and matched against other
     *            patterns
     * @param distinctCount
     *            the count of the value
     */
    public void run(R row, String value, int distinctCount) {
        final List tokens;
        try {
            tokens = _tokenizer.tokenize(value);
        } catch (RuntimeException e) {
            throw new IllegalStateException("Error occurred while tokenizing value: " + value, e);
        }

        final String patternCode = getPatternCode(tokens);
        final Collection patterns = getOrCreatePatterns(patternCode);

        // lock on "patterns" since it is going to be the same collection for
        // all matching pattern codes.
        synchronized (patterns) {
            boolean match = false;
            for (TokenPattern pattern : patterns) {
                if (pattern.match(tokens)) {
                    storeMatch(pattern, row, value, distinctCount);
                    match = true;
                    break;
                }
            }

            if (!match) {
                final TokenPattern pattern;
                try {
                    pattern = new TokenPatternImpl(value, tokens, _configuration);
                } catch (RuntimeException e) {
                    throw new IllegalStateException("Error occurred while creating pattern for: " + tokens, e);
                }

                storeNewPattern(pattern, row, value, distinctCount);
                patterns.add(pattern);
            }
        }
    }

    /**
     * Gets a collection of known {@link TokenPattern}s that matches the pattern
     * code
     * 
     * @param patternCode
     * @return
     */
    private Collection getOrCreatePatterns(String patternCode) {
        // first try the cheapest get(..) method
        final Collection patterns = _patterns.get(patternCode);
        if (patterns != null) {
            return patterns;
        }

        // then try the concurrent version which requires a collection
        final Collection newPatterns = new ArrayList<>(3);
        final Collection existingPatterns = _patterns.putIfAbsent(patternCode, newPatterns);
        if (existingPatterns == null) {
            return newPatterns;
        }
        return existingPatterns;
    }

    /**
     * Creates an almost unique String code for a list of tokens. This code is
     * used to improve search time when looking for potential matching patterns.
     * 
     * @param tokens
     * @return
     */
    private String getPatternCode(List tokens) {
        final StringBuilder sb = new StringBuilder();
        sb.append(tokens.size());
        for (Token token : tokens) {
            sb.append(token.getType().ordinal());
        }
        return sb.toString();
    }

    public Collection getPatterns() {
        final Set result = new HashSet();
        final Collection> values = _patterns.values();
        for (Collection set : values) {
            result.addAll(set);
        }
        return result;
    }

    /**
     * This method is invoked every time a new pattern is created (ie. when a
     * match could not be found in the existing patterns).
     * 
     * @param pattern
     *            the newly produced pattern
     * @param row
     *            the row that was handed to the run(...) method
     * @param value
     *            the value that was handed to the run(...) method
     * @param distinctCount
     *            the distinctCount that was handed to the run(...) method
     */
    protected abstract void storeNewPattern(TokenPattern pattern, R row, String value, int distinctCount);

    /**
     * This method is invoked every time a tokenized value matches an existing
     * pattern. All existing patterns will previously have been created using
     * the storeNewPattern(...) method.
     * 
     * @param pattern
     *            the existing pattern
     * @param row
     *            the row that was handed to the run(...) method
     * @param value
     *            the value that was handed to the run(...) method
     * @param distinctCount
     *            the distinctCount that was handed to the run(...) method
     */
    protected abstract void storeMatch(TokenPattern pattern, R row, String value, int distinctCount);
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy