All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.optimaize.langdetect.text.RemoveMinorityScriptsTextFilter Maven / Gradle / Ivy

package com.optimaize.langdetect.text;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Removes text written in scripts that are not the dominant script of the text.
 *
 * TODO this does not do special handling for Japanese (3 scripts) and Korean (2 scripts), they should be
 * counted together and kept.
 *
 * @author Fabian Kessler
 */
public class RemoveMinorityScriptsTextFilter implements TextFilter {

    private final double threshold;

    /**
     * Creates a filter that removes the text of every script whose amount of content, put in relation
     * to the most used script, is at or below the given fraction.
     *
     * Example: Latin 10%, Cyrillic 80%, Common 10% (punctuation and the like). Now 10 is put in
     * relation to 80, which gives a ratio of 0.125.
     *
     * @param threshold 0-1, suggested value is 0.3. A script whose ratio is smaller than or equal to
     *                  this value is removed; a script with a larger ratio remains.
     * @return a new filter that uses the given threshold
     */
    public static RemoveMinorityScriptsTextFilter forThreshold(double threshold) {
        final RemoveMinorityScriptsTextFilter filter = new RemoveMinorityScriptsTextFilter(threshold);
        return filter;
    }

    /**
     * Stores the threshold used by {@link #filter(CharSequence)}.
     *
     * Private: instances are created through {@link #forThreshold(double)}.
     *
     * @param threshold the fraction a script's count is compared against; stored as given, without validation
     */
    private RemoveMinorityScriptsTextFilter(double threshold) {
        this.threshold = threshold;
    }

    /**
     * Removes the text of every script whose count, relative to the most frequent script,
     * is at or below the configured threshold.
     *
     * The per-script counts are obtained from {@code countByScript}. When at most one script was
     * counted, or when no script qualifies for removal, the input is returned unchanged (as a String).
     *
     * @param text the text to filter
     * @return {@code text.toString()} when nothing is removed, otherwise the result of removing the
     *         minority scripts from {@code text}
     */
    @Override
    public String filter(CharSequence text) {
        Map<Character.UnicodeScript, Long> counts = countByScript(text);
        if (counts.size() <= 1) {
            // Zero or one counted script: there is no minority to remove.
            return text.toString();
        }

        long most = findMost(counts);
        Set<Character.UnicodeScript> toRemove = new HashSet<>();
        for (Map.Entry<Character.UnicodeScript, Long> entry : counts.entrySet()) {
            long count = entry.getValue();
            if (count == most) {
                // The most frequent script, and any script tied with it, is always kept.
                continue;
            }
            double ratio = (double) count / most;
            if (ratio <= threshold) {
                toRemove.add(entry.getKey());
            }
        }

        if (toRemove.isEmpty()) {
            return text.toString();
        }
        return remove(text, toRemove);
    }

    // NOTE(review): this span is damaged in this copy of the file. The for-loop header below breaks
    // off after "i" and resumes at "counts) {", so the rest of remove()'s loop, its return statement
    // and the signature of the following method are missing. The surviving tail returns the largest
    // value in counts and is presumably the body of findMost(...), which filter() calls -- confirm
    // against the upstream source and restore the lost text before compiling.
    private String remove(CharSequence text, Set toRemove) {
        StringBuilder remaining = new StringBuilder();
        Character.UnicodeScript last = null;
        for (int i=0; i counts) {
        // Starts at 0, so a map without values yields 0.
        long max = 0L;
        for (Long aLong : counts.values()) {
            if (aLong > max) max = aLong;
        }
        return max;
    }

    // NOTE(review): this span is damaged in this copy of the file. The for-loop header below breaks
    // off after "i" and resumes at "counter, Character.UnicodeScript unicodeScript) {", so the rest
    // of countByScript()'s loop, its return statement and the start of the following method's
    // signature are missing. The surviving tail adds one to the count stored for unicodeScript and
    // is presumably a counter-increment helper whose name is not visible here -- confirm against
    // the upstream source and restore the lost text before compiling.
    private Map countByScript(CharSequence text) {
        Map counter = new HashMap<>();
        Character.UnicodeScript last = null;
        for (int i=0; i counter, Character.UnicodeScript unicodeScript) {
        Long number = counter.get(unicodeScript);
        if (number==null) {
            // First occurrence of this script: start its count at 1.
            counter.put(unicodeScript, 1L);
        } else {
            counter.put(unicodeScript, number+1);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy