org.apache.commons.codec.language.bm.PhoneticEngine Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gwt-commons-codec Show documentation
The Apache Commons Codec component contains encoders and decoders for various formats such as Base16, Base32, Base64, digest, and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities. This is a port for GWT, which enables programs to use Apache Commons Codec in the frontend as well, where it is compiled to JavaScript by the GWT compiler.
There is a newer version: 1.17.1-0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.Rule.Phoneme;

/**
 * Converts words into potential phonetic representations.
 * <p>
 * This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
 * into account the likely source language. Next, this phonetic representation is converted into a
 * pan-European 'average' representation, allowing comparison between different versions of essentially
 * the same word from different languages.
 * </p>
 * <p>
 * This class is intentionally immutable and thread-safe.
 * If you wish to alter the settings for a PhoneticEngine, you
 * must make a new one with the updated settings.
 * </p>
 * <p>
 * Ported from phoneticengine.php
 * </p>
 *
 * @since 1.6
 */
public class PhoneticEngine {

    /**
     * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
     * this package, and probably not outside the {@link PhoneticEngine} class.
     * <p>
     * A builder is mutable: both {@link #append(CharSequence)} and {@link #apply(Rule.PhonemeExpr, int)}
     * change the phoneme set held by the instance they are invoked on.
     * </p>
     *
     * @since 1.6
     */
    static final class PhonemeBuilder {

        /**
         * An empty builder where all phonemes must come from some set of languages. This will contain a single
         * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
         * phoneme from scratch.
         *
         * @param languages the set of languages
         * @return  a new, empty phoneme builder
         */
        public static PhonemeBuilder empty(final Languages.LanguageSet languages) {
            return new PhonemeBuilder(new Rule.Phoneme("", languages));
        }

        /** The phonemes built so far; iterated by {@link #makeString()} in this set's iteration order. */
        private final Set<Rule.Phoneme> phonemes;

        /**
         * Creates a builder holding exactly one phoneme.
         *
         * @param phoneme the initial phoneme
         */
        private PhonemeBuilder(final Rule.Phoneme phoneme) {
            this.phonemes = new LinkedHashSet<>();
            this.phonemes.add(phoneme);
        }

        /**
         * Creates a builder backed directly by the given set; the set is not copied.
         *
         * @param phonemes the phoneme set to adopt
         */
        private PhonemeBuilder(final Set<Rule.Phoneme> phonemes) {
            this.phonemes = phonemes;
        }

        /**
         * Appends {@code str} to every phoneme currently held by this builder, by calling
         * {@code Rule.Phoneme.append} on each of them. No new builder is created.
         *
         * @param str   the characters to append to the phonemes
         */
        public void append(final CharSequence str) {
            for (final Rule.Phoneme ph : this.phonemes) {
                ph.append(str);
            }
        }

        /**
         * Applies the given phoneme expression to all phonemes in this phoneme builder.
         * <p>
         * Every phoneme already in the builder is paired with every phoneme of the expression. A pair whose
         * language sets have a non-empty restriction yields a new, joined phoneme; pairs with an empty
         * restriction are dropped. The builder's content is then replaced by the joined phonemes, so at most
         * {@code maxPhonemes} of them are kept.
         * </p>
         *
         * @param phonemeExpr   the expression to apply
         * @param maxPhonemes   the maximum number of phonemes to build up
         */
        public void apply(final Rule.PhonemeExpr phonemeExpr, final int maxPhonemes) {
            final Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<>(maxPhonemes);

            EXPR: for (final Rule.Phoneme left : this.phonemes) {
                for (final Rule.Phoneme right : phonemeExpr.getPhonemes()) {
                    final LanguageSet languages = left.getLanguages().restrictTo(right.getLanguages());
                    if (!languages.isEmpty()) {
                        final Rule.Phoneme join = new Phoneme(left, right, languages);
                        if (newPhonemes.size() < maxPhonemes) {
                            newPhonemes.add(join);
                            if (newPhonemes.size() >= maxPhonemes) {
                                // limit reached: stop scanning both loops
                                break EXPR;
                            }
                        }
                    }
                }
            }

            // swap the content in place so that the builder instance itself stays the same
            this.phonemes.clear();
            this.phonemes.addAll(newPhonemes);
        }

        /**
         * Gets underlying phoneme set. Please don't mutate.
         *
         * @return  the phoneme set (the live internal set, not a copy)
         */
        public Set<Rule.Phoneme> getPhonemes() {
            return this.phonemes;
        }

        /**
         * Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
         * joined with a pipe. This is explicitly provided in place of toString as it is a potentially
         * expensive operation, which should be avoided when debugging.
         *
         * @return  the stringified phoneme set
         */
        public String makeString() {
            final StringBuilder sb = new StringBuilder();

            for (final Rule.Phoneme ph : this.phonemes) {
                if (sb.length() > 0) {
                    sb.append("|");
                }
                sb.append(ph.getPhonemeText());
            }

            return sb.toString();
        }
    }

    /**
     * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
     * After invocation, the values {@code i} and {@code found} are updated. {@code i} points to the
     * index of the next char in {@code input} that must be processed next (the input up to that index having been
     * processed already), and {@code found} indicates if a matching rule was found or not. In the case where a
     * matching rule was found, {@code phonemeBuilder} is replaced with a new builder containing the phonemes
     * updated by the matching rule.
     *
     * Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
     * as it is constructed as needed by the calling methods.
     * @since 1.6
     */
    private static final class RulesApplication {
        private final Map> finalRules;
        private final CharSequence input;

        private final PhonemeBuilder phonemeBuilder;
        private int i;
        private final int maxPhonemes;
        private boolean found;

        public RulesApplication(final Map> finalRules, final CharSequence input,
                                final PhonemeBuilder phonemeBuilder, final int i, final int maxPhonemes) {
            Objects.requireNonNull(finalRules, "finalRules");
            this.finalRules = finalRules;
            this.phonemeBuilder = phonemeBuilder;
            this.input = input;
            this.i = i;
            this.maxPhonemes = maxPhonemes;
        }

        public int getI() {
            return this.i;
        }

        public PhonemeBuilder getPhonemeBuilder() {
            return this.phonemeBuilder;
        }

        /**
         * Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
         * and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
         * match, {@code i} is advanced one and the character is silently dropped from the phonetic spelling.
         *
         * @return {@code this}
         */
        public RulesApplication invoke() {
            this.found = false;
            int patternLength = 1;
            final List rules = this.finalRules.get(input.subSequence(i, i+patternLength));
            if (rules != null) {
                for (final Rule rule : rules) {
                    final String pattern = rule.getPattern();
                    patternLength = pattern.length();
                    if (rule.patternAndContextMatches(this.input, this.i)) {
                        this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
                        this.found = true;
                        break;
                    }
                }
            }

            if (!this.found) {
                patternLength = 1;
            }

            this.i += patternLength;
            return this;
        }

        public boolean isFound() {
            return this.found;
        }
    }

    private static final Map> NAME_PREFIXES = new EnumMap<>(NameType.class);

    static {
        NAME_PREFIXES.put(NameType.ASHKENAZI,
                Collections.unmodifiableSet(
                        new HashSet<>(Arrays.asList("bar", "ben", "da", "de", "van", "von"))));
        NAME_PREFIXES.put(NameType.SEPHARDIC,
                Collections.unmodifiableSet(
                        new HashSet<>(Arrays.asList("al", "el", "da", "dal", "de", "del", "dela", "de la",
                                                          "della", "des", "di", "do", "dos", "du", "van", "von"))));
        NAME_PREFIXES.put(NameType.GENERIC,
                Collections.unmodifiableSet(
                        new HashSet<>(Arrays.asList("da", "dal", "de", "del", "dela", "de la", "della",
                                                          "des", "di", "do", "dos", "du", "van", "von"))));
    }

    /**
     * Joins some strings with an internal separator.
     * <p>
     * An empty {@code strings} yields the empty string; a single element is returned without any separator.
     * </p>
     *
     * @param strings   Strings to join
     * @param sep       String to separate them with
     * @return a single String consisting of each element of {@code strings} interleaved by {@code sep}
     */
    private static String join(final Iterable<String> strings, final String sep) {
        final StringBuilder sb = new StringBuilder();
        final Iterator<String> si = strings.iterator();
        // the first element goes in bare, every later one is preceded by the separator
        if (si.hasNext()) {
            sb.append(si.next());
        }
        while (si.hasNext()) {
            sb.append(sep).append(si.next());
        }

        return sb.toString();
    }

    /** Phoneme limit used by the constructor that does not take an explicit {@code maxPhonemes}. */
    private static final int DEFAULT_MAX_PHONEMES = 20;

    /** Language-guessing rules, obtained from {@code Lang.instance(nameType)} at construction time. */
    private final Lang lang;

    /** The type of names this engine handles; selects the rule maps and the prefix handling in encode. */
    private final NameType nameType;

    /** The type of final rules applied after the language-specific rules; never {@code RuleType.RULES}. */
    private final RuleType ruleType;

    /** Whether the words of a multi-word name are joined and encoded as one input. */
    private final boolean concat;

    /** Upper bound handed to the phoneme builders when rules are applied. */
    private final int maxPhonemes;

    /**
     * Generates a new, fully-configured phonetic engine.
     * <p>
     * Delegates to the four-argument constructor with {@code DEFAULT_MAX_PHONEMES} as the phoneme limit.
     * </p>
     *
     * @param nameType
     *            the type of names it will use
     * @param ruleType
     *            the type of rules it will apply
     * @param concat
     *            if it will concatenate multiple encodings
     */
    public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concat) {
        this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
    }

    /**
     * Builds a phonetic engine with every setting supplied by the caller.
     * <p>
     * The language-guessing rules are looked up once, from {@code nameType}, and kept for the lifetime
     * of the engine.
     * </p>
     *
     * @param nameType
     *            the type of names it will use
     * @param ruleType
     *            the type of rules it will apply; {@code RuleType.RULES} is rejected
     * @param concat
     *            if it will concatenate multiple encodings
     * @param maxPhonemes
     *            the maximum number of phonemes that will be handled
     * @throws IllegalArgumentException
     *             if {@code ruleType} is {@code RuleType.RULES}
     * @since 1.7
     */
    public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concat,
                          final int maxPhonemes) {
        if (RuleType.RULES.equals(ruleType)) {
            throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
        }
        this.lang = Lang.instance(nameType);
        this.nameType = nameType;
        this.ruleType = ruleType;
        this.maxPhonemes = maxPhonemes;
        this.concat = concat;
    }

    /**
     * Applies the final rules to convert from a language-specific phonetic representation to a
     * language-independent representation.
     *
     * @param phonemeBuilder the current phonemes
     * @param finalRules the final rules to apply
     * @return the resulting phonemes
     */
    private PhonemeBuilder applyFinalRules(final PhonemeBuilder phonemeBuilder,
                                           final Map> finalRules) {
        Objects.requireNonNull(finalRules, "finalRules");
        if (finalRules.isEmpty()) {
            return phonemeBuilder;
        }

        final Map phonemes =
            new TreeMap<>(Rule.Phoneme.COMPARATOR);

        for (final Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
            PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
            final String phonemeText = phoneme.getPhonemeText().toString();

            for (int i = 0; i < phonemeText.length();) {
                final RulesApplication rulesApplication =
                        new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke();
                final boolean found = rulesApplication.isFound();
                subBuilder = rulesApplication.getPhonemeBuilder();

                if (!found) {
                    // not found, appending as-is
                    subBuilder.append(phonemeText.subSequence(i, i + 1));
                }

                i = rulesApplication.getI();
            }

            // the phonemes map orders the phonemes only based on their text, but ignores the language set
            // when adding new phonemes, check for equal phonemes and merge their language set, otherwise
            // phonemes with the same text but different language set get lost
            for (final Rule.Phoneme newPhoneme : subBuilder.getPhonemes()) {
                if (phonemes.containsKey(newPhoneme)) {
                    final Rule.Phoneme oldPhoneme = phonemes.remove(newPhoneme);
                    final Rule.Phoneme mergedPhoneme = oldPhoneme.mergeWithLanguage(newPhoneme.getLanguages());
                    phonemes.put(mergedPhoneme, mergedPhoneme);
                } else {
                    phonemes.put(newPhoneme, newPhoneme);
                }
            }
        }

        return new PhonemeBuilder(phonemes.keySet());
    }

    /**
     * Produces the phonetic representation of a string, letting the engine's {@code Lang} guess
     * which languages the string may come from before delegating to
     * {@link #encode(String, Languages.LanguageSet)}.
     *
     * @param input
     *            the String to encode
     * @return the encoding of the input
     */
    public String encode(final String input) {
        return encode(input, this.lang.guessLanguages(input));
    }

    /**
     * Encodes an input string into an output phonetic representation, given a set of possible origin languages.
     *
     * @param input
     *            String to phoneticise; a String with dashes or spaces separating each word
     * @param languageSet
     *            set of possible origin languages
     * @return a phonetic representation of the input; a String containing '-'-separated phonetic representations of the
     *         input
     */
    public String encode(String input, final Languages.LanguageSet languageSet) {
        final Map> rules = Rule.getInstanceMap(this.nameType, RuleType.RULES, languageSet);
        // rules common across many (all) languages
        final Map> finalRules1 = Rule.getInstanceMap(this.nameType, this.ruleType, "common");
        // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
        final Map> finalRules2 = Rule.getInstanceMap(this.nameType, this.ruleType, languageSet);

        // tidy the input
        // lower case is a locale-dependent operation
        input = input.toLowerCase(Locale.ENGLISH).replace('-', ' ').trim();

        if (this.nameType == NameType.GENERIC) {
            if (input.length() >= 2 && input.substring(0, 2).equals("d'")) { // check for d'
                final String remainder = input.substring(2);
                final String combined = "d" + remainder;
                return "(" + encode(remainder) + ")-(" + encode(combined) + ")";
            }
            for (final String l : NAME_PREFIXES.get(this.nameType)) {
                // handle generic prefixes
                if (input.startsWith(l + " ")) {
                    // check for any prefix in the words list
                    final String remainder = input.substring(l.length() + 1); // input without the prefix
                    final String combined = l + remainder; // input with prefix without space
                    return "(" + encode(remainder) + ")-(" + encode(combined) + ")";
                }
            }
        }

        final List words = Arrays.asList(input.split("\\s+"));
        final List words2 = new ArrayList<>();

        // special-case handling of word prefixes based upon the name type
        switch (this.nameType) {
        case SEPHARDIC:
            for (final String aWord : words) {
                final String[] parts = aWord.split("'");
                final String lastPart = parts[parts.length - 1];
                words2.add(lastPart);
            }
            words2.removeAll(NAME_PREFIXES.get(this.nameType));
            break;
        case ASHKENAZI:
            words2.addAll(words);
            words2.removeAll(NAME_PREFIXES.get(this.nameType));
            break;
        case GENERIC:
            words2.addAll(words);
            break;
        default:
            throw new IllegalStateException("Unreachable case: " + this.nameType);
        }

        if (this.concat) {
            // concat mode enabled
            input = join(words2, " ");
        } else if (words2.size() == 1) {
            // not a multi-word name
            input = words.iterator().next();
        } else {
            // encode each word in a multi-word name separately (normally used for approx matches)
            final StringBuilder result = new StringBuilder();
            for (final String word : words2) {
                result.append("-").append(encode(word));
            }
            // return the result without the leading "-"
            return result.substring(1);
        }

        PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet);

        // loop over each char in the input - we will handle the increment manually
        for (int i = 0; i < input.length();) {
            final RulesApplication rulesApplication =
                    new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).invoke();
            i = rulesApplication.getI();
            phonemeBuilder = rulesApplication.getPhonemeBuilder();
        }

        // Apply the general rules
        phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
        // Apply the language-specific rules
        phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);

        return phonemeBuilder.makeString();
    }

    /**
     * Gets the Lang language guessing rules being used.
     * <p>
     * This is the instance obtained from {@code Lang.instance(nameType)} when the engine was constructed;
     * {@link #encode(String)} uses it to guess the languages of its input.
     * </p>
     *
     * @return the Lang in use
     */
    public Lang getLang() {
        return this.lang;
    }

    /**
     * Gets the NameType being used.
     * <p>
     * The value is the one given to the constructor and never changes afterwards.
     * </p>
     *
     * @return the NameType in use
     */
    public NameType getNameType() {
        return this.nameType;
    }

    /**
     * Gets the RuleType being used.
     * <p>
     * The value is the one given to the constructor, which rejects {@code RuleType.RULES}.
     * </p>
     *
     * @return the RuleType in use
     */
    public RuleType getRuleType() {
        return this.ruleType;
    }

    /**
     * Gets whether the words of a multi-word name are concatenated before encoding.
     * <p>
     * When {@code true}, encode joins the remaining words with a single space and encodes them as one
     * input. When {@code false}, a multi-word name has each word encoded separately and the encodings
     * are joined with {@code -}.
     * </p>
     *
     * @return true if the words are concatenated and encoded together, false if they are encoded separately
     */
    public boolean isConcat() {
        return this.concat;
    }

    /**
     * Gets the maximum number of phonemes the engine will calculate for a given input.
     * <p>
     * This is the value given to the constructor, or {@code DEFAULT_MAX_PHONEMES} when the
     * three-argument constructor was used.
     * </p>
     *
     * @return the maximum number of phonemes
     * @since 1.7
     */
    public int getMaxPhonemes() {
        return this.maxPhonemes;
    }
}