All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.bm.Rule Maven / Gradle / Ivy

There is a newer version: 2.0.31
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.Resources;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;

/**
 * A phoneme rule.
 * 

* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply * and a logical flag indicating if all languages must be in play. A rule matches if: *

*
    *
  • the pattern matches at the current position
  • *
  • the string up until the beginning of the pattern matches the left context
  • *
  • the string from the end of the pattern matches the right context
  • *
  • logical is ALL and all languages are in scope; or
  • *
  • logical is any other value and at least one language is in scope
  • *
*

* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user * to explicitly construct their own. *

*

* Rules are immutable and thread-safe. *

*

Rules resources

*

* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically * named following the pattern: *

*
org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt
*

* The format of these resources is the following: *

*
    *
  • Rules: whitespace separated, double-quoted strings. There should be 4 columns to each row, and these * will be interpreted as: *
      *
    1. pattern
    2. *
    3. left context
    4. *
    5. right context
    6. *
    7. phoneme
    8. *
    *
  • *
  • End-of-line comments: Any occurrence of '//' will cause all text following on that line to be discarded * as a comment.
  • *
  • Multi-line comments: Any line starting with '/*' will start multi-line commenting mode. This will skip * all content until a line ending in '*' and '/' is found.
  • *
  • Blank lines: All blank lines will be skipped.
  • *
* * @since 1.6 */ public class Rule { public static final class Phoneme implements PhonemeExpr { public static final Comparator COMPARATOR = (o1, o2) -> { final int o1Length = o1.phonemeText.length(); final int o2Length = o2.phonemeText.length(); for (int i = 0; i < o1Length; i++) { if (i >= o2Length) { return +1; } final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); if (c != 0) { return c; } } if (o1Length < o2Length) { return -1; } return 0; }; private final StringBuilder phonemeText; private final Languages.LanguageSet languages; public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { this.phonemeText = new StringBuilder(phonemeText); this.languages = languages; } public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { this(phonemeLeft.phonemeText, phonemeLeft.languages); this.phonemeText.append(phonemeRight.phonemeText); } public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { this(phonemeLeft.phonemeText, languages); this.phonemeText.append(phonemeRight.phonemeText); } public Phoneme append(final CharSequence str) { this.phonemeText.append(str); return this; } public Languages.LanguageSet getLanguages() { return this.languages; } @Override public Iterable getPhonemes() { return Collections.singleton(this); } public CharSequence getPhonemeText() { return this.phonemeText; } /** * Deprecated since 1.9. * * @param right the Phoneme to join * @return a new Phoneme * @deprecated since 1.9 */ @Deprecated public Phoneme join(final Phoneme right) { return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), this.languages.restrictTo(right.languages)); } /** * Returns a new Phoneme with the same text but a union of its * current language set and the given one. * * @param lang the language set to merge * @return a new Phoneme */ public Phoneme mergeWithLanguage(final LanguageSet lang) { return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); } @Override public int size() { return 1; } @Override public String toString() { return phonemeText.toString() + "[" + languages + "]"; } } public interface PhonemeExpr { Iterable getPhonemes(); /** * Gets the expression size in phonemes. * * @return the expression size in phonemes. * @since 1.17.0 */ default int size() { // All implementations are int-bound. return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE); } } public static final class PhonemeList implements PhonemeExpr { private final List phonemeList; public PhonemeList(final List phonemes) { this.phonemeList = phonemes; } @Override public List getPhonemes() { return phonemeList; } @Override public int size() { return phonemeList.size(); } } /** * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. */ public interface RPattern { boolean isMatch(CharSequence input); } public static final RPattern ALL_STRINGS_RMATCHER = input -> true; public static final String ALL = "ALL"; private static final String DOUBLE_QUOTE = "\""; private static final String HASH_INCLUDE = "#include"; private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); private static final Map>>>> RULES = new EnumMap<>(NameType.class); static { for (final NameType s : NameType.values()) { final Map>>> rts = new EnumMap<>(RuleType.class); for (final RuleType rt : RuleType.values()) { final Map>> rs = new HashMap<>(); final Languages ls = Languages.getInstance(s); ls.getLanguages().forEach(l -> { try (final Scanner scanner = createScanner(s, rt, l)) { rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); } catch (final IllegalStateException e) { throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); } }); if (!rt.equals(RuleType.RULES)) { try (final Scanner scanner = createScanner(s, rt, "common")) { rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); } } rts.put(rt, Collections.unmodifiableMap(rs)); } RULES.put(s, Collections.unmodifiableMap(rts)); } } private static boolean contains(final CharSequence chars, final char input) { return chars.chars().anyMatch(c -> c == input); } private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang); } @SuppressWarnings("resource") // Closing the Scanner closes the resource private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { final String resName = createResourceName(nameType, rt, lang); return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); } @SuppressWarnings("resource") // Closing the Scanner closes the resource private static Scanner createScanner(final String lang) { final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); } private static boolean endsWith(final CharSequence input, final CharSequence suffix) { final int suffixLength = suffix.length(); final int inputLength = input.length(); if (suffixLength > inputLength) { return false; } for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { if (input.charAt(i) != suffix.charAt(j)) { return false; } } return true; } /** * Gets rules for a combination of name type, rule type and languages. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param langs * the set of languages to consider * @return a list of Rules that apply */ public static List getInstance(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) { final Map> ruleMap = getInstanceMap(nameType, rt, langs); final List allRules = new ArrayList<>(); ruleMap.values().forEach(rules -> allRules.addAll(rules)); return allRules; } /** * Gets rules for a combination of name type, rule type and a single language. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param lang * the language to consider * @return a list of Rules that apply */ public static List getInstance(final NameType nameType, final RuleType rt, final String lang) { return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); } /** * Gets rules for a combination of name type, rule type and languages. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param langs * the set of languages to consider * @return a map containing all Rules that apply, grouped by the first character of the rule pattern * @since 1.9 */ public static Map> getInstanceMap(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) { return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : getInstanceMap(nameType, rt, Languages.ANY); } /** * Gets rules for a combination of name type, rule type and a single language. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param lang * the language to consider * @return a map containing all Rules that apply, grouped by the first character of the rule pattern * @since 1.9 */ public static Map> getInstanceMap(final NameType nameType, final RuleType rt, final String lang) { final Map> rules = RULES.get(nameType).get(rt).get(lang); if (rules == null) { throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", nameType.getName(), rt.getName(), lang)); } return rules; } private static Phoneme parsePhoneme(final String ph) { final int open = ph.indexOf("["); if (open >= 0) { if (!ph.endsWith("]")) { throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); } final String before = ph.substring(0, open); final String in = ph.substring(open + 1, ph.length() - 1); final Set langs = new HashSet<>(Arrays.asList(in.split("[+]"))); return new Phoneme(before, Languages.LanguageSet.from(langs)); } return new Phoneme(ph, Languages.ANY_LANGUAGE); } private static PhonemeExpr parsePhonemeExpr(final String ph) { if (ph.startsWith("(")) { // we have a bracketed list of options if (!ph.endsWith(")")) { throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); } final List phs = new ArrayList<>(); final String body = ph.substring(1, ph.length() - 1); for (final String part : body.split("[|]")) { phs.add(parsePhoneme(part)); } if (body.startsWith("|") || body.endsWith("|")) { phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); } return new PhonemeList(phs); } return parsePhoneme(ph); } private static Map> parseRules(final Scanner scanner, final String location) { final Map> lines = new HashMap<>(); int currentLine = 0; boolean inMultilineComment = false; while (scanner.hasNextLine()) { currentLine++; final String rawLine = scanner.nextLine(); String line = rawLine; if (inMultilineComment) { if (line.endsWith(ResourceConstants.EXT_CMT_END)) { inMultilineComment = false; } } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { inMultilineComment = true; } else { // discard comments final int cmtI = line.indexOf(ResourceConstants.CMT); if (cmtI >= 0) { line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.isEmpty()) { continue; // empty lines can be safely skipped } if (line.startsWith(HASH_INCLUDE)) { // include statement final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); if (incl.contains(" ")) { throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + location); } try (final Scanner hashIncludeScanner = createScanner(incl)) { lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); } } else { // rule final String[] parts = line.split("\\s+"); if (parts.length != 4) { throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); } try { final String pat = stripQuotes(parts[0]); final String lCon = stripQuotes(parts[1]); final String rCon = stripQuotes(parts[2]); final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); final int cLine = currentLine; final Rule r = new Rule(pat, lCon, rCon, ph) { private final int myLine = cLine; private final String loc = location; @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append("Rule"); sb.append("{line=").append(myLine); sb.append(", loc='").append(loc).append('\''); sb.append(", pat='").append(pat).append('\''); sb.append(", lcon='").append(lCon).append('\''); sb.append(", rcon='").append(rCon).append('\''); sb.append('}'); return sb.toString(); } }; final String patternKey = r.pattern.substring(0, 1); final List rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); rules.add(r); } catch (final IllegalArgumentException e) { throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e); } } } } return lines; } /** * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. * * @param regex * the regular expression to compile * @return an RPattern that will match this regex */ private static RPattern pattern(final String regex) { final boolean startsWith = regex.startsWith("^"); final boolean endsWith = regex.endsWith("$"); final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); final boolean boxes = content.contains("["); if (!boxes) { if (startsWith && endsWith) { // exact match if (content.isEmpty()) { // empty return input -> input.length() == 0; } return input -> input.equals(content); } if ((startsWith || endsWith) && content.isEmpty()) { // matches every string return ALL_STRINGS_RMATCHER; } if (startsWith) { // matches from start return input -> startsWith(input, content); } if (endsWith) { // matches from start return input -> endsWith(input, content); } } else { final boolean startsWithBox = content.startsWith("["); final boolean endsWithBox = content.endsWith("]"); if (startsWithBox && endsWithBox) { String boxContent = content.substring(1, content.length() - 1); if (!boxContent.contains("[")) { // box containing alternatives final boolean negate = boxContent.startsWith("^"); if (negate) { boxContent = boxContent.substring(1); } final String bContent = boxContent; final boolean shouldMatch = !negate; if (startsWith && endsWith) { // exact match return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; } if (startsWith) { // first char return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; } if (endsWith) { // last char return input -> input.length() > 0 && contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; } } } } return new RPattern() { final Pattern pattern = Pattern.compile(regex); @Override public boolean isMatch(final CharSequence input) { final Matcher matcher = pattern.matcher(input); return matcher.find(); } }; } private static boolean startsWith(final CharSequence input, final CharSequence prefix) { if (prefix.length() > input.length()) { return false; } for (int i = 0; i < prefix.length(); i++) { if (input.charAt(i) != prefix.charAt(i)) { return false; } } return true; } private static String stripQuotes(String str) { if (str.startsWith(DOUBLE_QUOTE)) { str = str.substring(1); } if (str.endsWith(DOUBLE_QUOTE)) { str = str.substring(0, str.length() - 1); } return str; } private final RPattern lContext; private final String pattern; private final PhonemeExpr phoneme; private final RPattern rContext; /** * Creates a new rule. * * @param pattern * the pattern * @param lContext * the left context * @param rContext * the right context * @param phoneme * the resulting phoneme */ public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { this.pattern = pattern; this.lContext = pattern(lContext + "$"); this.rContext = pattern("^" + rContext); this.phoneme = phoneme; } /** * Gets the left context. This is a regular expression that must match to the left of the pattern. * * @return the left context Pattern */ public RPattern getLContext() { return this.lContext; } /** * Gets the pattern. This is a string-literal that must exactly match. * * @return the pattern */ public String getPattern() { return this.pattern; } /** * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. * * @return the phoneme */ public PhonemeExpr getPhoneme() { return this.phoneme; } /** * Gets the right context. This is a regular expression that must match to the right of the pattern. * * @return the right context Pattern */ public RPattern getRContext() { return this.rContext; } /** * Decides if the pattern and context match the input starting at a position. It is a match if the * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. * * @param input * the input String * @param i * the int position within the input * @return true if the pattern and left/right context match, false otherwise */ public boolean patternAndContextMatches(final CharSequence input, final int i) { if (i < 0) { throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); } final int patternLength = this.pattern.length(); final int ipl = i + patternLength; if (ipl > input.length()) { // not enough room for the pattern to match return false; } // evaluate the pattern, left context and right context // fail early if any of the evaluations is not successful if (!input.subSequence(i, ipl).equals(this.pattern)) { return false; } if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { return false; } return this.lContext.isMatch(input.subSequence(0, i)); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy