All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.bm.Rule Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 

* A phoneme rule. *

*

* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply and a logical flag indicating if * all lanugages must be in play. A rule matches if: *

    *
  • the pattern matches at the current position
  • *
  • the string up until the beginning of the pattern matches the left context
  • *
  • the string from the end of the pattern matches the right context
  • *
  • logical is ALL and all languages are in scope; or
  • *
  • logical is any other value and at least one language is in scope
  • *
*

*

* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user to explicitly construct their * own. *

*

* Rules are immutable and thread-safe. *

Rules resources

*

* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the pattern: *

org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt
*

*

* The format of these resources is the following: *

    *
  • Rules: whitespace separated, double-quoted strings. There should be 4 columns to each row, and these will be interpreted as: *
      *
    1. pattern
    2. *
    3. left context
    4. *
    5. right context
    6. *
    7. phoneme
    8. *
    *
  • *
  • End-of-line comments: Any occurance of '//' will cause all text following on that line to be discarded as a comment.
  • *
  • Multi-line comments: Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a * line ending in '*' and '/' is found.
  • *
  • Blank lines: All blank lines will be skipped.
  • *
*

* * @author Apache Software Foundation * @since 1.6 */ public class Rule { public static final class Phoneme implements PhonemeExpr { public static final Comparator COMPARATOR = new Comparator() { public int compare(Phoneme o1, Phoneme o2) { for (int i = 0; i < o1.phonemeText.length(); i++) { if (i >= o2.phonemeText.length()) { return +1; } int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); if (c != 0) { return c; } } if (o1.phonemeText.length() < o2.phonemeText.length()) { return -1; } return 0; } }; private final CharSequence phonemeText; private final Languages.LanguageSet languages; public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) { this.phonemeText = phonemeText; this.languages = languages; } public Phoneme append(CharSequence str) { return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); } public Languages.LanguageSet getLanguages() { return this.languages; } public Iterable getPhonemes() { return Collections.singleton(this); } public CharSequence getPhonemeText() { return this.phonemeText; } public Phoneme join(Phoneme right) { return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), this.languages.restrictTo(right.languages)); } } public interface PhonemeExpr { Iterable getPhonemes(); } public static final class PhonemeList implements PhonemeExpr { private final List phonemes; public PhonemeList(List phonemes) { this.phonemes = phonemes; } public List getPhonemes() { return this.phonemes; } } /** * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. */ public static interface RPattern { boolean isMatch(CharSequence input); } public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { public boolean isMatch(CharSequence input) { return true; } }; public static final String ALL = "ALL"; private static final String DOUBLE_QUOTE = "\""; private static final String HASH_INCLUDE = "#include"; private static final Map>>> RULES = new EnumMap>>>( NameType.class); static { for (NameType s : NameType.values()) { Map>> rts = new EnumMap>>(RuleType.class); for (RuleType rt : RuleType.values()) { Map> rs = new HashMap>(); Languages ls = Languages.getInstance(s); for (String l : ls.getLanguages()) { try { rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); } catch (IllegalStateException e) { throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); } } if (!rt.equals(RuleType.RULES)) { rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); } rts.put(rt, Collections.unmodifiableMap(rs)); } RULES.put(s, Collections.unmodifiableMap(rts)); } } private static boolean contains(CharSequence chars, char input) { for (int i = 0; i < chars.length(); i++) { if (chars.charAt(i) == input) { return true; } } return false; } private static String createResourceName(NameType nameType, RuleType rt, String lang) { return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang); } private static Scanner createScanner(NameType nameType, RuleType rt, String lang) { String resName = createResourceName(nameType, rt, lang); InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); if (rulesIS == null) { throw new IllegalArgumentException("Unable to load resource: " + resName); } return new Scanner(rulesIS, ResourceConstants.ENCODING); } private static Scanner createScanner(String lang) { String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); if (rulesIS == null) { throw new IllegalArgumentException("Unable to load resource: " + resName); } return new Scanner(rulesIS, ResourceConstants.ENCODING); } private static boolean endsWith(CharSequence input, CharSequence suffix) { if (suffix.length() > input.length()) { return false; } for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { if (input.charAt(i) != suffix.charAt(j)) { return false; } } return true; } /** * Gets rules for a combination of name type, rule type and languages. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param langs * the set of languages to consider * @return a list of Rules that apply */ public static List getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) { return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) : getInstance(nameType, rt, Languages.ANY); } /** * Gets rules for a combination of name type, rule type and a single language. * * @param nameType * the NameType to consider * @param rt * the RuleType to consider * @param lang * the language to consider * @return a list rules for a combination of name type, rule type and a single language. */ public static List getInstance(NameType nameType, RuleType rt, String lang) { List rules = RULES.get(nameType).get(rt).get(lang); if (rules == null) { throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", nameType.getName(), rt.getName(), lang)); } return rules; } private static Phoneme parsePhoneme(String ph) { int open = ph.indexOf("["); if (open >= 0) { if (!ph.endsWith("]")) { throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); } String before = ph.substring(0, open); String in = ph.substring(open + 1, ph.length() - 1); Set langs = new HashSet(Arrays.asList(in.split("[+]"))); return new Phoneme(before, Languages.LanguageSet.from(langs)); } else { return new Phoneme(ph, Languages.ANY_LANGUAGE); } } private static PhonemeExpr parsePhonemeExpr(String ph) { if (ph.startsWith("(")) { // we have a bracketed list of options if (!ph.endsWith(")")) { throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); } List phs = new ArrayList(); String body = ph.substring(1, ph.length() - 1); for (String part : body.split("[|]")) { phs.add(parsePhoneme(part)); } if (body.startsWith("|") || body.endsWith("|")) { phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); } return new PhonemeList(phs); } else { return parsePhoneme(ph); } } private static List parseRules(final Scanner scanner, final String location) { List lines = new ArrayList(); int currentLine = 0; boolean inMultilineComment = false; while (scanner.hasNextLine()) { currentLine++; String rawLine = scanner.nextLine(); String line = rawLine; if (inMultilineComment) { if (line.endsWith(ResourceConstants.EXT_CMT_END)) { inMultilineComment = false; } else { // skip } } else { if (line.startsWith(ResourceConstants.EXT_CMT_START)) { inMultilineComment = true; } else { // discard comments int cmtI = line.indexOf(ResourceConstants.CMT); if (cmtI >= 0) { line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.length() == 0) { continue; // empty lines can be safely skipped } if (line.startsWith(HASH_INCLUDE)) { // include statement String incl = line.substring(HASH_INCLUDE.length()).trim(); if (incl.contains(" ")) { System.err.println("Warining: malformed import statement: " + rawLine); } else { lines.addAll(parseRules(createScanner(incl), location + "->" + incl)); } } else { // rule String[] parts = line.split("\\s+"); if (parts.length != 4) { System.err.println("Warning: malformed rule statement split into " + parts.length + " parts: " + rawLine); } else { try { String pat = stripQuotes(parts[0]); String lCon = stripQuotes(parts[1]); String rCon = stripQuotes(parts[2]); PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); final int cLine = currentLine; Rule r = new Rule(pat, lCon, rCon, ph) { private final int myLine = cLine; private final String loc = location; @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append("Rule"); sb.append("{line=").append(myLine); sb.append(", loc='").append(loc).append('\''); sb.append('}'); return sb.toString(); } }; lines.add(r); } catch (IllegalArgumentException e) { throw new IllegalStateException("Problem parsing line " + currentLine, e); } } } } } } return lines; } /** * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. * * @param regex * the regular expression to compile * @return an RPattern that will match this regex */ private static RPattern pattern(final String regex) { boolean startsWith = regex.startsWith("^"); boolean endsWith = regex.endsWith("$"); final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); boolean boxes = content.contains("["); if (!boxes) { if (startsWith && endsWith) { // exact match if (content.length() == 0) { // empty return new RPattern() { public boolean isMatch(CharSequence input) { return input.length() == 0; } }; } else { return new RPattern() { public boolean isMatch(CharSequence input) { return input.equals(content); } }; } } else if ((startsWith || endsWith) && content.length() == 0) { // matches every string return ALL_STRINGS_RMATCHER; } else if (startsWith) { // matches from start return new RPattern() { public boolean isMatch(CharSequence input) { return startsWith(input, content); } }; } else if (endsWith) { // matches from start return new RPattern() { public boolean isMatch(CharSequence input) { return endsWith(input, content); } }; } } else { boolean startsWithBox = content.startsWith("["); boolean endsWithBox = content.endsWith("]"); if (startsWithBox && endsWithBox) { String boxContent = content.substring(1, content.length() - 1); if (!boxContent.contains("[")) { // box containing alternatives boolean negate = boxContent.startsWith("^"); if (negate) { boxContent = boxContent.substring(1); } final String bContent = boxContent; final boolean shouldMatch = !negate; if (startsWith && endsWith) { // exact match return new RPattern() { public boolean isMatch(CharSequence input) { return input.length() == 1 && (contains(bContent, input.charAt(0)) == shouldMatch); } }; } else if (startsWith) { // first char return new RPattern() { public boolean isMatch(CharSequence input) { return input.length() > 0 && (contains(bContent, input.charAt(0)) == shouldMatch); } }; } else if (endsWith) { // last char return new RPattern() { public boolean isMatch(CharSequence input) { return input.length() > 0 && (contains(bContent, input.charAt(input.length() - 1)) == shouldMatch); } }; } } } } // System.out.println("Couldn't optimize regex: " + regex); return new RPattern() { Pattern pattern = Pattern.compile(regex); public boolean isMatch(CharSequence input) { Matcher matcher = pattern.matcher(input); return matcher.find(); } }; } private static boolean startsWith(CharSequence input, CharSequence prefix) { if (prefix.length() > input.length()) { return false; } for (int i = 0; i < prefix.length(); i++) { if (input.charAt(i) != prefix.charAt(i)) { return false; } } return true; } private static String stripQuotes(String str) { if (str.startsWith(DOUBLE_QUOTE)) { str = str.substring(1); } if (str.endsWith(DOUBLE_QUOTE)) { str = str.substring(0, str.length() - 1); } return str; } private final RPattern lContext; private final String pattern; private final PhonemeExpr phoneme; private final RPattern rContext; /** * Creates a new rule. * * @param pattern * the pattern * @param lContext * the left context * @param rContext * the right context * @param phoneme * the resulting phoneme */ public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) { this.pattern = pattern; this.lContext = pattern(lContext + "$"); this.rContext = pattern("^" + rContext); this.phoneme = phoneme; } /** * Gets the left context. This is a regular expression that must match to the left of the pattern. * * @return the left context Pattern */ public RPattern getLContext() { return this.lContext; } /** * Gets the pattern. This is a string-literal that must exactly match. * * @return the pattern */ public String getPattern() { return this.pattern; } /** * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. * * @return the phoneme */ public PhonemeExpr getPhoneme() { return this.phoneme; } /** * Gets the right context. This is a regular expression that must match to the right of the pattern. * * @return the right context Pattern */ public RPattern getRContext() { return this.rContext; } /** * Decides if the pattern and context match the input starting at a position. It is a match if the * lContext matches input up to i, pattern matches at i and * rContext matches from the end of the match of pattern to the end of input. * * @param input * the input String * @param i * the int position within the input * @return true if the pattern and left/right context match, false otherwise */ public boolean patternAndContextMatches(CharSequence input, int i) { if (i < 0) { throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); } int patternLength = this.pattern.length(); int ipl = i + patternLength; if (ipl > input.length()) { // not enough room for the pattern to match return false; } // fixme: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid // evaluating latter ones if earlier ones have already failed, but that would make the code a lot harder to // read boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern); boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length())); boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i)); return patternMatches && rContextMatches && lContextMatches; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy