All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.TransliterationRuleSet Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.util.ArrayList;
import java.util.List;

import com.ibm.icu.impl.UtilityExtensions;

/**
 * A set of rules for a RuleBasedTransliterator.  This set encodes
 * the transliteration in one direction from one set of characters or short
 * strings to another.  A RuleBasedTransliterator consists of up to
 * two such sets, one for the forward direction, and one for the reverse.
 *
 * 

A TransliterationRuleSet has one important operation, that of * finding a matching rule at a given point in the text. This is accomplished * by the findMatch() method. * *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu */ class TransliterationRuleSet { /** * Vector of rules, in the order added. */ private List ruleVector; /** * Length of the longest preceding context */ private int maxContextLength; /** * Sorted and indexed table of rules. This is created by freeze() from * the rules in ruleVector. rules.length >= ruleVector.size(), and the * references in rules[] are aliases of the references in ruleVector. * A single rule in ruleVector is listed one or more times in rules[]. */ private TransliterationRule[] rules; /** * Index table. For text having a first character c, compute x = c&0xFF. * Now use rules[index[x]..index[x+1]-1]. This index table is created by * freeze(). */ private int[] index; /** * Construct a new empty rule set. */ public TransliterationRuleSet() { ruleVector = new ArrayList(); maxContextLength = 0; } /** * Return the maximum context length. * @return the length of the longest preceding context. */ public int getMaximumContextLength() { return maxContextLength; } /** * Add a rule to this set. Rules are added in order, and order is * significant. * @param rule the rule to add */ public void addRule(TransliterationRule rule) { ruleVector.add(rule); int len; if ((len = rule.getAnteContextLength()) > maxContextLength) { maxContextLength = len; } rules = null; } /** * Close this rule set to further additions, check it for masked rules, * and index it to optimize performance. * @exception IllegalArgumentException if some rules are masked */ public void freeze() { /* Construct the rule array and index table. We reorder the * rules by sorting them into 256 bins. Each bin contains all * rules matching the index value for that bin. A rule * matches an index value if string whose first key character * has a low byte equal to the index value can match the rule. * * Each bin contains zero or more rules, in the same order * they were found originally. However, the total rules in * the bins may exceed the number in the original vector, * since rules that have a variable as their first key * character will generally fall into more than one bin. * * That is, each bin contains all rules that either have that * first index value as their first key character, or have * a set containing the index value as their first character. */ int n = ruleVector.size(); index = new int[257]; // [sic] List v = new ArrayList(2*n); // heuristic; adjust as needed /* Precompute the index values. This saves a LOT of time. */ int[] indexValue = new int[n]; for (int j=0; j= 0) { if (indexValue[j] == x) { v.add(ruleVector.get(j)); } } else { // If the indexValue is < 0, then the first key character is // a set, and we must use the more time-consuming // matchesIndexValue check. In practice this happens // rarely, so we seldom tread this code path. TransliterationRule r = ruleVector.get(j); if (r.matchesIndexValue(x)) { v.add(r); } } } } index[256] = v.size(); /* Freeze things into an array. */ rules = new TransliterationRule[v.size()]; v.toArray(rules); StringBuilder errors = null; /* Check for masking. This is MUCH faster than our old check, * which was each rule against each following rule, since we * only have to check for masking within each bin now. It's * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule * count, and n2 is the per-bin rule count. But n2< " + UtilityExtensions.formatInput(text, pos)); } return true; case UnicodeMatcher.U_PARTIAL_MATCH: if (Transliterator.DEBUG) { System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") + rules[i].toRule(true) + " => " + UtilityExtensions.formatInput(text, pos)); } return false; default: if (Transliterator.DEBUG) { System.out.println("Rule: no match " + rules[i]); } } } // No match or partial match from any rule pos.start += UTF16.getCharCount(text.char32At(pos.start)); if (Transliterator.DEBUG) { System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") + UtilityExtensions.formatInput(text, pos)); } return true; } /** * Create rule strings that represents this rule set. */ String toRules(boolean escapeUnprintable) { int i; int count = ruleVector.size(); StringBuilder ruleSource = new StringBuilder(); for (i=0; i |b ; b > c ; // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) { UnicodeSet currentFilter = new UnicodeSet(filter); UnicodeSet revisiting = new UnicodeSet(); int count = ruleVector.size(); for (int i=0; i





© 2015 - 2024 Weber Informatics LLC | Privacy Policy