// com.ibm.icu.text.TransliterationRuleSet Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of virtdata-lib-realer Show documentation
// With inspiration from other libraries
// There is a newer version: 2.12.15
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.util.ArrayList;
import java.util.List;

import com.ibm.icu.impl.UtilityExtensions;

/**
 * A set of rules for a RuleBasedTransliterator.  This set encodes
 * the transliteration in one direction from one set of characters or short
 * strings to another.  A RuleBasedTransliterator consists of up to
 * two such sets, one for the forward direction, and one for the reverse.
 *
 * A TransliterationRuleSet has one important operation, that of
 * finding a matching rule at a given point in the text.  This is accomplished
 * by the findMatch() method.
 *
 * <p>Copyright © IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
 */
class TransliterationRuleSet {
    /**
     * Rules in the order they were added by addRule().  The element type is
     * spelled out so that entries can be read back as TransliterationRule
     * without a cast.
     */
    private List<TransliterationRule> ruleVector;

    /**
     * Length of the longest preceding context: the largest
     * getAnteContextLength() seen among the rules added so far.
     */
    private int maxContextLength;

    /**
     * Sorted and indexed table of rules.  This is created by freeze() from
     * the rules in ruleVector.  rules.length >= ruleVector.size(), and the
     * references in rules[] are aliases of the references in ruleVector.
     * A single rule in ruleVector is listed one or more times in rules[].
     * addRule() resets this to null.
     */
    private TransliterationRule[] rules;

    /**
     * Index table.  For text having a first character c, compute x = c&0xFF.
     * Now use rules[index[x]..index[x+1]-1].  This index table is created by
     * freeze() with 257 entries; the final entry, index[256], holds the total
     * number of entries in the table so that index[x+1] is valid for x == 255.
     */
    private int[] index;

    /**
     * Construct a new empty rule set: no rules, and a maximum context
     * length of zero.
     */
    public TransliterationRuleSet() {
        // Explicit element type instead of a raw ArrayList; addRule() only
        // ever stores TransliterationRule instances in this list.
        ruleVector = new ArrayList<TransliterationRule>();
        maxContextLength = 0;
    }

    /**
     * Report the maximum context length recorded for this rule set.
     * @return the current value of maxContextLength, i.e. the length of the
     * longest preceding context among the rules added so far.
     */
    public int getMaximumContextLength() {
        return this.maxContextLength;
    }

    /**
     * Append a rule to this set.  Insertion order is preserved and is
     * significant.  The maximum context length is raised if the new rule's
     * ante context is longer than any seen so far, and the rules[] table is
     * discarded.
     * @param rule the rule to add
     */
    public void addRule(TransliterationRule rule) {
        ruleVector.add(rule);

        // Keep the running maximum of the ante-context lengths.
        maxContextLength = Math.max(maxContextLength, rule.getAnteContextLength());

        // The rules[] table no longer reflects ruleVector; drop it.
        rules = null;
    }

    /**
     * Close this rule set to further additions, check it for masked rules,
     * and index it to optimize performance.
     * @exception IllegalArgumentException if some rules are masked
     */
    public void freeze() {
        /* Construct the rule array and index table.  We reorder the
         * rules by sorting them into 256 bins.  Each bin contains all
         * rules matching the index value for that bin.  A rule
         * matches an index value if string whose first key character
         * has a low byte equal to the index value can match the rule.
         *
         * Each bin contains zero or more rules, in the same order
         * they were found originally.  However, the total rules in
         * the bins may exceed the number in the original vector,
         * since rules that have a variable as their first key
         * character will generally fall into more than one bin.
         *
         * That is, each bin contains all rules that either have that
         * first index value as their first key character, or have
         * a set containing the index value as their first character.
         */
        int n = ruleVector.size();
        index = new int[257]; // [sic]
        List v = new ArrayList(2*n); // heuristic; adjust as needed

        /* Precompute the index values.  This saves a LOT of time.
         */
        int[] indexValue = new int[n];
        for (int j=0; j= 0) {
                    if (indexValue[j] == x) {
                        v.add(ruleVector.get(j));
                    }
                } else {
                    // If the indexValue is < 0, then the first key character is
                    // a set, and we must use the more time-consuming
                    // matchesIndexValue check.  In practice this happens
                    // rarely, so we seldom tread this code path.
                    TransliterationRule r = ruleVector.get(j);
                    if (r.matchesIndexValue(x)) {
                        v.add(r);
                    }
                }
            }
        }
        index[256] = v.size();

        /* Freeze things into an array.
         */
        rules = new TransliterationRule[v.size()];
        v.toArray(rules);

        StringBuilder errors = null;

        /* Check for masking.  This is MUCH faster than our old check,
         * which was each rule against each following rule, since we
         * only have to check for masking within each bin now.  It's
         * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
         * count, and n2 is the per-bin rule count.  But n2< " +
                                       UtilityExtensions.formatInput(text, pos));
                }
                return true;
            case UnicodeMatcher.U_PARTIAL_MATCH:
                if (Transliterator.DEBUG) {
                    System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
                                       rules[i].toRule(true) + " => " +
                                       UtilityExtensions.formatInput(text, pos));
                }
                return false;
                default:
                    if (Transliterator.DEBUG) {
                        System.out.println("Rule: no match " + rules[i]);
                    }
            }
        }
        // No match or partial match from any rule
        pos.start += UTF16.getCharCount(text.char32At(pos.start));
        if (Transliterator.DEBUG) {
            System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
                               UtilityExtensions.formatInput(text, pos));
        }
        return true;
    }

    /**
     * Create rule strings that represents this rule set.
     */
    String toRules(boolean escapeUnprintable) {
        int i;
        int count = ruleVector.size();
        StringBuilder ruleSource = new StringBuilder();
        for (i=0; i |b ; b > c ;
    // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
    void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet currentFilter = new UnicodeSet(filter);
        UnicodeSet revisiting = new UnicodeSet();
        int count = ruleVector.size();
        for (int i=0; i