All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.RuleBasedTransliterator Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.util.HashMap;
import java.util.Map;

/**
 * RuleBasedTransliterator is a transliterator
 * built from a set of rules as defined for
 * {@link Transliterator#createFromRules(String, String, int)}.
 * See the class {@link Transliterator} documentation for the rule syntax.
 *
 * @author Alan Liu
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public class RuleBasedTransliterator extends Transliterator {

    private final Data data;

//    /**
//     * Constructs a new transliterator from the given rules.
//     * @param rules rules, separated by ';'
//     * @param direction either FORWARD or REVERSE.
//     * @exception IllegalArgumentException if rules are malformed
//     * or direction is invalid.
//     */
//     public RuleBasedTransliterator(String ID, String rules, int direction,
//                                   UnicodeFilter filter) {
//        super(ID, filter);
//        if (direction != FORWARD && direction != REVERSE) {
//            throw new IllegalArgumentException("Invalid direction");
//        }
//
//        TransliteratorParser parser = new TransliteratorParser();
//        parser.parse(rules, direction);
//        if (parser.idBlockVector.size() != 0 ||
//            parser.compoundFilter != null) {
//            throw new IllegalArgumentException("::ID blocks illegal in RuleBasedTransliterator constructor");
//        }
//
//        data = (Data)parser.dataVector.get(0);
//        setMaximumContextLength(data.ruleSet.getMaximumContextLength());
//     }

//    /**
//     * Constructs a new transliterator from the given rules in the
//     * FORWARD direction.
//     * @param rules rules, separated by ';'
//     * @exception IllegalArgumentException if rules are malformed
//     * or direction is invalid.
//     */
//    public RuleBasedTransliterator(String ID, String rules) {
//        this(ID, rules, FORWARD, null);
//    }

    RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
        super(ID, filter);
        this.data = data;
        setMaximumContextLength(data.ruleSet.getMaximumContextLength());
    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Override
    @Deprecated
    protected void handleTransliterate(Replaceable text,
                                       Position index, boolean incremental) {
        /* We keep start and limit fixed the entire time,
         * relative to the text -- limit may move numerically if text is
         * inserted or removed.  The cursor moves from start to limit, with
         * replacements happening under it.
         *
         * Example: rules 1. ab>x|y
         *                2. yc>z
         *
         * |eabcd   start - no match, advance cursor
         * e|abcd   match rule 1 - change text & adjust cursor
         * ex|ycd   match rule 2 - change text & adjust cursor
         * exz|d    no match, advance cursor
         * exzd|    done
         */

        /* A rule like
         *   a>b|a
         * creates an infinite loop. To prevent that, we put an arbitrary
         * limit on the number of iterations that we take, one that is
         * high enough that any reasonable rules are ok, but low enough to
         * prevent a server from hanging.  The limit is 16 times the
         * number of characters n, unless n is so large that 16n exceeds a
         * uint32_t.
         */
        synchronized(data)  {
            int loopCount = 0;
            int loopLimit = (index.limit - index.start) << 4;
            if (loopLimit < 0) {
                loopLimit = 0x7FFFFFFF;
            }

            while (index.start < index.limit &&
                    loopCount <= loopLimit &&
                    data.ruleSet.transliterate(text, index, incremental)) {
                ++loopCount;
            }
        }
    }


    static class Data {
        public Data() {
            variableNames = new HashMap<>();
            ruleSet = new TransliterationRuleSet();
        }

        /**
         * Rule table.  May be empty.
         */
        public TransliterationRuleSet ruleSet;

        /**
         * Map variable name (String) to variable (char[]).  A variable name
         * corresponds to zero or more characters, stored in a char[] array in
         * this hash.  One or more of these chars may also correspond to a
         * UnicodeSet, in which case the character in the char[] in this hash is
         * a stand-in: it is an index for a secondary lookup in
         * data.variables.  The stand-in also represents the UnicodeSet in
         * the stored rules.
         */
        Map variableNames;

        /**
         * Map category variable (Character) to UnicodeMatcher or UnicodeReplacer.
         * Variables that correspond to a set of characters are mapped
         * from variable name to a stand-in character in data.variableNames.
         * The stand-in then serves as a key in this hash to lookup the
         * actual UnicodeSet object.  In addition, the stand-in is
         * stored in the rule text to represent the set of characters.
         * variables[i] represents character (variablesBase + i).
         */
        Object[] variables;

        /**
         * The character that represents variables[0].  Characters
         * variablesBase through variablesBase +
         * variables.length - 1 represent UnicodeSet objects.
         */
        char variablesBase;

        /**
         * Return the UnicodeMatcher represented by the given character, or
         * null if none.
         */
        public UnicodeMatcher lookupMatcher(int standIn) {
            int i = standIn - variablesBase;
            return (i >= 0 && i < variables.length)
                ? (UnicodeMatcher) variables[i] : null;
        }

        /**
         * Return the UnicodeReplacer represented by the given character, or
         * null if none.
         */
        public UnicodeReplacer lookupReplacer(int standIn) {
            int i = standIn - variablesBase;
            return (i >= 0 && i < variables.length)
                ? (UnicodeReplacer) variables[i] : null;
        }
    }


    /**
     * Return a representation of this transliterator as source rules.
     * These rules will produce an equivalent transliterator if used
     * to construct a new transliterator.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are those other than
     * U+000A, U+0020..U+007E.
     * @return rules string
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Override
    @Deprecated
    public String toRules(boolean escapeUnprintable) {
        return data.ruleSet.toRules(escapeUnprintable);
    }

//    /**
//     * Return the set of all characters that may be modified by this
//     * Transliterator, ignoring the effect of our filter.
//     */
//    protected UnicodeSet handleGetSourceSet() {
//        return data.ruleSet.getSourceTargetSet(false, unicodeFilter);
//    }
//
//    /**
//     * Returns the set of all characters that may be generated as
//     * replacement text by this transliterator.
//     */
//    public UnicodeSet getTargetSet() {
//        return data.ruleSet.getSourceTargetSet(true, unicodeFilter);
//    }

    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    @Override
    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet);
    }

    /**
     * Temporary hack for registry problem. Needs to be replaced by better architecture.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public Transliterator safeClone() {
        UnicodeFilter filter = getFilter();
        if (filter != null && filter instanceof UnicodeSet) {
            filter = new UnicodeSet((UnicodeSet)filter);
        }
        return new RuleBasedTransliterator(getID(), data, filter);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy