All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.RuleBasedCollator Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.CharacterIterator;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Objects;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import com.ibm.icu.impl.ClassLoaderUtil;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Normalizer2Impl.ReorderingBuffer;
import com.ibm.icu.impl.coll.BOCSU;
import com.ibm.icu.impl.coll.Collation;
import com.ibm.icu.impl.coll.CollationCompare;
import com.ibm.icu.impl.coll.CollationData;
import com.ibm.icu.impl.coll.CollationFastLatin;
import com.ibm.icu.impl.coll.CollationIterator;
import com.ibm.icu.impl.coll.CollationKeys;
import com.ibm.icu.impl.coll.CollationKeys.SortKeyByteSink;
import com.ibm.icu.impl.coll.CollationLoader;
import com.ibm.icu.impl.coll.CollationRoot;
import com.ibm.icu.impl.coll.CollationSettings;
import com.ibm.icu.impl.coll.CollationTailoring;
import com.ibm.icu.impl.coll.ContractionsAndExpansions;
import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
import com.ibm.icu.impl.coll.SharedObject;
import com.ibm.icu.impl.coll.TailoredSet;
import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;

/**
 * 

* RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule * sets. RuleBasedCollator is designed to be fully compliant to the Unicode Collation Algorithm (UCA) and conforms to ISO 14651. * *

A Collator is thread-safe only when frozen. See {@link #isFrozen()} and {@link com.ibm.icu.util.Freezable}. * *

* Users are strongly encouraged to read the User * Guide for more information about the collation service before using this class. * *

* Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class * Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the * argument locale. If a customized collation ordering or attributes is required, use the RuleBasedCollator(String) * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on the CLDR root collation, while * re-adjusting the attributes and orders of the characters in the specified rule accordingly. * *

* RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale * is not available, the order eventually falls back to the * CLDR root sort order. * *

* For information about the collation rule syntax and details about customization, please refer to the Collation customization section of the * User Guide. * *

* Note that there are some differences between the Collation rule syntax used in Java and ICU4J: * *

    *
  • According to the JDK documentation:
    * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range * \U0E40-\U0E44 precedes a Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the range * \U0EC0-\U0EC4 precedes a Lao consonant of the range \U0E81-\U0EAE then the vowel is placed after the * consonant for collation purposes. *
    * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on. *
    *
    * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored.
  • *
  • As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported.
  • *
*

* Examples *

* Creating Customized RuleBasedCollators:

* *
 * String simple = "& a < b < c < d";
 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
 *
 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
 *                    + "< f , F < g , G < h , H < i , I < j , "
 *                    + "J < k , K < l , L < m , M < n , N < "
 *                    + "o , O < p , P < q , Q <r , R <s , S < "
 *                    + "t , T < u , U < v , V < w , W < x , X "
 *                    + "< y , Y < z , Z < \u00E5 = a\u030A "
 *                    + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
 *                    + ", \u00C6 < \u00F8 , \u00D8";
 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
 * 
* *
* * Concatenating rules to combine Collators:
* *
 * // Create an en_US Collator object
 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
 *     Collator.getInstance(new Locale("en", "US", ""));
 * // Create a da_DK Collator object
 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
 *     Collator.getInstance(new Locale("da", "DK", ""));
 * // Combine the two
 * // First, get the collation rules from en_USCollator
 * String en_USRules = en_USCollator.getRules();
 * // Second, get the collation rules from da_DKCollator
 * String da_DKRules = da_DKCollator.getRules();
 * RuleBasedCollator newCollator =
 *                             new RuleBasedCollator(en_USRules + da_DKRules);
 * // newCollator has the combined rules
 * 
* *
* * Making changes to an existing RuleBasedCollator to create a new Collator object, by appending changes to * the existing rule:
* *
 * // Create a new Collator object with additional rules
 * String addRules = "& C < ch, cH, Ch, CH";
 * RuleBasedCollator myCollator =
 *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
 * // myCollator contains the new rules
 * 
* *
* * How to change the order of non-spacing accents:
* *
 * // old rule with main accents
 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
 *                 + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
 *                 + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
 *                 + "; \u030B ; \u030C ; \u030D ; \u030E "
 *                 + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
 *                 + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
 *                 + "< b , B < c, C < e, E & C < d , D";
 * // change the order of accent characters
 * String addOn = "& \u0300 ; \u0308 ; \u0302";
 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
 * 
* *
* * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese * characters in the Japanese Collator:
* *
 * // get en_US Collator rules
 * RuleBasedCollator en_USCollator
 *                        = (RuleBasedCollator)Collator.getInstance(Locale.US);
 * // add a few Japanese characters to sort before English characters
 * // suppose the last character before the first base letter 'a' in
 * // the English collation rule is \u2212
 * String jaString = "& \u2212 <\u3041, \u3042 <\u3043, "
 *                   + "\u3044";
 * RuleBasedCollator myJapaneseCollator
 *              = new RuleBasedCollator(en_USCollator.getRules() + jaString);
 * 
* *
*

* This class is not subclassable * * @author Syn Wee Quek * @stable ICU 2.8 */ public final class RuleBasedCollator extends Collator { // public constructors --------------------------------------------------- /** *

* Constructor that takes the argument rules for customization. * The collator will be based on the CLDR root collation, with the * attributes and re-ordering of the characters specified in the argument rules. *

* See the User Guide's section on * Collation Customization for details on the rule syntax. * * @param rules * the collation rules to build the collation table from. * @exception ParseException * and IOException thrown. ParseException thrown when argument rules have an invalid syntax. * IOException thrown when an error occurred while reading internal data. * @stable ICU 2.8 */ public RuleBasedCollator(String rules) throws Exception { if (rules == null) { throw new IllegalArgumentException("Collation rules can not be null"); } validLocale = ULocale.ROOT; internalBuildTailoring(rules); } /** * Implements from-rule constructors. * @param rules rule string * @throws Exception */ private final void internalBuildTailoring(String rules) throws Exception { CollationTailoring base = CollationRoot.getRoot(); // Most code using Collator does not need to build a Collator from rules. // By using reflection, most code will not have a static dependency on the builder code. // CollationBuilder builder = new CollationBuilder(base); ClassLoader classLoader = ClassLoaderUtil.getClassLoader(getClass()); CollationTailoring t; try { Class builderClass = classLoader.loadClass("com.ibm.icu.impl.coll.CollationBuilder"); Object builder = builderClass.getConstructor(CollationTailoring.class).newInstance(base); // builder.parseAndBuild(rules); Method parseAndBuild = builderClass.getMethod("parseAndBuild", String.class); t = (CollationTailoring)parseAndBuild.invoke(builder, rules); } catch(InvocationTargetException e) { throw (Exception)e.getTargetException(); } t.actualLocale = null; adoptTailoring(t); } // public methods -------------------------------------------------------- /** * Clones the RuleBasedCollator * * @return a new instance of this RuleBasedCollator object * @stable ICU 2.8 */ @Override public Object clone() throws CloneNotSupportedException { if (isFrozen()) { return this; } return cloneAsThawed(); } private final void initMaxExpansions() { synchronized(tailoring) { if 
(tailoring.maxExpansions == null) { tailoring.maxExpansions = CollationElementIterator.computeMaxExpansions(tailoring.data); } } } /** * Return a CollationElementIterator for the given String. * * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(String source) { initMaxExpansions(); return new CollationElementIterator(source, this); } /** * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be * preserved since a new copy will be created for use. * * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(CharacterIterator source) { initMaxExpansions(); CharacterIterator newsource = (CharacterIterator) source.clone(); return new CollationElementIterator(newsource, this); } /** * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be * preserved since a new copy will be created for use. * * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(UCharacterIterator source) { initMaxExpansions(); return new CollationElementIterator(source, this); } // Freezable interface implementation ------------------------------------------------- /** * Determines whether the object has been frozen or not. * *

An unfrozen Collator is mutable and not thread-safe. * A frozen Collator is immutable and thread-safe. * * @stable ICU 4.8 */ @Override public boolean isFrozen() { return frozenLock != null; } /** * Freezes the collator. * @return the collator itself. * @stable ICU 4.8 */ @Override public Collator freeze() { if (!isFrozen()) { frozenLock = new ReentrantLock(); if (collationBuffer == null) { collationBuffer = new CollationBuffer(data); } } return this; } /** * Provides for the clone operation. Any clone is initially unfrozen. * @stable ICU 4.8 */ @Override public RuleBasedCollator cloneAsThawed() { try { RuleBasedCollator result = (RuleBasedCollator) super.clone(); // since all collation data in the RuleBasedCollator do not change // we can safely assign the result.fields to this collator // except in cases where we can't result.settings = settings.clone(); result.collationBuffer = null; result.frozenLock = null; return result; } catch (CloneNotSupportedException e) { // Clone is implemented return null; } } // public setters -------------------------------------------------------- private void checkNotFrozen() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen RuleBasedCollator"); } } private final CollationSettings getOwnedSettings() { return settings.copyOnWrite(); } private final CollationSettings getDefaultSettings() { return tailoring.settings.readOnly(); } /** * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a * correct JIS collation order, distinguishing between Katakana and Hiragana characters. * *

This attribute was an implementation detail of the CLDR Japanese tailoring. * Since ICU 50, this attribute is not settable any more via API functions. * Since CLDR 25/ICU 53, explicit quaternary relations are used * to achieve the same Japanese sort order. * * @param flag * true if Hiragana Quaternary mode is to be on, false otherwise * @see #setHiraganaQuaternaryDefault * @see #isHiraganaQuaternary * @deprecated ICU 50 Implementation detail, cannot be set via API, was removed from implementation. */ @Deprecated public void setHiraganaQuaternary(boolean flag) { checkNotFrozen(); } /** * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See * setHiraganaQuaternary(boolean) for more details. * *

This attribute was an implementation detail of the CLDR Japanese tailoring. * Since ICU 50, this attribute is not settable any more via API functions. * Since CLDR 25/ICU 53, explicit quaternary relations are used * to achieve the same Japanese sort order. * * @see #setHiraganaQuaternary(boolean) * @see #isHiraganaQuaternary * @deprecated ICU 50 Implementation detail, cannot be set via API, was removed from implementation. */ @Deprecated public void setHiraganaQuaternaryDefault() { checkNotFrozen(); } /** * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case * characters first. * * @param upperfirst * true to sort uppercase characters before lowercase characters, false to sort lowercase characters * before uppercase characters * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ public void setUpperCaseFirst(boolean upperfirst) { checkNotFrozen(); if (upperfirst == isUpperCaseFirst()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setCaseFirst(upperfirst ? CollationSettings.CASE_FIRST_AND_UPPER_MASK : 0); setFastLatinOptions(ownedSettings); } /** * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences. * * @param lowerfirst * true for sorting lower cased characters before upper cased characters, false to ignore case * preferences. 
* @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setUpperCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ public void setLowerCaseFirst(boolean lowerfirst) { checkNotFrozen(); if (lowerfirst == isLowerCaseFirst()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setCaseFirst(lowerfirst ? CollationSettings.CASE_FIRST : 0); setFastLatinOptions(ownedSettings); } /** * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details. * * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst(boolean) * @see #setUpperCaseFirst(boolean) * @stable ICU 2.8 */ public final void setCaseFirstDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setCaseFirstDefault(defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See * setAlternateHandling(boolean) for more details. * * @see #setAlternateHandlingShifted(boolean) * @see #isAlternateHandlingShifted() * @stable ICU 2.8 */ public void setAlternateHandlingDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setAlternateHandlingDefault(defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See * setCaseLevel(boolean) for more details. 
* * @see #setCaseLevel(boolean) * @see #isCaseLevel * @stable ICU 2.8 */ public void setCaseLevelDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlagDefault(CollationSettings.CASE_LEVEL, defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See * setDecomposition(int) for more details. * * @see #getDecomposition * @see #setDecomposition(int) * @stable ICU 2.8 */ public void setDecompositionDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlagDefault(CollationSettings.CHECK_FCD, defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See * setFrenchCollation(boolean) for more details. * * @see #isFrenchCollation * @see #setFrenchCollation(boolean) * @stable ICU 2.8 */ public void setFrenchCollationDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlagDefault(CollationSettings.BACKWARD_SECONDARY, defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See * setStrength(int) for more details. 
* * @see #setStrength(int) * @see #getStrength * @stable ICU 2.8 */ public void setStrengthDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setStrengthDefault(defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Method to set numeric collation to its default value. * * @see #getNumericCollation * @see #setNumericCollation * @stable ICU 2.8 */ public void setNumericCollationDefault() { checkNotFrozen(); CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlagDefault(CollationSettings.NUMERIC, defaultSettings.options); setFastLatinOptions(ownedSettings); } /** * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false, * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted * backwards. See the section on * French collation for more information. * * @param flag * true to set the French collation on, false to set it off * @stable ICU 2.8 * @see #isFrenchCollation * @see #setFrenchCollationDefault */ public void setFrenchCollation(boolean flag) { checkNotFrozen(); if(flag == isFrenchCollation()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlag(CollationSettings.BACKWARD_SECONDARY, flag); setFastLatinOptions(ownedSettings); } /** * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition * on Variable Weighting. This * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false, * corresponding to the NON_IGNORABLE mode in UCA. 
In the NON_IGNORABLE mode, the RuleBasedCollator treats all * the code points with non-ignorable primary weights in the same way. If the mode is set to true, the behavior * corresponds to SHIFTED defined in UCA, this causes code points with PRIMARY orders that are equal or below the * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order. * * @param shifted * true if SHIFTED behavior for alternate handling is desired, false for the NON_IGNORABLE behavior. * @see #isAlternateHandlingShifted * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ public void setAlternateHandlingShifted(boolean shifted) { checkNotFrozen(); if(shifted == isAlternateHandlingShifted()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setAlternateHandlingShifted(shifted); setFastLatinOptions(ownedSettings); } /** *

* When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value * is false, which means the case level is not generated. The contents of the case level are affected by the case * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable * case level. *

* See the section on case * level for more information. * * @param flag * true if case level sorting is required, false otherwise * @stable ICU 2.8 * @see #setCaseLevelDefault * @see #isCaseLevel */ public void setCaseLevel(boolean flag) { checkNotFrozen(); if(flag == isCaseLevel()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlag(CollationSettings.CASE_LEVEL, flag); setFastLatinOptions(ownedSettings); } /** * Sets the decomposition mode of this Collator. Setting this * decomposition attribute with CANONICAL_DECOMPOSITION allows the * Collator to handle un-normalized text properly, producing the * same results as if the text were normalized. If * NO_DECOMPOSITION is set, it is the user's responsibility to * insure that all text is already in the appropriate form before * a comparison or before getting a CollationKey. Adjusting * decomposition mode allows the user to select between faster and * more complete collation behavior. * *

Since a great many of the world's languages do not require * text normalization, most locales set NO_DECOMPOSITION as the * default decomposition mode. * * The default decompositon mode for the Collator is * NO_DECOMPOSITON, unless specified otherwise by the locale used * to create the Collator. * *

See getDecomposition for a description of decomposition * mode. * * @param decomposition the new decomposition mode * @see #getDecomposition * @see #NO_DECOMPOSITION * @see #CANONICAL_DECOMPOSITION * @throws IllegalArgumentException If the given value is not a valid * decomposition mode. * @stable ICU 2.8 */ @Override public void setDecomposition(int decomposition) { checkNotFrozen(); boolean flag; switch(decomposition) { case NO_DECOMPOSITION: flag = false; break; case CANONICAL_DECOMPOSITION: flag = true; break; default: throw new IllegalArgumentException("Wrong decomposition mode."); } if(flag == settings.readOnly().getFlag(CollationSettings.CHECK_FCD)) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlag(CollationSettings.CHECK_FCD, flag); setFastLatinOptions(ownedSettings); } /** * Sets this Collator's strength attribute. The strength attribute determines the minimum level of difference * considered significant during comparison. * *

See the Collator class description for an example of use. * * @param newStrength * the new strength value. * @see #getStrength * @see #setStrengthDefault * @see #PRIMARY * @see #SECONDARY * @see #TERTIARY * @see #QUATERNARY * @see #IDENTICAL * @exception IllegalArgumentException * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. * @stable ICU 2.8 */ @Override public void setStrength(int newStrength) { checkNotFrozen(); if(newStrength == getStrength()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setStrength(newStrength); setFastLatinOptions(ownedSettings); } /** * {@icu} Sets the variable top to the top of the specified reordering group. * The variable top determines the highest-sorting character * which is affected by the alternate handling behavior. * If that attribute is set to NON_IGNORABLE, then the variable top has no effect. * @param group one of Collator.ReorderCodes.SPACE, Collator.ReorderCodes.PUNCTUATION, * Collator.ReorderCodes.SYMBOL, Collator.ReorderCodes.CURRENCY; * or Collator.ReorderCodes.DEFAULT to restore the default max variable group * @return this * @see #getMaxVariable * @stable ICU 53 */ @Override public RuleBasedCollator setMaxVariable(int group) { // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1. 
int value; if(group == Collator.ReorderCodes.DEFAULT) { value = -1; // UCOL_DEFAULT } else if(Collator.ReorderCodes.FIRST <= group && group <= Collator.ReorderCodes.CURRENCY) { value = group - Collator.ReorderCodes.FIRST; } else { throw new IllegalArgumentException("illegal max variable group " + group); } int oldValue = settings.readOnly().getMaxVariable(); if(value == oldValue) { return this; } CollationSettings defaultSettings = getDefaultSettings(); if(settings.readOnly() == defaultSettings) { if(value < 0) { // UCOL_DEFAULT return this; } } CollationSettings ownedSettings = getOwnedSettings(); if(group == Collator.ReorderCodes.DEFAULT) { group = Collator.ReorderCodes.FIRST + defaultSettings.getMaxVariable(); } long varTop = data.getLastPrimaryForGroup(group); assert(varTop != 0); ownedSettings.setMaxVariable(value, defaultSettings.options); ownedSettings.variableTop = varTop; setFastLatinOptions(ownedSettings); return this; } /** * {@icu} Returns the maximum reordering group whose characters are affected by * the alternate handling behavior. * @return the maximum variable reordering group. * @see #setMaxVariable * @stable ICU 53 */ @Override public int getMaxVariable() { return Collator.ReorderCodes.FIRST + settings.readOnly().getMaxVariable(); } /** * {@icu} Sets the variable top to the primary weight of the specified string. * *

Beginning with ICU 53, the variable top is pinned to * the top of one of the supported reordering groups, * and it must not be beyond the last of those groups. * See {@link #setMaxVariable(int)}. * * @param varTop * one or more (if contraction) characters to which the variable top should be set * @return variable top primary weight * @exception IllegalArgumentException * is thrown if varTop argument is not a valid variable top element. A variable top element is * invalid when *

    *
  • it is a contraction that does not exist in the Collation order *
  • the variable top is beyond * the last reordering group supported by setMaxVariable() *
  • when the varTop argument is null or zero in length. *
* @see #getVariableTop * @see RuleBasedCollator#setAlternateHandlingShifted * @deprecated ICU 53 Call {@link #setMaxVariable(int)} instead. */ @Override @Deprecated public int setVariableTop(String varTop) { checkNotFrozen(); if (varTop == null || varTop.length() == 0) { throw new IllegalArgumentException("Variable top argument string can not be null or zero in length."); } boolean numeric = settings.readOnly().isNumeric(); long ce1, ce2; if(settings.readOnly().dontCheckFCD()) { UTF16CollationIterator ci = new UTF16CollationIterator(data, numeric, varTop, 0); ce1 = ci.nextCE(); ce2 = ci.nextCE(); } else { FCDUTF16CollationIterator ci = new FCDUTF16CollationIterator(data, numeric, varTop, 0); ce1 = ci.nextCE(); ce2 = ci.nextCE(); } if(ce1 == Collation.NO_CE || ce2 != Collation.NO_CE) { throw new IllegalArgumentException("Variable top argument string must map to exactly one collation element"); } internalSetVariableTop(ce1 >>> 32); return (int)settings.readOnly().variableTop; } /** * {@icu} Sets the variable top to the specified primary weight. * *

Beginning with ICU 53, the variable top is pinned to * the top of one of the supported reordering groups, * and it must not be beyond the last of those groups. * See {@link #setMaxVariable(int)}. * * @param varTop primary weight, as returned by setVariableTop or getVariableTop * @see #getVariableTop * @see #setVariableTop(String) * @deprecated ICU 53 Call setMaxVariable() instead. */ @Override @Deprecated public void setVariableTop(int varTop) { checkNotFrozen(); internalSetVariableTop(varTop & 0xffffffffL); } private void internalSetVariableTop(long varTop) { if(varTop != settings.readOnly().variableTop) { // Pin the variable top to the end of the reordering group which contains it. // Only a few special groups are supported. int group = data.getGroupForPrimary(varTop); if(group < Collator.ReorderCodes.FIRST || Collator.ReorderCodes.CURRENCY < group) { throw new IllegalArgumentException("The variable top must be a primary weight in " + "the space/punctuation/symbols/currency symbols range"); } long v = data.getLastPrimaryForGroup(group); assert(v != 0 && v >= varTop); varTop = v; if(varTop != settings.readOnly().variableTop) { CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setMaxVariable(group - Collator.ReorderCodes.FIRST, getDefaultSettings().options); ownedSettings.variableTop = varTop; setFastLatinOptions(ownedSettings); } } } /** * {@icu} When numeric collation is turned on, this Collator makes * substrings of digits sort according to their numeric values. * *

This is a way to get '100' to sort AFTER '2'. Note that the longest * digit substring that can be treated as a single unit is * 254 digits (not counting leading zeros). If a digit substring is * longer than that, the digits beyond the limit will be treated as a * separate digit substring. * *

A "digit" in this sense is a code point with General_Category=Nd, * which does not include circled numbers, roman numerals, etc. * Only a contiguous digit substring is considered, that is, * non-negative integers without separators. * There is no support for plus/minus signs, decimals, exponents, etc. * * @param flag * true to turn numeric collation on and false to turn it off * @see #getNumericCollation * @see #setNumericCollationDefault * @stable ICU 2.8 */ public void setNumericCollation(boolean flag) { checkNotFrozen(); // sort substrings of digits as numbers if(flag == getNumericCollation()) { return; } CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.setFlag(CollationSettings.NUMERIC, flag); setFastLatinOptions(ownedSettings); } /** * {@inheritDoc} * * @param order the reordering codes to apply to this collator; if this is null or an empty array * then this clears any existing reordering * @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts) * @see #getReorderCodes * @see Collator#getEquivalentReorderCodes * @see Collator.ReorderCodes * @see UScript * @stable ICU 4.8 */ @Override public void setReorderCodes(int... order) { checkNotFrozen(); int length = (order != null) ? order.length : 0; if(length == 1 && order[0] == ReorderCodes.NONE) { length = 0; } if(length == 0 ? 
settings.readOnly().reorderCodes.length == 0 : Arrays.equals(order, settings.readOnly().reorderCodes)) { return; } CollationSettings defaultSettings = getDefaultSettings(); if(length == 1 && order[0] == Collator.ReorderCodes.DEFAULT) { if(settings.readOnly() != defaultSettings) { CollationSettings ownedSettings = getOwnedSettings(); ownedSettings.copyReorderingFrom(defaultSettings); setFastLatinOptions(ownedSettings); } return; } CollationSettings ownedSettings = getOwnedSettings(); if(length == 0) { ownedSettings.resetReordering(); } else { ownedSettings.setReordering(data, order.clone()); } setFastLatinOptions(ownedSettings); } private void setFastLatinOptions(CollationSettings ownedSettings) { ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( data, ownedSettings, ownedSettings.fastLatinPrimaries); } // public getters -------------------------------------------------------- /** * Gets the collation tailoring rules for this RuleBasedCollator. * Equivalent to String getRules(false). * * @return the collation tailoring rules * @see #getRules(boolean) * @stable ICU 2.8 */ public String getRules() { return tailoring.getRules(); } /** * Returns current rules. * The argument defines whether full rules (root collation + tailored) rules are returned * or just the tailoring. * *

The root collation rules are an approximation of the root collator's sort order. * They are almost never used or useful at runtime and can be removed from the data. * See User Guide: * Collation Customization, Building on Existing Locales * *

{@link #getRules()} should normally be used instead. * @param fullrules * true if the rules that defines the full set of collation order is required, otherwise false for * returning only the tailored rules * @return the current rules that defines this Collator. * @see #getRules() * @stable ICU 2.6 */ public String getRules(boolean fullrules) { if (!fullrules) { return tailoring.getRules(); } return CollationLoader.getRootRules() + tailoring.getRules(); } /** * Get a UnicodeSet that contains all the characters and sequences tailored in this collator. * * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently * than in the root collator. * @stable ICU 2.4 */ @Override public UnicodeSet getTailoredSet() { UnicodeSet tailored = new UnicodeSet(); if(data.base != null) { new TailoredSet(tailored).forData(data); } return tailored; } /** * Gets unicode sets containing contractions and/or expansions of a collator * * @param contractions * if not null, set to contain contractions * @param expansions * if not null, set to contain expansions * @param addPrefixes * add the prefix contextual elements to contractions * @throws Exception * Throws an exception if any errors occurs. * @stable ICU 3.4 */ public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes) throws Exception { if (contractions != null) { contractions.clear(); } if (expansions != null) { expansions.clear(); } new ContractionsAndExpansions(contractions, expansions, null, addPrefixes).forData(data); } /** * Adds the contractions that start with character c to the set. * Ignores prefixes. Used by AlphabeticIndex. * @internal * @deprecated This API is ICU internal only. */ @Deprecated void internalAddContractions(int c, UnicodeSet set) { new ContractionsAndExpansions(set, null, null, false).forCodePoint(data, c); } /** *

* Get a Collation key for the argument String source from this RuleBasedCollator. *

* General recommendation:
* If comparison are to be done to the same String multiple times, it would be more efficient to generate * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If the each * Strings are compared to only once, using the method RuleBasedCollator.compare(String, String) will have a better * performance. *

* See the class documentation for an explanation about CollationKeys. * * @param source * the text String to be transformed into a collation key. * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source * String is null, a null CollationKey is returned. * @see CollationKey * @see #compare(String, String) * @see #getRawCollationKey * @stable ICU 2.8 */ @Override public CollationKey getCollationKey(String source) { if (source == null) { return null; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return getCollationKey(source, buffer); } finally { releaseCollationBuffer(buffer); } } private CollationKey getCollationKey(String source, CollationBuffer buffer) { buffer.rawCollationKey = getRawCollationKey(source, buffer.rawCollationKey, buffer); return new CollationKey(source, buffer.rawCollationKey); } /** * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the * result into the user provided argument key. If key has a internal byte array of length that's too small for the * result, the internal byte array will be grown to the exact required size. * * @param source the text String to be transformed into a RawCollationKey * @param key output RawCollationKey to store results * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user * provided key will be returned. 
* @see #getCollationKey * @see #compare(String, String) * @see RawCollationKey * @stable ICU 2.8 */ @Override public RawCollationKey getRawCollationKey(String source, RawCollationKey key) { if (source == null) { return null; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return getRawCollationKey(source, key, buffer); } finally { releaseCollationBuffer(buffer); } } private static final class CollationKeyByteSink extends SortKeyByteSink { CollationKeyByteSink(RawCollationKey key) { super(key.bytes); key_ = key; } @Override protected void AppendBeyondCapacity(byte[] bytes, int start, int n, int length) { // n > 0 && appended_ > capacity_ if (Resize(n, length)) { System.arraycopy(bytes, start, buffer_, length, n); } } @Override protected boolean Resize(int appendCapacity, int length) { int newCapacity = 2 * buffer_.length; int altCapacity = length + 2 * appendCapacity; if (newCapacity < altCapacity) { newCapacity = altCapacity; } if (newCapacity < 200) { newCapacity = 200; } // Do not call key_.ensureCapacity(newCapacity) because we do not // keep key_.size in sync with appended_. // We only set it when we are done. 
byte[] newBytes = new byte[newCapacity]; System.arraycopy(buffer_, 0, newBytes, 0, length); buffer_ = key_.bytes = newBytes; return true; } private RawCollationKey key_; } private RawCollationKey getRawCollationKey(CharSequence source, RawCollationKey key, CollationBuffer buffer) { if (key == null) { key = new RawCollationKey(simpleKeyLengthEstimate(source)); } else if (key.bytes == null) { key.bytes = new byte[simpleKeyLengthEstimate(source)]; } CollationKeyByteSink sink = new CollationKeyByteSink(key); writeSortKey(source, sink, buffer); key.size = sink.NumberOfBytesAppended(); return key; } private int simpleKeyLengthEstimate(CharSequence source) { return 2 * source.length() + 10; } private void writeSortKey(CharSequence s, CollationKeyByteSink sink, CollationBuffer buffer) { boolean numeric = settings.readOnly().isNumeric(); if(settings.readOnly().dontCheckFCD()) { buffer.leftUTF16CollIter.setText(numeric, s, 0); CollationKeys.writeSortKeyUpToQuaternary( buffer.leftUTF16CollIter, data.compressibleBytes, settings.readOnly(), sink, Collation.PRIMARY_LEVEL, CollationKeys.SIMPLE_LEVEL_FALLBACK, true); } else { buffer.leftFCDUTF16Iter.setText(numeric, s, 0); CollationKeys.writeSortKeyUpToQuaternary( buffer.leftFCDUTF16Iter, data.compressibleBytes, settings.readOnly(), sink, Collation.PRIMARY_LEVEL, CollationKeys.SIMPLE_LEVEL_FALLBACK, true); } if(settings.readOnly().getStrength() == IDENTICAL) { writeIdenticalLevel(s, sink); } sink.Append(Collation.TERMINATOR_BYTE); } private void writeIdenticalLevel(CharSequence s, CollationKeyByteSink sink) { // NFD quick check int nfdQCYesLimit = data.nfcImpl.decompose(s, 0, s.length(), null); sink.Append(Collation.LEVEL_SEPARATOR_BYTE); // Sync the ByteArrayWrapper size with the key length. sink.key_.size = sink.NumberOfBytesAppended(); int prev = 0; if(nfdQCYesLimit != 0) { prev = BOCSU.writeIdenticalLevelRun(prev, s, 0, nfdQCYesLimit, sink.key_); } // Is there non-NFD text? 
if(nfdQCYesLimit < s.length()) { int destLengthEstimate = s.length() - nfdQCYesLimit; StringBuilder nfd = new StringBuilder(); data.nfcImpl.decompose(s, nfdQCYesLimit, s.length(), nfd, destLengthEstimate); BOCSU.writeIdenticalLevelRun(prev, nfd, 0, nfd.length(), sink.key_); } // Sync the key with the buffer again which got bytes appended and may have been reallocated. sink.setBufferAndAppended(sink.key_.bytes, sink.key_.size); } /** * Returns the CEs for the string. * @param str the string * @internal for tests & tools * @deprecated This API is ICU internal only. */ @Deprecated public long[] internalGetCEs(CharSequence str) { CollationBuffer buffer = null; try { buffer = getCollationBuffer(); boolean numeric = settings.readOnly().isNumeric(); CollationIterator iter; if(settings.readOnly().dontCheckFCD()) { buffer.leftUTF16CollIter.setText(numeric, str, 0); iter = buffer.leftUTF16CollIter; } else { buffer.leftFCDUTF16Iter.setText(numeric, str, 0); iter = buffer.leftFCDUTF16Iter; } int length = iter.fetchCEs() - 1; assert length >= 0 && iter.getCE(length) == Collation.NO_CE; long[] ces = new long[length]; System.arraycopy(iter.getCEs(), 0, ces, 0, length); return ces; } finally { releaseCollationBuffer(buffer); } } /** * Returns this Collator's strength attribute. The strength attribute * determines the minimum level of difference considered significant. * *

{@icunote} This can return QUATERNARY strength, which is not supported by the * JDK version. * *

See the Collator class description for more details. * * @return this Collator's current strength attribute. * @see #setStrength * @see #PRIMARY * @see #SECONDARY * @see #TERTIARY * @see #QUATERNARY * @see #IDENTICAL * @stable ICU 2.8 */ @Override public int getStrength() { return settings.readOnly().getStrength(); } /** * Returns the decomposition mode of this Collator. The decomposition mode * determines how Unicode composed characters are handled. * *

See the Collator class description for more details. * * @return the decomposition mode * @see #setDecomposition * @see #NO_DECOMPOSITION * @see #CANONICAL_DECOMPOSITION * @stable ICU 2.8 */ @Override public int getDecomposition() { return (settings.readOnly().options & CollationSettings.CHECK_FCD) != 0 ? CANONICAL_DECOMPOSITION : NO_DECOMPOSITION; } /** * Return true if an uppercase character is sorted before the corresponding lowercase character. See * setCaseFirst(boolean) for details. * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isLowerCaseFirst * @see #setCaseFirstDefault * @return true if upper cased characters are sorted before lower cased characters, false otherwise * @stable ICU 2.8 */ public boolean isUpperCaseFirst() { return (settings.readOnly().getCaseFirst() == CollationSettings.CASE_FIRST_AND_UPPER_MASK); } /** * Return true if a lowercase character is sorted before the corresponding uppercase character. See * setCaseFirst(boolean) for details. * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isUpperCaseFirst * @see #setCaseFirstDefault * @return true lower cased characters are sorted before upper cased characters, false otherwise * @stable ICU 2.8 */ public boolean isLowerCaseFirst() { return (settings.readOnly().getCaseFirst() == CollationSettings.CASE_FIRST); } /** * Checks if the alternate handling behavior is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true, * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more * details. * * @return true or false * @see #setAlternateHandlingShifted(boolean) * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ public boolean isAlternateHandlingShifted() { return settings.readOnly().getAlternateHandling(); } /** * Checks if case level is set to true. See setCaseLevel(boolean) for details. 
* * @return the case level mode * @see #setCaseLevelDefault * @see #isCaseLevel * @see #setCaseLevel(boolean) * @stable ICU 2.8 */ public boolean isCaseLevel() { return (settings.readOnly().options & CollationSettings.CASE_LEVEL) != 0; } /** * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details. * * @return true if French Collation is set to true, false otherwise * @see #setFrenchCollation(boolean) * @see #setFrenchCollationDefault * @stable ICU 2.8 */ public boolean isFrenchCollation() { return (settings.readOnly().options & CollationSettings.BACKWARD_SECONDARY) != 0; } /** * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details. * *

This attribute was an implementation detail of the CLDR Japanese tailoring. * Since ICU 50, this attribute is not settable any more via API functions. * Since CLDR 25/ICU 53, explicit quaternary relations are used * to achieve the same Japanese sort order. * * @return false * @see #setHiraganaQuaternaryDefault * @see #setHiraganaQuaternary(boolean) * @deprecated ICU 50 Implementation detail, cannot be set via API, was removed from implementation. */ @Deprecated public boolean isHiraganaQuaternary() { return false; } /** * {@icu} Gets the variable top value of a Collator. * * @return the variable top primary weight * @see #getMaxVariable * @stable ICU 2.6 */ @Override public int getVariableTop() { return (int)settings.readOnly().variableTop; } /** * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2' * * @see #setNumericCollation * @see #setNumericCollationDefault * @return true if numeric collation is turned on, false otherwise * @stable ICU 2.8 */ public boolean getNumericCollation() { return (settings.readOnly().options & CollationSettings.NUMERIC) != 0; } /** * Retrieves the reordering codes for this collator. * These reordering codes are a combination of UScript codes and ReorderCodes. 
* @return a copy of the reordering codes for this collator; * if none are set then returns an empty array * @see #setReorderCodes * @see Collator#getEquivalentReorderCodes * @stable ICU 4.8 */ @Override public int[] getReorderCodes() { return settings.readOnly().reorderCodes.clone(); } // public other methods ------------------------------------------------- /** * {@inheritDoc} * @stable ICU 2.8 */ @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!super.equals(obj)) { return false; } RuleBasedCollator o = (RuleBasedCollator) obj; if(!settings.readOnly().equals(o.settings.readOnly())) { return false; } if(data == o.data) { return true; } boolean thisIsRoot = data.base == null; boolean otherIsRoot = o.data.base == null; assert(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be == if(thisIsRoot != otherIsRoot) { return false; } String theseRules = tailoring.getRules(); String otherRules = o.tailoring.getRules(); if((thisIsRoot || theseRules.length() != 0) && (otherIsRoot || otherRules.length() != 0)) { // Shortcut: If both collators have valid rule strings, then compare those. if(theseRules.equals(otherRules)) { return true; } } // Different rule strings can result in the same or equivalent tailoring. // The rule strings are optional in ICU resource bundles, although included by default. // cloneBinary() drops the rule string. UnicodeSet thisTailored = getTailoredSet(); UnicodeSet otherTailored = o.getTailoredSet(); if(!thisTailored.equals(otherTailored)) { return false; } // For completeness, we should compare all of the mappings; // or we should create a list of strings, sort it with one collator, // and check if both collators compare adjacent strings the same // (order & strength, down to quaternary); or similar. // Testing equality of collators seems unusual. return true; } /** * Generates a unique hash code for this RuleBasedCollator. 
* * @return the unique hash code for this Collator * @stable ICU 2.8 */ @Override public int hashCode() { int h = settings.readOnly().hashCode(); if(data.base == null) { return h; } // root collator // Do not rely on the rule string, see comments in operator==(). UnicodeSet set = getTailoredSet(); UnicodeSetIterator iter = new UnicodeSetIterator(set); while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { h ^= data.getCE32(iter.codepoint); } return h; } /** * Compares the source text String to the target text String according to the collation rules, strength and * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero * depending on whether the source String is less than, equal to or greater than the target String. See the Collator * class description for an example of use. *

* General recommendation:
* If comparison are to be done to the same String multiple times, it would be more efficient to generate * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed * performance is critical and object instantiation is to be reduced, further optimization may be achieved by * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key * comparisons. If the each Strings are compared to only once, using the method RuleBasedCollator.compare(String, * String) will have a better performance. * * @param source * the source text String. * @param target * the target text String. * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source * and target are equal, value is greater than zero if source is greater than target. * @see CollationKey * @see #getCollationKey * @stable ICU 2.8 */ @Override public int compare(String source, String target) { return doCompare(source, target); } /** * Abstract iterator for identical-level string comparisons. * Returns FCD code points and handles temporary switching to NFD. * *

As with CollationIterator, * Java NFDIterator instances are partially constructed and cached, * and completed when reset for use. * C++ NFDIterator instances are stack-allocated. */ private static abstract class NFDIterator { /** * Partial constructor, must call reset(). */ NFDIterator() {} final void reset() { index = -1; } /** * Returns the next code point from the internal normalization buffer, * or else the next text code point. * Returns -1 at the end of the text. */ final int nextCodePoint() { if(index >= 0) { if(index == decomp.length()) { index = -1; } else { int c = Character.codePointAt(decomp, index); index += Character.charCount(c); return c; } } return nextRawCodePoint(); } /** * @param nfcImpl * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint() * @return the first code point in c's decomposition, * or c itself if it was decomposed already or if it does not decompose */ final int nextDecomposedCodePoint(Normalizer2Impl nfcImpl, int c) { if(index >= 0) { return c; } decomp = nfcImpl.getDecomposition(c); if(decomp == null) { return c; } c = Character.codePointAt(decomp, 0); index = Character.charCount(c); return c; } /** * Returns the next text code point in FCD order. * Returns -1 at the end of the text. 
*/ protected abstract int nextRawCodePoint(); private String decomp; private int index; } private static class UTF16NFDIterator extends NFDIterator { UTF16NFDIterator() {} void setText(CharSequence seq, int start) { reset(); s = seq; pos = start; } @Override protected int nextRawCodePoint() { if(pos == s.length()) { return Collation.SENTINEL_CP; } int c = Character.codePointAt(s, pos); pos += Character.charCount(c); return c; } protected CharSequence s; protected int pos; } private static final class FCDUTF16NFDIterator extends UTF16NFDIterator { FCDUTF16NFDIterator() {} void setText(Normalizer2Impl nfcImpl, CharSequence seq, int start) { reset(); int spanLimit = nfcImpl.makeFCD(seq, start, seq.length(), null); if(spanLimit == seq.length()) { s = seq; pos = start; } else { if(str == null) { str = new StringBuilder(); } else { str.setLength(0); } str.append(seq, start, spanLimit); ReorderingBuffer buffer = new ReorderingBuffer(nfcImpl, str, seq.length() - start); nfcImpl.makeFCD(seq, spanLimit, seq.length(), buffer); s = str; pos = 0; } } private StringBuilder str; } private static final int compareNFDIter(Normalizer2Impl nfcImpl, NFDIterator left, NFDIterator right) { for(;;) { // Fetch the next FCD code point from each string. int leftCp = left.nextCodePoint(); int rightCp = right.nextCodePoint(); if(leftCp == rightCp) { if(leftCp < 0) { break; } continue; } // If they are different, then decompose each and compare again. if(leftCp < 0) { leftCp = -2; // end of string } else if(leftCp == 0xfffe) { leftCp = -1; // U+FFFE: merge separator } else { leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp); } if(rightCp < 0) { rightCp = -2; // end of string } else if(rightCp == 0xfffe) { rightCp = -1; // U+FFFE: merge separator } else { rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp); } if(leftCp < rightCp) { return Collation.LESS; } if(leftCp > rightCp) { return Collation.GREATER; } } return Collation.EQUAL; } /** * Compares two CharSequences. 
* @internal * @deprecated This API is ICU internal only. */ @Override @Deprecated protected int doCompare(CharSequence left, CharSequence right) { if(left == right) { return Collation.EQUAL; } // Identical-prefix test. int equalPrefixLength = 0; for(;;) { if(equalPrefixLength == left.length()) { if(equalPrefixLength == right.length()) { return Collation.EQUAL; } break; } else if(equalPrefixLength == right.length() || left.charAt(equalPrefixLength) != right.charAt(equalPrefixLength)) { break; } ++equalPrefixLength; } CollationSettings roSettings = settings.readOnly(); boolean numeric = roSettings.isNumeric(); if(equalPrefixLength > 0) { if((equalPrefixLength != left.length() && data.isUnsafeBackward(left.charAt(equalPrefixLength), numeric)) || (equalPrefixLength != right.length() && data.isUnsafeBackward(right.charAt(equalPrefixLength), numeric))) { // Identical prefix: Back up to the start of a contraction or reordering sequence. while(--equalPrefixLength > 0 && data.isUnsafeBackward(left.charAt(equalPrefixLength), numeric)) {} } // Notes: // - A longer string can compare equal to a prefix of it if only ignorables follow. // - With a backward level, a longer string can compare less-than a prefix of it. // Pass the actual start of each string into the CollationIterators, // plus the equalPrefixLength position, // so that prefix matches back into the equal prefix work. 
} int result; int fastLatinOptions = roSettings.fastLatinOptions; if(fastLatinOptions >= 0 && (equalPrefixLength == left.length() || left.charAt(equalPrefixLength) <= CollationFastLatin.LATIN_MAX) && (equalPrefixLength == right.length() || right.charAt(equalPrefixLength) <= CollationFastLatin.LATIN_MAX)) { result = CollationFastLatin.compareUTF16(data.fastLatinTable, roSettings.fastLatinPrimaries, fastLatinOptions, left, right, equalPrefixLength); } else { result = CollationFastLatin.BAIL_OUT_RESULT; } if(result == CollationFastLatin.BAIL_OUT_RESULT) { CollationBuffer buffer = null; try { buffer = getCollationBuffer(); if(roSettings.dontCheckFCD()) { buffer.leftUTF16CollIter.setText(numeric, left, equalPrefixLength); buffer.rightUTF16CollIter.setText(numeric, right, equalPrefixLength); result = CollationCompare.compareUpToQuaternary( buffer.leftUTF16CollIter, buffer.rightUTF16CollIter, roSettings); } else { buffer.leftFCDUTF16Iter.setText(numeric, left, equalPrefixLength); buffer.rightFCDUTF16Iter.setText(numeric, right, equalPrefixLength); result = CollationCompare.compareUpToQuaternary( buffer.leftFCDUTF16Iter, buffer.rightFCDUTF16Iter, roSettings); } } finally { releaseCollationBuffer(buffer); } } if(result != Collation.EQUAL || roSettings.getStrength() < Collator.IDENTICAL) { return result; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); // Compare identical level. 
Normalizer2Impl nfcImpl = data.nfcImpl; if(roSettings.dontCheckFCD()) { buffer.leftUTF16NFDIter.setText(left, equalPrefixLength); buffer.rightUTF16NFDIter.setText(right, equalPrefixLength); return compareNFDIter(nfcImpl, buffer.leftUTF16NFDIter, buffer.rightUTF16NFDIter); } else { buffer.leftFCDUTF16NFDIter.setText(nfcImpl, left, equalPrefixLength); buffer.rightFCDUTF16NFDIter.setText(nfcImpl, right, equalPrefixLength); return compareNFDIter(nfcImpl, buffer.leftFCDUTF16NFDIter, buffer.rightFCDUTF16NFDIter); } } finally { releaseCollationBuffer(buffer); } }

    // package private constructors ------------------------------------------

    /**
     * Creates a collator on top of an existing tailoring.
     * The collation data is shared with the tailoring, while the settings are
     * obtained via {@code t.settings.clone()}.
     *
     * @param t the tailoring that supplies the data and the initial settings
     * @param vl the locale stored as this collator's valid locale
     */
    RuleBasedCollator(CollationTailoring t, ULocale vl) {
        data = t.data;
        settings = t.settings.clone();
        tailoring = t;
        validLocale = vl;
        actualLocaleIsSameAsValid = false;
    }

    /**
     * Initializes this collator from the given tailoring.
     * Asserts that {@code settings}, {@code data} and {@code tailoring} are all
     * still null. Unlike the constructor above, the valid locale is taken from
     * {@code t.actualLocale}.
     *
     * @param t the tailoring that supplies the data, settings and locale
     */
    private void adoptTailoring(CollationTailoring t) {
        assert(settings == null && data == null && tailoring == null);
        data = t.data;
        settings = t.settings.clone();
        tailoring = t;
        validLocale = t.actualLocale;
        actualLocaleIsSameAsValid = false;
    }

    // package private methods -----------------------------------------------

    /**
     * Tests whether a character is "unsafe" for use as a collation starting point.
     * Delegates to {@code data.isUnsafeBackward()}, passing the numeric flag
     * read from the current settings.
     *
     * @param c code point or code unit
     * @return true if c is unsafe
     * @see CollationElementIterator#setOffset(int)
     */
    final boolean isUnsafe(int c) {
        return data.isUnsafeBackward(c, settings.readOnly().isNumeric());
    }

    /**
     * Frozen state of the collator.
     * This lock is acquired in {@link #getCollationBuffer()} and released in
     * {@code releaseCollationBuffer()}, in both cases only while
     * {@code isFrozen()} returns true.
     */
    private Lock frozenLock;

    /**
     * Bundle of reusable left/right iterator pairs, one pair per iterator type,
     * plus a raw collation key slot.
     */
    private static final class CollationBuffer {
        /**
         * Allocates all eight iterators. Only the four collation iterators
         * receive the collation data; {@code rawCollationKey} is not assigned here.
         *
         * @param data the collation data handed to the collation iterators
         */
        private CollationBuffer(CollationData data) {
            leftUTF16CollIter = new UTF16CollationIterator(data);
            rightUTF16CollIter = new UTF16CollationIterator(data);
            leftFCDUTF16Iter = new FCDUTF16CollationIterator(data);
            rightFCDUTF16Iter = new FCDUTF16CollationIterator(data);
            leftUTF16NFDIter = new UTF16NFDIterator();
            rightUTF16NFDIter = new UTF16NFDIterator();
            leftFCDUTF16NFDIter = new FCDUTF16NFDIterator();
            rightFCDUTF16NFDIter = new FCDUTF16NFDIterator();
        }

        UTF16CollationIterator leftUTF16CollIter;
        UTF16CollationIterator rightUTF16CollIter;
        FCDUTF16CollationIterator leftFCDUTF16Iter;
        FCDUTF16CollationIterator rightFCDUTF16Iter;
        UTF16NFDIterator leftUTF16NFDIter;
        UTF16NFDIterator rightUTF16NFDIter;
        FCDUTF16NFDIterator leftFCDUTF16NFDIter;
        FCDUTF16NFDIterator rightFCDUTF16NFDIter;
        RawCollationKey rawCollationKey;
    }

    /**
     * Get the version of this collator object.
     * The result combines {@code tailoring.version} with the major number of
     * {@code VersionInfo.UCOL_RUNTIME_VERSION}.
     *
     * @return the version object associated with this collator
     * @stable ICU 2.8
     */
    @Override
    public VersionInfo getVersion() {
        int version = tailoring.version;
        int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
        // First argument: the top byte of the tailoring version plus the runtime
        // major shifted both ways. The other three arguments are the remaining
        // three bytes of the tailoring version, from high to low.
        return VersionInfo.getInstance(
                (version >>> 24) + (rtVersion << 4) + (rtVersion >> 4),
                ((version >> 16) & 0xff), ((version >> 8) & 0xff), (version & 0xff));
    }

    /**
     * Get the UCA version of this collator object.
     * The value is decoded from the minor and milli fields of {@link #getVersion()}.
     *
     * @return the version object associated with this collator
     * @stable ICU 2.8
     */
    @Override
    public VersionInfo getUCAVersion() {
        VersionInfo v = getVersion();
        // Note: This is tied to how the current implementation encodes the UCA version
        // in the overall getVersion().
        // Alternatively, we could load the root collator and get at lower-level data from there.
        // Either way, it will reflect the input collator's UCA version only
        // if it is a known implementation.
        // (C++ comment) It would be cleaner to make this a virtual Collator method.
        // (In Java, it is virtual.)
        return VersionInfo.getInstance(v.getMinor() >> 3, v.getMinor() & 7, v.getMilli() >> 6, 0);
    }

    /** Buffer handed out by {@link #getCollationBuffer()}; created lazily there while not frozen. */
    private CollationBuffer collationBuffer;

    /**
     * Returns the collation buffer of this collator.
     * While the collator is frozen, this acquires {@code frozenLock} and returns
     * the existing buffer without creating one. Otherwise the buffer is created
     * on first use and no lock is taken.
     * Callers in this file pair this with {@code releaseCollationBuffer()} in a
     * {@code finally} clause.
     * NOTE(review): in the frozen case the field is presumably populated when
     * the collator is frozen; that code is outside this section, so confirm.
     *
     * @return the value of the {@code collationBuffer} field
     */
    private final CollationBuffer getCollationBuffer() {
        if (isFrozen()) {
            frozenLock.lock();
        } else if (collationBuffer == null) {
            collationBuffer = new CollationBuffer(data);
        }
        return collationBuffer;
    }

    /**
     * Counterpart of {@link #getCollationBuffer()}: releases {@code frozenLock}
     * if the collator is frozen, and does nothing otherwise.
     *
     * @param buffer the buffer being released; not used by this method
     */
    private final void releaseCollationBuffer(CollationBuffer buffer) {
        if (isFrozen()) {
            frozenLock.unlock();
        }
    }

    /**
     * {@inheritDoc}
     * @draft ICU 53 (retain)
     * @provisional This API might change or be removed in a future release.
     */
    @Override
    public ULocale getLocale(ULocale.Type type) {
        if (type == ULocale.ACTUAL_LOCALE) {
            // The flag set by setLocale() selects between the two stored locales.
            return actualLocaleIsSameAsValid ? validLocale : tailoring.actualLocale;
        } else if(type == ULocale.VALID_LOCALE) {
            return validLocale;
        } else {
            throw new IllegalArgumentException("unknown ULocale.Type " + type);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    void setLocale(ULocale valid, ULocale actual) {
        // This method is called by other protected functions that check
        // that valid and actual are not null before passing them in.
        // Here we only assert that they are either both null or both non-null.
        assert (valid == null) == (actual == null);
        // Another check we could do is that the actual locale is at
        // the same level or less specific than the valid locale.
        if(Objects.equals(actual, tailoring.actualLocale)) {
            actualLocaleIsSameAsValid = false;
        } else {
            // An actual locale that differs from the tailoring's must equal the valid locale.
            assert(Objects.equals(actual, valid));
            actualLocaleIsSameAsValid = true;
        }
        // Do not modify tailoring.actualLocale:
        // We cannot be sure that that would be thread-safe.
        validLocale = valid;
    }

    CollationData data;
    // NOTE(review): declared as a raw type here; the type argument was presumably
    // lost when this text was extracted - confirm against the upstream source.
    SharedObject.Reference settings;  // reference-counted
    CollationTailoring tailoring;  // C++: reference-counted
    private ULocale validLocale;
    // Note: No need in Java to track which attributes have been set explicitly.
    // int or EnumSet explicitlySetAttributes;

    // True when setLocale() received an actual locale that differs from
    // tailoring.actualLocale; getLocale(ACTUAL_LOCALE) then returns validLocale.
    private boolean actualLocaleIsSameAsValid;
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy