All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.RuleBasedCollator Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/**
 *******************************************************************************
 * Copyright (C) 1996-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import com.ibm.icu.impl.BOCU;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.ImplicitCEGenerator;
import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.impl.StringUCharacterIterator;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.TrieIterator;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.VersionInfo;

/**
 * 

* RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule * sets. RuleBasedCollator is designed to be fully compliant to the Unicode Collation Algorithm (UCA) and conforms to ISO 14651. *

* *

* Users are strongly encouraged to read the users * guide for more information about the collation service before using this class. *

* *

* Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the argument locale. If a customized collation ordering or customized attributes are required, use the RuleBasedCollator(String) constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while re-adjusting the attributes and orders of the characters in the specified rule accordingly.

* *

* RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale is not available, the order eventually falls back to the UCA collation order.

* *

* For information about the collation rule syntax and details about customization, please refer to the Collation customization section of the * user's guide. *

* *

* Note that there are some differences between the Collation rule syntax used in Java and ICU4J: * *

    *
  • According to the JDK documentation: *

    * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range * \U0E40-\U0E44 precedes a Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the range * \U0EC0-\U0EC4 precedes a Lao consonant of the range \U0E81-\U0EAE then the vowel is placed after the * consonant for collation purposes. *

    *

    * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on. *

    *
    *

    * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored. *

    *
  • As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported. *
*

* Examples *

*

* Creating Customized RuleBasedCollators:

* *
 * String simple = "& a < b < c < d";
 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
 *
 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
 *                    + "< f , F < g , G < h , H < i , I < j , "
 *                    + "J < k , K < l , L < m , M < n , N < "
 *                    + "o , O < p , P < q , Q < r , R < s , S < "
 *                    + "t , T < u , U < v , V < w , W < x , X "
 *                    + "< y , Y < z , Z < \u00E5 = a\u030A "
 *                    + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
 *                    + ", \u00C6 < \u00F8 , \u00D8";
 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
 * 
* *
* * Concatenating rules to combine Collators:
* *
 * // Create an en_US Collator object
 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
 *     Collator.getInstance(new Locale("en", "US", ""));
 * // Create a da_DK Collator object
 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
 *     Collator.getInstance(new Locale("da", "DK", ""));
 * // Combine the two
 * // First, get the collation rules from en_USCollator
 * String en_USRules = en_USCollator.getRules();
 * // Second, get the collation rules from da_DKCollator
 * String da_DKRules = da_DKCollator.getRules();
 * RuleBasedCollator newCollator =
 *                             new RuleBasedCollator(en_USRules + da_DKRules);
 * // newCollator has the combined rules
 * 
* *
* * Making changes to an existing RuleBasedCollator to create a new Collator object, by appending changes to * the existing rule:
* *
 * // Create a new Collator object with additional rules
 * String addRules = "& C < ch, cH, Ch, CH";
 * RuleBasedCollator myCollator =
 *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
 * // myCollator contains the new rules
 * 
* *
* * How to change the order of non-spacing accents:
* *
 * // old rule with main accents
 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
 *                 + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
 *                 + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
 *                 + "; \u030B ; \u030C ; \u030D ; \u030E "
 *                 + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
 *                 + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
 *                 + "< b , B < c, C < e, E & C < d , D";
 * // change the order of accent characters
 * String addOn = "& \u0300 ; \u0308 ; \u0302";
 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
 * 
* *
* * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese * characters in the Japanese Collator:
* *
 * // get en_US Collator rules
 * RuleBasedCollator en_USCollator
 *                        = (RuleBasedCollator)Collator.getInstance(Locale.US);
 * // add a few Japanese characters to sort before English characters
 * // suppose the last character before the first base letter 'a' in
 * // the English collation rule is \u2212
 * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
 *                   + "\u3044";
 * RuleBasedCollator myJapaneseCollator
 *              = new RuleBasedCollator(en_USCollator.getRules() + jaString);
 * 
* *
*

*

* This class is not subclassable *

* * @author Syn Wee Quek * @stable ICU 2.8 */ public final class RuleBasedCollator extends Collator { // public constructors --------------------------------------------------- /** *

* Constructor that takes the argument rules for customization. The collator will be based on UCA, with the * attributes and re-ordering of the characters specified in the argument rules. *

*

* See the user guide's section on * Collation Customization for details on the rule syntax. *

* * @param rules * the collation rules to build the collation table from. * @exception ParseException * and IOException thrown. ParseException thrown when argument rules have an invalid syntax. * IOException thrown when an error occured while reading internal data. * @stable ICU 2.8 */ public RuleBasedCollator(String rules) throws Exception { checkUCA(); if (rules == null) { throw new IllegalArgumentException("Collation rules can not be null"); } init(rules); } // public methods -------------------------------------------------------- /** * Clones the RuleBasedCollator * * @return a new instance of this RuleBasedCollator object * @stable ICU 2.8 */ public Object clone() throws CloneNotSupportedException { return clone(isFrozen()); } /** * Clones the RuleBasedCollator * * @param frozen should the clone be frozen or not * @return a new instance of this RuleBasedCollator object */ private Object clone(boolean frozen) throws CloneNotSupportedException { //TODO: once buffer and threading issue is resolved have frozen clone just return itself RuleBasedCollator result = (RuleBasedCollator) super.clone(); if (latinOneCEs_ != null) { result.m_reallocLatinOneCEs_ = true; result.m_ContInfo_ = new ContractionInfo(); } // since all collation data in the RuleBasedCollator do not change // we can safely assign the result.fields to this collator // except in cases where we can't result.collationBuffer = null; result.frozenLock = frozen ? new ReentrantLock() : null; return result; } /** * Return a CollationElementIterator for the given String. * * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(String source) { return new CollationElementIterator(source, this); } /** * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be * preserved since a new copy will be created for use. 
* * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(CharacterIterator source) { CharacterIterator newsource = (CharacterIterator) source.clone(); return new CollationElementIterator(newsource, this); } /** * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be * preserved since a new copy will be created for use. * * @see CollationElementIterator * @stable ICU 2.8 */ public CollationElementIterator getCollationElementIterator(UCharacterIterator source) { return new CollationElementIterator(source, this); } // Freezable interface implementation ------------------------------------------------- /** * Determines whether the object has been frozen or not. * @stable ICU 4.8 */ public boolean isFrozen() { return frozenLock != null; } /** * Freezes the collator. * @return the collator itself. * @stable ICU 4.8 */ public Collator freeze() { if (!isFrozen()) { frozenLock = new ReentrantLock(); } return this; } /** * Provides for the clone operation. Any clone is initially unfrozen. * @stable ICU 4.8 */ public RuleBasedCollator cloneAsThawed() { RuleBasedCollator clone = null; try { clone = (RuleBasedCollator) clone(false); } catch (CloneNotSupportedException e) { // Clone is implemented } return clone; } // public setters -------------------------------------------------------- /** * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a * correct JIS collation order, distinguishing between Katakana and Hiragana characters. * * This attribute is an implementation detail of the CLDR Japanese tailoring. * The implementation might change to use a different mechanism * to achieve the same Japanese sort order. * Since ICU 50, this attribute is not settable any more via API functions. 
* * @param flag * true if Hiragana Quaternary mode is to be on, false otherwise * @see #setHiraganaQuaternaryDefault * @see #isHiraganaQuaternary * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation. */ public void setHiraganaQuaternary(boolean flag) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } } /** * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See * setHiraganaQuaternary(boolean) for more details. * * This attribute is an implementation detail of the CLDR Japanese tailoring. * The implementation might change to use a different mechanism * to achieve the same Japanese sort order. * Since ICU 50, this attribute is not settable any more via API functions. * * @see #setHiraganaQuaternary(boolean) * @see #isHiraganaQuaternary * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation. */ public void setHiraganaQuaternaryDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } } /** * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case * characters first. 
* * @param upperfirst * true to sort uppercase characters before lowercase characters, false to sort lowercase characters * before uppercase characters * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ public void setUpperCaseFirst(boolean upperfirst) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (upperfirst) { if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.UPPER_FIRST_; } else { if (m_caseFirst_ != AttributeValue.OFF_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.OFF_; } updateInternalState(); } /** * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences. * * @param lowerfirst * true for sorting lower cased characters before upper cased characters, false to ignore case * preferences. * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setUpperCaseFirst * @see #setCaseFirstDefault * @stable ICU 2.8 */ public void setLowerCaseFirst(boolean lowerfirst) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (lowerfirst) { if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.LOWER_FIRST_; } else { if (m_caseFirst_ != AttributeValue.OFF_) { latinOneRegenTable_ = true; } m_caseFirst_ = AttributeValue.OFF_; } updateInternalState(); } /** * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details. 
* * @see #isLowerCaseFirst * @see #isUpperCaseFirst * @see #setLowerCaseFirst(boolean) * @see #setUpperCaseFirst(boolean) * @stable ICU 2.8 */ public final void setCaseFirstDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (m_caseFirst_ != m_defaultCaseFirst_) { latinOneRegenTable_ = true; } m_caseFirst_ = m_defaultCaseFirst_; updateInternalState(); } /** * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See * setAlternateHandling(boolean) for more details. * * @see #setAlternateHandlingShifted(boolean) * @see #isAlternateHandlingShifted() * @stable ICU 2.8 */ public void setAlternateHandlingDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; updateInternalState(); } /** * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See * setCaseLevel(boolean) for more details. * * @see #setCaseLevel(boolean) * @see #isCaseLevel * @stable ICU 2.8 */ public void setCaseLevelDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } m_isCaseLevel_ = m_defaultIsCaseLevel_; updateInternalState(); } /** * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See * setDecomposition(int) for more details. * * @see #getDecomposition * @see #setDecomposition(int) * @stable ICU 2.8 */ public void setDecompositionDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } setDecomposition(m_defaultDecomposition_); updateInternalState(); } /** * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See * setFrenchCollation(boolean) for more details. 
* * @see #isFrenchCollation * @see #setFrenchCollation(boolean) * @stable ICU 2.8 */ public void setFrenchCollationDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) { latinOneRegenTable_ = true; } m_isFrenchCollation_ = m_defaultIsFrenchCollation_; updateInternalState(); } /** * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See * setStrength(int) for more details. * * @see #setStrength(int) * @see #getStrength * @stable ICU 2.8 */ public void setStrengthDefault() { setStrength(m_defaultStrength_); updateInternalState(); } /** * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER * '2' * * @see #getNumericCollation * @see #setNumericCollation * @stable ICU 2.8 */ public void setNumericCollationDefault() { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } setNumericCollation(m_defaultIsNumericCollation_); updateInternalState(); } /** * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false, * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted * backwards. See the section on * French collation for more information. 
* * @param flag * true to set the French collation on, false to set it off * @stable ICU 2.8 * @see #isFrenchCollation * @see #setFrenchCollationDefault */ public void setFrenchCollation(boolean flag) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (m_isFrenchCollation_ != flag) { latinOneRegenTable_ = true; } m_isFrenchCollation_ = flag; updateInternalState(); } /** * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition * on Alternate Weighting. This * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false, * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treats all * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order. * * @param shifted * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour. * @see #isAlternateHandlingShifted * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ public void setAlternateHandlingShifted(boolean shifted) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } m_isAlternateHandlingShifted_ = shifted; updateInternalState(); } /** *

* When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value * is false, which means the case level is not generated. The contents of the case level are affected by the case * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable * case level. *

*

* See the section on case * level for more information. *

*
     * @param flag
     *            true if case level sorting is required, false otherwise
     * @stable ICU 2.8
     * @see #setCaseLevelDefault
     * @see #isCaseLevel
     */
    public void setCaseLevel(boolean flag) {
        final boolean frozen = isFrozen();
        if (frozen) {
            throw new UnsupportedOperationException("Attempt to modify frozen object");
        }
        // Store the new mode, then refresh the state derived from the attributes.
        m_isCaseLevel_ = flag;
        updateInternalState();
    }

    /**
     *

* Sets this Collator's strength property. The strength property determines the minimum level of difference * considered significant during comparison. *

*

* See the Collator class description for an example of use. *

* * @param newStrength * the new strength value. * @see #getStrength * @see #setStrengthDefault * @see #PRIMARY * @see #SECONDARY * @see #TERTIARY * @see #QUATERNARY * @see #IDENTICAL * @exception IllegalArgumentException * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. * @stable ICU 2.8 */ public void setStrength(int newStrength) { super.setStrength(newStrength); updateInternalState(); } /** *

* Variable top is a two byte primary value which causes all the codepoints with primary values that are less than or equal to the variable top to be shifted when alternate handling is set to SHIFTED.

*

* Sets the variable top to a collation element value of a string supplied. *

* * @param varTop * one or more (if contraction) characters to which the variable top should be set * @return a int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined. * @exception IllegalArgumentException * is thrown if varTop argument is not a valid variable top element. A variable top element is * invalid when *
    *
  • it is a contraction that does not exist in the Collation order *
  • when the PRIMARY strength collation element for the variable top has more than two bytes *
  • when the varTop argument is null or zero in length. *
* @see #getVariableTop * @see RuleBasedCollator#setAlternateHandlingShifted * @stable ICU 2.6 */ public int setVariableTop(String varTop) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (varTop == null || varTop.length() == 0) { throw new IllegalArgumentException("Variable top argument string can not be null or zero in length."); } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return setVariableTop(varTop, buffer); } finally { releaseCollationBuffer(buffer); } } private int setVariableTop(String varTop, CollationBuffer buffer) { buffer.m_srcUtilColEIter_.setText(varTop); int ce = buffer.m_srcUtilColEIter_.next(); // here we check if we have consumed all characters // you can put in either one character or a contraction // you shouldn't put more... if (buffer.m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) { throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist " + "in the Collation order"); } int nextCE = buffer.m_srcUtilColEIter_.next(); if ((nextCE != CollationElementIterator.NULLORDER) && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) { throw new IllegalArgumentException("Variable top argument string can only have a single collation " + "element that has less than or equal to two PRIMARY strength " + "bytes"); } m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16; return ce & CE_PRIMARY_MASK_; } /** * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16 * bits are ignored. 
* * @param varTop * Collation element value, as returned by setVariableTop or getVariableTop * @see #getVariableTop * @see #setVariableTop(String) * @stable ICU 2.6 */ public void setVariableTop(int varTop) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16; } /** * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings * of digits. This is a way to get '100' to sort AFTER '2' * * @param flag * true to turn numeric collation on and false to turn it off * @see #getNumericCollation * @see #setNumericCollationDefault * @stable ICU 2.8 */ public void setNumericCollation(boolean flag) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } // sort substrings of digits as numbers m_isNumericCollation_ = flag; updateInternalState(); } /** * Sets the reordering codes for this collator. * Collation reordering allows scripts and some other defined blocks of characters * to be moved relative to each other as a block. This reordering is done on top of * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed * at the start and/or the end of the collation order. *

By default, reordering codes specified for the start of the order are placed in the * order given after a group of “special” non-script blocks. These special groups of characters * are space, punctuation, symbol, currency, and digit. These special groups are represented with * {@link Collator.ReorderCodes}. Script groups can be intermingled with * these special non-script blocks if those special blocks are explicitly specified in the reordering. *

The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly * mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS} * will go at the very end of the reordering in the order given. *

The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that was specified when this collator was created from resource data or from rules. The {@link Collator.ReorderCodes#DEFAULT DEFAULT} code must be the sole code supplied when it is used; otherwise an {@link IllegalArgumentException} is thrown.

The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator. * The result of setting no reordering will be to have the DUCET/CLDR reordering used. The * {@link Collator.ReorderCodes#NONE NONE} code must be the sole code supplied when it used. * @param order the reordering codes to apply to this collator; if this is null or an empty array * then this clears any existing reordering * @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts) * @see #getReorderCodes * @see #getEquivalentReorderCodes * @stable ICU 4.8 */ public void setReorderCodes(int... order) { if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify frozen object"); } if (order != null && order.length > 0) { m_reorderCodes_ = order.clone(); } else { m_reorderCodes_ = null; } buildPermutationTable(); } // public getters -------------------------------------------------------- /** * Gets the collation tailoring rules for this RuleBasedCollator. * Equivalent to String getRules(false). * * @return the collation tailoring rules * @see #getRules(boolean) * @stable ICU 2.8 */ public String getRules() { return m_rules_; } /** * Returns current rules. The argument defines whether full rules (UCA + tailored) rules are returned or just the * tailoring. * *

The "UCA rules" are an approximation of the root collator's sort order. * They are almost never used or useful at runtime and can be removed from the data. * See User Guide: * Collation Customization, Building on Existing Locales * *

{@link #getRules()} should normally be used instead. * @param fullrules * true if the rules that defines the full set of collation order is required, otherwise false for * returning only the tailored rules * @return the current rules that defines this Collator. * @see #getRules() * @stable ICU 2.6 */ public String getRules(boolean fullrules) { if (!fullrules) { return m_rules_; } // take the UCA rules and append real rules at the end return UCA_.m_rules_.concat(m_rules_); } /** * Get an UnicodeSet that contains all the characters and sequences tailored in this collator. * * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently * than in the UCA. * @stable ICU 2.4 */ public UnicodeSet getTailoredSet() { try { CollationRuleParser src = new CollationRuleParser(getRules()); return src.getTailoredSet(); } catch (Exception e) { throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!"); } } private static class contContext { RuleBasedCollator coll; UnicodeSet contractions; UnicodeSet expansions; UnicodeSet removedContractions; boolean addPrefixes; contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, UnicodeSet removedContractions, boolean addPrefixes) { this.coll = coll; this.contractions = contractions; this.expansions = expansions; this.removedContractions = removedContractions; this.addPrefixes = addPrefixes; } } private void addSpecial(contContext c, StringBuilder buffer, int CE) { StringBuilder b = new StringBuilder(); int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_; int newCE = c.coll.m_contractionCE_[offset]; // we might have a contraction that ends from previous level if (newCE != CollationElementIterator.CE_NOT_FOUND_) { if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { addSpecial(c, 
buffer, newCE); } if (buffer.length() > 1) { if (c.contractions != null) { c.contractions.add(buffer.toString()); } if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(buffer.toString()); } } } offset++; // check whether we're doing contraction or prefix if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { b.delete(0, b.length()); b.append(buffer); newCE = c.coll.m_contractionCE_[offset]; b.insert(0, c.coll.m_contractionIndex_[offset]); if (isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { addSpecial(c, b, newCE); } else { if (c.contractions != null) { c.contractions.add(b.toString()); } if (c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(b.toString()); } } offset++; } } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) { while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { b.delete(0, b.length()); b.append(buffer); newCE = c.coll.m_contractionCE_[offset]; b.append(c.coll.m_contractionIndex_[offset]); if (isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { addSpecial(c, b, newCE); } else { if (c.contractions != null) { c.contractions.add(b.toString()); } if (c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { c.expansions.add(b.toString()); } } offset++; } } } private void processSpecials(contContext c) { int internalBufferSize = 512; TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_); RangeValueIterator.Element element = new RangeValueIterator.Element(); while (trieiterator.next(element)) { int start = element.start; int limit = element.limit; int CE = 
element.value; StringBuilder contraction = new StringBuilder(internalBufferSize); if (isSpecial(CE)) { if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) { while (start < limit) { // if there are suppressed contractions, we don't // want to add them. if (c.removedContractions != null && c.removedContractions.contains(start)) { start++; continue; } // we start our contraction from middle, since we don't know if it // will grow toward right or left contraction.append((char) start); addSpecial(c, contraction, CE); start++; } } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { while (start < limit) { c.expansions.add(start++); } } } } } /** * Gets unicode sets containing contractions and/or expansions of a collator * * @param contractions * if not null, set to contain contractions * @param expansions * if not null, set to contain expansions * @param addPrefixes * add the prefix contextual elements to contractions * @throws Exception * Throws an exception if any errors occurs. * @stable ICU 3.4 */ public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes) throws Exception { if (contractions != null) { contractions.clear(); } if (expansions != null) { expansions.clear(); } String rules = getRules(); try { CollationRuleParser src = new CollationRuleParser(rules); contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_, addPrefixes); // Add the UCA contractions processSpecials(c); // This is collator specific. Add contractions from a collator c.coll = this; c.removedContractions = null; processSpecials(c); } catch (Exception e) { throw e; } } /** *

* Get a Collation key for the argument String source from this RuleBasedCollator. *

*

* General recommendation:
* If comparisons are to be done on the same String multiple times, it would be more efficient to generate * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If each * String is compared only once, using the method RuleBasedCollator.compare(String, String) will have better * performance. *

*

* See the class documentation for an explanation about CollationKeys. *

* * @param source * the text String to be transformed into a collation key. * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source * String is null, a null CollationKey is returned. * @see CollationKey * @see #compare(String, String) * @see #getRawCollationKey * @stable ICU 2.8 */ public CollationKey getCollationKey(String source) { if (source == null) { return null; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return getCollationKey(source, buffer); } finally { releaseCollationBuffer(buffer); } } private CollationKey getCollationKey(String source, CollationBuffer buffer) { buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_, buffer); return new CollationKey(source, buffer.m_utilRawCollationKey_); } /** * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the * result into the user provided argument key. If key has a internal byte array of length that's too small for the * result, the internal byte array will be grown to the exact required size. * * @param source the text String to be transformed into a RawCollationKey * @param key output RawCollationKey to store results * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user * provided key will be returned. 
* @see #getCollationKey * @see #compare(String, String) * @see RawCollationKey * @stable ICU 2.8 */ public RawCollationKey getRawCollationKey(String source, RawCollationKey key) { if (source == null) { return null; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return getRawCollationKey(source, key, buffer); } finally { releaseCollationBuffer(buffer); } } private RawCollationKey getRawCollationKey(String source, RawCollationKey key, CollationBuffer buffer) { int strength = getStrength(); buffer.m_utilCompare0_ = m_isCaseLevel_; // m_utilCompare1_ = true; buffer.m_utilCompare2_ = strength >= SECONDARY; buffer.m_utilCompare3_ = strength >= TERTIARY; buffer.m_utilCompare4_ = strength >= QUATERNARY; buffer.m_utilCompare5_ = strength == IDENTICAL; boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_; // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so // high. int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_; byte hiragana4 = 0; if (m_isHiragana4_ && buffer.m_utilCompare4_) { // allocate one more space for hiragana, value for hiragana hiragana4 = (byte) commonBottom4; commonBottom4++; } int bottomCount4 = 0xFF - commonBottom4; // If we need to normalize, we'll do it all at once at the beginning! if (buffer.m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { // if it is identical strength, we have to normalize the string to // NFD so that it will be appended correctly to the end of the sort // key source = Normalizer.decompose(source, false); } else if (getDecomposition() != NO_DECOMPOSITION && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) { // for the rest of the strength, if decomposition is on, FCD is // enough for us to work on. 
source = Normalizer.normalize(source, Normalizer.FCD); } getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4, buffer); if (key == null) { key = new RawCollationKey(); } getSortKey(source, doFrench, commonBottom4, bottomCount4, key, buffer); return key; } /** * Return true if an uppercase character is sorted before the corresponding lowercase character. See * setCaseFirst(boolean) for details. * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isLowerCaseFirst * @see #setCaseFirstDefault * @return true if upper cased characters are sorted before lower cased characters, false otherwise * @stable ICU 2.8 */ public boolean isUpperCaseFirst() { return (m_caseFirst_ == AttributeValue.UPPER_FIRST_); } /** * Return true if a lowercase character is sorted before the corresponding uppercase character. See * setCaseFirst(boolean) for details. * * @see #setUpperCaseFirst * @see #setLowerCaseFirst * @see #isUpperCaseFirst * @see #setCaseFirstDefault * @return true lower cased characters are sorted before upper cased characters, false otherwise * @stable ICU 2.8 */ public boolean isLowerCaseFirst() { return (m_caseFirst_ == AttributeValue.LOWER_FIRST_); } /** * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true, * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more * details. * * @return true or false * @see #setAlternateHandlingShifted(boolean) * @see #setAlternateHandlingDefault * @stable ICU 2.8 */ public boolean isAlternateHandlingShifted() { return m_isAlternateHandlingShifted_; } /** * Checks if case level is set to true. See setCaseLevel(boolean) for details. 
* * @return the case level mode * @see #setCaseLevelDefault * @see #isCaseLevel * @see #setCaseLevel(boolean) * @stable ICU 2.8 */ public boolean isCaseLevel() { return m_isCaseLevel_; } /** * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details. * * @return true if French Collation is set to true, false otherwise * @see #setFrenchCollation(boolean) * @see #setFrenchCollationDefault * @stable ICU 2.8 */ public boolean isFrenchCollation() { return m_isFrenchCollation_; } /** * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details. * * This attribute is an implementation detail of the CLDR Japanese tailoring. * The implementation might change to use a different mechanism * to achieve the same Japanese sort order. * Since ICU 50, this attribute is not settable any more via API functions. * * @return flag true if Hiragana Quaternary mode is on, false otherwise * @see #setHiraganaQuaternaryDefault * @see #setHiraganaQuaternary(boolean) * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation. */ public boolean isHiraganaQuaternary() { return m_isHiragana4_; } /** * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored. * * @return the variable top value of a Collator. * @see #setVariableTop * @stable ICU 2.6 */ public int getVariableTop() { return m_variableTopValue_ << 16; } /** * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2' * * @see #setNumericCollation * @see #setNumericCollationDefault * @return true if numeric collation is turned on, false otherwise * @stable ICU 2.8 */ public boolean getNumericCollation() { return m_isNumericCollation_; } /** * Retrieves the reordering codes for this collator. 
* These reordering codes are a combination of UScript codes and ReorderCodes. * @return a copy of the reordering codes for this collator; * if none are set then returns an empty array * @see #setReorderCodes * @see #getEquivalentReorderCodes * @stable ICU 4.8 */ public int[] getReorderCodes() { if (m_reorderCodes_ != null) { return m_reorderCodes_.clone(); } else { return LeadByteConstants.EMPTY_INT_ARRAY; } } /** * Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder * codes are grouped and must reorder together. * * @param reorderCode code for which equivalents to be retrieved * @return the set of all reorder codes in the same group as the given reorder code. * @see #setReorderCodes * @see #getReorderCodes * @stable ICU 4.8 */ public static int[] getEquivalentReorderCodes(int reorderCode) { Set equivalentCodesSet = new HashSet(); int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode); for (int leadByte : leadBytes) { int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte); for (int code : codes) { equivalentCodesSet.add(code); } } int[] equivalentCodes = new int[equivalentCodesSet.size()]; int i = 0; for (int code : equivalentCodesSet) { equivalentCodes[i++] = code; } return equivalentCodes; } // public other methods ------------------------------------------------- /** * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same * collation rules and the same attributes. * * @param obj * the RuleBasedCollator to be compared to. * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise. 
* @stable ICU 2.8 */ public boolean equals(Object obj) { if (obj == null) { return false; // super does class check } if (this == obj) { return true; } if (getClass() != obj.getClass()) { return false; } RuleBasedCollator other = (RuleBasedCollator) obj; // all other non-transient information is also contained in rules. if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition() || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_ || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_ || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_ || other.m_isHiragana4_ != m_isHiragana4_) { return false; } if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) { return false; } if (m_reorderCodes_ != null) { if (m_reorderCodes_.length != other.m_reorderCodes_.length) { return false; } for (int i = 0; i < m_reorderCodes_.length; i++) { if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) { return false; } } } boolean rules = m_rules_ == other.m_rules_; if (!rules && (m_rules_ != null && other.m_rules_ != null)) { rules = m_rules_.equals(other.m_rules_); } if (!rules || !ICUDebug.enabled("collation")) { return rules; } if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_ || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_ || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_ || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_ || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_ || !Arrays.equals(m_unsafe_, other.m_unsafe_)) { return false; } if (!m_trie_.equals(other.m_trie_)) { // we should use the trie iterator here, but then this part is // only used in the test. 
for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) { int v = m_trie_.getCodePointValue(i); int otherv = other.m_trie_.getCodePointValue(i); if (v != otherv) { int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_); if (mask == (otherv & 0xff000000)) { v &= 0xffffff; otherv &= 0xffffff; if (mask == 0xf1000000) { v -= (m_expansionOffset_ << 4); otherv -= (other.m_expansionOffset_ << 4); } else if (mask == 0xf2000000) { v -= m_contractionOffset_; otherv -= other.m_contractionOffset_; } if (v == otherv) { continue; } } return false; } } } if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_) || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_) || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_) || !Arrays.equals(m_expansion_, other.m_expansion_) || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) { return false; } // not comparing paddings for (int i = 0; i < m_expansionEndCE_.length; i++) { if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) { return false; } } return true; } /** * Generates a unique hash code for this RuleBasedCollator. * * @return the unique hash code for this Collator * @stable ICU 2.8 */ public int hashCode() { String rules = getRules(); if (rules == null) { rules = ""; } return rules.hashCode(); } /** * Compares the source text String to the target text String according to the collation rules, strength and * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero * depending on whether the source String is less than, equal to or greater than the target String. See the Collator * class description for an example of use.

*

* General recommendation:
* If comparisons are to be done on the same String multiple times, it would be more efficient to generate * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed * performance is critical and object instantiation is to be reduced, further optimization may be achieved by * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key * comparisons. If each String is compared only once, using the method RuleBasedCollator.compare(String, * String) will have better performance. *

* * @param source * the source text String. * @param target * the target text String. * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source * and target are equal, value is greater than zero if source is greater than target. * @see CollationKey * @see #getCollationKey * @stable ICU 2.8 */ public int compare(String source, String target) { if (source == target) { return 0; } CollationBuffer buffer = null; try { buffer = getCollationBuffer(); return compare(source, target, buffer); } finally { releaseCollationBuffer(buffer); } } private int compare(String source, String target, CollationBuffer buffer) { // Find the length of any leading portion that is equal int offset = getFirstUnmatchedOffset(source, target); // return compareRegular(source, target, offset); if (latinOneUse_) { if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_) || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) { // source or target start with non-latin-1 return compareRegular(source, target, offset, buffer); } else { return compareUseLatin1(source, target, offset, buffer); } } else { return compareRegular(source, target, offset, buffer); } } // package private inner interfaces -------------------------------------- /** * Attribute values to be used when setting the Collator options */ static interface AttributeValue { /** * Indicates that the default attribute value will be used. See individual attribute for details on its default * value. 
*/ static final int DEFAULT_ = -1; /** * Primary collation strength */ static final int PRIMARY_ = Collator.PRIMARY; /** * Secondary collation strength */ static final int SECONDARY_ = Collator.SECONDARY; /** * Tertiary collation strength */ static final int TERTIARY_ = Collator.TERTIARY; /** * Default collation strength */ static final int DEFAULT_STRENGTH_ = Collator.TERTIARY; /** * Internal use for strength checks in Collation elements */ static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1; /** * Quaternary collation strength */ static final int QUATERNARY_ = 3; /** * Identical collation strength */ static final int IDENTICAL_ = Collator.IDENTICAL; /** * Internal use for strength checks */ static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1; /** * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and * DECOMPOSITION_MODE */ static final int OFF_ = 16; /** * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE */ static final int ON_ = 17; /** * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted */ static final int SHIFTED_ = 20; /** * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable */ static final int NON_IGNORABLE_ = 21; /** * Valid for CASE_FIRST - lower case sorts before upper case */ static final int LOWER_FIRST_ = 24; /** * Upper case sorts before lower case */ static final int UPPER_FIRST_ = 25; /** * Number of attribute values */ static final int LIMIT_ = 29; } /** * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values * specific to each one. */ static interface Attribute { /** * Attribute for direction of secondary weights - used in French. Acceptable values are ON, which results in * secondary weights being considered backwards and OFF which treats secondary weights in the order they appear. 
*/ static final int FRENCH_COLLATION_ = 0; /** * Attribute for handling variable elements. Acceptable values are NON_IGNORABLE (default) which treats all the * codepoints with non-ignorable primary weights in the same way, and SHIFTED which causes codepoints with * primary weights that are equal or below the variable top value to be ignored on primary level and moved to * the quaternary level. */ static final int ALTERNATE_HANDLING_ = 1; /** * Controls the ordering of upper and lower case letters. Acceptable values are OFF (default), which orders * upper and lower case letters in accordance to their tertiary weights, UPPER_FIRST which forces upper case * letters to sort before lower case letters, and LOWER_FIRST which does the opposite. */ static final int CASE_FIRST_ = 2; /** * Controls whether an extra case level (positioned before the third level) is generated or not. Acceptable * values are OFF (default), when case level is not generated, and ON which causes the case level to be * generated. Contents of the case level are affected by the value of CASE_FIRST attribute. A simple way to * ignore accent differences in a string is to set the strength to PRIMARY and enable case level. */ static final int CASE_LEVEL_ = 3; /** * Controls whether the normalization check and necessary normalizations are performed. When set to OFF * (default) no normalization check is performed. The correctness of the result is guaranteed only if the input * data is in so-called FCD form (see users manual for more info). When set to ON, an incremental check is * performed to see whether the input data is in the FCD form. If the data is not in the FCD form, incremental * NFD normalization is performed. */ static final int NORMALIZATION_MODE_ = 4; /** * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual * strength for most locales (except Japanese) is tertiary. 
Quaternary strength is useful when combined with * shifted setting for alternate handling attribute and for JIS x 4061 collation, when it is used to distinguish * between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on. Otherwise, * quaternary level is affected only by the number of non ignorable code points in the string. Identical * strength is rarely useful, as it amounts to codepoints of the NFD form of the string. */ static final int STRENGTH_ = 5; /** * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a * sneaky way to produce JIS sort order. */ static final int HIRAGANA_QUATERNARY_MODE_ = 6; /** * Attribute count */ static final int LIMIT_ = 7; } /** * DataManipulate singleton */ static class DataManipulate implements Trie.DataManipulate { // public methods ---------------------------------------------------- /** * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data. 
* * @param ce * collation element of the lead surrogate * @return data offset or 0 for the next trail surrogate * @stable ICU 2.8 */ public final int getFoldingOffset(int ce) { if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { return (ce & 0xFFFFFF); } return 0; } /** * Get singleton object */ public static final DataManipulate getInstance() { if (m_instance_ == null) { m_instance_ = new DataManipulate(); } return m_instance_; } // private data member ---------------------------------------------- /** * Singleton instance */ private static DataManipulate m_instance_; // private constructor ---------------------------------------------- /** * private to prevent initialization */ private DataManipulate() { } } /** * UCAConstants */ static final class UCAConstants { int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705 int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000 int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500 int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05 int FIRST_VARIABLE_[] = new int[2]; // 0x05070505 int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505 int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505 int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505 int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303 int FIRST_IMPLICIT_[] = new int[2]; int LAST_IMPLICIT_[] = new int[2]; int FIRST_TRAILING_[] = new int[2]; int LAST_TRAILING_[] = new int[2]; int PRIMARY_TOP_MIN_; int PRIMARY_IMPLICIT_MIN_; // 0xE8000000 int PRIMARY_IMPLICIT_MAX_; // 0xF0000000 int PRIMARY_TRAILING_MIN_; // 0xE8000000 int PRIMARY_TRAILING_MAX_; // 0xF0000000 int PRIMARY_SPECIAL_MIN_; // 0xE8000000 int PRIMARY_SPECIAL_MAX_; // 0xF0000000 } /** * Script to Lead Byte and Lead Byte to Script Data * */ static final class LeadByteConstants { private static final int DATA_MASK_FOR_INDEX = 0x8000; private static final int[] 
EMPTY_INT_ARRAY = new int[0]; private int serializedSize = 0; private Map SCRIPT_TO_LEAD_BYTES_INDEX; private byte[] SCRIPT_TO_LEAD_BYTES_DATA; private int[] LEAD_BYTE_TO_SCRIPTS_INDEX; private byte[] LEAD_BYTE_TO_SCRIPTS_DATA; LeadByteConstants() { } void read(DataInputStream dis) throws IOException { int readcount = 0; int indexCount; int dataSize; // script to lead bytes indexCount = dis.readShort(); readcount += 2; dataSize = dis.readShort(); readcount += 2; this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap(); //System.out.println("Script to Lead Bytes Index - Count = " + indexCount); for (int index = 0; index < indexCount; index++) { int reorderCode = dis.readShort(); // reorder code readcount += 2; int dataOffset = 0xffff & dis.readShort(); // data offset readcount += 2; // System.out.println("\t-------------"); // System.out.println("\toffset = " + Integer.toHexString(readcount - 4)); // System.out.println("\treorderCode = " + Integer.toHexString(reorderCode)); // System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset)); this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset); } this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2]; dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length); readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length; // lead byte to scripts indexCount = dis.readShort(); readcount += 2; dataSize = dis.readShort(); readcount += 2; this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount]; //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount); for (int index = 0; index < indexCount; index++) { this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort(); readcount += 2; // System.out.println("\t-------------"); // System.out.println("\toffset = " + Integer.toHexString(readcount - 2)); // System.out.println("\tindex = " + Integer.toHexString(index)); // System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index])); } 
this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2]; dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length); readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length; this.serializedSize = readcount; } int getSerializedDataSize() { return this.serializedSize; } int[] getReorderCodesForLeadByte(int leadByte) { if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) { return EMPTY_INT_ARRAY; } int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]; if (offset == 0) { return EMPTY_INT_ARRAY; } int[] reorderCodes; if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { reorderCodes = new int[1]; reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX; } else { int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); offset++; reorderCodes = new int[length]; for (int code = 0; code < length; code++, offset++) { reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); } } return reorderCodes; } int[] getLeadBytesForReorderCode(int reorderCode) { if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) { return EMPTY_INT_ARRAY; } int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode); if (offset == 0) { return EMPTY_INT_ARRAY; } int[] leadBytes; if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { leadBytes = new int[1]; leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX; } else { int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); offset++; leadBytes = new int[length]; for (int leadByte = 0; leadByte < length; leadByte++, offset++) { leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); } } return leadBytes; } private static int readShort(byte[] data, int offset) { return (0xff & data[offset * 2]) << 8 | (data[offset * 2 + 1] & 0xff); } } // package private data member ------------------------------------------- static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04; static final byte BYTE_COMMON_ = (byte) 0x05; static final int COMMON_TOP_2_ = 0x86; // int for unsigness static 
final int COMMON_BOTTOM_2_ = BYTE_COMMON_; static final int COMMON_BOTTOM_3 = 0x05; /** * Case strength mask */ static final int CE_CASE_BIT_MASK_ = 0xC0; static final int CE_TAG_SHIFT_ = 24; static final int CE_TAG_MASK_ = 0x0F000000; static final int CE_SPECIAL_FLAG_ = 0xF0000000; /** * Lead surrogate that is tailored and doesn't start a contraction */ static final int CE_SURROGATE_TAG_ = 5; /** * Mask to get the primary strength of the collation element */ static final int CE_PRIMARY_MASK_ = 0xFFFF0000; /** * Mask to get the secondary strength of the collation element */ static final int CE_SECONDARY_MASK_ = 0xFF00; /** * Mask to get the tertiary strength of the collation element */ static final int CE_TERTIARY_MASK_ = 0xFF; /** * Primary strength shift */ static final int CE_PRIMARY_SHIFT_ = 16; /** * Secondary strength shift */ static final int CE_SECONDARY_SHIFT_ = 8; /** * Continuation marker */ static final int CE_CONTINUATION_MARKER_ = 0xC0; /** * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus * that off to get the right expansion ce offset. In number of ints. */ int m_expansionOffset_; /** * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have * to minus that off to get the right contraction ce offset. In number of chars. 
*/ int m_contractionOffset_; /** * Flag indicator if Jamo is special */ boolean m_isJamoSpecial_; // Collator options ------------------------------------------------------ int m_defaultVariableTopValue_; boolean m_defaultIsFrenchCollation_; boolean m_defaultIsAlternateHandlingShifted_; int m_defaultCaseFirst_; boolean m_defaultIsCaseLevel_; int m_defaultDecomposition_; int m_defaultStrength_; boolean m_defaultIsHiragana4_; boolean m_defaultIsNumericCollation_; /** * Default script order - the one created at initial rule parse time */ int[] m_defaultReorderCodes_; /** * Value of the variable top */ int m_variableTopValue_; /** * Attribute for special Hiragana */ boolean m_isHiragana4_; /** * Case sorting customization */ int m_caseFirst_; /** * Numeric collation option */ boolean m_isNumericCollation_; /** * Script order */ int[] m_reorderCodes_; // end Collator options -------------------------------------------------- /** * Expansion table */ int m_expansion_[]; /** * Contraction index table */ char m_contractionIndex_[]; /** * Contraction CE table */ int m_contractionCE_[]; /** * Data trie */ IntTrie m_trie_; /** * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch. */ int m_expansionEndCE_[]; /** * Table to store the maximum size of any expansions that end with the corresponding collation element in * m_expansionEndCE_. For use in StringSearch too */ byte m_expansionEndCEMaxSize_[]; /** * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above, * then 'A', 'B', 'C' are "unsafe" but 'Z' is not. 
*/ byte m_unsafe_[]; /** * Table to store information on whether a codepoint can occur as the last character in a contraction */ byte m_contractionEnd_[]; /** * Original collation rules */ String m_rules_; /** * The smallest "unsafe" codepoint */ char m_minUnsafe_; /** * The smallest codepoint that could be the end of a contraction */ char m_minContractionEnd_; /** * General version of the collator */ VersionInfo m_version_; /** * UCA version */ VersionInfo m_UCA_version_; /** * UCD version */ VersionInfo m_UCD_version_; /** * Lead byte and script data */ int m_leadByteToScripts; int m_scriptToLeadBytes; /** * UnicodeData.txt property object */ static final RuleBasedCollator UCA_; /** * UCA Constants */ static final UCAConstants UCA_CONSTANTS_; /** * Lead Byte Constants */ static LeadByteConstants LEADBYTE_CONSTANTS_; /** * Table for UCA and builder use */ static final char UCA_CONTRACTIONS_[]; static final int MAX_UCA_CONTRACTION_LENGTH; private static boolean UCA_INIT_COMPLETE; /** * Implicit generator */ static final ImplicitCEGenerator impCEGen_; static final byte SORT_LEVEL_TERMINATOR_ = 1; // These are values from UCA required for // implicit generation and supressing sort key compression // they should regularly be in the UCA, but if one // is running without UCA, it could be a problem static final int maxRegularPrimary = 0x7A; static final int minImplicitPrimary = 0xE0; static final int maxImplicitPrimary = 0xE4; // block to initialise character property database static { // take pains to let static class init succeed, otherwise the class itself won't exist and // clients will get a NoClassDefFoundException. Instead, make the constructors fail if // we can't load the UCA data. RuleBasedCollator iUCA_ = null; UCAConstants iUCA_CONSTANTS_ = null; LeadByteConstants iLEADBYTE_CONSTANTS = null; char iUCA_CONTRACTIONS_[] = null; Output maxUCAContractionLength = new Output(); ImplicitCEGenerator iimpCEGen_ = null; try { // !!! note what's going on here... 
// even though the static init of the class is not yet complete, we // instantiate an instance of the class. So we'd better be sure that // instantiation doesn't rely on the static initialization that's // not complete yet! iUCA_ = new RuleBasedCollator(); iUCA_CONSTANTS_ = new UCAConstants(); iLEADBYTE_CONSTANTS = new LeadByteConstants(); iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS, maxUCAContractionLength); // called before doing canonical closure for the UCA. iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary); // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_); iUCA_.init(); ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH); iUCA_.m_rules_ = (String) rb.getObject("UCARules"); } catch (MissingResourceException ex) { // throw ex; } catch (IOException e) { // e.printStackTrace(); // throw new MissingResourceException(e.getMessage(),"",""); } UCA_ = iUCA_; UCA_CONSTANTS_ = iUCA_CONSTANTS_; LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS; UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_; MAX_UCA_CONTRACTION_LENGTH = maxUCAContractionLength.value; impCEGen_ = iimpCEGen_; UCA_INIT_COMPLETE = true; } private static void checkUCA() throws MissingResourceException { if (UCA_INIT_COMPLETE && UCA_ == null) { throw new MissingResourceException("Collator UCA data unavailable", "", ""); } } // package private constructors ------------------------------------------ /** *

* Private constructor for use by subclasses. Public access to creating Collators is handled by the API
* Collator.getInstance() or RuleBasedCollator(String rules).
*

*

* This constructor constructs the UCA collator internally *

*/ RuleBasedCollator() { checkUCA(); } /** * Constructors a RuleBasedCollator from the argument locale. If no resource bundle is associated with the locale, * UCA is used instead. * * @param locale */ RuleBasedCollator(ULocale locale) { checkUCA(); try { ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale); if (rb != null) { ICUResourceBundle elements = null; // Use keywords, if supplied for lookup String collkey = locale.getKeywordValue("collation"); if (collkey != null) { try { elements = rb.getWithFallback("collations/" + collkey); } catch (MissingResourceException e) { // fall through } } if (elements == null) { // either collation keyword was not supplied or // the keyword was invalid - use default collation for the locale // collations/default should always give a string back // keyword for the real collation data collkey = rb.getStringWithFallback("collations/default"); elements = rb.getWithFallback("collations/" + collkey); } // TODO: Determine actual & valid locale correctly ULocale uloc = rb.getULocale(); setLocale(uloc, uloc); m_rules_ = elements.getString("Sequence"); ByteBuffer buf = elements.get("%%CollationBin").getBinary(); // %%CollationBin if (buf != null) { // m_rules_ = (String)rules[1][1]; CollatorReader.initRBC(this, buf); /* * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /* * CollatorReader reader = new CollatorReader(input, false); if (map.length > * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this); * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); } */ // at this point, we have read in the collator // now we need to check whether the binary image has // the right UCA and other versions if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) { init(m_rules_); return; } try { UResourceBundle reorderRes = 
elements.get("%%ReorderCodes"); if (reorderRes != null) { int[] reorderCodes = reorderRes.getIntVector(); setReorderCodes(reorderCodes); m_defaultReorderCodes_ = reorderCodes.clone(); } } catch (MissingResourceException e) { // ignore } init(); return; } else { init(m_rules_); return; } } } catch (Exception e) { // fallthrough } setWithUCAData(); } // package private methods ----------------------------------------------- /** * Sets this collator to use the tables in UCA. Note options not taken care of here. */ final void setWithUCATables() { m_contractionOffset_ = UCA_.m_contractionOffset_; m_expansionOffset_ = UCA_.m_expansionOffset_; m_expansion_ = UCA_.m_expansion_; m_contractionIndex_ = UCA_.m_contractionIndex_; m_contractionCE_ = UCA_.m_contractionCE_; m_trie_ = UCA_.m_trie_; m_expansionEndCE_ = UCA_.m_expansionEndCE_; m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; m_unsafe_ = UCA_.m_unsafe_; m_contractionEnd_ = UCA_.m_contractionEnd_; m_minUnsafe_ = UCA_.m_minUnsafe_; m_minContractionEnd_ = UCA_.m_minContractionEnd_; } /** * Sets this collator to use the all options and tables in UCA. 
*/
final void setWithUCAData() {
    // UCA_ is a static final reference; the alias only shortens the copies below.
    final RuleBasedCollator uca = UCA_;

    // The flag is raised for the duration of the copy and cleared on the last line.
    // NOTE(review): presumably this keeps the Latin-1 fast path out of the way while
    // setDecomposition()/setStrength() run on partially copied state - confirm.
    latinOneFailed_ = true;

    // The statement order is kept exactly as it was: the two setter calls sit between
    // the field copies and may observe whichever fields have been copied so far.
    m_addition3_ = uca.m_addition3_;
    m_bottom3_ = uca.m_bottom3_;
    m_bottomCount3_ = uca.m_bottomCount3_;
    m_caseFirst_ = uca.m_caseFirst_;
    m_caseSwitch_ = uca.m_caseSwitch_;
    m_common3_ = uca.m_common3_;
    m_contractionOffset_ = uca.m_contractionOffset_;
    setDecomposition(uca.getDecomposition());
    m_defaultCaseFirst_ = uca.m_defaultCaseFirst_;
    m_defaultDecomposition_ = uca.m_defaultDecomposition_;
    m_defaultIsAlternateHandlingShifted_ = uca.m_defaultIsAlternateHandlingShifted_;
    m_defaultIsCaseLevel_ = uca.m_defaultIsCaseLevel_;
    m_defaultIsFrenchCollation_ = uca.m_defaultIsFrenchCollation_;
    m_defaultIsHiragana4_ = uca.m_defaultIsHiragana4_;
    m_defaultStrength_ = uca.m_defaultStrength_;
    m_defaultVariableTopValue_ = uca.m_defaultVariableTopValue_;
    m_defaultIsNumericCollation_ = uca.m_defaultIsNumericCollation_;
    m_expansionOffset_ = uca.m_expansionOffset_;
    m_isAlternateHandlingShifted_ = uca.m_isAlternateHandlingShifted_;
    m_isCaseLevel_ = uca.m_isCaseLevel_;
    m_isFrenchCollation_ = uca.m_isFrenchCollation_;
    m_isHiragana4_ = uca.m_isHiragana4_;
    m_isJamoSpecial_ = uca.m_isJamoSpecial_;
    m_isSimple3_ = uca.m_isSimple3_;
    m_mask3_ = uca.m_mask3_;
    m_minContractionEnd_ = uca.m_minContractionEnd_;
    m_minUnsafe_ = uca.m_minUnsafe_;
    m_rules_ = uca.m_rules_;
    setStrength(uca.getStrength());
    m_top3_ = uca.m_top3_;
    m_topCount3_ = uca.m_topCount3_;
    m_variableTopValue_ = uca.m_variableTopValue_;
    m_isNumericCollation_ = uca.m_isNumericCollation_;

    // Shares the UCA's tables (trie, expansions, contractions, heuristics).
    setWithUCATables();

    latinOneFailed_ = false;
}

/**
 * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters
 * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is
 * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one
 * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
* * @param ch * character to determin * @return true if ch is unsafe, false otherwise */ final boolean isUnsafe(char ch) { if (ch < m_minUnsafe_) { return false; } if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) { // Trail surrogate are always considered unsafe. return true; } ch &= HEURISTIC_OVERFLOW_MASK_; ch += HEURISTIC_OVERFLOW_OFFSET_; } int value = m_unsafe_[ch >> HEURISTIC_SHIFT_]; return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; } /** * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at * the end of a contraction, otherwise it is not deterministic. * * @param ch * character to be determined */ final boolean isContractionEnd(char ch) { if (UTF16.isTrailSurrogate(ch)) { return true; } if (ch < m_minContractionEnd_) { return false; } if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { ch &= HEURISTIC_OVERFLOW_MASK_; ch += HEURISTIC_OVERFLOW_OFFSET_; } int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_]; return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; } /** * Retrieve the tag of a special ce * * @param ce * ce to test * @return tag of ce */ static int getTag(int ce) { return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; } /** * Checking if ce is special * * @param ce * to check * @return true if ce is special */ static boolean isSpecial(int ce) { return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; } /** * Checks if the argument ce is a continuation * * @param ce * collation element to test * @return true if ce is a continuation */ static final boolean isContinuation(int ce) { return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; } // private inner classes ------------------------------------------------ // private variables ----------------------------------------------------- /** * The smallest natural unsafe or contraction end char character before tailoring. 
* This is a combining mark.
 */
private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;

/**
 * Heuristic table size, in bytes: 32 bytes giving 1 bit to each Latin-1 char, plus 1024 bytes (8192 bits, i.e.
 * HEURISTIC_OVERFLOW_MASK_ + 1) into which the rest of the chars are hashed.
 */
private static final char HEURISTIC_SIZE_ = 1056;

/**
 * Mask value down to "some power of two" - 1. Counted in bits, not in bytes.
 */
private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;

/**
 * Unsafe character shift: converts a bit index into a byte index of the heuristic table.
 */
private static final int HEURISTIC_SHIFT_ = 3;

/**
 * Unsafe character addition for a character that is too large: it is folded with HEURISTIC_OVERFLOW_MASK_ and
 * then incremented by this offset.
 */
private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;

/**
 * Mask value to get the bit offset within a byte of the heuristic table.
 */
private static final char HEURISTIC_MASK_ = 7;

private int m_caseSwitch_;
private int m_common3_;
private int m_mask3_;

/**
 * When switching case, we need to add or subtract different values.
 */
private int m_addition3_;

/**
 * Upper range when compressing
 */
private int m_top3_;

/**
 * Lower range when compressing.
 * NOTE(review): the previous comment said "Upper range", apparently copied from m_top3_ - confirm.
 */
private int m_bottom3_;

private int m_topCount3_;
private int m_bottomCount3_;

/**
 * Script reordering table
 */
private byte[] m_leadBytePermutationTable_;

/**
 * Case first constants
 */
private static final int CASE_SWITCH_ = 0xC0;
private static final int NO_CASE_SWITCH_ = 0;

/**
 * Case level constants
 */
private static final int CE_REMOVE_CASE_ = 0x3F;
private static final int CE_KEEP_CASE_ = 0xFF;

/**
 * Case strength mask
 */
private static final int CE_CASE_MASK_3_ = 0xFF;

/**
 * Sortkey size factor. Values can be changed.
 */
private static final double PROPORTION_2_ = 0.5;
private static final double PROPORTION_3_ = 0.667;

// These values come from the UCA ----------------------------------------

/**
 * Magic special byte values from the fractional UCA
 */
// private static final byte BYTE_ZERO_ = 0x0;
// private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
// private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
/* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
// private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;

// TODO: Make the following values dynamic since they change with almost every UCA version.
static final byte CODAN_PLACEHOLDER = 0x12;
private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;

private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;

// Number of byte values strictly between the secondary common bottom and top.
private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
private static final int COMMON_BOTTOM_3_ = 0x05;
private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
// TOTAL_2_ is split between a top part (PROPORTION_2_ of it, truncated) and a bottom part (the remainder).
private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
private static final int COMMON_2_ = COMMON_BOTTOM_2_;
private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
// private static final int COMMON_4_ = (byte)0xFF;

/*
 * Minimum size required for the binary collation data in bytes.
* Size of UCA header + size of options to 4 bytes
 */
// private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;

/**
 * If this collator is to generate only simple tertiaries for fast path
 */
private boolean m_isSimple3_;

/**
 * French collation sorting flag
 */
private boolean m_isFrenchCollation_;

/**
 * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
 * alternate handling will be non-ignorable.
 */
private boolean m_isAlternateHandlingShifted_;

/**
 * Extra case level for sorting
 */
private boolean m_isCaseLevel_;

/**
 * Frozen state of the collator.
 */
private Lock frozenLock;

// Initial capacities of the per-level sort key byte buffers
private static final int SORT_BUFFER_INIT_SIZE_ = 128;
private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;

private static final int CE_CONTINUATION_TAG_ = 0xC0;
private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;

private static final int LAST_BYTE_MASK_ = 0xFF;

// private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
// private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;

private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;

/**
 * CE buffer size
 */
private static final int CE_BUFFER_SIZE_ = 512;

// variables for Latin-1 processing
boolean latinOneUse_ = false;
boolean latinOneRegenTable_ = false;
boolean latinOneFailed_ = false;
int latinOneTableLen_ = 0;
int[] latinOneCEs_ = null;

/**
 * Scratch state (iterators, per-level byte buffers, CE buffers and counters) handed to the comparison and sort key
 * routines of the enclosing collator.
 */
private final class CollationBuffer {
    /**
     * Bunch of utility iterators
     */
    protected StringUCharacterIterator m_srcUtilIter_;
    protected CollationElementIterator m_srcUtilColEIter_;
    protected StringUCharacterIterator m_tgtUtilIter_;
    protected CollationElementIterator m_tgtUtilColEIter_;

    /**
     * Utility comparison flags
     */
    protected boolean m_utilCompare0_;
    // private boolean m_utilCompare1_;
    protected boolean m_utilCompare2_;
    protected boolean m_utilCompare3_;
    protected boolean m_utilCompare4_;
    protected boolean m_utilCompare5_;

    /**
     * Utility byte buffer
     */
    protected byte[] m_utilBytes0_;
    protected byte[] m_utilBytes1_;
    protected byte[] m_utilBytes2_;
    protected byte[] m_utilBytes3_;
    protected byte[] m_utilBytes4_;
    // private byte m_utilBytes5_[];

    protected RawCollationKey m_utilRawCollationKey_;

    protected int m_utilBytesCount0_;
    protected int m_utilBytesCount1_;
    protected int m_utilBytesCount2_;
    protected int m_utilBytesCount3_;
    protected int m_utilBytesCount4_;
    // private int m_utilBytesCount5_;

    // private int m_utilCount0_;
    // private int m_utilCount1_;
    protected int m_utilCount2_;
    protected int m_utilCount3_;
    protected int m_utilCount4_;
    // private int m_utilCount5_;

    protected int m_utilFrenchStart_;
    protected int m_utilFrenchEnd_;

    /**
     * Preparing the CE buffers. will be filled during the primary phase
     */
    protected int[] m_srcUtilCEBuffer_;
    protected int[] m_tgtUtilCEBuffer_;
    protected int m_srcUtilCEBufferSize_;
    protected int m_tgtUtilCEBufferSize_;

    protected int m_srcUtilContOffset_;
    protected int m_tgtUtilContOffset_;

    protected int m_srcUtilOffset_;
    protected int m_tgtUtilOffset_;

    private CollationBuffer() {
        initBuffers();
    }

    /**
     * Initializes utility iterators and byte buffer used by compare
     */
    protected final void initBuffers() {
        resetBuffers();

        // source and target iterators, each bound to the enclosing collator
        m_srcUtilIter_ = new StringUCharacterIterator();
        m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, RuleBasedCollator.this);
        m_tgtUtilIter_ = new StringUCharacterIterator();
        m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, RuleBasedCollator.this);

        // one byte buffer per level
        m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
        m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
        m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
        m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
        m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary

        m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
        m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
    }

    /**
     * Clears the comparison flags and zeroes the counters and offsets. The arrays, the iterators and the two CE
     * buffer sizes are left untouched.
     */
    protected final void resetBuffers() {
        // comparison flags
        m_utilCompare0_ = false;
        // private boolean m_utilCompare1_;
        m_utilCompare2_ = false;
        m_utilCompare3_ = false;
        m_utilCompare4_ = false;
        m_utilCompare5_ = false;

        // fill levels of the byte buffers
        m_utilBytesCount0_ = 0;
        m_utilBytesCount1_ = 0;
        m_utilBytesCount2_ = 0;
        m_utilBytesCount3_ = 0;
        m_utilBytesCount4_ = 0;
        // private int m_utilBytesCount5_;

        m_utilCount2_ = 0;
        m_utilCount3_ = 0;
        m_utilCount4_ = 0;

        m_utilFrenchStart_ = 0;
        m_utilFrenchEnd_ = 0;

        m_srcUtilContOffset_ = 0;
        m_tgtUtilContOffset_ = 0;

        m_srcUtilOffset_ = 0;
        m_tgtUtilOffset_ = 0;
    }
}

// private methods -------------------------------------------------------

/**
 * Rebuilds this collator from a rule string: starts from the UCA data, lets a CollationParsedRuleBuilder apply
 * the rules to this collator, records the rules, then runs init() and buildPermutationTable().
 *
 * @param rules
 *            collation rules to build from
 * @throws Exception
 *             propagated from the rule builder or from the initialisation steps
 */
private void init(String rules) throws Exception {
    setWithUCAData();
    new CollationParsedRuleBuilder(rules).setRules(this);
    m_rules_ = rules;
    init();
    buildPermutationTable();
}

/**
 * Compares source and target from the given offset, level by level, stopping at the first level that yields a
 * non-zero result.
 *
 * @param source
 *            first string
 * @param target
 *            second string
 * @param offset
 *            index at which the comparison starts in both strings
 * @param buffer
 *            scratch state; it is reset on entry
 * @return the first non-zero level result, or 0 if no examined level distinguishes the strings
 */
private final int compareRegular(String source, String target, int offset, CollationBuffer buffer) {
    buffer.resetBuffers();

    // setting up the collator parameters
    final int strength = getStrength();
    buffer.m_utilCompare0_ = m_isCaseLevel_;
    // m_utilCompare1_ = true;
    buffer.m_utilCompare2_ = strength >= SECONDARY;
    buffer.m_utilCompare3_ = strength >= TERTIARY;
    buffer.m_utilCompare4_ = strength >= QUATERNARY;
    buffer.m_utilCompare5_ = strength == IDENTICAL;

    final boolean useFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
    final boolean useShifted4 = m_isAlternateHandlingShifted_ && buffer.m_utilCompare4_;
    final boolean useHiragana4 = m_isHiragana4_ && buffer.m_utilCompare4_;

    // With both Hiragana and shifted quaternaries requested, the comparison is delegated to sort keys.
    if (useHiragana4 && useShifted4) {
        return compareBySortKeys(source.substring(offset), target.substring(offset), buffer);
    }

    // This is the lowest primary value that will not be ignored if shifted
    int lowestPrimary = 0;
    if (m_isAlternateHandlingShifted_) {
        lowestPrimary = m_variableTopValue_ << 16;
    }

    buffer.m_srcUtilCEBufferSize_ = 0;
    buffer.m_tgtUtilCEBufferSize_ = 0;
    final int primaryResult = doPrimaryCompare(useHiragana4, lowestPrimary, source, target, offset, buffer);
    if (buffer.m_srcUtilCEBufferSize_ == -1 && buffer.m_tgtUtilCEBufferSize_ == -1) {
        // since the cebuffer is cleared when we have determined that
        // either source is greater than target or vice versa, the return
        // result is the comparison result and not the hiragana result
        return primaryResult;
    }
    // Otherwise the primary phase handed back the Hiragana result, which is only consulted at the end.
    final int hiraganaResult = primaryResult;

    if (buffer.m_utilCompare2_) {
        final int secondary = doSecondaryCompare(useFrench, buffer);
        if (secondary != 0) {
            return secondary;
        }
    }

    // doing the case bit
    if (buffer.m_utilCompare0_) {
        final int caseLevel = doCaseCompare(buffer);
        if (caseLevel != 0) {
            return caseLevel;
        }
    }

    // Tertiary level
    if (buffer.m_utilCompare3_) {
        final int tertiary = doTertiaryCompare(buffer);
        if (tertiary != 0) {
            return tertiary;
        }
    }

    if (useShifted4) { // checkQuad
        final int quaternary = doQuaternaryCompare(lowestPrimary, buffer);
        if (quaternary != 0) {
            return quaternary;
        }
    } else if (useHiragana4 && hiraganaResult != 0) {
        // If we're fine on quaternaries, we might be different
        // on Hiragana. This, however, might fail us in shifted.
        return hiraganaResult;
    }

    // For IDENTICAL comparisons, we use a bitwise character comparison
    // as a tiebreaker if all else is equal.
    // Getting here should be quite rare - strings are not identical -
    // that is checked first, but compared == through all other checks.
    if (buffer.m_utilCompare5_) {
        return doIdenticalCompare(source, target, offset, true);
    }
    return 0;
}

// Is this primary weight compressible?
// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
static boolean isCompressible(int primary1) {
    // Inclusive range check on the lead byte.
    return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary;
}

/**
 * Gets the 2 bytes of primary order and adds it to the primary byte array.
 * <p>
 * When {@code doShift} is set, the two primary bytes are written to the quaternary byte array instead, after
 * any pending run of common quaternaries ({@code buffer.m_utilCount4_}) has been flushed. Otherwise the bytes
 * go to the primary byte array, with the lead byte omitted while it equals {@code leadPrimary}.
 *
 * @param ce
 *            current ce
 * @param notIsContinuation
 *            flag indicating if the current bytes belong to a continuation ce
 * @param doShift
 *            flag indicating if ce is to be shifted
 * @param leadPrimary
 *            lead primary used for compression
 * @param commonBottom4
 *            common byte value for Quaternary
 * @param bottomCount4
 *            smallest byte value for Quaternary
 * @param buffer
 *            collation buffer temporary state
 * @return the new lead primary for compression
 */
private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary,
        int commonBottom4, int bottomCount4, CollationBuffer buffer) {

    // Note that ce itself is shifted right by 16 here, so p1 below is the top byte.
    int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
    int p1 = ce >>> 8; // comparison
    // The unpermuted lead byte is kept for the isCompressible() test below.
    int originalP1 = p1;
    if (notIsContinuation) {
        if (m_leadBytePermutationTable_ != null) {
            // 0xff & ... reads the table entry as an unsigned byte.
            p1 = 0xff & m_leadBytePermutationTable_[p1];
        }
    }

    if (doShift) {
        if (buffer.m_utilCount4_ > 0) {
            // Flush the pending run of common quaternaries before writing the shifted bytes.
            while (buffer.m_utilCount4_ > bottomCount4) {
                buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_,
                        (byte) (commonBottom4 + bottomCount4));
                buffer.m_utilBytesCount4_++;
                buffer.m_utilCount4_ -= bottomCount4;
            }
            buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_,
                    (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
            buffer.m_utilBytesCount4_++;
            buffer.m_utilCount4_ = 0;
        }
        // dealing with a variable and we're treating them as shifted
        // This is a shifted ignorable
        if (p1 != 0) {
            // we need to check this since we could be in continuation
            buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p1);
            buffer.m_utilBytesCount4_++;
        }
        if (p2 != 0) {
            buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p2);
            buffer.m_utilBytesCount4_++;
        }
    } else {
        // Note: This code assumes that the table is well built
        // i.e. not having 0 bytes where they are not supposed to be.
        // Usually, we'll have non-zero primary1 & primary2, except
        // in cases of LatinOne and friends, when primary2 will be
        // regular and simple sortkey calc
        if (p1 != CollationElementIterator.IGNORABLE) {
            if (notIsContinuation) {
                if (leadPrimary == p1) {
                    // Same lead byte as the previous primary: only the second byte is written.
                    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
                    buffer.m_utilBytesCount1_++;
                } else {
                    if (leadPrimary != 0) {
                        // Close the previous compressed run with a marker byte chosen by the
                        // direction of the lead-byte change.
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_));
                        buffer.m_utilBytesCount1_++;
                    }
                    if (p2 == CollationElementIterator.IGNORABLE) {
                        // one byter, not compressed
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) p1);
                        buffer.m_utilBytesCount1_++;
                        leadPrimary = 0;
                    } else if (isCompressible(originalP1)) {
                        // compress
                        leadPrimary = p1;
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) p1);
                        buffer.m_utilBytesCount1_++;
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) p2);
                        buffer.m_utilBytesCount1_++;
                    } else {
                        // Two bytes, but the lead byte is not compressible: write both, no run is started.
                        leadPrimary = 0;
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) p1);
                        buffer.m_utilBytesCount1_++;
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) p2);
                        buffer.m_utilBytesCount1_++;
                    }
                }
            } else {
                // continuation, add primary to the key, no compression
                buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
                buffer.m_utilBytesCount1_++;
                if (p2 != CollationElementIterator.IGNORABLE) {
                    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                            (byte) p2); // second part
                    buffer.m_utilBytesCount1_++;
                }
            }
        }
    }
    return leadPrimary;
}

/**
 * Gets the secondary byte and adds it to the secondary byte array
 *
 * @param ce current ce
 * @param notIsContinuation flag indicating if the current bytes belong to a
continuation ce
 * @param doFrench flag indicator if french sort is to be performed
 * @param buffer collation buffer temporary state
 */
private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench,
        CollationBuffer buffer) {
    int s = (ce >> 8) & LAST_BYTE_MASK_; // int for comparison
    // A zero secondary contributes nothing.
    if (s != 0) {
        if (!doFrench) {
            // This is compression code.
            if (s == COMMON_2_ && notIsContinuation) {
                // Common secondaries are only counted; the run is written when it ends.
                buffer.m_utilCount2_++;
            } else {
                if (buffer.m_utilCount2_ > 0) {
                    // Flush the pending run: counted down from COMMON_TOP_2_ when the next weight is
                    // above common, counted up from COMMON_BOTTOM_2_ otherwise.
                    if (s > COMMON_2_) { // not necessary for 4th level.
                        while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
                            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                    (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
                            buffer.m_utilBytesCount2_++;
                            buffer.m_utilCount2_ -= TOP_COUNT_2_;
                        }
                        buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
                        buffer.m_utilBytesCount2_++;
                    } else {
                        while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
                            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                    (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                            buffer.m_utilBytesCount2_++;
                            buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
                        }
                        buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                                (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
                        buffer.m_utilBytesCount2_++;
                    }
                    buffer.m_utilCount2_ = 0;
                }
                buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
                buffer.m_utilBytesCount2_++;
            }
        } else {
            // French: every secondary is stored uncompressed here.
            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
            buffer.m_utilBytesCount2_++;
            // Do the special handling for French secondaries
            // We need to get continuation elements and do intermediate
            // restore
            // abc1c2c3de with french secondaries need to be edc1c2c3ba
            // NOT edc3c2c1ba
            if (notIsContinuation) {
                if (buffer.m_utilFrenchStart_ != -1) {
                    // reverse secondaries from frenchStartPtr up to
                    // frenchEndPtr
                    reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
                    buffer.m_utilFrenchStart_ = -1;
                }
            } else {
                if (buffer.m_utilFrenchStart_ == -1) {
                    // First continuation of a sequence: the range starts at the byte written
                    // just before this one.
                    buffer.m_utilFrenchStart_ = buffer.m_utilBytesCount2_ - 2;
                }
                buffer.m_utilFrenchEnd_ = buffer.m_utilBytesCount2_ - 1;
            }
        }
    }
}

/**
 * Reverses, in place, the bytes of the argument buffer between two indices.
 *
 * @param buffer to reverse
 * @param start index in buffer to start from (inclusive)
 * @param end index in buffer to end at (inclusive)
 */
private static void reverseBuffer(byte buffer[], int start, int end) {
    // Swap from both ends towards the middle.
    while (start < end) {
        byte b = buffer[start];
        buffer[start++] = buffer[end];
        buffer[end--] = b;
    }
}

/**
 * Insert the case shifting byte if required: when {@code caseshift} is 0 a new case byte
 * ({@code SORT_CASE_BYTE_START_}) is appended and the shift restarts at {@code SORT_CASE_SHIFT_START_}.
 *
 * @param caseshift value
 * @param buffer collation buffer temporary state
 * @return new caseshift value
 */
private final int doCaseShift(int caseshift, CollationBuffer buffer) {
    if (caseshift == 0) {
        buffer.m_utilBytes0_ = append(buffer.m_utilBytes0_, buffer.m_utilBytesCount0_,
                SORT_CASE_BYTE_START_);
        buffer.m_utilBytesCount0_++;
        caseshift = SORT_CASE_SHIFT_START_;
    }
    return caseshift;
}

/**
 * Performs the casing sort: packs the case bits of a tertiary byte into the last case byte of the buffer.
 *
 * @param tertiary byte in ints for easy comparison
 * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
 * @param caseshift bit-position counter within the current case byte; it is decremented before each bit is
 *            written, and a value of 0 makes {@link #doCaseShift} start a new byte
 * @param buffer collation buffer temporary state
 * @return the new value of case shift
 */
private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift,
        CollationBuffer buffer) {
    caseshift = doCaseShift(caseshift, buffer);

    if (notIsContinuation && tertiary != 0) {
        // The case information is the top two bits of the tertiary byte.
        byte casebits = (byte) (tertiary & 0xC0);
        if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
            if (casebits == 0) {
                // No case bits: a single 1 bit is written.
                buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
            } else {
                // second bit
                // The first position is consumed without setting a bit (caseshift - 1), then
                // bit 6 of the tertiary is written.
                caseshift = doCaseShift(caseshift - 1, buffer);
                buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
            }
        } else {
            if (casebits != 0) {
                buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
                // second bit
                caseshift = doCaseShift(caseshift, buffer);
                buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
            } else {
                // No case bits: the position is consumed and left as 0.
                caseshift--;
            }
        }
    }

    return caseshift;
}

/**
 * Gets the tertiary byte and adds it to the tertiary byte array. Runs of the common tertiary
 * ({@code m_common3_}) on non-continuation CEs are only counted in {@code buffer.m_utilCount3_} and written as
 * run bytes when a different tertiary arrives.
 *
 * @param tertiary byte in int for easy comparison
 * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
 * @param buffer collation buffer temporary state
 */
private final void doTertiaryBytes(int tertiary, boolean notIsContinuation, CollationBuffer buffer) {
    if (tertiary != 0) {
        // This is compression code.
        // sequence size check is included in the if clause
        if (tertiary == m_common3_ && notIsContinuation) {
            buffer.m_utilCount3_++;
        } else {
            int common3 = m_common3_ & LAST_BYTE_MASK_;
            // Adjust the tertiary by m_addition3_, depending on which side of the common value it
            // falls and on the common-value mode in use.
            if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
                tertiary += m_addition3_;
            } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) {
                tertiary -= m_addition3_;
            }
            if (buffer.m_utilCount3_ > 0) {
                // Flush the pending run: counted down from m_top3_ when the next weight is above
                // common, counted up from m_bottom3_ otherwise.
                if (tertiary > common3) {
                    while (buffer.m_utilCount3_ > m_topCount3_) {
                        buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                                (byte) (m_top3_ - m_topCount3_));
                        buffer.m_utilBytesCount3_++;
                        buffer.m_utilCount3_ -= m_topCount3_;
                    }
                    buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                            (byte) (m_top3_ - (buffer.m_utilCount3_ - 1)));
                    buffer.m_utilBytesCount3_++;
                } else {
                    while (buffer.m_utilCount3_ > m_bottomCount3_) {
                        buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                                (byte) (m_bottom3_ + m_bottomCount3_));
                        buffer.m_utilBytesCount3_++;
                        buffer.m_utilCount3_ -= m_bottomCount3_;
                    }
                    buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                            (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
                    buffer.m_utilBytesCount3_++;
                }
                buffer.m_utilCount3_ = 0;
            }
            buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) tertiary);
            buffer.m_utilBytesCount3_++;
        }
    }
}

/**
 * Gets the Quaternary byte and adds it to the Quaternary
byte array * * @param isCodePointHiragana flag indicator if the previous codepoint we dealt with was Hiragana * @param commonBottom4 smallest common Quaternary byte * @param bottomCount4 smallest Quaternary byte * @param hiragana4 hiragana Quaternary byte * @param buffer collation buffer temporary state */ private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4, byte hiragana4, CollationBuffer buffer) { if (isCodePointHiragana) { // This was Hiragana, need to note it if (buffer.m_utilCount4_ > 0) { // Close this part while (buffer.m_utilCount4_ > bottomCount4) { buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4)); buffer.m_utilBytesCount4_++; buffer.m_utilCount4_ -= bottomCount4; } buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1))); buffer.m_utilBytesCount4_++; buffer.m_utilCount4_ = 0; } buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, hiragana4); // Add the Hiragana buffer.m_utilBytesCount4_++; } else { // This wasn't Hiragana, so we can continue adding stuff buffer.m_utilCount4_++; } } /** * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc. 
* * @param source normalized string * @param doFrench flag indicator if special handling of French has to be done * @param hiragana4 offset for Hiragana quaternary * @param commonBottom4 smallest common quaternary byte * @param bottomCount4 smallest quaternary byte * @param buffer collation buffer temporary state */ private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4, int bottomCount4, CollationBuffer buffer) { int backupDecomposition = getDecomposition(); // TODO- hack fix around frozen state - stop self-modification internalSetDecomposition(NO_DECOMPOSITION); // have to revert to backup later buffer.m_srcUtilIter_.setText(source); buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_); buffer.m_utilFrenchStart_ = -1; buffer.m_utilFrenchEnd_ = -1; boolean doShift = false; boolean notIsContinuation = false; int leadPrimary = 0; // int for easier comparison int caseShift = 0; while (true) { int ce = buffer.m_srcUtilColEIter_.next(); if (ce == CollationElementIterator.NULLORDER) { break; } if (ce == CollationElementIterator.IGNORABLE) { continue; } notIsContinuation = !isContinuation(ce); boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0; // actually we can just check that the first byte is 0 // generation stuffs the order left first boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_; doShift = (m_isAlternateHandlingShifted_ && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0 || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable)); if (doShift && isPrimaryByteIgnorable) { // amendment to the UCA says that primary ignorables and other // ignorables should be removed if following a shifted code // point // if we were shifted and we got an ignorable code point // we should just completely ignore it continue; } leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, 
bottomCount4, buffer); if (doShift) { continue; } if (buffer.m_utilCompare2_) { doSecondaryBytes(ce, notIsContinuation, doFrench, buffer); } int t = ce & LAST_BYTE_MASK_; if (!notIsContinuation) { t = ce & CE_REMOVE_CONTINUATION_MASK_; } if (buffer.m_utilCompare0_ && (!isPrimaryByteIgnorable || buffer.m_utilCompare2_)) { // do the case level if we need to do it. We don't want to calculate // case level for primary ignorables if we have only primary strength and case level // otherwise we would break well formedness of CEs caseShift = doCaseBytes(t, notIsContinuation, caseShift, buffer); } else if (notIsContinuation) { t ^= m_caseSwitch_; } t &= m_mask3_; if (buffer.m_utilCompare3_) { doTertiaryBytes(t, notIsContinuation, buffer); } if (buffer.m_utilCompare4_ && notIsContinuation) { // compare quad doQuaternaryBytes(buffer.m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4, buffer); } } // TODO - hack fix around frozen state - stop self-modification internalSetDecomposition(backupDecomposition); // reverts to original if (buffer.m_utilFrenchStart_ != -1) { // one last round of checks reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_); } } /** * From the individual strength byte results the final compact sortkey will be calculated. 
*
 * <p>
 * The level byte arrays are appended to the primary byte array in the order secondary, case, tertiary,
 * quaternary, identical (each only when its level switch on the buffer is set), followed by a 0 byte.
 *
 * @param source text string
 * @param doFrench flag indicating that special handling of French has to be done
 * @param commonBottom4 smallest common quaternary byte
 * @param bottomCount4 smallest quaternary byte
 * @param key output RawCollationKey to store results, key cannot be null
 * @param buffer collation buffer temporary state
 */
private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4,
        RawCollationKey key, CollationBuffer buffer) {
    // we have done all the CE's, now let's put them together to form
    // a key
    if (buffer.m_utilCompare2_) {
        doSecondary(doFrench, buffer);
    }
    // adding case level should be independent of secondary level
    if (buffer.m_utilCompare0_) {
        doCase(buffer);
    }
    // Quaternary is only added under tertiary, and identical only under quaternary.
    if (buffer.m_utilCompare3_) {
        doTertiary(buffer);
        if (buffer.m_utilCompare4_) {
            doQuaternary(commonBottom4, bottomCount4, buffer);
            if (buffer.m_utilCompare5_) {
                doIdentical(source, buffer);
            }

        }
    }
    // Terminating 0 byte of the key.
    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) 0);
    buffer.m_utilBytesCount1_++;

    key.set(buffer.m_utilBytes1_, 0, buffer.m_utilBytesCount1_);
}

/**
 * Packs the French bytes: walks the secondary byte array from its last byte to its first and appends the
 * bytes to the primary byte array, run-length compressing the common secondary on the way.
 *
 * @param buffer collation buffer temporary state
 */
private static final void doFrench(CollationBuffer buffer) {
    for (int i = 0; i < buffer.m_utilBytesCount2_; i++) {
        // Read backwards.
        byte s = buffer.m_utilBytes2_[buffer.m_utilBytesCount2_ - i - 1];
        // This is compression code.
        if (s == COMMON_2_) {
            ++buffer.m_utilCount2_;
        } else {
            if (buffer.m_utilCount2_ > 0) {
                // getting the unsigned value
                if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
                    // not necessary for 4th level.
                    while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
                        buffer.m_utilBytesCount1_++;
                        buffer.m_utilCount2_ -= TOP_COUNT_2_;
                    }
                    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                            (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
                    buffer.m_utilBytesCount1_++;
                } else {
                    while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
                        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                                (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                        buffer.m_utilBytesCount1_++;
                        buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
                    }
                    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                            (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
                    buffer.m_utilBytesCount1_++;
                }
                buffer.m_utilCount2_ = 0;
            }
            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, s);
            buffer.m_utilBytesCount1_++;
        }
    }
    // A run still pending at the end is written counting up from COMMON_BOTTOM_2_.
    if (buffer.m_utilCount2_ > 0) {
        while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
            buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                    (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
            buffer.m_utilBytesCount1_++;
            buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
        }
        buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
        buffer.m_utilBytesCount1_++;
    }
}

/**
 * Compacts the secondary bytes and stores them into the primary array
 *
 * @param doFrench flag indicator that French has to be handled specially
 * @param buffer collation buffer temporary state
 */
private static final void doSecondary(boolean doFrench, CollationBuffer buffer) {
    // Write out any run of common secondaries that is still pending.
    if (buffer.m_utilCount2_ > 0) {
        while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
            buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                    (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
            buffer.m_utilBytesCount2_++;
            buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
        }
        buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
                (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
        buffer.m_utilBytesCount2_++;
    }

    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
    buffer.m_utilBytesCount1_++;

    if (doFrench) { // do the reverse copy
        doFrench(buffer);
    } else {
        // Grow the primary array if the secondary bytes would not fit, then bulk copy.
        if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount2_) {
            buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                    buffer.m_utilBytesCount2_);
        }
        System.arraycopy(buffer.m_utilBytes2_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                buffer.m_utilBytesCount2_);
        buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount2_;
    }
}

/**
 * Increase buffer size: allocates an array that is {@code incrementsize} longer than the argument and copies
 * the first {@code size} bytes into it.
 *
 * @param buffer array of bytes
 * @param size number of leading bytes to copy into the new array
 * @param incrementsize size to increase
 * @return the new buffer
 */
private static final byte[] increase(byte buffer[], int size, int incrementsize) {
    byte result[] = new byte[buffer.length + incrementsize];
    System.arraycopy(buffer, 0, result, 0, size);
    return result;
}

/**
 * Increase buffer size: allocates an array that is {@code incrementsize} longer than the argument and copies
 * the first {@code size} ints into it.
 *
 * @param buffer array of ints
 * @param size number of leading ints to copy into the new array
 * @param incrementsize size to increase
 * @return the new buffer
 */
private static final int[] increase(int buffer[], int size, int incrementsize) {
    int result[] = new int[buffer.length + incrementsize];
    System.arraycopy(buffer, 0, result, 0, size);
    return result;
}

/**
 * Compacts the case bytes and stores them into the primary array
 *
 * @param buffer collation buffer temporary state
 */
private static final void doCase(CollationBuffer buffer) {
    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
    buffer.m_utilBytesCount1_++;
    // Grow the primary array if the case bytes would not fit, then bulk copy.
    if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount0_) {
        buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                buffer.m_utilBytesCount0_);
    }
    System.arraycopy(buffer.m_utilBytes0_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
            buffer.m_utilBytesCount0_);
    buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount0_;
}

/**
 * Compacts the tertiary bytes and stores them into the primary array
 *
 * @param buffer collation buffer temporary state
 */
private final void doTertiary(CollationBuffer buffer) {
    // Write out any run of common tertiaries that is still pending.
    if (buffer.m_utilCount3_ > 0) {
        if (m_common3_ != COMMON_BOTTOM_3_) {
            // NOTE(review): this branch loops on ">=" and writes (m_top3_ - count), unlike the
            // mid-stream flush in doTertiaryBytes, which uses ">" and (count - 1) - presumably
            // intentional for the end-of-string run; confirm before changing.
            while (buffer.m_utilCount3_ >= m_topCount3_) {
                buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                        (byte) (m_top3_ - m_topCount3_));
                buffer.m_utilBytesCount3_++;
                buffer.m_utilCount3_ -= m_topCount3_;
            }
            buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                    (byte) (m_top3_ - buffer.m_utilCount3_));
            buffer.m_utilBytesCount3_++;
        } else {
            while (buffer.m_utilCount3_ > m_bottomCount3_) {
                buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                        (byte) (m_bottom3_ + m_bottomCount3_));
                buffer.m_utilBytesCount3_++;
                buffer.m_utilCount3_ -= m_bottomCount3_;
            }
            buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
                    (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
            buffer.m_utilBytesCount3_++;
        }
    }
    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
    buffer.m_utilBytesCount1_++;
    // Grow the primary array if the tertiary bytes would not fit, then bulk copy.
    if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount3_) {
        buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                buffer.m_utilBytesCount3_);
    }
    System.arraycopy(buffer.m_utilBytes3_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
            buffer.m_utilBytesCount3_);
    buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount3_;
}

/**
 * Compacts the quaternary bytes and stores them into the primary array
 *
 * @param commonbottom4 smallest common quaternary byte; base value of the run bytes written here
 * @param bottomcount4 largest run length written as a single byte
 * @param buffer collation buffer temporary state
 */
private final void doQuaternary(int commonbottom4, int bottomcount4, CollationBuffer buffer) {
    // Write out any run of common quaternaries that is still pending.
    if (buffer.m_utilCount4_ > 0) {
        while (buffer.m_utilCount4_ > bottomcount4) {
            buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_,
                    (byte) (commonbottom4 + bottomcount4));
            buffer.m_utilBytesCount4_++;
            buffer.m_utilCount4_ -= bottomcount4;
        }
        buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_,
                (byte) (commonbottom4 + (buffer.m_utilCount4_ - 1)));
        buffer.m_utilBytesCount4_++;
    }
    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
    buffer.m_utilBytesCount1_++;
    // Grow the primary array if the quaternary bytes would not fit, then bulk copy.
    if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount4_) {
        buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
                buffer.m_utilBytesCount4_);
    }
    System.arraycopy(buffer.m_utilBytes4_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
            buffer.m_utilBytesCount4_);
    buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount4_;
}

/**
 * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer.
 *
 * @param source text string
 * @param buffer collation buffer temporary state
 */
private static final void doIdentical(String source, CollationBuffer buffer) {
    int isize = BOCU.getCompressionLength(source);
    buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
    buffer.m_utilBytesCount1_++;
    // Make room for the compressed string before BOCU writes straight into the array.
    if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + isize) {
        buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, 1 + isize);
    }
    // compress() returns the new count, which replaces the old one.
    buffer.m_utilBytesCount1_ = BOCU.compress(source, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_);
}

/**
 * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the
 * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence.
 *
 * @param source
 *            string
 * @param target
 *            string
 * @return offset of the first unmatched characters in source and target.
*/ private final int getFirstUnmatchedOffset(String source, String target) { int result = 0; int slength = source.length(); int tlength = target.length(); int minlength = slength; if (minlength > tlength) { minlength = tlength; } while (result < minlength && source.charAt(result) == target.charAt(result)) { result++; } if (result > 0) { // There is an identical portion at the beginning of the two // strings. If the identical portion ends within a contraction or a // combining character sequence, back up to the start of that // sequence. char schar = 0; char tchar = 0; if (result < minlength) { schar = source.charAt(result); // first differing chars tchar = target.charAt(result); } else { schar = source.charAt(minlength - 1); if (isUnsafe(schar)) { tchar = schar; } else if (slength == tlength) { return result; } else if (slength < tlength) { tchar = target.charAt(result); } else { schar = source.charAt(result); } } if (isUnsafe(schar) || isUnsafe(tchar)) { // We are stopped in the middle of a contraction or combining // sequence. // Look backwards for the part of the string for the start of // the sequence // It doesn't matter which string we scan, since they are the // same in this region. do { result--; } while (result > 0 && isUnsafe(source.charAt(result))); } } return result; } /** * Appending an byte to an array of bytes and increases it if we run out of space * * @param array * of byte arrays * @param appendindex * index in the byte array to append * @param value * to append * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned */ private static final byte[] append(byte array[], int appendindex, byte value) { try { array[appendindex] = value; } catch (ArrayIndexOutOfBoundsException e) { array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_); array[appendindex] = value; } return array; } /** * This is a trick string compare function that goes in and uses sortkeys to compare. 
It is used when compare gets * in trouble and needs to bail out. * * @param source text string * @param target text string * @param buffer collation buffer temporary state */ private final int compareBySortKeys(String source, String target, CollationBuffer buffer) { buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_); // this method is very seldom called RawCollationKey targetkey = getRawCollationKey(target, null); return buffer.m_utilRawCollationKey_.compareTo(targetkey); } /** * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens. * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer * will not be cleared and the return result will be the hiragana result. 
*
 * @param doHiragana4 flag indicator that Hiragana Quaternary has to be observed
 * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
 * @param source text string
 * @param target text string
 * @param textoffset offset in text to start the comparison
 * @param buffer collation buffer temporary state
 * @return comparison result if a primary difference is found, otherwise hiragana result
 */
private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target,
        int textoffset, CollationBuffer buffer) {
    // Preparing the context objects for iterating over strings
    buffer.m_srcUtilIter_.setText(source);
    buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_, textoffset);
    buffer.m_tgtUtilIter_.setText(target);
    buffer.m_tgtUtilColEIter_.setText(buffer.m_tgtUtilIter_, textoffset);

    // Non shifted primary processing is quite simple
    if (!m_isAlternateHandlingShifted_) {
        int hiraganaresult = 0;
        while (true) {
            int sorder = 0;
            // We fetch CEs until we hit a non ignorable primary or end.
            // Every CE fetched, ignorable or not, is stored for the later levels.
            do {
                sorder = buffer.m_srcUtilColEIter_.next();
                buffer.m_srcUtilCEBuffer_ = append(buffer.m_srcUtilCEBuffer_, buffer.m_srcUtilCEBufferSize_,
                        sorder);
                buffer.m_srcUtilCEBufferSize_++;
                sorder &= CE_PRIMARY_MASK_;
            } while (sorder == CollationElementIterator.IGNORABLE);

            int torder = 0;
            do {
                torder = buffer.m_tgtUtilColEIter_.next();
                buffer.m_tgtUtilCEBuffer_ = append(buffer.m_tgtUtilCEBuffer_, buffer.m_tgtUtilCEBufferSize_,
                        torder);
                buffer.m_tgtUtilCEBufferSize_++;
                torder &= CE_PRIMARY_MASK_;
            } while (torder == CollationElementIterator.IGNORABLE);

            // Remap the lead (top) byte of both primaries through the permutation table, if any.
            // NOTE(review): sorder has already been masked with CE_PRIMARY_MASK_ at this point, and
            // only sorder's continuation state gates the remapping of both values - confirm that
            // this is intended.
            if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) {
                sorder = (m_leadBytePermutationTable_[((sorder >> 24) + 256) % 256] << 24)
                        | (sorder & 0x00FFFFFF);
                torder = (m_leadBytePermutationTable_[((torder >> 24) + 256) % 256] << 24)
                        | (torder & 0x00FFFFFF);
            }

            // if both primaries are the same
            if (sorder == torder) {
                // and there are no more CEs, we advance to the next level
                // see if we are at the end of either string
                if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                    if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1]
                            != CollationElementIterator.NULLORDER) {
                        return -1;
                    }
                    break;
                } else if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                    return 1;
                }
                // Only the first Hiragana mismatch is remembered.
                if (doHiragana4 && hiraganaresult == 0
                        && buffer.m_srcUtilColEIter_.m_isCodePointHiragana_
                                != buffer.m_tgtUtilColEIter_.m_isCodePointHiragana_) {
                    if (buffer.m_srcUtilColEIter_.m_isCodePointHiragana_) {
                        hiraganaresult = -1;
                    } else {
                        hiraganaresult = 1;
                    }
                }
            } else {
                // if two primaries are different, we are done
                return endPrimaryCompare(sorder, torder, buffer);
            }
        }
        // no primary difference... do the rest from the buffers
        return hiraganaresult;
    } else { // shifted - do a slightly more complicated processing :)
        while (true) {
            int sorder = getPrimaryShiftedCompareCE(buffer.m_srcUtilColEIter_, lowestpvalue, true, buffer);
            int torder = getPrimaryShiftedCompareCE(buffer.m_tgtUtilColEIter_, lowestpvalue, false, buffer);
            if (sorder == torder) {
                if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                    break;
                } else {
                    continue;
                }
            } else {
                return endPrimaryCompare(sorder, torder, buffer);
            }
        } // no primary difference... do the rest from the buffers
    }
    return 0;
}

/**
 * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder
 * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time.
 *
 * @param sorder source strength order
 * @param torder target strength order
 * @param buffer collation buffer temporary state
 * @return the comparison result of sorder and torder
 */
private static final int endPrimaryCompare(int sorder, int torder, CollationBuffer buffer) {
    // if we reach here, the ce offset accessed is the last ce
    // appended to the buffer
    boolean isSourceNullOrder = (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1]
            == CollationElementIterator.NULLORDER);
    boolean isTargetNullOrder = (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1]
            == CollationElementIterator.NULLORDER);
    // A size of -1 is how the CE buffers are marked as cleared.
    buffer.m_srcUtilCEBufferSize_ = -1;
    buffer.m_tgtUtilCEBufferSize_ = -1;
    // A string that ran out of CEs sorts before the other one.
    if (isSourceNullOrder) {
        return -1;
    }
    if (isTargetNullOrder) {
        return 1;
    }
    // getting rid of the sign
    sorder >>>= CE_PRIMARY_SHIFT_;
    torder >>>= CE_PRIMARY_SHIFT_;
    if (sorder < torder) {
        return -1;
    }
    return 1;
}

/**
 * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce.
*
 * @param coleiter collation element iterator
 * @param lowestpvalue lowest primary shifted value that will not be ignored
 * @param isSrc true to read and update the source CE buffer of {@code buffer}, false for the target CE buffer
 * @param buffer collation buffer temporary state
 * @return result next modified ce
 */
private static final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue,
        boolean isSrc, CollationBuffer buffer) {
    boolean shifted = false;
    int result = CollationElementIterator.IGNORABLE;
    // Work on local copies of the selected CE buffer; they are written back after the loop.
    int cebuffer[] = buffer.m_srcUtilCEBuffer_;
    int cebuffersize = buffer.m_srcUtilCEBufferSize_;
    if (!isSrc) {
        cebuffer = buffer.m_tgtUtilCEBuffer_;
        cebuffersize = buffer.m_tgtUtilCEBufferSize_;
    }
    while (true) {
        result = coleiter.next();
        if (result == CollationElementIterator.NULLORDER) {
            // End of text: the NULLORDER marker is stored too.
            cebuffer = append(cebuffer, cebuffersize, result);
            cebuffersize++;
            break;
        } else if (result == CollationElementIterator.IGNORABLE
                || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
            // UCA amendment - ignore ignorables that follow shifted code
            // points
            continue;
        } else if (isContinuation(result)) {
            if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
                // There is primary value
                if (shifted) {
                    result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_;
                    // preserve interesting continuation
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize++;
                    continue;
                } else {
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize++;
                    break;
                }
            } else { // Just lower level values
                if (!shifted) {
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize++;
                }
            }
        } else { // regular
            if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) {
                // Primary above the shift threshold: this is the CE to compare.
                cebuffer = append(cebuffer, cebuffersize, result);
                cebuffersize++;
                break;
            } else {
                if ((result & CE_PRIMARY_MASK_) != 0) {
                    // Non-zero primary at or below the threshold: only its primary bits are stored.
                    shifted = true;
                    result &= CE_PRIMARY_MASK_;
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize++;
                    continue;
                } else {
                    // Zero primary: stored unchanged, and the shifted state ends.
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize++;
                    shifted = false;
                    continue;
                }
            }
        }
    }
    // Write the (possibly reallocated) array and its new size back to the buffer.
    if (isSrc) {
        buffer.m_srcUtilCEBuffer_ = cebuffer;
        buffer.m_srcUtilCEBufferSize_ = cebuffersize;
    } else {
        buffer.m_tgtUtilCEBuffer_ = cebuffer;
        buffer.m_tgtUtilCEBufferSize_ = cebuffersize;
    }
    result &= CE_PRIMARY_MASK_;
    return result;
}

/**
 * Appending an int to an array of ints and increases it if we run out of space
 *
 * @param array
 *            of int arrays
 * @param appendindex
 *            index at which value will be appended
 * @param value
 *            to append
 * @return array if size is not increased, otherwise a new array will be returned
 */
private static final int[] append(int array[], int appendindex, int value) {
    // Grows by CE_BUFFER_SIZE_ as soon as appendindex reaches the last slot.
    if (appendindex + 1 >= array.length) {
        array = increase(array, appendindex, CE_BUFFER_SIZE_);
    }
    array[appendindex] = value;
    return array;
}

/**
 * Does secondary strength comparison based on the collected ces.
 *
 * @param doFrench flag indicates if French ordering is to be done
 * @param buffer collation buffer temporary state
 * @return the secondary strength comparison result
 */
private static final int doSecondaryCompare(boolean doFrench, CollationBuffer buffer) {
    // now, we're gonna reexamine collected CEs
    if (!doFrench) { // normal
        int soffset = 0;
        int toffset = 0;
        while (true) {
            int sorder = CollationElementIterator.IGNORABLE;
            while (sorder == CollationElementIterator.IGNORABLE) {
                sorder = buffer.m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_;
            }
            int torder = CollationElementIterator.IGNORABLE;
            while (torder == CollationElementIterator.IGNORABLE) {
                torder = buffer.m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_;
            }

            if (sorder == torder) {
                if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
                    if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
                        return -1;
                    }
                    break;
                } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
                    return 1;
                }
            } else {
                if (buffer.m_srcUtilCEBuffer_[soffset - 1] ==
CollationElementIterator.NULLORDER) { return -1; } if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? -1 : 1; } } } else { // do the French buffer.m_srcUtilContOffset_ = 0; buffer.m_tgtUtilContOffset_ = 0; buffer.m_srcUtilOffset_ = buffer.m_srcUtilCEBufferSize_ - 2; buffer.m_tgtUtilOffset_ = buffer.m_tgtUtilCEBufferSize_ - 2; while (true) { int sorder = getSecondaryFrenchCE(true, buffer); int torder = getSecondaryFrenchCE(false, buffer); if (sorder == torder) { if ((buffer.m_srcUtilOffset_ < 0 && buffer.m_tgtUtilOffset_ < 0) || (buffer.m_srcUtilOffset_ >= 0 && buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) { break; } } else { return (sorder < torder) ? -1 : 1; } } } return 0; } /** * Calculates the next secondary french CE. * * @param isSrc flag indicator if we are calculating the src ces * @param buffer collation buffer temporary state * @return result next modified ce */ private static final int getSecondaryFrenchCE(boolean isSrc, CollationBuffer buffer) { int result = CollationElementIterator.IGNORABLE; int offset = buffer.m_srcUtilOffset_; int continuationoffset = buffer.m_srcUtilContOffset_; int cebuffer[] = buffer.m_srcUtilCEBuffer_; if (!isSrc) { offset = buffer.m_tgtUtilOffset_; continuationoffset = buffer.m_tgtUtilContOffset_; cebuffer = buffer.m_tgtUtilCEBuffer_; } while (result == CollationElementIterator.IGNORABLE && offset >= 0) { if (continuationoffset == 0) { result = cebuffer[offset]; while (isContinuation(cebuffer[offset--])) { } // after this, sorder is at the start of continuation, // and offset points before that if (isContinuation(cebuffer[offset + 1])) { // save offset for later continuationoffset = offset; offset += 2; } } else { result = cebuffer[offset++]; if (!isContinuation(result)) { // we have finished with this continuation offset = continuationoffset; // reset the pointer to before continuation continuationoffset = 0; 
continue; } } result &= CE_SECONDARY_MASK_; // remove continuation bit } if (isSrc) { buffer.m_srcUtilOffset_ = offset; buffer.m_srcUtilContOffset_ = continuationoffset; } else { buffer.m_tgtUtilOffset_ = offset; buffer.m_tgtUtilContOffset_ = continuationoffset; } return result; } /** * Does case strength comparison based on the collected ces. * * @param buffer collation buffer temporary state * @return the case strength comparison result */ private final int doCaseCompare(CollationBuffer buffer) { int soffset = 0; int toffset = 0; while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { sorder = buffer.m_srcUtilCEBuffer_[soffset++]; if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) { // primary ignorables should not be considered on the case level when the strength is primary // otherwise, the CEs stop being well-formed sorder &= CE_CASE_MASK_3_; sorder ^= m_caseSwitch_; } else { sorder = CollationElementIterator.IGNORABLE; } } while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { torder = buffer.m_tgtUtilCEBuffer_[toffset++]; if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) { // primary ignorables should not be considered on the case level when the strength is primary // otherwise, the CEs stop being well-formed torder &= CE_CASE_MASK_3_; torder ^= m_caseSwitch_; } else { torder = CollationElementIterator.IGNORABLE; } } sorder &= CE_CASE_BIT_MASK_; torder &= CE_CASE_BIT_MASK_; if (sorder == torder) { // checking end of strings if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } 
} else { if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } if (buffer.m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? -1 : 1; } } return 0; } /** * Does tertiary strength comparison based on the collected ces. * * @param buffer collation buffer temporary state * @return the tertiary strength comparison result */ private final int doTertiaryCompare(CollationBuffer buffer) { int soffset = 0; int toffset = 0; while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { sorder = buffer.m_srcUtilCEBuffer_[soffset++] & m_mask3_; if (!isContinuation(sorder)) { sorder ^= m_caseSwitch_; } else { sorder &= CE_REMOVE_CASE_; } } while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { torder = buffer.m_tgtUtilCEBuffer_[toffset++] & m_mask3_; if (!isContinuation(torder)) { torder ^= m_caseSwitch_; } else { torder &= CE_REMOVE_CASE_; } } if (sorder == torder) { if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } } else { if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? -1 : 1; } } return 0; } /** * Does quaternary strength comparison based on the collected ces. 
* * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted * @param buffer collation buffer temporary state * @return the quaternary strength comparison result */ private final int doQuaternaryCompare(int lowestpvalue, CollationBuffer buffer) { boolean sShifted = true; boolean tShifted = true; int soffset = 0; int toffset = 0; while (true) { int sorder = CollationElementIterator.IGNORABLE; int torder = CollationElementIterator.IGNORABLE; while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) { sorder = buffer.m_srcUtilCEBuffer_[soffset++]; if (isContinuation(sorder)) { if (!sShifted) { continue; } } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0 || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) { // non continuation sorder = CE_PRIMARY_MASK_; sShifted = false; } else { sShifted = true; } } sorder >>>= CE_PRIMARY_SHIFT_; while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) { torder = buffer.m_tgtUtilCEBuffer_[toffset++]; if (isContinuation(torder)) { if (!tShifted) { continue; } } else if (Utility.compareUnsigned(torder, lowestpvalue) > 0 || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) { // non continuation torder = CE_PRIMARY_MASK_; tShifted = false; } else { tShifted = true; } } torder >>>= CE_PRIMARY_SHIFT_; if (sorder == torder) { if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { return -1; } break; } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } } else { if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { return -1; } if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { return 1; } return (sorder < torder) ? 
-1 : 1; } } return 0; } /** * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough. * * @param source * text * @param target * text * @param offset * of the first difference in the text strings * @param normalize * flag indicating if we are to normalize the text before comparison * @return 1 if source is greater than target, -1 less than and 0 if equals */ private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize) { if (normalize) { if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { source = Normalizer.decompose(source, false); } if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) { target = Normalizer.decompose(target, false); } offset = 0; } return doStringCompare(source, target, offset); } /** * Compares string for their codepoint order. This comparison handles surrogate characters and place them after the * all non surrogate characters. * * @param source * text * @param target * text * @param offset * start offset for comparison * @return 1 if source is greater than target, -1 less than and 0 if equals */ private static final int doStringCompare(String source, String target, int offset) { // compare identical prefixes - they do not need to be fixed up char schar = 0; char tchar = 0; int slength = source.length(); int tlength = target.length(); int minlength = Math.min(slength, tlength); while (offset < minlength) { schar = source.charAt(offset); tchar = target.charAt(offset++); if (schar != tchar) { break; } } if (schar == tchar && offset == minlength) { if (slength > minlength) { return 1; } if (tlength > minlength) { return -1; } return 0; } // if both values are in or above the surrogate range, Fix them up. 
if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) { schar = fixupUTF16(schar); tchar = fixupUTF16(tchar); } // now c1 and c2 are in UTF-32-compatible order return (schar < tchar) ? -1 : 1; // schar and tchar has to be different } /** * Rotate surrogates to the top to get code point order */ private static final char fixupUTF16(char ch) { if (ch >= 0xe000) { ch -= 0x800; } else { ch += 0x2000; } return ch; } private static final int UCOL_REORDER_CODE_IGNORE = ReorderCodes.LIMIT + 1; /** * Builds the lead byte permuatation table */ private void buildPermutationTable() { if (m_reorderCodes_ == null || m_reorderCodes_.length == 0 || (m_reorderCodes_.length == 1 && m_reorderCodes_[0] == ReorderCodes.NONE)) { m_leadBytePermutationTable_ = null; return; } if (m_reorderCodes_[0] == ReorderCodes.DEFAULT) { if (m_reorderCodes_.length != 1) { throw new IllegalArgumentException("Illegal collation reorder codes - default reorder code must be the only code in the list."); } // swap the reorder codes for those at build of the rules if (m_defaultReorderCodes_ == null || m_defaultReorderCodes_.length == 0) { m_leadBytePermutationTable_ = null; } m_reorderCodes_ = m_defaultReorderCodes_.clone(); } // TODO - these need to be read in from the UCA data file // The lowest byte that hasn't been assigned a mapping int toBottom = 0x03; // The highest byte that hasn't been assigned a mapping int toTop = 0xe4; // filled slots in the output m_scriptOrder_ boolean[] permutationSlotFilled = new boolean[256]; // used lead bytes boolean[] newLeadByteUsed = new boolean[256]; if (m_leadBytePermutationTable_ == null) { m_leadBytePermutationTable_ = new byte[256]; } // prefill the reordering codes with the leading entries int[] internalReorderCodes = new int[m_reorderCodes_.length + (ReorderCodes.LIMIT - ReorderCodes.FIRST)]; for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) { internalReorderCodes[codeIndex] = 
ReorderCodes.FIRST + codeIndex; } for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) { internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex]; if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) { internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE; } } /* * Start from the front of the list and place each script we encounter at the earliest possible locatation * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script * in the last possible location. At each step, we also need to make sure that any scripts that need to not * be moved are copied to their same location in the final table. */ boolean fromTheBottom = true; int reorderCodesIndex = -1; for (int reorderCodesCount = 0; reorderCodesCount < internalReorderCodes.length; reorderCodesCount++) { reorderCodesIndex += fromTheBottom ? 
1 : -1; int next = internalReorderCodes[reorderCodesIndex]; if (next == UCOL_REORDER_CODE_IGNORE) { continue; } if (next == UScript.UNKNOWN) { if (fromTheBottom == false) { // double turnaround m_leadBytePermutationTable_ = null; throw new IllegalArgumentException("Illegal collation reorder codes - two \"from the end\" markers."); } fromTheBottom = false; reorderCodesIndex = internalReorderCodes.length; continue; } int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next); if (fromTheBottom) { for (int leadByte : leadBytes) { // don't place a lead byte twice in the permutation table if (permutationSlotFilled[leadByte]) { // lead byte already used m_leadBytePermutationTable_ = null; throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte."); } m_leadBytePermutationTable_[leadByte] = (byte) toBottom; newLeadByteUsed[toBottom] = true; permutationSlotFilled[leadByte] = true; toBottom++; } } else { for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) { int leadByte = leadBytes[leadByteIndex]; // don't place a lead byte twice in the permutation table if (permutationSlotFilled[leadByte]) { // lead byte already used m_leadBytePermutationTable_ = null; throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte."); } m_leadBytePermutationTable_[leadByte] = (byte) toTop; newLeadByteUsed[toTop] = true; permutationSlotFilled[leadByte] = true; toTop--; } } } /* Copy everything that's left over */ int reorderCode = 0; for (int i = 0; i < 256; i++) { if (!permutationSlotFilled[i]) { while (newLeadByteUsed[reorderCode]) { if (reorderCode > 255) { throw new IllegalArgumentException("Unable to fill collation reordering table slots - no available reordering code."); } reorderCode++; } m_leadBytePermutationTable_[i] = (byte) reorderCode; permutationSlotFilled[i] = true; newLeadByteUsed[reorderCode] = true; } } // for (int i 
= 0; i < 256; i++){ // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16)); // } latinOneRegenTable_ = true; updateInternalState(); } /** * Resets the internal case data members and compression values. */ private void updateInternalState() { if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { m_caseSwitch_ = CASE_SWITCH_; } else { m_caseSwitch_ = NO_CASE_SWITCH_; } if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { m_mask3_ = CE_REMOVE_CASE_; m_common3_ = COMMON_NORMAL_3_; m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; m_bottom3_ = COMMON_BOTTOM_3_; } else { m_mask3_ = CE_KEEP_CASE_; m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { m_common3_ = COMMON_UPPER_FIRST_3_; m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; } else { m_common3_ = COMMON_NORMAL_3_; m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; } } // Set the compression values int total3 = m_top3_ - m_bottom3_ - 1; // we multilply double with int, but need only int m_topCount3_ = (int) (PROPORTION_3_ * total3); m_bottomCount3_ = total3 - m_topCount3_; if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { m_isSimple3_ = true; } else { m_isSimple3_ = false; } if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_ && !m_isAlternateHandlingShifted_ && !latinOneFailed_) { if (latinOneCEs_ == null || latinOneRegenTable_) { if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it latinOneUse_ = true; } else { latinOneUse_ = false; latinOneFailed_ = true; } latinOneRegenTable_ = false; } else { // latin1Table exists and it doesn't need to be regenerated, just use it latinOneUse_ = true; } } else { latinOneUse_ = false; } } /** * Initializes the 
RuleBasedCollator */
    private final void init() {
        for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
            // Find the smallest unsafe char.
            if (isUnsafe(m_minUnsafe_)) {
                break;
            }
        }

        for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
            // Find the smallest contraction-ending char.
            if (isContractionEnd(m_minContractionEnd_)) {
                break;
            }
        }
        // updateInternalState() skips the Latin-1 table while latinOneFailed_ is set, so the flag is
        // raised for the attribute setup below and lowered again before the explicit update at the end.
        latinOneFailed_ = true;
        setStrength(m_defaultStrength_);
        setDecomposition(m_defaultDecomposition_);
        m_variableTopValue_ = m_defaultVariableTopValue_;
        m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
        m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
        m_isCaseLevel_ = m_defaultIsCaseLevel_;
        m_caseFirst_ = m_defaultCaseFirst_;
        m_isHiragana4_ = m_defaultIsHiragana4_;
        m_isNumericCollation_ = m_defaultIsNumericCollation_;
        latinOneFailed_ = false;
        if (m_defaultReorderCodes_ != null) {
            m_reorderCodes_ = m_defaultReorderCodes_.clone();
        } else {
            m_reorderCodes_ = null;
        }
        updateInternalState();
    }

    // Consts for Latin-1 special processing
    private static final int ENDOFLATINONERANGE_ = 0xFF;
    private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
    private static final int BAIL_OUT_CE_ = 0xFF000000;

    /**
     * Bit positions at which the next primary, secondary and tertiary byte of a Latin-1 table entry will be
     * stored; each starts at 24 and is lowered by 8 per byte written.
     */
    private static class shiftValues {
        int primShift = 24;
        int secShift = 24;
        int terShift = 24;
    }

    /**
     * Packs the weights of one CE into the Latin-1 tables: primaries into latinOneCEs_[ch], the secondary into
     * the slot one table length further and the tertiary two table lengths further.
     *
     * @param ch index of the table entry being built
     * @param CE collation element to add
     * @param sh current shift positions for the entry, updated as bytes are written
     */
    private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
        int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
        boolean continuation = isContinuation(CE);
        boolean reverseSecondary = false;
        if (!continuation) {
            tertiary = ((CE & m_mask3_));
            tertiary ^= m_caseSwitch_;
            reverseSecondary = true;
        } else {
            tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
            tertiary &= CE_REMOVE_CASE_;
            reverseSecondary = false;
        }

        secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
        primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
        primary1 = (CE >>> 8);

        if (primary1 != 0) {
            if (m_leadBytePermutationTable_ != null && !continuation) {
                // The table holds Java bytes, which are signed. Mask to an unsigned value so a
                // permuted lead byte >= 0x80 is not sign-extended; otherwise, when shifted by
                // less than 24, its high bits would overwrite bytes already stored in the entry.
                primary1 = m_leadBytePermutationTable_[primary1] & 0xFF;
            }

            latinOneCEs_[ch] |= (primary1 << sh.primShift);
            sh.primShift -= 8;
        }
        if (primary2 != 0) {
            if (sh.primShift < 0) {
                // no room left for another primary byte: mark all three levels as bail-out
                latinOneCEs_[ch] = BAIL_OUT_CE_;
                latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
                latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
                return;
            }
            latinOneCEs_[ch] |= (primary2 << sh.primShift);
            sh.primShift -= 8;
        }
        if (secondary != 0) {
            if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
                latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
                latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
            } else { // normal case
                latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
            }
            sh.secShift -= 8;
        }
        if (tertiary != 0) {
            latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
            sh.terShift -= 8;
        }
    }

    /**
     * Replaces latinOneCEs_ with a table of three sections of newSize entries each, copying as many entries
     * of every old section as fit, and records newSize in latinOneTableLen_.
     *
     * @param newSize number of entries per section in the new table
     */
    private final void resizeLatinOneTable(int newSize) {
        int newTable[] = new int[3 * newSize];
        int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize : latinOneTableLen_);
        // a freshly allocated int array is already zeroed, so no explicit clearing is needed
        System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
        System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
        System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable, 2 * newSize, sizeToCopy);
        latinOneTableLen_ = newSize;
        latinOneCEs_ = newTable;
    }

private final boolean setUpLatinOne() { if (latinOneCEs_ == null || m_reallocLatinOneCEs_) { latinOneCEs_ = new int[3 * LATINONETABLELEN_]; latinOneTableLen_ = LATINONETABLELEN_; m_reallocLatinOneCEs_ = false; } else { Arrays.fill(latinOneCEs_, 0); } if (m_ContInfo_ == null) { m_ContInfo_ = new ContractionInfo(); } char ch = 0; // StringBuffer sCh = new StringBuffer(); // CollationElementIterator it = getCollationElementIterator(sCh.toString()); CollationElementIterator it = getCollationElementIterator(""); shiftValues s = new shiftValues(); int CE = 0; char contractionOffset = ENDOFLATINONERANGE_ + 1; for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) { s.primShift = 24; s.secShift = 24; s.terShift = 24; if (ch < 0x100) { CE = m_trie_.getLatin1LinearValue(ch); } else { CE = m_trie_.getLeadValue(ch); if (CE == CollationElementIterator.CE_NOT_FOUND_) { CE = UCA_.m_trie_.getLeadValue(ch); } } if (!isSpecial(CE)) { addLatinOneEntry(ch, CE, s); } else { switch (RuleBasedCollator.getTag(CE)) { case CollationElementIterator.CE_EXPANSION_TAG_: case CollationElementIterator.CE_DIGIT_TAG_: // sCh.delete(0, sCh.length()); // sCh.append(ch); // it.setText(sCh.toString()); it.setText(UCharacter.toString(ch)); while ((CE = it.next()) != CollationElementIterator.NULLORDER) { if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { latinOneCEs_[ch] = BAIL_OUT_CE_; latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_; latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_; break; } addLatinOneEntry(ch, CE, s); } break; case CollationElementIterator.CE_CONTRACTION_TAG_: // here is the trick // F2 is contraction. 
We do something very similar to contractions // but have two indices, one in the real contraction table and the // other to where we stuffed things. This hopes that we don't have // many contractions (this should work for latin-1 tables). { if ((CE & 0x00FFF000) != 0) { latinOneFailed_ = true; return false; } int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; // getContractionOffset(CE)] CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table latinOneCEs_[ch] = CE; latinOneCEs_[latinOneTableLen_ + ch] = CE; latinOneCEs_[2 * latinOneTableLen_ + ch] = CE; // We're going to jump into contraction table, pick the elements // and use them do { // CE = *(contractionCEs + (UCharOffset - contractionIndex)); CE = m_contractionCE_[UCharOffset]; if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { int i; /* general counter */ // uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to // expansion table */ int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; // it.getExpansionOffset(this, // CE); int size = CE & 0xF; // getExpansionCount(CE); // CE = *CEOffset++; if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ for (i = 0; i < size; i++) { if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; break; } addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s); } } else { /* else, we do */ while (m_expansion_[offset] != 0) { if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; break; } addLatinOneEntry(contractionOffset, m_expansion_[offset++], s); } } 
contractionOffset++; } else if (!isSpecial(CE)) { addLatinOneEntry(contractionOffset++, CE, s); } else { latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; contractionOffset++; } UCharOffset++; s.primShift = 24; s.secShift = 24; s.terShift = 24; if (contractionOffset == latinOneTableLen_) { // we need to reallocate resizeLatinOneTable(2 * latinOneTableLen_); } } while (m_contractionIndex_[UCharOffset] != 0xFFFF); } break; case CollationElementIterator.CE_SPEC_PROC_TAG_: { // 0xB7 is a precontext character defined in UCA5.1, a special // handle is implemeted in order to save LatinOne table for // most locales. if (ch == 0xb7) { addLatinOneEntry(ch, CE, s); } else { latinOneFailed_ = true; return false; } } break; default: latinOneFailed_ = true; return false; } } } // compact table if (contractionOffset < latinOneTableLen_) { resizeLatinOneTable(contractionOffset); } return true; }

    /** Mutable holder for the string index shared between the Latin-1 compare loop and contraction lookup. */
    private static class ContractionInfo {
        int index;
    }

    ContractionInfo m_ContInfo_;

    /**
     * Resolves a contraction CE from the Latin-1 table against the text at m_ContInfo_.index. The index is
     * advanced past a matched second character and past characters whose trie lead value is 0.
     *
     * @param strength section of the Latin-1 table to read (multiplied by latinOneTableLen_)
     * @param CE contraction CE taken from the Latin-1 table
     * @param s text being compared
     * @return the table entry for the contraction or for its first character alone, or BAIL_OUT_CE_ when the
     *         next character is above the Latin-1 range
     */
    private int getLatinOneContraction(int strength, int CE, String s) {
        final int len = s.length();
        final int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
        final int latinOneOffset = (CE & 0x00FFF000) >>> 12;
        // position of the entry used when no contraction matches
        final int tableBase = strength * latinOneTableLen_ + latinOneOffset;
        int offset = 1;

        while (true) {
            if (m_ContInfo_.index == len) {
                // end of string: nothing left to contract with
                return latinOneCEs_[tableBase];
            }
            final char schar = s.charAt(m_ContInfo_.index);

            // since the contraction codepoints should be ordered, we skip all that are smaller
            char tchar;
            while (schar > (tchar = m_contractionIndex_[UCharOffset + offset])) {
                offset++;
            }

            if (schar == tchar) {
                m_ContInfo_.index++;
                return latinOneCEs_[tableBase + offset];
            }
            if (schar > ENDOFLATINONERANGE_) {
                return BAIL_OUT_CE_;
            }
            // skip completely ignorables
            if (m_trie_.getLeadValue(schar) == 0) {
                m_ContInfo_.index++;
                continue;
            }
            return latinOneCEs_[tableBase];
        }
    }

/** * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case * level. It relies on the table build by setupLatin1Table. If it doesn't understand something, it will go to the * regular strcoll. * @param buffer collation buffer temporary state */ private final int compareUseLatin1(String source, String target, int startOffset, CollationBuffer buffer) { int sLen = source.length(); int tLen = target.length(); int strength = getStrength(); int sIndex = startOffset, tIndex = startOffset; char sChar = 0, tChar = 0; int sOrder = 0, tOrder = 0; boolean endOfSource = false; // uint32_t *elements = coll->latinOneCEs; boolean haveContractions = false; // if we have contractions in our string // we cannot do French secondary int offset = latinOneTableLen_; // Do the primary level primLoop: for (;;) { while (sOrder == 0) { // this loop skips primary ignorables // sOrder=getNextlatinOneCE(source); if (sIndex == sLen) { endOfSource = true; break; } sChar = source.charAt(sIndex++); // [sIndex++]; // } if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out // fprintf(stderr, "R"); return compareRegular(source, target, startOffset, buffer); } sOrder = latinOneCEs_[sChar]; if (isSpecial(sOrder)) { // if we got a special // specials can basically be either contractions or bail-out 
signs. If we get anything // else, we'll bail out anywasy if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { m_ContInfo_.index = sIndex; sOrder = getLatinOneContraction(0, sOrder, source); sIndex = m_ContInfo_.index; haveContractions = true; // if there are contractions, we cannot do French secondary // However, if there are contractions in the table, but we always use just one char, // we might be able to do French. This should be checked out. } if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) { // fprintf(stderr, "S"); return compareRegular(source, target, startOffset, buffer); } } } while (tOrder == 0) { // this loop skips primary ignorables // tOrder=getNextlatinOneCE(target); if (tIndex == tLen) { if (endOfSource) { break primLoop; } else { return 1; } } tChar = target.charAt(tIndex++); // [tIndex++]; if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out // fprintf(stderr, "R"); return compareRegular(source, target, startOffset, buffer); } tOrder = latinOneCEs_[tChar]; if (isSpecial(tOrder)) { // Handling specials, see the comments for source if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { m_ContInfo_.index = tIndex; tOrder = getLatinOneContraction(0, tOrder, target); tIndex = m_ContInfo_.index; haveContractions = true; } if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) { // fprintf(stderr, "S"); return compareRegular(source, target, startOffset, buffer); } } } if (endOfSource) { // source is finished, but target is not, say the result. 
return -1; } if (sOrder == tOrder) { // if we have same CEs, we continue the loop sOrder = 0; tOrder = 0; continue; } else { // compare current top bytes if (((sOrder ^ tOrder) & 0xFF000000) != 0) { // top bytes differ, return difference if (sOrder >>> 8 < tOrder >>> 8) { return -1; } else { return 1; } // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); // since we must return enum value } // top bytes match, continue with following bytes sOrder <<= 8; tOrder <<= 8; } } // after primary loop, we definitely know the sizes of strings, // so we set it and use simpler loop for secondaries and tertiaries // sLen = sIndex; tLen = tIndex; if (strength >= SECONDARY) { // adjust the table beggining // latinOneCEs_ += coll->latinOneTableLen; endOfSource = false; if (!m_isFrenchCollation_) { // non French // This loop is a simplified copy of primary loop // at this point we know that whole strings are latin-1, so we don't // check for that. We also know that we only have contractions as // specials. 
// sIndex = 0; tIndex = 0; sIndex = startOffset; tIndex = startOffset; secLoop: for (;;) { while (sOrder == 0) { if (sIndex == sLen) { endOfSource = true; break; } sChar = source.charAt(sIndex++); // [sIndex++]; sOrder = latinOneCEs_[offset + sChar]; if (isSpecial(sOrder)) { m_ContInfo_.index = sIndex; sOrder = getLatinOneContraction(1, sOrder, source); sIndex = m_ContInfo_.index; } } while (tOrder == 0) { if (tIndex == tLen) { if (endOfSource) { break secLoop; } else { return 1; } } tChar = target.charAt(tIndex++); // [tIndex++]; tOrder = latinOneCEs_[offset + tChar]; if (isSpecial(tOrder)) { m_ContInfo_.index = tIndex; tOrder = getLatinOneContraction(1, tOrder, target); tIndex = m_ContInfo_.index; } } if (endOfSource) { return -1; } if (sOrder == tOrder) { sOrder = 0; tOrder = 0; continue; } else { // see primary loop for comments on this if (((sOrder ^ tOrder) & 0xFF000000) != 0) { if (sOrder >>> 8 < tOrder >>> 8) { return -1; } else { return 1; } } sOrder <<= 8; tOrder <<= 8; } } } else { // French if (haveContractions) { // if we have contractions, we have to bail out // since we don't really know how to handle them here return compareRegular(source, target, startOffset, buffer); } // For French, we go backwards sIndex = sLen; tIndex = tLen; secFLoop: for (;;) { while (sOrder == 0) { if (sIndex == startOffset) { endOfSource = true; break; } sChar = source.charAt(--sIndex); // [--sIndex]; sOrder = latinOneCEs_[offset + sChar]; // don't even look for contractions } while (tOrder == 0) { if (tIndex == startOffset) { if (endOfSource) { break secFLoop; } else { return 1; } } tChar = target.charAt(--tIndex); // [--tIndex]; tOrder = latinOneCEs_[offset + tChar]; // don't even look for contractions } if (endOfSource) { return -1; } if (sOrder == tOrder) { sOrder = 0; tOrder = 0; continue; } else { // see the primary loop for comments if (((sOrder ^ tOrder) & 0xFF000000) != 0) { if (sOrder >>> 8 < tOrder >>> 8) { return -1; } else { return 1; } } sOrder <<= 8; tOrder 
<<= 8; } } } } if (strength >= TERTIARY) { // tertiary loop is the same as secondary (except no French) offset += latinOneTableLen_; // sIndex = 0; tIndex = 0; sIndex = startOffset; tIndex = startOffset; endOfSource = false; for (;;) { while (sOrder == 0) { if (sIndex == sLen) { endOfSource = true; break; } sChar = source.charAt(sIndex++); // [sIndex++]; sOrder = latinOneCEs_[offset + sChar]; if (isSpecial(sOrder)) { m_ContInfo_.index = sIndex; sOrder = getLatinOneContraction(2, sOrder, source); sIndex = m_ContInfo_.index; } } while (tOrder == 0) { if (tIndex == tLen) { if (endOfSource) { return 0; // if both strings are at the end, they are equal } else { return 1; } } tChar = target.charAt(tIndex++); // [tIndex++]; tOrder = latinOneCEs_[offset + tChar]; if (isSpecial(tOrder)) { m_ContInfo_.index = tIndex; tOrder = getLatinOneContraction(2, tOrder, target); tIndex = m_ContInfo_.index; } } if (endOfSource) { return -1; } if (sOrder == tOrder) { sOrder = 0; tOrder = 0; continue; } else { if (((sOrder ^ tOrder) & 0xff000000) != 0) { if (sOrder >>> 8 < tOrder >>> 8) { return -1; } else { return 1; } } sOrder <<= 8; tOrder <<= 8; } } } return 0; } /** * Get the version of this collator object. * * @return the version object associated with this collator * @stable ICU 2.8 */ public VersionInfo getVersion() { /* RunTime version */ int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor(); /* Builder version */ int bdVersion = m_version_.getMajor(); /* * Charset Version. 
Need to get the version from cnv files: makeconv should populate cnv files with a version, and
         * an API has to be provided in ucnv.h to obtain this version. Until then the charset
         * version contributes 0 to the combined value.
         */
        int csVersion = 0;

        /*
         * Combine the version info: runtime major version shifted left by 11, builder major
         * version shifted left by 6, charset version in the low bits; the result is masked to
         * 16 bits.
         */
        int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;

        /*
         * Tailoring rules.
         * The four fields passed to VersionInfo.getInstance are, in order: the high byte of the
         * combined value, its low byte, the minor number of m_version_, and the major number of
         * the UCA version.
         */
        return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(),
                UCA_.m_UCA_version_.getMajor());

        // C-style notation of the same four fields, kept for reference:
        // versionInfo[0] = (uint8_t)(cmbVersion>>8);
        // versionInfo[1] = (uint8_t)cmbVersion;
        // versionInfo[2] = coll->image->version[1];
        // versionInfo[3] = coll->UCA->image->UCAVersion[0];
    }

    /**
     * Get the UCA version of this collator object.
     *
     * @return the version object associated with this collator; this is the same
     *         UCA_.m_UCA_version_ object whose major number getVersion() uses as its last field
     * @stable ICU 2.8
     */
    public VersionInfo getUCAVersion() {
        return UCA_.m_UCA_version_;
    }

    // NOTE(review): not read or written in this part of the file; going by the name it flags
    // that the Latin-1 CE table needs to be reallocated -- confirm against its users.
    private transient boolean m_reallocLatinOneCEs_;

    // Scratch buffer handed out by getCollationBuffer(); created on first use and reused
    // (after resetBuffers()) on every later call.
    private CollationBuffer collationBuffer;

    /**
     * Returns the collator's CollationBuffer, creating it on the first call and resetting it
     * via resetBuffers() on later calls.
     *
     * If the collator is frozen, frozenLock is acquired first and is NOT released here; it is
     * released by releaseCollationBuffer(). Every call must therefore be paired with a call to
     * releaseCollationBuffer(), otherwise the lock stays held.
     * NOTE(review): presumably the lock is taken because a frozen collator may be shared
     * between threads -- confirm; callers should release in a finally block.
     *
     * @return the (new or reset) buffer instance stored in the collationBuffer field
     */
    private final CollationBuffer getCollationBuffer() {
        if (isFrozen()) {
            // Held until the matching releaseCollationBuffer() call.
            frozenLock.lock();
        }
        if (collationBuffer == null) {
            collationBuffer = new CollationBuffer();
        } else {
            collationBuffer.resetBuffers();
        }
        return collationBuffer;
    }

    /**
     * Counterpart of getCollationBuffer(): unlocks frozenLock if the collator is frozen and
     * does nothing otherwise.
     *
     * @param buffer the buffer being handed back; not used by this method
     */
    private final void releaseCollationBuffer(CollationBuffer buffer) {
        if (isFrozen()) {
            frozenLock.unlock();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy