All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.CollationRuleParser Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package com.ibm.icu.text;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator.ReorderCodes;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;

/**
* Class for parsing collation rules, produces a list of tokens that will be
* turned into collation elements
* @author Syn Wee Quek
* @since release 2.2, June 7 2002
*/
final class CollationRuleParser
{
    // public data members ---------------------------------------------------

    // package private constructors ------------------------------------------

    /**
     * 

RuleBasedCollator constructor that takes the rules. * Please see RuleBasedCollator class description for more details on the * collation rule syntax.

* @see java.util.Locale * @param rules the collation rules to build the collation table from. * @exception ParseException thrown when argument rules have an invalid * syntax. */ CollationRuleParser(String rules) throws ParseException { // Prepares m_copySet_ and m_removeSet_. rules = preprocessRules(rules); // Save the rules as a long string. The StringBuilder object is // used to store the result of token parsing as well. m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim()); m_rules_ = m_source_.toString(); // Index of the next unparsed character. m_current_ = 0; // Index of the next unwritten character in the parsed result. m_extraCurrent_ = m_source_.length(); m_variableTop_ = null; m_parsedToken_ = new ParsedToken(); m_hashTable_ = new HashMap(); m_options_ = new OptionSet(RuleBasedCollator.UCA_); m_listHeader_ = new TokenListHeader[512]; m_resultLength_ = 0; // call assembleTokenList() manually, so that we can // init a parser and manually parse tokens //assembleTokenList(); } // package private inner classes ----------------------------------------- /** * Collation options set */ static class OptionSet { // package private constructor --------------------------------------- /** * Initializes the option set with the argument collators * @param collator option to use */ OptionSet(RuleBasedCollator collator) { m_variableTopValue_ = collator.m_variableTopValue_; m_isFrenchCollation_ = collator.isFrenchCollation(); m_isAlternateHandlingShifted_ = collator.isAlternateHandlingShifted(); m_caseFirst_ = collator.m_caseFirst_; m_isCaseLevel_ = collator.isCaseLevel(); m_decomposition_ = collator.getDecomposition(); m_strength_ = collator.getStrength(); m_isHiragana4_ = collator.m_isHiragana4_; if(collator.m_reorderCodes_ != null){ m_scriptOrder_ = new int[collator.m_reorderCodes_.length]; for(int i = 0; i < m_scriptOrder_.length; i++){ m_scriptOrder_[i] = collator.m_reorderCodes_[i]; } } } // package private data members -------------------------------------- int m_variableTopValue_; boolean m_isFrenchCollation_; /** * Attribute for handling variable elements */ boolean m_isAlternateHandlingShifted_; /** * who goes first, lower case or uppercase */ int m_caseFirst_; /** * do we have an extra case level */ boolean m_isCaseLevel_; /** * attribute for normalization */ int m_decomposition_; /** * attribute for strength */ int m_strength_; /** * attribute for special Hiragana */ boolean m_isHiragana4_; /** * the ordering of the scripts */ int[] m_scriptOrder_; } /** * List of tokens used by the collation rules */ static class TokenListHeader { Token m_first_; Token m_last_; Token m_reset_; boolean m_indirect_; int m_baseCE_; int m_baseContCE_; int m_nextCE_; int m_nextContCE_; int m_previousCE_; int m_previousContCE_; int m_pos_[] = new int[Collator.IDENTICAL + 1]; int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)]; int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)]; int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)]; Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1]; Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1]; } /** * Token wrapper for collation rules */ static class Token { // package private data members --------------------------------------- int m_CE_[]; int m_CELength_; int m_expCE_[]; int m_expCELength_; int m_source_; int m_expansion_; int m_prefix_; int m_strength_; int m_toInsert_; int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>> TokenListHeader m_listHeader_; Token m_previous_; Token m_next_; StringBuilder m_rules_; char m_flags_; // package private constructors --------------------------------------- Token() { m_CE_ = new int[128]; m_expCE_ = new int[128]; // TODO: this should also handle reverse m_polarity_ = TOKEN_POLARITY_POSITIVE_; m_next_ = null; m_previous_ = null; m_CELength_ = 0; m_expCELength_ = 0; } // package private methods -------------------------------------------- /** * Hashcode calculation for token * @return the hashcode */ public int hashCode() { int result = 0; int len = (m_source_ & 0xFF000000) >>> 24; int inc = ((len - 32) / 32) + 1; int start = m_source_ & 0x00FFFFFF; int limit = start + len; while (start < limit) { result = (result * 37) + m_rules_.charAt(start); start += inc; } return result; } /** * Equals calculation * @param target object to compare * @return true if target is the same as this object */ public boolean equals(Object target) { if (target == this) { return true; } if (target instanceof Token) { Token t = (Token)target; int sstart = m_source_ & 0x00FFFFFF; int tstart = t.m_source_ & 0x00FFFFFF; int slimit = (m_source_ & 0xFF000000) >> 24; int tlimit = (m_source_ & 0xFF000000) >> 24; int end = sstart + slimit - 1; if (m_source_ == 0 || t.m_source_ == 0) { return false; } if (slimit != tlimit) { return false; } if (m_source_ == t.m_source_) { return true; } while (sstart < end && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) { ++ sstart; ++ tstart; } if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) { return true; } } return false; } } // package private data member ------------------------------------------- /** * Indicator that the token is resetted yet, ie & in the rules */ static final int TOKEN_RESET_ = 0xDEADBEEF; /** * Size of the number of tokens */ int m_resultLength_; /** * List of parsed tokens */ TokenListHeader m_listHeader_[]; /** * Variable top token */ Token m_variableTop_; /** * Collation options */ OptionSet m_options_; /** * Normalized collation rules with some extra characters */ StringBuilder m_source_; /** * Hash table to keep all tokens */ Map m_hashTable_; // package private method ------------------------------------------------ void setDefaultOptionsInCollator(RuleBasedCollator collator) { collator.m_defaultStrength_ = m_options_.m_strength_; collator.m_defaultDecomposition_ = m_options_.m_decomposition_; collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_; collator.m_defaultIsAlternateHandlingShifted_ = m_options_.m_isAlternateHandlingShifted_; collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_; collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_; collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_; collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_; if(m_options_.m_scriptOrder_ != null) { collator.m_defaultReorderCodes_ = m_options_.m_scriptOrder_.clone(); } else { collator.m_defaultReorderCodes_ = null; } } // private inner classes ------------------------------------------------- /** * This is a token that has been parsed but not yet processed. Used to * reduce the number of arguments in the parser */ private static class ParsedToken { // private constructor ---------------------------------------------- /** * Empty constructor */ ParsedToken() { m_charsLen_ = 0; m_charsOffset_ = 0; m_extensionLen_ = 0; m_extensionOffset_ = 0; m_prefixLen_ = 0; m_prefixOffset_ = 0; m_flags_ = 0; m_strength_ = TOKEN_UNSET_; } // private data members --------------------------------------------- int m_strength_; int m_charsOffset_; int m_charsLen_; int m_extensionOffset_; int m_extensionLen_; int m_prefixOffset_; int m_prefixLen_; char m_flags_; char m_indirectIndex_; } /** * Boundary wrappers */ private static class IndirectBoundaries { // package private constructor --------------------------------------- IndirectBoundaries(int startce[], int limitce[]) { // Set values for the top - TODO: once we have values for all the // indirects, we are going to initalize here. m_startCE_ = startce[0]; m_startContCE_ = startce[1]; if (limitce != null) { m_limitCE_ = limitce[0]; m_limitContCE_ = limitce[1]; } else { m_limitCE_ = 0; m_limitContCE_ = 0; } } // package private data members -------------------------------------- int m_startCE_; int m_startContCE_; int m_limitCE_; int m_limitContCE_; } /** * Collation option rule tag */ private static class TokenOption { // package private constructor --------------------------------------- TokenOption(String name, int attribute, String suboptions[], int suboptionattributevalue[]) { m_name_ = name; m_attribute_ = attribute; m_subOptions_ = suboptions; m_subOptionAttributeValues_ = suboptionattributevalue; } // package private data member --------------------------------------- private String m_name_; private int m_attribute_; private String m_subOptions_[]; private int m_subOptionAttributeValues_[]; } // private variables ----------------------------------------------------- /** * Current parsed token */ private ParsedToken m_parsedToken_; /** * Collation rule */ private String m_rules_; private int m_current_; /** * End of the option while reading. * Need it for UnicodeSet reading support. */ private int m_optionEnd_; /* * Current offset in m_source */ //private int m_sourceLimit_; /** * Offset to m_source_ ofr the extra expansion characters */ private int m_extraCurrent_; /** * UnicodeSet that contains code points to be copied from the UCA */ UnicodeSet m_copySet_; /** * UnicodeSet that contains code points for which we want to remove * UCA contractions. It implies copying of these code points from * the UCA. */ UnicodeSet m_removeSet_; /* * This is space for the extra strings that need to be unquoted during the * parsing of the rules */ //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048; /** * Indicator that the token is not set yet */ private static final int TOKEN_UNSET_ = 0xFFFFFFFF; /* * Indicator that the rule is in the > polarity, ie everything on the * right of the rule is less than */ //private static final int TOKEN_POLARITY_NEGATIVE_ = 0; /** * Indicator that the rule is in the < polarity, ie everything on the * right of the rule is greater than */ private static final int TOKEN_POLARITY_POSITIVE_ = 1; /** * Flag mask to determine if top is set */ private static final int TOKEN_TOP_MASK_ = 0x04; /** * Flag mask to determine if variable top is set */ private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08; /** * Flag mask to determine if a before attribute is set */ private static final int TOKEN_BEFORE_ = 0x03; /** * For use in parsing token options */ private static final int TOKEN_SUCCESS_MASK_ = 0x10; /** * These values are used for finding CE values for indirect positioning. * Indirect positioning is a mechanism for allowing resets on symbolic * values. It only works for resets and you cannot tailor indirect names. * An indirect name can define either an anchor point or a range. An anchor * point behaves in exactly the same way as a code point in reset would, * except that it cannot be tailored. A range (we currently only know for * the [top] range will explicitly set the upper bound for generated CEs, * thus allowing for better control over how many CEs can be squeezed * between in the range without performance penalty. In that respect, we use * [top] for tailoring of locales that use CJK characters. Other indirect * values are currently a pure convenience, they can be used to assure that * the CEs will be always positioned in the same place relative to a point * with known properties (e.g. first primary ignorable). */ private static final IndirectBoundaries INDIRECT_BOUNDARIES_[]; // /** // * Inverse UCA constants // */ // private static final int INVERSE_SIZE_MASK_ = 0xFFF00000; // private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF; // private static final int INVERSE_SHIFT_VALUE_ = 20; /** * Collation option tags * [last variable] last variable value * [last primary ignorable] largest CE for primary ignorable * [last secondary ignorable] largest CE for secondary ignorable * [last tertiary ignorable] largest CE for tertiary ignorable * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) */ private static final TokenOption RULES_OPTIONS_[]; static { INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15]; // UCOL_RESET_TOP_VALUE INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_, RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_); // UCOL_FIRST_PRIMARY_IGNORABLE INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_, null); // UCOL_LAST_PRIMARY_IGNORABLE INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_, null); // UCOL_FIRST_SECONDARY_IGNORABLE INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_, null); // UCOL_LAST_SECONDARY_IGNORABLE INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_, null); // UCOL_FIRST_TERTIARY_IGNORABLE INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_, null); // UCOL_LAST_TERTIARY_IGNORABLE INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_, null); // UCOL_FIRST_VARIABLE; INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_, null); // UCOL_LAST_VARIABLE INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_, null); // UCOL_FIRST_NON_VARIABLE INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_, null); // UCOL_LAST_NON_VARIABLE INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_, RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_); // UCOL_FIRST_IMPLICIT INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_, null); // UCOL_LAST_IMPLICIT INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_, RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_); // UCOL_FIRST_TRAILING INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_, null); // UCOL_LAST_TRAILING INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries( RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_, null); INDIRECT_BOUNDARIES_[14].m_limitCE_ = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24; RULES_OPTIONS_ = new TokenOption[20]; String option[] = {"non-ignorable", "shifted"}; int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_, RuleBasedCollator.AttributeValue.SHIFTED_}; RULES_OPTIONS_[0] = new TokenOption("alternate", RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, option, value); option = new String[1]; option[0] = "2"; value = new int[1]; value[0] = RuleBasedCollator.AttributeValue.ON_; RULES_OPTIONS_[1] = new TokenOption("backwards", RuleBasedCollator.Attribute.FRENCH_COLLATION_, option, value); String offonoption[] = new String[2]; offonoption[0] = "off"; offonoption[1] = "on"; int offonvalue[] = new int[2]; offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_; offonvalue[1] = RuleBasedCollator.AttributeValue.ON_; RULES_OPTIONS_[2] = new TokenOption("caseLevel", RuleBasedCollator.Attribute.CASE_LEVEL_, offonoption, offonvalue); option = new String[3]; option[0] = "lower"; option[1] = "upper"; option[2] = "off"; value = new int[3]; value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_; value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_; value[2] = RuleBasedCollator.AttributeValue.OFF_; RULES_OPTIONS_[3] = new TokenOption("caseFirst", RuleBasedCollator.Attribute.CASE_FIRST_, option, value); RULES_OPTIONS_[4] = new TokenOption("normalization", RuleBasedCollator.Attribute.NORMALIZATION_MODE_, offonoption, offonvalue); RULES_OPTIONS_[5] = new TokenOption("hiraganaQ", RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_, offonoption, offonvalue); option = new String[5]; option[0] = "1"; option[1] = "2"; option[2] = "3"; option[3] = "4"; option[4] = "I"; value = new int[5]; value[0] = RuleBasedCollator.AttributeValue.PRIMARY_; value[1] = RuleBasedCollator.AttributeValue.SECONDARY_; value[2] = RuleBasedCollator.AttributeValue.TERTIARY_; value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_; value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_; RULES_OPTIONS_[6] = new TokenOption("strength", RuleBasedCollator.Attribute.STRENGTH_, option, value); RULES_OPTIONS_[7] = new TokenOption("variable top", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[8] = new TokenOption("rearrange", RuleBasedCollator.Attribute.LIMIT_, null, null); option = new String[3]; option[0] = "1"; option[1] = "2"; option[2] = "3"; value = new int[3]; value[0] = RuleBasedCollator.AttributeValue.PRIMARY_; value[1] = RuleBasedCollator.AttributeValue.SECONDARY_; value[2] = RuleBasedCollator.AttributeValue.TERTIARY_; RULES_OPTIONS_[9] = new TokenOption("before", RuleBasedCollator.Attribute.LIMIT_, option, value); RULES_OPTIONS_[10] = new TokenOption("top", RuleBasedCollator.Attribute.LIMIT_, null, null); String firstlastoption[] = new String[7]; firstlastoption[0] = "primary"; firstlastoption[1] = "secondary"; firstlastoption[2] = "tertiary"; firstlastoption[3] = "variable"; firstlastoption[4] = "regular"; firstlastoption[5] = "implicit"; firstlastoption[6] = "trailing"; int firstlastvalue[] = new int[7]; Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_); RULES_OPTIONS_[11] = new TokenOption("first", RuleBasedCollator.Attribute.LIMIT_, firstlastoption, firstlastvalue); RULES_OPTIONS_[12] = new TokenOption("last", RuleBasedCollator.Attribute.LIMIT_, firstlastoption, firstlastvalue); RULES_OPTIONS_[13] = new TokenOption("optimize", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[14] = new TokenOption("suppressContractions", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[15] = new TokenOption("undefined", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[16] = new TokenOption("reorder", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[17] = new TokenOption("charsetname", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[18] = new TokenOption("charset", RuleBasedCollator.Attribute.LIMIT_, null, null); RULES_OPTIONS_[19] = new TokenOption("import", RuleBasedCollator.Attribute.LIMIT_, null, null); } /** * Utility data members */ private Token m_utilToken_ = new Token(); private CollationElementIterator m_UCAColEIter_ = RuleBasedCollator.UCA_.getCollationElementIterator(""); private int m_utilCEBuffer_[] = new int[2]; private boolean m_isStarred_; private int m_currentStarredCharIndex_; private int m_lastStarredCharIndex_; private int m_currentRangeCp_; private int m_lastRangeCp_; private boolean m_inRange_; private int m_previousCp_; private boolean m_savedIsStarred_; // private methods ------------------------------------------------------- /** * Assembles the token list * @exception ParseException thrown when rules syntax fails */ int assembleTokenList() throws ParseException { Token lastToken = null; m_parsedToken_.m_strength_ = TOKEN_UNSET_; int sourcelimit = m_source_.length(); int expandNext = 0; m_isStarred_ = false; while (m_current_ < sourcelimit || m_isStarred_) { m_parsedToken_.m_prefixOffset_ = 0; if (parseNextToken(lastToken == null) < 0) { // we have reached the end continue; } char specs = m_parsedToken_.m_flags_; boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0); boolean top = ((specs & TOKEN_TOP_MASK_) != 0); int lastStrength = TOKEN_UNSET_; if (lastToken != null) { lastStrength = lastToken.m_strength_; } m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_; m_utilToken_.m_rules_ = m_source_; // 4 Lookup each source in the CharsToToken map, and find a // sourcetoken Token sourceToken = m_hashTable_.get(m_utilToken_); if (m_parsedToken_.m_strength_ != TOKEN_RESET_) { if (lastToken == null) { // this means that rules haven't started properly throwParseException(m_source_.toString(), 0); } // 6 Otherwise (when relation != reset) if (sourceToken == null) { // If sourceToken is null, create new one sourceToken = new Token(); sourceToken.m_rules_ = m_source_; sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_; sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24 | m_parsedToken_.m_prefixOffset_; // TODO: this should also handle reverse sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; sourceToken.m_next_ = null; sourceToken.m_previous_ = null; sourceToken.m_CELength_ = 0; sourceToken.m_expCELength_ = 0; m_hashTable_.put(sourceToken, sourceToken); } else { // we could have fished out a reset here if (sourceToken.m_strength_ != TOKEN_RESET_ && lastToken != sourceToken) { // otherwise remove sourceToken from where it was. // Take care of the next node if (sourceToken.m_next_ != null) { if (sourceToken.m_next_.m_strength_ > sourceToken.m_strength_) { sourceToken.m_next_.m_strength_ = sourceToken.m_strength_; } sourceToken.m_next_.m_previous_ = sourceToken.m_previous_; } else { // sourcetoken is the last token. // Redefine the tail token. sourceToken.m_listHeader_.m_last_ = sourceToken.m_previous_; } // Take care of the previous node. if (sourceToken.m_previous_ != null) { sourceToken.m_previous_.m_next_ = sourceToken.m_next_; } else { // sourcetoken is the first token. // Redefine the head node. sourceToken.m_listHeader_.m_first_ = sourceToken.m_next_; } sourceToken.m_next_ = null; sourceToken.m_previous_ = null; } } sourceToken.m_strength_ = m_parsedToken_.m_strength_; sourceToken.m_listHeader_ = lastToken.m_listHeader_; // 1. Find the strongest strength in each list, and set // strongestP and strongestN accordingly in the headers. if (lastStrength == TOKEN_RESET_ || sourceToken.m_listHeader_.m_first_ == null) { // If LAST is a reset insert sourceToken in the list. if (sourceToken.m_listHeader_.m_first_ == null) { sourceToken.m_listHeader_.m_first_ = sourceToken; sourceToken.m_listHeader_.m_last_ = sourceToken; } else { // we need to find a place for us // and we'll get in front of the same strength if (sourceToken.m_listHeader_.m_first_.m_strength_ <= sourceToken.m_strength_) { sourceToken.m_next_ = sourceToken.m_listHeader_.m_first_; sourceToken.m_next_.m_previous_ = sourceToken; sourceToken.m_listHeader_.m_first_ = sourceToken; sourceToken.m_previous_ = null; } else { lastToken = sourceToken.m_listHeader_.m_first_; while (lastToken.m_next_ != null && lastToken.m_next_.m_strength_ > sourceToken.m_strength_) { lastToken = lastToken.m_next_; } if (lastToken.m_next_ != null) { lastToken.m_next_.m_previous_ = sourceToken; } else { sourceToken.m_listHeader_.m_last_ = sourceToken; } sourceToken.m_previous_ = lastToken; sourceToken.m_next_ = lastToken.m_next_; lastToken.m_next_ = sourceToken; } } } else { // Otherwise (when LAST is not a reset) // if polarity (LAST) == polarity(relation), insert // sourceToken after LAST, otherwise insert before. // when inserting after or before, search to the next // position with the same strength in that direction. // (This is called postpone insertion). if (sourceToken != lastToken) { if (lastToken.m_polarity_ == sourceToken.m_polarity_) { while (lastToken.m_next_ != null && lastToken.m_next_.m_strength_ > sourceToken.m_strength_) { lastToken = lastToken.m_next_; } sourceToken.m_previous_ = lastToken; if (lastToken.m_next_ != null) { lastToken.m_next_.m_previous_ = sourceToken; } else { sourceToken.m_listHeader_.m_last_ = sourceToken; } sourceToken.m_next_ = lastToken.m_next_; lastToken.m_next_ = sourceToken; } else { while (lastToken.m_previous_ != null && lastToken.m_previous_.m_strength_ > sourceToken.m_strength_) { lastToken = lastToken.m_previous_; } sourceToken.m_next_ = lastToken; if (lastToken.m_previous_ != null) { lastToken.m_previous_.m_next_ = sourceToken; } else { sourceToken.m_listHeader_.m_first_ = sourceToken; } sourceToken.m_previous_ = lastToken.m_previous_; lastToken.m_previous_ = sourceToken; } } else { // repeated one thing twice in rules, stay with the // stronger strength if (lastStrength < sourceToken.m_strength_) { sourceToken.m_strength_ = lastStrength; } } } // if the token was a variable top, we're gonna put it in if (variableTop == true && m_variableTop_ == null) { variableTop = false; m_variableTop_ = sourceToken; } // Treat the expansions. // There are two types of expansions: explicit (x / y) and // reset based propagating expansions // (&abc * d * e <=> &ab * d / c * e / c) // if both of them are in effect for a token, they are combined. sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 | m_parsedToken_.m_extensionOffset_; if (expandNext != 0) { if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) { // primary strength kills off the implicit expansion expandNext = 0; } else if (sourceToken.m_expansion_ == 0) { // if there is no expansion, implicit is just added to // the token sourceToken.m_expansion_ = expandNext; } else { // there is both explicit and implicit expansion. // We need to make a combination int start = expandNext & 0xFFFFFF; int size = expandNext >>> 24; if (size > 0) { m_source_.append(m_source_.substring(start, start + size)); } start = m_parsedToken_.m_extensionOffset_; m_source_.append(m_source_.substring(start, start + m_parsedToken_.m_extensionLen_)); sourceToken.m_expansion_ = (size + m_parsedToken_.m_extensionLen_) << 24 | m_extraCurrent_; m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_; } } // if the previous token was a reset before, the strength of this // token must match the strength of before. Otherwise we have an // undefined situation. // In other words, we currently have a cludge which we use to // represent &a >> x. This is written as &[before 2]a << x. if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) { int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1; if(beforeStrength != sourceToken.m_strength_) { throwParseException(m_source_.toString(), m_current_); } } } else { if (lastToken != null && lastStrength == TOKEN_RESET_) { // if the previous token was also a reset, this means that // we have two consecutive resets and we want to remove the // previous one if empty if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) { m_resultLength_ --; } } if (sourceToken == null) { // this is a reset, but it might still be somewhere in the // tailoring, in shorter form int searchCharsLen = m_parsedToken_.m_charsLen_; while (searchCharsLen > 1 && sourceToken == null) { searchCharsLen --; // key = searchCharsLen << 24 | charsOffset; m_utilToken_.m_source_ = searchCharsLen << 24 | m_parsedToken_.m_charsOffset_; m_utilToken_.m_rules_ = m_source_; sourceToken = m_hashTable_.get(m_utilToken_); } if (sourceToken != null) { expandNext = (m_parsedToken_.m_charsLen_ - searchCharsLen) << 24 | (m_parsedToken_.m_charsOffset_ + searchCharsLen); } } if ((specs & TOKEN_BEFORE_) != 0) { if (top == false) { // we're doing before & there is no indirection int strength = (specs & TOKEN_BEFORE_) - 1; if (sourceToken != null && sourceToken.m_strength_ != TOKEN_RESET_) { // this is a before that is already ordered in the UCA // - so we need to get the previous with good strength while (sourceToken.m_strength_ > strength && sourceToken.m_previous_ != null) { sourceToken = sourceToken.m_previous_; } // here, either we hit the strength or NULL if (sourceToken.m_strength_ == strength) { if (sourceToken.m_previous_ != null) { sourceToken = sourceToken.m_previous_; } else { // start of list sourceToken = sourceToken.m_listHeader_.m_reset_; } } else { // we hit NULL, we should be doing the else part sourceToken = sourceToken.m_listHeader_.m_reset_; sourceToken = getVirginBefore(sourceToken, strength); } } else { sourceToken = getVirginBefore(sourceToken, strength); } } else { // this is both before and indirection top = false; m_listHeader_[m_resultLength_] = new TokenListHeader(); m_listHeader_[m_resultLength_].m_previousCE_ = 0; m_listHeader_[m_resultLength_].m_previousContCE_ = 0; m_listHeader_[m_resultLength_].m_indirect_ = true; // we need to do slightly more work. we need to get the // baseCE using the inverse UCA & getPrevious. The next // bound is not set, and will be decided in ucol_bld int strength = (specs & TOKEN_BEFORE_) - 1; int baseCE = INDIRECT_BOUNDARIES_[ m_parsedToken_.m_indirectIndex_].m_startCE_; int baseContCE = INDIRECT_BOUNDARIES_[ m_parsedToken_.m_indirectIndex_].m_startContCE_; int ce[] = new int[2]; if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_) && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */ int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16; int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary); int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1); ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505; ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_; } else { CollationParsedRuleBuilder.InverseUCA invuca = CollationParsedRuleBuilder.INVERSE_UCA_; invuca.getInversePrevCE(baseCE, baseContCE, strength, ce); } m_listHeader_[m_resultLength_].m_baseCE_ = ce[0]; m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1]; m_listHeader_[m_resultLength_].m_nextCE_ = 0; m_listHeader_[m_resultLength_].m_nextContCE_ = 0; sourceToken = new Token(); expandNext = initAReset(0, sourceToken); } } // 5 If the relation is a reset: // If sourceToken is null // Create new list, create new sourceToken, make the baseCE // from source, put the sourceToken in ListHeader of the new // list if (sourceToken == null) { if (m_listHeader_[m_resultLength_] == null) { m_listHeader_[m_resultLength_] = new TokenListHeader(); } // 3 Consider each item: relation, source, and expansion: // e.g. ...< x / y ... // First convert all expansions into normal form. // Examples: // If "xy" doesn't occur earlier in the list or in the UCA, // convert &xy * c * d * ... into &x * c/y * d * ... // Note: reset values can never have expansions, although // they can cause the very next item to have one. They may // be contractions, if they are found earlier in the list. if (top == false) { CollationElementIterator coleiter = RuleBasedCollator.UCA_.getCollationElementIterator( m_source_.substring(m_parsedToken_.m_charsOffset_, m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_)); int CE = coleiter.next(); // offset to the character in the full rule string int expand = coleiter.getOffset() + m_parsedToken_.m_charsOffset_; int SecondCE = coleiter.next(); m_listHeader_[m_resultLength_].m_baseCE_ = CE & 0xFFFFFF3F; if (RuleBasedCollator.isContinuation(SecondCE)) { m_listHeader_[m_resultLength_].m_baseContCE_ = SecondCE; } else { m_listHeader_[m_resultLength_].m_baseContCE_ = 0; } m_listHeader_[m_resultLength_].m_nextCE_ = 0; m_listHeader_[m_resultLength_].m_nextContCE_ = 0; m_listHeader_[m_resultLength_].m_previousCE_ = 0; m_listHeader_[m_resultLength_].m_previousContCE_ = 0; m_listHeader_[m_resultLength_].m_indirect_ = false; sourceToken = new Token(); expandNext = initAReset(expand, sourceToken); } else { // top == TRUE top = false; m_listHeader_[m_resultLength_].m_previousCE_ = 0; m_listHeader_[m_resultLength_].m_previousContCE_ = 0; m_listHeader_[m_resultLength_].m_indirect_ = true; IndirectBoundaries ib = INDIRECT_BOUNDARIES_[ m_parsedToken_.m_indirectIndex_]; m_listHeader_[m_resultLength_].m_baseCE_ = ib.m_startCE_; m_listHeader_[m_resultLength_].m_baseContCE_ = ib.m_startContCE_; m_listHeader_[m_resultLength_].m_nextCE_ = ib.m_limitCE_; m_listHeader_[m_resultLength_].m_nextContCE_ = ib.m_limitContCE_; sourceToken = new Token(); expandNext = initAReset(0, sourceToken); } } else { // reset to something already in rules top = false; } } // 7 After all this, set LAST to point to sourceToken, and goto // step 3. lastToken = sourceToken; } if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) { m_resultLength_ --; } return m_resultLength_; } /** * Formats and throws a ParseException * @param rules collation rule that failed * @param offset failed offset in rules * @throws ParseException with failure information */ private static final void throwParseException(String rules, int offset) throws ParseException { // for pre-context String precontext = rules.substring(0, offset); String postcontext = rules.substring(offset, rules.length()); StringBuilder error = new StringBuilder( "Parse error occurred in rule at offset "); error.append(offset); error.append("\n after the prefix \""); error.append(precontext); error.append("\" before the suffix \""); error.append(postcontext); throw new ParseException(error.toString(), offset); } private final boolean doSetTop() { m_parsedToken_.m_charsOffset_ = m_extraCurrent_; m_source_.append((char)0xFFFE); IndirectBoundaries ib = INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_]; m_source_.append((char)(ib.m_startCE_ >> 16)); m_source_.append((char)(ib.m_startCE_ & 0xFFFF)); m_extraCurrent_ += 3; if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_ ].m_startContCE_ == 0) { m_parsedToken_.m_charsLen_ = 3; } else { m_source_.append((char)(INDIRECT_BOUNDARIES_[ m_parsedToken_.m_indirectIndex_ ].m_startContCE_ >> 16)); m_source_.append((char)(INDIRECT_BOUNDARIES_[ m_parsedToken_.m_indirectIndex_ ].m_startContCE_ & 0xFFFF)); m_extraCurrent_ += 2; m_parsedToken_.m_charsLen_ = 5; } return true; } private static boolean isCharNewLine(char c) { switch (c) { case 0x000A: /* LF */ case 0x000D: /* CR */ case 0x000C: /* FF */ case 0x0085: /* NEL */ case 0x2028: /* LS */ case 0x2029: /* PS */ return true; default: return false; } } /** * Parses the next token. * * It updates/accesses the following member variables: * m_current_: Index to the next unparsed character (not code point) * in the character array (a StringBuilder object) m_source_. * m_parsedToken_: The parsed token. The following of the token are updated. * .m_strength: The strength of the token. * .m_charsOffset, m_charsLen_: Index to the first character (after operators), * and number of characters in the token. * This may be in the main string, or in the appended string. * .m_extensionOffset_, .m_extensionLen_: * .m_flags: * .m_prefixOffset, .m_prefixLen: Used when "|" is used to specify "context before". * .m_indirectIndex: * @param startofrules * flag indicating if we are at the start of rules * @return the offset of the next unparsed char * @exception ParseException * thrown when rule parsing fails */ private int parseNextToken(boolean startofrules) throws ParseException { if (m_inRange_) { // We are not done processing a range. Continue it. return processNextCodePointInRange(); } else if (m_isStarred_) { // We are not done processing a starred token. Continue it. return processNextTokenInTheStarredList(); } // Get the next token. int nextOffset = parseNextTokenInternal(startofrules); // If the next token is starred and/or in range, we need to handle it here. if (m_inRange_) { // A new range has started. // Check whether it is a chain of ranges with more than one hyphen. if (m_lastRangeCp_ > 0 && m_lastRangeCp_ == m_previousCp_) { throw new ParseException("Chained range syntax", m_current_); } // The current token is the first character of the second code point of the range. // Process just that, and then proceed with the star. m_lastRangeCp_ = m_source_.codePointAt(this.m_parsedToken_.m_charsOffset_); if (m_lastRangeCp_ <= m_previousCp_) { throw new ParseException("Invalid range", m_current_); } // Set current range code point to process the range loop m_currentRangeCp_ = m_previousCp_ + 1; // Set current starred char index to continue processing the starred // expression after the range is done. m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + Character.charCount(m_lastRangeCp_); m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1; return processNextCodePointInRange(); } else if (m_isStarred_) { // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be // separated into several tokens and returned. m_currentStarredCharIndex_ = m_parsedToken_.m_charsOffset_; m_lastStarredCharIndex_ = m_parsedToken_.m_charsOffset_ + m_parsedToken_.m_charsLen_ - 1; return processNextTokenInTheStarredList(); } return nextOffset; } private int processNextCodePointInRange() throws ParseException { int nChars = Character.charCount(m_currentRangeCp_); m_source_.appendCodePoint(m_currentRangeCp_); m_parsedToken_.m_charsOffset_ = m_extraCurrent_; m_parsedToken_.m_charsLen_ = nChars; m_extraCurrent_ += nChars; ++m_currentRangeCp_; if (m_currentRangeCp_ > m_lastRangeCp_) { // All the code points in the range are processed. // Turn the range flag off. m_inRange_ = false; // If there is a starred portion remaining in the current // parsed token, resume the starred operation. if (m_currentStarredCharIndex_ <= m_lastStarredCharIndex_) { m_isStarred_ = true; } else { m_isStarred_ = false; } } else { m_previousCp_ = m_currentRangeCp_; } return m_current_; } /** * Extracts the next token from the starred token from * m_currentStarredCharIndex_ and returns it. * @return the offset of the next unparsed char * @throws ParseException */ private int processNextTokenInTheStarredList() throws ParseException { // Extract the characters corresponding to the next code point. int cp = m_source_.codePointAt(m_currentStarredCharIndex_); int nChars = Character.charCount(cp); m_parsedToken_.m_charsLen_ = nChars; m_parsedToken_.m_charsOffset_ = m_currentStarredCharIndex_; m_currentStarredCharIndex_ += nChars; // When we are done parsing the starred string, turn the flag off so that // the normal processing is restored. if (m_currentStarredCharIndex_ > m_lastStarredCharIndex_) { m_isStarred_ = false; } m_previousCp_ = cp; return m_current_; } private int resetToTop(boolean top, boolean variableTop, int extensionOffset, int newExtensionLen, byte byteBefore) throws ParseException { m_parsedToken_.m_indirectIndex_ = 5; top = doSetTop(); return doEndParseNextToken(TOKEN_RESET_, top, extensionOffset, newExtensionLen, variableTop, byteBefore); } /** * Gets the next token and sets the necessary internal variables. * This function parses a starred string as a single token, which will be separated * in the calling function. * @param startofrules Boolean value indicating whether this is the first rule * @return the offset of the next unparsed char * @throws ParseException */ @SuppressWarnings("fallthrough") private int parseNextTokenInternal(boolean startofrules) throws ParseException { boolean variabletop = false; boolean top = false; boolean inchars = true; boolean inquote = false; boolean wasinquote = false; byte before = 0; boolean isescaped = false; int /*newcharslen = 0,*/ newextensionlen = 0; int /*charsoffset = 0,*/ extensionoffset = 0; int newstrength = TOKEN_UNSET_; initializeParsedToken(); int limit = m_rules_.length(); while (m_current_ < limit) { char ch = m_source_.charAt(m_current_); if (inquote) { if (ch == 0x0027) { // '\'' inquote = false; } else { if ((m_parsedToken_.m_charsLen_ == 0) || inchars) { if (m_parsedToken_.m_charsLen_ == 0) { m_parsedToken_.m_charsOffset_ = m_extraCurrent_; } m_parsedToken_.m_charsLen_ ++; } else { if (newextensionlen == 0) { extensionoffset = m_extraCurrent_; } newextensionlen ++; } } } else if (isescaped) { isescaped = false; if (newstrength == TOKEN_UNSET_) { throwParseException(m_rules_, m_current_); } if (ch != 0 && m_current_ != limit) { if (inchars) { if (m_parsedToken_.m_charsLen_ == 0) { m_parsedToken_.m_charsOffset_ = m_current_; } m_parsedToken_.m_charsLen_ ++; } else { if (newextensionlen == 0) { extensionoffset = m_current_; } newextensionlen ++; } } } else { if (!PatternProps.isWhiteSpace(ch)) { // Sets the strength for this entry switch (ch) { case 0x003D : // '=' if (newstrength != TOKEN_UNSET_) { return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before); } // if we start with strength, we'll reset to top if (startofrules == true) { return resetToTop(top, variabletop, extensionoffset, newextensionlen, before); } newstrength = Collator.IDENTICAL; if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*' m_current_++; m_isStarred_ = true; } break; case 0x002C : // ',' if (newstrength != TOKEN_UNSET_) { return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before); } // if we start with strength, we'll reset to top if (startofrules == true) { return resetToTop(top, variabletop, extensionoffset, newextensionlen, before); } newstrength = Collator.TERTIARY; break; case 0x003B : // ';' if (newstrength != TOKEN_UNSET_) { return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before); } //if we start with strength, we'll reset to top if(startofrules == true) { return resetToTop(top, variabletop, extensionoffset, newextensionlen, before); } newstrength = Collator.SECONDARY; break; case 0x003C : // '<' if (newstrength != TOKEN_UNSET_) { return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before); } // if we start with strength, we'll reset to top if (startofrules == true) { return resetToTop(top, variabletop, extensionoffset, newextensionlen, before); } // before this, do a scan to verify whether this is // another strength if (m_source_.charAt(m_current_ + 1) == 0x003C) { m_current_ ++; if (m_source_.charAt(m_current_ + 1) == 0x003C) { m_current_ ++; // three in a row! newstrength = Collator.TERTIARY; } else { // two in a row newstrength = Collator.SECONDARY; } } else { // just one newstrength = Collator.PRIMARY; } if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*' m_current_++; m_isStarred_ = true; } break; case 0x0026 : // '&' if (newstrength != TOKEN_UNSET_) { return doEndParseNextToken(newstrength, top, extensionoffset, newextensionlen, variabletop, before); } newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0 break; case 0x005b : // '[' // options - read an option, analyze it m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_); if (m_optionEnd_ != -1) { // ']' byte result = readAndSetOption(); m_current_ = m_optionEnd_; if ((result & TOKEN_TOP_MASK_) != 0) { if (newstrength == TOKEN_RESET_) { doSetTop(); if (before != 0) { // This is a combination of before and // indirection like // '&[before 2][first regular]>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_) && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */ int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16; int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary); ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1); int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1); m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505; m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_; m_parsedToken_.m_charsOffset_ = m_extraCurrent_; m_source_.append('\uFFFE'); m_source_.append((char)ch); m_extraCurrent_ += 2; m_parsedToken_.m_charsLen_++; m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24) | m_parsedToken_.m_charsOffset_; m_utilToken_.m_rules_ = m_source_; sourcetoken = m_hashTable_.get(m_utilToken_); if(sourcetoken == null) { m_listHeader_[m_resultLength_] = new TokenListHeader(); m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F; if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) { m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1]; } else { m_listHeader_[m_resultLength_].m_baseContCE_ = 0; } m_listHeader_[m_resultLength_].m_nextCE_ = 0; m_listHeader_[m_resultLength_].m_nextContCE_ = 0; m_listHeader_[m_resultLength_].m_previousCE_ = 0; m_listHeader_[m_resultLength_].m_previousContCE_ = 0; m_listHeader_[m_resultLength_].m_indirect_ = false; sourcetoken = new Token(); initAReset(-1, sourcetoken); } } else { // first ce and second ce m_utilCEBuffer_ /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE( basece, basecontce, strength, m_utilCEBuffer_); // we got the previous CE. Now we need to see if the difference between // the two CEs is really of the requested strength. // if it's a bigger difference (we asked for secondary and got primary), we // need to modify the CE. if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) { // adjust the strength // now we are in the situation where our baseCE should actually be modified in // order to get the CE in the right position. if(strength == Collator.SECONDARY) { m_utilCEBuffer_[0] = basece - 0x0200; } else { // strength == UCOL_TERTIARY m_utilCEBuffer_[0] = basece - 0x02; } if(RuleBasedCollator.isContinuation(basecontce)) { if(strength == Collator.SECONDARY) { m_utilCEBuffer_[1] = basecontce - 0x0200; } else { // strength == UCOL_TERTIARY m_utilCEBuffer_[1] = basecontce - 0x02; } } } /* // the code below relies on getting a code point from the inverse table, in order to be // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: // 1. There are many code points that have the same CE // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. // Also, in case when there is no equivalent strength before an element, we have to actually // construct one. For example, &[before 2]a << x won't result in x << a, because the element // before a is a primary difference. ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos + 2]; if ((ch & INVERSE_SIZE_MASK_) != 0) { int offset = ch & INVERSE_OFFSET_MASK_; ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[ offset]; } m_source_.append((char)ch); m_extraCurrent_ ++; m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1; m_parsedToken_.m_charsLen_ = 1; // We got an UCA before. However, this might have been tailored. // example: // &\u30ca = \u306a // &[before 3]\u306a<<<\u306a|\u309d m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24) | m_parsedToken_.m_charsOffset_; m_utilToken_.m_rules_ = m_source_; sourcetoken = (Token)m_hashTable_.get(m_utilToken_); */ // here is how it should be. The situation such as &[before 1]a < x, should be // resolved exactly as if we wrote &a > x. // therefore, I don't really care if the UCA value before a has been changed. // However, I do care if the strength between my element and the previous element // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll // have to construct the base CE. // if we found a tailored thing, we have to use the UCA value and // construct a new reset token with constructed name //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) { // character to which we want to anchor is already tailored. // We need to construct a new token which will be the anchor point //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE'); //m_source_.append(ch); //m_extraCurrent_ ++; //m_parsedToken_.m_charsLen_ ++; // grab before m_parsedToken_.m_charsOffset_ -= 10; m_parsedToken_.m_charsLen_ += 10; m_listHeader_[m_resultLength_] = new TokenListHeader(); m_listHeader_[m_resultLength_].m_baseCE_ = m_utilCEBuffer_[0] & 0xFFFFFF3F; if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) { m_listHeader_[m_resultLength_].m_baseContCE_ = m_utilCEBuffer_[1]; } else { m_listHeader_[m_resultLength_].m_baseContCE_ = 0; } m_listHeader_[m_resultLength_].m_nextCE_ = 0; m_listHeader_[m_resultLength_].m_nextContCE_ = 0; m_listHeader_[m_resultLength_].m_previousCE_ = 0; m_listHeader_[m_resultLength_].m_previousContCE_ = 0; m_listHeader_[m_resultLength_].m_indirect_ = false; sourcetoken = new Token(); initAReset(-1, sourcetoken); //} } return sourcetoken; } /** * Processing Description. * 1. Build a m_listHeader_. Each list has a header, which contains two lists * (positive and negative), a reset token, a baseCE, nextCE, and * previousCE. The lists and reset may be null. * 2. As you process, you keep a LAST pointer that points to the last token * you handled. * @param expand string offset, -1 for null strings * @param targetToken token to update * @return expandnext offset * @throws ParseException thrown when rules syntax failed */ private int initAReset(int expand, Token targetToken) throws ParseException { if (m_resultLength_ == m_listHeader_.length - 1) { // Unfortunately, this won't work, as we store addresses of lhs in // token TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1]; System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1); m_listHeader_ = temp; } // do the reset thing targetToken.m_rules_ = m_source_; targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 | m_parsedToken_.m_charsOffset_; targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 | m_parsedToken_.m_extensionOffset_; // keep the flags around so that we know about before targetToken.m_flags_ = m_parsedToken_.m_flags_; if (m_parsedToken_.m_prefixOffset_ != 0) { throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1); } targetToken.m_prefix_ = 0; // TODO: this should also handle reverse targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; targetToken.m_strength_ = TOKEN_RESET_; targetToken.m_next_ = null; targetToken.m_previous_ = null; targetToken.m_CELength_ = 0; targetToken.m_expCELength_ = 0; targetToken.m_listHeader_ = m_listHeader_[m_resultLength_]; m_listHeader_[m_resultLength_].m_first_ = null; m_listHeader_[m_resultLength_].m_last_ = null; m_listHeader_[m_resultLength_].m_first_ = null; m_listHeader_[m_resultLength_].m_last_ = null; m_listHeader_[m_resultLength_].m_reset_ = targetToken; /* 3 Consider each item: relation, source, and expansion: * e.g. ...< x / y ... * First convert all expansions into normal form. Examples: * If "xy" doesn't occur earlier in the list or in the UCA, convert * &xy * c * d * ... into &x * c/y * d * ... * Note: reset values can never have expansions, although they can * cause the very next item to have one. They may be contractions, if * they are found earlier in the list. */ int result = 0; if (expand > 0) { // check to see if there is an expansion if (m_parsedToken_.m_charsLen_ > 1) { targetToken.m_source_ = ((expand - m_parsedToken_.m_charsOffset_ ) << 24) | m_parsedToken_.m_charsOffset_; result = ((m_parsedToken_.m_charsLen_ + m_parsedToken_.m_charsOffset_ - expand) << 24) | expand; } } m_resultLength_ ++; m_hashTable_.put(targetToken, targetToken); return result; } /** * Checks if an character is special * @param ch character to test * @return true if the character is special */ private static final boolean isSpecialChar(char ch) { return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A) || (ch <= 0x0060 && ch >= 0x005B) || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B; } private UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException { while(source.charAt(start) != '[') { /* advance while we find the first '[' */ start++; } // now we need to get a balanced set of '[]'. The problem is that a set can have // many, and *end point to the first closing '[' int noOpenBraces = 1; int current = 1; // skip the opening brace while(start+current < source.length() && noOpenBraces != 0) { if(source.charAt(start+current) == '[') { noOpenBraces++; } else if(source.charAt(start+current) == ']') { // closing brace noOpenBraces--; } current++; } //int nextBrace = -1; if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) { throwParseException(m_rules_, start); } return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current); } /** in C, optionarg is passed by reference to function. * We use a private int to simulate this. */ private int m_optionarg_ = 0; private int readOption(String rules, int start, int optionend) { m_optionarg_ = 0; int i = 0; while (i < RULES_OPTIONS_.length) { String option = RULES_OPTIONS_[i].m_name_; int optionlength = option.length(); if (rules.length() > start + optionlength && option.equalsIgnoreCase(rules.substring(start, start + optionlength))) { if (optionend - start > optionlength) { m_optionarg_ = start + optionlength; // start of the options, skip space while (m_optionarg_ < optionend && PatternProps.isWhiteSpace(rules.charAt(m_optionarg_))) { // eat whitespace m_optionarg_ ++; } } break; } i ++; } if(i == RULES_OPTIONS_.length) { i = -1; } return i; } /** * Reads and set collation options * @return TOKEN_SUCCESS if option is set correct, 0 otherwise * @exception ParseException thrown when options in rules are wrong */ private byte readAndSetOption() throws ParseException { int start = m_current_ + 1; // skip opening '[' int i = readOption(m_rules_, start, m_optionEnd_); int optionarg = m_optionarg_; if (i < 0) { throwParseException(m_rules_, start); } if (i < 7) { if (optionarg != 0) { for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) { String subname = RULES_OPTIONS_[i].m_subOptions_[j]; int size = optionarg + subname.length(); if (m_rules_.length() > size && subname.equalsIgnoreCase(m_rules_.substring( optionarg, size))) { setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_, RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]); return TOKEN_SUCCESS_MASK_; } } } throwParseException(m_rules_, optionarg); } else if (i == 7) { // variable top return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_; } else if (i == 8) { // rearrange return TOKEN_SUCCESS_MASK_; } else if (i == 9) { // before if (optionarg != 0) { for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) { String subname = RULES_OPTIONS_[i].m_subOptions_[j]; int size = optionarg + subname.length(); if (m_rules_.length() > size && subname.equalsIgnoreCase( m_rules_.substring(optionarg, optionarg + subname.length()))) { return (byte)(TOKEN_SUCCESS_MASK_ | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] + 1); } } } throwParseException(m_rules_, optionarg); } else if (i == 10) { // top, we are going to have an array with // structures of limit CEs index to this array will be // src->parsedToken.indirectIndex m_parsedToken_.m_indirectIndex_ = 0; return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; } else if (i < 13) { // first, last for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) { String subname = RULES_OPTIONS_[i].m_subOptions_[j]; int size = optionarg + subname.length(); if (m_rules_.length() > size && subname.equalsIgnoreCase(m_rules_.substring(optionarg, size))) { m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1)); return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; } } throwParseException(m_rules_, optionarg); } else if(i == 13 || i == 14) { // copy and remove are handled before normalization // we need to move end here int noOpenBraces = 1; m_current_++; // skip opening brace while(m_current_ < m_source_.length() && noOpenBraces != 0) { if(m_source_.charAt(m_current_) == '[') { noOpenBraces++; } else if(m_source_.charAt(m_current_) == ']') { // closing brace noOpenBraces--; } m_current_++; } m_optionEnd_ = m_current_-1; return TOKEN_SUCCESS_MASK_; } else if(i == 16) { m_current_ = m_optionarg_; // skip opening brace and name parseScriptReorder(); return TOKEN_SUCCESS_MASK_; } else { throwParseException(m_rules_, optionarg); } return TOKEN_SUCCESS_MASK_; // we will never reach here. } /** * Set collation option * @param optionset option set to set * @param attribute type to set * @param value attribute value */ private void setOptions(OptionSet optionset, int attribute, int value) { switch (attribute) { case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ : optionset.m_isHiragana4_ = (value == RuleBasedCollator.AttributeValue.ON_); break; case RuleBasedCollator.Attribute.FRENCH_COLLATION_ : optionset.m_isFrenchCollation_ = (value == RuleBasedCollator.AttributeValue.ON_); break; case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ : optionset.m_isAlternateHandlingShifted_ = (value == RuleBasedCollator.AttributeValue.SHIFTED_); break; case RuleBasedCollator.Attribute.CASE_FIRST_ : optionset.m_caseFirst_ = value; break; case RuleBasedCollator.Attribute.CASE_LEVEL_ : optionset.m_isCaseLevel_ = (value == RuleBasedCollator.AttributeValue.ON_); break; case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ : if (value == RuleBasedCollator.AttributeValue.ON_) { value = Collator.CANONICAL_DECOMPOSITION; } optionset.m_decomposition_ = value; break; case RuleBasedCollator.Attribute.STRENGTH_ : optionset.m_strength_ = value; break; default : break; } } UnicodeSet getTailoredSet() throws ParseException { boolean startOfRules = true; UnicodeSet tailored = new UnicodeSet(); String pattern; CanonicalIterator it = new CanonicalIterator(""); m_parsedToken_.m_strength_ = TOKEN_UNSET_; int sourcelimit = m_source_.length(); //int expandNext = 0; while (m_current_ < sourcelimit) { m_parsedToken_.m_prefixOffset_ = 0; if (parseNextToken(startOfRules) < 0) { // we have reached the end continue; } startOfRules = false; // The idea is to tokenize the rule set. For each non-reset token, // we add all the canonicaly equivalent FCD sequences if(m_parsedToken_.m_strength_ != TOKEN_RESET_) { it.setSource(m_source_.substring( m_parsedToken_.m_charsOffset_, m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_)); pattern = it.next(); while(pattern != null) { if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) { tailored.add(pattern); } pattern = it.next(); } } } return tailored; } final private String preprocessRules(String rules) throws ParseException { int optionNumber = -1; int setStart = 0; int i = 0; while(i < rules.length()) { if(rules.charAt(i) == 0x005B) { // [ optionNumber = readOption(rules, i+1, rules.length()); setStart = m_optionarg_; if(optionNumber == 13) { /* copy - parts of UCA to tailoring */ UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart); if(m_copySet_ == null) { m_copySet_ = newSet; } else { m_copySet_.addAll(newSet); } } else if(optionNumber == 14) { UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart); if(m_removeSet_ == null) { m_removeSet_ = newSet; } else { m_removeSet_.addAll(newSet); } } else if(optionNumber == 19) { int optionEndOffset = rules.indexOf(']', i) + 1; ULocale locale = ULocale.forLanguageTag(rules.substring(setStart, optionEndOffset-1)); UResourceBundle bundle = UResourceBundle.getBundleInstance( ICUResourceBundle.ICU_BASE_NAME + "/coll", locale.getBaseName()); String type = locale.getKeywordValue("collation"); if(type == null){ type = "standard"; } String importRules = bundle.get("collations") .get(type) .get("Sequence") .getString(); rules = rules.substring(0, i) + importRules + rules.substring(optionEndOffset); } } i++; } return rules; } /* This is the data that is used for non-script reordering codes. These _must_ be kept * in order that they are to be applied as defaults and in synch with the Collator.ReorderCodes statics. */ static final String ReorderingTokensArray[] = { "SPACE", "PUNCT", "SYMBOL", "CURRENCY", "DIGIT", }; int findReorderingEntry(String name) { for (int tokenIndex = 0; tokenIndex < ReorderingTokensArray.length; tokenIndex++) { if (name.equalsIgnoreCase(ReorderingTokensArray[tokenIndex])) { return tokenIndex + ReorderCodes.FIRST; } } return UScript.INVALID_CODE; } private void parseScriptReorder() throws ParseException { ArrayList tempOrder = new ArrayList(); int end = m_rules_.indexOf(']', m_current_); if (end == -1) { return; } String tokenString = m_rules_.substring(m_current_, end); String[] tokens = tokenString.split("\\s+", 0); String token; for (int tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) { token = tokens[tokenIndex]; int reorderCode = findReorderingEntry(token); if (reorderCode == UScript.INVALID_CODE) { reorderCode = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, token); if (reorderCode < 0) { throw new ParseException(m_rules_, tokenIndex); } } tempOrder.add(reorderCode); } m_options_.m_scriptOrder_ = new int[tempOrder.size()]; for(int i = 0; i < tempOrder.size(); i++) { m_options_.m_scriptOrder_[i] = tempOrder.get(i); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy