![JAR search and dependency download from the Maven repository](/logo.png)
com.ibm.icu.text.CollationRuleParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator.ReorderCodes;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
/**
* Class for parsing collation rules, produces a list of tokens that will be
* turned into collation elements
* @author Syn Wee Quek
* @since release 2.2, June 7 2002
*/
final class CollationRuleParser
{
// public data members ---------------------------------------------------
// package private constructors ------------------------------------------
/**
 * Creates a parser over the given collation rules.
 * <p>
 * The rules are passed through {@code preprocessRules}, decomposed with
 * {@code Normalizer.decompose(rules, false)} and trimmed. The constructor
 * only initializes parser state; it does NOT parse anything. Callers must
 * invoke {@link #assembleTokenList()} themselves.
 * Please see RuleBasedCollator class description for more details on the
 * collation rule syntax.
 * @param rules the collation rules to build the collation table from.
 * @exception ParseException thrown when argument rules have an invalid
 *            syntax.
 */
CollationRuleParser(String rules) throws ParseException
{
    // Per the original note, this step prepares m_copySet_ and m_removeSet_.
    rules = preprocessRules(rules);
    // Save the rules in a StringBuilder: the same buffer is also used to
    // store extra text produced while parsing tokens (appended at the end).
    m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
    m_rules_ = m_source_.toString();
    // Index of the next unparsed character.
    m_current_ = 0;
    // Index of the next unwritten character in the parsed result; starts
    // right after the normalized rule text.
    m_extraCurrent_ = m_source_.length();
    m_variableTop_ = null;
    m_parsedToken_ = new ParsedToken();
    // Tokens are their own keys: lookups go through Token.hashCode/equals.
    m_hashTable_ = new HashMap<Token, Token>();
    m_options_ = new OptionSet(RuleBasedCollator.UCA_);
    m_listHeader_ = new TokenListHeader[512];
    m_resultLength_ = 0;
    // assembleTokenList() is deliberately not called here, so that a
    // parser can be initialized and tokens parsed manually afterwards.
    //assembleTokenList();
}
// package private inner classes -----------------------------------------
/**
 * Collation options set: a mutable copy of the attribute values read from a
 * RuleBasedCollator.
 */
static class OptionSet
{
    // package private data members --------------------------------------
    /** Copied from the collator's m_variableTopValue_. */
    int m_variableTopValue_;
    /** Copied from collator.isFrenchCollation(). */
    boolean m_isFrenchCollation_;
    /** Attribute for handling variable elements. */
    boolean m_isAlternateHandlingShifted_;
    /** Who goes first, lower case or upper case. */
    int m_caseFirst_;
    /** Whether there is an extra case level. */
    boolean m_isCaseLevel_;
    /** Attribute for normalization. */
    int m_decomposition_;
    /** Attribute for strength. */
    int m_strength_;
    /** Attribute for special Hiragana. */
    boolean m_isHiragana4_;
    /** The ordering of the scripts; stays null when the collator has none. */
    int[] m_scriptOrder_;

    // package private constructor ---------------------------------------
    /**
     * Initializes the option set from the argument collator.
     * @param collator collator whose current option values are copied
     */
    OptionSet(RuleBasedCollator collator)
    {
        m_variableTopValue_ = collator.m_variableTopValue_;
        m_isFrenchCollation_ = collator.isFrenchCollation();
        m_isAlternateHandlingShifted_ = collator.isAlternateHandlingShifted();
        m_caseFirst_ = collator.m_caseFirst_;
        m_isCaseLevel_ = collator.isCaseLevel();
        m_decomposition_ = collator.getDecomposition();
        m_strength_ = collator.getStrength();
        m_isHiragana4_ = collator.m_isHiragana4_;

        // No reorder codes on the collator: leave m_scriptOrder_ unset.
        if (collator.m_reorderCodes_ == null) {
            return;
        }
        // Element-wise copy so later changes to either array stay private.
        final int count = collator.m_reorderCodes_.length;
        m_scriptOrder_ = new int[count];
        int idx = 0;
        while (idx < count) {
            m_scriptOrder_[idx] = collator.m_reorderCodes_[idx];
            idx++;
        }
    }
}
/**
 * Header of one list of tokens used by the collation rules. The parser
 * creates one header per reset and links the tokens that follow the reset
 * between m_first_ and m_last_.
 */
static class TokenListHeader
{
    /** First token of the list, or null while the list is empty. */
    Token m_first_;
    /** Last token of the list. */
    Token m_last_;
    /** The reset token this list hangs off. */
    Token m_reset_;
    /** True when the reset was made on an indirect (symbolic) position. */
    boolean m_indirect_;
    // Base, next and previous collation elements, each with its
    // continuation part.
    int m_baseCE_;
    int m_baseContCE_;
    int m_nextCE_;
    int m_nextContCE_;
    int m_previousCE_;
    int m_previousContCE_;
    // Work arrays sized by strength level; they are not filled in by the
    // parser code visible here (presumably used by the rule builder).
    int[] m_pos_ = new int[Collator.IDENTICAL + 1];
    int[] m_gapsLo_ = new int[3 * (Collator.TERTIARY + 1)];
    int[] m_gapsHi_ = new int[3 * (Collator.TERTIARY + 1)];
    int[] m_numStr_ = new int[3 * (Collator.TERTIARY + 1)];
    Token[] m_fStrToken_ = new Token[Collator.TERTIARY + 1];
    Token[] m_lStrToken_ = new Token[Collator.TERTIARY + 1];
}
/**
 * Token wrapper for collation rules.
 * <p>
 * A token does not own its text. {@code m_source_} packs the text location
 * inside {@code m_rules_}: the high 8 bits hold the length and the low 24
 * bits hold the start offset. {@link #hashCode()} and
 * {@link #equals(Object)} are defined on that referenced text so that a
 * token can be used as its own key in a hash map.
 */
static class Token
{
    // package private data members ---------------------------------------
    int m_CE_[];
    int m_CELength_;
    int m_expCE_[];
    int m_expCELength_;
    /** Packed (length << 24 | offset) location of the token text. */
    int m_source_;
    /** Packed location of the expansion text, 0 if none. */
    int m_expansion_;
    /** Packed location of the prefix text. */
    int m_prefix_;
    int m_strength_;
    int m_toInsert_;
    int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
    TokenListHeader m_listHeader_;
    Token m_previous_;
    Token m_next_;
    /** Buffer that m_source_ indexes into. */
    StringBuilder m_rules_;
    char m_flags_;
    // package private constructors ---------------------------------------
    Token()
    {
        m_CE_ = new int[128];
        m_expCE_ = new int[128];
        // TODO: this should also handle reverse
        m_polarity_ = TOKEN_POLARITY_POSITIVE_;
        m_next_ = null;
        m_previous_ = null;
        m_CELength_ = 0;
        m_expCELength_ = 0;
    }
    // package private methods --------------------------------------------
    /**
     * Hashcode calculation for token, based on the referenced text. For
     * texts of 64 or more characters only every inc-th character is
     * sampled.
     * @return the hashcode
     */
    @Override
    public int hashCode()
    {
        int result = 0;
        int len = (m_source_ & 0xFF000000) >>> 24;
        int inc = ((len - 32) / 32) + 1;
        int start = m_source_ & 0x00FFFFFF;
        int limit = start + len;
        while (start < limit) {
            result = (result * 37) + m_rules_.charAt(start);
            start += inc;
        }
        return result;
    }
    /**
     * Equals calculation. Two tokens are equal when neither has an unset
     * (zero) source and both reference text of the same length and content.
     * @param target object to compare
     * @return true if target is the same as this object
     */
    @Override
    public boolean equals(Object target)
    {
        if (target == this) {
            return true;
        }
        if (!(target instanceof Token)) {
            return false;
        }
        Token t = (Token)target;
        if (m_source_ == 0 || t.m_source_ == 0) {
            return false;
        }
        // Unsigned shift, as in hashCode(): the length occupies the top
        // byte, so a signed shift would turn lengths >= 128 negative.
        int slimit = (m_source_ & 0xFF000000) >>> 24;
        // The length of the OTHER token must come from t.m_source_. Reading
        // it from this token made the length check below a no-op, so a
        // token compared equal to any longer token it was a prefix of.
        int tlimit = (t.m_source_ & 0xFF000000) >>> 24;
        if (slimit != tlimit) {
            return false;
        }
        if (m_source_ == t.m_source_) {
            return true;
        }
        int sstart = m_source_ & 0x00FFFFFF;
        int tstart = t.m_source_ & 0x00FFFFFF;
        int end = sstart + slimit - 1;
        // Walk both texts in step up to, but not including, the last
        // character; the final comparison below decides the result.
        while (sstart < end
               && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
        {
            ++ sstart;
            ++ tstart;
        }
        return m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart);
    }
}
// package private data member -------------------------------------------
/**
 * Strength value marking a token as a reset, i.e. the &amp; relation in
 * the rules.
 */
static final int TOKEN_RESET_ = 0xDEADBEEF;
/**
 * Number of token lists in use, i.e. the next free index in m_listHeader_.
 */
int m_resultLength_;
/**
 * List of parsed tokens, one header per reset.
 */
TokenListHeader m_listHeader_[];
/**
 * Variable top token
 */
Token m_variableTop_;
/**
 * Collation options
 */
OptionSet m_options_;
/**
 * Normalized collation rules with some extra characters
 */
StringBuilder m_source_;
/**
 * Hash table to keep all tokens. Each token is stored as its own key, so
 * lookups compare the text the tokens reference.
 */
Map<Token, Token> m_hashTable_;
// package private method ------------------------------------------------
/**
 * Copies every option held in m_options_ into the default-value fields of
 * the argument collator.
 * @param collator collator whose m_default* fields are overwritten
 */
void setDefaultOptionsInCollator(RuleBasedCollator collator)
{
    final OptionSet opts = m_options_;
    collator.m_defaultStrength_ = opts.m_strength_;
    collator.m_defaultDecomposition_ = opts.m_decomposition_;
    collator.m_defaultIsFrenchCollation_ = opts.m_isFrenchCollation_;
    collator.m_defaultIsAlternateHandlingShifted_
        = opts.m_isAlternateHandlingShifted_;
    collator.m_defaultIsCaseLevel_ = opts.m_isCaseLevel_;
    collator.m_defaultCaseFirst_ = opts.m_caseFirst_;
    collator.m_defaultIsHiragana4_ = opts.m_isHiragana4_;
    collator.m_defaultVariableTopValue_ = opts.m_variableTopValue_;
    // The collator receives its own copy of the script order, or null when
    // no script order is set.
    final int[] scriptOrder = opts.m_scriptOrder_;
    collator.m_defaultReorderCodes_
        = (scriptOrder == null) ? null : scriptOrder.clone();
}
// private inner classes -------------------------------------------------
/**
 * Scratch record for a token that has been parsed but not yet processed.
 * It exists to reduce the number of arguments passed around in the parser.
 */
private static class ParsedToken
{
    // private data members ---------------------------------------------
    int m_strength_;
    int m_charsOffset_;
    int m_charsLen_;
    int m_extensionOffset_;
    int m_extensionLen_;
    int m_prefixOffset_;
    int m_prefixLen_;
    char m_flags_;
    char m_indirectIndex_;

    // private constructor ----------------------------------------------
    /**
     * Creates an empty token: all offsets, lengths and flags are zero and
     * the strength is TOKEN_UNSET_.
     */
    ParsedToken()
    {
        m_strength_ = TOKEN_UNSET_;
        m_flags_ = 0;
        m_charsOffset_ = m_charsLen_ = 0;
        m_extensionOffset_ = m_extensionLen_ = 0;
        m_prefixOffset_ = m_prefixLen_ = 0;
    }
}
/**
 * Boundary wrappers: a start collation element and an optional limit
 * collation element, each stored with its continuation part.
 */
private static class IndirectBoundaries
{
    // package private data members --------------------------------------
    int m_startCE_;
    int m_startContCE_;
    int m_limitCE_;
    int m_limitContCE_;

    // package private constructor ---------------------------------------
    /**
     * @param startce start CE in element 0 and its continuation in
     *                element 1
     * @param limitce limit CE and continuation in the same layout, or null,
     *                in which case both limit values are set to 0
     */
    IndirectBoundaries(int startce[], int limitce[])
    {
        // Set values for the top - TODO: once we have values for all the
        // indirects, we are going to initialize here.
        m_startCE_ = startce[0];
        m_startContCE_ = startce[1];
        final boolean hasLimit = (limitce != null);
        m_limitCE_ = hasLimit ? limitce[0] : 0;
        m_limitContCE_ = hasLimit ? limitce[1] : 0;
    }
}
/**
 * Collation option rule tag: the option name, the collator attribute it
 * maps to, and the accepted sub-option names with their attribute values.
 */
private static class TokenOption
{
    // package private data member ---------------------------------------
    private String m_name_;
    private int m_attribute_;
    private String[] m_subOptions_;
    private int[] m_subOptionAttributeValues_;

    // package private constructor ---------------------------------------
    /**
     * Stores the arguments as given; the arrays are not copied.
     */
    TokenOption(String name, int attribute, String suboptions[],
                int suboptionattributevalue[])
    {
        this.m_name_ = name;
        this.m_attribute_ = attribute;
        this.m_subOptions_ = suboptions;
        this.m_subOptionAttributeValues_ = suboptionattributevalue;
    }
}
// private variables -----------------------------------------------------
/**
* Current parsed token
*/
private ParsedToken m_parsedToken_;
/**
* Collation rule
*/
private String m_rules_;
private int m_current_;
/**
* End of the option while reading.
* Need it for UnicodeSet reading support.
*/
private int m_optionEnd_;
/*
* Current offset in m_source
*/
//private int m_sourceLimit_;
/**
* Offset to m_source_ ofr the extra expansion characters
*/
private int m_extraCurrent_;
/**
* UnicodeSet that contains code points to be copied from the UCA
*/
UnicodeSet m_copySet_;
/**
* UnicodeSet that contains code points for which we want to remove
* UCA contractions. It implies copying of these code points from
* the UCA.
*/
UnicodeSet m_removeSet_;
/*
* This is space for the extra strings that need to be unquoted during the
* parsing of the rules
*/
//private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
/**
* Indicator that the token is not set yet
*/
private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
/*
* Indicator that the rule is in the > polarity, ie everything on the
* right of the rule is less than
*/
//private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
/**
* Indicator that the rule is in the < polarity, ie everything on the
* right of the rule is greater than
*/
private static final int TOKEN_POLARITY_POSITIVE_ = 1;
/**
* Flag mask to determine if top is set
*/
private static final int TOKEN_TOP_MASK_ = 0x04;
/**
* Flag mask to determine if variable top is set
*/
private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
/**
* Flag mask to determine if a before attribute is set
*/
private static final int TOKEN_BEFORE_ = 0x03;
/**
* For use in parsing token options
*/
private static final int TOKEN_SUCCESS_MASK_ = 0x10;
/**
* These values are used for finding CE values for indirect positioning.
* Indirect positioning is a mechanism for allowing resets on symbolic
* values. It only works for resets and you cannot tailor indirect names.
* An indirect name can define either an anchor point or a range. An anchor
* point behaves in exactly the same way as a code point in reset would,
* except that it cannot be tailored. A range (we currently only know for
* the [top] range will explicitly set the upper bound for generated CEs,
* thus allowing for better control over how many CEs can be squeezed
* between in the range without performance penalty. In that respect, we use
* [top] for tailoring of locales that use CJK characters. Other indirect
* values are currently a pure convenience, they can be used to assure that
* the CEs will be always positioned in the same place relative to a point
* with known properties (e.g. first primary ignorable).
*/
private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
// /**
// * Inverse UCA constants
// */
// private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
// private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
// private static final int INVERSE_SHIFT_VALUE_ = 20;
/**
* Collation option tags
* [last variable] last variable value
* [last primary ignorable] largest CE for primary ignorable
* [last secondary ignorable] largest CE for secondary ignorable
* [last tertiary ignorable] largest CE for tertiary ignorable
* [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
*/
private static final TokenOption RULES_OPTIONS_[];
static
{
INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
// UCOL_RESET_TOP_VALUE
INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
// UCOL_FIRST_PRIMARY_IGNORABLE
INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
null);
// UCOL_LAST_PRIMARY_IGNORABLE
INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
null);
// UCOL_FIRST_SECONDARY_IGNORABLE
INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
null);
// UCOL_LAST_SECONDARY_IGNORABLE
INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
null);
// UCOL_FIRST_TERTIARY_IGNORABLE
INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
null);
// UCOL_LAST_TERTIARY_IGNORABLE
INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
null);
// UCOL_FIRST_VARIABLE;
INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
null);
// UCOL_LAST_VARIABLE
INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
null);
// UCOL_FIRST_NON_VARIABLE
INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
null);
// UCOL_LAST_NON_VARIABLE
INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
// UCOL_FIRST_IMPLICIT
INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
null);
// UCOL_LAST_IMPLICIT
INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
// UCOL_FIRST_TRAILING
INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
null);
// UCOL_LAST_TRAILING
INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
null);
INDIRECT_BOUNDARIES_[14].m_limitCE_
= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
RULES_OPTIONS_ = new TokenOption[20];
String option[] = {"non-ignorable", "shifted"};
int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
RuleBasedCollator.AttributeValue.SHIFTED_};
RULES_OPTIONS_[0] = new TokenOption("alternate",
RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
option, value);
option = new String[1];
option[0] = "2";
value = new int[1];
value[0] = RuleBasedCollator.AttributeValue.ON_;
RULES_OPTIONS_[1] = new TokenOption("backwards",
RuleBasedCollator.Attribute.FRENCH_COLLATION_,
option, value);
String offonoption[] = new String[2];
offonoption[0] = "off";
offonoption[1] = "on";
int offonvalue[] = new int[2];
offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
RULES_OPTIONS_[2] = new TokenOption("caseLevel",
RuleBasedCollator.Attribute.CASE_LEVEL_,
offonoption, offonvalue);
option = new String[3];
option[0] = "lower";
option[1] = "upper";
option[2] = "off";
value = new int[3];
value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
value[2] = RuleBasedCollator.AttributeValue.OFF_;
RULES_OPTIONS_[3] = new TokenOption("caseFirst",
RuleBasedCollator.Attribute.CASE_FIRST_,
option, value);
RULES_OPTIONS_[4] = new TokenOption("normalization",
RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
offonoption, offonvalue);
RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
offonoption, offonvalue);
option = new String[5];
option[0] = "1";
option[1] = "2";
option[2] = "3";
option[3] = "4";
option[4] = "I";
value = new int[5];
value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
RULES_OPTIONS_[6] = new TokenOption("strength",
RuleBasedCollator.Attribute.STRENGTH_,
option, value);
RULES_OPTIONS_[7] = new TokenOption("variable top",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[8] = new TokenOption("rearrange",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
option = new String[3];
option[0] = "1";
option[1] = "2";
option[2] = "3";
value = new int[3];
value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
RULES_OPTIONS_[9] = new TokenOption("before",
RuleBasedCollator.Attribute.LIMIT_,
option, value);
RULES_OPTIONS_[10] = new TokenOption("top",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
String firstlastoption[] = new String[7];
firstlastoption[0] = "primary";
firstlastoption[1] = "secondary";
firstlastoption[2] = "tertiary";
firstlastoption[3] = "variable";
firstlastoption[4] = "regular";
firstlastoption[5] = "implicit";
firstlastoption[6] = "trailing";
int firstlastvalue[] = new int[7];
Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
RULES_OPTIONS_[11] = new TokenOption("first",
RuleBasedCollator.Attribute.LIMIT_,
firstlastoption, firstlastvalue);
RULES_OPTIONS_[12] = new TokenOption("last",
RuleBasedCollator.Attribute.LIMIT_,
firstlastoption, firstlastvalue);
RULES_OPTIONS_[13] = new TokenOption("optimize",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[15] = new TokenOption("undefined",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[16] = new TokenOption("reorder",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[17] = new TokenOption("charsetname",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[18] = new TokenOption("charset",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
RULES_OPTIONS_[19] = new TokenOption("import",
RuleBasedCollator.Attribute.LIMIT_,
null, null);
}
/**
* Utility data members
*/
private Token m_utilToken_ = new Token();
private CollationElementIterator m_UCAColEIter_
= RuleBasedCollator.UCA_.getCollationElementIterator("");
private int m_utilCEBuffer_[] = new int[2];
private boolean m_isStarred_;
private int m_currentStarredCharIndex_;
private int m_lastStarredCharIndex_;
private int m_currentRangeCp_;
private int m_lastRangeCp_;
private boolean m_inRange_;
private int m_previousCp_;
private boolean m_savedIsStarred_;
// private methods -------------------------------------------------------
/**
* Assembles the token list.
* <p>
* Repeatedly calls parseNextToken() and files each parsed token:
* <ul>
* <li>A reset (strength TOKEN_RESET_) either re-uses a token already in
* m_hashTable_ (possibly a shorter form of the reset text, in which case the
* remainder becomes an implicit expansion for the following tokens) or
* opens a new TokenListHeader whose base CEs come from the UCA or, for
* indirect positions, from INDIRECT_BOUNDARIES_.</li>
* <li>Any other relation looks up or creates the source token, unlinks it
* from its previous position if it was already in a list, and links it into
* the list of the last token according to strength.</li>
* </ul>
* @return the number of token lists (m_resultLength_), after dropping a
* trailing list that has no tokens
* @exception ParseException thrown when rules syntax fails: here, when a
* relation appears before any reset, or when the strength of a
* token following a [before n] reset does not match n
*/
int assembleTokenList() throws ParseException
{
Token lastToken = null;
m_parsedToken_.m_strength_ = TOKEN_UNSET_;
// NOTE: the limit is read once; text appended to m_source_ further down
// (combined expansions) lies beyond it and is not parsed as rules.
int sourcelimit = m_source_.length();
// Packed (length << 24 | offset) implicit expansion carried over from a
// reset that matched only a shorter form of its text; 0 when none.
int expandNext = 0;
m_isStarred_ = false;
while (m_current_ < sourcelimit || m_isStarred_) {
m_parsedToken_.m_prefixOffset_ = 0;
if (parseNextToken(lastToken == null) < 0) {
// we have reached the end
continue;
}
char specs = m_parsedToken_.m_flags_;
boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
int lastStrength = TOKEN_UNSET_;
if (lastToken != null) {
lastStrength = lastToken.m_strength_;
}
// m_utilToken_ is only a lookup key: it points at the parsed characters
// inside m_source_.
m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
| m_parsedToken_.m_charsOffset_;
m_utilToken_.m_rules_ = m_source_;
// 4 Lookup each source in the CharsToToken map, and find a
// sourcetoken
Token sourceToken = m_hashTable_.get(m_utilToken_);
if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
if (lastToken == null) {
// this means that rules haven't started properly
throwParseException(m_source_.toString(), 0);
}
// 6 Otherwise (when relation != reset)
if (sourceToken == null) {
// If sourceToken is null, create new one
sourceToken = new Token();
sourceToken.m_rules_ = m_source_;
sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
| m_parsedToken_.m_charsOffset_;
sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
| m_parsedToken_.m_prefixOffset_;
// TODO: this should also handle reverse
sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
sourceToken.m_next_ = null;
sourceToken.m_previous_ = null;
sourceToken.m_CELength_ = 0;
sourceToken.m_expCELength_ = 0;
m_hashTable_.put(sourceToken, sourceToken);
}
else {
// we could have fished out a reset here
if (sourceToken.m_strength_ != TOKEN_RESET_
&& lastToken != sourceToken) {
// otherwise remove sourceToken from where it was.
// Take care of the next node
if (sourceToken.m_next_ != null) {
// The successor inherits the removed token's strength when that
// strength has the smaller value.
if (sourceToken.m_next_.m_strength_
> sourceToken.m_strength_) {
sourceToken.m_next_.m_strength_
= sourceToken.m_strength_;
}
sourceToken.m_next_.m_previous_
= sourceToken.m_previous_;
}
else {
// sourcetoken is the last token.
// Redefine the tail token.
sourceToken.m_listHeader_.m_last_
= sourceToken.m_previous_;
}
// Take care of the previous node.
if (sourceToken.m_previous_ != null) {
sourceToken.m_previous_.m_next_
= sourceToken.m_next_;
}
else {
// sourcetoken is the first token.
// Redefine the head node.
sourceToken.m_listHeader_.m_first_
= sourceToken.m_next_;
}
sourceToken.m_next_ = null;
sourceToken.m_previous_ = null;
}
}
sourceToken.m_strength_ = m_parsedToken_.m_strength_;
sourceToken.m_listHeader_ = lastToken.m_listHeader_;
// 1. Find the strongest strength in each list, and set
// strongestP and strongestN accordingly in the headers.
if (lastStrength == TOKEN_RESET_
|| sourceToken.m_listHeader_.m_first_ == null) {
// If LAST is a reset insert sourceToken in the list.
if (sourceToken.m_listHeader_.m_first_ == null) {
sourceToken.m_listHeader_.m_first_ = sourceToken;
sourceToken.m_listHeader_.m_last_ = sourceToken;
}
else { // we need to find a place for us
// and we'll get in front of the same strength
if (sourceToken.m_listHeader_.m_first_.m_strength_
<= sourceToken.m_strength_) {
sourceToken.m_next_
= sourceToken.m_listHeader_.m_first_;
sourceToken.m_next_.m_previous_ = sourceToken;
sourceToken.m_listHeader_.m_first_ = sourceToken;
sourceToken.m_previous_ = null;
}
else {
// Walk forward from the head; note that lastToken is reused as the
// cursor here and keeps that value for the checks further down.
lastToken = sourceToken.m_listHeader_.m_first_;
while (lastToken.m_next_ != null
&& lastToken.m_next_.m_strength_
> sourceToken.m_strength_) {
lastToken = lastToken.m_next_;
}
if (lastToken.m_next_ != null) {
lastToken.m_next_.m_previous_ = sourceToken;
}
else {
sourceToken.m_listHeader_.m_last_
= sourceToken;
}
sourceToken.m_previous_ = lastToken;
sourceToken.m_next_ = lastToken.m_next_;
lastToken.m_next_ = sourceToken;
}
}
}
else {
// Otherwise (when LAST is not a reset)
// if polarity (LAST) == polarity(relation), insert
// sourceToken after LAST, otherwise insert before.
// when inserting after or before, search to the next
// position with the same strength in that direction.
// (This is called postpone insertion).
if (sourceToken != lastToken) {
if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
while (lastToken.m_next_ != null
&& lastToken.m_next_.m_strength_
> sourceToken.m_strength_) {
lastToken = lastToken.m_next_;
}
sourceToken.m_previous_ = lastToken;
if (lastToken.m_next_ != null) {
lastToken.m_next_.m_previous_ = sourceToken;
}
else {
sourceToken.m_listHeader_.m_last_ = sourceToken;
}
sourceToken.m_next_ = lastToken.m_next_;
lastToken.m_next_ = sourceToken;
}
else {
while (lastToken.m_previous_ != null
&& lastToken.m_previous_.m_strength_
> sourceToken.m_strength_) {
lastToken = lastToken.m_previous_;
}
sourceToken.m_next_ = lastToken;
if (lastToken.m_previous_ != null) {
lastToken.m_previous_.m_next_ = sourceToken;
}
else {
sourceToken.m_listHeader_.m_first_
= sourceToken;
}
sourceToken.m_previous_ = lastToken.m_previous_;
lastToken.m_previous_ = sourceToken;
}
}
else { // repeated one thing twice in rules, stay with the
// stronger strength
if (lastStrength < sourceToken.m_strength_) {
sourceToken.m_strength_ = lastStrength;
}
}
}
// if the token was a variable top, we're gonna put it in
// (only the first variable-top token is recorded)
if (variableTop == true && m_variableTop_ == null) {
variableTop = false;
m_variableTop_ = sourceToken;
}
// Treat the expansions.
// There are two types of expansions: explicit (x / y) and
// reset based propagating expansions
// (&abc * d * e <=> &ab * d / c * e / c)
// if both of them are in effect for a token, they are combined.
sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
| m_parsedToken_.m_extensionOffset_;
if (expandNext != 0) {
if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
// primary strength kills off the implicit expansion
expandNext = 0;
}
else if (sourceToken.m_expansion_ == 0) {
// if there is no expansion, implicit is just added to
// the token
sourceToken.m_expansion_ = expandNext;
}
else {
// there is both explicit and implicit expansion.
// We need to make a combination: implicit text followed by explicit
// text, appended to m_source_ and addressed via m_extraCurrent_.
int start = expandNext & 0xFFFFFF;
int size = expandNext >>> 24;
if (size > 0) {
m_source_.append(m_source_.substring(start,
start + size));
}
start = m_parsedToken_.m_extensionOffset_;
m_source_.append(m_source_.substring(start,
start + m_parsedToken_.m_extensionLen_));
sourceToken.m_expansion_ = (size
+ m_parsedToken_.m_extensionLen_) << 24
| m_extraCurrent_;
m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
}
}
// if the previous token was a reset before, the strength of this
// token must match the strength of before. Otherwise we have an
// undefined situation.
// In other words, we currently have a kludge which we use to
// represent &a >> x. This is written as &[before 2]a << x.
if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
if(beforeStrength != sourceToken.m_strength_) {
throwParseException(m_source_.toString(), m_current_);
}
}
}
else {
if (lastToken != null && lastStrength == TOKEN_RESET_) {
// if the previous token was also a reset, this means that
// we have two consecutive resets and we want to remove the
// previous one if empty
if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
m_resultLength_ --;
}
}
if (sourceToken == null) {
// this is a reset, but it might still be somewhere in the
// tailoring, in shorter form
int searchCharsLen = m_parsedToken_.m_charsLen_;
while (searchCharsLen > 1 && sourceToken == null) {
searchCharsLen --;
// key = searchCharsLen << 24 | charsOffset;
m_utilToken_.m_source_ = searchCharsLen << 24
| m_parsedToken_.m_charsOffset_;
m_utilToken_.m_rules_ = m_source_;
sourceToken = m_hashTable_.get(m_utilToken_);
}
if (sourceToken != null) {
// The unmatched tail of the reset text becomes the implicit
// expansion for the tokens that follow.
expandNext = (m_parsedToken_.m_charsLen_
- searchCharsLen) << 24
| (m_parsedToken_.m_charsOffset_
+ searchCharsLen);
}
}
if ((specs & TOKEN_BEFORE_) != 0) {
if (top == false) {
// we're doing before & there is no indirection
int strength = (specs & TOKEN_BEFORE_) - 1;
if (sourceToken != null
&& sourceToken.m_strength_ != TOKEN_RESET_) {
// this is a before that is already ordered in the UCA
// - so we need to get the previous with good strength
while (sourceToken.m_strength_ > strength
&& sourceToken.m_previous_ != null) {
sourceToken = sourceToken.m_previous_;
}
// here, either we hit the strength or NULL
if (sourceToken.m_strength_ == strength) {
if (sourceToken.m_previous_ != null) {
sourceToken = sourceToken.m_previous_;
}
else { // start of list
sourceToken
= sourceToken.m_listHeader_.m_reset_;
}
}
else { // we hit NULL, we should be doing the else part
sourceToken
= sourceToken.m_listHeader_.m_reset_;
sourceToken = getVirginBefore(sourceToken,
strength);
}
}
else {
sourceToken
= getVirginBefore(sourceToken, strength);
}
}
else {
// this is both before and indirection
top = false;
m_listHeader_[m_resultLength_] = new TokenListHeader();
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
m_listHeader_[m_resultLength_].m_indirect_ = true;
// we need to do slightly more work. we need to get the
// baseCE using the inverse UCA & getPrevious. The next
// bound is not set, and will be decided in ucol_bld
int strength = (specs & TOKEN_BEFORE_) - 1;
int baseCE = INDIRECT_BOUNDARIES_[
m_parsedToken_.m_indirectIndex_].m_startCE_;
int baseContCE = INDIRECT_BOUNDARIES_[
m_parsedToken_.m_indirectIndex_].m_startContCE_;
int ce[] = new int[2];
// When the lead byte of baseCE falls in the implicit primary range,
// the previous CE is derived from the implicit generator (raw - 1);
// otherwise it is looked up through the inverse UCA.
if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
&& (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
} else {
CollationParsedRuleBuilder.InverseUCA invuca
= CollationParsedRuleBuilder.INVERSE_UCA_;
invuca.getInversePrevCE(baseCE, baseContCE, strength,
ce);
}
m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
sourceToken = new Token();
expandNext = initAReset(0, sourceToken);
}
}
// 5 If the relation is a reset:
// If sourceToken is null
// Create new list, create new sourceToken, make the baseCE
// from source, put the sourceToken in ListHeader of the new
// list
if (sourceToken == null) {
if (m_listHeader_[m_resultLength_] == null) {
m_listHeader_[m_resultLength_] = new TokenListHeader();
}
// 3 Consider each item: relation, source, and expansion:
// e.g. ...< x / y ...
// First convert all expansions into normal form.
// Examples:
// If "xy" doesn't occur earlier in the list or in the UCA,
// convert &xy * c * d * ... into &x * c/y * d * ...
// Note: reset values can never have expansions, although
// they can cause the very next item to have one. They may
// be contractions, if they are found earlier in the list.
if (top == false) {
CollationElementIterator coleiter
= RuleBasedCollator.UCA_.getCollationElementIterator(
m_source_.substring(m_parsedToken_.m_charsOffset_,
m_parsedToken_.m_charsOffset_
+ m_parsedToken_.m_charsLen_));
int CE = coleiter.next();
// offset to the character in the full rule string
int expand = coleiter.getOffset()
+ m_parsedToken_.m_charsOffset_;
int SecondCE = coleiter.next();
m_listHeader_[m_resultLength_].m_baseCE_
= CE & 0xFFFFFF3F;
if (RuleBasedCollator.isContinuation(SecondCE)) {
m_listHeader_[m_resultLength_].m_baseContCE_
= SecondCE;
}
else {
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
}
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
m_listHeader_[m_resultLength_].m_indirect_ = false;
sourceToken = new Token();
expandNext = initAReset(expand, sourceToken);
}
else { // top == TRUE
// Indirect reset: base and next CEs come straight from the boundary
// table entry selected by the parsed indirect index.
top = false;
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
m_listHeader_[m_resultLength_].m_indirect_ = true;
IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
m_parsedToken_.m_indirectIndex_];
m_listHeader_[m_resultLength_].m_baseCE_
= ib.m_startCE_;
m_listHeader_[m_resultLength_].m_baseContCE_
= ib.m_startContCE_;
m_listHeader_[m_resultLength_].m_nextCE_
= ib.m_limitCE_;
m_listHeader_[m_resultLength_].m_nextContCE_
= ib.m_limitContCE_;
sourceToken = new Token();
expandNext = initAReset(0, sourceToken);
}
}
else { // reset to something already in rules
top = false;
}
}
// 7 After all this, set LAST to point to sourceToken, and goto
// step 3.
lastToken = sourceToken;
}
// A trailing reset that never received any token leaves an empty list;
// it is not counted in the result.
if (m_resultLength_ > 0
&& m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
m_resultLength_ --;
}
return m_resultLength_;
}
/**
 * Formats and throws a ParseException that shows the rule text on either
 * side of the failing position.
 * <p>
 * The split position is clamped to the bounds of the rule string so that an
 * out-of-range offset (or a null rule string) still produces the intended
 * ParseException rather than an index or null-pointer failure. The offset
 * reported in the message and in the exception is the caller's original,
 * unclamped value.
 * @param rules collation rule that failed, may be null
 * @param offset failed offset in rules
 * @throws ParseException always, carrying the failure information
 */
private static final void throwParseException(String rules, int offset)
        throws ParseException
{
    String text = (rules == null) ? "" : rules;
    // Clamp only the position used for slicing; keep the raw offset for reporting.
    int split = Math.max(0, Math.min(offset, text.length()));
    String precontext = text.substring(0, split);
    String postcontext = text.substring(split);
    StringBuilder error = new StringBuilder(
            "Parse error occurred in rule at offset ");
    error.append(offset);
    error.append("\n after the prefix \"");
    error.append(precontext);
    error.append("\" before the suffix \"");
    error.append(postcontext);
    // Close the quote opened in front of the suffix.
    error.append('"');
    throw new ParseException(error.toString(), offset);
}
/**
 * Appends a synthetic source sequence for the indirect boundary selected by
 * m_parsedToken_.m_indirectIndex_ and points the parsed token at it.
 * The sequence is 0xFFFE followed by the upper and lower 16 bits of the
 * boundary's start CE and, when the boundary has a non-zero start
 * continuation CE, the upper and lower 16 bits of that as well.
 * @return always true
 */
private final boolean doSetTop() {
    // The token's characters live in the appended area of m_source_.
    m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
    m_source_.append((char)0xFFFE);

    final IndirectBoundaries boundary =
            INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
    m_source_.append((char)(boundary.m_startCE_ >> 16))
             .append((char)(boundary.m_startCE_ & 0xFFFF));
    m_extraCurrent_ += 3;

    if (boundary.m_startContCE_ != 0) {
        // A continuation CE contributes two more chars.
        m_source_.append((char)(boundary.m_startContCE_ >> 16))
                 .append((char)(boundary.m_startContCE_ & 0xFFFF));
        m_extraCurrent_ += 2;
        m_parsedToken_.m_charsLen_ = 5;
    } else {
        m_parsedToken_.m_charsLen_ = 3;
    }
    return true;
}
/**
 * Tests whether a char is one of the line-break characters recognized here:
 * LF, FF, CR, NEL, LS or PS.
 * @param c char to test
 * @return true if c is one of those six characters
 */
private static boolean isCharNewLine(char c) {
    return c == 0x000A      /* LF */
            || c == 0x000C  /* FF */
            || c == 0x000D  /* CR */
            || c == 0x0085  /* NEL */
            || c == 0x2028  /* LS */
            || c == 0x2029; /* PS */
}
/**
 * Produces the next token, expanding starred lists and code point ranges
 * into one token per code point.
 *
 * If a range or a starred list is still being expanded, the next element of
 * that expansion is returned. Otherwise a fresh token is parsed with
 * parseNextTokenInternal(); if that token turns out to start a range or to
 * be starred, the expansion state is set up and its first element returned.
 *
 * Member variables read or updated:
 *   m_current_: index of the next unparsed char (not code point) in m_source_.
 *   m_parsedToken_: the parsed token; m_charsOffset_ and m_charsLen_ locate
 *       its characters, either in the main string or in the appended part.
 *   m_inRange_, m_isStarred_: which expansion, if any, is in progress.
 *   m_previousCp_, m_lastRangeCp_, m_currentRangeCp_: range bookkeeping.
 *   m_currentStarredCharIndex_, m_lastStarredCharIndex_: inclusive bounds of
 *       the starred characters still to be returned.
 * @param startofrules
 *            flag indicating if we are at the start of rules
 * @return the offset of the next unparsed char
 * @exception ParseException
 *                thrown when rule parsing fails, including chained or
 *                descending ranges
 */
private int parseNextToken(boolean startofrules) throws ParseException
{
    // An expansion left over from a previous call takes priority.
    if (m_inRange_) {
        return processNextCodePointInRange();
    }
    if (m_isStarred_) {
        return processNextTokenInTheStarredList();
    }

    final int nextOffset = parseNextTokenInternal(startofrules);

    if (!m_inRange_ && !m_isStarred_) {
        // Ordinary token, nothing to expand.
        return nextOffset;
    }

    final int tokenStart = m_parsedToken_.m_charsOffset_;
    final int tokenLast = tokenStart + m_parsedToken_.m_charsLen_ - 1;

    if (!m_inRange_) {
        // Starred token: the chars in [tokenStart .. tokenLast], both
        // inclusive, are handed out one code point at a time.
        m_currentStarredCharIndex_ = tokenStart;
        m_lastStarredCharIndex_ = tokenLast;
        return processNextTokenInTheStarredList();
    }

    // A new range has started. Reject a chain of ranges ("a-b-c").
    if (m_lastRangeCp_ > 0 && m_lastRangeCp_ == m_previousCp_) {
        throw new ParseException("Chained range syntax", m_current_);
    }
    // The token begins with the code point that ends the range.
    m_lastRangeCp_ = m_source_.codePointAt(tokenStart);
    if (m_lastRangeCp_ <= m_previousCp_) {
        throw new ParseException("Invalid range", m_current_);
    }
    // The range loop starts right after the previously returned code point.
    m_currentRangeCp_ = m_previousCp_ + 1;
    // Whatever follows the range end in this token is resumed as a starred
    // list once the range is exhausted.
    m_currentStarredCharIndex_ = tokenStart + Character.charCount(m_lastRangeCp_);
    m_lastStarredCharIndex_ = tokenLast;
    return processNextCodePointInRange();
}
/**
 * Emits the current code point of the range being expanded as the parsed
 * token and advances the range.
 * The code point is appended to m_source_ and the token is pointed at the
 * appended chars. When the range is exhausted, range mode is switched off
 * and starred mode is switched on exactly when starred characters remain.
 * @return the offset of the next unparsed char
 * @throws ParseException declared for symmetry with the other token
 *         producers; not thrown by this method itself
 */
private int processNextCodePointInRange() throws ParseException {
    final int codePoint = m_currentRangeCp_;
    final int width = Character.charCount(codePoint);

    m_source_.appendCodePoint(codePoint);
    m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
    m_parsedToken_.m_charsLen_ = width;
    m_extraCurrent_ += width;

    m_currentRangeCp_ = codePoint + 1;
    if (m_currentRangeCp_ <= m_lastRangeCp_) {
        // More code points to come.
        m_previousCp_ = m_currentRangeCp_;
        return m_current_;
    }

    // Range finished; resume the starred remainder of the token, if any.
    m_inRange_ = false;
    m_isStarred_ = m_currentStarredCharIndex_ <= m_lastStarredCharIndex_;
    return m_current_;
}
/**
 * Returns the next code point of the starred token as a token of its own,
 * starting at m_currentStarredCharIndex_.
 * Starred mode is switched off once the index passes
 * m_lastStarredCharIndex_; the code point is remembered in m_previousCp_.
 * @return the offset of the next unparsed char
 * @throws ParseException declared for symmetry with the other token
 *         producers; not thrown by this method itself
 */
private int processNextTokenInTheStarredList() throws ParseException {
    final int tokenStart = m_currentStarredCharIndex_;
    final int codePoint = m_source_.codePointAt(tokenStart);
    final int width = Character.charCount(codePoint);

    // The token covers exactly the chars of this one code point.
    m_parsedToken_.m_charsLen_ = width;
    m_parsedToken_.m_charsOffset_ = tokenStart;
    m_currentStarredCharIndex_ = tokenStart + width;

    if (m_currentStarredCharIndex_ > m_lastStarredCharIndex_) {
        // Starred string consumed; go back to normal parsing.
        m_isStarred_ = false;
    }
    m_previousCp_ = codePoint;
    return m_current_;
}
/**
 * Turns a rule that starts with a strength operator into a reset: selects
 * indirect boundary index 5, builds its source sequence with doSetTop() and
 * finishes the token as a TOKEN_RESET_.
 * @param top ignored on entry; the value passed on is the result of doSetTop()
 * @param variableTop forwarded to doEndParseNextToken
 * @param extensionOffset forwarded to doEndParseNextToken
 * @param newExtensionLen forwarded to doEndParseNextToken
 * @param byteBefore forwarded to doEndParseNextToken
 * @return the result of doEndParseNextToken
 * @throws ParseException propagated from doEndParseNextToken
 */
private int resetToTop(boolean top, boolean variableTop,
                       int extensionOffset, int newExtensionLen,
                       byte byteBefore) throws ParseException {
    m_parsedToken_.m_indirectIndex_ = 5;
    final boolean isTop = doSetTop();
    return doEndParseNextToken(TOKEN_RESET_, isTop, extensionOffset,
                               newExtensionLen, variableTop, byteBefore);
}
/**
* Gets the next token and sets the necessary internal variables.
* This function parses a starred string as a single token, which will be separated
* in the calling function.
* @param startofrules Boolean value indicating whether this is the first rule
* @return the offset of the next unparsed char
* @throws ParseException
*/
@SuppressWarnings("fallthrough")
private int parseNextTokenInternal(boolean startofrules) throws ParseException {
boolean variabletop = false;
boolean top = false;
boolean inchars = true;
boolean inquote = false;
boolean wasinquote = false;
byte before = 0;
boolean isescaped = false;
int /*newcharslen = 0,*/ newextensionlen = 0;
int /*charsoffset = 0,*/ extensionoffset = 0;
int newstrength = TOKEN_UNSET_;
initializeParsedToken();
int limit = m_rules_.length();
while (m_current_ < limit) {
char ch = m_source_.charAt(m_current_);
if (inquote) {
if (ch == 0x0027) { // '\''
inquote = false;
}
else {
if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
if (m_parsedToken_.m_charsLen_ == 0) {
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
}
m_parsedToken_.m_charsLen_ ++;
}
else {
if (newextensionlen == 0) {
extensionoffset = m_extraCurrent_;
}
newextensionlen ++;
}
}
}
else if (isescaped) {
isescaped = false;
if (newstrength == TOKEN_UNSET_) {
throwParseException(m_rules_, m_current_);
}
if (ch != 0 && m_current_ != limit) {
if (inchars) {
if (m_parsedToken_.m_charsLen_ == 0) {
m_parsedToken_.m_charsOffset_ = m_current_;
}
m_parsedToken_.m_charsLen_ ++;
}
else {
if (newextensionlen == 0) {
extensionoffset = m_current_;
}
newextensionlen ++;
}
}
}
else {
if (!PatternProps.isWhiteSpace(ch)) {
// Sets the strength for this entry
switch (ch) {
case 0x003D : // '='
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.IDENTICAL;
if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
m_current_++;
m_isStarred_ = true;
}
break;
case 0x002C : // ','
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.TERTIARY;
break;
case 0x003B : // ';'
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
//if we start with strength, we'll reset to top
if(startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
newstrength = Collator.SECONDARY;
break;
case 0x003C : // '<'
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
// if we start with strength, we'll reset to top
if (startofrules == true) {
return resetToTop(top, variabletop, extensionoffset,
newextensionlen, before);
}
// before this, do a scan to verify whether this is
// another strength
if (m_source_.charAt(m_current_ + 1) == 0x003C) {
m_current_ ++;
if (m_source_.charAt(m_current_ + 1) == 0x003C) {
m_current_ ++; // three in a row!
newstrength = Collator.TERTIARY;
}
else { // two in a row
newstrength = Collator.SECONDARY;
}
}
else { // just one
newstrength = Collator.PRIMARY;
}
if (m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
m_current_++;
m_isStarred_ = true;
}
break;
case 0x0026 : // '&'
if (newstrength != TOKEN_UNSET_) {
return doEndParseNextToken(newstrength,
top,
extensionoffset,
newextensionlen,
variabletop, before);
}
newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
break;
case 0x005b : // '['
// options - read an option, analyze it
m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
if (m_optionEnd_ != -1) { // ']'
byte result = readAndSetOption();
m_current_ = m_optionEnd_;
if ((result & TOKEN_TOP_MASK_) != 0) {
if (newstrength == TOKEN_RESET_) {
doSetTop();
if (before != 0) {
// This is a combination of before and
// indirection like
// '&[before 2][first regular]>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
&& (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
m_source_.append('\uFFFE');
m_source_.append((char)ch);
m_extraCurrent_ += 2;
m_parsedToken_.m_charsLen_++;
m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
| m_parsedToken_.m_charsOffset_;
m_utilToken_.m_rules_ = m_source_;
sourcetoken = m_hashTable_.get(m_utilToken_);
if(sourcetoken == null) {
m_listHeader_[m_resultLength_] = new TokenListHeader();
m_listHeader_[m_resultLength_].m_baseCE_
= m_utilCEBuffer_[0] & 0xFFFFFF3F;
if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
m_listHeader_[m_resultLength_].m_baseContCE_
= m_utilCEBuffer_[1];
}
else {
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
}
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
m_listHeader_[m_resultLength_].m_indirect_ = false;
sourcetoken = new Token();
initAReset(-1, sourcetoken);
}
} else {
// first ce and second ce m_utilCEBuffer_
/*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
basece, basecontce,
strength, m_utilCEBuffer_);
// we got the previous CE. Now we need to see if the difference between
// the two CEs is really of the requested strength.
// if it's a bigger difference (we asked for secondary and got primary), we
// need to modify the CE.
if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
// adjust the strength
// now we are in the situation where our baseCE should actually be modified in
// order to get the CE in the right position.
if(strength == Collator.SECONDARY) {
m_utilCEBuffer_[0] = basece - 0x0200;
} else { // strength == UCOL_TERTIARY
m_utilCEBuffer_[0] = basece - 0x02;
}
if(RuleBasedCollator.isContinuation(basecontce)) {
if(strength == Collator.SECONDARY) {
m_utilCEBuffer_[1] = basecontce - 0x0200;
} else { // strength == UCOL_TERTIARY
m_utilCEBuffer_[1] = basecontce - 0x02;
}
}
}
/*
// the code below relies on getting a code point from the inverse table, in order to be
// able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
// 1. There are many code points that have the same CE
// 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
// Also, in case when there is no equivalent strength before an element, we have to actually
// construct one. For example, &[before 2]a << x won't result in x << a, because the element
// before a is a primary difference.
ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
+ 2];
if ((ch & INVERSE_SIZE_MASK_) != 0) {
int offset = ch & INVERSE_OFFSET_MASK_;
ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
offset];
}
m_source_.append((char)ch);
m_extraCurrent_ ++;
m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
m_parsedToken_.m_charsLen_ = 1;
// We got an UCA before. However, this might have been tailored.
// example:
// &\u30ca = \u306a
// &[before 3]\u306a<<<\u306a|\u309d
m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
| m_parsedToken_.m_charsOffset_;
m_utilToken_.m_rules_ = m_source_;
sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
*/
// here is how it should be. The situation such as &[before 1]a < x, should be
// resolved exactly as if we wrote &a > x.
// therefore, I don't really care if the UCA value before a has been changed.
// However, I do care if the strength between my element and the previous element
// is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
// have to construct the base CE.
// if we found a tailored thing, we have to use the UCA value and
// construct a new reset token with constructed name
//if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
// character to which we want to anchor is already tailored.
// We need to construct a new token which will be the anchor point
//m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
//m_source_.append(ch);
//m_extraCurrent_ ++;
//m_parsedToken_.m_charsLen_ ++;
// grab before
m_parsedToken_.m_charsOffset_ -= 10;
m_parsedToken_.m_charsLen_ += 10;
m_listHeader_[m_resultLength_] = new TokenListHeader();
m_listHeader_[m_resultLength_].m_baseCE_
= m_utilCEBuffer_[0] & 0xFFFFFF3F;
if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
m_listHeader_[m_resultLength_].m_baseContCE_
= m_utilCEBuffer_[1];
}
else {
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
}
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
m_listHeader_[m_resultLength_].m_indirect_ = false;
sourcetoken = new Token();
initAReset(-1, sourcetoken);
//}
}
return sourcetoken;
}
/**
 * Initializes targetToken as the reset token of the list header at
 * m_listHeader_[m_resultLength_], registers it in m_hashTable_ and advances
 * m_resultLength_.
 * Each list header carries a reset token plus first/last links, which are
 * cleared here. If the reset source is longer than one char and expand is
 * positive, the token is shortened to the part before expand and the
 * remainder is returned as an expansion for the next item
 * (e.g. &xy * c becomes &x * c/y).
 * @param expand string offset, -1 for null strings
 * @param targetToken token to update
 * @return expandnext value, length in the top 8 bits and offset in the
 *         rest, or 0 if there is no expansion
 * @throws ParseException thrown when the reset carries a prefix
 */
private int initAReset(int expand, Token targetToken) throws ParseException
{
    // Grow the header array when the last slot is about to be used.
    if (m_resultLength_ == m_listHeader_.length - 1) {
        TokenListHeader[] grown = new TokenListHeader[m_resultLength_ << 1];
        System.arraycopy(m_listHeader_, 0, grown, 0, m_resultLength_ + 1);
        m_listHeader_ = grown;
    }

    final int charsOffset = m_parsedToken_.m_charsOffset_;
    final int charsLen = m_parsedToken_.m_charsLen_;

    // Describe the reset source and expansion as (length << 24 | offset).
    targetToken.m_rules_ = m_source_;
    targetToken.m_source_ = (charsLen << 24) | charsOffset;
    targetToken.m_expansion_ = (m_parsedToken_.m_extensionLen_ << 24)
                               | m_parsedToken_.m_extensionOffset_;
    // keep the flags around so that we know about before
    targetToken.m_flags_ = m_parsedToken_.m_flags_;

    // A reset must not have a prefix.
    if (m_parsedToken_.m_prefixOffset_ != 0) {
        throwParseException(m_rules_, charsOffset - 1);
    }
    targetToken.m_prefix_ = 0;
    // TODO: this should also handle reverse
    targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
    targetToken.m_strength_ = TOKEN_RESET_;
    targetToken.m_next_ = null;
    targetToken.m_previous_ = null;
    targetToken.m_CELength_ = 0;
    targetToken.m_expCELength_ = 0;

    // Link token and header to each other and start with an empty list.
    final TokenListHeader header = m_listHeader_[m_resultLength_];
    targetToken.m_listHeader_ = header;
    header.m_first_ = null;
    header.m_last_ = null;
    header.m_reset_ = targetToken;

    // Reset values never carry an expansion themselves; a multi-char reset
    // is split at 'expand' and the tail is handed to the next item.
    int expandNext = 0;
    if (expand > 0 && charsLen > 1) {
        targetToken.m_source_ = ((expand - charsOffset) << 24) | charsOffset;
        expandNext = ((charsLen + charsOffset - expand) << 24) | expand;
    }

    m_resultLength_++;
    m_hashTable_.put(targetToken, targetToken);
    return expandNext;
}
/**
 * Checks whether a character is special, i.e. lies in one of the ranges
 * U+0020..U+002F, U+003A..U+003F, U+005B..U+0060, U+007D..U+007E or is
 * U+007B. Note that U+007C is not included.
 * @param ch character to test
 * @return true if the character is special
 */
private static final boolean isSpecialChar(char ch)
{
    if (ch >= 0x0020 && ch <= 0x002F) {
        return true;
    }
    if (ch >= 0x003A && ch <= 0x003F) {
        return true;
    }
    if (ch >= 0x005B && ch <= 0x0060) {
        return true;
    }
    if (ch >= 0x007D && ch <= 0x007E) {
        return true;
    }
    return ch == 0x007B;
}
/**
 * Reads the bracketed UnicodeSet pattern that follows position start in
 * source and builds a UnicodeSet from it.
 * The pattern runs from the first '[' at or after start to its balancing
 * ']'. A further ']' (the one closing the surrounding option) must follow.
 * <p>
 * Errors are reported against source, because the offsets computed here
 * index into source. The scan for the opening '[' is bounded, so input
 * without any set produces a ParseException instead of running off the end
 * of the string.
 * @param source text containing the option
 * @param start offset at which to start looking for the set
 * @return the set described by the balanced bracket expression
 * @throws ParseException if no set is found, the brackets are unbalanced,
 *         or the enclosing option is not closed
 */
private UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException
{
    final int length = source.length();

    // Advance to the first '[' without leaving the string.
    int open = start;
    while (open < length && source.charAt(open) != '[') {
        open++;
    }
    if (open >= length) {
        throwParseException(source, Math.min(start, length));
    }

    // A set may contain nested sets, so find the balancing ']'.
    int noOpenBraces = 1;
    int current = 1; // skip the opening brace
    while (open + current < length && noOpenBraces != 0) {
        char c = source.charAt(open + current);
        if (c == '[') {
            noOpenBraces++;
        } else if (c == ']') { // closing brace
            noOpenBraces--;
        }
        current++;
    }

    // Besides being balanced, the set must be followed by the ']' that
    // closes the option itself.
    if (noOpenBraces != 0 || source.indexOf("]", open + current) == -1) {
        throwParseException(source, open);
    }
    return new UnicodeSet(source.substring(open, open + current));
}
/** in C, optionarg is passed by reference to function.
* We use a private int to simulate this.
*/
private int m_optionarg_ = 0;
/**
 * Finds the entry of RULES_OPTIONS_ whose name matches, ignoring case, the
 * text of rules beginning at start.
 * As a side effect m_optionarg_ is reset to 0 and, when the matched option
 * is followed by more text before optionend, set to the offset of the first
 * non-whitespace char after the option name.
 * @param rules text containing the option
 * @param start offset of the first char of the option name
 * @param optionend offset limiting the search for the option argument
 * @return index of the matching option in RULES_OPTIONS_, or -1 if none
 */
private int readOption(String rules, int start, int optionend)
{
    m_optionarg_ = 0;
    for (int index = 0; index < RULES_OPTIONS_.length; index++) {
        final String name = RULES_OPTIONS_[index].m_name_;
        final int nameEnd = start + name.length();
        // At least one char must follow the name for it to match.
        if (rules.length() <= nameEnd
                || !name.equalsIgnoreCase(rules.substring(start, nameEnd))) {
            continue;
        }
        if (optionend - start > name.length()) {
            // The argument starts after the name; skip leading whitespace.
            m_optionarg_ = nameEnd;
            while (m_optionarg_ < optionend
                    && PatternProps.isWhiteSpace(rules.charAt(m_optionarg_))) {
                m_optionarg_++;
            }
        }
        return index;
    }
    return -1;
}
/**
 * Reads the option that starts at m_current_ (which is on the opening '[')
 * and applies it.
 * The option index returned by readOption() selects the handling:
 * 0-6 attribute options with a sub-option value, 7 variable top,
 * 8 rearrange, 9 before, 10 top, 11-12 first/last, 13-14 copy/remove,
 * 16 script reordering. Any other index is a parse error.
 * @return TOKEN_SUCCESS_MASK_, combined with TOKEN_VARIABLE_TOP_MASK_,
 *         TOKEN_TOP_MASK_ or the before-strength value where applicable
 * @exception ParseException thrown when options in rules are wrong
 */
private byte readAndSetOption() throws ParseException
{
    final int start = m_current_ + 1; // skip opening '['
    final int i = readOption(m_rules_, start, m_optionEnd_);
    final int optionarg = m_optionarg_;
    if (i < 0) {
        throwParseException(m_rules_, start);
    }

    switch (i) {
    case 0:
    case 1:
    case 2:
    case 3:
    case 4:
    case 5:
    case 6: // attribute options: the argument names the attribute value
        if (optionarg != 0) {
            final String[] subOptions = RULES_OPTIONS_[i].m_subOptions_;
            for (int j = 0; j < subOptions.length; j++) {
                final int size = optionarg + subOptions[j].length();
                if (m_rules_.length() > size
                        && subOptions[j].equalsIgnoreCase(
                                m_rules_.substring(optionarg, size))) {
                    setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
                               RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                    return TOKEN_SUCCESS_MASK_;
                }
            }
        }
        throwParseException(m_rules_, optionarg);
        break;

    case 7: // variable top
        return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;

    case 8: // rearrange
        return TOKEN_SUCCESS_MASK_;

    case 9: // before
        if (optionarg != 0) {
            final String[] subOptions = RULES_OPTIONS_[i].m_subOptions_;
            for (int j = 0; j < subOptions.length; j++) {
                final int size = optionarg + subOptions[j].length();
                if (m_rules_.length() > size
                        && subOptions[j].equalsIgnoreCase(
                                m_rules_.substring(optionarg, size))) {
                    return (byte)(TOKEN_SUCCESS_MASK_
                            | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
                            + 1);
                }
            }
        }
        throwParseException(m_rules_, optionarg);
        break;

    case 10: // top: indirect boundary 0
        m_parsedToken_.m_indirectIndex_ = 0;
        return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;

    case 11:
    case 12: { // first, last: the sub-option picks the indirect boundary
        final String[] subOptions = RULES_OPTIONS_[i].m_subOptions_;
        for (int j = 0; j < subOptions.length; j++) {
            final int size = optionarg + subOptions[j].length();
            if (m_rules_.length() > size
                    && subOptions[j].equalsIgnoreCase(
                            m_rules_.substring(optionarg, size))) {
                m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
                return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
            }
        }
        throwParseException(m_rules_, optionarg);
        break;
    }

    case 13:
    case 14: { // copy and remove are handled before normalization
        // Only the end of the option has to be moved past the nested
        // brackets of the set.
        int noOpenBraces = 1;
        m_current_++; // skip opening brace
        while (m_current_ < m_source_.length() && noOpenBraces != 0) {
            final char c = m_source_.charAt(m_current_);
            if (c == '[') {
                noOpenBraces++;
            } else if (c == ']') { // closing brace
                noOpenBraces--;
            }
            m_current_++;
        }
        m_optionEnd_ = m_current_ - 1;
        return TOKEN_SUCCESS_MASK_;
    }

    case 16: // script reordering
        m_current_ = m_optionarg_; // skip opening brace and name
        parseScriptReorder();
        return TOKEN_SUCCESS_MASK_;

    default:
        throwParseException(m_rules_, optionarg);
        break;
    }
    return TOKEN_SUCCESS_MASK_; // we will never reach here.
}
/**
 * Stores one collation attribute value in an option set.
 * Boolean attributes are stored as a comparison of value against the ON_
 * (or SHIFTED_) attribute value; case-first and strength are stored as
 * given; a normalization value of ON_ is stored as
 * Collator.CANONICAL_DECOMPOSITION. Unknown attributes are ignored.
 * @param optionset option set to set
 * @param attribute type to set
 * @param value attribute value
 */
private void setOptions(OptionSet optionset, int attribute, int value)
{
    final boolean isOn = (value == RuleBasedCollator.AttributeValue.ON_);

    if (attribute == RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_) {
        optionset.m_isHiragana4_ = isOn;
    } else if (attribute == RuleBasedCollator.Attribute.FRENCH_COLLATION_) {
        optionset.m_isFrenchCollation_ = isOn;
    } else if (attribute == RuleBasedCollator.Attribute.ALTERNATE_HANDLING_) {
        optionset.m_isAlternateHandlingShifted_
                = (value == RuleBasedCollator.AttributeValue.SHIFTED_);
    } else if (attribute == RuleBasedCollator.Attribute.CASE_FIRST_) {
        optionset.m_caseFirst_ = value;
    } else if (attribute == RuleBasedCollator.Attribute.CASE_LEVEL_) {
        optionset.m_isCaseLevel_ = isOn;
    } else if (attribute == RuleBasedCollator.Attribute.NORMALIZATION_MODE_) {
        // "on" is recorded as canonical decomposition; anything else as is.
        optionset.m_decomposition_
                = isOn ? Collator.CANONICAL_DECOMPOSITION : value;
    } else if (attribute == RuleBasedCollator.Attribute.STRENGTH_) {
        optionset.m_strength_ = value;
    }
}
/**
 * Collects the strings tailored by the rules.
 * The rules are tokenized from m_current_ on; for every token that is not a
 * reset, each canonically equivalent form of its characters that does not
 * fail the FCD quick check is added to the result.
 * @return set of tailored strings
 * @throws ParseException thrown when rule parsing fails
 */
UnicodeSet getTailoredSet() throws ParseException
{
    final UnicodeSet tailored = new UnicodeSet();
    final CanonicalIterator equivalents = new CanonicalIterator("");
    m_parsedToken_.m_strength_ = TOKEN_UNSET_;
    // The limit is taken once, before parsing appends to m_source_.
    final int sourcelimit = m_source_.length();
    boolean startOfRules = true;

    while (m_current_ < sourcelimit) {
        m_parsedToken_.m_prefixOffset_ = 0;
        if (parseNextToken(startOfRules) < 0) {
            // we have reached the end
            continue;
        }
        startOfRules = false;
        if (m_parsedToken_.m_strength_ == TOKEN_RESET_) {
            // Resets anchor other tokens; they are not tailored themselves.
            continue;
        }

        final int from = m_parsedToken_.m_charsOffset_;
        final int to = from + m_parsedToken_.m_charsLen_;
        equivalents.setSource(m_source_.substring(from, to));
        for (String pattern = equivalents.next(); pattern != null;
                pattern = equivalents.next()) {
            if (Normalizer.quickCheck(pattern, Normalizer.FCD, 0) != Normalizer.NO) {
                tailored.add(pattern);
            }
        }
    }
    return tailored;
}
/**
 * Scans the raw rules for bracketed options that must be handled before the
 * rules are parsed.
 * Option 13 (copy) and option 14 (remove) have their UnicodeSet merged into
 * m_copySet_ and m_removeSet_ respectively. Option 19 (import) is replaced
 * in the rule text by the "Sequence" rules of the named locale and
 * collation type ("standard" when the locale has no collation keyword).
 * <p>
 * An import option without a closing ']' is reported as a ParseException.
 * @param rules the raw collation rules
 * @return the rules with every import option replaced by the imported rules
 * @throws ParseException thrown when a set or an import option is malformed
 */
final private String preprocessRules(String rules) throws ParseException {
    int optionNumber = -1;
    int setStart = 0;
    int i = 0;
    while (i < rules.length()) {
        if (rules.charAt(i) == 0x005B) { // [
            optionNumber = readOption(rules, i + 1, rules.length());
            setStart = m_optionarg_;
            if (optionNumber == 13) { /* copy - parts of UCA to tailoring */
                UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                if (m_copySet_ == null) {
                    m_copySet_ = newSet;
                } else {
                    m_copySet_.addAll(newSet);
                }
            } else if (optionNumber == 14) { /* remove */
                UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
                if (m_removeSet_ == null) {
                    m_removeSet_ = newSet;
                } else {
                    m_removeSet_.addAll(newSet);
                }
            } else if (optionNumber == 19) { /* import */
                int closeIndex = rules.indexOf(']', i);
                if (closeIndex == -1) {
                    // Unterminated option: report it against the text being
                    // scanned, whose offsets i refers to.
                    throwParseException(rules, i);
                }
                int optionEndOffset = closeIndex + 1;
                ULocale locale = ULocale.forLanguageTag(
                        rules.substring(setStart, closeIndex));
                UResourceBundle bundle = UResourceBundle.getBundleInstance(
                        ICUResourceBundle.ICU_BASE_NAME + "/coll", locale.getBaseName());
                String type = locale.getKeywordValue("collation");
                if (type == null) {
                    type = "standard";
                }
                String importRules = bundle.get("collations")
                                           .get(type)
                                           .get("Sequence")
                                           .getString();
                // Splice the imported rules in place of the option; scanning
                // continues with the char after position i.
                rules = rules.substring(0, i) + importRules
                        + rules.substring(optionEndOffset);
            }
        }
        i++;
    }
    return rules;
}
/* This is the data that is used for non-script reordering codes. These entries _must_ be kept
 * in the order in which they are applied as defaults, and in sync with the Collator.ReorderCodes statics.
 */
static final String ReorderingTokensArray[] = {
"SPACE",
"PUNCT",
"SYMBOL",
"CURRENCY",
"DIGIT",
};
/**
 * Looks up a non-script reordering token by name, ignoring case.
 * @param name token name to look up in ReorderingTokensArray
 * @return ReorderCodes.FIRST plus the position of the name in
 *         ReorderingTokensArray, or UScript.INVALID_CODE if it is not there
 */
int findReorderingEntry(String name) {
    int position = 0;
    for (String candidate : ReorderingTokensArray) {
        if (name.equalsIgnoreCase(candidate)) {
            return position + ReorderCodes.FIRST;
        }
        position++;
    }
    return UScript.INVALID_CODE;
}
/**
 * Parses the whitespace-separated reorder codes between m_current_ and the
 * next ']' in m_rules_ and stores them in m_options_.m_scriptOrder_.
 * Each token is first looked up as a non-script reordering token and
 * otherwise resolved as a value of the script property. Nothing is stored
 * if there is no closing ']' or if any token fails to resolve.
 * <p>
 * The codes are collected directly into an int array, which avoids the
 * untyped list whose elements could not be assigned to int.
 * @throws ParseException thrown when a token resolves to a negative code
 */
private void parseScriptReorder() throws ParseException {
    int end = m_rules_.indexOf(']', m_current_);
    if (end == -1) {
        return;
    }
    String tokenString = m_rules_.substring(m_current_, end);
    String[] tokens = tokenString.split("\\s+", 0);
    int[] order = new int[tokens.length];
    for (int tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
        String token = tokens[tokenIndex];
        int reorderCode = findReorderingEntry(token);
        if (reorderCode == UScript.INVALID_CODE) {
            reorderCode = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, token);
            if (reorderCode < 0) {
                // NOTE(review): the exception carries the whole rule string
                // as message and the token index as offset - confirm that
                // callers expect exactly this before changing it.
                throw new ParseException(m_rules_, tokenIndex);
            }
        }
        order[tokenIndex] = reorderCode;
    }
    // Publish only after every token has been resolved.
    m_options_.m_scriptOrder_ = order;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy