/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.ibm.icu.impl.IntTrieBuilder;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.TrieBuilder;
import com.ibm.icu.impl.TrieIterator;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
/**
* Class for building a collator from a list of collation rules. This class
* uses CollationRuleParser.
*
* @author Syn Wee Quek
* @since release 2.2, June 11 2002
*/
final class CollationParsedRuleBuilder {
// package private constructors ------------------------------------------
/**
* Constructor
*
* @param rules
* collation rules
* @exception ParseException
* thrown when argument rules have an invalid syntax
*/
CollationParsedRuleBuilder(String rules) throws ParseException {
m_parser_ = new CollationRuleParser(rules);
m_parser_.assembleTokenList();
m_utilColEIter_ = RuleBasedCollator.UCA_
.getCollationElementIterator("");
}
// package private inner classes -----------------------------------------
/**
* Inverse UCA wrapper
*/
static class InverseUCA {
// package private constructor ---------------------------------------
InverseUCA() {
}
// package private data member ---------------------------------------
/**
* Inverse table, stored as int triplets; the CE is at 3 * i and the
* continuation CE at 3 * i + 1
*/
int m_table_[];
/**
* Array list of continuation characters
*/
char m_continuations_[];
/**
* UCA version of inverse UCA table
*/
VersionInfo m_UCA_version_;
// package private method --------------------------------------------
/**
* Returns the inverse CEs that precede the argument CEs
*
* @param ce
* ce to test
* @param contce
* continuation ce to test
* @param strength
* collation strength
* @param prevresult
* an array to store the previous inverse ce and the previous
* inverse continuation ce
* @return index of the inverse ce, or -1 if it is not found
*/
final int getInversePrevCE(int ce, int contce, int strength,
int prevresult[]) {
int result = findInverseCE(ce, contce);
if (result < 0) {
prevresult[0] = CollationElementIterator.NULLORDER;
return -1;
}
ce &= STRENGTH_MASK_[strength];
contce &= STRENGTH_MASK_[strength];
prevresult[0] = ce;
prevresult[1] = contce;
while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
&& (prevresult[1] & STRENGTH_MASK_[strength]) == contce
&& result > 0) {
// this condition should prevent falling off the edge of the
// world
// here, we end up in a singularity - zero
prevresult[0] = m_table_[3 * (--result)];
prevresult[1] = m_table_[3 * result + 1];
}
return result;
}
final int getCEStrengthDifference(int CE, int contCE, int prevCE,
int prevContCE) {
int strength = Collator.TERTIARY;
while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
&& (strength != 0)) {
strength--;
}
return strength;
}
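// Illustrative note (not from the source): the loop walks down from
// Collator.TERTIARY (2). If the CEs differ already in their primary
// bytes, every mask in STRENGTH_MASK_ sees a difference and the loop
// stops at 0 (Collator.PRIMARY); CEs that are identical at all levels
// never enter the loop and yield Collator.TERTIARY.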
private int compareCEs(int source0, int source1, int target0,
int target1) {
int s1 = source0, s2, t1 = target0, t2;
if (RuleBasedCollator.isContinuation(source1)) {
s2 = source1;
} else {
s2 = 0;
}
if (RuleBasedCollator.isContinuation(target1)) {
t2 = target1;
} else {
t2 = 0;
}
int s = 0, t = 0;
if (s1 == t1 && s2 == t2) {
return 0;
}
s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16);
t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16);
if (s == t) {
// primaries are equal, compare the packed secondaries
s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
if (s == t) {
// secondaries are equal, compare the packed tertiaries
s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
}
}
return Utility.compareUnsigned(s, t);
}
/**
* Finds the index of the argument CEs in the inverse table
*
* @param ce
* CE to be tested
* @param contce
* continuation CE
* @return index of the inverse CE in the table
*/
int findInverseCE(int ce, int contce) {
int bottom = 0;
int top = m_table_.length / 3;
int result = 0;
while (bottom < top - 1) {
result = (top + bottom) >> 1;
int first = m_table_[3 * result];
int second = m_table_[3 * result + 1];
int comparison = compareCEs(first, second, ce, contce);
if (comparison > 0) {
top = result;
} else if (comparison < 0) {
bottom = result;
} else {
break;
}
}
return result;
}
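// Sketch of the table layout this search assumes (values hypothetical):
// m_table_ holds int triplets, one per inverse-UCA entry, with the CE at
// 3 * i and the continuation CE at 3 * i + 1. For a table whose first
// three entries carry the primaries 0x26400000, 0x26420000, 0x26440000
// (no continuations), findInverseCE(0x26420000, 0) converges on index 1
// via the unsigned binary search above.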
/**
* Gets gap offsets in the inverse UCA
*
* @param listheader
* parsed token lists
* @exception Exception
* thrown when an error occurs while finding the collation
* gaps
*/
void getInverseGapPositions(
CollationRuleParser.TokenListHeader listheader)
throws Exception {
// reset all the gaps
CollationRuleParser.Token token = listheader.m_first_;
int tokenstrength = token.m_strength_;
for (int i = 0; i < 3; i++) {
listheader.m_gapsHi_[3 * i] = 0;
listheader.m_gapsHi_[3 * i + 1] = 0;
listheader.m_gapsHi_[3 * i + 2] = 0;
listheader.m_gapsLo_[3 * i] = 0;
listheader.m_gapsLo_[3 * i + 1] = 0;
listheader.m_gapsLo_[3 * i + 2] = 0;
listheader.m_numStr_[i] = 0;
listheader.m_fStrToken_[i] = null;
listheader.m_lStrToken_[i] = null;
listheader.m_pos_[i] = -1;
}
if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
&& (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
// implicits -
listheader.m_pos_[0] = 0;
int t1 = listheader.m_baseCE_;
int t2 = listheader.m_baseContCE_;
listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_
| (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
primaryCE = RuleBasedCollator.impCEGen_
.getImplicitFromRaw(RuleBasedCollator.impCEGen_
.getRawFromImplicit(primaryCE) + 1);
t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
// if (listheader.m_baseCE_ < 0xEF000000) {
// // first implicits have three byte primaries, with a gap of
// // one so we essentially need to add 2 to the top byte in
// // listheader.m_baseContCE_
// t2 += 0x02000000;
// }
// else {
// // second implicits have four byte primaries, with a gap of
// // IMPLICIT_LAST2_MULTIPLIER_
// // Now, this guy is not really accessible here, so until we
// // find a better way to pass it around, assume that the gap
// is 1
// t2 += 0x00020000;
// }
listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
} else if (listheader.m_indirect_ == true
&& listheader.m_nextCE_ != 0) {
listheader.m_pos_[0] = 0;
int t1 = listheader.m_baseCE_;
int t2 = listheader.m_baseContCE_;
listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
t1 = listheader.m_nextCE_;
t2 = listheader.m_nextContCE_;
listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
} else {
while (true) {
if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
listheader.m_pos_[tokenstrength] = getInverseNext(
listheader, tokenstrength);
if (listheader.m_pos_[tokenstrength] >= 0) {
listheader.m_fStrToken_[tokenstrength] = token;
} else {
// The CE must be implicit, since it's not in the
// table
// Error
throw new Exception("Internal program error");
}
}
while (token != null && token.m_strength_ >= tokenstrength) {
if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
listheader.m_lStrToken_[tokenstrength] = token;
}
token = token.m_next_;
}
if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
// check if previous interval is the same and merge the
// intervals if it is so
if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
listheader.m_fStrToken_[tokenstrength + 1] = null;
listheader.m_lStrToken_[tokenstrength + 1] = null;
listheader.m_pos_[tokenstrength + 1] = -1;
}
}
if (token != null) {
tokenstrength = token.m_strength_;
} else {
break;
}
}
for (int st = 0; st < 3; st++) {
int pos = listheader.m_pos_[st];
if (pos >= 0) {
int t1 = m_table_[3 * pos];
int t2 = m_table_[3 * pos + 1];
listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
Collator.PRIMARY);
listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,
Collator.SECONDARY);
listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
| (t2 & 0x3f) << 16;
// pos --;
// t1 = m_table_[3 * pos];
// t2 = m_table_[3 * pos + 1];
t1 = listheader.m_baseCE_;
t2 = listheader.m_baseContCE_;
listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
Collator.PRIMARY);
listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,
Collator.SECONDARY);
listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
| (t2 & 0x3f) << 16;
}
}
}
}
/**
* Gets the next CE in the inverse table
*
* @param listheader
* token list header
* @param strength
* collation strength
* @return index of the next CE in the inverse table
*/
private final int getInverseNext(
CollationRuleParser.TokenListHeader listheader, int strength) {
int ce = listheader.m_baseCE_;
int secondce = listheader.m_baseContCE_;
int result = findInverseCE(ce, secondce);
if (result < 0) {
return -1;
}
ce &= STRENGTH_MASK_[strength];
secondce &= STRENGTH_MASK_[strength];
int nextce = ce;
int nextcontce = secondce;
while ((nextce & STRENGTH_MASK_[strength]) == ce
&& (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
nextce = m_table_[3 * (++result)];
nextcontce = m_table_[3 * result + 1];
}
listheader.m_nextCE_ = nextce;
listheader.m_nextContCE_ = nextcontce;
return result;
}
}
// package private data members ------------------------------------------
/**
* Inverse UCA, instantiated in the static initializer below
*/
static final InverseUCA INVERSE_UCA_;
/**
* UCA and Inverse UCA version do not match
*/
private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";
/**
* Error message used when the UCA is not instantiated
*/
private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
/**
* Initializing the inverse UCA
*/
static {
InverseUCA temp = null;
try {
temp = CollatorReader.getInverseUCA();
} catch (IOException e) {
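// ignore here; a null result is reported below as UCA_NOT_INSTANTIATED_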
}
/*
* try { String invdat = "/com/ibm/icu/impl/data/invuca.icu";
* InputStream i =
* CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
* BufferedInputStream b = new BufferedInputStream(i, 110000);
* INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close();
* i.close(); } catch (Exception e) { e.printStackTrace(); throw new
* RuntimeException(e.getMessage()); }
*/
if (temp != null && RuleBasedCollator.UCA_ != null) {
if (!temp.m_UCA_version_
.equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
}
} else {
throw new RuntimeException(UCA_NOT_INSTANTIATED_);
}
INVERSE_UCA_ = temp;
}
// package private methods -----------------------------------------------
/**
* Parses and sets the collation rules in the argument collator
*
* @param collator
* collator to set
* @exception Exception
* thrown when internal program error occurs
*/
void setRules(RuleBasedCollator collator) throws Exception {
if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {
// we have a set of rules, let's make something of it
assembleTailoringTable(collator);
} else { // no rules, but no error either - must be only options
// We will init the collator from UCA
collator.setWithUCATables();
}
// And set only the options
m_parser_.setDefaultOptionsInCollator(collator);
}
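// Package-internal usage sketch (illustrative; 'collator' is assumed to
// be a freshly constructed RuleBasedCollator):
//
// CollationParsedRuleBuilder builder =
// new CollationParsedRuleBuilder("&a < b < c");
// builder.setRules(collator); // tailors 'collator' in place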
private void copyRangeFromUCA(BuildTable t, int start, int end) {
int u = 0;
for (u = start; u <= end; u++) {
// if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
int CE = t.m_mapping_.getValue(u);
if (CE == CE_NOT_FOUND_
// this test is for contractions that are missing the starting
// element. Looks like latin-1 should be done before
// assembling the table, even if it results in more false
// closure elements
|| (isContractionTableElement(CE) && getCE(
t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
// m_utilElement_.m_uchars_ = str.toString();
m_utilElement_.m_uchars_ = UCharacter.toString(u);
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
m_utilElement_.m_prefix_ = 0;
m_utilElement_.m_CELength_ = 0;
m_utilElement_.m_prefixChars_ = null;
m_utilColEIter_.setText(m_utilElement_.m_uchars_);
while (CE != CollationElementIterator.NULLORDER) {
CE = m_utilColEIter_.next();
if (CE != CollationElementIterator.NULLORDER) {
m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
}
}
addAnElement(t, m_utilElement_);
}
}
}
/**
* 2. Eliminate the negative lists by doing the following for each non-null
* negative list:
* o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
* create new ListHeader X
* o reverse the list, add to the end of X's positive list. Reset the
* strength of the first item you add, based on the stronger strength
* levels of the two lists.
*
* 3. For each ListHeader with a non-null positive list:
* o Find all character strings with CEs between the baseCE and the
* next/previous CE, at the strength of the first token. Add these to the
* tailoring.
* - That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
* tailoring has & x < z...
* - Then we change the tailoring to & x <<< X << x' <<< X' < z ...
* It is possible that this part should be done even while constructing the
* list. The problem is that it is unknown what is going to be the
* strongest weight, so we might as well do it here.
* o Allocate CEs for each token in the list, based on the total number N
* of the largest level difference, and the gap G between baseCE and
* nextCE at that level. The relation between the last item and nextCE is
* the same as the strongest strength.
* - Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1). There are
* 3 primary items: a, d, e. Fit them into the primary gap. Then fit b
* and c into the secondary gap between a and d, then fit q into the
* tertiary gap between b and c.
* - Example: baseCE << b <<< q << c * nextCE(X,2). There are 2 secondary
* items: b, c. Fit them into the secondary gap. Then fit q into the
* tertiary gap between b and c.
* o When incrementing primary values, we will not cross high byte
* boundaries except where there is only a single-byte primary. That is to
* ensure that the script reordering will continue to work.
*
* @param collator
* the rule based collator to update
* @exception Exception
* thrown when internal program error occurs
*/
void assembleTailoringTable(RuleBasedCollator collator) throws Exception {
for (int i = 0; i < m_parser_.m_resultLength_; i++) {
// now we need to generate the CEs
// We stuff the initial value in the buffers, and increase the
// appropriate buffer according to strength
if (m_parser_.m_listHeader_[i].m_first_ != null) {
// if there are any elements
// due to the way parser works, subsequent tailorings
// may remove all the elements from a sequence, therefore
// leaving an empty tailoring sequence.
initBuffers(m_parser_.m_listHeader_[i]);
}
}
if (m_parser_.m_variableTop_ != null) {
// stuff the variable top value
m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
// remove it from the list
if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) {
// first in list
m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
}
if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
// last in list
m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
}
if (m_parser_.m_variableTop_.m_next_ != null) {
m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
}
if (m_parser_.m_variableTop_.m_previous_ != null) {
m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
}
}
BuildTable t = new BuildTable(m_parser_);
// After this, we have assigned CE values to all regular CEs now we
// will go through list once more and resolve expansions, make
// UCAElements structs and add them to table
for (int i = 0; i < m_parser_.m_resultLength_; i++) {
// now we need to generate the CEs
// We stuff the initial value in the buffers, and increase the
// appropriate buffer according to strength */
createElements(t, m_parser_.m_listHeader_[i]);
}
m_utilElement_.clear();
// add latin-1 stuff
copyRangeFromUCA(t, 0, 0xFF);
// add stuff for copying
if (m_parser_.m_copySet_ != null) {
int i = 0;
for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),
m_parser_.m_copySet_.getRangeEnd(i));
}
}
// copy contractions from the UCA - this is needed mostly for Cyrillic
char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
int maxUCAContractionLength = RuleBasedCollator.MAX_UCA_CONTRACTION_LENGTH;
int offset = 0;
while (conts[offset] != 0) {
// A contraction is NUL-terminated and NUL-padded,
// except if it has the maximum length.
int contractionLength = maxUCAContractionLength;
while (contractionLength > 0 && conts[offset + contractionLength - 1] == 0) {
--contractionLength;
}
int first = Character.codePointAt(conts, offset);
int firstLength = Character.charCount(first);
int tailoredCE = t.m_mapping_.getValue(first);
Elements prefixElm = null;
if (tailoredCE != CE_NOT_FOUND_) {
boolean needToAdd = true;
if (isContractionTableElement(tailoredCE)) {
if (isTailored(t.m_contractions_, tailoredCE, conts,
offset + firstLength) == true) {
needToAdd = false;
}
}
if (!needToAdd && isPrefix(tailoredCE)
&& conts[offset + 1] == 0) {
// pre-context character in UCA
// The format for pre-context character is
// conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP
Elements elm = new Elements();
elm.m_CELength_ = 0;
elm.m_uchars_ = Character.toString(conts[offset]);
elm.m_cPoints_ = elm.m_uchars_;
elm.m_prefixChars_ = Character.toString(conts[offset + 2]);
elm.m_prefix_ = 0; // TODO(claireho) : confirm!
prefixElm = t.m_prefixLookup_.get(elm);
if ((prefixElm == null)
|| (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) {
needToAdd = true;
}
}
if (m_parser_.m_removeSet_ != null
&& m_parser_.m_removeSet_.contains(first)) {
needToAdd = false;
}
if (needToAdd == true) {
// we need to add if this contraction is not tailored.
if (conts[offset + 1] != 0) { // not precontext
m_utilElement_.m_prefix_ = 0;
m_utilElement_.m_prefixChars_ = null;
m_utilElement_.m_uchars_ = new String(conts, offset, contractionLength);
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
m_utilElement_.m_CELength_ = 0;
m_utilColEIter_.setText(m_utilElement_.m_uchars_);
} else { // add a pre-context element
int preKeyLen = 0;
m_utilElement_.m_uchars_ = Character.toString(conts[offset]);
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
m_utilElement_.m_CELength_ = 0;
m_utilElement_.m_prefixChars_ = Character.toString(conts[offset + 2]);
if (prefixElm == null) {
m_utilElement_.m_prefix_ = 0;
} else { // TODO (claireho): confirm!
m_utilElement_.m_prefix_ = m_utilElement_.m_prefix_;
// m_utilElement_.m_prefix_= prefixElm.m_prefix_;
}
m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);
while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
// count number of keys for pre-context char.
preKeyLen++;
}
m_utilColEIter_.setText(m_utilElement_.m_prefixChars_ + m_utilElement_.m_uchars_);
// Skip the keys for prefix character, then copy the
// rest to el.
while ((preKeyLen-- > 0)
&& m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
continue;
}
}
while (true) {
int CE = m_utilColEIter_.next();
if (CE != CollationElementIterator.NULLORDER) {
m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
} else {
break;
}
}
addAnElement(t, m_utilElement_);
}
} else if (m_parser_.m_removeSet_ != null
&& m_parser_.m_removeSet_.contains(first)) {
copyRangeFromUCA(t, first, first);
}
offset += maxUCAContractionLength;
}
// Add completely ignorable elements
processUCACompleteIgnorables(t);
// canonical closure
canonicalClosure(t);
// still need to produce compatibility closure
assembleTable(t, collator);
}
// private inner classes -------------------------------------------------
@SuppressWarnings("unused")
private static class CEGenerator {
// package private data members --------------------------------------
WeightRange m_ranges_[];
int m_rangesLength_;
int m_byteSize_;
int m_start_;
int m_limit_;
int m_maxCount_;
int m_count_;
int m_current_;
int m_fLow_; // forbidden Low
int m_fHigh_; // forbidden High
// package private constructor ---------------------------------------
CEGenerator() {
m_ranges_ = new WeightRange[7];
for (int i = 6; i >= 0; i--) {
m_ranges_[i] = new WeightRange();
}
}
}
private static class WeightRange implements Comparable<WeightRange> {
// public methods ----------------------------------------------------
/**
* Compares this object with target
*
* @param target object to compare with
* @return 0 if equals, 1 if this is > target, -1 otherwise
*/
public int compareTo(WeightRange target) {
return Utility.compareUnsigned(m_start_, target.m_start_);
}
/**
* Initialize
*/
public void clear() {
m_start_ = 0;
m_end_ = 0;
m_length_ = 0;
m_count_ = 0;
m_length2_ = 0;
m_count2_ = 0;
}
// package private data members --------------------------------------
int m_start_;
int m_end_;
int m_length_;
int m_count_;
int m_length2_;
int m_count2_;
// package private constructor ---------------------------------------
WeightRange() {
clear();
}
/**
* Copy constructor. Cloneable is troublesome, needs to check for
* exception
*
* @param source
* to clone
*/
WeightRange(WeightRange source) {
m_start_ = source.m_start_;
m_end_ = source.m_end_;
m_length_ = source.m_length_;
m_count_ = source.m_count_;
m_length2_ = source.m_length2_;
m_count2_ = source.m_count2_;
}
}
private static class MaxJamoExpansionTable {
// package private data members --------------------------------------
List m_endExpansionCE_;
// vector of booleans
List m_isV_;
byte m_maxLSize_;
byte m_maxVSize_;
byte m_maxTSize_;
// package private constructor ---------------------------------------
MaxJamoExpansionTable() {
m_endExpansionCE_ = new ArrayList();
m_isV_ = new ArrayList();
m_endExpansionCE_.add(Integer.valueOf(0));
m_isV_.add(Boolean.FALSE);
m_maxLSize_ = 1;
m_maxVSize_ = 1;
m_maxTSize_ = 1;
}
MaxJamoExpansionTable(MaxJamoExpansionTable table) {
m_endExpansionCE_ = new ArrayList(table.m_endExpansionCE_);
m_isV_ = new ArrayList(table.m_isV_);
m_maxLSize_ = table.m_maxLSize_;
m_maxVSize_ = table.m_maxVSize_;
m_maxTSize_ = table.m_maxTSize_;
}
}
private static class MaxExpansionTable {
// package private constructor --------------------------------------
MaxExpansionTable() {
m_endExpansionCE_ = new ArrayList();
m_expansionCESize_ = new ArrayList();
m_endExpansionCE_.add(Integer.valueOf(0));
m_expansionCESize_.add(Byte.valueOf((byte) 0));
}
MaxExpansionTable(MaxExpansionTable table) {
m_endExpansionCE_ = new ArrayList(table.m_endExpansionCE_);
m_expansionCESize_ = new ArrayList(table.m_expansionCESize_);
}
// package private data member --------------------------------------
List m_endExpansionCE_;
List m_expansionCESize_;
}
private static class BasicContractionTable {
// package private constructors -------------------------------------
BasicContractionTable() {
m_CEs_ = new ArrayList();
m_codePoints_ = new StringBuilder();
}
// package private data members -------------------------------------
StringBuilder m_codePoints_;
List m_CEs_;
}
private static class ContractionTable {
// package private constructor --------------------------------------
/**
* Builds a contraction table
*
* @param mapping
*/
ContractionTable(IntTrieBuilder mapping) {
m_mapping_ = mapping;
m_elements_ = new ArrayList();
m_CEs_ = new ArrayList();
m_codePoints_ = new StringBuilder();
m_offsets_ = new ArrayList();
m_currentTag_ = CE_NOT_FOUND_TAG_;
}
/**
* Copies a contraction table. Not all data will be copied into their
* own object.
*
* @param table
*/
ContractionTable(ContractionTable table) {
m_mapping_ = table.m_mapping_;
m_elements_ = new ArrayList(table.m_elements_);
m_codePoints_ = new StringBuilder(table.m_codePoints_);
m_CEs_ = new ArrayList(table.m_CEs_);
m_offsets_ = new ArrayList(table.m_offsets_);
m_currentTag_ = table.m_currentTag_;
}
// package private data members ------------------------------------
/**
* Vector of BasicContractionTable
*/
List m_elements_;
IntTrieBuilder m_mapping_;
StringBuilder m_codePoints_;
List m_CEs_;
List m_offsets_;
int m_currentTag_;
}
/**
* Private class for the combining mark table. The table is indexed by the
* combining class value (0-255).
*/
@SuppressWarnings("unused")
private static class CombinClassTable {
/**
* accumulated numbers of combining marks.
*/
int[] index = new int[256];
/**
* code point array for combining marks.
*/
char[] cPoints;
/**
* size of cPoints.
*/
int size;
// constructor
CombinClassTable() {
cPoints = null;
size = 0;
pos = 0;
curClass = 1;
}
/**
* Copies the combining mark table from cps and ccIndex in a compact way.
*
* @param cps
* code point array, indexed by (combining class << 8)
* @param numOfCM
* total number of combining marks in cps
* @param ccIndex
* number of combining marks per combining class (0-255)
*/
void generate(char[] cps, int numOfCM, int[] ccIndex) {
int count = 0;
cPoints = new char[numOfCM];
for (int i = 0; i < 256; i++) {
for (int j = 0; j < ccIndex[i]; j++) {
cPoints[count++] = cps[(i << 8) + j];
}
index[i] = count;
}
size = count;
}
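// Layout example (hypothetical input): if ccIndex[1] == 2 and
// ccIndex[2] == 1 (all other classes empty), cPoints receives the two
// class-1 marks followed by the single class-2 mark, and index[] becomes
// {0, 2, 3, 3, ...}; the marks of class c then occupy
// cPoints[index[c - 1]] through cPoints[index[c] - 1].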
/**
* Get first CM(combining mark) with the combining class value cClass.
*
* @param cClass
* : combining class value.
* @return combining mark code point, or 0 if there is no combining mark
* with class value cClass
*/
char GetFirstCM(int cClass) {
curClass = cClass;
if (cPoints == null || cClass == 0
|| index[cClass] == index[cClass - 1]) {
return 0;
}
pos = 1;
return cPoints[index[cClass - 1]];
}
/**
* Get next CM(combining mark) with the combining class value cClass.
* Return combining mark codepoint or 0 if no next CM.
*/
char GetNextCM() {
if (cPoints == null
|| index[curClass] == (index[curClass - 1] + pos)) {
return 0;
}
return cPoints[index[curClass - 1] + (pos++)];
}
// private data members
int pos;
int curClass;
}
private static final class BuildTable implements TrieBuilder.DataManipulate {
// package private methods ------------------------------------------
/**
* For construction of the Trie tables. Has to be labeled public
*
* @param cp The value of the code point.
* @param offset The value of the offset.
* @return data offset or 0
*/
public int getFoldedValue(int cp, int offset) {
int limit = cp + 0x400;
while (cp < limit) {
int value = m_mapping_.getValue(cp);
boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
int tag = getCETag(value);
if (inBlockZero == true) {
cp += TrieBuilder.DATA_BLOCK_LENGTH;
} else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
// These are values that are starting in either UCA
// (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
// Presence of these tags means that there is nothing in
// this position and that it should be skipped.
return RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_SURROGATE_TAG_ << 24) | offset;
} else {
++cp;
}
}
return 0;
}
// package private constructor --------------------------------------
/**
* Constructs a build table over the UCA data from the parsed rules
*/
BuildTable(CollationRuleParser parser) {
m_collator_ = new RuleBasedCollator();
m_collator_.setWithUCAData();
MaxExpansionTable maxet = new MaxExpansionTable();
MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
m_options_ = parser.m_options_;
m_expansions_ = new ArrayList();
// Build the mapping trie with linear (non-compacted) access for
// Latin-1
int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_NOT_FOUND_TAG_ << 24);
// temporary fix for jb3822, 0x100000 -> 30000
m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,
trieinitialvalue, true);
m_prefixLookup_ = new HashMap<Elements, Elements>();
// uhash_open(prefixLookupHash, prefixLookupComp);
m_contractions_ = new ContractionTable(m_mapping_);
// copy UCA's maxexpansion and merge as we go along
m_maxExpansions_ = maxet;
// adding an extra initial value for easier manipulation
for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
maxet.m_endExpansionCE_.add(Integer.valueOf(
RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
maxet.m_expansionCESize_.add(Byte.valueOf(
RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
}
m_maxJamoExpansions_ = maxjet;
m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
Arrays.fill(m_unsafeCP_, (byte) 0);
Arrays.fill(m_contrEndCP_, (byte) 0);
}
/**
* Duplicates a BuildTable. Not all data is copied into new objects.
*
* @param table
* to clone
*/
BuildTable(BuildTable table) {
m_collator_ = table.m_collator_;
m_mapping_ = new IntTrieBuilder(table.m_mapping_);
m_expansions_ = new ArrayList(table.m_expansions_);
m_contractions_ = new ContractionTable(table.m_contractions_);
m_contractions_.m_mapping_ = m_mapping_;
m_options_ = table.m_options_;
m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);
m_maxJamoExpansions_ = new MaxJamoExpansionTable(
table.m_maxJamoExpansions_);
m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
m_unsafeCP_.length);
m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
m_contrEndCP_.length);
}
// package private data members -------------------------------------
RuleBasedCollator m_collator_;
IntTrieBuilder m_mapping_;
List m_expansions_;
ContractionTable m_contractions_;
// UCATableHeader image;
CollationRuleParser.OptionSet m_options_;
MaxExpansionTable m_maxExpansions_;
MaxJamoExpansionTable m_maxJamoExpansions_;
byte m_unsafeCP_[];
byte m_contrEndCP_[];
Map<Elements, Elements> m_prefixLookup_;
CombinClassTable cmLookup = null;
}
private static class Elements {
// package private data members -------------------------------------
String m_prefixChars_;
int m_prefix_;
String m_uchars_;
/**
* Working string
*/
String m_cPoints_;
/**
* Offset to the working string
*/
int m_cPointsOffset_;
/**
* These are collation elements - there could be more than one - in case
* of expansion
*/
int m_CEs_[];
int m_CELength_;
/**
* This is the value the element maps to in the original table
*/
int m_mapCE_;
int m_sizePrim_[];
int m_sizeSec_[];
int m_sizeTer_[];
boolean m_variableTop_;
boolean m_caseBit_;
// package private constructors -------------------------------------
/**
* Package private constructor
*/
Elements() {
m_sizePrim_ = new int[128];
m_sizeSec_ = new int[128];
m_sizeTer_ = new int[128];
m_CEs_ = new int[256];
m_CELength_ = 0;
}
/**
* Package private constructor
*/
Elements(Elements element) {
m_prefixChars_ = element.m_prefixChars_;
m_prefix_ = element.m_prefix_;
m_uchars_ = element.m_uchars_;
m_cPoints_ = element.m_cPoints_;
m_cPointsOffset_ = element.m_cPointsOffset_;
m_CEs_ = element.m_CEs_;
m_CELength_ = element.m_CELength_;
m_mapCE_ = element.m_mapCE_;
m_sizePrim_ = element.m_sizePrim_;
m_sizeSec_ = element.m_sizeSec_;
m_sizeTer_ = element.m_sizeTer_;
m_variableTop_ = element.m_variableTop_;
m_caseBit_ = element.m_caseBit_;
}
// package private methods -------------------------------------------
/**
* Initializing the elements
*/
public void clear() {
m_prefixChars_ = null;
m_prefix_ = 0;
m_uchars_ = null;
m_cPoints_ = null;
m_cPointsOffset_ = 0;
m_CELength_ = 0;
m_mapCE_ = 0;
Arrays.fill(m_sizePrim_, 0);
Arrays.fill(m_sizeSec_, 0);
Arrays.fill(m_sizeTer_, 0);
m_variableTop_ = false;
m_caseBit_ = false;
}
/**
* Hash code calculation for this element
*
* @return the hashcode
*/
public int hashCode() {
String str = m_cPoints_.substring(m_cPointsOffset_);
return str.hashCode();
}
/**
* Equals calculation
*
* @param target Object to compare
* @return true if target is the same as this object
*/
public boolean equals(Object target) {
if (target == this) {
return true;
}
if (target instanceof Elements) {
Elements t = (Elements) target;
int size = m_cPoints_.length() - m_cPointsOffset_;
if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
m_cPoints_, m_cPointsOffset_, size);
}
}
return false;
}
}
// private data member ---------------------------------------------------
/**
* Maximum strength used in CE building
*/
private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
/**
* Maximum collation strength
*/
private static final int CE_STRENGTH_LIMIT_ = 16;
/**
* Strength mask array, used in inverse UCA
*/
private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,
0xFFFFFFFF };
/**
* CE value for not found
*/
private static final int CE_NOT_FOUND_ = 0xF0000000;
/**
* CE tag for not found
*/
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
* This code point results in an expansion
*/
private static final int CE_EXPANSION_TAG_ = 1;
/**
* Start of a contraction
*/
private static final int CE_CONTRACTION_TAG_ = 2;
/*
* Thai character - do the reordering
*/
// private static final int CE_THAI_TAG_ = 3;
/*
* Charset processing, not yet implemented
*/
// private static final int CE_CHARSET_TAG_ = 4;
/**
* Lead surrogate that is tailored and doesn't start a contraction
*/
private static final int CE_SURROGATE_TAG_ = 5;
/*
* AC00-D7AF
*/
// private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/*
* D800-DBFF
*/
// private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/*
* DC00-DFFF
*/
// private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/*
* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/
// private static final int CE_CJK_IMPLICIT_TAG_ = 9;
private static final int CE_IMPLICIT_TAG_ = 10;
private static final int CE_SPEC_PROC_TAG_ = 11;
/**
* This is a three byte primary with starting secondaries and tertiaries. It
* fits in a single 32 bit CE and is used instead of expansion to save space
* without affecting the performance (hopefully)
*/
private static final int CE_LONG_PRIMARY_TAG_ = 12;
/**
* Unsafe UChar hash table size: 32 bytes provide 1 bit for each Latin-1
* char, plus some power of two for hashing the rest of the chars. Size
* in bytes.
*/
private static final int UNSAFECP_TABLE_SIZE_ = 1056;
/**
* Mask value, "some power of two" - 1. Expressed in bits, not bytes.
*/
private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
/**
* Case values
*/
private static final int UPPER_CASE_ = 0x80;
private static final int MIXED_CASE_ = 0x40;
private static final int LOWER_CASE_ = 0x00;
/*
* Initial table size
*/
// private static final int INIT_TABLE_SIZE_ = 1028;
/*
* Header size, copied from ICU4C, to be changed when that value changes
*/
// private static final int HEADER_SIZE_ = 0xC4;
/**
* Contraction table new element indicator
*/
private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
/**
* Parser for the rules
*/
private CollationRuleParser m_parser_;
/**
* Utility UCA collation element iterator
*/
private CollationElementIterator m_utilColEIter_;
/**
* Utility data members
*/
private CEGenerator m_utilGens_[] = { new CEGenerator(), new CEGenerator(),
new CEGenerator() };
private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
private Elements m_utilElement_ = new Elements();
private Elements m_utilElement2_ = new Elements();
private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
private int m_utilCountBuffer_[] = new int[6];
private long m_utilLongBuffer_[] = new long[5];
private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),
new WeightRange(), new WeightRange(), new WeightRange(),
new WeightRange() };
private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),
new WeightRange(), new WeightRange(), new WeightRange(),
new WeightRange() };
private WeightRange m_utilWeightRange_ = new WeightRange();
private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
private StringBuilder m_utilStringBuffer_ = new StringBuilder("");
// Flag indicating whether a combining mark table is required.
private static boolean buildCMTabFlag = false;
// private methods -------------------------------------------------------
/**
* Initializes the CE generation buffers for a token list
*
* @param listheader
* parsed rule tokens
* @exception Exception
* thrown when internal error occurs
*/
private void initBuffers(CollationRuleParser.TokenListHeader listheader)
throws Exception {
CollationRuleParser.Token token = listheader.m_last_;
Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
token.m_toInsert_ = 1;
m_utilIntBuffer_[token.m_strength_] = 1;
while (token.m_previous_ != null) {
if (token.m_previous_.m_strength_ < token.m_strength_) {
// going up
m_utilIntBuffer_[token.m_strength_] = 0;
m_utilIntBuffer_[token.m_previous_.m_strength_]++;
} else if (token.m_previous_.m_strength_ > token.m_strength_) {
// going down
m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
} else {
m_utilIntBuffer_[token.m_strength_]++;
}
token = token.m_previous_;
token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
}
token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
INVERSE_UCA_.getInverseGapPositions(listheader);
token = listheader.m_first_;
int fstrength = Collator.IDENTICAL;
int initstrength = Collator.IDENTICAL;
m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
listheader.m_baseContCE_, Collator.PRIMARY);
m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
listheader.m_baseContCE_, Collator.SECONDARY);
m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
listheader.m_baseContCE_, Collator.TERTIARY);
while (token != null) {
fstrength = token.m_strength_;
if (fstrength < initstrength) {
initstrength = fstrength;
if (listheader.m_pos_[fstrength] == -1) {
while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {
fstrength--;
}
if (listheader.m_pos_[fstrength] == -1) {
throw new Exception("Internal program error");
}
}
if (initstrength == Collator.TERTIARY) {
// starting with tertiary
m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
m_utilGens_[Collator.TERTIARY],
listheader.m_gapsLo_, listheader.m_gapsHi_, token,
fstrength);
} else if (initstrength == Collator.SECONDARY) {
// secondaries
m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
m_utilGens_[Collator.SECONDARY],
listheader.m_gapsLo_, listheader.m_gapsHi_, token,
fstrength);
m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
m_utilGens_[Collator.TERTIARY], token,
Collator.TERTIARY);
} else {
// primaries
m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
m_utilGens_[Collator.PRIMARY],
listheader.m_gapsLo_, listheader.m_gapsHi_, token,
fstrength);
m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
m_utilGens_[Collator.SECONDARY], token,
Collator.SECONDARY);
m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
m_utilGens_[Collator.TERTIARY], token,
Collator.TERTIARY);
}
} else {
if (token.m_strength_ == Collator.TERTIARY) {
m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
} else if (token.m_strength_ == Collator.SECONDARY) {
m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
m_utilGens_[Collator.TERTIARY], token,
Collator.TERTIARY);
} else if (token.m_strength_ == Collator.PRIMARY) {
m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
m_utilGens_[Collator.SECONDARY], token,
Collator.SECONDARY);
m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
m_utilGens_[Collator.TERTIARY], token,
Collator.TERTIARY);
}
}
doCE(m_utilCEBuffer_, token);
token = token.m_next_;
}
}
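// Counting sketch (illustrative): for a parsed list equivalent to
// "&a < b << c << d < e", the backward pass above leaves
// m_toInsert_ == 2 on b (two primaries, b and e, follow the reset),
// 2 on c, 1 on d and 1 on e; these counts later size the weight ranges
// requested from the CE generators.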
/**
* Get the next generated ce
*
* @param g
* ce generator
* @return next generated ce
*/
private int getNextGenerated(CEGenerator g) {
g.m_current_ = nextWeight(g);
return g.m_current_;
}
/**
* @param g
* CEGenerator
* @param token
* rule token
* @param strength
* @return ce generator
* @exception Exception
* thrown when internal error occurs
*/
private int getSimpleCEGenerator(CEGenerator g,
CollationRuleParser.Token token, int strength) throws Exception {
int high, low, count = 1;
int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
if (strength == Collator.SECONDARY) {
low = RuleBasedCollator.COMMON_TOP_2_ << 24;
high = 0xFFFFFFFF;
count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
} else {
low = RuleBasedCollator.BYTE_COMMON_ << 24; // 0x05000000;
high = 0x40000000;
count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
}
if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {
count = token.m_next_.m_toInsert_;
}
g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
g.m_ranges_);
g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
if (g.m_rangesLength_ == 0) {
throw new Exception("Internal program error");
}
return g.m_current_;
}
/**
* Combines two CEs into one with respect to the argument strength
*
* @param ce1
* first ce
* @param ce2
* second ce
* @param strength
* strength to use
* @return combined ce
*/
private static int mergeCE(int ce1, int ce2, int strength) {
int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
if (strength == Collator.SECONDARY) {
mask = RuleBasedCollator.CE_SECONDARY_MASK_;
} else if (strength == Collator.PRIMARY) {
mask = RuleBasedCollator.CE_PRIMARY_MASK_;
}
ce1 &= mask;
ce2 &= mask;
switch (strength) {
case Collator.PRIMARY:
return ce1 | ce2 >>> 16;
case Collator.SECONDARY:
return ce1 << 16 | ce2 << 8;
default:
return ce1 << 24 | ce2 << 16;
}
}
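// Worked example (assuming CE_PRIMARY_MASK_ == 0xFFFF0000):
// mergeCE(0x1A2B0105, 0x3C440000, Collator.PRIMARY)
// == 0x1A2B0000 | (0x3C440000 >>> 16) == 0x1A2B3C44,
// i.e. the two 16-bit primaries are packed into a single int that
// compares correctly as an unsigned value.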
/**
* @param g
* CEGenerator
* @param lows
* low gap array
* @param highs
* high gap array
* @param token
* rule token
* @param fstrength
* @exception Exception
* thrown when internal error occurs
*/
private int getCEGenerator(CEGenerator g, int lows[], int highs[],
CollationRuleParser.Token token, int fstrength) throws Exception {
int strength = token.m_strength_;
int low = lows[fstrength * 3 + strength];
int high = highs[fstrength * 3 + strength];
int maxbyte = 0;
if (strength == Collator.TERTIARY) {
maxbyte = 0x3F;
} else if (strength == Collator.PRIMARY) {
maxbyte = 0xFE;
} else {
maxbyte = 0xFF;
}
int count = token.m_toInsert_;
if (Utility.compareUnsigned(low, high) >= 0
&& strength > Collator.PRIMARY) {
int s = strength;
while (true) {
s--;
if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
if (strength == Collator.SECONDARY) {
if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {
// Override if low range is less than
// UCOL_COMMON_TOP2.
low = RuleBasedCollator.COMMON_TOP_2_ << 24;
}
high = 0xFFFFFFFF;
} else {
if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) {
// Override if low range is less than
// UCOL_COMMON_BOT3.
low = RuleBasedCollator.COMMON_BOTTOM_3 << 24;
}
high = 0x40000000;
}
break;
}
if (s < 0) {
throw new Exception("Internal program error");
}
}
}
if(0 <= low && low < 0x02000000) { // unsigned comparison < 0x02000000
// We must not use CE weight byte 02, so we set it as the minimum lower bound.
// See http://site.icu-project.org/design/collation/bytes
low = 0x02000000;
}
if (strength == Collator.SECONDARY) { // similar as simple
if (Utility.compareUnsigned(low,
RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
&& Utility.compareUnsigned(low,
RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
low = RuleBasedCollator.COMMON_TOP_2_ << 24;
}
if (Utility.compareUnsigned(high,
RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
&& Utility.compareUnsigned(high,
RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
high = RuleBasedCollator.COMMON_TOP_2_ << 24;
}
if (Utility.compareUnsigned(low,
RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
g.m_rangesLength_ = allocateWeights(
RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high,
count, maxbyte, g.m_ranges_);
g.m_current_ = nextWeight(g);
// g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
return g.m_current_;
}
}
g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
g.m_ranges_);
if (g.m_rangesLength_ == 0) {
throw new Exception("Internal program error");
}
g.m_current_ = nextWeight(g);
return g.m_current_;
}
/**
* @param ceparts
* list of collation elements parts
* @param token
* rule token
* @exception Exception
* thrown when forming case bits for expansions fails
*/
private void doCE(int ceparts[], CollationRuleParser.Token token)
throws Exception {
// packs the CE parts into 32-bit CEs and stores them on the token
// int noofbytes[] = new int[3];
for (int i = 0; i < 3; i++) {
// noofbytes[i] = countBytes(ceparts[i]);
m_utilIntBuffer_[i] = countBytes(ceparts[i]);
}
// Here we have to pack CEs from parts
int cei = 0;
int value = 0;
while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]
|| cei < m_utilIntBuffer_[2]) {
if (cei > 0) {
value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
} else {
value = 0;
}
if ((cei << 1) < m_utilIntBuffer_[0]) {
value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
}
if (cei < m_utilIntBuffer_[1]) {
value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
}
if (cei < m_utilIntBuffer_[2]) {
value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
}
token.m_CE_[cei] = value;
cei++;
}
if (cei == 0) { // totally ignorable
token.m_CELength_ = 1;
token.m_CE_[0] = 0;
} else { // there is at least something
token.m_CELength_ = cei;
}
// Case bits handling for expansion
if (token.m_CE_[0] != 0) { // case bits should be set only for
// non-ignorables
token.m_CE_[0] &= 0xFFFFFF3F; // Clean the case bits field
int cSize = (token.m_source_ & 0xFF000000) >>> 24;
int startoftokenrule = token.m_source_ & 0x00FFFFFF;
if (cSize > 1) {
// Do it manually
String tokenstr = token.m_rules_.substring(startoftokenrule,
startoftokenrule + cSize);
token.m_CE_[0] |= getCaseBits(tokenstr);
} else {
// Copy it from the UCA
int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule));
token.m_CE_[0] |= (caseCE & 0xC0);
}
}
}
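// Packing example (values illustrative): for ceparts ==
// { 0x1A2B0000, 0x05000000, 0x05000000 } the byte counts are {2, 1, 1},
// so a single CE 0x1A2B0505 is emitted - 16 primary bits, 8 secondary
// bits and 6 tertiary bits - before the case bits are ORed into
// m_CE_[0].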
/**
* Counts the number of leading bytes used in the CE: the 1-based position
* of the lowest non-zero byte, or 0 if the CE is entirely zero
*
* @param ce
* @return number of leading bytes used in ce
*/
private static final int countBytes(int ce) {
int mask = 0xFFFFFFFF;
int result = 0;
while (mask != 0) {
if ((ce & mask) != 0) {
result++;
}
mask >>>= 8;
}
return result;
}
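// Examples: countBytes(0x1A2B0000) == 2, countBytes(0x1A000005) == 4
// (the low byte is non-zero, so all four leading bytes are "in use"),
// and countBytes(0) == 0.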
/**
* We are ready to create collation elements
*
* @param t
* build table to insert
* @param lh
* rule token list header
*/
private void createElements(BuildTable t,
CollationRuleParser.TokenListHeader lh) {
CollationRuleParser.Token tok = lh.m_first_;
m_utilElement_.clear();
while (tok != null) {
// first, check if there are any expansions
// if there are expansions, we need to do a little bit more
// processing since parts of expansion can be tailored, while
// others are not
if (tok.m_expansion_ != 0) {
int len = tok.m_expansion_ >>> 24;
int currentSequenceLen = len;
int expOffset = tok.m_expansion_ & 0x00FFFFFF;
m_utilToken_.m_source_ = currentSequenceLen | expOffset;
m_utilToken_.m_rules_ = m_parser_.m_source_;
while (len > 0) {
currentSequenceLen = len;
while (currentSequenceLen > 0) {
m_utilToken_.m_source_ = (currentSequenceLen << 24)
| expOffset;
CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_);
if (expt != null
&& expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
// expansion is tailored
int noOfCEsToCopy = expt.m_CELength_;
for (int j = 0; j < noOfCEsToCopy; j++) {
tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
}
tok.m_expCELength_ += noOfCEsToCopy;
// never try to add codepoints and CEs.
// For some odd reason, it won't work.
expOffset += currentSequenceLen; // noOfCEsToCopy;
len -= currentSequenceLen; // noOfCEsToCopy;
break;
} else {
currentSequenceLen--;
}
}
if (currentSequenceLen == 0) {
// couldn't find any tailored subsequence, will have to
// get one from UCA. first, get the UChars from the
// rules then pick CEs out until there is no more and
// stuff them into expansion
m_utilColEIter_.setText(m_parser_.m_source_.substring(
expOffset, expOffset + 1));
while (true) {
int order = m_utilColEIter_.next();
if (order == CollationElementIterator.NULLORDER) {
break;
}
tok.m_expCE_[tok.m_expCELength_++] = order;
}
expOffset++;
len--;
}
}
} else {
tok.m_expCELength_ = 0;
}
// set the ucaelement with obtained values
m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;
// copy CEs
System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
tok.m_CELength_);
System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
tok.m_CELength_, tok.m_expCELength_);
// copy UChars
// We kept the prefix and source together, as it is a kind of a
// contraction.
// However, now we have to slice the prefix off the main string.
m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
if (tok.m_prefix_ != 0) {
// we will just copy the prefix here, and adjust accordingly in
// the addPrefix function in ucol_elm. The reason is that we
// need to add both composed AND decomposed elements to the
// unsafe table.
int size = tok.m_prefix_ >> 24;
int offset = tok.m_prefix_ & 0x00FFFFFF;
m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring(
offset, offset + size);
size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
offset, offset + size);
} else {
m_utilElement_.m_prefixChars_ = null;
int offset = tok.m_source_ & 0x00FFFFFF;
int size = tok.m_source_ >>> 24;
m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
offset, offset + size);
}
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
boolean containCombinMarks = false;
for (int i = 0; i < m_utilElement_.m_cPoints_.length()
- m_utilElement_.m_cPointsOffset_; i++) {
if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
t.m_collator_.m_isJamoSpecial_ = true;
break;
}
if (!buildCMTabFlag) {
// check combining class
int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters
if ((fcd & 0xff) == 0) {
// reset flag when current char is not combining mark.
containCombinMarks = false;
} else {
containCombinMarks = true;
}
}
}
if (!buildCMTabFlag && containCombinMarks) {
buildCMTabFlag = true;
}
/***
* // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; //
* Clean the case bits field if (m_utilElement_.m_cPoints_.length()
* - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually
* m_utilElement_.m_CEs_[0] |=
* getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from
* the UCA int caseCE =
* getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
* m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); }
***/
// and then, add it
addAnElement(t, m_utilElement_);
tok = tok.m_next_;
}
}
/**
* Determines the case of the string argument
*
* @param src
* string
* @return the case bits for this string
* @exception Exception
* thrown when internal program error occurs
*/
private final int getCaseBits(String src) throws Exception {
int uCount = 0;
int lCount = 0;
src = Normalizer.decompose(src, true);
m_utilColEIter_.setText(src);
for (int i = 0; i < src.length(); i++) {
m_utilColEIter_.setText(src.substring(i, i + 1));
int order = m_utilColEIter_.next();
if (RuleBasedCollator.isContinuation(order)) {
throw new Exception("Internal program error");
}
if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
uCount++;
} else {
char ch = src.charAt(i);
if (UCharacter.isLowerCase(ch)) {
lCount++;
} else {
if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
lCount++;
}
}
}
}
if (uCount != 0 && lCount != 0) {
return MIXED_CASE_;
} else if (uCount != 0) {
return UPPER_CASE_;
} else {
return LOWER_CASE_;
}
}
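// Illustrative behaviour (per the tallies above): a token whose
// decomposed characters are all upper case, e.g. "AB", yields
// UPPER_CASE_ (0x80); "Ab" mixes the tallies and yields MIXED_CASE_
// (0x40); an all-lower-case token yields LOWER_CASE_ (0x00).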
/**
* Converts a small Kana character to its large form
*
* @param ch
* character to convert
* @return the converted Kana character
*/
private static final char toLargeKana(char ch) {
if (0x3042 < ch && ch < 0x30ef) { // Kana range
switch (ch - 0x3000) {
case 0x41:
case 0x43:
case 0x45:
case 0x47:
case 0x49:
case 0x63:
case 0x83:
case 0x85:
case 0x8E:
case 0xA1:
case 0xA3:
case 0xA5:
case 0xA7:
case 0xA9:
case 0xC3:
case 0xE3:
case 0xE5:
case 0xEE:
ch++;
break;
case 0xF5:
ch = 0x30AB;
break;
case 0xF6:
ch = 0x30B1;
break;
}
}
return ch;
}
/**
* Converts a large Kana character to its small form
*
* @param ch
* character to convert
* @return the converted Kana character
*/
private static final char toSmallKana(char ch) {
if (0x3042 < ch && ch < 0x30ef) { // Kana range
switch (ch - 0x3000) {
case 0x42:
case 0x44:
case 0x46:
case 0x48:
case 0x4A:
case 0x64:
case 0x84:
case 0x86:
case 0x8F:
case 0xA2:
case 0xA4:
case 0xA6:
case 0xA8:
case 0xAA:
case 0xC4:
case 0xE4:
case 0xE6:
case 0xEF:
ch--;
break;
case 0xAB:
ch = 0x30F5;
break;
case 0xB1:
ch = 0x30F6;
break;
}
}
return ch;
}
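// Example pair: toSmallKana((char) 0x30A2) == 0x30A1 (katakana A to its
// small form) and toLargeKana((char) 0x30A1) == 0x30A2; characters
// outside the Kana range are returned unchanged.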
/**
* This should be connected to special Jamo handling.
*/
private int getFirstCE(char ch) {
m_utilColEIter_.setText(UCharacter.toString(ch));
return m_utilColEIter_.next();
}
/**
* Adds a parsed element, while testing for existence
*
* @param t
* build table
* @param element
* element to add
* @return ce
*/
private int addAnElement(BuildTable t, Elements element) {
List expansions = t.m_expansions_;
element.m_mapCE_ = 0;
if (element.m_CELength_ == 1) {
element.m_mapCE_ = element.m_CEs_[0];
} else {
// unfortunately, it looks like we have to look for a long primary
// here since in canonical closure we are going to hit some long
// primaries from the first phase, and they will come back as
// continuations/expansions destroying the effect of the previous
// opitimization. A long primary is a three byte primary with
// starting secondaries and tertiaries. It can appear in long runs
// of only primary differences (like east Asian tailorings) also,
// it should not be an expansion, as expansions would break with
// this
if (element.m_CELength_ == 2 // a two CE expansion
&& RuleBasedCollator.isContinuation(element.m_CEs_[1])
// that has only primaries in the continuation
&& (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0
// a common secondary
&& (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
// and a common tertiary
&& ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
) {
element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
// a long primary special
| (CE_LONG_PRIMARY_TAG_ << 24)
// first and second byte of primary
| ((element.m_CEs_[0] >> 8) & 0xFFFF00)
// third byte of primary
| ((element.m_CEs_[1] >> 24) & 0xFF);
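// Worked example (assuming CE_SPECIAL_FLAG_ == 0xF0000000 and
// BYTE_COMMON_ == 0x05): the CE pair {0x1A2B0505, 0x3C0000C0} carries
// the three-byte primary 1A 2B 3C with common secondary/tertiary, and
// folds into the single special CE 0xFC1A2B3C.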
} else {
// omitting expansion offset in builder
// (HEADER_SIZE_ >> 2)
int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
| (addExpansion(expansions, element.m_CEs_[0]) << 4)
& 0xFFFFF0;
for (int i = 1; i < element.m_CELength_; i++) {
addExpansion(expansions, element.m_CEs_[i]);
}
if (element.m_CELength_ <= 0xF) {
expansion |= element.m_CELength_;
} else {
addExpansion(expansions, 0);
}
element.m_mapCE_ = expansion;
setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
(byte) element.m_CELength_, t.m_maxExpansions_);
if (isJamo(element.m_cPoints_.charAt(0))) {
t.m_collator_.m_isJamoSpecial_ = true;
setMaxJamoExpansion(element.m_cPoints_.charAt(0),
element.m_CEs_[element.m_CELength_ - 1],
(byte) element.m_CELength_, t.m_maxJamoExpansions_);
}
}
}
// We treat digits differently - they are "uber special" and should be
// processed differently if numeric collation is on.
int uniChar = 0;
if ((element.m_uchars_.length() == 2)
&& UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_
.charAt(0), element.m_uchars_.charAt(1));
} else if (element.m_uchars_.length() == 1) {
uniChar = element.m_uchars_.charAt(0);
}
// Here, we either have one normal CE OR mapCE is set. Therefore, we
// stuff only one element to the expansion buffer. When we encounter a
// digit and we don't do numeric collation, we will just pick the CE
// we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
// && ucol_prv_getSpecialPrevCE). If we picked a special, further
// processing will occur. If it's a simple CE, we'll return due
// to how the loop is constructed.
if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
// prepare the element
int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
| 1;
if (element.m_mapCE_ != 0) {
// if there is an expansion, we'll pick it here
expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
} else {
expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
}
element.m_mapCE_ = expansion;
}
// here we want to add the prefix structure.
// I will try to process it as a reverse contraction, if possible.
// prefix buffer is already reversed.
if (element.m_prefixChars_ != null
&& element.m_prefixChars_.length() - element.m_prefix_ > 0) {
            // We keep the seen prefix starter elements in a hashtable. We
            // need it to be able to distinguish between the simple codepoints
            // and prefix starters. Also, we need to use it for canonical
            // closure.
m_utilElement2_.m_caseBit_ = element.m_caseBit_;
m_utilElement2_.m_CELength_ = element.m_CELength_;
m_utilElement2_.m_CEs_ = element.m_CEs_;
m_utilElement2_.m_mapCE_ = element.m_mapCE_;
// m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
m_utilElement2_.m_variableTop_ = element.m_variableTop_;
m_utilElement2_.m_prefix_ = element.m_prefix_;
m_utilElement2_.m_prefixChars_ = Normalizer.compose(
element.m_prefixChars_, false);
m_utilElement2_.m_uchars_ = element.m_uchars_;
m_utilElement2_.m_cPoints_ = element.m_cPoints_;
m_utilElement2_.m_cPointsOffset_ = 0;
if (t.m_prefixLookup_ != null) {
Elements uCE = t.m_prefixLookup_.get(element);
if (uCE != null) {
// there is already a set of code points here
element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
} else { // no code points, so this spot is clean
element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
uCE = new Elements(element);
uCE.m_cPoints_ = uCE.m_uchars_;
t.m_prefixLookup_.put(uCE, uCE);
}
if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
.length()
- element.m_prefix_
|| !m_utilElement2_.m_prefixChars_.regionMatches(0,
element.m_prefixChars_, element.m_prefix_,
m_utilElement2_.m_prefixChars_.length())) {
// do it!
m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
m_utilElement2_);
}
}
}
// We need to use the canonical iterator here
// the way we do it is to generate the canonically equivalent strings
// for the contraction and then add the sequences that pass FCD check
if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
&& !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
&& UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16
.isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
// this is a contraction, we should check whether a composed form
// should also be included
m_utilCanIter_.setSource(element.m_cPoints_);
String source = m_utilCanIter_.next();
while (source != null && source.length() > 0) {
if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
element.m_uchars_ = source;
element.m_cPoints_ = element.m_uchars_;
finalizeAddition(t, element);
}
source = m_utilCanIter_.next();
}
return element.m_mapCE_;
} else {
return finalizeAddition(t, element);
}
}
/**
* Adds an expansion ce to the expansion vector
*
* @param expansions
* vector to add to
* @param value
* of the expansion
* @return the current position of the new element
*/
    private static final int addExpansion(List<Integer> expansions, int value) {
expansions.add(Integer.valueOf(value));
return expansions.size() - 1;
}
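    // Editor's illustrative sketch (not part of the original ICU4J source):
    // addExpansion appends the CE and reports its index, so consecutive
    // calls on a fresh list return 0, 1, 2, ...
    private static void demoAddExpansion() {
        List<Integer> expansions = new ArrayList<Integer>();
        assert addExpansion(expansions, 0x12345678) == 0;
        assert addExpansion(expansions, 0x23456789) == 1;
    }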
    /**
     * Looks for the maximum length of all expansion sequences ending with the
     * same collation element, and records the end expansion collation element
     * and its size in the maxexpansion data structure.
     *
     * @param endexpansion
     *            the last expansion collation element to be added
     * @param expansionsize
     *            size of the expansion
     * @param maxexpansion
     *            data structure to store the maximum expansion data.
     * @return the number of entries in the maxexpansion table.
     */
private static int setMaxExpansion(int endexpansion, byte expansionsize,
MaxExpansionTable maxexpansion) {
int start = 0;
int limit = maxexpansion.m_endExpansionCE_.size();
        long unsigned = endexpansion & 0xFFFFFFFFL;
// using binary search to determine if last expansion element is
// already in the array
int result = -1;
if (limit > 0) {
while (start < limit - 1) {
int mid = (start + limit) >> 1;
                long unsignedce = maxexpansion.m_endExpansionCE_.get(mid)
                        .intValue() & 0xFFFFFFFFL;
if (unsigned < unsignedce) {
limit = mid;
} else {
start = mid;
}
}
if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) {
result = start;
}
}
if (result > -1) {
// found the ce in expansion, we'll just modify the size if it
// is smaller
            byte currentsize = maxexpansion.m_expansionCESize_.get(result)
                    .byteValue();
            if (currentsize < expansionsize) {
maxexpansion.m_expansionCESize_.set(result, Byte.valueOf(
expansionsize));
}
} else {
// we'll need to squeeze the value into the array. initial
// implementation. shifting the subarray down by 1
maxexpansion.m_endExpansionCE_.add(start + 1, Integer.valueOf(endexpansion));
maxexpansion.m_expansionCESize_.add(start + 1, Byte.valueOf(expansionsize));
}
return maxexpansion.m_endExpansionCE_.size();
}
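    // Editor's illustrative sketch (not part of the original ICU4J source),
    // assuming MaxExpansionTable's no-argument constructor seeds the dummy
    // first entry that assembleTable() later skips: inserting the same
    // end-expansion CE twice keeps one entry and only grows its stored size.
    private static void demoSetMaxExpansion() {
        MaxExpansionTable table = new MaxExpansionTable();
        setMaxExpansion(0x10000000, (byte) 2, table);
        setMaxExpansion(0x10000000, (byte) 3, table); // same CE, longer
        assert table.m_endExpansionCE_.size() == 2; // dummy + one entry
        assert table.m_expansionCESize_.get(1).byteValue() == 3;
    }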
    /**
     * Sets the maximum length of all Jamo expansion sequences ending with the
     * same collation element, and records the end expansion collation element
     * in the maxexpansion data structure.
     *
     * @param ch
     *            the jamo codepoint
     * @param endexpansion
     *            the last expansion collation element to be added
     * @param expansionsize
     *            size of the expansion
     * @param maxexpansion
     *            data structure to store the maximum expansion data.
     * @return the number of entries in the maxexpansion table.
     */
private static int setMaxJamoExpansion(char ch, int endexpansion,
byte expansionsize, MaxJamoExpansionTable maxexpansion) {
boolean isV = true;
if (ch >= 0x1100 && ch <= 0x1112) {
            // determines L for Jamo; we don't need to store this since it is
            // never at the end of an expansion
if (maxexpansion.m_maxLSize_ < expansionsize) {
maxexpansion.m_maxLSize_ = expansionsize;
}
return maxexpansion.m_endExpansionCE_.size();
}
if (ch >= 0x1161 && ch <= 0x1175) {
// determines V for Jamo
if (maxexpansion.m_maxVSize_ < expansionsize) {
maxexpansion.m_maxVSize_ = expansionsize;
}
}
if (ch >= 0x11A8 && ch <= 0x11C2) {
isV = false;
// determines T for Jamo
if (maxexpansion.m_maxTSize_ < expansionsize) {
maxexpansion.m_maxTSize_ = expansionsize;
}
}
int pos = maxexpansion.m_endExpansionCE_.size();
while (pos > 0) {
pos--;
if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) {
return maxexpansion.m_endExpansionCE_.size();
}
}
maxexpansion.m_endExpansionCE_.add(Integer.valueOf(endexpansion));
maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);
return maxexpansion.m_endExpansionCE_.size();
}
/**
* Adds a prefix to the table
*
* @param t
* build table to update
* @param CE
* collation element to add
* @param element
* rule element to add
* @return modified ce
*/
private int addPrefix(BuildTable t, int CE, Elements element) {
        // currently the longest prefix we're supporting in Japanese is two
        // characters long. Although this table could quite easily mimic the
        // complete contraction mechanism, there is no good reason to build a
        // general solution, as it would require some error-prone messing.
ContractionTable contractions = t.m_contractions_;
String oldCP = element.m_cPoints_;
int oldCPOffset = element.m_cPointsOffset_;
contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
// here, we will normalize & add prefix to the table.
int size = element.m_prefixChars_.length() - element.m_prefix_;
for (int j = 1; j < size; j++) {
            // First add NFD prefix chars to unsafe CP hash table
            // Unless it is a trail surrogate, which is handled algorithmically
            // and shouldn't take up space in the table.
char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);
if (!UTF16.isTrailSurrogate(ch)) {
unsafeCPSet(t.m_unsafeCP_, ch);
}
}
// StringBuffer reversed = new StringBuffer();
m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
for (int j = 0; j < size; j++) {
// prefixes are going to be looked up backwards
// therefore, we will promptly reverse the prefix buffer...
int offset = element.m_prefixChars_.length() - j - 1;
m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));
}
element.m_prefixChars_ = m_utilStringBuffer_.toString();
element.m_prefix_ = 0;
// the first codepoint is also unsafe, as it forms a 'contraction' with
// the prefix
if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
}
element.m_cPoints_ = element.m_prefixChars_;
element.m_cPointsOffset_ = element.m_prefix_;
// Add the last char of the contraction to the contraction-end hash
// table. unless it is a trail surrogate, which is handled
// algorithmically and shouldn't be in the table
if (!UTF16.isTrailSurrogate(element.m_cPoints_
.charAt(element.m_cPoints_.length() - 1))) {
ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
.charAt(element.m_cPoints_.length() - 1));
}
        // First we need to check if the contraction starts with a surrogate
// int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
// If there are any Jamos in the contraction, we should turn on special
// processing for Jamos
if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
t.m_collator_.m_isJamoSpecial_ = true;
}
        // then we need to deal with it
        // we could already have something in the table - or we might not
if (!isPrefix(CE)) {
// if it wasn't contraction, we wouldn't end up here
int firstContractionOffset = addContraction(contractions,
CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
addContraction(contractions, firstContractionOffset,
element.m_prefixChars_.charAt(element.m_prefix_), newCE);
addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
CE);
CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);
} else {
// we are adding to existing contraction
// there were already some elements in the table, so we need to add
// a new contraction
// Two things can happen here: either the codepoint is already in
// the table, or it is not
char ch = element.m_prefixChars_.charAt(element.m_prefix_);
int position = findCP(contractions, CE, ch);
if (position > 0) {
// if it is we just continue down the chain
int eCE = getCE(contractions, CE, position);
int newCE = processContraction(contractions, element, eCE);
setContraction(contractions, CE, position, ch, newCE);
} else {
// if it isn't, we will have to create a new sequence
processContraction(contractions, element, CE_NOT_FOUND_);
insertContraction(contractions, CE, ch, element.m_mapCE_);
}
}
element.m_cPoints_ = oldCP;
element.m_cPointsOffset_ = oldCPOffset;
return CE;
}
/**
* Checks if the argument ce is a contraction
*
* @param CE
* collation element
* @return true if argument ce is a contraction
*/
private static final boolean isContraction(int CE) {
return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
}
/**
* Checks if the argument ce has a prefix
*
* @param CE
* collation element
* @return true if argument ce has a prefix
*/
private static final boolean isPrefix(int CE) {
return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
}
/**
* Checks if the argument ce is special
*
* @param CE
* collation element
* @return true if argument ce is special
*/
private static final boolean isSpecial(int CE) {
return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
}
    /**
     * Gets the tag of the argument ce
     *
     * @param CE
     *            collation element
     * @return tag of the argument ce
     */
private static final int getCETag(int CE) {
return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
}
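    // Editor's illustrative sketch (not part of the original ICU4J source):
    // round trip between constructSpecialCE (below) and the predicates
    // above, assuming CE_CONTRACTION_TAG_ fits in the 4-bit tag field.
    private static void demoSpecialCE() {
        int ce = constructSpecialCE(CE_CONTRACTION_TAG_, 0x000042);
        assert isSpecial(ce);
        assert getCETag(ce) == CE_CONTRACTION_TAG_;
        assert getContractionOffset(ce) == 0x000042; // low 24 bits
    }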
    /**
     * Gets the ce at position in contraction table
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction element in the table
     * @param position
     *            offset within the contraction element
     * @return ce
     */
private static final int getCE(ContractionTable table, int element,
int position) {
element &= 0xFFFFFF;
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
return CE_NOT_FOUND_;
}
        if (position >= tbl.m_CEs_.size() || position < 0) {
return CE_NOT_FOUND_;
} else {
return tbl.m_CEs_.get(position).intValue();
}
}
/**
* Sets the unsafe character
*
* @param table
* unsafe table
* @param c
* character to be added
*/
private static final void unsafeCPSet(byte table[], char c) {
int hash = c;
if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
if (hash >= 0xd800 && hash <= 0xf8ff) {
// Part of a surrogate, or in private use area.
// These don't go in the table
return;
}
hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
}
table[hash >> 3] |= (1 << (hash & 7));
}
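    // Editor's illustrative sketch (not part of the original ICU4J source),
    // assuming the table is allocated with UNSAFECP_TABLE_SIZE_ bytes as the
    // OR-merge loop in assembleTable() suggests: the unsafe table is a bit
    // set indexed by (folded) code point, and surrogates are never added.
    private static void demoUnsafeCPSet() {
        byte[] table = new byte[UNSAFECP_TABLE_SIZE_];
        unsafeCPSet(table, 'A');
        assert (table['A' >> 3] & (1 << ('A' & 7))) != 0;
        unsafeCPSet(table, '\uD800'); // silently skipped
    }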
/**
* Sets the contraction end character
*
* @param table
* contraction end table
* @param c
* character to be added
*/
private static final void ContrEndCPSet(byte table[], char c) {
int hash = c;
if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
}
table[hash >> 3] |= (1 << (hash & 7));
}
    /**
     * Adds more contractions to the table. If the element is nonexistent, it
     * creates one. Returns the element handle.
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction table
     * @param codePoint
     *            codepoint to add
     * @param value
     *            collation element value
     * @return collation element
     */
private static int addContraction(ContractionTable table, int element,
char codePoint, int value) {
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
tbl = addAContractionElement(table);
element = table.m_elements_.size() - 1;
}
tbl.m_CEs_.add(Integer.valueOf(value));
tbl.m_codePoints_.append(codePoint);
return constructSpecialCE(table.m_currentTag_, element);
}
/**
* Adds a contraction element to the table
*
* @param table
* contraction table to update
* @return contraction
*/
private static BasicContractionTable addAContractionElement(
ContractionTable table) {
BasicContractionTable result = new BasicContractionTable();
table.m_elements_.add(result);
return result;
}
/**
* Constructs a special ce
*
* @param tag
* special tag
* @param CE
* collation element
* @return a contraction ce
*/
private static final int constructSpecialCE(int tag, int CE) {
return RuleBasedCollator.CE_SPECIAL_FLAG_
| (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);
}
    /**
     * Sets and inserts the element that has a contraction
     *
     * @param contractions
     *            contraction table
     * @param element
     *            contracting element
     * @param existingCE
     *            the collation element already stored at this position
     * @return contraction ce
     */
private static int processContraction(ContractionTable contractions,
Elements element, int existingCE) {
int firstContractionOffset = 0;
// end of recursion
if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
if (isContractionTableElement(existingCE)
&& getCETag(existingCE) == contractions.m_currentTag_) {
changeContraction(contractions, existingCE, (char) 0,
element.m_mapCE_);
changeContraction(contractions, existingCE, (char) 0xFFFF,
element.m_mapCE_);
return existingCE;
} else {
// can't do just that. existingCe might be a contraction,
// meaning that we need to do another step
return element.m_mapCE_;
}
}
        // this recursion currently feeds on the only element we have...
        // We will have to copy it in order to accommodate both backward
        // and forward cycles.
        // We encountered either an empty space or a non-contraction element;
        // this means we are constructing a new contraction sequence.
element.m_cPointsOffset_++;
if (!isContractionTableElement(existingCE)) {
// if it wasn't contraction, we wouldn't end up here
firstContractionOffset = addContraction(contractions,
CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE);
int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
addContraction(contractions, firstContractionOffset,
element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE);
addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
existingCE);
existingCE = constructSpecialCE(contractions.m_currentTag_,
firstContractionOffset);
} else {
// we are adding to existing contraction
// there were already some elements in the table, so we need to add
// a new contraction
// Two things can happen here: either the codepoint is already in
// the table, or it is not
int position = findCP(contractions, existingCE, element.m_cPoints_
.charAt(element.m_cPointsOffset_));
if (position > 0) {
// if it is we just continue down the chain
int eCE = getCE(contractions, existingCE, position);
int newCE = processContraction(contractions, element, eCE);
setContraction(contractions, existingCE, position,
element.m_cPoints_.charAt(element.m_cPointsOffset_),
newCE);
} else {
// if it isn't, we will have to create a new sequence
int newCE = processContraction(contractions, element,
CE_NOT_FOUND_);
insertContraction(contractions, existingCE, element.m_cPoints_
.charAt(element.m_cPointsOffset_), newCE);
}
}
element.m_cPointsOffset_--;
return existingCE;
}
/**
* Checks if CE belongs to the contraction table
*
* @param CE
* collation element to test
* @return true if CE belongs to the contraction table
*/
private static final boolean isContractionTableElement(int CE) {
return isSpecial(CE)
&& (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
}
    /**
     * Finds the offset of a code point within a contraction element
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction element in the table
     * @param codePoint
     *            code point to look for
     * @return the offset to the code point, or -1 if it is not present
     */
private static int findCP(ContractionTable table, int element,
char codePoint) {
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
return -1;
}
int position = 0;
while (codePoint > tbl.m_codePoints_.charAt(position)) {
position++;
            if (position >= tbl.m_codePoints_.length()) {
return -1;
}
}
if (codePoint == tbl.m_codePoints_.charAt(position)) {
return position;
} else {
return -1;
}
}
/**
* Gets the contraction element out of the contraction table
*
* @param table
* contraction table
* @param offset
* to the element in the contraction table
* @return basic contraction element at offset in the contraction table
*/
private static final BasicContractionTable getBasicContractionTable(
ContractionTable table, int offset) {
offset &= 0xFFFFFF;
if (offset == 0xFFFFFF) {
return null;
}
return table.m_elements_.get(offset);
}
/**
* Changes the contraction element
*
* @param table
* contraction table
* @param element
* offset to the element in the contraction table
* @param codePoint
* codepoint
* @param newCE
* new collation element
* @return basic contraction element at offset in the contraction table
*/
private static final int changeContraction(ContractionTable table,
int element, char codePoint, int newCE) {
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
return 0;
}
int position = 0;
while (codePoint > tbl.m_codePoints_.charAt(position)) {
position++;
            if (position >= tbl.m_codePoints_.length()) {
return CE_NOT_FOUND_;
}
}
if (codePoint == tbl.m_codePoints_.charAt(position)) {
tbl.m_CEs_.set(position, Integer.valueOf(newCE));
return element & 0xFFFFFF;
} else {
return CE_NOT_FOUND_;
}
}
    /**
     * Sets a part of a contraction sequence in the table. If the element is
     * nonexistent, it creates one. Returns the element handle.
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction table
     * @param offset
     *            offset within the contraction element
     * @param codePoint
     *            contraction character
     * @param value
     *            ce value
     * @return new contraction ce
     */
private static final int setContraction(ContractionTable table,
int element, int offset, char codePoint, int value) {
element &= 0xFFFFFF;
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
tbl = addAContractionElement(table);
element = table.m_elements_.size() - 1;
}
tbl.m_CEs_.set(offset, Integer.valueOf(value));
tbl.m_codePoints_.setCharAt(offset, codePoint);
return constructSpecialCE(table.m_currentTag_, element);
}
    /**
     * Inserts a part of a contraction sequence in the table. Sequences behind
     * the offset are moved back. If the element is nonexistent, it creates
     * one.
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the table contraction
     * @param codePoint
     *            code point
     * @param value
     *            collation element value
     * @return contraction collation element
     */
private static final int insertContraction(ContractionTable table,
int element, char codePoint, int value) {
element &= 0xFFFFFF;
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
tbl = addAContractionElement(table);
element = table.m_elements_.size() - 1;
}
int offset = 0;
        while (offset < tbl.m_codePoints_.length()
                && tbl.m_codePoints_.charAt(offset) < codePoint) {
offset++;
}
tbl.m_CEs_.add(offset, Integer.valueOf(value));
tbl.m_codePoints_.insert(offset, codePoint);
return constructSpecialCE(table.m_currentTag_, element);
}
    /**
     * Finalizes an addition
     *
     * @param t
     *            build table
     * @param element
     *            to add
     * @return the collation element at the mapping position, updated if a
     *         contraction was built; CE_NOT_FOUND_ if the position was empty
     */
private final static int finalizeAddition(BuildTable t, Elements element) {
int CE = CE_NOT_FOUND_;
// This should add a completely ignorable element to the
// unsafe table, so that backward iteration will skip
// over it when treating contractions.
if (element.m_mapCE_ == 0) {
for (int i = 0; i < element.m_cPoints_.length(); i++) {
char ch = element.m_cPoints_.charAt(i);
if (!UTF16.isTrailSurrogate(ch)) {
unsafeCPSet(t.m_unsafeCP_, ch);
}
}
}
if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
// we're adding a contraction
int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
CE = t.m_mapping_.getValue(cp);
CE = addContraction(t, CE, element);
} else {
// easy case
CE = t.m_mapping_.getValue(element.m_cPoints_
.charAt(element.m_cPointsOffset_));
if (CE != CE_NOT_FOUND_) {
if (isContractionTableElement(CE)) {
// adding a non contraction element (thai, expansion,
// single) to already existing contraction
if (!isPrefix(element.m_mapCE_)) {
// we cannot reenter prefix elements - as we are going
// to create a dead loop
// Only expansions and regular CEs can go here...
// Contractions will never happen in this place
setContraction(t.m_contractions_, CE, 0, (char) 0,
element.m_mapCE_);
// This loop has to change the CE at the end of
// contraction REDO!
changeLastCE(t.m_contractions_, CE, element.m_mapCE_);
}
} else {
t.m_mapping_
.setValue(element.m_cPoints_
.charAt(element.m_cPointsOffset_),
element.m_mapCE_);
if (element.m_prefixChars_ != null
&& element.m_prefixChars_.length() > 0
&& getCETag(CE) != CE_IMPLICIT_TAG_) {
// Add CE for standalone precontext char.
Elements origElem = new Elements();
origElem.m_prefixChars_ = null;
origElem.m_uchars_ = element.m_cPoints_;
origElem.m_cPoints_ = origElem.m_uchars_;
origElem.m_CEs_[0] = CE;
origElem.m_mapCE_ = CE;
origElem.m_CELength_ = 1;
finalizeAddition(t, origElem);
}
}
} else {
t.m_mapping_.setValue(element.m_cPoints_
.charAt(element.m_cPointsOffset_), element.m_mapCE_);
}
}
return CE;
}
/**
* Note regarding surrogate handling: We are interested only in the single
* or leading surrogates in a contraction. If a surrogate is somewhere else
* in the contraction, it is going to be handled as a pair of code units, as
* it doesn't affect the performance AND handling surrogates specially would
* complicate code way too much.
*/
private static int addContraction(BuildTable t, int CE, Elements element) {
ContractionTable contractions = t.m_contractions_;
contractions.m_currentTag_ = CE_CONTRACTION_TAG_;
        // First we need to check if the contraction starts with a surrogate
int cp = UTF16.charAt(element.m_cPoints_, 0);
int cpsize = 1;
if (UCharacter.isSupplementary(cp)) {
cpsize = 2;
}
if (cpsize < element.m_cPoints_.length()) {
// This is a real contraction, if there are other characters after
// the first
int size = element.m_cPoints_.length() - element.m_cPointsOffset_;
for (int j = 1; j < size; j++) {
                // First add contraction chars to unsafe CP hash table
                // Unless it is a trail surrogate, which is handled
                // algorithmically and shouldn't take up space in the table.
if (!UTF16.isTrailSurrogate(element.m_cPoints_
.charAt(element.m_cPointsOffset_ + j))) {
unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
.charAt(element.m_cPointsOffset_ + j));
}
}
// Add the last char of the contraction to the contraction-end
// hash table. unless it is a trail surrogate, which is handled
// algorithmically and shouldn't be in the table
if (!UTF16.isTrailSurrogate(element.m_cPoints_
.charAt(element.m_cPoints_.length() - 1))) {
ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
.charAt(element.m_cPoints_.length() - 1));
}
// If there are any Jamos in the contraction, we should turn on
// special processing for Jamos
if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {
t.m_collator_.m_isJamoSpecial_ = true;
}
            // then we need to deal with it
            // we could already have something in the table - or we might not
element.m_cPointsOffset_ += cpsize;
if (!isContraction(CE)) {
// if it wasn't contraction, we wouldn't end up here
int firstContractionOffset = addContraction(contractions,
CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
int newCE = processContraction(contractions, element,
CE_NOT_FOUND_);
addContraction(contractions, firstContractionOffset,
element.m_cPoints_.charAt(element.m_cPointsOffset_),
newCE);
addContraction(contractions, firstContractionOffset,
(char) 0xFFFF, CE);
CE = constructSpecialCE(CE_CONTRACTION_TAG_,
firstContractionOffset);
} else {
// we are adding to existing contraction
// there were already some elements in the table, so we need to
// add a new contraction
// Two things can happen here: either the codepoint is already
// in the table, or it is not
int position = findCP(contractions, CE, element.m_cPoints_
.charAt(element.m_cPointsOffset_));
if (position > 0) {
// if it is we just continue down the chain
int eCE = getCE(contractions, CE, position);
int newCE = processContraction(contractions, element, eCE);
setContraction(
contractions,
CE,
position,
element.m_cPoints_.charAt(element.m_cPointsOffset_),
newCE);
} else {
// if it isn't, we will have to create a new sequence
int newCE = processContraction(contractions, element,
CE_NOT_FOUND_);
insertContraction(contractions, CE, element.m_cPoints_
.charAt(element.m_cPointsOffset_), newCE);
}
}
element.m_cPointsOffset_ -= cpsize;
t.m_mapping_.setValue(cp, CE);
} else if (!isContraction(CE)) {
// this is just a surrogate, and there is no contraction
t.m_mapping_.setValue(cp, element.m_mapCE_);
} else {
// fill out the first stage of the contraction with the surrogate
// CE
changeContraction(contractions, CE, (char) 0, element.m_mapCE_);
changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_);
}
return CE;
}
    /**
     * Changes the last CE of a contraction element. This is used when adding
     * a non-contraction element on top of an existing contraction.
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction table
     * @param value
     *            collation element value
     * @return new collation element
     */
private static final int changeLastCE(ContractionTable table, int element,
int value) {
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
return 0;
}
tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, Integer.valueOf(value));
return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);
}
    /**
     * Given a set of ranges calculated by allocateWeights(), iterates through
     * the weights. Sets the next weight in cegenerator.m_current_.
     *
     * @param cegenerator
     *            object that contains the weight range array and its
     *            rangeCount
     * @return the next weight
     */
private static int nextWeight(CEGenerator cegenerator) {
if (cegenerator.m_rangesLength_ > 0) {
// get maxByte from the .count field
int maxByte = cegenerator.m_ranges_[0].m_count_;
// get the next weight
int weight = cegenerator.m_ranges_[0].m_start_;
if (weight == cegenerator.m_ranges_[0].m_end_) {
// this range is finished, remove it and move the following
// ones up
cegenerator.m_rangesLength_--;
if (cegenerator.m_rangesLength_ > 0) {
System.arraycopy(cegenerator.m_ranges_, 1,
cegenerator.m_ranges_, 0,
cegenerator.m_rangesLength_);
cegenerator.m_ranges_[0].m_count_ = maxByte;
// keep maxByte in ranges[0]
}
} else {
// increment the weight for the next value
cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
cegenerator.m_ranges_[0].m_length2_, maxByte);
}
return weight;
}
return -1;
}
    /**
     * Increments the collation weight at the given byte position
     *
     * @param weight
     *            to increment
     * @param length
     *            position of the byte to increment, carrying into earlier
     *            bytes on rollover
     * @param maxByte
     *            maximum allowed byte value
     * @return new incremented weight
     */
private static final int incWeight(int weight, int length, int maxByte) {
while (true) {
int b = getWeightByte(weight, length);
if (b < maxByte) {
return setWeightByte(weight, length, b + 1);
} else {
// roll over, set this byte to BYTE_FIRST_TAILORED_ and
// increment the previous one
weight = setWeightByte(weight, length,
RuleBasedCollator.BYTE_FIRST_TAILORED_);
--length;
}
}
}
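    // Editor's illustrative sketch (not part of the original ICU4J source),
    // with an assumed maxByte of 0xFF: incrementing a byte below maxByte is
    // a simple increment, while a byte at maxByte rolls over to
    // BYTE_FIRST_TAILORED_ and carries into the previous byte.
    private static void demoIncWeight() {
        assert incWeight(0x05030000, 2, 0xFF) == 0x05040000;
        assert incWeight(0x05FF0000, 2, 0xFF)
                == ((0x06 << 24) | (RuleBasedCollator.BYTE_FIRST_TAILORED_ << 16));
    }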
    /**
     * Gets the byte of a weight at the given index; byte 1 is the most
     * significant
     *
     * @param weight
     * @param index
     * @return byte
     */
private static final int getWeightByte(int weight, int index) {
return (weight >> ((4 - index) << 3)) & 0xff;
}
    /**
     * Sets the byte of a weight at the given index
     *
     * @param weight
     * @param index
     * @param b
     *            byte value to set
     * @return the modified weight
     */
private static final int setWeightByte(int weight, int index, int b) {
index <<= 3;
// 0xffffffff except a 00 "hole" for the index-th byte
int mask;
if (index < 32) {
mask = 0xffffffff >>> index;
} else {
// Do not use int>>>32 because that does not shift at all
// while we need it to become 0.
//
// Java Language Specification (Third Edition) 15.19 Shift Operators:
// "If the promoted type of the left-hand operand is int,
// only the five lowest-order bits of the right-hand operand
// are used as the shift distance.
// It is as if the right-hand operand were subjected to
// a bitwise logical AND operator & (§15.22.1) with the mask value 0x1f.
// The shift distance actually used is therefore
// always in the range 0 to 31, inclusive."
mask = 0;
}
index = 32 - index;
mask |= 0xffffff00 << index;
return (weight & mask) | (b << index);
}
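    // Editor's illustrative sketch (not part of the original ICU4J source):
    // weight bytes are numbered 1 (most significant) to 4 (least
    // significant), a convention shared by the accessors above and below.
    private static void demoWeightBytes() {
        assert getWeightByte(0x05030000, 2) == 0x03;
        assert setWeightByte(0x05030000, 2, 0x07) == 0x05070000;
        assert incWeightTrail(0x05030000, 2) == 0x05040000;
        assert truncateWeight(0x05030407, 2) == 0x05030000;
        assert lengthOfWeight(0x05030000) == 2;
    }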
    /**
     * Calls getWeightRanges and then determines heuristically which ranges to
     * use for a given number of weights between (and excluding) two limits
     *
     * @param lowerLimit
     * @param upperLimit
     * @param n
     *            number of weights to allocate
     * @param maxByte
     * @param ranges
     * @return the number of ranges used, or 0 if the weights cannot be
     *         allocated
     */
private int allocateWeights(int lowerLimit, int upperLimit, int n,
int maxByte, WeightRange ranges[]) {
// number of usable byte values 3..maxByte
int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
        // m_utilCountBuffer_: [0] unused, [5] to make index checks
        // unnecessary; m_utilLongBuffer_: countBytes to the power of the
        // index, kept as long for unsignedness
m_utilLongBuffer_[0] = 1;
m_utilLongBuffer_[1] = countBytes;
m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,
countBytes, ranges);
if (rangeCount <= 0) {
return 0;
}
// what is the maximum number of weights with these ranges?
long maxCount = 0;
for (int i = 0; i < rangeCount; ++i) {
maxCount += (long) ranges[i].m_count_
* m_utilLongBuffer_[4 - ranges[i].m_length_];
}
if (maxCount < n) {
return 0;
}
// set the length2 and count2 fields
for (int i = 0; i < rangeCount; ++i) {
ranges[i].m_length2_ = ranges[i].m_length_;
ranges[i].m_count2_ = ranges[i].m_count_;
}
// try until we find suitably large ranges
while (true) {
// get the smallest number of bytes in a range
int minLength = ranges[0].m_length2_;
// sum up the number of elements that fit into ranges of each byte
// length
Arrays.fill(m_utilCountBuffer_, 0);
for (int i = 0; i < rangeCount; ++i) {
m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
}
// now try to allocate n elements in the available short ranges
if (n <= m_utilCountBuffer_[minLength]
+ m_utilCountBuffer_[minLength + 1]) {
// trivial cases, use the first few ranges
maxCount = 0;
rangeCount = 0;
do {
maxCount += ranges[rangeCount].m_count2_;
++rangeCount;
} while (n > maxCount);
break;
} else if (n <= ranges[0].m_count2_ * countBytes) {
// easy case, just make this one range large enough by
// lengthening it once more, possibly split it
rangeCount = 1;
// calculate how to split the range between maxLength-1
// (count1) and maxLength (count2)
long power_1 = m_utilLongBuffer_[minLength
- ranges[0].m_length_];
long power = power_1 * countBytes;
int count2 = (int) ((n + power - 1) / power);
int count1 = ranges[0].m_count_ - count2;
// split the range
if (count1 < 1) {
// lengthen the entire range to maxLength
lengthenRange(ranges, 0, maxByte, countBytes);
} else {
// really split the range
// create a new range with the end and initial and current
// length of the old one
rangeCount = 2;
ranges[1].m_end_ = ranges[0].m_end_;
ranges[1].m_length_ = ranges[0].m_length_;
ranges[1].m_length2_ = minLength;
// set the end of the first range according to count1
int i = ranges[0].m_length_;
int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;
// ranges[0].count and count1 may be >countBytes from
// merging adjacent ranges; b > maxByte is possible
if (b <= maxByte) {
ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,
b);
} else {
ranges[0].m_end_ = setWeightByte(incWeight(
ranges[0].m_start_, i - 1, maxByte), i, b
- countBytes);
}
// set the bytes in the end weight at length + 1..length2
// to maxByte
b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)
| maxByte; // this used to be 0xffffffff
ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)
| (b >>> (i << 3)) & (b << ((4 - minLength) << 3));
// set the start of the second range to immediately follow
// the end of the first one
ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,
maxByte);
// set the count values (informational)
ranges[0].m_count_ = count1;
ranges[1].m_count_ = count2;
ranges[0].m_count2_ = (int) (count1 * power_1);
// will be *countBytes when lengthened
ranges[1].m_count2_ = (int) (count2 * power_1);
// lengthen the second range to maxLength
lengthenRange(ranges, 1, maxByte, countBytes);
}
break;
}
// no good match, lengthen all minLength ranges and iterate
for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
lengthenRange(ranges, i, maxByte, countBytes);
}
}
if (rangeCount > 1) {
// sort the ranges by weight values
Arrays.sort(ranges, 0, rangeCount);
}
// set maxByte in ranges[0] for ucol_nextWeight()
ranges[0].m_count_ = maxByte;
return rangeCount;
}
/**
* Updates the range length
*
* @param range
* weight range array
* @param offset
* to weight range array
* @param maxByte
* @param countBytes
* @return new length
*/
private static final int lengthenRange(WeightRange range[], int offset,
int maxByte, int countBytes) {
int length = range[offset].m_length2_ + 1;
range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,
RuleBasedCollator.BYTE_FIRST_TAILORED_);
range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,
maxByte);
range[offset].m_count2_ *= countBytes;
range[offset].m_length2_ = length;
return length;
}
    /**
     * Sets the weight trail: keeps the bytes before the given position, sets
     * the byte at the position to trail, and zeroes the rest
     *
     * @param weight
     * @param length
     * @param trail
     * @return new weight
     */
private static final int setWeightTrail(int weight, int length, int trail) {
length = (4 - length) << 3;
return (weight & (0xffffff00 << length)) | (trail << length);
}
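    // Editor's illustrative sketch (not part of the original ICU4J source):
    // setWeightTrail keeps byte 1, overwrites byte 2 and zeroes everything
    // after it.
    private static void demoSetWeightTrail() {
        assert setWeightTrail(0x05030201, 2, 0xFF) == 0x05FF0000;
    }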
    /**
     * Takes two CE weights and calculates the possible ranges of weights
     * between the two limits, excluding them. For weights with up to 4 bytes
     * there are up to 2*4-1 = 7 ranges.
     *
     * @param lowerLimit
     * @param upperLimit
     * @param maxByte
     * @param countBytes
     * @param ranges
     * @return number of weight ranges
     */
private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,
int countBytes, WeightRange ranges[]) {
// assume that both lowerLimit & upperLimit are not 0
// get the lengths of the limits
int lowerLength = lengthOfWeight(lowerLimit);
int upperLength = lengthOfWeight(upperLimit);
if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
return 0;
}
// check that neither is a prefix of the other
if (lowerLength < upperLength) {
if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
return 0;
}
}
// if the upper limit is a prefix of the lower limit then the earlier
// test lowerLimit >= upperLimit has caught it
// reset local variables
// With the limit lengths of 1..4, there are up to 7 ranges for
// allocation:
// range minimum length
// lower[4] 4
// lower[3] 3
// lower[2] 2
// middle 1
// upper[2] 2
// upper[3] 3
// upper[4] 4
// We are now going to calculate up to 7 ranges.
// Some of them will typically overlap, so we will then have to merge
// and eliminate ranges.
// We have to clean cruft from previous invocations
// before doing anything. C++ already does that
for (int length = 0; length < 5; length++) {
m_utilLowerWeightRange_[length].clear();
m_utilUpperWeightRange_[length].clear();
}
m_utilWeightRange_.clear();
int weight = lowerLimit;
for (int length = lowerLength; length >= 2; --length) {
m_utilLowerWeightRange_[length].clear();
int trail = getWeightByte(weight, length);
if (trail < maxByte) {
m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
weight, length);
m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight,
length, maxByte);
m_utilLowerWeightRange_[length].m_length_ = length;
m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;
}
weight = truncateWeight(weight, length - 1);
}
m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);
weight = upperLimit;
        // [0] and [1] of m_utilUpperWeightRange_ are not used - this
        // simplifies indexing
for (int length = upperLength; length >= 2; length--) {
int trail = getWeightByte(weight, length);
if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight,
length);
m_utilUpperWeightRange_[length].m_length_ = length;
m_utilUpperWeightRange_[length].m_count_ = trail
- RuleBasedCollator.BYTE_FIRST_TAILORED_;
}
weight = truncateWeight(weight, length - 1);
}
m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);
// set the middle range
m_utilWeightRange_.m_length_ = 1;
if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
m_utilWeightRange_.m_start_) >= 0) {
// if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {
m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
} else {
// eliminate overlaps
// remove the middle range
m_utilWeightRange_.m_count_ = 0;
// reduce or remove the lower ranges that go beyond upperLimit
for (int length = 4; length >= 2; --length) {
if (m_utilLowerWeightRange_[length].m_count_ > 0
&& m_utilUpperWeightRange_[length].m_count_ > 0) {
int start = m_utilUpperWeightRange_[length].m_start_;
int end = m_utilLowerWeightRange_[length].m_end_;
if (end >= start
|| incWeight(end, length, maxByte) == start) {
// lower and upper ranges collide or are directly
// adjacent: merge these two and remove all shorter
// ranges
start = m_utilLowerWeightRange_[length].m_start_;
end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
// merging directly adjacent ranges needs to subtract
// the 0/1 gaps in between;
// it may result in a range with count>countBytes
m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
end, length)
- getWeightByte(start, length)
+ 1
+ countBytes
* (getWeightByte(end, length - 1) - getWeightByte(
start, length - 1));
m_utilUpperWeightRange_[length].m_count_ = 0;
while (--length >= 2) {
m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
}
break;
}
}
}
}
// copy the ranges, shortest first, into the result array
int rangeCount = 0;
if (m_utilWeightRange_.m_count_ > 0) {
ranges[0] = new WeightRange(m_utilWeightRange_);
rangeCount = 1;
}
for (int length = 2; length <= 4; ++length) {
// copy upper first so that later the middle range is more likely
// the first one to use
if (m_utilUpperWeightRange_[length].m_count_ > 0) {
ranges[rangeCount] = new WeightRange(
m_utilUpperWeightRange_[length]);
++rangeCount;
}
if (m_utilLowerWeightRange_[length].m_count_ > 0) {
ranges[rangeCount] = new WeightRange(
m_utilLowerWeightRange_[length]);
++rangeCount;
}
}
return rangeCount;
}
/**
* Truncates the weight with length
*
* @param weight
* @param length
* @return truncated weight
*/
private static final int truncateWeight(int weight, int length) {
return weight & (0xffffffff << ((4 - length) << 3));
}
/**
* Length of the weight
*
* @param weight
* @return length of the weight
*/
private static final int lengthOfWeight(int weight) {
if ((weight & 0xffffff) == 0) {
return 1;
} else if ((weight & 0xffff) == 0) {
return 2;
} else if ((weight & 0xff) == 0) {
return 3;
}
return 4;
}
/**
* Increment the weight trail
*
* @param weight
* @param length
* @return new weight
*/
private static final int incWeightTrail(int weight, int length) {
return weight + (1 << ((4 - length) << 3));
}
/**
* Decrement the weight trail
*
* @param weight
* @param length
* @return new weight
*/
private static int decWeightTrail(int weight, int length) {
return weight - (1 << ((4 - length) << 3));
}
    /**
     * Finds the offset of a code point in a basic contraction table
     *
     * @param tbl
     *            contraction table
     * @param codePoint
     *            code point to look for
     * @return the offset to the code point, or -1 if it is not present
     */
private static int findCP(BasicContractionTable tbl, char codePoint) {
int position = 0;
while (codePoint > tbl.m_codePoints_.charAt(position)) {
position++;
            if (position >= tbl.m_codePoints_.length()) {
return -1;
}
}
if (codePoint == tbl.m_codePoints_.charAt(position)) {
return position;
} else {
return -1;
}
}
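    // Editor's illustrative sketch (not part of the original ICU4J source),
    // assuming BasicContractionTable's no-argument constructor leaves its
    // code point buffer and CE list empty, as addAContractionElement() above
    // relies on: findCP does a linear scan over the sorted code points and
    // returns -1 for anything not present.
    private static void demoFindCPBasic() {
        BasicContractionTable tbl = new BasicContractionTable();
        tbl.m_codePoints_.append('a').append('c');
        tbl.m_CEs_.add(Integer.valueOf(0x11111111));
        tbl.m_CEs_.add(Integer.valueOf(0x22222222));
        assert findCP(tbl, 'a') == 0;
        assert findCP(tbl, 'b') == -1; // between entries
        assert findCP(tbl, 'd') == -1; // past the end
    }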
/**
* Finds a contraction ce
*
* @param table
* @param element
* @param ch
* @return ce
*/
private static int findCE(ContractionTable table, int element, char ch) {
if (table == null) {
return CE_NOT_FOUND_;
}
BasicContractionTable tbl = getBasicContractionTable(table, element);
if (tbl == null) {
return CE_NOT_FOUND_;
}
int position = findCP(tbl, ch);
        if (position >= tbl.m_CEs_.size() || position < 0) {
return CE_NOT_FOUND_;
}
return tbl.m_CEs_.get(position).intValue();
}
/**
* Checks if the string is tailored in the contraction
*
* @param table
* contraction table
* @param element
* @param array
* character array to check
* @param offset
* array offset
* @return true if it is tailored
*/
private static boolean isTailored(ContractionTable table, int element,
char array[], int offset) {
while (array[offset] != 0) {
element = findCE(table, element, array[offset]);
if (element == CE_NOT_FOUND_) {
return false;
}
if (!isContractionTableElement(element)) {
return true;
}
offset++;
}
        return getCE(table, element, 0) != CE_NOT_FOUND_;
}
/**
* Assemble RuleBasedCollator
*
* @param t
* build table
* @param collator
* to update
*/
private void assembleTable(BuildTable t, RuleBasedCollator collator) {
IntTrieBuilder mapping = t.m_mapping_;
        List<Integer> expansions = t.m_expansions_;
ContractionTable contractions = t.m_contractions_;
MaxExpansionTable maxexpansion = t.m_maxExpansions_;
// contraction offset has to be in since we are building on the
// UCA contractions
// int beforeContractions = (HEADER_SIZE_
// + paddedsize(expansions.size() << 2)) >>> 1;
collator.m_contractionOffset_ = 0;
int contractionsSize = constructTable(contractions);
// the following operation depends on the trie data. Therefore, we have
// to do it before the trie is compacted
// sets jamo expansions
getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,
collator.m_isJamoSpecial_);
// TODO: LATIN1 array is now in the utrie - it should be removed from
// the calculation
setAttributes(collator, t.m_options_);
// copy expansions
int size = expansions.size();
collator.m_expansion_ = new int[size];
for (int i = 0; i < size; i++) {
collator.m_expansion_[i] = expansions.get(i).intValue();
}
// contractions block
if (contractionsSize != 0) {
// copy contraction index
collator.m_contractionIndex_ = new char[contractionsSize];
contractions.m_codePoints_.getChars(0, contractionsSize,
collator.m_contractionIndex_, 0);
// copy contraction collation elements
collator.m_contractionCE_ = new int[contractionsSize];
for (int i = 0; i < contractionsSize; i++) {
collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue();
}
}
// copy mapping table
collator.m_trie_ = mapping.serialize(t,
RuleBasedCollator.DataManipulate.getInstance());
// copy max expansion table
// not copying the first element which is a dummy
// to be in synch with icu4c's builder, we continue to use the
// expansion offset
// omitting expansion offset in builder
collator.m_expansionOffset_ = 0;
size = maxexpansion.m_endExpansionCE_.size();
collator.m_expansionEndCE_ = new int[size - 1];
for (int i = 1; i < size; i++) {
collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_
.get(i).intValue();
}
collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
for (int i = 1; i < size; i++) {
collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_
.get(i).byteValue();
}
// Unsafe chars table. Finish it off, then copy it.
unsafeCPAddCCNZ(t);
// Or in unsafebits from UCA, making a combined table.
for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
}
collator.m_unsafe_ = t.m_unsafeCP_;
// Finish building Contraction Ending chars hash table and then copy it
// out.
// Or in unsafebits from UCA, making a combined table
for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
}
collator.m_contractionEnd_ = t.m_contrEndCP_;
}
    /**
     * Sets this collator's attributes from the parsed rule options.
     *
     * @param collator
     *            collator whose attributes are to be set
     * @param option
     *            option set to take the attribute values from
     */
private static final void setAttributes(RuleBasedCollator collator,
CollationRuleParser.OptionSet option) {
collator.latinOneFailed_ = true;
collator.m_caseFirst_ = option.m_caseFirst_;
collator.setDecomposition(option.m_decomposition_);
collator
.setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
collator.setCaseLevel(option.m_isCaseLevel_);
collator.setFrenchCollation(option.m_isFrenchCollation_);
collator.m_isHiragana4_ = option.m_isHiragana4_;
collator.setStrength(option.m_strength_);
collator.m_variableTopValue_ = option.m_variableTopValue_;
collator.m_reorderCodes_ = option.m_scriptOrder_;
collator.latinOneFailed_ = false;
}
    /**
     * Constructs the contraction table
     *
     * @param table
     *            contraction table
     * @return the total number of collation elements in the constructed table
     */
private int constructTable(ContractionTable table) {
// See how much memory we need
int tsize = table.m_elements_.size();
if (tsize == 0) {
return 0;
}
table.m_offsets_.clear();
int position = 0;
for (int i = 0; i < tsize; i++) {
table.m_offsets_.add(Integer.valueOf(position));
position += table.m_elements_.get(i).m_CEs_
.size();
}
table.m_CEs_.clear();
table.m_codePoints_.delete(0, table.m_codePoints_.length());
// Now stuff the things in
StringBuilder cpPointer = table.m_codePoints_;
        List<Integer> CEPointer = table.m_CEs_;
for (int i = 0; i < tsize; i++) {
BasicContractionTable bct = table.m_elements_.get(i);
int size = bct.m_CEs_.size();
char ccMax = 0;
char ccMin = 255;
int offset = CEPointer.size();
CEPointer.add(bct.m_CEs_.get(0));
for (int j = 1; j < size; j++) {
char ch = bct.m_codePoints_.charAt(j);
char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
if (cc > ccMax) {
ccMax = cc;
}
if (cc < ccMin) {
ccMin = cc;
}
cpPointer.append(ch);
CEPointer.add(bct.m_CEs_.get(j));
}
            // high byte: 1 if all combining classes are equal; low byte: ccMax
            cpPointer.insert(offset,
                    (char) ((((ccMin == ccMax) ? 1 : 0) << 8) | ccMax));
for (int j = 0; j < size; j++) {
if (isContractionTableElement(CEPointer.get(offset + j).intValue())) {
int ce = CEPointer.get(offset + j).intValue();
CEPointer.set(offset + j,
Integer.valueOf(constructSpecialCE(getCETag(ce),
table.m_offsets_.get(getContractionOffset(ce))
.intValue())));
}
}
}
for (int i = 0; i <= 0x10FFFF; i++) {
int CE = table.m_mapping_.getValue(i);
if (isContractionTableElement(CE)) {
CE = constructSpecialCE(getCETag(CE),
table.m_offsets_.get(getContractionOffset(CE)).intValue());
table.m_mapping_.setValue(i, CE);
}
}
return position;
}
/**
* Get contraction offset
*
* @param ce
* collation element
* @return contraction offset
*/
private static final int getContractionOffset(int ce) {
return ce & 0xFFFFFF;
}
/**
* Gets the maximum Jamo expansion
*
* @param mapping
* trie table
* @param maxexpansion
* maximum expansion table
* @param maxjamoexpansion
* maximum jamo expansion table
* @param jamospecial
* is jamo special?
*/
private static void getMaxExpansionJamo(IntTrieBuilder mapping,
MaxExpansionTable maxexpansion,
MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
int VBASE = 0x1161;
int TBASE = 0x11A8;
int VCOUNT = 21;
int TCOUNT = 28;
int v = VBASE + VCOUNT - 1;
int t = TBASE + TCOUNT - 1;
while (v >= VBASE) {
int ce = mapping.getValue(v);
if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
setMaxExpansion(ce, (byte) 2, maxexpansion);
}
v--;
}
while (t >= TBASE) {
int ce = mapping.getValue(t);
if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
setMaxExpansion(ce, (byte) 3, maxexpansion);
}
t--;
}
// According to the docs, 99% of the time, the Jamo will not be special
if (jamospecial) {
// gets the max expansion in all unicode characters
int count = maxjamoexpansion.m_endExpansionCE_.size();
byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
+ maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
while (count > 0) {
count--;
                if (maxjamoexpansion.m_isV_.get(count).booleanValue()) {
setMaxExpansion(
(maxjamoexpansion.m_endExpansionCE_
.get(count)).intValue(), maxVSize,
maxexpansion);
} else {
setMaxExpansion(
(maxjamoexpansion.m_endExpansionCE_
.get(count)).intValue(), maxTSize,
maxexpansion);
}
}
}
}
/**
* To the UnsafeCP hash table, add all chars with combining class != 0
*
* @param t
* build table
*/
private final void unsafeCPAddCCNZ(BuildTable t) {
boolean buildCMTable = (buildCMTabFlag & (t.cmLookup == null));
char[] cm = null; // combining mark array
int[] index = new int[256];
int count = 0;
if (buildCMTable) {
cm = new char[0x10000];
}
for (char c = 0; c < 0xffff; c++) {
int fcd;
if (UTF16.isLeadSurrogate(c)) {
fcd = 0;
if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
int supp = Character.toCodePoint(c, (char)0xdc00);
int suppLimit = supp + 0x400;
while (supp < suppLimit) {
fcd |= m_nfcImpl_.getFCD16FromNormData(supp++);
}
}
} else {
fcd = m_nfcImpl_.getFCD16(c);
}
// TODO: review for handling supplementary characters
if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
(UTF16.isLeadSurrogate(c) && fcd != 0)) {
// c is a leading surrogate with some FCD data
unsafeCPSet(t.m_unsafeCP_, c);
if (buildCMTable) {
int cc = (fcd & 0xff);
int pos = (cc << 8) + index[cc];
cm[pos] = c;
index[cc]++;
count++;
}
}
}
if (t.m_prefixLookup_ != null) {
            Enumeration<Elements> els = Collections.enumeration(t.m_prefixLookup_
                    .values());
            while (els.hasMoreElements()) {
                Elements e = els.nextElement();
// codepoints here are in the NFD form. We need to add the
// first code point of the NFC form to unsafe, because
// strcoll needs to backup over them.
// weiv: This is wrong! See the comment above.
// String decomp = Normalizer.decompose(e.m_cPoints_, true);
// unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
// it should be:
String comp = Normalizer.compose(e.m_cPoints_, false);
unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
}
}
if (buildCMTable) {
t.cmLookup = new CombinClassTable();
t.cmLookup.generate(cm, count, index);
}
}
    /**
     * Enumerates a range of characters for canonical closure
     *
     * @param t
     *            build table
     * @param collator
     *            RuleBasedCollator
     * @param colEl
     *            collation element iterator
     * @param start
     *            codepoint range start
     * @param limit
     *            codepoint range limit
     * @param type
     *            character type
     * @return true, to continue the enumeration
     */
private boolean enumCategoryRangeClosureCategory(BuildTable t,
RuleBasedCollator collator, CollationElementIterator colEl,
int start, int limit, int type) {
if (type != UCharacterCategory.UNASSIGNED
&& type != UCharacterCategory.PRIVATE_USE) {
            // if the range is assigned - we might omit more categories later
for (int u32 = start; u32 < limit; u32++) {
String decomp = m_nfcImpl_.getDecomposition(u32);
if (decomp != null) {
String comp = UCharacter.toString(u32);
if (!collator.equals(comp, decomp)) {
m_utilElement_.m_cPoints_ = decomp;
m_utilElement_.m_prefix_ = 0;
Elements prefix = t.m_prefixLookup_.get(m_utilElement_);
if (prefix == null) {
m_utilElement_.m_cPoints_ = comp;
m_utilElement_.m_prefix_ = 0;
m_utilElement_.m_prefixChars_ = null;
colEl.setText(decomp);
int ce = colEl.next();
m_utilElement_.m_CELength_ = 0;
while (ce != CollationElementIterator.NULLORDER) {
m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
ce = colEl.next();
}
} else {
m_utilElement_.m_cPoints_ = comp;
m_utilElement_.m_prefix_ = 0;
m_utilElement_.m_prefixChars_ = null;
m_utilElement_.m_CELength_ = 1;
m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
                        // This character uses a prefix. We have to add it
                        // to the unsafe table, as its decomposed form is
                        // already in. In Japanese, this happens for \u309e
                        // & \u30fe
// Since unsafeCPSet is static in ucol_elm, we are
// going to wrap it up in the unsafeCPAddCCNZ
// function
}
addAnElement(t, m_utilElement_);
}
}
}
}
return true;
}
/**
* Determine if a character is a Jamo
*
* @param ch
* character to test
* @return true if ch is a Jamo, false otherwise
*/
private static final boolean isJamo(char ch) {
        return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1161 && ch <= 0x1175)
|| (ch >= 0x11A8 && ch <= 0x11C2);
}
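    // Editor's illustrative sketch (not part of the original ICU4J source):
    // the three ranges above are the conjoining leading consonants (L),
    // vowels (V) and trailing consonants (T); precomposed Hangul syllables
    // are not Jamo.
    private static void demoIsJamo() {
        assert isJamo('\u1100'); // CHOSEONG KIYEOK (L)
        assert isJamo('\u1161'); // JUNGSEONG A (V)
        assert isJamo('\u11A8'); // JONGSEONG KIYEOK (T)
        assert !isJamo('\uAC00'); // syllable GA
    }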
/**
* Produces canonical closure
*/
private void canonicalClosure(BuildTable t) {
BuildTable temp = new BuildTable(t);
assembleTable(temp, temp.m_collator_);
// produce canonical closure
CollationElementIterator coleiter = temp.m_collator_
.getCollationElementIterator("");
RangeValueIterator typeiter = UCharacter.getTypeIterator();
RangeValueIterator.Element element = new RangeValueIterator.Element();
while (typeiter.next(element)) {
enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,
element.start, element.limit, element.value);
}
t.cmLookup = temp.cmLookup;
temp.cmLookup = null;
for (int i = 0; i < m_parser_.m_resultLength_; i++) {
char baseChar, firstCM;
            // now we need to generate the CEs
            // We stuff the initial value in the buffers, and increase the
            // appropriate buffer according to strength
            // createElements(t, m_parser_.m_listHeader_[i]);
CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;
m_utilElement_.clear();
while (tok != null) {
m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
if (tok.m_prefix_ != 0) {
                    // we will just copy the prefix here, and adjust
                    // accordingly in the addPrefix function in ucol_elm. The
                    // reason is that we need to add both composed AND
                    // decomposed elements to the unsafe table.
int size = tok.m_prefix_ >> 24;
int offset = tok.m_prefix_ & 0x00FFFFFF;
m_utilElement_.m_prefixChars_ = m_parser_.m_source_
.substring(offset, offset + size);
size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
offset = (tok.m_source_ & 0x00FFFFFF)
+ (tok.m_prefix_ >> 24);
m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
offset, offset + size);
} else {
m_utilElement_.m_prefixChars_ = null;
int offset = tok.m_source_ & 0x00FFFFFF;
int size = tok.m_source_ >>> 24;
m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
offset, offset + size);
}
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
baseChar = firstCM = 0; // reset
for (int j = 0; j < m_utilElement_.m_cPoints_.length()
- m_utilElement_.m_cPointsOffset_; j++) {
int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters
if ((fcd & 0xff) == 0) {
baseChar = m_utilElement_.m_cPoints_.charAt(j);
} else {
                    if ((baseChar != 0) && (firstCM == 0)) {
                        // first combining mark
                        firstCM = m_utilElement_.m_cPoints_.charAt(j);
                    }
}
}
if ((baseChar != 0) && (firstCM != 0)) {
addTailCanonicalClosures(t, temp.m_collator_, coleiter,
baseChar, firstCM);
}
tok = tok.m_next_;
}
}
}
private void addTailCanonicalClosures(BuildTable t,
RuleBasedCollator m_collator, CollationElementIterator colEl,
char baseChar, char cMark) {
if (t.cmLookup == null) {
return;
}
CombinClassTable cmLookup = t.cmLookup;
int[] index = cmLookup.index;
int cClass = m_nfcImpl_.getFCD16(cMark) & 0xff; // TODO: review for handling supplementary characters
int maxIndex = 0;
char[] precompCh = new char[256];
int[] precompClass = new int[256];
int precompLen = 0;
Elements element = new Elements();
if (cClass > 0) {
maxIndex = index[cClass - 1];
}
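// Only the combining marks up to index[cClass - 1] in the
// combining-class lookup table are considered for composition.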
for (int i = 0; i < maxIndex; i++) {
StringBuilder decompBuf = new StringBuilder();
decompBuf.append(baseChar).append(cmLookup.cPoints[i]);
String comp = Normalizer.compose(decompBuf.toString(), false);
if (comp.length() == 1) {
precompCh[precompLen] = comp.charAt(0);
precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff; // TODO: review for handling supplementary characters
precompLen++;
StringBuilder decomp = new StringBuilder();
for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {
if (m_utilElement_.m_cPoints_.charAt(j) == cMark) {
decomp.append(cmLookup.cPoints[i]);
} else {
decomp.append(m_utilElement_.m_cPoints_.charAt(j));
}
}
comp = Normalizer.compose(decomp.toString(), false);
StringBuilder buf = new StringBuilder(comp);
buf.append(cMark);
decomp.append(cMark);
comp = buf.toString();
element.m_cPoints_ = decomp.toString();
element.m_CELength_ = 0;
element.m_prefix_ = 0;
Elements prefix = t.m_prefixLookup_.get(element);
element.m_cPoints_ = comp;
element.m_uchars_ = comp;
if (prefix == null) {
element.m_prefix_ = 0;
element.m_prefixChars_ = null;
colEl.setText(decomp.toString());
int ce = colEl.next();
element.m_CELength_ = 0;
while (ce != CollationElementIterator.NULLORDER) {
element.m_CEs_[element.m_CELength_++] = ce;
ce = colEl.next();
}
} else {
element.m_cPoints_ = comp;
element.m_prefix_ = 0;
element.m_prefixChars_ = null;
element.m_CELength_ = 1;
element.m_CEs_[0] = prefix.m_mapCE_;
}
setMapCE(t, element);
finalizeAddition(t, element);
if (comp.length() > 2) {
// This is a fix for tailoring contractions with an accented
// character at the end of the contraction string.
addFCD4AccentedContractions(t, colEl, comp, element);
}
if (precompLen > 1) {
precompLen = addMultiCMontractions(t, colEl, element,
precompCh, precompClass, precompLen, cMark, i,
decomp.toString());
}
}
}
}
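/**
 * Computes and stores the mapped CE for an element from its list of
 * CEs. A two-CE expansion whose continuation holds only primary
 * weights, with common secondary and tertiary weights, is packed into
 * a single long-primary special CE built from the three primary
 * bytes; any other element is stored as an expansion CE that
 * references the expansion table.
 *
 * @param t
 *            build table
 * @param element
 *            element whose m_mapCE_ is to be set
 */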
private void setMapCE(BuildTable t, Elements element) {
List expansions = t.m_expansions_;
element.m_mapCE_ = 0;
// A two-CE expansion whose continuation carries only primary
// weights, with a common secondary and a common tertiary, is
// packed into a single long-primary special CE.
if (element.m_CELength_ == 2
&& RuleBasedCollator.isContinuation(element.m_CEs_[1])
&& (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0
&& (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
&& ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) {
element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
// a long primary special
| (CE_LONG_PRIMARY_TAG_ << 24)
// first and second bytes of the primary
| ((element.m_CEs_[0] >> 8) & 0xFFFF00)
// third byte of the primary, taken from the continuation
| ((element.m_CEs_[1] >> 24) & 0xFF);
} else {
// omitting expansion offset in builder
// (HEADER_SIZE_ >> 2)
int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
| (addExpansion(expansions, element.m_CEs_[0]) << 4)
& 0xFFFFF0;
for (int i = 1; i < element.m_CELength_; i++) {
addExpansion(expansions, element.m_CEs_[i]);
}
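// The low 4 bits of the expansion CE store its length; expansions
// longer than 0xF are zero-terminated in the expansion table instead.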
if (element.m_CELength_ <= 0xF) {
expansion |= element.m_CELength_;
} else {
addExpansion(expansions, 0);
}
element.m_mapCE_ = expansion;
setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
(byte) element.m_CELength_, t.m_maxExpansions_);
}
}
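/**
 * Builds composed elements for contractions that involve multiple
 * combining marks. Each precomposed character collected so far is
 * recombined with the current combining mark (and, where the
 * combining classes match, with the saved decomposition); any
 * resulting single precomposed character is added to the build table
 * and appended to precompCh/precompClass for further rounds.
 *
 * @return the updated number of precomposed characters collected
 */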
private int addMultiCMontractions(BuildTable t,
CollationElementIterator colEl, Elements element, char[] precompCh,
int[] precompClass, int maxComp, char cMark, int cmPos,
String decomp) {
CombinClassTable cmLookup = t.cmLookup;
char[] combiningMarks = { cMark };
int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF;
String comMark = new String(combiningMarks);
int noOfPrecomposedChs = maxComp;
for (int j = 0; j < maxComp; j++) {
int count = 0;
StringBuilder temp;
do {
String newDecomp, comp;
if (count == 0) { // Decompose the saved precomposed char.
newDecomp = Normalizer.decompose(
new String(precompCh, j, 1), false);
temp = new StringBuilder(newDecomp);
temp.append(cmLookup.cPoints[cmPos]);
newDecomp = temp.toString();
} else {
temp = new StringBuilder(decomp);
temp.append(precompCh[j]);
newDecomp = temp.toString();
}
comp = Normalizer.compose(newDecomp, false);
if (comp.length() == 1) {
temp.append(cMark);
element.m_cPoints_ = temp.toString();
element.m_CELength_ = 0;
element.m_prefix_ = 0;
Elements prefix = t.m_prefixLookup_.get(element);
element.m_cPoints_ = comp + comMark;
if (prefix == null) {
element.m_prefix_ = 0;
element.m_prefixChars_ = null;
colEl.setText(temp.toString());
int ce = colEl.next();
element.m_CELength_ = 0;
while (ce != CollationElementIterator.NULLORDER) {
element.m_CEs_[element.m_CELength_++] = ce;
ce = colEl.next();
}
} else {
element.m_cPoints_ = comp;
element.m_prefix_ = 0;
element.m_prefixChars_ = null;
element.m_CELength_ = 1;
element.m_CEs_[0] = prefix.m_mapCE_;
}
setMapCE(t, element);
finalizeAddition(t, element);
precompCh[noOfPrecomposedChs] = comp.charAt(0);
precompClass[noOfPrecomposedChs] = cMarkClass;
noOfPrecomposedChs++;
}
} while (++count < 2 && (precompClass[j] == cMarkClass));
}
return noOfPrecomposedChs;
}
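/**
 * Maps the composed form of a contraction that ends with an accented
 * character to the collation elements of its decomposed form, so that
 * both forms collate identically. Nothing is added if a prefix entry
 * already exists for the decomposed form.
 *
 * @param t
 *            build table
 * @param colEl
 *            collation element iterator
 * @param data
 *            contraction string
 * @param element
 *            working element, modified in place
 */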
private void addFCD4AccentedContractions(BuildTable t,
CollationElementIterator colEl, String data, Elements element) {
String decomp = Normalizer.decompose(data, false);
String comp = Normalizer.compose(data, false);
element.m_cPoints_ = decomp;
element.m_CELength_ = 0;
element.m_prefix_ = 0;
Elements prefix = t.m_prefixLookup_.get(element);
if (prefix == null) {
element.m_cPoints_ = comp;
element.m_prefix_ = 0;
element.m_prefixChars_ = null;
element.m_CELength_ = 0;
colEl.setText(decomp);
int ce = colEl.next();
while (ce != CollationElementIterator.NULLORDER) {
element.m_CEs_[element.m_CELength_++] = ce;
ce = colEl.next();
}
addAnElement(t, element);
}
}
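/**
 * Walks the UCA trie and, for each code point that is completely
 * ignorable in the UCA (trie value 0) but has no mapping in this
 * tailoring, adds a completely ignorable element (a single zero CE)
 * to the build table.
 *
 * @param t
 *            build table
 */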
private void processUCACompleteIgnorables(BuildTable t) {
TrieIterator trieiterator = new TrieIterator(
RuleBasedCollator.UCA_.m_trie_);
RangeValueIterator.Element element = new RangeValueIterator.Element();
while (trieiterator.next(element)) {
int start = element.start;
int limit = element.limit;
if (element.value == 0) {
while (start < limit) {
int CE = t.m_mapping_.getValue(start);
if (CE == CE_NOT_FOUND_) {
m_utilElement_.m_prefix_ = 0;
m_utilElement_.m_uchars_ = UCharacter.toString(start);
m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
m_utilElement_.m_cPointsOffset_ = 0;
m_utilElement_.m_CELength_ = 1;
m_utilElement_.m_CEs_[0] = 0;
addAnElement(t, m_utilElement_);
}
start++;
}
}
}
}
}