All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.impl.coll.CollationDataBuilder Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

The newest version!
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
*
* C++ version created on: 2012apr01
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;

import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Normalizer2Impl.Hangul;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.impl.Trie2Writable;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrieBuilder;
import com.ibm.icu.util.StringTrieBuilder;

/**
 * Low-level CollationData builder.
 * Takes (character, CE) pairs and builds them into runtime data structures.
 * Supports characters with context prefixes and contraction suffixes.
 */
final class CollationDataBuilder {  // not final in C++
    /**
     * Collation element modifier. Interface class for a modifier
     * that changes a tailoring builder's temporary CEs to final CEs.
     * Called for every non-special CE32 and every expansion CE.
     */
    interface CEModifier {
        /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
        long modifyCE32(int ce32);
        /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
        long modifyCE(long ce);
    }

    CollationDataBuilder() {
        nfcImpl = Norm2AllModes.getNFCInstance().impl;
        base = null;
        baseSettings = null;
        trie = null;
        ce32s = new UVector32();
        ce64s = new UVector64();
        conditionalCE32s = new ArrayList<>();
        modified = false;
        fastLatinEnabled = false;
        fastLatinBuilder = null;
        collIter = null;
        // Reserve the first CE32 for U+0000.
        ce32s.addElement(0);
    }

    void initForTailoring(CollationData b) {
        if(trie != null) {
            throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
        }
        if(b == null) {
            throw new IllegalArgumentException("null CollationData");
        }
        base = b;

        // For a tailoring, the default is to fall back to the base.
        trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);

        // Set the Latin-1 letters block so that it is allocated first in the data array,
        // to try to improve locality of reference when sorting Latin-1 text.
        // Do not use utrie2_setRange32() since that will not actually allocate blocks
        // that are filled with the default value.
        // ASCII (0..7F) is already preallocated anyway.
        for(int c = 0xc0; c <= 0xff; ++c) {
            trie.set(c, Collation.FALLBACK_CE32);
        }

        // Hangul syllables are not tailorable (except via tailoring Jamos).
        // Always set the Hangul tag to help performance.
        // Do this here, rather than in buildMappings(),
        // so that we see the HANGUL_TAG in various assertions.
        int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
        trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);

        // Copy the set contents but don't copy/clone the set as a whole because
        // that would copy the isFrozen state too.
        unsafeBackwardSet.addAll(b.unsafeBackwardSet);
    }

    boolean isCompressibleLeadByte(int b) {
        return base.isCompressibleLeadByte(b);
    }

    boolean isCompressiblePrimary(long p) {
        return isCompressibleLeadByte((int)p >>> 24);
    }

    /**
     * @return true if this builder has mappings (e.g., add() has been called)
     */
    boolean hasMappings() { return modified; }

    /**
     * @return true if c has CEs in this builder
     */
    boolean isAssigned(int c) {
        return Collation.isAssignedCE32(trie.get(c));
    }

    void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
        int ce32 = encodeCEs(ces, cesLength);
        addCE32(prefix, s, ce32);
    }

    /**
     * Encodes the ces as either the returned ce32 by itself,
     * or by storing an expansion, with the returned ce32 referring to that.
     *
     * 

add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) */ int encodeCEs(long ces[], int cesLength) { if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) { throw new IllegalArgumentException("mapping to too many CEs"); } if(!isMutable()) { throw new IllegalStateException("attempt to add mappings after build()"); } if(cesLength == 0) { // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. // Do this here so that callers need not do it. return encodeOneCEAsCE32(0); } else if(cesLength == 1) { return encodeOneCE(ces[0]); } else if(cesLength == 2) { // Try to encode two CEs as one CE32. long ce0 = ces[0]; long ce1 = ces[1]; long p0 = ce0 >>> 32; if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE && (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE && p0 != 0) { // Latin mini expansion return (int)p0 | (((int)ce0 & 0xff00) << 8) | (((int)ce1 >> 16) & 0xff00) | Collation.SPECIAL_CE32_LOW_BYTE | Collation.LATIN_EXPANSION_TAG; } } // Try to encode two or more CEs as CE32s. int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH]; // TODO: instance field? for(int i = 0;; ++i) { if(i == cesLength) { return encodeExpansion32(newCE32s, 0, cesLength); } int ce32 = encodeOneCEAsCE32(ces[i]); if(ce32 == Collation.NO_CE32) { break; } newCE32s[i] = ce32; } return encodeExpansion(ces, 0, cesLength); } void addCE32(CharSequence prefix, CharSequence s, int ce32) { if(s.length() == 0) { throw new IllegalArgumentException("mapping from empty string"); } if(!isMutable()) { throw new IllegalStateException("attempt to add mappings after build()"); } int c = Character.codePointAt(s, 0); int cLength = Character.charCount(c); int oldCE32 = trie.get(c); boolean hasContext = prefix.length() != 0|| s.length() > cLength; if(oldCE32 == Collation.FALLBACK_CE32) { // First tailoring for c. // If c has contextual base mappings or if we add a contextual mapping, // then copy the base mappings. // Otherwise we just override the base mapping. int baseCE32 = base.getFinalCE32(base.getCE32(c)); if(hasContext || Collation.ce32HasContext(baseCE32)) { oldCE32 = copyFromBaseCE32(c, baseCE32, true); trie.set(c, oldCE32); } } if(!hasContext) { // No prefix, no contraction. if(!isBuilderContextCE32(oldCE32)) { trie.set(c, ce32); } else { ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32); cond.builtCE32 = Collation.NO_CE32; cond.ce32 = ce32; } } else { ConditionalCE32 cond; if(!isBuilderContextCE32(oldCE32)) { // Replace the simple oldCE32 with a builder context CE32 // pointing to a new ConditionalCE32 list head. int index = addConditionalCE32("\0", oldCE32); int contextCE32 = makeBuilderContextCE32(index); trie.set(c, contextCE32); contextChars.add(c); cond = getConditionalCE32(index); } else { cond = getConditionalCE32ForCE32(oldCE32); cond.builtCE32 = Collation.NO_CE32; } CharSequence suffix = s.subSequence(cLength, s.length()); String context = new StringBuilder().append((char)prefix.length()). append(prefix).append(suffix).toString(); unsafeBackwardSet.addAll(suffix); for(;;) { // invariant: context > cond.context int next = cond.next; if(next < 0) { // Append a new ConditionalCE32 after cond. int index = addConditionalCE32(context, ce32); cond.next = index; break; } ConditionalCE32 nextCond = getConditionalCE32(next); int cmp = context.compareTo(nextCond.context); if(cmp < 0) { // Insert a new ConditionalCE32 between cond and nextCond. int index = addConditionalCE32(context, ce32); cond.next = index; getConditionalCE32(index).next = next; break; } else if(cmp == 0) { // Same context as before, overwrite its ce32. nextCond.ce32 = ce32; break; } cond = nextCond; } } modified = true; } /** * Copies all mappings from the src builder, with modifications. * This builder here must not be built yet, and should be empty. */ void copyFrom(CollationDataBuilder src, CEModifier modifier) { if(!isMutable()) { throw new IllegalStateException("attempt to copyFrom() after build()"); } CopyHelper helper = new CopyHelper(src, this, modifier); Iterator trieIterator = src.trie.iterator(); Trie2.Range range; while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper); } // Update the contextChars and the unsafeBackwardSet while copying, // in case a character had conditional mappings in the source builder // and they were removed later. modified |= src.modified; } void optimize(UnicodeSet set) { if(set.isEmpty()) { return; } UnicodeSetIterator iter = new UnicodeSetIterator(set); while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { int c = iter.codepoint; int ce32 = trie.get(c); if(ce32 == Collation.FALLBACK_CE32) { ce32 = base.getFinalCE32(base.getCE32(c)); ce32 = copyFromBaseCE32(c, ce32, true); trie.set(c, ce32); } } modified = true; } void suppressContractions(UnicodeSet set) { if(set.isEmpty()) { return; } UnicodeSetIterator iter = new UnicodeSetIterator(set); while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { int c = iter.codepoint; int ce32 = trie.get(c); if(ce32 == Collation.FALLBACK_CE32) { ce32 = base.getFinalCE32(base.getCE32(c)); if(Collation.ce32HasContext(ce32)) { ce32 = copyFromBaseCE32(c, ce32, false /* without context */); trie.set(c, ce32); } } else if(isBuilderContextCE32(ce32)) { ce32 = getConditionalCE32ForCE32(ce32).ce32; // Simply abandon the list of ConditionalCE32. // The caller will copy this builder in the end, // eliminating unreachable data. trie.set(c, ce32); contextChars.remove(c); } } modified = true; } void enableFastLatin() { fastLatinEnabled = true; } void build(CollationData data) { buildMappings(data); if(base != null) { data.numericPrimary = base.numericPrimary; data.compressibleBytes = base.compressibleBytes; data.numScripts = base.numScripts; data.scriptsIndex = base.scriptsIndex; data.scriptStarts = base.scriptStarts; } buildFastLatinTable(data); } /** * Looks up CEs for s and appends them to the ces array. * Does not handle normalization: s should be in FCD form. * * Does not write completely ignorable CEs. * Does not write beyond Collation.MAX_EXPANSION_LENGTH. * * @return incremented cesLength */ int getCEs(CharSequence s, long ces[], int cesLength) { return getCEs(s, 0, ces, cesLength); } int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) { int prefixLength = prefix.length(); if(prefixLength == 0) { return getCEs(s, 0, ces, cesLength); } else { return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength); } } /** * Build-time context and CE32 for a code point. * If a code point has contextual mappings, then the default (no-context) mapping * and all conditional mappings are stored in a singly-linked list * of ConditionalCE32, sorted by context strings. * * Context strings sort by prefix length, then by prefix, then by contraction suffix. * Context strings must be unique and in ascending order. */ private static final class ConditionalCE32 { ConditionalCE32(String ct, int ce) { context = ct; ce32 = ce; defaultCE32 = Collation.NO_CE32; builtCE32 = Collation.NO_CE32; next = -1; } boolean hasContext() { return context.length() > 1; } int prefixLength() { return context.charAt(0); } /** * "\0" for the first entry for any code point, with its default CE32. * * Otherwise one unit with the length of the prefix string, * then the prefix string, then the contraction suffix. */ String context; /** * CE32 for the code point and its context. * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). */ int ce32; /** * Default CE32 for all contexts with this same prefix. * Initially NO_CE32. Set only while building runtime data structures, * and only on one of the nodes of a sub-list with the same prefix. */ int defaultCE32; /** * CE32 for the built contexts. * When fetching CEs from the builder, the contexts are built into their runtime form * so that the normal collation implementation can process them. * The result is cached in the list head. It is reset when the contexts are modified. * All of these builtCE32 are invalidated by clearContexts(), * via incrementing the contextsEra. */ int builtCE32; /** * The "era" of building intermediate contexts when the above builtCE32 was set. * When the array of cached, temporary contexts overflows, then clearContexts() * removes them all and invalidates the builtCE32 that used to point to built tries. */ int era = -1; /** * Index of the next ConditionalCE32. * Negative for the end of the list. */ int next; // Note: We could create a separate class for all of the contextual mappings for // a code point, with the builtCE32, the era, and a list of the actual mappings. // The class that represents one mapping would then not need to // store those fields in each element. } protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) { int i = Collation.indexFromCE32(ce32); long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i); long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); return Collation.makeLongPrimaryCE32(p); } protected int addCE(long ce) { int length = ce64s.size(); for(int i = 0; i < length; ++i) { if(ce == ce64s.elementAti(i)) { return i; } } ce64s.addElement(ce); return length; } protected int addCE32(int ce32) { int length = ce32s.size(); for(int i = 0; i < length; ++i) { if(ce32 == ce32s.elementAti(i)) { return i; } } ce32s.addElement(ce32); return length; } protected int addConditionalCE32(String context, int ce32) { assert(context.length() != 0); int index = conditionalCE32s.size(); if(index > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many context-sensitive mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } ConditionalCE32 cond = new ConditionalCE32(context, ce32); conditionalCE32s.add(cond); return index; } protected ConditionalCE32 getConditionalCE32(int index) { return conditionalCE32s.get(index); } protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) { return getConditionalCE32(Collation.indexFromCE32(ce32)); } protected static int makeBuilderContextCE32(int index) { return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index); } protected static boolean isBuilderContextCE32(int ce32) { return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG); } protected static int encodeOneCEAsCE32(long ce) { long p = ce >>> 32; int lower32 = (int)ce; int t = lower32 & 0xffff; assert((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. if((ce & 0xffff00ff00ffL) == 0) { // normal form ppppsstt return (int)p | (lower32 >>> 16) | (t >> 8); } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) { // long-primary form ppppppC1 return Collation.makeLongPrimaryCE32(p); } else if(p == 0 && (t & 0xff) == 0) { // long-secondary form ssssttC2 return Collation.makeLongSecondaryCE32(lower32); } return Collation.NO_CE32; } protected int encodeOneCE(long ce) { // Try to encode one CE as one CE32. int ce32 = encodeOneCEAsCE32(ce); if(ce32 != Collation.NO_CE32) { return ce32; } int index = addCE(ce); if(index > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1); } protected int encodeExpansion(long ces[], int start, int length) { // See if this sequence of CEs has already been stored. long first = ces[start]; int ce64sMax = ce64s.size() - length; for(int i = 0; i <= ce64sMax; ++i) { if(first == ce64s.elementAti(i)) { if(i > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } for(int j = 1;; ++j) { if(j == length) { return Collation.makeCE32FromTagIndexAndLength( Collation.EXPANSION_TAG, i, length); } if(ce64s.elementAti(i + j) != ces[start + j]) { break; } } } } // Store the new sequence. int i = ce64s.size(); if(i > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } for(int j = 0; j < length; ++j) { ce64s.addElement(ces[start + j]); } return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length); } protected int encodeExpansion32(int newCE32s[], int start, int length) { // See if this sequence of CE32s has already been stored. int first = newCE32s[start]; int ce32sMax = ce32s.size() - length; for(int i = 0; i <= ce32sMax; ++i) { if(first == ce32s.elementAti(i)) { if(i > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } for(int j = 1;; ++j) { if(j == length) { return Collation.makeCE32FromTagIndexAndLength( Collation.EXPANSION32_TAG, i, length); } if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; } } } } // Store the new sequence. int i = ce32s.size(); if(i > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } for(int j = 0; j < length; ++j) { ce32s.addElement(newCE32s[start + j]); } return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length); } protected int copyFromBaseCE32(int c, int ce32, boolean withContext) { if(!Collation.isSpecialCE32(ce32)) { return ce32; } switch(Collation.tagFromCE32(ce32)) { case Collation.LONG_PRIMARY_TAG: case Collation.LONG_SECONDARY_TAG: case Collation.LATIN_EXPANSION_TAG: // copy as is break; case Collation.EXPANSION32_TAG: { int index = Collation.indexFromCE32(ce32); int length = Collation.lengthFromCE32(ce32); ce32 = encodeExpansion32(base.ce32s, index, length); break; } case Collation.EXPANSION_TAG: { int index = Collation.indexFromCE32(ce32); int length = Collation.lengthFromCE32(ce32); ce32 = encodeExpansion(base.ces, index, length); break; } case Collation.PREFIX_TAG: { // Flatten prefixes and nested suffixes (contractions) // into a linear list of ConditionalCE32. int trieIndex = Collation.indexFromCE32(ce32); ce32 = base.getCE32FromContexts(trieIndex); // Default if no prefix match. if(!withContext) { return copyFromBaseCE32(c, ce32, false); } ConditionalCE32 head = new ConditionalCE32("", 0); StringBuilder context = new StringBuilder("\0"); int index; if(Collation.isContractionCE32(ce32)) { index = copyContractionsFromBaseCE32(context, c, ce32, head); } else { ce32 = copyFromBaseCE32(c, ce32, true); head.next = index = addConditionalCE32(context.toString(), ce32); } ConditionalCE32 cond = getConditionalCE32(index); // the last ConditionalCE32 so far CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); while(prefixes.hasNext()) { CharsTrie.Entry entry = prefixes.next(); context.setLength(0); context.append(entry.chars).reverse().insert(0, (char)entry.chars.length()); ce32 = entry.value; if(Collation.isContractionCE32(ce32)) { index = copyContractionsFromBaseCE32(context, c, ce32, cond); } else { ce32 = copyFromBaseCE32(c, ce32, true); cond.next = index = addConditionalCE32(context.toString(), ce32); } cond = getConditionalCE32(index); } ce32 = makeBuilderContextCE32(head.next); contextChars.add(c); break; } case Collation.CONTRACTION_TAG: { if(!withContext) { int index = Collation.indexFromCE32(ce32); ce32 = base.getCE32FromContexts(index); // Default if no suffix match. return copyFromBaseCE32(c, ce32, false); } ConditionalCE32 head = new ConditionalCE32("", 0); StringBuilder context = new StringBuilder("\0"); copyContractionsFromBaseCE32(context, c, ce32, head); ce32 = makeBuilderContextCE32(head.next); contextChars.add(c); break; } case Collation.HANGUL_TAG: throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables."); case Collation.OFFSET_TAG: ce32 = getCE32FromOffsetCE32(true, c, ce32); break; case Collation.IMPLICIT_TAG: ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c)); break; default: throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " + "requires ce32 == base.getFinalCE32(ce32)"); } return ce32; } /** * Copies base contractions to a list of ConditionalCE32. * Sets cond.next to the index of the first new item * and returns the index of the last new item. */ protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32, ConditionalCE32 cond) { int trieIndex = Collation.indexFromCE32(ce32); int index; if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { // No match on the single code point. // We are underneath a prefix, and the default mapping is just // a fallback to the mappings for a shorter prefix. assert(context.length() > 1); index = -1; } else { ce32 = base.getCE32FromContexts(trieIndex); // Default if no suffix match. assert(!Collation.isContractionCE32(ce32)); ce32 = copyFromBaseCE32(c, ce32, true); cond.next = index = addConditionalCE32(context.toString(), ce32); cond = getConditionalCE32(index); } int suffixStart = context.length(); CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); while(suffixes.hasNext()) { CharsTrie.Entry entry = suffixes.next(); context.append(entry.chars); ce32 = copyFromBaseCE32(c, entry.value, true); cond.next = index = addConditionalCE32(context.toString(), ce32); // No need to update the unsafeBackwardSet because the tailoring set // is already a copy of the base set. cond = getConditionalCE32(index); context.setLength(suffixStart); } assert(index >= 0); return index; } private static final class CopyHelper { CopyHelper(CollationDataBuilder s, CollationDataBuilder d, CollationDataBuilder.CEModifier m) { src = s; dest = d; modifier = m; } void copyRangeCE32(int start, int end, int ce32) { ce32 = copyCE32(ce32); dest.trie.setRange(start, end, ce32, true); if(CollationDataBuilder.isBuilderContextCE32(ce32)) { dest.contextChars.add(start, end); } } int copyCE32(int ce32) { if(!Collation.isSpecialCE32(ce32)) { long ce = modifier.modifyCE32(ce32); if(ce != Collation.NO_CE) { ce32 = dest.encodeOneCE(ce); } } else { int tag = Collation.tagFromCE32(ce32); if(tag == Collation.EXPANSION32_TAG) { int[] srcCE32s = src.ce32s.getBuffer(); int srcIndex = Collation.indexFromCE32(ce32); int length = Collation.lengthFromCE32(ce32); // Inspect the source CE32s. Just copy them if none are modified. // Otherwise copy to modifiedCEs, with modifications. boolean isModified = false; for(int i = 0; i < length; ++i) { ce32 = srcCE32s[srcIndex + i]; long ce; if(Collation.isSpecialCE32(ce32) || (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) { if(isModified) { modifiedCEs[i] = Collation.ceFromCE32(ce32); } } else { if(!isModified) { for(int j = 0; j < i; ++j) { modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]); } isModified = true; } modifiedCEs[i] = ce; } } if(isModified) { ce32 = dest.encodeCEs(modifiedCEs, length); } else { ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length); } } else if(tag == Collation.EXPANSION_TAG) { long[] srcCEs = src.ce64s.getBuffer(); int srcIndex = Collation.indexFromCE32(ce32); int length = Collation.lengthFromCE32(ce32); // Inspect the source CEs. Just copy them if none are modified. // Otherwise copy to modifiedCEs, with modifications. boolean isModified = false; for(int i = 0; i < length; ++i) { long srcCE = srcCEs[srcIndex + i]; long ce = modifier.modifyCE(srcCE); if(ce == Collation.NO_CE) { if(isModified) { modifiedCEs[i] = srcCE; } } else { if(!isModified) { for(int j = 0; j < i; ++j) { modifiedCEs[j] = srcCEs[srcIndex + j]; } isModified = true; } modifiedCEs[i] = ce; } } if(isModified) { ce32 = dest.encodeCEs(modifiedCEs, length); } else { ce32 = dest.encodeExpansion(srcCEs, srcIndex, length); } } else if(tag == Collation.BUILDER_DATA_TAG) { // Copy the list of ConditionalCE32. ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32); assert(!cond.hasContext()); int destIndex = dest.addConditionalCE32( cond.context, copyCE32(cond.ce32)); ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex); while(cond.next >= 0) { cond = src.getConditionalCE32(cond.next); ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex); destIndex = dest.addConditionalCE32( cond.context, copyCE32(cond.ce32)); int suffixStart = cond.prefixLength() + 1; dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart)); prevDestCond.next = destIndex; } } else { // Just copy long CEs and Latin mini expansions (and other expected values) as is, // assuming that the modifier would not modify them. assert(tag == Collation.LONG_PRIMARY_TAG || tag == Collation.LONG_SECONDARY_TAG || tag == Collation.LATIN_EXPANSION_TAG || tag == Collation.HANGUL_TAG); } } return ce32; } CollationDataBuilder src; CollationDataBuilder dest; CollationDataBuilder.CEModifier modifier; long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH]; } private static void enumRangeForCopy(int start, int end, int value, CopyHelper helper) { if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) { helper.copyRangeCE32(start, end, value); } } protected boolean getJamoCE32s(int jamoCE32s[]) { boolean anyJamoAssigned = base == null; // always set jamoCE32s in the base data boolean needToCopyFromBase = false; for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. int jamo = jamoCpFromIndex(j); boolean fromBase = false; int ce32 = trie.get(jamo); anyJamoAssigned |= Collation.isAssignedCE32(ce32); // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) if(ce32 == Collation.FALLBACK_CE32) { fromBase = true; ce32 = base.getCE32(jamo); } if(Collation.isSpecialCE32(ce32)) { switch(Collation.tagFromCE32(ce32)) { case Collation.LONG_PRIMARY_TAG: case Collation.LONG_SECONDARY_TAG: case Collation.LATIN_EXPANSION_TAG: // Copy the ce32 as-is. break; case Collation.EXPANSION32_TAG: case Collation.EXPANSION_TAG: case Collation.PREFIX_TAG: case Collation.CONTRACTION_TAG: if(fromBase) { // Defer copying until we know if anyJamoAssigned. ce32 = Collation.FALLBACK_CE32; needToCopyFromBase = true; } break; case Collation.IMPLICIT_TAG: // An unassigned Jamo should only occur in tests with incomplete bases. assert(fromBase); ce32 = Collation.FALLBACK_CE32; needToCopyFromBase = true; break; case Collation.OFFSET_TAG: ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); break; case Collation.FALLBACK_TAG: case Collation.RESERVED_TAG_3: case Collation.BUILDER_DATA_TAG: case Collation.DIGIT_TAG: case Collation.U0000_TAG: case Collation.HANGUL_TAG: case Collation.LEAD_SURROGATE_TAG: throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32)); } } jamoCE32s[j] = ce32; } if(anyJamoAssigned && needToCopyFromBase) { for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { if(jamoCE32s[j] == Collation.FALLBACK_CE32) { int jamo = jamoCpFromIndex(j); jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo), /*withContext=*/ true); } } } return anyJamoAssigned; } protected void setDigitTags() { UnicodeSet digits = new UnicodeSet("[:Nd:]"); UnicodeSetIterator iter = new UnicodeSetIterator(digits); while(iter.next()) { assert(iter.codepoint != UnicodeSetIterator.IS_STRING); int c = iter.codepoint; int ce32 = trie.get(c); if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) { int index = addCE32(ce32); if(index > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } ce32 = Collation.makeCE32FromTagIndexAndLength( Collation.DIGIT_TAG, index, UCharacter.digit(c)); // u_charDigitValue(c) trie.set(c, ce32); } } } protected void setLeadSurrogates() { for(char lead = 0xd800; lead < 0xdc00; ++lead) { int leadValue = -1; // utrie2_enumForLeadSurrogate(trie, lead, null, , &value); Iterator trieIterator = trie.iteratorForLeadSurrogate(lead); while(trieIterator.hasNext()) { Trie2.Range range = trieIterator.next(); // The rest of this loop is equivalent to C++ enumRangeLeadValue(). int value = range.value; if(value == Collation.UNASSIGNED_CE32) { value = Collation.LEAD_ALL_UNASSIGNED; } else if(value == Collation.FALLBACK_CE32) { value = Collation.LEAD_ALL_FALLBACK; } else { leadValue = Collation.LEAD_MIXED; break; } if(leadValue < 0) { leadValue = value; } else if(leadValue != value) { leadValue = Collation.LEAD_MIXED; break; } } trie.setForLeadSurrogateCodeUnit(lead, Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue); } } protected void buildMappings(CollationData data) { if(!isMutable()) { throw new IllegalStateException("attempt to build() after build()"); } buildContexts(); int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; int jamoIndex = -1; if(getJamoCE32s(jamoCE32s)) { jamoIndex = ce32s.size(); for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) { ce32s.addElement(jamoCE32s[i]); } // Small optimization: Use a bit in the Hangul ce32 // to indicate that none of the Jamo CE32s are isSpecialCE32() // (as it should be in the root collator). // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. // In order to still have good trie compression and keep this code simple, // we only set this flag if a whole block of 588 Hangul syllables starting with // a common leading consonant (Jamo L) has this property. boolean isAnyJamoVTSpecial = false; for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) { if(Collation.isSpecialCE32(jamoCE32s[i])) { isAnyJamoVTSpecial = true; break; } } int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0); int c = Hangul.HANGUL_BASE; for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) { // iterate over the Jamo L int ce32 = hangulCE32; if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) { ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO; } int limit = c + Hangul.JAMO_VT_COUNT; trie.setRange(c, limit - 1, ce32, true); c = limit; } } else { // Copy the Hangul CE32s from the base in blocks per Jamo L, // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) { int ce32 = base.getCE32(c); assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG)); int limit = c + Hangul.JAMO_VT_COUNT; trie.setRange(c, limit - 1, ce32, true); c = limit; } } setDigitTags(); setLeadSurrogates(); // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. ce32s.setElementAt(trie.get(0), 0); trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0)); data.trie = trie.toTrie2_32(); // Mark each lead surrogate as "unsafe" // if any of its 1024 associated supplementary code points is "unsafe". int c = 0x10000; for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { unsafeBackwardSet.add(lead); } } unsafeBackwardSet.freeze(); data.ce32s = ce32s.getBuffer(); data.ces = ce64s.getBuffer(); data.contexts = contexts.toString(); data.base = base; if(jamoIndex >= 0) { data.jamoCE32s = jamoCE32s; // C++: data.ce32s + jamoIndex } else { data.jamoCE32s = base.jamoCE32s; } data.unsafeBackwardSet = unsafeBackwardSet; } protected void clearContexts() { contexts.setLength(0); // Incrementing the contexts build "era" invalidates all of the builtCE32 // from before this clearContexts() call. // Simpler than finding and resetting all of those fields. ++contextsEra; } protected void buildContexts() { // Ignore abandoned lists and the cached builtCE32, // and build all contexts from scratch. clearContexts(); UnicodeSetIterator iter = new UnicodeSetIterator(contextChars); while(iter.next()) { assert(iter.codepoint != UnicodeSetIterator.IS_STRING); int c = iter.codepoint; int ce32 = trie.get(c); if(!isBuilderContextCE32(ce32)) { throw new AssertionError("Impossible: No context data for c in contextChars."); } ConditionalCE32 cond = getConditionalCE32ForCE32(ce32); ce32 = buildContext(cond); trie.set(c, ce32); } } protected int buildContext(ConditionalCE32 head) { // The list head must have no context. assert(!head.hasContext()); // The list head must be followed by one or more nodes that all do have context. assert(head.next >= 0); CharsTrieBuilder prefixBuilder = null; CharsTrieBuilder contractionBuilder = null; // This outer loop goes from each prefix to the next. // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond). // If there are multiple suffixes for the same prefix, // then an inner loop builds a contraction trie for them. for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) { // After the list head, the prefix or suffix can be empty, but not both. assert(cond == head || cond.hasContext()); int prefixLength = cond.prefixLength(); StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1); String prefixString = prefix.toString(); // Collect all contraction suffixes for one prefix. ConditionalCE32 firstCond = cond; ConditionalCE32 lastCond; do { lastCond = cond; // Clear the defaultCE32 fields as we go. // They are left over from building a previous version of this list of contexts. // // One of the code paths below may copy a preceding defaultCE32 // into its emptySuffixCE32. // If a new suffix has been inserted before what used to be // the firstCond for its prefix, then that previous firstCond could still // contain an outdated defaultCE32 from an earlier buildContext() and // result in an incorrect emptySuffixCE32. // So we reset all defaultCE32 before reading and setting new values. cond.defaultCE32 = Collation.NO_CE32; } while(cond.next >= 0 && (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)); int ce32; int suffixStart = prefixLength + 1; // == prefix.length() if(lastCond.context.length() == suffixStart) { // One prefix without contraction suffix. assert(firstCond == lastCond); ce32 = lastCond.ce32; cond = lastCond; } else { // Build the contractions trie. if(contractionBuilder == null) { contractionBuilder = new CharsTrieBuilder(); } else { contractionBuilder.clear(); } // Entry for an empty suffix, to be stored before the trie. int emptySuffixCE32 = Collation.NO_CE32; // Will always be set to a real value. int flags = 0; if(firstCond.context.length() == suffixStart) { // There is a mapping for the prefix and the single character c. (p|c) // If no other suffix matches, then we return this value. emptySuffixCE32 = firstCond.ce32; cond = getConditionalCE32(firstCond.next); } else { // There is no mapping for the prefix and just the single character. // (There is no p|c, only p|cd, p|ce etc.) flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH; // When the prefix matches but none of the prefix-specific suffixes, // then we fall back to the mappings with the next-longest prefix, // and ultimately to mappings with no prefix. // Each fallback might be another set of contractions. // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, // then in text "pch" we find the ch contraction. for(cond = head;; cond = getConditionalCE32(cond.next)) { int length = cond.prefixLength(); if(length == prefixLength) { break; } if(cond.defaultCE32 != Collation.NO_CE32 && (length==0 || prefixString.regionMatches( prefix.length() - length, cond.context, 1, length) /* C++: prefix.endsWith(cond.context, 1, length) */)) { emptySuffixCE32 = cond.defaultCE32; } } cond = firstCond; } // Optimization: Set a flag when // the first character of every contraction suffix has lccc!=0. // Short-circuits contraction matching when a normal letter follows. flags |= Collation.CONTRACT_NEXT_CCC; // Add all of the non-empty suffixes into the contraction trie. for(;;) { String suffix = cond.context.substring(suffixStart); int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0)); if(fcd16 <= 0xff) { flags &= ~Collation.CONTRACT_NEXT_CCC; } fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length())); if(fcd16 > 0xff) { // The last suffix character has lccc!=0, allowing for discontiguous contractions. flags |= Collation.CONTRACT_TRAILING_CCC; } contractionBuilder.add(suffix, cond.ce32); if(cond == lastCond) { break; } cond = getConditionalCE32(cond.next); } int index = addContextTrie(emptySuffixCE32, contractionBuilder); if(index > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many context-sensitive mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags; } assert(cond == lastCond); firstCond.defaultCE32 = ce32; if(prefixLength == 0) { if(cond.next < 0) { // No non-empty prefixes, only contractions. return ce32; } } else { prefix.delete(0, 1); // Remove the length unit. prefix.reverse(); if(prefixBuilder == null) { prefixBuilder = new CharsTrieBuilder(); } prefixBuilder.add(prefix, ce32); if(cond.next < 0) { break; } } } assert(head.defaultCE32 != Collation.NO_CE32); int index = addContextTrie(head.defaultCE32, prefixBuilder); if(index > Collation.MAX_INDEX) { throw new IndexOutOfBoundsException("too many context-sensitive mappings"); // BufferOverflowException is a better fit // but cannot be constructed with a message string. } return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index); } protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) { StringBuilder context = new StringBuilder(); context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32); context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL)); int index = contexts.indexOf(context.toString()); if(index < 0) { index = contexts.length(); contexts.append(context); } return index; } protected void buildFastLatinTable(CollationData data) { if(!fastLatinEnabled) { return; } fastLatinBuilder = new CollationFastLatinBuilder(); if(fastLatinBuilder.forData(data)) { char[] header = fastLatinBuilder.getHeader(); char[] table = fastLatinBuilder.getTable(); if(base != null && Arrays.equals(header, base.fastLatinTableHeader) && Arrays.equals(table, base.fastLatinTable)) { // Same fast Latin table as in the base, use that one instead. fastLatinBuilder = null; header = base.fastLatinTableHeader; table = base.fastLatinTable; } data.fastLatinTableHeader = header; data.fastLatinTable = table; } else { fastLatinBuilder = null; } } protected int getCEs(CharSequence s, int start, long ces[], int cesLength) { if(collIter == null) { collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl)); if(collIter == null) { return 0; } } return collIter.fetchCEs(s, start, ces, cesLength); } protected static int jamoCpFromIndex(int i) { // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27 if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; } i -= Hangul.JAMO_L_COUNT; if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; } i -= Hangul.JAMO_V_COUNT; // i < 27 return Hangul.JAMO_T_BASE + 1 + i; } /** * Build-time collation element and character iterator. * Uses the runtime CollationIterator for fetching CEs for a string * but reads from the builder's unfinished data structures. * In particular, this class reads from the unfinished trie * and has to avoid CollationIterator.nextCE() and redirect other * calls to data.getCE32() and data.getCE32FromSupplementary(). * * We do this so that we need not implement the collation algorithm * again for the builder and make it behave exactly like the runtime code. * That would be more difficult to test and maintain than this indirection. * * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, * so the data accesses from those code paths need not be modified. * * This class iterates directly over whole code points * so that the CollationIterator does not need the finished trie * for handling the LEAD_SURROGATE_TAG. */ private static final class DataBuilderCollationIterator extends CollationIterator { DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) { super(newData, /*numeric=*/ false); builder = b; builderData = newData; builderData.base = builder.base; // Set all of the jamoCE32s[] to indirection CE32s. for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. int jamo = CollationDataBuilder.jamoCpFromIndex(j); jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) | CollationDataBuilder.IS_BUILDER_JAMO_CE32; } builderData.jamoCE32s = jamoCE32s; } int fetchCEs(CharSequence str, int start, long ces[], int cesLength) { // Set the pointers each time, in case they changed due to reallocation. builderData.ce32s = builder.ce32s.getBuffer(); builderData.ces = builder.ce64s.getBuffer(); builderData.contexts = builder.contexts.toString(); // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32(). reset(); s = str; pos = start; while(pos < s.length()) { // No need to keep all CEs in the iterator buffer. clearCEs(); int c = Character.codePointAt(s, pos); pos += Character.charCount(c); int ce32 = builder.trie.get(c); CollationData d; if(ce32 == Collation.FALLBACK_CE32) { d = builder.base; ce32 = builder.base.getCE32(c); } else { d = builderData; } appendCEsFromCE32(d, c, ce32, /*forward=*/ true); for(int i = 0; i < getCEsLength(); ++i) { long ce = getCE(i); if(ce != 0) { if(cesLength < Collation.MAX_EXPANSION_LENGTH) { ces[cesLength] = ce; } ++cesLength; } } } return cesLength; } @Override public void resetToOffset(int newOffset) { reset(); pos = newOffset; } @Override public int getOffset() { return pos; } @Override public int nextCodePoint() { if(pos == s.length()) { return Collation.SENTINEL_CP; } int c = Character.codePointAt(s, pos); pos += Character.charCount(c); return c; } @Override public int previousCodePoint() { if(pos == 0) { return Collation.SENTINEL_CP; } int c = Character.codePointBefore(s, pos); pos -= Character.charCount(c); return c; } @Override protected void forwardNumCodePoints(int num) { pos = Character.offsetByCodePoints(s, pos, num); } @Override protected void backwardNumCodePoints(int num) { pos = Character.offsetByCodePoints(s, pos, -num); } @Override protected int getDataCE32(int c) { return builder.trie.get(c); } @Override protected int getCE32FromBuilderData(int ce32) { assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG)); if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) { int jamo = Collation.indexFromCE32(ce32); return builder.trie.get(jamo); } else { ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32); if(cond.builtCE32 == Collation.NO_CE32 || cond.era != builder.contextsEra) { // Build the context-sensitive mappings into their runtime form and cache the result. try { cond.builtCE32 = builder.buildContext(cond); } catch(IndexOutOfBoundsException e) { builder.clearContexts(); cond.builtCE32 = builder.buildContext(cond); } cond.era = builder.contextsEra; builderData.contexts = builder.contexts.toString(); } return cond.builtCE32; } } protected final CollationDataBuilder builder; protected final CollationData builderData; protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; protected CharSequence s; protected int pos; } protected final boolean isMutable() { // C++ tests !(trie == NULL || utrie2_isFrozen(trie)) // but Java Trie2Writable does not have an observable isFrozen() state. return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen(); } /** @see Collation.BUILDER_DATA_TAG */ private static final int IS_BUILDER_JAMO_CE32 = 0x100; protected Normalizer2Impl nfcImpl; protected CollationData base; protected CollationSettings baseSettings; protected Trie2Writable trie; protected UVector32 ce32s; protected UVector64 ce64s; protected ArrayList conditionalCE32s; // vector of ConditionalCE32 // Characters that have context (prefixes or contraction suffixes). protected UnicodeSet contextChars = new UnicodeSet(); // Serialized UCharsTrie structures for finalized contexts. protected StringBuilder contexts = new StringBuilder(); /** * The "era" of building intermediate contexts. * When the array of cached, temporary contexts overflows, then clearContexts() * removes them all and invalidates the builtCE32 that used to point to built tries. * See {@link ConditionalCE32#era}. */ private int contextsEra = 0; protected UnicodeSet unsafeBackwardSet = new UnicodeSet(); protected boolean modified; protected boolean fastLatinEnabled; protected CollationFastLatinBuilder fastLatinBuilder; protected DataBuilderCollationIterator collIter; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy