All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.util.MutableCodePointTrie Maven / Gradle / Ivy

The newest version!
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// created: 2018may04 Markus W. Scherer

package com.ibm.icu.util;

import java.util.Arrays;

/**
 * Mutable Unicode code point trie.
 * Fast map from Unicode code points (U+0000..U+10FFFF) to 32-bit integer values.
 * For details see https://icu.unicode.org/design/struct/utrie
 *
 * 

Setting values (especially ranges) and lookup is fast. * The mutable trie is only somewhat space-efficient. * It builds a compacted, immutable {@link CodePointTrie}. * *

This trie can be modified while iterating over its contents. * For example, it is possible to merge its values with those from another * set of ranges (e.g., another @{link CodePointMap}): * Iterate over those source ranges; for each of them iterate over this trie; * add the source value into the value of each trie range. * * @stable ICU 63 */ public final class MutableCodePointTrie extends CodePointMap implements Cloneable { /** * Constructs a mutable trie that initially maps each Unicode code point to the same value. * It uses 32-bit data values until * {@link #buildImmutable(com.ibm.icu.util.CodePointTrie.Type, com.ibm.icu.util.CodePointTrie.ValueWidth)} * is called. * buildImmutable() takes a valueWidth parameter which * determines the number of bits in the data value in the resulting {@link CodePointTrie}. * * @param initialValue the initial value that is set for all code points * @param errorValue the value for out-of-range code points and ill-formed UTF-8/16 * @stable ICU 63 */ public MutableCodePointTrie(int initialValue, int errorValue) { index = new int[BMP_I_LIMIT]; index3NullOffset = -1; data = new int[INITIAL_DATA_LENGTH]; dataNullOffset = -1; origInitialValue = initialValue; this.initialValue = initialValue; this.errorValue = errorValue; highValue = initialValue; } /** * Clones this mutable trie. * * @return the clone * @stable ICU 63 */ @Override public MutableCodePointTrie clone() { try { MutableCodePointTrie builder = (MutableCodePointTrie) super.clone(); int iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT; builder.index = new int[iCapacity]; builder.flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3]; for (int i = 0, iLimit = highStart >> CodePointTrie.SHIFT_3; i < iLimit; ++i) { builder.index[i] = index[i]; builder.flags[i] = flags[i]; } builder.index3NullOffset = index3NullOffset; builder.data = data.clone(); builder.dataLength = dataLength; builder.dataNullOffset = dataNullOffset; builder.origInitialValue = origInitialValue; builder.initialValue = initialValue; builder.errorValue = errorValue; builder.highStart = highStart; builder.highValue = highValue; assert index16 == null; return builder; } catch (CloneNotSupportedException ignored) { // Unreachable: Cloning *is* supported. return null; } } /** * Creates a mutable trie with the same contents as the {@link CodePointMap}. * * @param map the source map or trie * @return the mutable trie * @stable ICU 63 */ public static MutableCodePointTrie fromCodePointMap(CodePointMap map) { // TODO: Consider special code branch for map instanceof CodePointTrie? // Use the highValue as the initialValue to reduce the highStart. int errorValue = map.get(-1); int initialValue = map.get(MAX_UNICODE); MutableCodePointTrie mutableTrie = new MutableCodePointTrie(initialValue, errorValue); CodePointMap.Range range = new CodePointMap.Range(); int start = 0; while (map.getRange(start, null, range)) { int end = range.getEnd(); int value = range.getValue(); if (value != initialValue) { if (start == end) { mutableTrie.set(start, value); } else { mutableTrie.setRange(start, end, value); } } start = end + 1; } return mutableTrie; } private void clear() { index3NullOffset = dataNullOffset = -1; dataLength = 0; highValue = initialValue = origInitialValue; highStart = 0; index16 = null; } /** * {@inheritDoc} * @stable ICU 63 */ @Override public int get(int c) { if (c < 0 || MAX_UNICODE < c) { return errorValue; } if (c >= highStart) { return highValue; } int i = c >> CodePointTrie.SHIFT_3; if (flags[i] == ALL_SAME) { return index[i]; } else { return data[index[i] + (c & CodePointTrie.SMALL_DATA_MASK)]; } } private static final int maybeFilterValue(int value, int initialValue, int nullValue, ValueFilter filter) { if (value == initialValue) { value = nullValue; } else if (filter != null) { value = filter.apply(value); } return value; } /** * {@inheritDoc} * *

The trie can be modified between calls to this function. * * @stable ICU 63 */ @Override public boolean getRange(int start, CodePointTrie.ValueFilter filter, CodePointTrie.Range range) { if (start < 0 || MAX_UNICODE < start) { return false; } if (start >= highStart) { int value = highValue; if (filter != null) { value = filter.apply(value); } range.set(start, MAX_UNICODE, value); return true; } int nullValue = initialValue; if (filter != null) { nullValue = filter.apply(nullValue); } int c = start; // Initialize to make compiler happy. Real value when haveValue is true. int trieValue = 0, value = 0; boolean haveValue = false; int i = c >> CodePointTrie.SHIFT_3; do { if (flags[i] == ALL_SAME) { int trieValue2 = index[i]; if (haveValue) { if (trieValue2 != trieValue) { if (filter == null || maybeFilterValue(trieValue2, initialValue, nullValue, filter) != value) { range.set(start, c - 1, value); return true; } trieValue = trieValue2; // may or may not help } } else { trieValue = trieValue2; value = maybeFilterValue(trieValue2, initialValue, nullValue, filter); haveValue = true; } c = (c + CodePointTrie.SMALL_DATA_BLOCK_LENGTH) & ~CodePointTrie.SMALL_DATA_MASK; } else /* MIXED */ { int di = index[i] + (c & CodePointTrie.SMALL_DATA_MASK); int trieValue2 = data[di]; if (haveValue) { if (trieValue2 != trieValue) { if (filter == null || maybeFilterValue(trieValue2, initialValue, nullValue, filter) != value) { range.set(start, c - 1, value); return true; } trieValue = trieValue2; // may or may not help } } else { trieValue = trieValue2; value = maybeFilterValue(trieValue2, initialValue, nullValue, filter); haveValue = true; } while ((++c & CodePointTrie.SMALL_DATA_MASK) != 0) { trieValue2 = data[++di]; if (trieValue2 != trieValue) { if (filter == null || maybeFilterValue(trieValue2, initialValue, nullValue, filter) != value) { range.set(start, c - 1, value); return true; } trieValue = trieValue2; // may or may not help } } } ++i; } while (c < highStart); assert(haveValue); if (maybeFilterValue(highValue, initialValue, nullValue, filter) != value) { range.set(start, c - 1, value); } else { range.set(start, MAX_UNICODE, value); } return true; } private void writeBlock(int block, int value) { int limit = block + CodePointTrie.SMALL_DATA_BLOCK_LENGTH; Arrays.fill(data, block, limit, value); } /** * Sets a value for a code point. * * @param c the code point * @param value the value * @stable ICU 63 */ public void set(int c, int value) { if (c < 0 || MAX_UNICODE < c) { throw new IllegalArgumentException("invalid code point"); } ensureHighStart(c); int block = getDataBlock(c >> CodePointTrie.SHIFT_3); data[block + (c & CodePointTrie.SMALL_DATA_MASK)] = value; } private void fillBlock(int block, int start, int limit, int value) { Arrays.fill(data, block + start, block + limit, value); } /** * Sets a value for each code point [start..end]. * Faster and more space-efficient than setting the value for each code point separately. * * @param start the first code point to get the value * @param end the last code point to get the value (inclusive) * @param value the value * @stable ICU 63 */ public void setRange(int start, int end, int value) { if (start < 0 || MAX_UNICODE < start || end < 0 || MAX_UNICODE < end || start > end) { throw new IllegalArgumentException("invalid code point range"); } ensureHighStart(end); int limit = end + 1; if ((start & CodePointTrie.SMALL_DATA_MASK) != 0) { // Set partial block at [start..following block boundary[. int block = getDataBlock(start >> CodePointTrie.SHIFT_3); int nextStart = (start + CodePointTrie.SMALL_DATA_MASK) & ~CodePointTrie.SMALL_DATA_MASK; if (nextStart <= limit) { fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK, CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value); start = nextStart; } else { fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK, limit & CodePointTrie.SMALL_DATA_MASK, value); return; } } // Number of positions in the last, partial block. int rest = limit & CodePointTrie.SMALL_DATA_MASK; // Round down limit to a block boundary. limit &= ~CodePointTrie.SMALL_DATA_MASK; // Iterate over all-value blocks. while (start < limit) { int i = start >> CodePointTrie.SHIFT_3; if (flags[i] == ALL_SAME) { index[i] = value; } else /* MIXED */ { fillBlock(index[i], 0, CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value); } start += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; } if (rest > 0) { // Set partial block at [last block boundary..limit[. int block = getDataBlock(start >> CodePointTrie.SHIFT_3); fillBlock(block, 0, rest, value); } } /** * Compacts the data and builds an immutable {@link CodePointTrie} according to the parameters. * After this, the mutable trie will be empty. * *

The mutable trie stores 32-bit values until buildImmutable() is called. * If values shorter than 32 bits are to be stored in the immutable trie, * then the upper bits are discarded. * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581, * and the value width is 8 bits, then each of these is stored as 0x81 * and the immutable trie will return that as an unsigned value. * (Some implementations may want to make productive temporary use of the upper bits * until buildImmutable() discards them.) * *

Not every possible set of mappings can be built into a CodePointTrie, * because of limitations resulting from speed and space optimizations. * Every Unicode assigned character can be mapped to a unique value. * Typical data yields data structures far smaller than the limitations. * *

It is possible to construct extremely unusual mappings that exceed the * data structure limits. * In such a case this function will throw an exception. * * @param type selects the trie type * @param valueWidth selects the number of bits in a trie data value; if smaller than 32 bits, * then the values stored in the trie will be truncated first * * @see #fromCodePointMap(CodePointMap) * @stable ICU 63 */ public CodePointTrie buildImmutable(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) { if (type == null || valueWidth == null) { throw new IllegalArgumentException("The type and valueWidth must be specified."); } try { return build(type, valueWidth); } finally { clear(); } } private static final int MAX_UNICODE = 0x10ffff; private static final int UNICODE_LIMIT = 0x110000; private static final int BMP_LIMIT = 0x10000; private static final int ASCII_LIMIT = 0x80; private static final int I_LIMIT = UNICODE_LIMIT >> CodePointTrie.SHIFT_3; private static final int BMP_I_LIMIT = BMP_LIMIT >> CodePointTrie.SHIFT_3; private static final int ASCII_I_LIMIT = ASCII_LIMIT >> CodePointTrie.SHIFT_3; private static final int SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3)); // Flag values for data blocks. private static final byte ALL_SAME = 0; private static final byte MIXED = 1; private static final byte SAME_AS = 2; /** Start with allocation of 16k data entries. */ private static final int INITIAL_DATA_LENGTH = (1 << 14); /** Grow about 8x each time. */ private static final int MEDIUM_DATA_LENGTH = (1 << 17); /** * Maximum length of the build-time data array. * One entry per 0x110000 code points. */ private static final int MAX_DATA_LENGTH = UNICODE_LIMIT; // Flag values for index-3 blocks while compacting/building. private static final byte I3_NULL = 0; private static final byte I3_BMP = 1; private static final byte I3_16 = 2; private static final byte I3_18 = 3; private static final int INDEX_3_18BIT_BLOCK_LENGTH = CodePointTrie.INDEX_3_BLOCK_LENGTH + CodePointTrie.INDEX_3_BLOCK_LENGTH / 8; private int[] index; private int index3NullOffset; private int[] data; private int dataLength; private int dataNullOffset; private int origInitialValue; private int initialValue; private int errorValue; private int highStart; private int highValue; /** Temporary array while building the final data. */ private char[] index16; private byte[] flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3]; private void ensureHighStart(int c) { if (c >= highStart) { // Round up to a CodePointTrie.CP_PER_INDEX_2_ENTRY boundary to simplify compaction. c = (c + CodePointTrie.CP_PER_INDEX_2_ENTRY) & ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1); int i = highStart >> CodePointTrie.SHIFT_3; int iLimit = c >> CodePointTrie.SHIFT_3; if (iLimit > index.length) { int[] newIndex = new int[I_LIMIT]; for (int j = 0; j < i; ++j) { newIndex[j] = index[j]; } index = newIndex; } do { flags[i] = ALL_SAME; index[i] = initialValue; } while(++i < iLimit); highStart = c; } } private int allocDataBlock(int blockLength) { int newBlock = dataLength; int newTop = newBlock + blockLength; if (newTop > data.length) { int capacity; if (data.length < MEDIUM_DATA_LENGTH) { capacity = MEDIUM_DATA_LENGTH; } else if (data.length < MAX_DATA_LENGTH) { capacity = MAX_DATA_LENGTH; } else { // Should never occur. // Either MAX_DATA_LENGTH is incorrect, // or the code writes more values than should be possible. throw new AssertionError(); } int[] newData = new int[capacity]; for (int j = 0; j < dataLength; ++j) { newData[j] = data[j]; } data = newData; } dataLength = newTop; return newBlock; } /** * No error checking for illegal arguments. * The Java version always returns non-negative values. */ private int getDataBlock(int i) { if (flags[i] == MIXED) { return index[i]; } if (i < BMP_I_LIMIT) { int newBlock = allocDataBlock(CodePointTrie.FAST_DATA_BLOCK_LENGTH); int iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1); int iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; do { assert(flags[iStart] == ALL_SAME); writeBlock(newBlock, index[iStart]); flags[iStart] = MIXED; index[iStart++] = newBlock; newBlock += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; } while (iStart < iLimit); return index[i]; } else { int newBlock = allocDataBlock(CodePointTrie.SMALL_DATA_BLOCK_LENGTH); if (newBlock < 0) { return newBlock; } writeBlock(newBlock, index[i]); flags[i] = MIXED; index[i] = newBlock; return newBlock; } } // compaction -------------------------------------------------------------- private void maskValues(int mask) { initialValue &= mask; errorValue &= mask; highValue &= mask; int iLimit = highStart >> CodePointTrie.SHIFT_3; for (int i = 0; i < iLimit; ++i) { if (flags[i] == ALL_SAME) { index[i] &= mask; } } for (int i = 0; i < dataLength; ++i) { data[i] &= mask; } } private static boolean equalBlocks(int[] s, int si, int[] t, int ti, int length) { while (length > 0 && s[si] == t[ti]) { ++si; ++ti; --length; } return length == 0; } private static boolean equalBlocks(char[] s, int si, int[] t, int ti, int length) { while (length > 0 && s[si] == t[ti]) { ++si; ++ti; --length; } return length == 0; } private static boolean equalBlocks(char[] s, int si, char[] t, int ti, int length) { while (length > 0 && s[si] == t[ti]) { ++si; ++ti; --length; } return length == 0; } private static boolean allValuesSameAs(int[] p, int pi, int length, int value) { int pLimit = pi + length; while (pi < pLimit && p[pi] == value) { ++pi; } return pi == pLimit; } /** Search for an identical block. */ private static int findSameBlock(char[] p, int pStart, int length, char[] q, int qStart, int blockLength) { // Ensure that we do not even partially get past length. length -= blockLength; while (pStart <= length) { if (equalBlocks(p, pStart, q, qStart, blockLength)) { return pStart; } ++pStart; } return -1; } private static int findAllSameBlock(int[] p, int start, int limit, int value, int blockLength) { // Ensure that we do not even partially get past limit. limit -= blockLength; for (int block = start; block <= limit; ++block) { if (p[block] == value) { for (int i = 1;; ++i) { if (i == blockLength) { return block; } if (p[block + i] != value) { block += i; break; } } } } return -1; } /** * Look for maximum overlap of the beginning of the other block * with the previous, adjacent block. */ private static int getOverlap(int[] p, int length, int[] q, int qStart, int blockLength) { int overlap = blockLength - 1; assert(overlap <= length); while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { --overlap; } return overlap; } private static int getOverlap(char[] p, int length, int[] q, int qStart, int blockLength) { int overlap = blockLength - 1; assert(overlap <= length); while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { --overlap; } return overlap; } private static int getOverlap(char[] p, int length, char[] q, int qStart, int blockLength) { int overlap = blockLength - 1; assert(overlap <= length); while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { --overlap; } return overlap; } private static int getAllSameOverlap(int[] p, int length, int value, int blockLength) { int min = length - (blockLength - 1); int i = length; while (min < i && p[i - 1] == value) { --i; } return length - i; } private static boolean isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit) { for (int i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { if (index[i] == dataOffset) { return true; } } return false; } /** * Finds the start of the last range in the trie by enumerating backward. * Indexes for code points higher than this will be omitted. */ private int findHighStart() { int i = highStart >> CodePointTrie.SHIFT_3; while (i > 0) { boolean match; if (flags[--i] == ALL_SAME) { match = index[i] == highValue; } else /* MIXED */ { int p = index[i]; for (int j = 0;; ++j) { if (j == CodePointTrie.SMALL_DATA_BLOCK_LENGTH) { match = true; break; } if (data[p + j] != highValue) { match = false; break; } } } if (!match) { return (i + 1) << CodePointTrie.SHIFT_3; } } return 0; } private static final class AllSameBlocks { static final int NEW_UNIQUE = -1; static final int OVERFLOW = -2; AllSameBlocks() { mostRecent = -1; } int findOrAdd(int index, int count, int value) { if (mostRecent >= 0 && values[mostRecent] == value) { refCounts[mostRecent] += count; return indexes[mostRecent]; } for (int i = 0; i < length; ++i) { if (values[i] == value) { mostRecent = i; refCounts[i] += count; return indexes[i]; } } if (length == CAPACITY) { return OVERFLOW; } mostRecent = length; indexes[length] = index; values[length] = value; refCounts[length++] = count; return NEW_UNIQUE; } /** Replaces the block which has the lowest reference count. */ void add(int index, int count, int value) { assert(length == CAPACITY); int least = -1; int leastCount = I_LIMIT; for (int i = 0; i < length; ++i) { assert(values[i] != value); if (refCounts[i] < leastCount) { least = i; leastCount = refCounts[i]; } } assert(least >= 0); mostRecent = least; indexes[least] = index; values[least] = value; refCounts[least] = count; } int findMostUsed() { if (length == 0) { return -1; } int max = -1; int maxCount = 0; for (int i = 0; i < length; ++i) { if (refCounts[i] > maxCount) { max = i; maxCount = refCounts[i]; } } return indexes[max]; } private static final int CAPACITY = 32; private int length; private int mostRecent; private int[] indexes = new int[CAPACITY]; private int[] values = new int[CAPACITY]; private int[] refCounts = new int[CAPACITY]; } // Custom hash table for mixed-value blocks to be found anywhere in the // compacted data or index so far. private static final class MixedBlocks { void init(int maxLength, int newBlockLength) { // We store actual data indexes + 1 to reserve 0 for empty entries. int maxDataIndex = maxLength - newBlockLength + 1; int newLength; if (maxDataIndex <= 0xfff) { // 4k newLength = 6007; shift = 12; mask = 0xfff; } else if (maxDataIndex <= 0x7fff) { // 32k newLength = 50021; shift = 15; mask = 0x7fff; } else if (maxDataIndex <= 0x1ffff) { // 128k newLength = 200003; shift = 17; mask = 0x1ffff; } else { // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M newLength = 1500007; shift = 21; mask = 0x1fffff; } if (table == null || newLength > table.length) { table = new int[newLength]; } else { Arrays.fill(table, 0, newLength, 0); } length = newLength; blockLength = newBlockLength; } void extend(int[] data, int minStart, int prevDataLength, int newDataLength) { int start = prevDataLength - blockLength; if (start >= minStart) { ++start; // Skip the last block that we added last time. } else { start = minStart; // Begin with the first full block. } for (int end = newDataLength - blockLength; start <= end; ++start) { int hashCode = makeHashCode(data, start); addEntry(data, null, start, hashCode, start); } } void extend(char[] data, int minStart, int prevDataLength, int newDataLength) { int start = prevDataLength - blockLength; if (start >= minStart) { ++start; // Skip the last block that we added last time. } else { start = minStart; // Begin with the first full block. } for (int end = newDataLength - blockLength; start <= end; ++start) { int hashCode = makeHashCode(data, start); addEntry(null, data, start, hashCode, start); } } int findBlock(int[] data, int[] blockData, int blockStart) { int hashCode = makeHashCode(blockData, blockStart); int entryIndex = findEntry(data, null, blockData, null, blockStart, hashCode); if (entryIndex >= 0) { return (table[entryIndex] & mask) - 1; } else { return -1; } } int findBlock(char[] data, int[] blockData, int blockStart) { int hashCode = makeHashCode(blockData, blockStart); int entryIndex = findEntry(null, data, blockData, null, blockStart, hashCode); if (entryIndex >= 0) { return (table[entryIndex] & mask) - 1; } else { return -1; } } int findBlock(char[] data, char[] blockData, int blockStart) { int hashCode = makeHashCode(blockData, blockStart); int entryIndex = findEntry(null, data, null, blockData, blockStart, hashCode); if (entryIndex >= 0) { return (table[entryIndex] & mask) - 1; } else { return -1; } } int findAllSameBlock(int[] data, int blockValue) { int hashCode = makeHashCode(blockValue); int entryIndex = findEntry(data, blockValue, hashCode); if (entryIndex >= 0) { return (table[entryIndex] & mask) - 1; } else { return -1; } } private int makeHashCode(int[] blockData, int blockStart) { int blockLimit = blockStart + blockLength; int hashCode = blockData[blockStart++]; do { hashCode = 37 * hashCode + blockData[blockStart++]; } while (blockStart < blockLimit); return hashCode; } private int makeHashCode(char[] blockData, int blockStart) { int blockLimit = blockStart + blockLength; int hashCode = blockData[blockStart++]; do { hashCode = 37 * hashCode + blockData[blockStart++]; } while (blockStart < blockLimit); return hashCode; } private int makeHashCode(int blockValue) { int hashCode = blockValue; for (int i = 1; i < blockLength; ++i) { hashCode = 37 * hashCode + blockValue; } return hashCode; } private void addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex) { assert(0 <= dataIndex && dataIndex < mask); int entryIndex = findEntry(data32, data16, data32, data16, blockStart, hashCode); if (entryIndex < 0) { table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); } } private int findEntry(int[] data32, char[] data16, int[] blockData32, char[] blockData16, int blockStart, int hashCode) { int shiftedHashCode = hashCode << shift; int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 for (int entryIndex = initialEntryIndex;;) { int entry = table[entryIndex]; if (entry == 0) { return ~entryIndex; } if ((entry & ~mask) == shiftedHashCode) { int dataIndex = (entry & mask) - 1; if (data32 != null ? equalBlocks(data32, dataIndex, blockData32, blockStart, blockLength) : blockData32 != null ? equalBlocks(data16, dataIndex, blockData32, blockStart, blockLength) : equalBlocks(data16, dataIndex, blockData16, blockStart, blockLength)) { return entryIndex; } } entryIndex = nextIndex(initialEntryIndex, entryIndex); } } private int findEntry(int[] data, int blockValue, int hashCode) { int shiftedHashCode = hashCode << shift; int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 for (int entryIndex = initialEntryIndex;;) { int entry = table[entryIndex]; if (entry == 0) { return ~entryIndex; } if ((entry & ~mask) == shiftedHashCode) { int dataIndex = (entry & mask) - 1; if (allValuesSameAs(data, dataIndex, blockLength, blockValue)) { return entryIndex; } } entryIndex = nextIndex(initialEntryIndex, entryIndex); } } private int nextIndex(int initialEntryIndex, int entryIndex) { // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); return (entryIndex + initialEntryIndex) % length; } /** Ensures non-negative n % m (that is 0..m-1). */ private int modulo(int n, int m) { int i = n % m; if (i < 0) { i += m; } return i; } // Hash table. // The length is a prime number, larger than the maximum data length. // The "shift" lower bits store a data index + 1. // The remaining upper bits store a partial hashCode of the block data values. private int[] table; private int length; private int shift; private int mask; private int blockLength; } private int compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks) { // ASCII data will be stored as a linear table, even if the following code // does not yet count it that way. int newDataCapacity = ASCII_LIMIT; // Add room for a small data null block in case it would match the start of // a fast data block where dataNullOffset must not be set in that case. newDataCapacity += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; // Add room for special values (errorValue, highValue) and padding. newDataCapacity += 4; int iLimit = highStart >> CodePointTrie.SHIFT_3; int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; for (int i = 0; i < iLimit; i += inc) { if (i == fastILimit) { blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; inc = 1; } int value = index[i]; if (flags[i] == MIXED) { // Really mixed? int p = value; value = data[p]; if (allValuesSameAs(data, p + 1, blockLength - 1, value)) { flags[i] = ALL_SAME; index[i] = value; // Fall through to ALL_SAME handling. } else { newDataCapacity += blockLength; continue; } } else { assert(flags[i] == ALL_SAME); if (inc > 1) { // Do all of the fast-range data block's ALL_SAME parts have the same value? boolean allSame = true; int next_i = i + inc; for (int j = i + 1; j < next_i; ++j) { assert(flags[j] == ALL_SAME); if (index[j] != value) { allSame = false; break; } } if (!allSame) { // Turn it into a MIXED block. if (getDataBlock(i) < 0) { return -1; } newDataCapacity += blockLength; continue; } } } // Is there another ALL_SAME block with the same value? int other = allSameBlocks.findOrAdd(i, inc, value); if (other == AllSameBlocks.OVERFLOW) { // The fixed-size array overflowed. Slow check for a duplicate block. int jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; for (int j = 0;; j += jInc) { if (j == i) { allSameBlocks.add(i, inc, value); break; } if (j == fastILimit) { jInc = 1; } if (flags[j] == ALL_SAME && index[j] == value) { allSameBlocks.add(j, jInc + inc, value); other = j; break; // We could keep counting blocks with the same value // before we add the first one, which may improve compaction in rare cases, // but it would make it slower. } } } if (other >= 0) { flags[i] = SAME_AS; index[i] = other; } else { // New unique same-value block. newDataCapacity += blockLength; } } return newDataCapacity; } /** * Compacts a build-time trie. * * The compaction * - removes blocks that are identical with earlier ones * - overlaps each new non-duplicate block as much as possible with the previously-written one * - works with fast-range data blocks whose length is a multiple of that of * higher-code-point data blocks * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ private int compactData( int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks) { // The linear ASCII data has been copied into newData already. int newDataLength = 0; for (int i = 0; newDataLength < ASCII_LIMIT; newDataLength += CodePointTrie.FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { index[i] = newDataLength; } int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; mixedBlocks.init(newData.length, blockLength); mixedBlocks.extend(newData, 0, 0, newDataLength); int iLimit = highStart >> CodePointTrie.SHIFT_3; int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; int fastLength = 0; for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) { if (i == fastILimit) { blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; inc = 1; fastLength = newDataLength; mixedBlocks.init(newData.length, blockLength); mixedBlocks.extend(newData, 0, 0, newDataLength); } if (flags[i] == ALL_SAME) { int value = index[i]; // Find an earlier part of the data array of length blockLength // that is filled with this value. int n = mixedBlocks.findAllSameBlock(newData, value); // If we find a match, and the current block is the data null block, // and it is not a fast block but matches the start of a fast block, // then we need to continue looking. // This is because this small block is shorter than the fast block, // and not all of the rest of the fast block is filled with this value. // Otherwise trie.getRange() would detect that the fast block starts at // dataNullOffset and assume incorrectly that it is filled with the null value. while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && isStartOfSomeFastBlock(n, index, fastILimit)) { n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); } if (n >= 0) { index[i] = n; } else { n = getAllSameOverlap(newData, newDataLength, value, blockLength); index[i] = newDataLength - n; int prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = value; ++n; } mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else if (flags[i] == MIXED) { int block = index[i]; int n = mixedBlocks.findBlock(newData, data, block); if (n >= 0) { index[i] = n; } else { n = getOverlap(newData, newDataLength, data, block, blockLength); index[i] = newDataLength - n; int prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = data[block + n++]; } mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else /* SAME_AS */ { int j = index[i]; index[i] = index[j]; } } return newDataLength; } private int compactIndex(int fastILimit, MixedBlocks mixedBlocks) { int fastIndexLength = fastILimit >> (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3); if ((highStart >> CodePointTrie.FAST_SHIFT) <= fastIndexLength) { // Only the linear fast index, no multi-stage index tables. index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET; return fastIndexLength; } // Condense the fast index table. // Also, does it contain an index-3 block with all dataNullOffset? char[] fastIndex = new char[fastIndexLength]; int i3FirstNull = -1; for (int i = 0, j = 0; i < fastILimit; ++j) { int i3 = index[i]; fastIndex[j] = (char)i3; if (i3 == dataNullOffset) { if (i3FirstNull < 0) { i3FirstNull = j; } else if (index3NullOffset < 0 && (j - i3FirstNull + 1) == CodePointTrie.INDEX_3_BLOCK_LENGTH) { index3NullOffset = i3FirstNull; } } else { i3FirstNull = -1; } // Set the index entries that compactData() skipped. // Needed when the multi-stage index covers the fast index range as well. int iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; while (++i < iNext) { i3 += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; index[i] = i3; } } mixedBlocks.init(fastIndexLength, CodePointTrie.INDEX_3_BLOCK_LENGTH); mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); // Examine index-3 blocks. For each determine one of: // - same as the index-3 null block // - same as a fast-index block // - 16-bit indexes // - 18-bit indexes // We store this in the first flags entry for the index-3 block. // // Also determine an upper limit for the index-3 table length. int index3Capacity = 0; i3FirstNull = index3NullOffset; boolean hasLongI3Blocks = false; // If the fast index covers the whole BMP, then // the multi-stage index is only for supplementary code points. // Otherwise, the multi-stage index covers all of Unicode. int iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; int iLimit = highStart >> CodePointTrie.SHIFT_3; for (int i = iStart; i < iLimit;) { int j = i; int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH; int oredI3 = 0; boolean isNull = true; do { int i3 = index[j]; oredI3 |= i3; if (i3 != dataNullOffset) { isNull = false; } } while (++j < jLimit); if (isNull) { flags[i] = I3_NULL; if (i3FirstNull < 0) { if (oredI3 <= 0xffff) { index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH; } else { index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; hasLongI3Blocks = true; } i3FirstNull = 0; } } else { if (oredI3 <= 0xffff) { int n = mixedBlocks.findBlock(fastIndex, index, i); if (n >= 0) { flags[i] = I3_BMP; index[i] = n; } else { flags[i] = I3_16; index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH; } } else { flags[i] = I3_18; index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; hasLongI3Blocks = true; } } i = j; } int index2Capacity = (iLimit - iStart) >> CodePointTrie.SHIFT_2_3; // Length of the index-1 table, rounded up. int index1Length = (index2Capacity + CodePointTrie.INDEX_2_MASK) >> CodePointTrie.SHIFT_1_2; // Index table: Fast index, index-1, index-3, index-2. // +1 for possible index table padding. int index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; index16 = Arrays.copyOf(fastIndex, index16Capacity); mixedBlocks.init(index16Capacity, CodePointTrie.INDEX_3_BLOCK_LENGTH); MixedBlocks longI3Blocks = null; if (hasLongI3Blocks) { longI3Blocks = new MixedBlocks(); longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH); } // Compact the index-3 table and write an uncompacted version of the index-2 table. char[] index2 = new char[index2Capacity]; int i2Length = 0; i3FirstNull = index3NullOffset; int index3Start = fastIndexLength + index1Length; int indexLength = index3Start; for (int i = iStart; i < iLimit; i += CodePointTrie.INDEX_3_BLOCK_LENGTH) { int i3; byte f = flags[i]; if (f == I3_NULL && i3FirstNull < 0) { // First index-3 null block. Write & overlap it like a normal block, then remember it. f = dataNullOffset <= 0xffff ? I3_16 : I3_18; i3FirstNull = 0; } if (f == I3_NULL) { i3 = index3NullOffset; } else if (f == I3_BMP) { i3 = index[i]; } else if (f == I3_16) { int n = mixedBlocks.findBlock(index16, index, i); if (n >= 0) { i3 = n; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3 tables. n = 0; } else { n = getOverlap(index16, indexLength, index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH); } i3 = indexLength - n; int prevIndexLength = indexLength; while (n < CodePointTrie.INDEX_3_BLOCK_LENGTH) { index16[indexLength++] = (char)index[i + n++]; } mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); if (hasLongI3Blocks) { longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); } } } else { assert(f == I3_18); assert(hasLongI3Blocks); // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. int j = i; int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH; int k = indexLength; do { ++k; int v = index[j++]; int upperBits = (v & 0x30000) >> 2; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 4; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 6; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 8; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 10; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 12; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 14; index16[k++] = (char)v; v = index[j++]; upperBits |= (v & 0x30000) >> 16; index16[k++] = (char)v; index16[k - 9] = (char)upperBits; } while (j < jLimit); int n = longI3Blocks.findBlock(index16, index16, indexLength); if (n >= 0) { i3 = n | 0x8000; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3 tables. n = 0; } else { n = getOverlap(index16, indexLength, index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); } i3 = (indexLength - n) | 0x8000; int prevIndexLength = indexLength; if (n > 0) { int start = indexLength; while (n < INDEX_3_18BIT_BLOCK_LENGTH) { index16[indexLength++] = index16[start + n++]; } } else { indexLength += INDEX_3_18BIT_BLOCK_LENGTH; } mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); if (hasLongI3Blocks) { longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); } } } if (index3NullOffset < 0 && i3FirstNull >= 0) { index3NullOffset = i3; } // Set the index-2 table entry. index2[i2Length++] = (char)i3; } assert(i2Length == index2Capacity); assert(indexLength <= index3Start + index3Capacity); if (index3NullOffset < 0) { index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET; } if (indexLength >= (CodePointTrie.NO_INDEX3_NULL_OFFSET + CodePointTrie.INDEX_3_BLOCK_LENGTH)) { // The index-3 offsets exceed 15 bits, or // the last one cannot be distinguished from the no-null-block value. throw new IndexOutOfBoundsException( "The trie data exceeds limitations of the data structure."); } // Compact the index-2 table and write the index-1 table. // assert(CodePointTrie.INDEX_2_BLOCK_LENGTH == CodePointTrie.INDEX_3_BLOCK_LENGTH) : // "must re-init mixedBlocks"; int blockLength = CodePointTrie.INDEX_2_BLOCK_LENGTH; int i1 = fastIndexLength; for (int i = 0; i < i2Length; i += blockLength) { int n; if ((i2Length - i) >= blockLength) { // normal block assert(blockLength == CodePointTrie.INDEX_2_BLOCK_LENGTH); n = mixedBlocks.findBlock(index16, index2, i); } else { // highStart is inside the last index-2 block. Shorten it. blockLength = i2Length - i; n = findSameBlock(index16, index3Start, indexLength, index2, i, blockLength); } int i2; if (n >= 0) { i2 = n; } else { if (indexLength == index3Start) { // No overlap at the boundary between the index-1 and index-3/2 tables. n = 0; } else { n = getOverlap(index16, indexLength, index2, i, blockLength); } i2 = indexLength - n; int prevIndexLength = indexLength; while (n < blockLength) { index16[indexLength++] = index2[i + n++]; } mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); } // Set the index-1 table entry. index16[i1++] = (char)i2; } assert(i1 == index3Start); assert(indexLength <= index16Capacity); return indexLength; } private int compactTrie(int fastILimit) { // Find the real highStart and round it up. assert((highStart & (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) == 0); highValue = get(MAX_UNICODE); int realHighStart = findHighStart(); realHighStart = (realHighStart + (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) & ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1); if (realHighStart == UNICODE_LIMIT) { highValue = initialValue; } // We always store indexes and data values for the fast range. // Pin highStart to the top of that range while building. int fastLimit = fastILimit << CodePointTrie.SHIFT_3; if (realHighStart < fastLimit) { for (int i = (realHighStart >> CodePointTrie.SHIFT_3); i < fastILimit; ++i) { flags[i] = ALL_SAME; index[i] = highValue; } highStart = fastLimit; } else { highStart = realHighStart; } int[] asciiData = new int[ASCII_LIMIT]; for (int i = 0; i < ASCII_LIMIT; ++i) { asciiData[i] = get(i); } // First we look for which data blocks have the same value repeated over the whole block, // deduplicate such blocks, find a good null data block (for faster enumeration), // and get an upper bound for the necessary data array length. AllSameBlocks allSameBlocks = new AllSameBlocks(); int newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); int[] newData = Arrays.copyOf(asciiData, newDataCapacity); int dataNullIndex = allSameBlocks.findMostUsed(); MixedBlocks mixedBlocks = new MixedBlocks(); int newDataLength = compactData(fastILimit, newData, dataNullIndex, mixedBlocks); assert(newDataLength <= newDataCapacity); data = newData; dataLength = newDataLength; if (dataLength > (0x3ffff + CodePointTrie.SMALL_DATA_BLOCK_LENGTH)) { // The offset of the last data block is too high to be stored in the index table. throw new IndexOutOfBoundsException( "The trie data exceeds limitations of the data structure."); } if (dataNullIndex >= 0) { dataNullOffset = index[dataNullIndex]; initialValue = data[dataNullOffset]; } else { dataNullOffset = CodePointTrie.NO_DATA_NULL_OFFSET; } int indexLength = compactIndex(fastILimit, mixedBlocks); highStart = realHighStart; return indexLength; } private CodePointTrie build(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) { // The mutable trie always stores 32-bit values. // When we build a UCPTrie for a smaller value width, we first mask off unused bits // before compacting the data. switch (valueWidth) { case BITS_32: break; case BITS_16: maskValues(0xffff); break; case BITS_8: maskValues(0xff); break; default: // Should be unreachable. throw new IllegalArgumentException(); } int fastLimit = type == CodePointTrie.Type.FAST ? BMP_LIMIT : CodePointTrie.SMALL_LIMIT; int indexLength = compactTrie(fastLimit >> CodePointTrie.SHIFT_3); // Ensure data table alignment: The index length must be even for uint32_t data. if (valueWidth == CodePointTrie.ValueWidth.BITS_32 && (indexLength & 1) != 0) { index16[indexLength++] = 0xffee; // arbitrary value } // Make the total trie structure length a multiple of 4 bytes by padding the data table, // and store special values as the last two data values. int length = indexLength * 2; if (valueWidth == CodePointTrie.ValueWidth.BITS_16) { if (((indexLength ^ dataLength) & 1) != 0) { // padding data[dataLength++] = errorValue; } if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { data[dataLength++] = highValue; data[dataLength++] = errorValue; } length += dataLength * 2; } else if (valueWidth == CodePointTrie.ValueWidth.BITS_32) { // 32-bit data words never need padding to a multiple of 4 bytes. if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { if (data[dataLength - 1] != highValue) { data[dataLength++] = highValue; } data[dataLength++] = errorValue; } length += dataLength * 4; } else { int and3 = (length + dataLength) & 3; if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) { // all set } else if(and3 == 3 && data[dataLength - 1] == highValue) { data[dataLength++] = errorValue; } else { while (and3 != 2) { data[dataLength++] = highValue; and3 = (and3 + 1) & 3; } data[dataLength++] = highValue; data[dataLength++] = errorValue; } length += dataLength; } assert((length & 3) == 0); // Fill the index and data arrays. char[] trieIndex; if (highStart <= fastLimit) { // Condense only the fast index from the mutable-trie index. trieIndex = new char[indexLength]; for (int i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) { trieIndex[j] = (char)index[i]; } } else { if (indexLength == index16.length) { trieIndex = index16; index16 = null; } else { trieIndex = Arrays.copyOf(index16, indexLength); } } // Write the data array. switch (valueWidth) { case BITS_16: { // Write 16-bit data values. char[] data16 = new char[dataLength]; for (int i = 0; i < dataLength; ++i) { data16[i] = (char)data[i]; } return type == CodePointTrie.Type.FAST ? new CodePointTrie.Fast16(trieIndex, data16, highStart, index3NullOffset, dataNullOffset) : new CodePointTrie.Small16(trieIndex, data16, highStart, index3NullOffset, dataNullOffset); } case BITS_32: { // Write 32-bit data values. int[] data32 = Arrays.copyOf(data, dataLength); return type == CodePointTrie.Type.FAST ? new CodePointTrie.Fast32(trieIndex, data32, highStart, index3NullOffset, dataNullOffset) : new CodePointTrie.Small32(trieIndex, data32, highStart, index3NullOffset, dataNullOffset); } case BITS_8: { // Write 8-bit data values. byte[] data8 = new byte[dataLength]; for (int i = 0; i < dataLength; ++i) { data8[i] = (byte)data[i]; } return type == CodePointTrie.Type.FAST ? new CodePointTrie.Fast8(trieIndex, data8, highStart, index3NullOffset, dataNullOffset) : new CodePointTrie.Small8(trieIndex, data8, highStart, index3NullOffset, dataNullOffset); } default: // Should be unreachable. throw new IllegalArgumentException(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy