com.ibm.icu.impl.BMPSet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
******************************************************************************
*
* Copyright (C) 2009-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
/*
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
*
* Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 3-byte characters: Use zero/one/mixed data
* per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. Supplementary characters: Call contains() on the
* parent set.
*/
public final class BMPSet {
public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
/*
* One boolean ('true' or 'false') per Latin-1 character.
*/
private boolean[] latin1Contains;
/*
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
*
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
* runtime.
*/
private int[] table7FF;
/*
* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
* and set.contains(c) must be called.
*
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
* validity checking at runtime.
*/
private int[] bmpBlockBits;
/*
* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
*/
private int[] list4kStarts;
/*
* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
* supplementary code points. The list is terminated with list[listLength-1]=0x110000.
*/
private final int[] list;
private final int listLength; // length used; list may be longer to minimize reallocs
public BMPSet(final int[] parentList, int parentListLength) {
list = parentList;
listLength = parentListLength;
latin1Contains = new boolean[0x100];
table7FF = new int[64];
bmpBlockBits = new int[64];
list4kStarts = new int[18];
/*
* Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
* first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
* indexes is for finding supplementary code points.
*/
list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
int i;
for (i = 1; i <= 0x10; ++i) {
list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
}
list4kStarts[0x11] = listLength - 1;
initBits();
}
public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
list = newParentList;
listLength = newParentListLength;
latin1Contains = otherBMPSet.latin1Contains.clone();
table7FF = otherBMPSet.table7FF.clone();
bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
list4kStarts = otherBMPSet.list4kStarts.clone();
}
public boolean contains(int c) {
if (c <= 0xff) {
return (latin1Contains[c]);
} else if (c <= 0x7ff) {
return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
} else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
return (0 != twoBits);
} else {
// Look up the code point in its 4k block of code points.
return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
}
} else if (c <= 0x10ffff) {
// surrogate or supplementary code point
return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
} else {
// Out-of-range code points get false, consistent with long-standing
// behavior of UnicodeSet.contains(c).
return false;
}
}
/*
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
* spanCondition==0 or 1.
*
* @param start The start index
* @param end The end index
* @return The length of the span.
*
* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
* as usual in ICU.
*/
public final int span(CharSequence s, int start, int end, SpanCondition spanCondition) {
char c, c2;
int i = start;
int limit = Math.min(s.length(), end);
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
while (i < limit) {
c = s.charAt(i);
if (c <= 0xff) {
if (!latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
break;
}
} else if (c < 0xd800 ||
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits == 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++i;
}
++i;
}
} else {
// span not
while (i < limit) {
c = s.charAt(i);
if (c <= 0xff) {
if (latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
break;
}
} else if (c < 0xd800 ||
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits != 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++i;
}
++i;
}
}
return i - start;
}
/*
* Symmetrical with span().
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
* limit and spanCondition==0 or 1.
*
* @return The string index which starts the span (i.e. inclusive).
*/
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
char c, c2;
limit = Math.min(s.length(), limit);
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
for (;;) {
c = s.charAt(--limit);
if (c <= 0xff) {
if (!latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
break;
}
} else if (c < 0xd800 ||
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits == 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
--limit;
}
if (0 == limit) {
return 0;
}
}
} else {
// span not
for (;;) {
c = s.charAt(--limit);
if (c <= 0xff) {
if (latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
break;
}
} else if (c < 0xd800 ||
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits != 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
--limit;
}
if (0 == limit) {
return 0;
}
}
}
return limit + 1;
}
/*
* Set bits in a bit rectangle in "vertical" bit organization. start> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
// Set one bit indicating an all-one block.
int bits = 1 << lead;
if ((start + 1) == limit) { // Single-character shortcut.
table[trail] |= bits;
return;
}
int limitLead = limit >> 6;
int limitTrail = limit & 0x3f;
if (lead == limitLead) {
// Partial vertical bit column.
while (trail < limitTrail) {
table[trail++] |= bits;
}
} else {
// Partial vertical bit column,
// followed by a bit rectangle,
// followed by another partial vertical bit column.
if (trail > 0) {
do {
table[trail++] |= bits;
} while (trail < 64);
++lead;
}
if (lead < limitLead) {
bits = ~((1 << lead) - 1);
if (limitLead < 0x20) {
bits &= (1 << limitLead) - 1;
}
for (trail = 0; trail < 64; ++trail) {
table[trail] |= bits;
}
}
// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
// In that case, bits=1<= 0x100) {
break;
}
do {
latin1Contains[start++] = true;
} while (start < limit && start < 0x100);
} while (limit <= 0x100);
// Set table7FF[].
while (start < 0x800) {
set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
if (limit > 0x800) {
start = 0x800;
break;
}
start = list[listIndex++];
if (listIndex < listLength) {
limit = list[listIndex++];
} else {
limit = 0x110000;
}
}
// Set bmpBlockBits[].
int minStart = 0x800;
while (start < 0x10000) {
if (limit > 0x10000) {
limit = 0x10000;
}
if (start < minStart) {
start = minStart;
}
if (start < limit) { // Else: Another range entirely in a known mixed-value block.
if (0 != (start & 0x3f)) {
// Mixed-value block of 64 code points.
start >>= 6;
bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
start = (start + 1) << 6; // Round up to the next block boundary.
minStart = start; // Ignore further ranges in this block.
}
if (start < limit) {
if (start < (limit & ~0x3f)) {
// Multiple all-ones blocks of 64 code points each.
set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
}
if (0 != (limit & 0x3f)) {
// Mixed-value block of 64 code points.
limit >>= 6;
bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
limit = (limit + 1) << 6; // Round up to the next block boundary.
minStart = limit; // Ignore further ranges in this block.
}
}
}
if (limit == 0x10000) {
break;
}
start = list[listIndex++];
if (listIndex < listLength) {
limit = list[listIndex++];
} else {
limit = 0x110000;
}
}
}
/**
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
* points in a certain range.
*
* For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
* hi=findCodePoint(end) with 0<=lo<=hi= hi || c >= list[hi - 1])
return hi;
// invariant: c >= list[lo]
// invariant: c < list[hi]
for (;;) {
int i = (lo + hi) >>> 1;
if (i == lo) {
break; // Found!
} else if (c < list[i]) {
hi = i;
} else {
lo = i;
}
}
return hi;
}
private final boolean containsSlow(int c, int lo, int hi) {
return (0 != (findCodePoint(c, lo, hi) & 1));
}
}