com.ibm.icu.charset.CharsetSelector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j-charset Show documentation
Show all versions of icu4j-charset Show documentation
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*/
/*
* This is a port of the C++ class UConverterSelector.
*
* Methods related to serialization are not ported in this version. In addition,
* the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
* in Java.
*
* @author Shaopeng Jia
*/
package com.ibm.icu.charset;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.impl.PropsVectors;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Charset Selector
*
* A charset selector is built with a list of charset names and, given an input
* CharSequence, returns the list of names of the corresponding charsets which
* can convert the CharSequence.
*
* @stable ICU 4.2
*/
public final class CharsetSelector {
    /** Number of per-encoding flag bits packed into one {@code int} column. */
    private static final int BITS_PER_COLUMN = 32;

    /** Maps a code point to the index of its first mask column in {@link #pv}. */
    private IntTrie trie;

    /** Compacted table of mask bits; bit i of a row refers to {@code encodings[i]}. */
    private int[] pv;

    /** Names of the encodings this selector chooses from, in the caller's order. */
    private final String[] encodings;

    /**
     * Returns how many {@code int} columns are needed to hold one bit per
     * encoding.
     */
    private int columnCount() {
        return (encodings.length + BITS_PER_COLUMN - 1) / BITS_PER_COLUMN;
    }

    /**
     * Fills the property vectors with one bit per encoding and code point and
     * compacts them into {@link #trie} and {@link #pv}.
     *
     * @param pvec
     *            the property vectors to populate; must have been created
     *            with at least {@link #columnCount()} columns.
     * @param excludedCodePoints
     *            code points whose bits are forced to 1 for every encoding,
     *            so that they never narrow down a selection.
     * @param mappingTypes
     *            passed through to {@code CharsetICU.getUnicodeSet}.
     */
    private void generateSelectorData(PropsVectors pvec,
            UnicodeSet excludedCodePoints, int mappingTypes) {
        int columns = columnCount();

        // set errorValue to all-ones
        for (int col = 0; col < columns; ++col) {
            pvec.setValue(PropsVectors.ERROR_VALUE_CP,
                    PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
        }

        for (int i = 0; i < encodings.length; ++i) {
            Charset testCharset = CharsetICU.forNameICU(encodings[i]);
            UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
            ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
                    mappingTypes);

            // Encoding i owns bit (i % 32) of column (i / 32).
            int column = i / BITS_PER_COLUMN;
            int mask = 1 << (i % BITS_PER_COLUMN);

            // Set that bit for every range of code points in the charset's set.
            int itemCount = unicodePointSet.getRangeCount();
            for (int j = 0; j < itemCount; ++j) {
                int startChar = unicodePointSet.getRangeStart(j);
                int endChar = unicodePointSet.getRangeEnd(j);
                pvec.setValue(startChar, endChar, column, ~0, mask);
            }
        }

        // Handle excluded code points: simply set their values to all 1's in
        // the pvec so that they intersect with every mask without clearing
        // any bit.
        if (!excludedCodePoints.isEmpty()) {
            int itemCount = excludedCodePoints.getRangeCount();
            for (int j = 0; j < itemCount; ++j) {
                int startChar = excludedCodePoints.getRangeStart(j);
                int endChar = excludedCodePoints.getRangeEnd(j);
                for (int col = 0; col < columns; col++) {
                    pvec.setValue(startChar, endChar, col, ~0, ~0);
                }
            }
        }

        trie = pvec.compactToTrieWithRowIndexes();
        pv = pvec.getCompactedArray();
    }

    /**
     * Intersects {@code dest} in place with the {@code len} mask words of
     * {@link #pv} that start at {@code pvIndex}.
     *
     * @param dest
     *            the running mask; its first {@code len} elements are
     *            AND-ed with the corresponding words of {@code pv}.
     * @param pvIndex
     *            index in {@code pv} of the first word to intersect with.
     * @param len
     *            number of words to intersect.
     * @return {@code true} if the first {@code len} words of {@code dest}
     *         are all zero after the intersection.
     */
    private boolean intersectMasks(int[] dest, int pvIndex, int len) {
        int oredDest = 0;
        for (int i = 0; i < len; ++i) {
            oredDest |= (dest[i] &= pv[pvIndex + i]);
        }
        return oredDest == 0;
    }

    /**
     * Converts a bit mask into the list of encoding names whose bits are set.
     *
     * @param mask
     *            one bit per encoding, packed 32 per element; bit
     *            {@code k % 32} of element {@code k / 32} refers to
     *            {@code encodings[k]}.
     * @return the selected names in the order of {@link #encodings}; empty
     *         (never {@code null}) when no bit is set.
     */
    private List<String> selectForMask(int[] mask) {
        List<String> result = new ArrayList<String>();
        for (int k = 0; k < encodings.length; k++) {
            int word = mask[k / BITS_PER_COLUMN];
            if ((word & (1 << (k % BITS_PER_COLUMN))) != 0) {
                result.add(encodings[k]);
            }
        }
        return result;
    }

    /**
     * Construct a CharsetSelector from a list of charset names.
     *
     * @param charsetList
     *            a list of charset names in the form of strings. If charsetList
     *            is empty, a selector for all available charset is constructed.
     * @param excludedCodePoints
     *            a set of code points to be excluded from consideration.
     *            Excluded code points appearing in the input CharSequence do
     *            not change the selection result. It could be empty when no
     *            code point should be excluded.
     * @param mappingTypes
     *            an int which determines whether to consider only roundtrip
     *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
     *            CharsetICU.java for the constants that are currently
     *            supported.
     * @throws IllegalArgumentException
     *             if mappingTypes is neither CharsetICU.ROUNDTRIP_SET nor
     *             CharsetICU.ROUNDTRIP_AND_FALLBACK_SET.
     * @throws IllegalCharsetNameException
     *             If the given charset name is illegal.
     * @throws UnsupportedCharsetException
     *             If no support for the named charset is available in this
     *             instance of the Java virtual machine.
     * @stable ICU 4.2
     */
    public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
            int mappingTypes) {
        if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
                && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
            throw new IllegalArgumentException("Unsupported mappingTypes");
        }

        if (charsetList.isEmpty()) {
            // No explicit request: select among every available charset.
            encodings = CharsetProviderICU.getAvailableNames();
        } else {
            encodings = charsetList.toArray(new String[0]);
        }

        PropsVectors pvec = new PropsVectors(columnCount());
        generateSelectorData(pvec, excludedCodePoints, mappingTypes);
    }

    /**
     * Select charsets that can map all characters in a CharSequence, ignoring
     * the excluded code points.
     *
     * @param unicodeText
     *            a CharSequence. It could be empty.
     * @return a list that contains charset names in the form of strings. The
     *         returned encoding names and their order will be the same as
     *         supplied when building the selector.
     *
     * @stable ICU 4.2
     */
    public List<String> selectForString(CharSequence unicodeText) {
        int columns = columnCount();
        int[] mask = new int[columns];
        for (int i = 0; i < columns; i++) {
            // Set each bit to 1. Note: all integers are signed in Java, so
            // -1 (not 2^32 - 1) is the all-ones pattern.
            mask[i] = -1;
        }

        int index = 0;
        while (index < unicodeText.length()) {
            int c = UTF16.charAt(unicodeText, index);
            int pvIndex = trie.getCodePointValue(c);
            index += UTF16.getCharCount(c);
            if (intersectMasks(mask, pvIndex, columns)) {
                // No encoding is left; further input cannot change that.
                break;
            }
        }
        return selectForMask(mask);
    }
}