// org.apache.sanselan.util.UnicodeUtils Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.sanselan.util;
import java.io.UnsupportedEncodingException;
import org.apache.sanselan.common.BinaryConstants;
public abstract class UnicodeUtils implements BinaryConstants
{
/**
* Private so that only code inside this file can construct or subclass this
* type. Instances are the nested metrics classes declared below, which are
* handed out by {@link #getInstance(int)}.
*/
private UnicodeUtils()
{
}
/**
 * Checked exception used by this class to report character data that cannot
 * be interpreted in the requested encoding, or an unknown encoding code.
 */
public static class UnicodeException extends Exception
{
    /**
     * Creates the exception with a human-readable description.
     *
     * @param message
     *            description of what was wrong with the data or request.
     */
    public UnicodeException(String message)
    {
        super(message);
    }
}
// Character encoding codes. getInstance(int) maps each of the non-negative
// codes below to the metrics implementation for that encoding.
// A default single-byte charset.
public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
// UTF-16 preceded by a byte order mark. Both "with BOM" codes select the same
// implementation, which reads the actual byte order from the mark itself.
public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
// UTF-16 without a byte order mark; the byte order is fixed by the code.
public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
// Not a concrete encoding: getInstance(int) has no case for this value and
// rejects it with a UnicodeException.
public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
// /*
// * Guess the character encoding of arbitrary character data in a data
// * buffer.
// *
// * The data may not run to the end of the buffer; it may be terminated.
// * This makes the problem much harder, since the character data may be
// * followed by arbitrary data.
// */
// public static int guessCharacterEncoding(byte bytes[], int index)
// {
// int length = bytes.length - index;
//
// if (length < 1)
// return CHAR_ENCODING_CODE_AMBIGUOUS;
//
// if (length >= 2)
// {
// // look for BOM.
//
// int c1 = 0xff & bytes[index];
// int c2 = 0xff & bytes[index + 1];
// if (c1 == 0xFF && c2 == 0xFE)
// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
// else if (c1 == 0xFE && c2 == 0xFF)
// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
// }
//
// }
//
// /*
// * Guess the character encoding of arbitrary character data in a data
// * buffer.
// *
// * The data fills the entire buffer. If it is terminated, the terminator
// * byte(s) will be the last bytes in the buffer.
// *
// * This makes the problem a bit easier.
// */
// public static int guessCharacterEncodingSimple(byte bytes[], int index)
// throws UnicodeException
// {
// int length = bytes.length - index;
//
// if (length < 1)
// return CHAR_ENCODING_CODE_AMBIGUOUS;
//
// if (length >= 2)
// {
// // identify or eliminate UTF-16 with a BOM.
//
// int c1 = 0xff & bytes[index];
// int c2 = 0xff & bytes[index + 1];
// if (c1 == 0xFF && c2 == 0xFE)
// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
// else if (c1 == 0xFE && c2 == 0xFF)
// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
// }
//
// if (length >= 2)
// {
// // look for optional double-byte terminator.
//
// int c1 = 0xff & bytes[bytes.length - 2];
// int c2 = 0xff & bytes[bytes.length - 1];
// if (c1 == 0 && c2 == 0)
// {
// // definitely a flavor of UTF-16.
// if (length % 2 != 0)
// throw new UnicodeException(
// "Character data with double-byte terminator has an odd length.");
//
// boolean mayHaveTerminator = true;
// boolean mustHaveTerminator = false;
// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
// mayHaveTerminator, mustHaveTerminator);
// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
// mayHaveTerminator, mustHaveTerminator);
// if ((!possibleBigEndian) && (!possibleLittleEndian))
// throw new UnicodeException(
// "Invalid character data, possibly UTF-16.");
// if (possibleBigEndian && possibleLittleEndian)
// return CHAR_ENCODING_CODE_AMBIGUOUS;
// if (possibleBigEndian)
// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
// if (possibleLittleEndian)
// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
// }
// }
//
// List possibleEncodings = new ArrayList();
// if (length % 2 == 0)
// {
// boolean mayHaveTerminator = true;
// boolean mustHaveTerminator = false;
// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
// mayHaveTerminator, mustHaveTerminator);
// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
// mayHaveTerminator, mustHaveTerminator);
//
// if (possibleBigEndian)
// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
// if (possibleLittleEndian)
// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
// }
//
// }
/**
 * Tests whether a string survives an ISO-8859-1 round trip unchanged.
 * <p>
 * The string is encoded to ISO-8859-1 bytes and decoded again; it is
 * reported as valid only if the decoded text equals the input.
 *
 * @param s
 *            the string to check.
 * @return true if encoding and then decoding {@code s} as ISO-8859-1
 *         reproduces it exactly.
 */
public static final boolean isValidISO_8859_1(String s)
{
    final String charsetName = "ISO-8859-1";
    final String decoded;
    try
    {
        final byte encoded[] = s.getBytes(charsetName);
        decoded = new String(encoded, charsetName);
    } catch (UnsupportedEncodingException e)
    {
        // should never be thrown.
        throw new RuntimeException("Error parsing string.", e);
    }
    return s.equals(decoded);
}
/*
 * Return the index of the first utf-16 terminator (ie. two nulls forming a
 * byte pair aligned on an even offset from the starting index). If not
 * found, return -1.
 *
 * The scan moves forward two bytes at a time from index and stops before a
 * trailing unpaired byte, so a buffer with fewer than two bytes remaining
 * yields -1.
 */
private static int findFirstDoubleByteTerminator(byte bytes[], int index)
{
    for (int i = index; i < bytes.length - 1; i += 2)
    {
        // Examine the pair at the current scan position, not at the
        // starting index, so every aligned pair is actually inspected.
        if (bytes[i] == 0 && bytes[i + 1] == 0)
            return i;
    }
    return -1;
}
/**
 * Finds the end of the character data that starts at {@code index},
 * counting the terminator as part of the data.
 * <p>
 * Equivalent to calling {@code findEnd(bytes, index, true)}.
 *
 * @param bytes
 *            buffer holding the character data.
 * @param index
 *            offset in {@code bytes} at which the character data starts.
 * @return the end offset reported by the encoding-specific
 *         {@code findEnd} implementation.
 * @throws UnicodeException
 *             if the encoding-specific implementation rejects the data.
 */
public final int findEndWithTerminator(byte bytes[], int index)
        throws UnicodeException
{
    final boolean includeTerminator = true;
    return findEnd(bytes, index, includeTerminator);
}
/**
 * Finds the end of the character data that starts at {@code index},
 * leaving the terminator out of the data.
 * <p>
 * Equivalent to calling {@code findEnd(bytes, index, false)}.
 *
 * @param bytes
 *            buffer holding the character data.
 * @param index
 *            offset in {@code bytes} at which the character data starts.
 * @return the end offset reported by the encoding-specific
 *         {@code findEnd} implementation.
 * @throws UnicodeException
 *             if the encoding-specific implementation rejects the data.
 */
public final int findEndWithoutTerminator(byte bytes[], int index)
        throws UnicodeException
{
    final boolean includeTerminator = false;
    return findEnd(bytes, index, includeTerminator);
}
/**
* Locates the end of the character data that starts at index, scanning
* forward for the encoding's null terminator.
*
* The implementations in this file return the offset just past the
* terminator when includeTerminator is true, the offset of the terminator
* itself when it is false, and bytes.length when the buffer ends before any
* terminator is seen.
*
* @param bytes
*            buffer holding the character data.
* @param index
*            offset in bytes at which the character data starts.
* @param includeTerminator
*            whether the returned offset should lie after the terminator
*            (true) or before it (false).
* @return the end offset as described above.
* @throws UnicodeException
*             if the implementation considers the data malformed.
*/
protected abstract int findEnd(byte bytes[], int index,
boolean includeTerminator) throws UnicodeException;
/**
 * Creates the metrics object for one of the {@code CHAR_ENCODING_CODE_*}
 * values. A new object is created on every call.
 *
 * @param charEncodingCode
 *            one of the {@code CHAR_ENCODING_CODE_*} constants.
 * @return a metrics object able to find the end of character data in that
 *         encoding.
 * @throws UnicodeException
 *             if the code is not one of the supported values (this
 *             includes {@code CHAR_ENCODING_CODE_AMBIGUOUS}).
 */
public static UnicodeUtils getInstance(int charEncodingCode)
        throws UnicodeException
{
    if (charEncodingCode == CHAR_ENCODING_CODE_ISO_8859_1)
        return new UnicodeMetricsASCII();

    if (charEncodingCode == CHAR_ENCODING_CODE_UTF_8)
        return new UnicodeMetricsUTF8();

    // Either BOM flavour: the byte order is read from the mark itself.
    if (charEncodingCode == CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM
            || charEncodingCode == CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM)
        return new UnicodeMetricsUTF16WithBOM();

    if (charEncodingCode == CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM)
        return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);

    if (charEncodingCode == CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM)
        return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);

    throw new UnicodeException("Unknown char encoding code: "
            + charEncodingCode);
}
/**
 * Metrics for single-byte character data terminated by a single null byte.
 */
private static class UnicodeMetricsASCII extends UnicodeUtils
{
    /**
     * Scans forward from {@code index} for the first zero byte.
     *
     * @return the offset after the zero byte if {@code includeTerminator}
     *         is true, the offset of the zero byte otherwise, or
     *         {@code bytes.length} if no zero byte is found.
     */
    public int findEnd(byte bytes[], int index, boolean includeTerminator)
            throws UnicodeException
    {
        int position = index;
        while (position < bytes.length)
        {
            if (bytes[position] == 0)
            {
                if (includeTerminator)
                    return position + 1;
                return position;
            }
            position++;
        }
        // No terminator: the data is taken to run to the end of the buffer.
        return bytes.length;
    }
}
// private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
// {
// public int findEnd(byte bytes[], int index, boolean includeTerminator)
// throws UnicodeException
// {
// for (int i = index; i < bytes.length; i++)
// {
// if (bytes[i] == 0)
// return includeTerminator ? i + 1 : i;
// }
// return bytes.length;
// // throw new UnicodeException("Terminator not found.");
// }
// }
/**
 * Metrics for UTF-8 character data terminated by a single null byte.
 */
private static class UnicodeMetricsUTF8 extends UnicodeUtils
{
    /**
     * Walks the UTF-8 sequences starting at {@code index} until a null
     * byte or the end of the buffer is reached.
     *
     * @param bytes
     *            buffer holding the character data.
     * @param index
     *            offset at which the character data starts.
     * @param includeTerminator
     *            whether the returned offset lies after (true) or before
     *            (false) the null terminator.
     * @return the end offset, or {@code bytes.length} if the buffer ends
     *         on a sequence boundary without a terminator.
     * @throws UnicodeException
     *             if {@code index} is beyond the buffer, a lead byte is
     *             not a legal UTF-8 lead byte, a sequence is truncated by
     *             the end of the buffer, or a continuation byte is outside
     *             0x80..0xBF.
     */
    public int findEnd(byte bytes[], int index, boolean includeTerminator)
            throws UnicodeException
    {
        // http://en.wikipedia.org/wiki/UTF-8
        while (true)
        {
            if (index == bytes.length)
                return bytes.length;
            if (index > bytes.length)
                throw new UnicodeException("Terminator not found.");

            int c1 = 0xff & bytes[index++];
            if (c1 == 0)
                return includeTerminator ? index : index - 1;

            // Number of continuation bytes announced by the lead byte.
            int continuationCount;
            if (c1 <= 0x7F)
                continuationCount = 0;
            else if (c1 < 0xC2)
                // 0x80..0xBF are continuation bytes and cannot start a
                // sequence; 0xC0 and 0xC1 could only encode overlong forms.
                throw new UnicodeException("Invalid code point.");
            else if (c1 <= 0xDF)
                continuationCount = 1;
            else if (c1 <= 0xEF)
                continuationCount = 2;
            else if (c1 <= 0xF4)
                continuationCount = 3;
            else
                throw new UnicodeException("Invalid code point.");

            // The whole sequence must fit in the buffer.
            if (index > bytes.length - continuationCount)
                throw new UnicodeException("Invalid unicode.");

            for (int i = 0; i < continuationCount; i++)
            {
                int c = 0xff & bytes[index++];
                if (c < 0x80 || c > 0xBF)
                    throw new UnicodeException("Invalid code point.");
            }
        }
    }
}
/**
 * Shared UTF-16 logic for data terminated by a 16-bit null code unit. The
 * byte order decides which byte of each unit is inspected when looking for
 * surrogate pairs.
 */
private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
{
    // These names must denote the same values as the BinaryConstants
    // constants of the same name, because getInstance(int) passes the
    // BinaryConstants values to the constructor. They are therefore
    // aliases rather than independent numbers.
    protected static final int BYTE_ORDER_BIG_ENDIAN = BinaryConstants.BYTE_ORDER_BIG_ENDIAN;
    protected static final int BYTE_ORDER_LITTLE_ENDIAN = BinaryConstants.BYTE_ORDER_LITTLE_ENDIAN;

    protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;

    /**
     * @param byteOrder
     *            BYTE_ORDER_BIG_ENDIAN or BYTE_ORDER_LITTLE_ENDIAN.
     */
    public UnicodeMetricsUTF16(int byteOrder)
    {
        this.byteOrder = byteOrder;
    }

    /** Picks the most significant byte of a unit given its two bytes in buffer order. */
    private int mostSignificantByte(int first, int second)
    {
        return byteOrder == BYTE_ORDER_BIG_ENDIAN ? first : second;
    }

    /** True if msb is the high byte of a lead (high) surrogate, U+D800..U+DBFF. */
    private static boolean isHighSurrogate(int msb)
    {
        return msb >= 0xD8 && msb <= 0xDB;
    }

    /** True if msb is the high byte of a trail (low) surrogate, U+DC00..U+DFFF. */
    private static boolean isLowSurrogate(int msb)
    {
        return msb >= 0xDC && msb <= 0xDF;
    }

    /**
     * Checks whether the bytes from {@code index} onwards form well-formed
     * UTF-16 in this object's byte order.
     *
     * @param bytes
     *            buffer holding the character data.
     * @param index
     *            offset at which the character data starts.
     * @param mayHaveTerminator
     *            result to report when a null code unit is encountered.
     * @param mustHaveTerminator
     *            if true, reaching the end of the buffer without a null
     *            code unit makes the data invalid.
     * @return false for a trailing odd byte, an unpaired surrogate or a
     *         truncated surrogate pair; otherwise as described for the
     *         two terminator flags.
     */
    public boolean isValid(byte bytes[], int index,
            boolean mayHaveTerminator, boolean mustHaveTerminator)
            throws UnicodeException
    {
        // http://en.wikipedia.org/wiki/UTF-16/UCS-2
        while (true)
        {
            if (index == bytes.length)
            {
                // end of buffer, no terminator found.
                return !mustHaveTerminator;
            }
            if (index >= bytes.length - 1)
            {
                // end of odd-length buffer, no terminator found.
                return false;
            }

            int c1 = 0xff & bytes[index++];
            int c2 = 0xff & bytes[index++];
            if (c1 == 0 && c2 == 0)
            {
                // terminator found.
                return mayHaveTerminator;
            }

            int msb1 = mostSignificantByte(c1, c2);
            if (isLowSurrogate(msb1))
            {
                // trail surrogate without a preceding lead surrogate.
                return false;
            }
            if (isHighSurrogate(msb1))
            {
                if (index >= bytes.length - 1)
                {
                    // missing second surrogate.
                    return false;
                }
                // second word.
                int c3 = 0xff & bytes[index++];
                int c4 = 0xff & bytes[index++];
                if (!isLowSurrogate(mostSignificantByte(c3, c4)))
                {
                    // invalid second surrogate.
                    return false;
                }
            }
            // Anything else, including U+E000..U+FFFF, is a complete
            // character in a single code unit.
        }
    }

    /**
     * Walks the UTF-16 code units starting at {@code index} until a null
     * code unit or the end of the buffer is reached.
     *
     * @param bytes
     *            buffer holding the character data.
     * @param index
     *            offset at which the character data starts.
     * @param includeTerminator
     *            whether the returned offset lies after (true) or before
     *            (false) the two-byte terminator.
     * @return the end offset, or {@code bytes.length} if the buffer ends
     *         on a code unit boundary without a terminator.
     * @throws UnicodeException
     *             if the buffer ends in the middle of a code unit or of a
     *             surrogate pair, or if a surrogate is unpaired.
     */
    public int findEnd(byte bytes[], int index, boolean includeTerminator)
            throws UnicodeException
    {
        // http://en.wikipedia.org/wiki/UTF-16/UCS-2
        while (true)
        {
            if (index == bytes.length)
                return bytes.length;
            // Fewer than two bytes left: a code unit cannot be read.
            if (index >= bytes.length - 1)
                throw new UnicodeException("Terminator not found.");

            int c1 = 0xff & bytes[index++];
            int c2 = 0xff & bytes[index++];
            if (c1 == 0 && c2 == 0)
                return includeTerminator ? index : index - 2;

            int msb1 = mostSignificantByte(c1, c2);
            if (isLowSurrogate(msb1))
                throw new UnicodeException("Invalid code point.");
            if (isHighSurrogate(msb1))
            {
                if (index >= bytes.length - 1)
                    throw new UnicodeException("Terminator not found.");
                // second word.
                int c3 = 0xff & bytes[index++];
                int c4 = 0xff & bytes[index++];
                if (!isLowSurrogate(mostSignificantByte(c3, c4)))
                    throw new UnicodeException("Invalid code point.");
            }
        }
    }
}
/**
 * UTF-16 metrics for data that carries no byte order mark; the byte order
 * is supplied by the creator and used as given.
 */
private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
{
    /**
     * @param fixedByteOrder
     *            byte order constant handed straight to the UTF-16 base
     *            class.
     */
    public UnicodeMetricsUTF16NoBOM(final int fixedByteOrder)
    {
        super(fixedByteOrder);
    }
}
/**
 * UTF-16 metrics for data that begins with a byte order mark. The mark is
 * consumed first and decides the byte order used for the rest of the scan.
 */
private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
{
    /**
     * Starts out as big endian; the value is replaced by whatever the mark
     * says each time findEnd succeeds in reading one.
     */
    public UnicodeMetricsUTF16WithBOM()
    {
        super(BYTE_ORDER_BIG_ENDIAN);
    }

    /**
     * Reads the two-byte mark at {@code index}, records the byte order it
     * announces, then delegates to the base class for the bytes after it.
     *
     * @throws UnicodeException
     *             if fewer than two bytes are available for the mark, if
     *             the two bytes are neither FF FE nor FE FF, or if the
     *             base class rejects the remaining data.
     */
    public int findEnd(byte bytes[], int index, boolean includeTerminator)
            throws UnicodeException
    {
        // http://en.wikipedia.org/wiki/UTF-16/UCS-2
        if (bytes.length - 1 <= index)
            throw new UnicodeException("Missing BOM.");

        final int first = 0xff & bytes[index];
        final int second = 0xff & bytes[index + 1];
        final boolean littleEndianMark = first == 0xFF && second == 0xFE;
        final boolean bigEndianMark = first == 0xFE && second == 0xFF;

        if (!littleEndianMark && !bigEndianMark)
            throw new UnicodeException("Invalid byte order mark.");

        byteOrder = littleEndianMark ? BYTE_ORDER_LITTLE_ENDIAN
                : BYTE_ORDER_BIG_ENDIAN;

        // Continue with the first code unit after the mark.
        return super.findEnd(bytes, index + 2, includeTerminator);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy