// org.apache.avro.util.UtfTextUtils Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro.util;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Text utilities especially suited for UTF encoded bytes.
 *
 * <p>
 * When the character set is unknown, methods in this class assume UTF encoded
 * text and try to detect the UTF variant (8/16/32 bits, big/little endian),
 * using the BOM (if present) or an educated guess assuming the first character
 * is in the range U+0000-U+00FF. This heuristic works for all latin text based
 * formats, which includes Avro IDL, JSON, XML, etc. If the heuristic fails,
 * UTF-8 is assumed.
 * </p>
 *
 * @see <a href="https://www.w3.org/TR/xml/#sec-guessing">XML specification,
 *      appendix F: Autodetection of Character Encodings (Non-Normative)</a>
 */
public class UtfTextUtils {
  /** Number of characters transferred per read when draining a {@link Reader}. */
  private static final int TRANSFER_BUFFER_SIZE = 4096;

  /** The byte order mark as it appears after decoding (U+FEFF). */
  private static final char BOM = '\uFEFF';

  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32.
   */
  private static final Charset UTF_32 = Charset.forName("UTF-32");
  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32BE.
   */
  private static final Charset UTF_32BE = Charset.forName("UTF-32BE");
  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32LE.
   */
  private static final Charset UTF_32LE = Charset.forName("UTF-32LE");

  /**
   * Decodes the specified bytes as text. If {@code charset} is {@code null}, the
   * method will assume UTF encoded text and attempt to detect the appropriate
   * charset using {@link #detectUtfCharset(byte[])}. A leading byte order mark
   * that survives decoding is removed from the result.
   *
   * @param bytes   the bytes to decode (may be empty, yielding an empty string)
   * @param charset the character set of the bytes, if known
   * @return the decoded text, without a leading BOM
   * @throws IllegalArgumentException when {@code charset} is {@code null} and
   *                                  the bytes look like an unsupported UCS-4
   *                                  variant
   */
  public static String asString(byte[] bytes, Charset charset) {
    if (charset == null) {
      charset = detectUtfCharset(bytes);
    }
    return skipBOM(new String(bytes, charset));
  }

  /**
   * Reads the specified input stream as text. If {@code charset} is {@code null},
   * the method will assume UTF encoded text and attempt to detect the appropriate
   * charset.
   *
   * <p>
   * Detection peeks at the first (up to) 4 bytes using mark/reset; an input that
   * does not support marking is wrapped in a {@link BufferedInputStream} first.
   * This method does not close the input.
   * </p>
   *
   * @param input   the input to read
   * @param charset the character set of the input, if known
   * @return all bytes, read into a string
   * @throws IOException when reading the input fails for some reason, or when
   *                     the detected encoding is an unsupported UCS-4 variant
   */
  public static String readAllBytes(InputStream input, Charset charset) throws IOException {
    if (charset == null) {
      input = ensureMarkSupport(input);
      input.mark(4);
      byte[] buffer = new byte[4];
      int bytesRead = fillBuffer(input, buffer);
      // Rewind, so the peeked bytes are decoded along with the rest of the input.
      input.reset();

      charset = detectUtfCharset0(buffer, bytesRead);
      if (charset == null) {
        throw new IOException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)");
      }
    }
    Reader reader = new InputStreamReader(input, charset);
    return readAllChars(reader);
  }

  /**
   * Returns the input itself if it supports mark/reset, or a buffered wrapper
   * around it otherwise.
   */
  private static InputStream ensureMarkSupport(InputStream input) {
    if (input.markSupported()) {
      return input;
    } else {
      return new BufferedInputStream(input);
    }
  }

  /**
   * Reads from the stream until the buffer is full or the end of the stream is
   * reached, whichever comes first.
   *
   * @return the number of bytes actually stored in {@code buf}
   */
  private static int fillBuffer(InputStream in, byte[] buf) throws IOException {
    int remaining = buf.length;
    int offset = 0;
    while (remaining > 0) {
      int bytesRead = in.read(buf, offset, remaining);
      // As remaining > 0, bytesRead is either -1 or positive
      if (bytesRead == -1) {
        break;
      }
      offset += bytesRead;
      remaining -= bytesRead;
    }
    return offset;
  }

  /**
   * Reads all characters from the specified reader into a string, removing a
   * leading byte order mark if there is one. This method does not close the
   * reader.
   *
   * @param input the reader to drain
   * @return all characters read (an empty string if the reader has no content)
   * @throws IOException when reading the input fails for some reason
   */
  public static String readAllChars(Reader input) throws IOException {
    StringBuilder buffer = new StringBuilder();
    char[] charBuffer = new char[TRANSFER_BUFFER_SIZE];
    int charsRead;
    while ((charsRead = input.read(charBuffer, 0, TRANSFER_BUFFER_SIZE)) >= 0) {
      buffer.append(charBuffer, 0, charsRead);
    }
    return skipBOM(buffer);
  }

  /**
   * Returns the text without its leading byte order mark (if any). Empty input
   * yields an empty string.
   */
  private static String skipBOM(CharSequence buffer) {
    // Guard against empty text: there is no first character to inspect.
    if (buffer.length() > 0 && buffer.charAt(0) == BOM) {
      return buffer.subSequence(1, buffer.length()).toString();
    }
    return buffer.toString();
  }

  /**
   * Assuming UTF encoded bytes, detect the UTF variant (8/16/32 bits, big/little
   * endian).
   *
   * <p>
   * To ensure the most accurate detection, the algorithm requires at least 4
   * bytes. One should only provide less than 4 bytes of data if that is all there
   * is.
   * </p>
   *
   * <p>
   * Detection is certain when a byte order mark (BOM) is used. Otherwise a
   * heuristic is used, which works when the first character is from the first 256
   * characters from the BMP (U+0000-U+00FF). This works for all latin-based
   * textual formats, like Avro IDL, JSON, YAML, XML, etc.
   * </p>
   *
   * @param firstFewBytes the first few bytes of the text to detect the character
   *                      set of
   * @return the character set to use
   * @throws IllegalArgumentException when the bytes match an unsupported UCS-4
   *                                  variant (byte order 2143 or 3412)
   */
  public static Charset detectUtfCharset(byte[] firstFewBytes) {
    Charset detectedCharset = detectUtfCharset0(firstFewBytes, firstFewBytes.length);
    if (detectedCharset == null) {
      throw new IllegalArgumentException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)");
    }
    return detectedCharset;
  }

  /**
   * Detects the UTF variant from the first {@code numBytes} bytes of the array.
   *
   * @return the detected character set, or {@code null} when the bytes match an
   *         unsupported UCS-4 variant
   */
  private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) {
    // spotless:off
    /*
     * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing
     * It omits non-UTF encodings (the 2nd and 3rd rows from the end).
     * Note that the order (with respect to UTF-32 & UTF-16) is important!
     *
     * (the non-zero bytes encode the byte order mark, BOM)
     *
     * Match the 'magic bytes' in order, and take the first match:
     * 00 00 FE FF -> UTF-32 (be)
     * FF FE 00 00 -> UTF-32 (le)
     * 00 00 FF FE -> unsupported UCS-4 (byte order 2143)
     * FE FF 00 00 -> unsupported UCS-4 (byte order 3412)
     * FE FF __ __ -> UTF-16 (be)
     * FF FE __ __ -> UTF-16 (le)
     * EF BB BF __ -> UTF-8
     * 00 00 00 __ -> UTF-32BE
     * __ 00 00 00 -> UTF-32LE
     * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143)
     * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412)
     * 00 __ __ __ -> UTF-16BE
     * __ 00 __ __ -> UTF-16LE
     * __ __ __ __ -> UTF-8 (fallback)
     */
    // spotless:on
    int quad = quad(firstFewBytes, numBytes);
    int word = quad >>> 16;
    if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) {
      // With BOM: UTF-32 (Charset handles BOM & endianness)
      return UTF_32;
    } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) {
      // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 3412)
      return null;
    } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) {
      // With BOM: UTF-16 (Charset handles BOM & endianness)
      return StandardCharsets.UTF_16;
    } else if (numBytes > 2 && quad >>> 8 == 0xEFBBBF) {
      // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it)
      return StandardCharsets.UTF_8;
    } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32BE;
    } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32LE;
    } else if (numBytes > 3 && ((quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0)) {
      // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143
      // resp. 3412). The length guard explicitly covers both patterns.
      return null;
    } else if (numBytes > 1 && (word & 0xFF00) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16BE;
    } else if (numBytes > 1 && (word & 0x00FF) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16LE;
    } else {
      // Fallback
      return StandardCharsets.UTF_8;
    }
  }

  /**
   * Packs the first (up to) 4 bytes into an int, first byte in the most
   * significant position. Positions for which no byte is available keep the
   * value 0xFF, so they never match the zero-byte patterns of the lookup table.
   *
   * @param bytes  the bytes to pack
   * @param length the number of usable bytes in {@code bytes}; values of 4 and
   *               above all use exactly the first 4 bytes
   */
  private static int quad(byte[] bytes, int length) {
    int quad = 0xFFFFFFFF;
    switch (length) {
    default:
      quad = (quad & 0xFFFFFF00) | (bytes[3] & 0xFF);
      // Fallthrough
    case 3:
      quad = (quad & 0xFFFF00FF) | (bytes[2] & 0xFF) << 8;
      // Fallthrough
    case 2:
      quad = (quad & 0xFF00FFFF) | (bytes[1] & 0xFF) << 16;
      // Fallthrough
    case 1:
      quad = (quad & 0x00FFFFFF) | (bytes[0] & 0xFF) << 24;
      // Fallthrough
    case 0:
      break;
    }
    return quad;
  }
}