All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.avro.util.UtfTextUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.util;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Text utilities especially suited for UTF encoded bytes.
 *
 * <p>
 * When the character set is unknown, methods in this class assume UTF encoded
 * text and try to detect the UTF variant (8/16/32 bits, big/little endian),
 * using the BOM (if present) or an educated guess assuming the first character
 * is in the range U+0000-U+00FF. This heuristic works for all latin text based
 * formats, which includes Avro IDL, JSON, XML, etc. If the heuristic fails,
 * UTF-8 is assumed.
 * </p>
 *
 * @see <a href="https://www.w3.org/TR/xml/#sec-guessing">XML specification,
 *      appendix F: Autodetection of Character Encodings (Non-Normative)</a>
 */
public class UtfTextUtils {
  private static final int TRANSFER_BUFFER_SIZE = 4096;

  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32.
   */
  private static final Charset UTF_32 = Charset.forName("UTF-32");
  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32BE.
   */
  private static final Charset UTF_32BE = Charset.forName("UTF-32BE");
  /**
   * JVM standard character set (but that doesn't have a constant in
   * {@link StandardCharsets}) for UTF-32LE.
   */
  private static final Charset UTF_32LE = Charset.forName("UTF-32LE");

  /**
   * Decodes the specified bytes as text. If {@code charset} is {@code null},
   * the method will assume UTF encoded text and attempt to detect the
   * appropriate charset. A leading byte order mark (U+FEFF) is removed from the
   * result. An empty array yields an empty string.
   *
   * @param bytes   the bytes to decode
   * @param charset the character set of the bytes, if known
   * @return the decoded text, without a leading BOM
   * @throws IllegalArgumentException when {@code charset} is {@code null} and
   *                                  the bytes look like an unsupported UCS-4
   *                                  variant
   */
  public static String asString(byte[] bytes, Charset charset) {
    if (charset == null) {
      charset = detectUtfCharset(bytes);
    }
    return skipBOM(new String(bytes, charset));
  }

  /**
   * Reads the specified input stream as text. If {@code charset} is {@code null},
   * the method will assume UTF encoded text and attempt to detect the appropriate
   * charset. A leading byte order mark (U+FEFF) is removed from the result. An
   * empty stream yields an empty string.
   *
   * @param input   the input to read
   * @param charset the character set of the input, if known
   * @return all bytes, read into a string
   * @throws IOException when reading the input fails for some reason
   */
  public static String readAllBytes(InputStream input, Charset charset) throws IOException {
    if (charset == null) {
      // Peek at (up to) the first 4 bytes, then rewind so the reader sees them too.
      input = ensureMarkSupport(input);
      input.mark(4);
      byte[] buffer = new byte[4];
      int bytesRead = fillBuffer(input, buffer);
      input.reset();

      charset = detectUtfCharset0(buffer, bytesRead);
      if (charset == null) {
        throw new IOException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)");
      }
    }
    Reader reader = new InputStreamReader(input, charset);
    return readAllChars(reader);
  }

  /**
   * Returns the input itself if it supports mark/reset, otherwise wraps it in a
   * {@link BufferedInputStream}.
   */
  private static InputStream ensureMarkSupport(InputStream input) {
    if (input.markSupported()) {
      return input;
    } else {
      return new BufferedInputStream(input);
    }
  }

  /**
   * Reads from the stream until the buffer is full or the end of the stream is
   * reached.
   *
   * @return the number of bytes actually stored in {@code buf}
   */
  private static int fillBuffer(InputStream in, byte[] buf) throws IOException {
    int remaining = buf.length;
    int offset = 0;
    while (remaining > 0) {
      int bytesRead = in.read(buf, offset, remaining);
      // As remaining > 0, bytesRead is either -1 or positive
      if (bytesRead == -1) {
        break;
      }
      offset += bytesRead;
      remaining -= bytesRead;
    }
    return offset;
  }

  /**
   * Reads all characters from the specified reader. A leading byte order mark
   * (U+FEFF) is removed from the result. The reader is not closed by this
   * method.
   *
   * @param input the reader to drain
   * @return all characters read, without a leading BOM (empty if the reader
   *         yields no characters)
   * @throws IOException when reading the input fails for some reason
   */
  public static String readAllChars(Reader input) throws IOException {
    StringBuilder buffer = new StringBuilder();
    char[] charBuffer = new char[TRANSFER_BUFFER_SIZE];
    int charsRead;
    while ((charsRead = input.read(charBuffer, 0, TRANSFER_BUFFER_SIZE)) >= 0) {
      buffer.append(charBuffer, 0, charsRead);
    }
    return skipBOM(buffer);
  }

  /**
   * Returns the text without its leading byte order mark (U+FEFF), if any.
   */
  private static String skipBOM(CharSequence buffer) {
    // Guard against empty text: charAt(0) would throw on a zero-length sequence.
    if (buffer.length() > 0 && buffer.charAt(0) == '\uFEFF') {
      return buffer.subSequence(1, buffer.length()).toString();
    }
    return buffer.toString();
  }

  /**
   * Assuming UTF encoded bytes, detect the UTF variant (8/16/32 bits, big/little
   * endian).
   *
   * <p>
   * To ensure the most accurate detection, the algorithm requires at least 4
   * bytes. One should only provide less than 4 bytes of data if that is all there
   * is.
   * </p>
   *
   * <p>
   * Detection is certain when a byte order mark (BOM) is used. Otherwise a
   * heuristic is used, which works when the first character is from the first 256
   * characters from the BMP (U+0000-U+00FF). This works for all latin-based
   * textual formats, like Avro IDL, JSON, YAML, XML, etc.
   * </p>
   *
   * @param firstFewBytes the first few bytes of the text to detect the character
   *                      set of
   * @return the character set to use
   * @throws IllegalArgumentException when the bytes look like an unsupported
   *                                  UCS-4 variant (byte order 2143 or 3412)
   */
  public static Charset detectUtfCharset(byte[] firstFewBytes) {
    Charset detectedCharset = detectUtfCharset0(firstFewBytes, firstFewBytes.length);
    if (detectedCharset == null) {
      throw new IllegalArgumentException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)");
    }
    return detectedCharset;
  }

  /**
   * Detection workhorse: inspects at most the first 4 of {@code numBytes} bytes.
   *
   * @return the detected character set, or {@code null} for an unsupported UCS-4
   *         byte order
   */
  private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) {
    // spotless:off
    /*
     * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing
     * It omits non-UTF encodings (the 2nd and 3rd rows from the end).
     * Note that the order (with respect to UTF-32 & UTF-16) is important!
     *
     * (the non-zero bytes encode the byte order mark, BOM)
     *
     * Match the 'magic bytes' in order, and take the first match:
     * 00 00 FE FF -> UTF-32 (be)
     * FF FE 00 00 -> UTF-32 (le)
     * 00 00 FF FE -> unsupported UCS-4 (byte order 2143)
     * FE FF 00 00 -> unsupported UCS-4 (byte order 3412)
     * FE FF __ __ -> UTF-16 (be)
     * FF FE __ __ -> UTF-16 (le)
     * EF BB BF __ -> UTF-8
     * 00 00 00 __ -> UTF-32BE
     * __ 00 00 00 -> UTF-32LE
     * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143)
     * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412)
     * 00 __ __ __ -> UTF-16BE
     * __ 00 __ __ -> UTF-16LE
     * __ __ __ __ -> UTF-8 (fallback)
     */
    // spotless:on
    int quad = quad(firstFewBytes, numBytes);
    int word = quad >>> 16;
    if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) {
      // With BOM: UTF-32 (Charset handles BOM & endianness)
      return UTF_32;
    } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) {
      // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 3412)
      return null;
    } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) {
      // With BOM: UTF-16 (Charset handles BOM & endianness)
      return StandardCharsets.UTF_16;
    } else if (numBytes > 2 && quad >>> 8 == 0xEFBBBF) {
      // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it)
      return StandardCharsets.UTF_8;
    } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32BE;
    } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32LE;
    } else if (numBytes > 3 && ((quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0)) {
      // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143
      // resp. 3412). Both patterns need all 4 bytes, hence the parentheses.
      return null;
    } else if (numBytes > 1 && (word & 0xFF00) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16BE;
    } else if (numBytes > 1 && (word & 0x00FF) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16LE;
    } else {
      // Fallback
      return StandardCharsets.UTF_8;
    }
  }

  /**
   * Packs up to the first 4 bytes into an int, first byte in the most
   * significant position. Positions for which no byte is available are filled
   * with {@code 0xFF}.
   */
  private static int quad(byte[] bytes, int length) {
    int quad = 0xFFFFFFFF;
    switch (length) {
    default:
      quad = (quad & 0xFFFFFF00) | (bytes[3] & 0xFF);
      // Fallthrough
    case 3:
      quad = (quad & 0xFFFF00FF) | (bytes[2] & 0xFF) << 8;
      // Fallthrough
    case 2:
      quad = (quad & 0xFF00FFFF) | (bytes[1] & 0xFF) << 16;
      // Fallthrough
    case 1:
      quad = (quad & 0x00FFFFFF) | (bytes[0] & 0xFF) << 24;
      // Fallthrough
    case 0:
      break;
    }
    return quad;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy