All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.io.Text Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.nio.charset.StandardCharsets;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;

import org.apache.avro.reflect.Stringable;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  

In * addition, it provides methods for string traversal without converting the * byte array to a string.

Also includes utilities for * serializing/deserialing a string, coding/decoding a string, checking if a * byte array contains valid UTF8 code, calculating the length of an encoded * string. */ @Stringable @InterfaceAudience.Public @InterfaceStability.Stable public class Text extends BinaryComparable implements WritableComparable { private static final ThreadLocal ENCODER_FACTORY = new ThreadLocal() { @Override protected CharsetEncoder initialValue() { return StandardCharsets.UTF_8.newEncoder(). onMalformedInput(CodingErrorAction.REPORT). onUnmappableCharacter(CodingErrorAction.REPORT); } }; private static final ThreadLocal DECODER_FACTORY = new ThreadLocal() { @Override protected CharsetDecoder initialValue() { return StandardCharsets.UTF_8.newDecoder(). onMalformedInput(CodingErrorAction.REPORT). onUnmappableCharacter(CodingErrorAction.REPORT); } }; // max size of the byte array, seems to be a safe choice for multiple JVMs // (see ArrayList.MAX_ARRAY_SIZE) private static final int ARRAY_MAX_SIZE = Integer.MAX_VALUE - 8; private static final byte[] EMPTY_BYTES = new byte[0]; private byte[] bytes = EMPTY_BYTES; private int length = 0; private int textLength = -1; /** * Construct an empty text string. */ public Text() { } /** * Construct from a string. * @param string input string. */ public Text(String string) { set(string); } /** * Construct from another text. * @param utf8 input utf8. */ public Text(Text utf8) { set(utf8); } /** * Construct from a byte array. * * @param utf8 input utf8. */ public Text(byte[] utf8) { set(utf8); } /** * @return Get a copy of the bytes that is exactly the length of the data. * See {@link #getBytes()} for faster access to the underlying array. */ public byte[] copyBytes() { return Arrays.copyOf(bytes, length); } /** * Returns the raw bytes; however, only data up to {@link #getLength()} is * valid. Please use {@link #copyBytes()} if you * need the returned array to be precisely the length of the data. */ @Override public byte[] getBytes() { return bytes; } /** * Returns the number of bytes in the byte array. */ @Override public int getLength() { return length; } /** * @return Returns the length of this text. The length is equal to the number of * Unicode code units in the text. */ public int getTextLength() { if (textLength < 0) { textLength = toString().length(); } return textLength; } /** * Returns the Unicode Scalar Value (32-bit integer value) * for the character at position. Note that this * method avoids using the converter or doing String instantiation. * * @param position input position. * @return the Unicode scalar value at position or -1 * if the position is invalid or points to a * trailing byte */ public int charAt(int position) { if (position > this.length) return -1; // too long if (position < 0) return -1; // duh. ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position); return bytesToCodePoint(bb.slice()); } public int find(String what) { return find(what, 0); } /** * Finds any occurrence of what in the backing * buffer, starting as position start. The starting * position is measured in bytes and the return value is in * terms of byte position in the buffer. The backing buffer is * not converted to a string for this operation. * * @param what input what. * @param start input start. * @return byte position of the first occurrence of the search * string in the UTF-8 buffer or -1 if not found */ public int find(String what, int start) { try { ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length); ByteBuffer tgt = encode(what); byte b = tgt.get(); src.position(start); while (src.hasRemaining()) { if (b == src.get()) { // matching first byte src.mark(); // save position in loop tgt.mark(); // save position in target boolean found = true; int pos = src.position()-1; while (tgt.hasRemaining()) { if (!src.hasRemaining()) { // src expired first tgt.reset(); src.reset(); found = false; break; } if (!(tgt.get() == src.get())) { tgt.reset(); src.reset(); found = false; break; // no match } } if (found) return pos; } } return -1; // not found } catch (CharacterCodingException e) { throw new RuntimeException("Should not have happened", e); } } /** * Set to contain the contents of a string. * * @param string input string. */ public void set(String string) { try { ByteBuffer bb = encode(string, true); bytes = bb.array(); length = bb.limit(); textLength = string.length(); } catch (CharacterCodingException e) { throw new RuntimeException("Should not have happened", e); } } /** * Set to a utf8 byte array. If the length of utf8 is * zero, actually clear {@link #bytes} and any existing * data is lost. * * @param utf8 input utf8. */ public void set(byte[] utf8) { if (utf8.length == 0) { bytes = EMPTY_BYTES; length = 0; textLength = -1; } else { set(utf8, 0, utf8.length); } } /** * Copy a text. * @param other other. */ public void set(Text other) { set(other.getBytes(), 0, other.getLength()); this.textLength = other.textLength; } /** * Set the Text to range of bytes. * * @param utf8 the data to copy from * @param start the first position of the new string * @param len the number of bytes of the new string */ public void set(byte[] utf8, int start, int len) { ensureCapacity(len); System.arraycopy(utf8, start, bytes, 0, len); this.length = len; this.textLength = -1; } /** * Append a range of bytes to the end of the given text. * * @param utf8 the data to copy from * @param start the first position to append from utf8 * @param len the number of bytes to append */ public void append(byte[] utf8, int start, int len) { byte[] original = bytes; if (ensureCapacity(length + len)) { System.arraycopy(original, 0, bytes, 0, length); } System.arraycopy(utf8, start, bytes, length, len); length += len; textLength = -1; } /** * Clear the string to empty. * * Note: For performance reasons, this call does not clear the * underlying byte array that is retrievable via {@link #getBytes()}. * In order to free the byte-array memory, call {@link #set(byte[])} * with an empty byte array (For example, new byte[0]). */ public void clear() { length = 0; textLength = -1; } /** * Sets the capacity of this Text object to at least * capacity bytes. If the current buffer is longer, then the * capacity and existing content of the buffer are unchanged. If * capacity is larger than the current capacity, the Text * object's capacity is increased to match and any existing data is lost. * * @param capacity the number of bytes we need * @return true if the internal array was resized or false otherwise */ private boolean ensureCapacity(final int capacity) { if (bytes.length < capacity) { // Try to expand the backing array by the factor of 1.5x // (by taking the current size + diving it by half). // // If the calculated value is beyond the size // limit, we cap it to ARRAY_MAX_SIZE long targetSizeLong = bytes.length + (bytes.length >> 1); int targetSize = (int)Math.min(targetSizeLong, ARRAY_MAX_SIZE); targetSize = Math.max(capacity, targetSize); bytes = new byte[targetSize]; return true; } return false; } @Override public String toString() { try { return decode(bytes, 0, length); } catch (CharacterCodingException e) { throw new RuntimeException("Should not have happened", e); } } @Override public void readFields(DataInput in) throws IOException { int newLength = WritableUtils.readVInt(in); readWithKnownLength(in, newLength); } public void readFields(DataInput in, int maxLength) throws IOException { int newLength = WritableUtils.readVInt(in); if (newLength < 0) { throw new IOException("tried to deserialize " + newLength + " bytes of data! newLength must be non-negative."); } else if (newLength >= maxLength) { throw new IOException("tried to deserialize " + newLength + " bytes of data, but maxLength = " + maxLength); } readWithKnownLength(in, newLength); } /** * Skips over one Text in the input. * @param in input in. * @throws IOException raised on errors performing I/O. */ public static void skip(DataInput in) throws IOException { int length = WritableUtils.readVInt(in); WritableUtils.skipFully(in, length); } /** * Read a Text object whose length is already known. * This allows creating Text from a stream which uses a different serialization * format. * * @param in input in. * @param len input len. * @throws IOException raised on errors performing I/O. */ public void readWithKnownLength(DataInput in, int len) throws IOException { ensureCapacity(len); in.readFully(bytes, 0, len); length = len; textLength = -1; } /** * Serialize. Write this object to out length uses zero-compressed encoding. * * @see Writable#write(DataOutput) */ @Override public void write(DataOutput out) throws IOException { WritableUtils.writeVInt(out, length); out.write(bytes, 0, length); } public void write(DataOutput out, int maxLength) throws IOException { if (length > maxLength) { throw new IOException("data was too long to write! Expected " + "less than or equal to " + maxLength + " bytes, but got " + length + " bytes."); } WritableUtils.writeVInt(out, length); out.write(bytes, 0, length); } /** * Returns true iff o is a Text with the same length and same * contents. */ @Override public boolean equals(Object o) { if (o instanceof Text) return super.equals(o); return false; } @Override public int hashCode() { return super.hashCode(); } /** A WritableComparator optimized for Text keys. */ public static class Comparator extends WritableComparator { public Comparator() { super(Text.class); } @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int n1 = WritableUtils.decodeVIntSize(b1[s1]); int n2 = WritableUtils.decodeVIntSize(b2[s2]); return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2); } } static { // register this comparator WritableComparator.define(Text.class, new Comparator()); } /// STATIC UTILITIES FROM HERE DOWN /** * @return Converts the provided byte array to a String using the * UTF-8 encoding. If the input is malformed, * replace by a default value. * * @param utf8 input utf8. * @throws CharacterCodingException when a character * encoding or decoding error occurs. */ public static String decode(byte[] utf8) throws CharacterCodingException { return decode(ByteBuffer.wrap(utf8), true); } public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException { return decode(ByteBuffer.wrap(utf8, start, length), true); } /** * @return Converts the provided byte array to a String using the * UTF-8 encoding. If replace is true, then * malformed input is replaced with the * substitution character, which is U+FFFD. Otherwise the * method throws a MalformedInputException. * * @param utf8 input utf8. * @param start input start. * @param length input length. * @param replace input replace. * @throws CharacterCodingException when a character * encoding or decoding error occurs. */ public static String decode(byte[] utf8, int start, int length, boolean replace) throws CharacterCodingException { return decode(ByteBuffer.wrap(utf8, start, length), replace); } private static String decode(ByteBuffer utf8, boolean replace) throws CharacterCodingException { CharsetDecoder decoder = DECODER_FACTORY.get(); if (replace) { decoder.onMalformedInput( java.nio.charset.CodingErrorAction.REPLACE); decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); } String str = decoder.decode(utf8).toString(); // set decoder back to its default value: REPORT if (replace) { decoder.onMalformedInput(CodingErrorAction.REPORT); decoder.onUnmappableCharacter(CodingErrorAction.REPORT); } return str; } /** * Converts the provided String to bytes using the * UTF-8 encoding. If the input is malformed, * invalid chars are replaced by a default value. * * @param string input string. * @return ByteBuffer: bytes stores at ByteBuffer.array() * and length is ByteBuffer.limit() * @throws CharacterCodingException when a character * encoding or decoding error occurs. */ public static ByteBuffer encode(String string) throws CharacterCodingException { return encode(string, true); } /** * Converts the provided String to bytes using the * UTF-8 encoding. If replace is true, then * malformed input is replaced with the * substitution character, which is U+FFFD. Otherwise the * method throws a MalformedInputException. * * @param string input string. * @param replace input replace. * @return ByteBuffer: bytes stores at ByteBuffer.array() * and length is ByteBuffer.limit() * @throws CharacterCodingException when a character * encoding or decoding error occurs. */ public static ByteBuffer encode(String string, boolean replace) throws CharacterCodingException { CharsetEncoder encoder = ENCODER_FACTORY.get(); if (replace) { encoder.onMalformedInput(CodingErrorAction.REPLACE); encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); } ByteBuffer bytes = encoder.encode(CharBuffer.wrap(string.toCharArray())); if (replace) { encoder.onMalformedInput(CodingErrorAction.REPORT); encoder.onUnmappableCharacter(CodingErrorAction.REPORT); } return bytes; } static final public int DEFAULT_MAX_LEN = 1024 * 1024; /** * @return Read a UTF8 encoded string from in. * @param in input in. * @throws IOException raised on errors performing I/O. */ public static String readString(DataInput in) throws IOException { return readString(in, Integer.MAX_VALUE); } /** * @return Read a UTF8 encoded string with a maximum size. * @param in input datainput. * @param maxLength input maxLength. * @throws IOException raised on errors performing I/O. */ public static String readString(DataInput in, int maxLength) throws IOException { int length = WritableUtils.readVIntInRange(in, 0, maxLength); byte [] bytes = new byte[length]; in.readFully(bytes, 0, length); return decode(bytes); } /** * Write a UTF8 encoded string to out. * * @param out input out. * @param s input s. * @throws IOException raised on errors performing I/O. * @return a UTF8 encoded string to out. */ public static int writeString(DataOutput out, String s) throws IOException { ByteBuffer bytes = encode(s); int length = bytes.limit(); WritableUtils.writeVInt(out, length); out.write(bytes.array(), 0, length); return length; } /** * @return Write a UTF8 encoded string with a maximum size to out. * * @param out input out. * @param s input s. * @param maxLength input maxLength. * @throws IOException raised on errors performing I/O. */ public static int writeString(DataOutput out, String s, int maxLength) throws IOException { ByteBuffer bytes = encode(s); int length = bytes.limit(); if (length > maxLength) { throw new IOException("string was too long to write! Expected " + "less than or equal to " + maxLength + " bytes, but got " + length + " bytes."); } WritableUtils.writeVInt(out, length); out.write(bytes.array(), 0, length); return length; } ////// states for validateUTF8 private static final int LEAD_BYTE = 0; private static final int TRAIL_BYTE_1 = 1; private static final int TRAIL_BYTE = 2; /** * Check if a byte array contains valid UTF-8. * * @param utf8 byte array * @throws MalformedInputException if the byte array contains invalid UTF-8 */ public static void validateUTF8(byte[] utf8) throws MalformedInputException { validateUTF8(utf8, 0, utf8.length); } /** * Check to see if a byte array is valid UTF-8. * * @param utf8 the array of bytes * @param start the offset of the first byte in the array * @param len the length of the byte sequence * @throws MalformedInputException if the byte array contains invalid bytes */ public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { int count = start; int leadByte = 0; int length = 0; int state = LEAD_BYTE; while (count < start+len) { int aByte = utf8[count] & 0xFF; switch (state) { case LEAD_BYTE: leadByte = aByte; length = bytesFromUTF8[aByte]; switch (length) { case 0: // check for ASCII if (leadByte > 0x7F) throw new MalformedInputException(count); break; case 1: if (leadByte < 0xC2 || leadByte > 0xDF) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; case 2: if (leadByte < 0xE0 || leadByte > 0xEF) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; case 3: if (leadByte < 0xF0 || leadByte > 0xF4) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; default: // too long! Longest valid UTF-8 is 4 bytes (lead + three) // or if < 0 we got a trail byte in the lead byte position throw new MalformedInputException(count); } // switch (length) break; case TRAIL_BYTE_1: if (leadByte == 0xF0 && aByte < 0x90) throw new MalformedInputException(count); if (leadByte == 0xF4 && aByte > 0x8F) throw new MalformedInputException(count); if (leadByte == 0xE0 && aByte < 0xA0) throw new MalformedInputException(count); if (leadByte == 0xED && aByte > 0x9F) throw new MalformedInputException(count); // falls through to regular trail-byte test!! case TRAIL_BYTE: if (aByte < 0x80 || aByte > 0xBF) throw new MalformedInputException(count); if (--length == 0) { state = LEAD_BYTE; } else { state = TRAIL_BYTE; } break; default: break; } // switch (state) count++; } } /** * Magic numbers for UTF-8. These are the number of bytes * that follow a given lead byte. Trailing bytes * have the value -1. The values 4 and 5 are presented in * this table, even though valid UTF-8 cannot include the * five and six byte sequences. */ static final int[] bytesFromUTF8 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // trail bytes -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; /** * @return Returns the next code point at the current position in * the buffer. The buffer's position will be incremented. * Any mark set on this buffer will be changed by this method! * * @param bytes input bytes. */ public static int bytesToCodePoint(ByteBuffer bytes) { bytes.mark(); byte b = bytes.get(); bytes.reset(); int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; if (extraBytesToRead < 0) return -1; // trailing byte! int ch = 0; switch (extraBytesToRead) { case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ case 3: ch += (bytes.get() & 0xFF); ch <<= 6; case 2: ch += (bytes.get() & 0xFF); ch <<= 6; case 1: ch += (bytes.get() & 0xFF); ch <<= 6; case 0: ch += (bytes.get() & 0xFF); } ch -= offsetsFromUTF8[extraBytesToRead]; return ch; } static final int offsetsFromUTF8[] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; /** * For the given string, returns the number of UTF-8 bytes * required to encode the string. * @param string text to encode * @return number of UTF-8 bytes required to encode */ public static int utf8Length(String string) { CharacterIterator iter = new StringCharacterIterator(string); char ch = iter.first(); int size = 0; while (ch != CharacterIterator.DONE) { if ((ch >= 0xD800) && (ch < 0xDC00)) { // surrogate pair? char trail = iter.next(); if ((trail > 0xDBFF) && (trail < 0xE000)) { // valid pair size += 4; } else { // invalid pair size += 3; iter.previous(); // rewind one } } else if (ch < 0x80) { size++; } else if (ch < 0x800) { size += 2; } else { // ch < 0x10000, that is, the largest char value size += 3; } ch = iter.next(); } return size; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy