All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.util.LineReader Maven / Gradle / Ivy

There is a newer version: 3.2.0-9
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 

* http://www.apache.org/licenses/LICENSE-2.0 *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.util; import io.prestosql.hadoop.TextLineLengthLimitExceededException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import java.io.Closeable; import java.io.IOException; import java.io.InputStream; /** * A class that provides a line reader from an input stream. * Depending on the constructor used, lines will either be terminated by: *

    *
  • one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF).
  • *
  • or, a custom byte sequence delimiter
  • *
* In both cases, EOF also terminates an otherwise unterminated * line. */ @InterfaceAudience.LimitedPrivate({"MapReduce"}) @InterfaceStability.Unstable public class LineReader implements Closeable { // Limitation for array size is VM specific. Current HotSpot VM limitation // for array size is Integer.MAX_VALUE - 5 (2^31 - 1 - 5). // Integer.MAX_VALUE - 8 should be safe enough. private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; private static final byte CR = '\r'; private static final byte LF = '\n'; // The line delimiter private final byte[] recordDelimiterBytes; private int bufferSize = DEFAULT_BUFFER_SIZE; private InputStream in; private byte[] buffer; // the number of bytes of real data in the buffer private int bufferLength = 0; // the current position in the buffer private int bufferPosn = 0; /** * Create a line reader that reads from the given stream using the * default buffer-size (64k). * @param in The input stream * @throws IOException */ public LineReader(InputStream in) { this(in, DEFAULT_BUFFER_SIZE); } /** * Create a line reader that reads from the given stream using the * given buffer-size. * @param in The input stream * @param bufferSize Size of the read buffer * @throws IOException */ public LineReader(InputStream in, int bufferSize) { this.in = in; this.bufferSize = bufferSize; this.buffer = new byte[this.bufferSize]; this.recordDelimiterBytes = null; } /** * Create a line reader that reads from the given stream using the * io.file.buffer.size specified in the given * Configuration. * @param in input stream * @param conf configuration * @throws IOException */ public LineReader(InputStream in, Configuration conf) throws IOException { this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE)); } /** * Create a line reader that reads from the given stream using the * default buffer-size, and using a custom delimiter of array of * bytes. * @param in The input stream * @param recordDelimiterBytes The delimiter */ public LineReader(InputStream in, byte[] recordDelimiterBytes) { this.in = in; this.bufferSize = DEFAULT_BUFFER_SIZE; this.buffer = new byte[this.bufferSize]; this.recordDelimiterBytes = recordDelimiterBytes; } /** * Create a line reader that reads from the given stream using the * given buffer-size, and using a custom delimiter of array of * bytes. * @param in The input stream * @param bufferSize Size of the read buffer * @param recordDelimiterBytes The delimiter * @throws IOException */ public LineReader(InputStream in, int bufferSize, byte[] recordDelimiterBytes) { this.in = in; this.bufferSize = bufferSize; this.buffer = new byte[this.bufferSize]; this.recordDelimiterBytes = recordDelimiterBytes; } /** * Create a line reader that reads from the given stream using the * io.file.buffer.size specified in the given * Configuration, and using a custom delimiter of array of * bytes. * @param in input stream * @param conf configuration * @param recordDelimiterBytes The delimiter * @throws IOException */ public LineReader(InputStream in, Configuration conf, byte[] recordDelimiterBytes) throws IOException { this.in = in; this.bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE); this.buffer = new byte[this.bufferSize]; this.recordDelimiterBytes = recordDelimiterBytes; } /** * Close the underlying stream. * @throws IOException */ public void close() throws IOException { in.close(); } /** * Read one line from the InputStream into the given Text. * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * * @return the number of bytes read including the (longest) newline * found. * * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { maxLineLength = Math.min(maxLineLength, MAX_ARRAY_SIZE); maxBytesToConsume = Math.min(maxBytesToConsume, MAX_ARRAY_SIZE); if (this.recordDelimiterBytes != null) { return readCustomLine(str, maxLineLength, maxBytesToConsume); } else { return readDefaultLine(str, maxLineLength, maxBytesToConsume); } } protected int fillBuffer(InputStream in, byte[] buffer, boolean inDelimiter) throws IOException { return in.read(buffer); } /** * Read a line terminated by one of CR, LF, or CRLF. */ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; //account for CR from previous read } bufferLength = fillBuffer(in, buffer, prevCharCR); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; //CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; if (appendLength > 0) { // We want to fail the read when the line length is over the limit. throw new TextLineLengthLimitExceededException("Too many bytes before newline: " + maxLineLength); } } if (appendLength > 0) { int newTxtLength = txtLength + appendLength; if (str.getBytes().length < newTxtLength && Math.max(newTxtLength, txtLength << 1) > MAX_ARRAY_SIZE) { // If str need to be resized but the target capacity is over VM limit, it will trigger OOM. // In such case we will throw an IOException so the caller can deal with it. throw new TextLineLengthLimitExceededException("Too many bytes before newline: " + newTxtLength); } str.append(buffer, startPosn, appendLength); txtLength = newTxtLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (newlineLength == 0 && bytesConsumed >= maxBytesToConsume) { // It is possible that bytesConsumed is over the maxBytesToConsume but we // didn't append anything to str.bytes. If we have consumed over maxBytesToConsume // bytes but still haven't seen a line terminator, we will fail the read. throw new TextLineLengthLimitExceededException("Too many bytes before newline: " + bytesConsumed); } return (int) bytesConsumed; } /** * Read a line terminated by a custom delimiter. */ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); bytesConsumed += ambiguousByteCount; } break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn -= delPosn; if (bufferPosn < -1) { bufferPosn = -1; } delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; if (appendLength > 0) { // We want to fail the read when the line length is over the limit. throw new TextLineLengthLimitExceededException("Too many bytes before delimiter: " + maxLineLength); } } bytesConsumed += ambiguousByteCount; if (appendLength >= 0 && ambiguousByteCount > 0) { //appending the ambiguous characters (refer case 2.2) str.append(recordDelimiterBytes, 0, ambiguousByteCount); ambiguousByteCount = 0; // since it is now certain that the split did not split a delimiter we // should not read the next record: clear the flag otherwise duplicate // records could be generated unsetNeedAdditionalRecordAfterSplit(); } if (appendLength > 0) { int newTxtLength = txtLength + appendLength; if (str.getBytes().length < newTxtLength && Math.max(newTxtLength, txtLength << 1) > MAX_ARRAY_SIZE) { // If str need to be resized but the target capacity is over VM limit, it will trigger OOM. // In such case we will throw an IOException so the caller can deal with it. throw new TextLineLengthLimitExceededException("Too many bytes before delimiter: " + newTxtLength); } str.append(buffer, startPosn, appendLength); txtLength = newTxtLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (delPosn < recordDelimiterBytes.length && bytesConsumed >= maxBytesToConsume) { // It is possible that bytesConsumed is over the maxBytesToConsume but we // didn't append anything to str.bytes. If we have consumed over maxBytesToConsume // bytes but still haven't seen a line terminator, we will fail the read. throw new TextLineLengthLimitExceededException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; } /** * Read from the InputStream into the given Text. * @param str the object to store the given line * @param maxLineLength the maximum number of bytes to store into str. * @return the number of bytes read including the newline * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength) throws IOException { return readLine(str, maxLineLength, Integer.MAX_VALUE); } /** * Read from the InputStream into the given Text. * @param str the object to store the given line * @return the number of bytes read including the newline * @throws IOException if the underlying stream throws */ public int readLine(Text str) throws IOException { return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); } protected int getBufferPosn() { return bufferPosn; } protected int getBufferSize() { return bufferSize; } protected void unsetNeedAdditionalRecordAfterSplit() { // needed for custom multi byte line delimiters only // see MAPREDUCE-6549 for details } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy