
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.io.block;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.PositionedReadable;

/**
 * This controls how an {@link AbstractBlockReader} reads a {@link BlockMetadata}.
 *
 * @param  type of stream
 * @since 2.1.0
 */
public interface ReaderContext
{

  /**
   * Initializes the reader context.
   *
   * @param stream           input stream
   * @param blockMetadata    block-metadata
   * @param consecutiveBlock if current block was a consecutive block in the source. if it is then we continue reading
   *                         from the last offset.
   */
  void initialize(STREAM stream, BlockMetadata blockMetadata, boolean consecutiveBlock);

  /**
   * Reads an entity. Returns null when the block is processed.
   *
   * @return {@link Entity}. null when the work is done.
   * @throws IOException
   */
  Entity next() throws IOException;

  /**
   * Represents the total bytes used to construct the record.
* Used bytes can be different from the bytes in the record. */ class Entity { private byte[] record; private long usedBytes; public void clear() { record = null; usedBytes = -1; } public byte[] getRecord() { return record; } public void setRecord(byte[] record) { this.record = record; } public long getUsedBytes() { return usedBytes; } public void setUsedBytes(long usedBytes) { this.usedBytes = usedBytes; } } /** * An Abstract reader context which assumes that a block boundary is never crossed if the last entity read is fully * contained in the block. * * @param type of stream. */ abstract class AbstractReaderContext implements ReaderContext { protected transient long offset; protected transient STREAM stream; protected transient BlockMetadata blockMetadata; protected final transient Entity entity; protected AbstractReaderContext() { offset = -1; entity = new Entity(); } @Override public void initialize(STREAM stream, BlockMetadata blockMetadata, boolean consecutiveBlock) { this.stream = stream; this.blockMetadata = blockMetadata; if (!consecutiveBlock) { offset = blockMetadata.getOffset(); } } @Override public Entity next() throws IOException { if (offset < blockMetadata.getLength()) { Entity entity = readEntity(); offset += entity.usedBytes; return entity; } return null; } protected abstract Entity readEntity() throws IOException; } /** * This reader context splits the block into entities on '\n' or '\r'.
* It will not read ahead of the block boundary if the last entity was completely contained in the block.
* Any records formed using this context will need a way to validate the start of the record. * * @param type of stream. */ class LineReaderContext extends AbstractReaderContext { protected int bufferSize; private final transient ByteArrayOutputStream lineBuilder; private final transient ByteArrayOutputStream emptyBuilder; private final transient ByteArrayOutputStream tmpBuilder; private transient byte[] buffer; private transient String strBuffer; private transient int posInStr; public LineReaderContext() { super(); bufferSize = 8192; lineBuilder = new ByteArrayOutputStream(); emptyBuilder = new ByteArrayOutputStream(); tmpBuilder = new ByteArrayOutputStream(); } @Override public void initialize(STREAM stream, BlockMetadata blockMetadata, boolean consecutiveBlock) { if (buffer == null) { buffer = new byte[bufferSize]; } super.initialize(stream, blockMetadata, consecutiveBlock); } @Override protected Entity readEntity() throws IOException { //Implemented a buffered reader instead of using java's BufferedReader because it was reading much ahead of block // boundary and faced issues with duplicate records. Controlling the buffer size didn't help either. 
boolean foundEOL = false; int bytesRead = 0; long usedBytes = 0; while (!foundEOL) { tmpBuilder.reset(); if (posInStr == 0) { bytesRead = stream.read(offset + usedBytes, buffer, 0, bufferSize); if (bytesRead == -1) { break; } strBuffer = new String(buffer); } while (posInStr < strBuffer.length()) { char c = strBuffer.charAt(posInStr); if (c != '\r' && c != '\n') { tmpBuilder.write(c); posInStr++; } else { foundEOL = true; break; } } byte[] subLine = tmpBuilder.toByteArray(); usedBytes += subLine.length; lineBuilder.write(subLine); if (foundEOL) { while (posInStr < strBuffer.length()) { char c = strBuffer.charAt(posInStr); if (c == '\r' || c == '\n') { emptyBuilder.write(c); posInStr++; } else { break; } } usedBytes += emptyBuilder.toByteArray().length; } else { //read more bytes from the input stream posInStr = 0; } } posInStr = 0; //when end of stream is reached then bytesRead is -1 if (bytesRead == -1) { lineBuilder.reset(); emptyBuilder.reset(); return null; } entity.clear(); entity.record = lineBuilder.toByteArray(); entity.usedBytes = usedBytes; lineBuilder.reset(); emptyBuilder.reset(); return entity; } /** * Sets the buffer size of read. * * @param bufferSize size of the buffer */ public void setBufferSize(int bufferSize) { this.bufferSize = bufferSize; } /** * @return the buffer size of read. */ public int getBufferSize() { return this.bufferSize; } } /** * Another reader context that splits the block into records on '\n' or '\r'.
* This implementation doesn't need a way to validate the start of a record.
*

* This starts parsing the block (except the first block of the file) from the first eol character. * It is a less optimized version of an {@link LineReaderContext} which always reads beyond the block * boundary. * * @param */ class ReadAheadLineReaderContext extends LineReaderContext { @Override public void initialize(STREAM stream, BlockMetadata blockMetadata, boolean consecutiveBlock) { super.initialize(stream, blockMetadata, consecutiveBlock); //ignore first entity of all the blocks except the first one because those bytes //were used during the parsing of the previous block. if (!consecutiveBlock && blockMetadata.getOffset() != 0) { try { Entity entity = readEntity(); offset += entity.usedBytes; } catch (IOException e) { throw new RuntimeException("when reading first entity", e); } } } @Override public Entity next() throws IOException { if (offset < blockMetadata.getLength() || (offset == blockMetadata.getLength() && !blockMetadata.isLastBlock())) { Entity entity = readEntity(); offset += entity.usedBytes; return entity; } return null; } } /** * This creates fixed sized entities.
* It doesn't read beyond the block boundary therefore the last byte-array could be smaller.
* * @param type of stream. */ class FixedBytesReaderContext extends AbstractReaderContext { //When this field is null, it is initialized to default fs block size in setup. protected Integer length; @Override public void initialize(STREAM stream, BlockMetadata blockMetadata, boolean consecutiveBlock) { if (length == null) { length = (int)new Configuration().getLong("fs.local.block.size", 32 * 1024 * 1024); LOG.debug("length init {}", length); } super.initialize(stream, blockMetadata, consecutiveBlock); } @Override protected Entity readEntity() throws IOException { entity.clear(); int bytesToRead = length; if (offset + length >= blockMetadata.getLength()) { bytesToRead = (int)(blockMetadata.getLength() - offset); } byte[] record = new byte[bytesToRead]; stream.readFully(offset, record, 0, bytesToRead); entity.usedBytes = bytesToRead; entity.record = record; return entity; } /** * Sets the length of each record. * * @param length fixed length of each record. */ public void setLength(Integer length) { this.length = length; } /** * @return the length of record. */ public Integer getLength() { return this.length; } private static final Logger LOG = LoggerFactory.getLogger(FixedBytesReaderContext.class); } }




