com.twitter.elephantbird.mapreduce.io.BinaryBlockReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elephant-bird-core Show documentation
Core utilities.
There is a newer version: 4.17
package com.twitter.elephantbird.mapreduce.io;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import com.google.protobuf.ByteString;
import com.twitter.elephantbird.util.Protobufs;
import com.twitter.elephantbird.util.StreamSearcher;

import org.apache.commons.io.input.BoundedInputStream;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A class to read blocks of binary objects like protobufs.
 */
public class BinaryBlockReader {
  private static final Logger LOG = LoggerFactory.getLogger(BinaryBlockReader.class);

  // though any type of objects can be stored, each block itself is
  // stored as a protocolbuf (SerializedBlock).

  private InputStream in_;
  private final StreamSearcher searcher_;
  private final BinaryConverter protoConverter_;
  private List curBlobs_;
  private int numLeftToReadThisBlock_ = 0;
  private boolean readNewBlocks_ = true;
  private boolean skipEmptyRecords = true; // skip any records of length zero

  public BinaryBlockReader(InputStream in, BinaryConverter protoConverter) {
    this(in, protoConverter, true);
  }

  public BinaryBlockReader(InputStream in,
                              BinaryConverter protoConverter,
                              boolean skipEmptyRecords) {
    in_ = in;
    protoConverter_ = protoConverter;
    searcher_ = new StreamSearcher(Protobufs.KNOWN_GOOD_POSITION_MARKER);
    this.skipEmptyRecords = skipEmptyRecords;
  }

  public BinaryConverter getConverter() {
    return protoConverter_;
  }

  public void close() throws IOException {
    if (in_ != null)
      in_.close();
  }

  /**
   * Sets input stream. Sometimes the actual input stream might be created
   * away from the constructor.
   */
  public void setInputStream(InputStream in) {
    in_ = in; // not closing existing in_, normally it is null
  }

  /**
   * Returns next deserialized object. null indicates end of stream or a
   * deserialization error. Use {@link #readNext(BinaryWritable)} to
   * distinguish betwen end of stream or deserialization error.
   */
  public M readNext() throws IOException {
    byte[] blob = readNextProtoBytes();
    return blob == null ?
        null : protoConverter_.fromBytes(blob);
  }

  /**
   * Returns true if new proto object was read into writable, false other wise.
   */
  public boolean readNext(BinaryWritable writable) throws IOException {
    byte[] blob = readNextProtoBytes();
    if (blob != null) {
      writable.set(protoConverter_.fromBytes(blob));
      return true;
    }
    return false;
  }

  /**
   * Return byte blob for the next proto object. null indicates end of stream;
   */
  public byte[] readNextProtoBytes() throws IOException {
    while (true) {
      if (!setupNewBlockIfNeeded()) {
        return null;
      }

      int blobIndex = curBlobs_.size() - numLeftToReadThisBlock_;
      numLeftToReadThisBlock_--;
      byte[] blob = curBlobs_.get(blobIndex).toByteArray();
      if (blob.length == 0 && skipEmptyRecords) {
        continue;
      }
      return blob;
    }
  }

  /**
   * returns true if bytes for next object are written to writable, false
   * other wise.
   */
  public boolean readNextProtoBytes(BytesWritable writable) throws IOException {
    byte[] blob = readNextProtoBytes();
    if (blob != null) {
      writable.set(blob, 0, blob.length);
      return true;
    }
    return false;
  }

  public void markNoMoreNewBlocks() {
    readNewBlocks_ = false;
  }

  public long skipToNextSyncPoint() throws IOException {
    return searcher_.search(in_);
  }

  /**
   * Finds next block marker and reads the block. If skipIfStartingOnBoundary is set
   * skips the the first block if the marker starts exactly at the current position.
   * (i.e. there were no bytes from previous block before the start of the marker).
   */
  public List parseNextBlock(boolean skipIfStartingOnBoundary) throws IOException {
    LOG.debug("BlockReader: none left to read, skipping to sync point");
    long skipped = skipToNextSyncPoint();
    if (skipped <= -1) {
      LOG.debug("BlockReader: SYNC point eof");
      // EOF if there are no more sync markers.
      return null;
    }

    int blockSize = readInt();
    LOG.debug("BlockReader: found sync point, next block has size " + blockSize);
    if (blockSize < 0) {
      LOG.debug("ProtobufReader: reading size after sync point eof");
      // EOF if the size cannot be read.
      return null;
    }

    if (skipIfStartingOnBoundary && skipped == Protobufs.KNOWN_GOOD_POSITION_MARKER.length) {
      // skip the current current block
      in_.skip(blockSize);
      return parseNextBlock(false);
    }

    SerializedBlock block = SerializedBlock.parseFrom(
      new BoundedInputStream(in_, blockSize),
      blockSize);

    curBlobs_ = block.getProtoBlobs();
    numLeftToReadThisBlock_ = curBlobs_.size();
    LOG.debug("ProtobufReader: number in next block is " + numLeftToReadThisBlock_);
    return curBlobs_;
  }

  private boolean setupNewBlockIfNeeded() throws IOException {
    if (numLeftToReadThisBlock_ == 0) {
      if (!readNewBlocks_) {
        // If the reader has been told not to read more blocks, stop.
        // This happens when a map boundary has been crossed in a map job, for example.
        // The goal then is to finsh reading what has been parsed, but let the next split
        // handle everything starting at the next sync point.
        return false;
      }
      curBlobs_ = parseNextBlock(false);
      if (curBlobs_ == null) {
        // If there is nothing, it likely means EOF. Signal that processing is done.
        return false;
      }
    }

    return true;
  }

  private int readInt() throws IOException {
    int b = in_.read();
    if (b == -1) {
      return -1;
    }

    return b | (in_.read() << 8) | (in_.read() << 16) | (in_.read() << 24);
  }
}