com.google.cloud.dataflow.sdk.io.BlockBasedSource Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-cloud-dataflow-java-sdk-all Show documentation
Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing virtually any size data using Google cloud resources. This artifact includes entire Dataflow Java SDK.
There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;

import java.io.IOException;
import java.util.NoSuchElementException;

import javax.annotation.Nullable;

/**
 * A {@code BlockBasedSource} is a {@link FileBasedSource} where a file consists of blocks of
 * records.
 *
 * <p>{@code BlockBasedSource} should be derived from when a file format does not support efficient
 * seeking to a record in the file, but can support efficient seeking to a block. Alternatively,
 * records in the file cannot be offset-addressed, but blocks can (it is not possible to say
 * that record {@code i} starts at offset {@code m}, but it is possible to say that block {@code j}
 * starts at offset {@code n}).
 *
 * <p>The records that will be read from a {@code BlockBasedSource} that corresponds to a subrange
 * of a file {@code [startOffset, endOffset)} are those records such that the record is contained in
 * a block that starts at offset {@code i}, where {@code i >= startOffset} and
 * {@code i < endOffset}. In other words, a record will be read from the source if its first byte is
 * contained in a block that begins within the range described by the source.
 *
 * <p>This entails that it is possible to determine the start offsets of all blocks in a file.
 *
 * <p>Progress reporting for reading from a {@code BlockBasedSource} is inaccurate. A {@link
 * BlockBasedReader} reports its current offset as {@code (offset of current block) + (current block
 * size) * (fraction of block consumed)}. However, only the offset of the current block is required
 * to be accurately reported by subclass implementations. As such, in the worst case, the current
 * offset is only updated at block boundaries.
 *
 * <p>{@code BlockBasedSource} supports dynamic splitting. However, because records in a {@code
 * BlockBasedSource} are not required to have offsets and progress reporting is inaccurate, {@code
 * BlockBasedReader} only supports splitting at block boundaries.
 * In other words, {@link BlockBasedReader#isAtSplitPoint} returns true iff the current record is
 * the first record in a block. See {@link FileBasedSource.FileBasedReader} for discussion about
 * split points.
 *
 * @param <T> The type of records to be read from the source.
 */
@Experimental(Experimental.Kind.SOURCE_SINK)
public abstract class BlockBasedSource<T> extends FileBasedSource<T> {
  /**
   * Creates a {@code BlockBasedSource} based on a file name or pattern. Subclasses must call this
   * constructor when creating a {@code BlockBasedSource} for a file pattern. See
   * {@link FileBasedSource} for more information.
   *
   * @param fileOrPatternSpec the file name or pattern, forwarded to {@link FileBasedSource}
   * @param minBundleSize the minimum bundle size, forwarded to {@link FileBasedSource}
   */
  public BlockBasedSource(String fileOrPatternSpec, long minBundleSize) {
    super(fileOrPatternSpec, minBundleSize);
  }

  /**
   * Creates a {@code BlockBasedSource} for a single file. Subclasses must call this constructor
   * when implementing {@link BlockBasedSource#createForSubrangeOfFile}. See documentation in
   * {@link FileBasedSource}.
   *
   * @param fileName the single file to read, forwarded to {@link FileBasedSource}
   * @param minBundleSize the minimum bundle size, forwarded to {@link FileBasedSource}
   * @param startOffset the inclusive start of the range {@code [startOffset, endOffset)}
   * @param endOffset the exclusive end of the range {@code [startOffset, endOffset)}
   */
  public BlockBasedSource(String fileName, long minBundleSize, long startOffset, long endOffset) {
    super(fileName, minBundleSize, startOffset, endOffset);
  }

  /**
   * Creates a {@code BlockBasedSource} for the specified range in a single file.
   */
  @Override
  protected abstract BlockBasedSource<T> createForSubrangeOfFile(
      String fileName, long start, long end);

  /**
   * Creates a {@code BlockBasedReader}.
   */
  @Override
  protected abstract BlockBasedReader<T> createSingleFileReader(PipelineOptions options);

  /**
   * A {@code Block} represents a block of records that can be read.
   *
   * @param <T> The type of records contained in the block.
   */
  @Experimental(Experimental.Kind.SOURCE_SINK)
  protected abstract static class Block<T> {
    /**
     * Returns the current record.
     */
    public abstract T getCurrentRecord();

    /**
     * Reads the next record from the block and returns true iff one exists.
     */
    public abstract boolean readNextRecord() throws IOException;

    /**
     * Returns the fraction of the block already consumed, if possible, as a value in
     * {@code [0, 1]}. It should not include the current record. Successive results from this method
     * must be monotonically increasing.
     *
     * <p>If it is not possible to compute the fraction of the block consumed this method may
     * return zero. For example, when the total number of records in the block is unknown.
     */
    public abstract double getFractionOfBlockConsumed();
  }

  /**
   * A {@code Reader} that reads records from a {@link BlockBasedSource}. If the source is a
   * subrange of a file, the blocks that will be read by this reader are those such that the first
   * byte of the block is within the range {@code [start, end)}.
   *
   * @param <T> The type of records read by this reader.
   */
  @Experimental(Experimental.Kind.SOURCE_SINK)
  protected abstract static class BlockBasedReader<T> extends FileBasedReader<T> {
    /** True iff the most recently read record was the first record of its block. */
    private boolean atSplitPoint;

    protected BlockBasedReader(BlockBasedSource<T> source) {
      super(source);
    }

    /**
     * Read the next block from the input.
     *
     * @return true iff a block was read
     */
    public abstract boolean readNextBlock() throws IOException;

    /**
     * Returns the current block (the block that was read by the last successful call to
     * {@link BlockBasedReader#readNextBlock}). May return null initially, or if no block has been
     * successfully read.
     */
    @Nullable
    public abstract Block<T> getCurrentBlock();

    /**
     * Returns the size of the current block in bytes as it is represented in the underlying file,
     * if possible. This method may return {@code 0} if the size of the current block is unknown.
     *
     * <p>The size returned by this method must be such that for two successive blocks A and B,
     * {@code offset(A) + size(A) <= offset(B)}. If this is not satisfied, the progress reported
     * by the {@code BlockBasedReader} will be non-monotonic and will interfere with the quality
     * (but not correctness) of dynamic work rebalancing.
     *
     * <p>This method and {@link Block#getFractionOfBlockConsumed} are used to provide an estimate
     * of progress within a block ({@code getCurrentBlock().getFractionOfBlockConsumed() *
     * getCurrentBlockSize()}). It is acceptable for the result of this computation to be {@code 0},
     * but progress estimation will be inaccurate.
     */
    public abstract long getCurrentBlockSize();

    /**
     * Returns the largest offset such that starting to read from that offset includes the current
     * block.
     */
    public abstract long getCurrentBlockOffset();

    /**
     * Returns the current record of the current block.
     *
     * @throws NoSuchElementException if {@link #getCurrentBlock()} returns null, i.e. no block has
     *     been successfully read
     */
    @Override
    public final T getCurrent() throws NoSuchElementException {
      Block<T> currentBlock = getCurrentBlock();
      if (currentBlock == null) {
        throw new NoSuchElementException(
            "No block has been successfully read from " + getCurrentSource());
      }
      return currentBlock.getCurrentRecord();
    }

    /**
     * Returns true if the reader is at a split point. A {@code BlockBasedReader} is at a split
     * point if the current record is the first record in a block. In other words, split points
     * are block boundaries.
     */
    @Override
    protected boolean isAtSplitPoint() {
      return atSplitPoint;
    }

    /**
     * Reads the next record from the {@link #getCurrentBlock() current block} if
     * possible. Will call {@link #readNextBlock()} to advance to the next block if not.
     *
     * <p>The first record read from a block is treated as a split point.
     *
     * @return true iff a record was read; false once {@link #readNextBlock()} reports that no
     *     further block is available
     */
    @Override
    protected final boolean readNextRecord() throws IOException {
      atSplitPoint = false;

      // Keep advancing block by block until one yields a record; this also skips empty blocks.
      while (getCurrentBlock() == null || !getCurrentBlock().readNextRecord()) {
        if (!readNextBlock()) {
          return false;
        }
        // The first record in a block is a split point.
        atSplitPoint = true;
      }
      return true;
    }

    /**
     * Estimates the fraction of the source's range consumed so far, interpolating within the
     * current block using {@link Block#getFractionOfBlockConsumed} and
     * {@link #getCurrentBlockSize}. The result is capped at {@code 1.0}.
     *
     * @return {@code null} if the end offset of the source is {@code Long.MAX_VALUE} (the range is
     *     unbounded, so no fraction can be computed); {@code 0.0} if there is no current block;
     *     otherwise the estimated fraction
     */
    @Override
    @Nullable
    public Double getFractionConsumed() {
      if (getCurrentSource().getEndOffset() == Long.MAX_VALUE) {
        return null;
      }
      Block<T> currentBlock = getCurrentBlock();
      if (currentBlock == null) {
        // There is no current block (i.e., the read has not yet begun).
        return 0.0;
      }
      long currentBlockOffset = getCurrentBlockOffset();
      long startOffset = getCurrentSource().getStartOffset();
      long endOffset = getCurrentSource().getEndOffset();
      // Position of the block's first and last byte, expressed as fractions of the source range.
      double fractionAtBlockStart =
          ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset);
      double fractionAtBlockEnd =
          ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset)
              / (endOffset - startOffset));
      // Linear interpolation inside the block; a block may extend past endOffset, hence the cap.
      return Math.min(
          1.0,
          fractionAtBlockStart
          + currentBlock.getFractionOfBlockConsumed()
            * (fractionAtBlockEnd - fractionAtBlockStart));
    }

    /**
     * Returns the offset of the current block; offsets are only tracked at block granularity.
     */
    @Override
    protected long getCurrentOffset() {
      return getCurrentBlockOffset();
    }
  }
}