// com.google.cloud.dataflow.sdk.io.BlockBasedSource Maven / Gradle / Ivy
// Show all versions of google-cloud-dataflow-java-sdk-all Show documentation
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import java.io.IOException;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;
/**
 * A {@code BlockBasedSource} is a {@link FileBasedSource} where a file consists of blocks of
 * records.
 *
 * <p>{@code BlockBasedSource} should be derived from when a file format does not support efficient
 * seeking to a record in the file, but can support efficient seeking to a block. Alternatively,
 * records in the file cannot be offset-addressed, but blocks can (it is not possible to say
 * that record {@code i} starts at offset {@code m}, but it is possible to say that block {@code j}
 * starts at offset {@code n}).
 *
 * <p>The records that will be read from a {@code BlockBasedSource} that corresponds to a subrange
 * of a file {@code [startOffset, endOffset)} are those records such that the record is contained in
 * a block that starts at offset {@code i}, where {@code i >= startOffset} and
 * {@code i < endOffset}. In other words, a record will be read from the source if its first byte is
 * contained in a block that begins within the range described by the source.
 *
 * <p>This entails that it is possible to determine the start offsets of all blocks in a file.
 *
 * <p>Progress reporting for reading from a {@code BlockBasedSource} is inaccurate. A {@link
 * BlockBasedReader} reports its current offset as {@code (offset of current block) + (current block
 * size) * (fraction of block consumed)}. However, only the offset of the current block is required
 * to be accurately reported by subclass implementations. As such, in the worst case, the current
 * offset is only updated at block boundaries.
 *
 * <p>{@code BlockBasedSource} supports dynamic splitting. However, because records in a {@code
 * BlockBasedSource} are not required to have offsets and progress reporting is inaccurate, {@code
 * BlockBasedReader} only supports splitting at block boundaries.
 * In other words, {@link BlockBasedReader#isAtSplitPoint} returns true iff the current record is
 * the first record in a block. See {@link FileBasedSource.FileBasedReader} for discussion about
 * split points.
 *
 * @param <T> The type of records to be read from the source.
 */
@Experimental(Experimental.Kind.SOURCE_SINK)
public abstract class BlockBasedSource<T> extends FileBasedSource<T> {
  /**
   * Creates a {@code BlockBasedSource} based on a file name or pattern. Subclasses must call this
   * constructor when creating a {@code BlockBasedSource} for a file pattern. See
   * {@link FileBasedSource} for more information.
   */
  public BlockBasedSource(String fileOrPatternSpec, long minBundleSize) {
    super(fileOrPatternSpec, minBundleSize);
  }

  /**
   * Creates a {@code BlockBasedSource} for a single file. Subclasses must call this constructor
   * when implementing {@link BlockBasedSource#createForSubrangeOfFile}. See documentation in
   * {@link FileBasedSource}.
   */
  public BlockBasedSource(String fileName, long minBundleSize, long startOffset, long endOffset) {
    super(fileName, minBundleSize, startOffset, endOffset);
  }

  /**
   * Creates a {@code BlockBasedSource} for the specified range in a single file.
   */
  @Override
  protected abstract BlockBasedSource<T> createForSubrangeOfFile(
      String fileName, long start, long end);

  /**
   * Creates a {@code BlockBasedReader}.
   */
  @Override
  protected abstract BlockBasedReader<T> createSingleFileReader(PipelineOptions options);

  /**
   * A {@code Block} represents a block of records that can be read.
   *
   * @param <T> The type of records contained in the block.
   */
  @Experimental(Experimental.Kind.SOURCE_SINK)
  protected abstract static class Block<T> {
    /**
     * Returns the current record.
     */
    public abstract T getCurrentRecord();

    /**
     * Reads the next record from the block and returns true iff one exists.
     */
    public abstract boolean readNextRecord() throws IOException;

    /**
     * Returns the fraction of the block already consumed, if possible, as a value in
     * {@code [0, 1]}. It should not include the current record. Successive results from this method
     * must be monotonically increasing.
     *
     * <p>If it is not possible to compute the fraction of the block consumed this method may
     * return zero. For example, when the total number of records in the block is unknown.
     */
    public abstract double getFractionOfBlockConsumed();
  }

  /**
   * A {@code Reader} that reads records from a {@link BlockBasedSource}. If the source is a
   * subrange of a file, the blocks that will be read by this reader are those such that the first
   * byte of the block is within the range {@code [start, end)}.
   *
   * @param <T> The type of records produced by the reader.
   */
  @Experimental(Experimental.Kind.SOURCE_SINK)
  protected abstract static class BlockBasedReader<T> extends FileBasedReader<T> {
    /** True iff the most recently read record was the first record of a newly read block. */
    private boolean atSplitPoint;

    /**
     * Creates a reader for the given source.
     */
    protected BlockBasedReader(BlockBasedSource<T> source) {
      super(source);
    }

    /**
     * Read the next block from the input.
     *
     * @return true if a block was read; {@link #readNextRecord()} stops reading as soon as this
     *     returns false.
     */
    public abstract boolean readNextBlock() throws IOException;

    /**
     * Returns the current block (the block that was read by the last successful call to
     * {@link BlockBasedReader#readNextBlock}). May return null initially, or if no block has been
     * successfully read.
     */
    @Nullable
    public abstract Block<T> getCurrentBlock();

    /**
     * Returns the size of the current block in bytes as it is represented in the underlying file,
     * if possible. This method may return {@code 0} if the size of the current block is unknown.
     *
     * <p>The size returned by this method must be such that for two successive blocks A and B,
     * {@code offset(A) + size(A) <= offset(B)}. If this is not satisfied, the progress reported
     * by the {@code BlockBasedReader} will be non-monotonic and will interfere with the quality
     * (but not correctness) of dynamic work rebalancing.
     *
     * <p>This method and {@link Block#getFractionOfBlockConsumed} are used to provide an estimate
     * of progress within a block ({@code getCurrentBlock().getFractionOfBlockConsumed() *
     * getCurrentBlockSize()}). It is acceptable for the result of this computation to be {@code 0},
     * but progress estimation will be inaccurate.
     */
    public abstract long getCurrentBlockSize();

    /**
     * Returns the largest offset such that starting to read from that offset includes the current
     * block.
     */
    public abstract long getCurrentBlockOffset();

    /**
     * Returns the current record of the current block.
     *
     * @throws NoSuchElementException if {@link #getCurrentBlock()} returns null, i.e. no block has
     *     been successfully read yet.
     */
    @Override
    public final T getCurrent() throws NoSuchElementException {
      Block<T> currentBlock = getCurrentBlock();
      if (currentBlock == null) {
        throw new NoSuchElementException(
            "No block has been successfully read from " + getCurrentSource());
      }
      return currentBlock.getCurrentRecord();
    }

    /**
     * Returns true if the reader is at a split point. A {@code BlockBasedReader} is at a split
     * point if the current record is the first record in a block. In other words, split points
     * are block boundaries.
     */
    @Override
    protected boolean isAtSplitPoint() {
      return atSplitPoint;
    }

    /**
     * Reads the next record from the {@link #getCurrentBlock() current block} if
     * possible. Will call {@link #readNextBlock()} to advance to the next block if not.
     *
     * <p>The first record read from a block is treated as a split point.
     */
    @Override
    protected final boolean readNextRecord() throws IOException {
      atSplitPoint = false;
      // Keep advancing to the next block until one yields a record; this also skips over blocks
      // that turn out to contain no records.
      while (getCurrentBlock() == null || !getCurrentBlock().readNextRecord()) {
        if (!readNextBlock()) {
          return false;
        }
        // The first record in a block is a split point.
        atSplitPoint = true;
      }
      return true;
    }

    /**
     * Estimates the fraction of the source's byte range that has been consumed.
     *
     * <p>The estimate linearly interpolates between the fraction at the start of the current block
     * and the fraction at its end (derived from {@link #getCurrentBlockSize()}), weighted by
     * {@link Block#getFractionOfBlockConsumed()}, and is capped at {@code 1.0}.
     *
     * @return {@code null} if the end offset of the source is {@code Long.MAX_VALUE}, {@code 0.0}
     *     if there is no current block, and the interpolated estimate otherwise.
     */
    @Override
    public Double getFractionConsumed() {
      if (getCurrentSource().getEndOffset() == Long.MAX_VALUE) {
        return null;
      }
      Block<T> currentBlock = getCurrentBlock();
      if (currentBlock == null) {
        // There is no current block (i.e., the read has not yet begun).
        return 0.0;
      }
      long currentBlockOffset = getCurrentBlockOffset();
      long startOffset = getCurrentSource().getStartOffset();
      long endOffset = getCurrentSource().getEndOffset();
      double fractionAtBlockStart =
          ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset);
      double fractionAtBlockEnd =
          ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset)
              / (endOffset - startOffset));
      return Math.min(
          1.0,
          fractionAtBlockStart
              + currentBlock.getFractionOfBlockConsumed()
                  * (fractionAtBlockEnd - fractionAtBlockStart));
    }

    /**
     * Returns the reader's current offset, which is the offset of the current block as reported by
     * {@link #getCurrentBlockOffset()}.
     */
    @Override
    protected long getCurrentOffset() {
      return getCurrentBlockOffset();
    }
  }
}