// Source listing: com.google.cloud.dataflow.sdk.io.FileBasedSource
// (artifact: google-cloud-dataflow-java-sdk-all)
/*
* Copyright (C) 2014 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
/**
 * A common base class for all file-based {@link Source}s. Extend this class to implement your own
 * file-based custom source.
 *
 * <p>A file-based {@code Source} is a {@code Source} backed by a file pattern defined as a Java
 * glob, a single file, or an offset range for a single file. See {@link OffsetBasedSource} and
 * {@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} for semantics of offset ranges.
 *
 * <p>This source stores a {@code String} that is an {@link IOChannelFactory} specification for a
 * file or file pattern. There should be an {@code IOChannelFactory} defined for the file
 * specification provided. Please refer to {@link IOChannelUtils} and {@link IOChannelFactory} for
 * more information on this.
 *
 * <p>In addition to the methods left abstract from {@code BoundedSource}, subclasses must implement
 * methods to create a sub-source and a reader for a range of a single file -
 * {@link #createForSubrangeOfFile} and {@link #createSingleFileReader}. Please refer to
 * {@link XmlSource} for an example implementation of {@code FileBasedSource}.
 *
 * @param <T> Type of records represented by the source.
 */
public abstract class FileBasedSource extends OffsetBasedSource {
// Logger used by this source and by its nested reader classes.
private static final Logger LOG = LoggerFactory.getLogger(FileBasedSource.class);
// Fraction of the matched files whose sizes are fetched when estimating total size by sampling.
private static final float FRACTION_OF_FILES_TO_STAT = 0.01f;
// Patterns matching at most this many files have their sizes summed exactly; the same value is
// also the minimum sample size used when estimating by sampling.
// Package-private for testing
static final int MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT = 100;
// Size of the thread pool to be used for performing file operations in parallel.
// Package-private for testing.
static final int THREAD_POOL_SIZE = 128;
// File or file pattern specification, wrapped in a ValueProvider so that it can be checked for
// accessibility before being read.
private final ValueProvider fileOrPatternSpec;
// Whether this source stands for a file pattern or for a single file / subrange of a file.
private final Mode mode;
/**
 * A given {@code FileBasedSource} represents a file resource of one of these types.
 */
public enum Mode {
// Assigned by the constructors that take only a file-or-pattern specification.
FILEPATTERN,
// Assigned by the constructor that takes a file name together with an explicit offset range.
SINGLE_FILE_OR_SUBRANGE
}
/**
 * Creates a {@code FileBasedSource} based on a file or a file pattern specification. This
 * constructor must be used when creating a new {@code FileBasedSource} for a file pattern.
 *
 * <p>See {@link OffsetBasedSource} for a detailed description of {@code minBundleSize}.
 *
 * @param fileOrPatternSpec {@link IOChannelFactory} specification of file or file pattern
 *        represented by the {@link FileBasedSource}.
 * @param minBundleSize minimum bundle size in bytes.
 */
public FileBasedSource(String fileOrPatternSpec, long minBundleSize) {
// Wrap the fixed string in a static provider and delegate to the ValueProvider constructor.
this(StaticValueProvider.of(fileOrPatternSpec), minBundleSize);
}
/**
 * Creates a {@code FileBasedSource} in {@link Mode#FILEPATTERN} mode from a file or file pattern
 * specification. Behaves like the {@code String} constructor, but takes a {@link ValueProvider}
 * so that the specification can be configured at runtime.
 *
 * @param fileOrPatternSpec provider of the file or file pattern specification.
 * @param minBundleSize minimum bundle size in bytes.
 */
public FileBasedSource(ValueProvider fileOrPatternSpec, long minBundleSize) {
  // A pattern-based source always spans the full offset range [0, Long.MAX_VALUE).
  super(0, Long.MAX_VALUE, minBundleSize);
  this.fileOrPatternSpec = fileOrPatternSpec;
  this.mode = Mode.FILEPATTERN;
}
/**
 * Creates a {@code FileBasedSource} based on a single file. This constructor must be used when
 * creating a new {@code FileBasedSource} for a subrange of a single file, and also when
 * subclasses create new {@code FileBasedSource}s in their implementation of
 * {@link #createForSubrangeOfFile}.
 *
 * <p>See {@link OffsetBasedSource} for detailed descriptions of {@code minBundleSize},
 * {@code startOffset}, and {@code endOffset}.
 *
 * @param fileName {@link IOChannelFactory} specification of the file represented by the
 *        {@link FileBasedSource}.
 * @param minBundleSize minimum bundle size in bytes.
 * @param startOffset starting byte offset.
 * @param endOffset ending byte offset. If the specified value is {@code >= getMaxEndOffset()}
 *        it implies {@code getMaxEndOffset()}.
 */
public FileBasedSource(
    String fileName, long minBundleSize, long startOffset, long endOffset) {
  super(startOffset, endOffset, minBundleSize);
  // The file name is known up front, so it is stored as a static provider.
  this.fileOrPatternSpec = StaticValueProvider.of(fileName);
  this.mode = Mode.SINGLE_FILE_OR_SUBRANGE;
}
/**
 * Returns the file or file pattern specification by reading it from the underlying
 * {@link ValueProvider}.
 */
public final String getFileOrPatternSpec() {
return fileOrPatternSpec.get();
}
/**
 * Returns the {@link ValueProvider} holding the file or file pattern specification, without
 * reading its value.
 */
public final ValueProvider getFileOrPatternSpecProvider() {
return fileOrPatternSpec;
}
/**
 * Returns the {@link Mode} this source was constructed with.
 */
public final Mode getMode() {
return mode;
}
/**
 * Creates a source of the same type for the offset range {@code [start, end)} of the file backing
 * this source, delegating the construction to {@link #createForSubrangeOfFile}.
 *
 * @throws IllegalArgumentException if this source is a file pattern, if the requested range is
 *         not contained in this source's range, or if a proper subrange was requested but the
 *         created source is not in {@link Mode#SINGLE_FILE_OR_SUBRANGE} mode.
 * @throws IllegalStateException if the file specification is not yet accessible.
 */
@Override
public final FileBasedSource createSourceForSubrange(long start, long end) {
  // Byte offsets only make sense within a single file, never across a pattern.
  checkArgument(mode != Mode.FILEPATTERN,
      "Cannot split a file pattern based source based on positions");

  // The requested range has to lie within the range of this (parent) source.
  final long parentStart = getStartOffset();
  checkArgument(start >= parentStart, "Start offset value " + start
      + " of the subrange cannot be smaller than the start offset value " + parentStart
      + " of the parent source");
  final long parentEnd = getEndOffset();
  checkArgument(end <= parentEnd, "End offset value " + end
      + " of the subrange cannot be larger than the end offset value " + parentEnd
      + " of the parent source");

  checkState(fileOrPatternSpec.isAccessible(),
      "Subrange creation should only happen at execution time.");
  final FileBasedSource subSource =
      createForSubrangeOfFile(fileOrPatternSpec.get(), start, end);

  // Anything narrower than [0, Long.MAX_VALUE) must come back as a subrange source.
  final boolean coversWholeFile = start <= 0 && end == Long.MAX_VALUE;
  if (!coversWholeFile) {
    checkArgument(subSource.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
        "Source created for the range [" + start + "," + end + ")"
        + " must be a subrange source");
  }
  return subSource;
}
/**
 * Creates and returns a new {@code FileBasedSource} of the same type as the current
 * {@code FileBasedSource} backed by a given file and an offset range. When the current source is
 * being split, this method is used to generate new sub-sources. When creating the source,
 * subclasses must call the constructor {@link #FileBasedSource(String, long, long, long)} of
 * {@code FileBasedSource} with the corresponding parameter values passed here.
 *
 * @param fileName file backing the new {@code FileBasedSource}.
 * @param start starting byte offset of the new {@code FileBasedSource}.
 * @param end ending byte offset of the new {@code FileBasedSource}. May be Long.MAX_VALUE,
 *        in which case it will be inferred using {@link #getMaxEndOffset}.
 * @return the newly created source for the given file and range.
 */
protected abstract FileBasedSource createForSubrangeOfFile(
String fileName, long start, long end);
/**
 * Creates and returns an instance of a {@code FileBasedReader} implementation for the current
 * source assuming the source represents a single file. File patterns will be handled by
 * {@code FileBasedSource} implementation automatically.
 *
 * @param options pipeline options passed through from {@code createReader}.
 * @return a reader for the single file (or subrange) represented by this source.
 */
protected abstract FileBasedReader createSingleFileReader(
PipelineOptions options);
/**
 * Estimates the size in bytes of the data represented by this source.
 *
 * <p>For a single file or subrange this is the length of the offset range, with the end clamped
 * to {@link #getMaxEndOffset}. For a file pattern the sizes of the matched files are either
 * summed exactly (at most {@code MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT} matches) or extrapolated
 * from a random sample.
 *
 * <p>This implementation is provided to simplify subclasses; it relies only on the interface
 * provided by {@link IOChannelFactory}.
 *
 * @throws IllegalStateException if this is a pattern source whose specification is not yet
 *         accessible.
 */
@Override
public final long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
  if (mode != Mode.FILEPATTERN) {
    // Single file or subrange: the estimate is simply the length of the offset range.
    final long rangeStart = getStartOffset();
    final long rangeEnd = Math.min(getEndOffset(), getMaxEndOffset(options));
    return rangeEnd - rangeStart;
  }

  checkState(fileOrPatternSpec.isAccessible(),
      "Size estimation should be done at execution time.");
  final IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec.get());
  // TODO Implement a more efficient parallel/batch size estimation mechanism for file patterns.
  final long startTime = System.currentTimeMillis();
  final Collection matchedFiles = factory.match(fileOrPatternSpec.get());

  final long totalSize;
  final boolean fewEnoughForExactStat =
      matchedFiles.size() <= MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT;
  if (fewEnoughForExactStat) {
    totalSize = getExactTotalSizeOfFiles(matchedFiles, factory);
    LOG.debug("Size estimation of all files of pattern " + fileOrPatternSpec.get() + " took "
        + (System.currentTimeMillis() - startTime) + " ms");
  } else {
    totalSize = getEstimatedSizeOfFilesBySampling(matchedFiles, factory);
    LOG.debug("Size estimation of pattern " + fileOrPatternSpec.get() + " by sampling took "
        + (System.currentTimeMillis() - startTime) + " ms");
  }
  return totalSize;
}
// Returns the exact total size, in bytes, of the given set of files.
// Submits one size request per file to a fixed-size thread pool so that the requests run in
// parallel, then waits for all of them and sums the results.
// TODO: replace this with bulk request API when it is available. Will require updates
// to IOChannelFactory interface.
// NOTE(review): the declaration of `futures` below appears to have lost its generic type
// arguments (possibly during text extraction) — restore them before compiling.
private static long getExactTotalSizeOfFiles(
Collection files, IOChannelFactory ioChannelFactory) throws Exception {
List> futures = new ArrayList<>();
// A new pool is created per call and shut down in the finally block below.
ListeningExecutorService service =
MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE));
long totalSize = 0;
try {
for (String file : files) {
futures.add(createFutureForSizeEstimation(file, ioChannelFactory, service));
}
// Blocks until every size request has completed; a failed request surfaces here as an
// exception from get().
for (Long val : Futures.allAsList(futures).get()) {
totalSize += val;
}
return totalSize;
} finally {
service.shutdown();
}
}
// Submits a task to the given executor that asks the channel factory for the size of one file,
// and returns the future for that size.
private static ListenableFuture createFutureForSizeEstimation(
    final String file,
    final IOChannelFactory ioChannelFactory,
    ListeningExecutorService service) {
  Callable sizeLookup =
      new Callable() {
        @Override
        public Long call() throws Exception {
          return ioChannelFactory.getSizeBytes(file);
        }
      };
  return service.submit(sizeLookup);
}
// Estimates the total size of the given set of files through sampling and extrapolation.
// A uniformly random sample of max(MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT,
// FRACTION_OF_FILES_TO_STAT * files.size()) files (never more than files.size()) is sized
// exactly, and the result is scaled up to the full file count. Returns 0 for an empty collection.
// Currently we use uniform sampling which requires a linear sampling size for a reasonable
// estimate.
// TODO: Implement a more efficient sampling mechanism.
private static long getEstimatedSizeOfFilesBySampling(
    Collection files, IOChannelFactory ioChannelFactory) throws Exception {
  final int totalFiles = files.size();
  if (totalFiles == 0) {
    // Nothing to sample; also avoids an out-of-range subList and a division by zero below.
    return 0;
  }

  int sampleSize = (int) (FRACTION_OF_FILES_TO_STAT * totalFiles);
  sampleSize = Math.max(MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT, sampleSize);
  // Never ask for more files than there are.
  sampleSize = Math.min(sampleSize, totalFiles);

  List selectedFiles = new ArrayList(files);
  Collections.shuffle(selectedFiles);
  selectedFiles = selectedFiles.subList(0, sampleSize);
  final long sampledBytes = getExactTotalSizeOfFiles(selectedFiles, ioChannelFactory);

  // Compute floor(totalFiles * sampledBytes / sampleSize) without forming the full product,
  // which can exceed Long.MAX_VALUE for large inputs. With sampledBytes = q * sampleSize + r:
  //   floor(totalFiles * sampledBytes / sampleSize) = totalFiles * q + floor(totalFiles * r / sampleSize)
  // and totalFiles * r stays below 2^62 because both factors are below 2^31.
  final long wholePerFile = sampledBytes / sampleSize;
  final long remainderPart = (sampledBytes % sampleSize) * totalFiles / sampleSize;
  if (wholePerFile > (Long.MAX_VALUE - remainderPart) / totalFiles) {
    // The extrapolated size itself does not fit in a long; saturate instead of wrapping.
    return Long.MAX_VALUE;
  }
  return wholePerFile * totalFiles + remainderPart;
}
/**
 * Adds the file or file pattern specification to the display data under the key
 * {@code "filePattern"}, in addition to whatever the superclass registers.
 */
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);
  builder.add(
      DisplayData.item("filePattern", fileOrPatternSpec)
          .withLabel("File Pattern"));
}
// Submits a task to the given executor that creates a full-file source for `file`
// (offsets 0 to Long.MAX_VALUE) and splits it into bundles of the desired size; returns the
// future for the resulting list of sources.
// NOTE(review): the generic type arguments in this signature appear to have been lost
// (possibly during text extraction) — restore them before compiling.
private ListenableFuture>> createFutureForFileSplit(
final String file,
final long desiredBundleSizeBytes,
final PipelineOptions options,
ListeningExecutorService service) {
return service.submit(new Callable>>() {
@Override
public List> call() throws Exception {
return createForSubrangeOfFile(file, 0, Long.MAX_VALUE)
.splitIntoBundles(desiredBundleSizeBytes, options);
}
});
}
// Splits this source into bundles.
// - File pattern: expands the pattern and splits every matched file in parallel on a thread
//   pool, then concatenates the per-file results. Fails with IllegalArgumentException when the
//   pattern matches no files and IllegalStateException when the spec is not accessible.
// - Single file / subrange: delegates to the superclass splitting when isSplittable() is true,
//   otherwise returns this source as the only bundle.
// NOTE(review): several declarations below appear to have lost their generic type arguments
// (possibly during text extraction) — restore them before compiling.
@Override
public final List> splitIntoBundles(
long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
// This implementation of method splitIntoBundles is provided to simplify subclasses. Here we
// split a FileBasedSource based on a file pattern to FileBasedSources based on full single
// files. For files that can be efficiently seeked, we further split FileBasedSources based on
// those files to FileBasedSources based on sub ranges of single files.
if (mode == Mode.FILEPATTERN) {
long startTime = System.currentTimeMillis();
List>>> futures = new ArrayList<>();
// The pool is created before the checks below; the finally block shuts it down on every path.
ListeningExecutorService service =
MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE));
try {
checkState(fileOrPatternSpec.isAccessible(),
"Bundle splitting should only happen at execution time.");
Collection expandedFiles =
FileBasedSource.expandFilePattern(fileOrPatternSpec.get());
checkArgument(!expandedFiles.isEmpty(),
"Unable to find any files matching %s", fileOrPatternSpec.get());
for (final String file : expandedFiles) {
futures.add(createFutureForFileSplit(file, desiredBundleSizeBytes, options, service));
}
// Waits for all per-file splits and flattens them into one immutable list.
List> splitResults =
ImmutableList.copyOf(Iterables.concat(Futures.allAsList(futures).get()));
LOG.debug(
"Splitting the source based on file pattern "
+ fileOrPatternSpec
+ " took "
+ (System.currentTimeMillis() - startTime)
+ " ms");
return splitResults;
} finally {
service.shutdown();
}
} else {
if (isSplittable()) {
List> splitResults = new ArrayList<>();
// The superclass returns OffsetBasedSources; each is cast back to FileBasedSource.
for (OffsetBasedSource split : super.splitIntoBundles(desiredBundleSizeBytes, options)) {
splitResults.add((FileBasedSource) split);
}
return splitResults;
} else {
LOG.debug("The source for file " + fileOrPatternSpec
+ " is not split into sub-range based sources since the file is not seekable");
return ImmutableList.of(this);
}
}
}
/**
 * Determines whether the file represented by this source can be split into bundles.
 *
 * <p>By default, a file is splittable if it is on a file system that supports efficient read
 * seeking, since creating and reading a source for a subrange of a file that cannot be seeked
 * efficiently would be highly inefficient. Subclasses may override to provide different behavior.
 *
 * @throws IllegalStateException if the file specification is not yet accessible.
 */
protected boolean isSplittable() throws Exception {
  checkState(fileOrPatternSpec.isAccessible(),
      "isSplittable should only be called at runtime.");
  final String spec = getFileOrPatternSpec();
  return IOChannelUtils.getFactory(spec).isReadSeekEfficient(spec);
}
// Creates a reader for this source after validating it.
// - File pattern: expands the pattern, creates one single-file reader per matched file, and
//   returns either that reader (exactly one match) or a FilePatternReader that concatenates them.
// - Single file / subrange: returns the subclass's single-file reader directly.
// NOTE(review): the declaration of `fileReaders` below appears to have lost its generic type
// arguments (possibly during text extraction) — restore them before compiling.
@Override
public final BoundedReader createReader(PipelineOptions options) throws IOException {
// Validate the current source prior to creating a reader for it.
this.validate();
if (mode == Mode.FILEPATTERN) {
long startTime = System.currentTimeMillis();
Collection files = FileBasedSource.expandFilePattern(fileOrPatternSpec.get());
List> fileReaders = new ArrayList<>();
for (String fileName : files) {
long endOffset;
try {
endOffset = IOChannelUtils.getFactory(fileName).getSizeBytes(fileName);
} catch (IOException e) {
// Best effort: if the size cannot be determined, fall back to an open-ended range.
LOG.warn("Failed to get size of " + fileName, e);
endOffset = Long.MAX_VALUE;
}
fileReaders.add(
createForSubrangeOfFile(fileName, 0, endOffset).createSingleFileReader(options));
}
LOG.debug("Creating a reader for file pattern " + fileOrPatternSpec + " took "
+ (System.currentTimeMillis() - startTime) + " ms");
if (fileReaders.size() == 1) {
return fileReaders.get(0);
}
return new FilePatternReader(this, fileReaders);
} else {
return createSingleFileReader(options);
}
}
/**
 * Returns the string form of the file specification provider, followed by the offset range
 * description from the superclass when this source is a single file or subrange.
 */
@Override
public String toString() {
  if (mode == Mode.FILEPATTERN) {
    return fileOrPatternSpec.toString();
  }
  if (mode == Mode.SINGLE_FILE_OR_SUBRANGE) {
    return fileOrPatternSpec.toString() + " range " + super.toString();
  }
  throw new IllegalStateException("Unexpected mode: " + mode);
}
/**
 * Validates this source: runs the superclass validation and, for file patterns, additionally
 * requires the offset range to be exactly {@code [0, Long.MAX_VALUE)}.
 *
 * @throws IllegalArgumentException if a pattern source has a non-default offset range.
 * @throws IllegalStateException if the mode is not one of the known {@link Mode} values.
 */
@Override
public void validate() {
  super.validate();
  if (mode == Mode.FILEPATTERN) {
    checkArgument(getStartOffset() == 0,
        "FileBasedSource is based on a file pattern or a full single file "
        + "but the starting offset proposed " + getStartOffset() + " is not zero");
    checkArgument(getEndOffset() == Long.MAX_VALUE,
        "FileBasedSource is based on a file pattern or a full single file "
        + "but the ending offset proposed " + getEndOffset() + " is not Long.MAX_VALUE");
  } else if (mode != Mode.SINGLE_FILE_OR_SUBRANGE) {
    // SINGLE_FILE_OR_SUBRANGE needs no extra checks; anything else is unexpected.
    throw new IllegalStateException("Unknown mode: " + mode);
  }
}
/**
 * Returns the effective end offset of this single-file source: the configured end offset, or the
 * size of the file as reported by its {@link IOChannelFactory} when the end offset was left at
 * {@code Long.MAX_VALUE}.
 *
 * @throws IllegalArgumentException if this source is a file pattern.
 */
@Override
public final long getMaxEndOffset(PipelineOptions options) throws Exception {
  if (mode == Mode.FILEPATTERN) {
    throw new IllegalArgumentException("Cannot determine the exact end offset of a file pattern");
  }
  final long configuredEnd = getEndOffset();
  if (configuredEnd != Long.MAX_VALUE) {
    return configuredEnd;
  }
  // Open-ended range: the end is the size of the file.
  final String file = getFileOrPatternSpec();
  return IOChannelUtils.getFactory(file).getSizeBytes(file);
}
/**
 * Expands the given file or pattern specification into the collection of matching files using
 * the {@link IOChannelFactory} registered for it, and logs the number of matches.
 *
 * @param fileOrPatternSpec file or file pattern specification to expand.
 * @return the matches reported by the factory.
 * @throws IOException if the factory lookup or the match operation fails.
 */
protected static final Collection expandFilePattern(String fileOrPatternSpec)
    throws IOException {
  Collection matchedFiles =
      IOChannelUtils.getFactory(fileOrPatternSpec).match(fileOrPatternSpec);
  LOG.info("Matched {} files for pattern {}", matchedFiles.size(), fileOrPatternSpec);
  return matchedFiles;
}
/**
 * A {@link Source.Reader reader} that implements code common to readers of
 * {@code FileBasedSource}s.
 *
 * <h2>Seekability</h2>
 *
 * <p>This reader uses a {@link ReadableByteChannel} created for the file represented by the
 * corresponding source to efficiently move to the correct starting position defined in the
 * source. Subclasses of this reader should implement {@link #startReading} to get access to this
 * channel. If the source corresponding to the reader is for a subrange of a file the
 * {@code ReadableByteChannel} provided is guaranteed to be an instance of the type
 * {@link SeekableByteChannel}, which may be used by subclass to traverse back in the channel to
 * determine the correct starting position.
 *
 * <h2>Reading Records</h2>
 *
 * <p>Sequential reading is implemented using {@link #readNextRecord}.
 *
 * <p>Then {@code FileBasedReader} implements "reading a range [A, B)" in the following way.
 * <ol>
 * <li>{@link #start} opens the file
 * <li>{@link #start} seeks the {@code SeekableByteChannel} to A (reading offset ranges for
 * non-seekable files is not supported) and calls {@code startReading()}
 * <li>{@link #start} calls {@link #advance} once, which, via {@link #readNextRecord},
 * locates the first record which is at a split point AND its offset is at or after A.
 * If this record is at or after B, {@link #advance} returns false and reading is finished.
 * <li>if the previous advance call returned {@code true} sequential reading starts and
 * {@code advance()} will be called repeatedly
 * </ol>
 *
 * <p>{@code advance()} calls {@code readNextRecord()} on the subclass, and stops (returns false)
 * if the new record is at a split point AND the offset of the new record is at or after B.
 *
 * <h2>Thread Safety</h2>
 *
 * <p>Since this class implements {@link Source.Reader} it guarantees thread safety. Abstract
 * methods defined here will not be accessed by more than one thread concurrently.
 */
public abstract static class FileBasedReader extends OffsetBasedReader {
// Channel for the file being read; stays null until startImpl() opens it.
private ReadableByteChannel channel = null;
/**
 * Creates a reader for the given single-file source.
 *
 * <p>Subclasses should not perform IO operations at the constructor. All IO operations should be
 * delayed until the {@link #startReading} method is invoked.
 *
 * @param source the source to read; must not be in {@link Mode#FILEPATTERN} mode.
 * @throws IllegalArgumentException if {@code source} is a file pattern source.
 */
public FileBasedReader(FileBasedSource source) {
super(source);
checkArgument(source.getMode() != Mode.FILEPATTERN,
"FileBasedReader does not support reading file patterns");
}
// Returns the superclass's current source, cast to FileBasedSource.
@Override
public synchronized FileBasedSource getCurrentSource() {
return (FileBasedSource) super.getCurrentSource();
}
/**
 * Opens the channel for the file of the current source, positions it at the source's start
 * offset when the channel is seekable, hands it to {@link #startReading}, and then reads the
 * first record.
 *
 * @return the result of the first {@code advanceImpl()} call.
 * @throws IllegalArgumentException if the channel is not seekable and the checks below fail.
 */
@Override
protected final boolean startImpl() throws IOException {
  final FileBasedSource source = getCurrentSource();
  final String fileSpec = source.getFileOrPatternSpec();
  this.channel = IOChannelUtils.getFactory(fileSpec).open(fileSpec);

  if (!(channel instanceof SeekableByteChannel)) {
    // Channel is not seekable. Must not be a subrange.
    // NOTE(review): the constructor rejects FILEPATTERN sources and Mode has only two values,
    // so this first check looks like it fails for every non-seekable channel — confirm intent.
    checkArgument(source.mode != Mode.SINGLE_FILE_OR_SUBRANGE,
        "Subrange-based sources must only be defined for file types that support seekable "
        + " read channels");
    checkArgument(source.getStartOffset() == 0, "Start offset "
        + source.getStartOffset()
        + " is not zero but channel for reading the file is not seekable.");
  } else {
    ((SeekableByteChannel) channel).position(source.getStartOffset());
  }

  startReading(channel);
  // Advance once to load the first record.
  return advanceImpl();
}
// Advancing is delegated entirely to the subclass's readNextRecord().
@Override
protected final boolean advanceImpl() throws IOException {
return readNextRecord();
}
/**
 * Closes any {@link ReadableByteChannel} created for the current reader. This implementation is
 * idempotent. Any {@code close()} method introduced by a subclass must be idempotent and must
 * call the {@code close()} method in the {@code FileBasedReader}.
 */
@Override
public void close() throws IOException {
  if (channel == null) {
    // No channel has been opened yet, so there is nothing to release.
    return;
  }
  channel.close();
}
/**
 * Performs any initialization of the subclass of {@code FileBasedReader} that involves IO
 * operations. Will only be invoked once and before that invocation the base class will seek the
 * channel to the source's starting offset.
 *
 * <p>Provided {@link ReadableByteChannel} is for the file represented by the source of this
 * reader. Subclass may use the {@code channel} to build a higher level IO abstraction, e.g., a
 * BufferedReader or an XML parser.
 *
 * <p>If the corresponding source is for a subrange of a file, {@code channel} is guaranteed to
 * be an instance of the type {@link SeekableByteChannel}.
 *
 * <p>After this method is invoked the base class will not be reading data from the channel or
 * adjusting the position of the channel. But the base class is responsible for properly closing
 * the channel.
 *
 * @param channel a byte channel representing the file backing the reader.
 * @throws IOException if the subclass's initialization fails.
 */
protected abstract void startReading(ReadableByteChannel channel) throws IOException;
/**
 * Reads the next record from the channel provided by {@link #startReading}. Methods
 * {@link #getCurrent}, {@link #getCurrentOffset}, and {@link #isAtSplitPoint()} should return
 * the corresponding information about the record read by the last invocation of this method.
 *
 * <p>Note that this method will be called the same way for reading the first record in the
 * source (file or offset range in the file) and for reading subsequent records. It is up to the
 * subclass to do anything special for locating and reading the first record, if necessary.
 *
 * @return {@code true} if a record was successfully read, {@code false} if the end of the
 *         channel was reached before successfully reading a new record.
 * @throws IOException if reading from the channel fails.
 */
protected abstract boolean readNextRecord() throws IOException;
}
// An internal Reader implementation that concatenates a sequence of FileBasedReaders.
private class FilePatternReader extends BoundedReader {
// NOTE(review): the List/ListIterator declarations below appear to have lost their generic type
// arguments (possibly during text extraction) — restore them before compiling.
// The pattern-based source this reader was created for.
private final FileBasedSource source;
// One reader per matched file, in the order they are read.
private final List> fileReaders;
// Cursor over fileReaders; advanced each time the next file's reader is started.
final ListIterator> fileReadersIterator;
// Reader most recently taken from the iterator; null until start() has taken the first one.
FileBasedReader currentReader = null;
// Creates a reader that concatenates the given per-file readers. The list is kept by reference
// (not copied) and iteration starts from its first element.
// NOTE(review): the type of the `fileReaders` parameter appears to have lost its generic type
// arguments (possibly during text extraction) — restore them before compiling.
public FilePatternReader(FileBasedSource source, List> fileReaders) {
this.source = source;
this.fileReaders = fileReaders;
this.fileReadersIterator = fileReaders.listIterator();
}
// Starts reading by moving to the first per-file reader that yields a record; returns false
// when no reader yields one.
@Override
public boolean start() throws IOException {
return startNextNonemptyReader();
}
// Moves to the next record: first within the current file's reader and, once that reader is
// exhausted, in the next reader that yields a record. Throws IllegalStateException when called
// before start() has selected a reader.
@Override
public boolean advance() throws IOException {
  checkState(currentReader != null, "Call start() before advance()");
  // Short-circuit: only move on to the following files when the current one has no more records.
  return currentReader.advance() || startNextNonemptyReader();
}
// Walks the remaining per-file readers, starting each in turn. Readers that yield no record are
// closed and skipped. Returns true as soon as one reader yields a record, false when all
// remaining readers have been tried.
private boolean startNextNonemptyReader() throws IOException {
  while (fileReadersIterator.hasNext()) {
    currentReader = fileReadersIterator.next();
    final boolean hasRecord = currentReader.start();
    if (!hasRecord) {
      currentReader.close();
      continue;
    }
    return true;
  }
  return false;
}
// Returns the current record of the current per-file reader.
// NOTE(review): currentReader is null until start() has taken a reader, so calling this before
// start() would fail with a NullPointerException rather than NoSuchElementException — confirm.
@Override
public T getCurrent() throws NoSuchElementException {
// A NoSuchElement will be thrown by the last FileBasedReader if getCurrent() is called after
// advance() returns false.
return currentReader.getCurrent();
}
// Returns the timestamp of the current record of the current per-file reader.
// NOTE(review): currentReader is null until start() has taken a reader, so calling this before
// start() would fail with a NullPointerException rather than NoSuchElementException — confirm.
@Override
public Instant getCurrentTimestamp() throws NoSuchElementException {
// A NoSuchElement will be thrown by the last FileBasedReader if getCurrentTimestamp()
// is called after advance() returns false.
return currentReader.getCurrentTimestamp();
}
// Closes the current reader (if any) and every reader that has not been started yet.
// A failure to close one reader does not stop the others from being closed: the first
// IOException is rethrown once all readers have been attempted, with any later IOExceptions
// attached to it as suppressed exceptions.
@Override
public void close() throws IOException {
  IOException firstFailure = null;

  // If this reader has not been started, currentReader is null.
  if (currentReader != null) {
    try {
      currentReader.close();
    } catch (IOException e) {
      firstFailure = e;
    }
  }

  // Close all readers that may have not yet been closed.
  while (fileReadersIterator.hasNext()) {
    try {
      fileReadersIterator.next().close();
    } catch (IOException e) {
      if (firstFailure == null) {
        firstFailure = e;
      } else {
        firstFailure.addSuppressed(e);
      }
    }
  }

  if (firstFailure != null) {
    throw firstFailure;
  }
}
// Returns the pattern-based source this reader was constructed with.
@Override
public FileBasedSource getCurrentSource() {
return source;
}
// Dynamic splitting is not implemented for pattern readers: logs a debug message and returns
// null regardless of the requested fraction.
@Override
public FileBasedSource splitAtFraction(double fraction) {
// Unsupported. TODO: implement.
LOG.debug("Dynamic splitting of FilePatternReader is unsupported.");
return null;
}
// Estimates the fraction of the pattern consumed so far, giving every file an equal share:
// the files before the current one count fully, and the current file contributes its own
// reported fraction scaled to one share. Returns 0.0 before any reader has been selected.
@Override
public Double getFractionConsumed() {
  if (currentReader == null) {
    return 0.0;
  }
  // NOTE(review): currentReader is only ever set from the iterator, so an empty list seems
  // unreachable here — confirm.
  if (fileReaders.isEmpty()) {
    return 1.0;
  }

  final int numReaders = fileReaders.size();
  final int index = fileReadersIterator.previousIndex();
  // NOTE(review): previousIndex() should be at most numReaders - 1, so this branch looks
  // unreachable — confirm.
  if (index == numReaders) {
    return 1.0;
  }

  final double before = 1.0 * index / numReaders;
  final Double fractionOfCurrentReader = currentReader.getFractionConsumed();
  if (fractionOfCurrentReader == null) {
    // The current file cannot report progress; count only the files before it.
    return before;
  }
  final double after = 1.0 * (index + 1) / numReaders;
  return before + fractionOfCurrentReader * (after - before);
}
}
}