All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.io.FileBasedSource Maven / Gradle / Ivy

Go to download

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2014 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;

import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider;
import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;

/**
 * A common base class for all file-based {@link Source}s. Extend this class to implement your own
 * file-based custom source.
 *
 * 

A file-based {@code Source} is a {@code Source} backed by a file pattern defined as a Java * glob, a single file, or a offset range for a single file. See {@link OffsetBasedSource} and * {@link com.google.cloud.dataflow.sdk.io.range.RangeTracker} for semantics of offset ranges. * *

This source stores a {@code String} that is an {@link IOChannelFactory} specification for a * file or file pattern. There should be an {@code IOChannelFactory} defined for the file * specification provided. Please refer to {@link IOChannelUtils} and {@link IOChannelFactory} for * more information on this. * *

In addition to the methods left abstract from {@code BoundedSource}, subclasses must implement * methods to create a sub-source and a reader for a range of a single file - * {@link #createForSubrangeOfFile} and {@link #createSingleFileReader}. Please refer to * {@link XmlSource} for an example implementation of {@code FileBasedSource}. * * @param Type of records represented by the source. */ public abstract class FileBasedSource extends OffsetBasedSource { private static final Logger LOG = LoggerFactory.getLogger(FileBasedSource.class); private static final float FRACTION_OF_FILES_TO_STAT = 0.01f; // Package-private for testing static final int MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT = 100; // Size of the thread pool to be used for performing file operations in parallel. // Package-private for testing. static final int THREAD_POOL_SIZE = 128; private final ValueProvider fileOrPatternSpec; private final Mode mode; /** * A given {@code FileBasedSource} represents a file resource of one of these types. */ public enum Mode { FILEPATTERN, SINGLE_FILE_OR_SUBRANGE } /** * Create a {@code FileBaseSource} based on a file or a file pattern specification. This * constructor must be used when creating a new {@code FileBasedSource} for a file pattern. * *

See {@link OffsetBasedSource} for a detailed description of {@code minBundleSize}. * * @param fileOrPatternSpec {@link IOChannelFactory} specification of file or file pattern * represented by the {@link FileBasedSource}. * @param minBundleSize minimum bundle size in bytes. */ public FileBasedSource(String fileOrPatternSpec, long minBundleSize) { this(StaticValueProvider.of(fileOrPatternSpec), minBundleSize); } /** * Create a {@code FileBaseSource} based on a file or a file pattern specification. * Same as the {@code String} constructor, but accepting a {@link ValueProvider} * to allow for runtime configuration of the source. */ public FileBasedSource(ValueProvider fileOrPatternSpec, long minBundleSize) { super(0, Long.MAX_VALUE, minBundleSize); mode = Mode.FILEPATTERN; this.fileOrPatternSpec = fileOrPatternSpec; } /** * Create a {@code FileBasedSource} based on a single file. This constructor must be used when * creating a new {@code FileBasedSource} for a subrange of a single file. * Additionally, this constructor must be used to create new {@code FileBasedSource}s when * subclasses implement the method {@link #createForSubrangeOfFile}. * *

See {@link OffsetBasedSource} for detailed descriptions of {@code minBundleSize}, * {@code startOffset}, and {@code endOffset}. * * @param fileName {@link IOChannelFactory} specification of the file represented by the * {@link FileBasedSource}. * @param minBundleSize minimum bundle size in bytes. * @param startOffset starting byte offset. * @param endOffset ending byte offset. If the specified value {@code >= #getMaxEndOffset()} it * implies {@code #getMaxEndOffSet()}. */ public FileBasedSource(String fileName, long minBundleSize, long startOffset, long endOffset) { super(startOffset, endOffset, minBundleSize); mode = Mode.SINGLE_FILE_OR_SUBRANGE; this.fileOrPatternSpec = StaticValueProvider.of(fileName); } public final String getFileOrPatternSpec() { return fileOrPatternSpec.get(); } public final ValueProvider getFileOrPatternSpecProvider() { return fileOrPatternSpec; } public final Mode getMode() { return mode; } @Override public final FileBasedSource createSourceForSubrange(long start, long end) { checkArgument(mode != Mode.FILEPATTERN, "Cannot split a file pattern based source based on positions"); checkArgument(start >= getStartOffset(), "Start offset value " + start + " of the subrange cannot be smaller than the start offset value " + getStartOffset() + " of the parent source"); checkArgument(end <= getEndOffset(), "End offset value " + end + " of the subrange cannot be larger than the end offset value " + getEndOffset() + " of the parent source"); checkState(fileOrPatternSpec.isAccessible(), "Subrange creation should only happen at execution time."); FileBasedSource source = createForSubrangeOfFile(fileOrPatternSpec.get(), start, end); if (start > 0 || end != Long.MAX_VALUE) { checkArgument(source.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE, "Source created for the range [" + start + "," + end + ")" + " must be a subrange source"); } return source; } /** * Creates and returns a new {@code FileBasedSource} of the same type as the current * {@code 
FileBasedSource} backed by a given file and an offset range. When current source is * being split, this method is used to generate new sub-sources. When creating the source * subclasses must call the constructor {@link #FileBasedSource(String, long, long, long)} of * {@code FileBasedSource} with corresponding parameter values passed here. * * @param fileName file backing the new {@code FileBasedSource}. * @param start starting byte offset of the new {@code FileBasedSource}. * @param end ending byte offset of the new {@code FileBasedSource}. May be Long.MAX_VALUE, * in which case it will be inferred using {@link #getMaxEndOffset}. */ protected abstract FileBasedSource createForSubrangeOfFile( String fileName, long start, long end); /** * Creates and returns an instance of a {@code FileBasedReader} implementation for the current * source assuming the source represents a single file. File patterns will be handled by * {@code FileBasedSource} implementation automatically. */ protected abstract FileBasedReader createSingleFileReader( PipelineOptions options); @Override public final long getEstimatedSizeBytes(PipelineOptions options) throws Exception { // This implementation of method getEstimatedSizeBytes is provided to simplify subclasses. Here // we perform the size estimation of files and file patterns using the interface provided by // IOChannelFactory. if (mode == Mode.FILEPATTERN) { checkState(fileOrPatternSpec.isAccessible(), "Size estimation should be done at execution time."); IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec.get()); // TODO Implement a more efficient parallel/batch size estimation mechanism for file patterns. 
long startTime = System.currentTimeMillis(); long totalSize = 0; Collection inputs = factory.match(fileOrPatternSpec.get()); if (inputs.size() <= MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT) { totalSize = getExactTotalSizeOfFiles(inputs, factory); LOG.debug("Size estimation of all files of pattern " + fileOrPatternSpec.get() + " took " + (System.currentTimeMillis() - startTime) + " ms"); } else { totalSize = getEstimatedSizeOfFilesBySampling(inputs, factory); LOG.debug("Size estimation of pattern " + fileOrPatternSpec.get() + " by sampling took " + (System.currentTimeMillis() - startTime) + " ms"); } return totalSize; } else { long start = getStartOffset(); long end = Math.min(getEndOffset(), getMaxEndOffset(options)); return end - start; } } // Get the exact total size of the given set of files. // Invokes multiple requests for size estimation in parallel using a thread pool. // TODO: replace this with bulk request API when it is available. Will require updates // to IOChannelFactory interface. private static long getExactTotalSizeOfFiles( Collection files, IOChannelFactory ioChannelFactory) throws Exception { List> futures = new ArrayList<>(); ListeningExecutorService service = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE)); long totalSize = 0; try { for (String file : files) { futures.add(createFutureForSizeEstimation(file, ioChannelFactory, service)); } for (Long val : Futures.allAsList(futures).get()) { totalSize += val; } return totalSize; } finally { service.shutdown(); } } private static ListenableFuture createFutureForSizeEstimation( final String file, final IOChannelFactory ioChannelFactory, ListeningExecutorService service) { return service.submit( new Callable() { @Override public Long call() throws Exception { return ioChannelFactory.getSizeBytes(file); } }); } // Estimate the total size of the given set of files through sampling and extrapolation. 
// Currently we use uniform sampling which requires a linear sampling size for a reasonable // estimate. // TODO: Implement a more efficient sampling mechanism. private static long getEstimatedSizeOfFilesBySampling( Collection files, IOChannelFactory ioChannelFactory) throws Exception { int sampleSize = (int) (FRACTION_OF_FILES_TO_STAT * files.size()); sampleSize = Math.max(MAX_NUMBER_OF_FILES_FOR_AN_EXACT_STAT, sampleSize); List selectedFiles = new ArrayList(files); Collections.shuffle(selectedFiles); selectedFiles = selectedFiles.subList(0, sampleSize); return files.size() * getExactTotalSizeOfFiles(selectedFiles, ioChannelFactory) / selectedFiles.size(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("filePattern", getFileOrPatternSpecProvider()) .withLabel("File Pattern")); } private ListenableFuture>> createFutureForFileSplit( final String file, final long desiredBundleSizeBytes, final PipelineOptions options, ListeningExecutorService service) { return service.submit(new Callable>>() { @Override public List> call() throws Exception { return createForSubrangeOfFile(file, 0, Long.MAX_VALUE) .splitIntoBundles(desiredBundleSizeBytes, options); } }); } @Override public final List> splitIntoBundles( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { // This implementation of method splitIntoBundles is provided to simplify subclasses. Here we // split a FileBasedSource based on a file pattern to FileBasedSources based on full single // files. For files that can be efficiently seeked, we further split FileBasedSources based on // those files to FileBasedSources based on sub ranges of single files. 
if (mode == Mode.FILEPATTERN) { long startTime = System.currentTimeMillis(); List>>> futures = new ArrayList<>(); ListeningExecutorService service = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(THREAD_POOL_SIZE)); try { checkState(fileOrPatternSpec.isAccessible(), "Bundle splitting should only happen at execution time."); Collection expandedFiles = FileBasedSource.expandFilePattern(fileOrPatternSpec.get()); checkArgument(!expandedFiles.isEmpty(), "Unable to find any files matching %s", fileOrPatternSpec.get()); for (final String file : expandedFiles) { futures.add(createFutureForFileSplit(file, desiredBundleSizeBytes, options, service)); } List> splitResults = ImmutableList.copyOf(Iterables.concat(Futures.allAsList(futures).get())); LOG.debug( "Splitting the source based on file pattern " + fileOrPatternSpec + " took " + (System.currentTimeMillis() - startTime) + " ms"); return splitResults; } finally { service.shutdown(); } } else { if (isSplittable()) { List> splitResults = new ArrayList<>(); for (OffsetBasedSource split : super.splitIntoBundles(desiredBundleSizeBytes, options)) { splitResults.add((FileBasedSource) split); } return splitResults; } else { LOG.debug("The source for file " + fileOrPatternSpec + " is not split into sub-range based sources since the file is not seekable"); return ImmutableList.of(this); } } } /** * Determines whether a file represented by this source is can be split into bundles. * *

By default, a file is splittable if it is on a file system that supports efficient read * seeking. Subclasses may override to provide different behavior. */ protected boolean isSplittable() throws Exception { // We split a file-based source into subranges only if the file is efficiently seekable. // If a file is not efficiently seekable it would be highly inefficient to create and read a // source based on a subrange of that file. checkState(fileOrPatternSpec.isAccessible(), "isSplittable should only be called at runtime."); IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec.get()); return factory.isReadSeekEfficient(fileOrPatternSpec.get()); } @Override public final BoundedReader createReader(PipelineOptions options) throws IOException { // Validate the current source prior to creating a reader for it. this.validate(); if (mode == Mode.FILEPATTERN) { long startTime = System.currentTimeMillis(); Collection files = FileBasedSource.expandFilePattern(fileOrPatternSpec.get()); List> fileReaders = new ArrayList<>(); for (String fileName : files) { long endOffset; try { endOffset = IOChannelUtils.getFactory(fileName).getSizeBytes(fileName); } catch (IOException e) { LOG.warn("Failed to get size of " + fileName, e); endOffset = Long.MAX_VALUE; } fileReaders.add( createForSubrangeOfFile(fileName, 0, endOffset).createSingleFileReader(options)); } LOG.debug("Creating a reader for file pattern " + fileOrPatternSpec + " took " + (System.currentTimeMillis() - startTime) + " ms"); if (fileReaders.size() == 1) { return fileReaders.get(0); } return new FilePatternReader(this, fileReaders); } else { return createSingleFileReader(options); } } @Override public String toString() { switch (mode) { case FILEPATTERN: return fileOrPatternSpec.toString(); case SINGLE_FILE_OR_SUBRANGE: return fileOrPatternSpec.toString() + " range " + super.toString(); default: throw new IllegalStateException("Unexpected mode: " + mode); } } @Override public void validate() { 
super.validate(); switch (mode) { case FILEPATTERN: checkArgument(getStartOffset() == 0, "FileBasedSource is based on a file pattern or a full single file " + "but the starting offset proposed " + getStartOffset() + " is not zero"); checkArgument(getEndOffset() == Long.MAX_VALUE, "FileBasedSource is based on a file pattern or a full single file " + "but the ending offset proposed " + getEndOffset() + " is not Long.MAX_VALUE"); break; case SINGLE_FILE_OR_SUBRANGE: // Nothing more to validate. break; default: throw new IllegalStateException("Unknown mode: " + mode); } } @Override public final long getMaxEndOffset(PipelineOptions options) throws Exception { if (mode == Mode.FILEPATTERN) { throw new IllegalArgumentException("Cannot determine the exact end offset of a file pattern"); } if (getEndOffset() == Long.MAX_VALUE) { IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec.get()); return factory.getSizeBytes(fileOrPatternSpec.get()); } else { return getEndOffset(); } } protected static final Collection expandFilePattern(String fileOrPatternSpec) throws IOException { IOChannelFactory factory = IOChannelUtils.getFactory(fileOrPatternSpec); Collection matches = factory.match(fileOrPatternSpec); LOG.info("Matched {} files for pattern {}", matches.size(), fileOrPatternSpec); return matches; } /** * A {@link Source.Reader reader} that implements code common to readers of * {@code FileBasedSource}s. * *

Seekability

* *

This reader uses a {@link ReadableByteChannel} created for the file represented by the * corresponding source to efficiently move to the correct starting position defined in the * source. Subclasses of this reader should implement {@link #startReading} to get access to this * channel. If the source corresponding to the reader is for a subrange of a file the * {@code ReadableByteChannel} provided is guaranteed to be an instance of the type * {@link SeekableByteChannel}, which may be used by subclass to traverse back in the channel to * determine the correct starting position. * *

Reading Records

* *

Sequential reading is implemented using {@link #readNextRecord}. * *

Then {@code FileBasedReader} implements "reading a range [A, B)" in the following way. *

    *
  1. {@link #start} opens the file *
  2. {@link #start} seeks the {@code SeekableByteChannel} to A (reading offset ranges for * non-seekable files is not supported) and calls {@code startReading()} *
  3. {@link #start} calls {@link #advance} once, which, via {@link #readNextRecord}, * locates the first record which is at a split point AND its offset is at or after A. * If this record is at or after B, {@link #advance} returns false and reading is finished. *
  4. if the previous advance call returned {@code true} sequential reading starts and * {@code advance()} will be called repeatedly *
* {@code advance()} calls {@code readNextRecord()} on the subclass, and stops (returns false) if * the new record is at a split point AND the offset of the new record is at or after B. * *

Thread Safety

* *

Since this class implements {@link Source.Reader} it guarantees thread safety. Abstract * methods defined here will not be accessed by more than one thread concurrently. */ public abstract static class FileBasedReader extends OffsetBasedReader { private ReadableByteChannel channel = null; /** * Subclasses should not perform IO operations at the constructor. All IO operations should be * delayed until the {@link #startReading} method is invoked. */ public FileBasedReader(FileBasedSource source) { super(source); checkArgument(source.getMode() != Mode.FILEPATTERN, "FileBasedReader does not support reading file patterns"); } @Override public synchronized FileBasedSource getCurrentSource() { return (FileBasedSource) super.getCurrentSource(); } @Override protected final boolean startImpl() throws IOException { FileBasedSource source = getCurrentSource(); IOChannelFactory factory = IOChannelUtils.getFactory( source.getFileOrPatternSpecProvider().get()); this.channel = factory.open(source.getFileOrPatternSpecProvider().get()); if (channel instanceof SeekableByteChannel) { SeekableByteChannel seekChannel = (SeekableByteChannel) channel; seekChannel.position(source.getStartOffset()); } else { // Channel is not seekable. Must not be a subrange. checkArgument(source.mode != Mode.SINGLE_FILE_OR_SUBRANGE, "Subrange-based sources must only be defined for file types that support seekable " + " read channels"); checkArgument(source.getStartOffset() == 0, "Start offset " + source.getStartOffset() + " is not zero but channel for reading the file is not seekable."); } startReading(channel); // Advance once to load the first record. return advanceImpl(); } @Override protected final boolean advanceImpl() throws IOException { return readNextRecord(); } /** * Closes any {@link ReadableByteChannel} created for the current reader. This implementation is * idempotent. 
Any {@code close()} method introduced by a subclass must be idempotent and must * call the {@code close()} method in the {@code FileBasedReader}. */ @Override public void close() throws IOException { if (channel != null) { channel.close(); } } /** * Performs any initialization of the subclass of {@code FileBasedReader} that involves IO * operations. Will only be invoked once and before that invocation the base class will seek the * channel to the source's starting offset. * *

Provided {@link ReadableByteChannel} is for the file represented by the source of this * reader. Subclass may use the {@code channel} to build a higher level IO abstraction, e.g., a * BufferedReader or an XML parser. * *

If the corresponding source is for a subrange of a file, {@code channel} is guaranteed to * be an instance of the type {@link SeekableByteChannel}. * *

After this method is invoked the base class will not be reading data from the channel or * adjusting the position of the channel. But the base class is responsible for properly closing * the channel. * * @param channel a byte channel representing the file backing the reader. */ protected abstract void startReading(ReadableByteChannel channel) throws IOException; /** * Reads the next record from the channel provided by {@link #startReading}. Methods * {@link #getCurrent}, {@link #getCurrentOffset}, and {@link #isAtSplitPoint()} should return * the corresponding information about the record read by the last invocation of this method. * *

Note that this method will be called the same way for reading the first record in the * source (file or offset range in the file) and for reading subsequent records. It is up to the * subclass to do anything special for locating and reading the first record, if necessary. * * @return {@code true} if a record was successfully read, {@code false} if the end of the * channel was reached before successfully reading a new record. */ protected abstract boolean readNextRecord() throws IOException; } // An internal Reader implementation that concatenates a sequence of FileBasedReaders. private class FilePatternReader extends BoundedReader { private final FileBasedSource source; private final List> fileReaders; final ListIterator> fileReadersIterator; FileBasedReader currentReader = null; public FilePatternReader(FileBasedSource source, List> fileReaders) { this.source = source; this.fileReaders = fileReaders; this.fileReadersIterator = fileReaders.listIterator(); } @Override public boolean start() throws IOException { return startNextNonemptyReader(); } @Override public boolean advance() throws IOException { checkState(currentReader != null, "Call start() before advance()"); if (currentReader.advance()) { return true; } return startNextNonemptyReader(); } private boolean startNextNonemptyReader() throws IOException { while (fileReadersIterator.hasNext()) { currentReader = fileReadersIterator.next(); if (currentReader.start()) { return true; } currentReader.close(); } return false; } @Override public T getCurrent() throws NoSuchElementException { // A NoSuchElement will be thrown by the last FileBasedReader if getCurrent() is called after // advance() returns false. return currentReader.getCurrent(); } @Override public Instant getCurrentTimestamp() throws NoSuchElementException { // A NoSuchElement will be thrown by the last FileBasedReader if getCurrentTimestamp() // is called after advance() returns false. 
return currentReader.getCurrentTimestamp(); } @Override public void close() throws IOException { // Close all readers that may have not yet been closed. // If this reader has not been started, currentReader is null. if (currentReader != null) { currentReader.close(); } while (fileReadersIterator.hasNext()) { fileReadersIterator.next().close(); } } @Override public FileBasedSource getCurrentSource() { return source; } @Override public FileBasedSource splitAtFraction(double fraction) { // Unsupported. TODO: implement. LOG.debug("Dynamic splitting of FilePatternReader is unsupported."); return null; } @Override public Double getFractionConsumed() { if (currentReader == null) { return 0.0; } if (fileReaders.isEmpty()) { return 1.0; } int index = fileReadersIterator.previousIndex(); int numReaders = fileReaders.size(); if (index == numReaders) { return 1.0; } double before = 1.0 * index / numReaders; double after = 1.0 * (index + 1) / numReaders; Double fractionOfCurrentReader = currentReader.getFractionConsumed(); if (fractionOfCurrentReader == null) { return before; } return before + fractionOfCurrentReader * (after - before); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy