
com.google.cloud.dataflow.sdk.runners.worker.FileBasedReader Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
******************************************************************************/
package com.google.cloud.dataflow.sdk.runners.worker;
import static com.google.api.client.util.Preconditions.checkNotNull;
import static com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils.cloudPositionToReaderPosition;
import static com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils.cloudProgressToReaderProgress;
import static com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils.splitRequestToApproximateProgress;
import com.google.api.services.dataflow.model.ApproximateProgress;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.range.OffsetRangeTracker;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.common.worker.AbstractBoundedReaderIterator;
import com.google.cloud.dataflow.sdk.util.common.worker.ProgressTracker;
import com.google.cloud.dataflow.sdk.util.common.worker.Reader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.channels.Channels;
import java.util.Collection;
import javax.annotation.Nullable;
/**
* Abstract base class for sources that read from files.
*
* @param the type of the elements read from the source
*/
public abstract class FileBasedReader extends Reader {
protected static final int BUF_SIZE = 200;
protected final String filename;
@Nullable
protected final Long startPosition;
@Nullable
protected final Long endPosition;
protected final Coder coder;
protected final boolean useDefaultBufferSize;
private static final Logger LOG = LoggerFactory.getLogger(FileBasedReader.class);
protected FileBasedReader(String filename, @Nullable Long startPosition,
@Nullable Long endPosition, Coder coder, boolean useDefaultBufferSize) {
this.filename = filename;
this.startPosition = startPosition;
this.endPosition = endPosition;
this.coder = coder;
this.useDefaultBufferSize = useDefaultBufferSize;
}
/**
* Returns a new iterator for elements in the given range in the
* given file. If the range starts in the middle an element, this
* element is skipped as it is considered part of the previous
* range; if the last element that starts in the range finishes
* beyond the end position, it is still considered part of this
* range. In other words, the start position and the end position
* are "rounded up" to element boundaries.
*
* @param endPosition offset of the end position; null means end-of-file
*/
protected abstract ReaderIterator newReaderIteratorForRangeInFile(IOChannelFactory factory,
String oneFile, long startPosition, @Nullable Long endPosition) throws IOException;
/**
* Returns a new iterator for elements in the given files. Caller
* must ensure that the file collection is not empty.
*/
protected abstract ReaderIterator newReaderIteratorForFiles(
IOChannelFactory factory, Collection files) throws IOException;
@Override
public ReaderIterator iterator() throws IOException {
IOChannelFactory factory = IOChannelUtils.getFactory(filename);
Collection inputs = factory.match(filename);
if (inputs.isEmpty()) {
throw new FileNotFoundException("No match for file pattern '" + filename + "'");
}
if (startPosition != null || endPosition != null) {
if (inputs.size() != 1) {
throw new IllegalArgumentException(
"Offset range specified: [" + startPosition + ", " + endPosition + "), so "
+ "an exact filename was expected, but more than 1 file matched \"" + filename
+ "\" (total " + inputs.size() + "): apparently a filepattern was given.");
}
return newReaderIteratorForRangeInFile(factory, inputs.iterator().next(),
startPosition == null ? 0 : startPosition, endPosition);
} else {
return newReaderIteratorForFiles(factory, inputs);
}
}
/**
* Abstract base class for file-based source iterators.
*/
protected abstract class FileBasedIterator extends AbstractBoundedReaderIterator {
protected final CopyableSeekableByteChannel seeker;
protected final PushbackInputStream stream;
protected final OffsetRangeTracker rangeTracker;
protected long offset;
protected final ProgressTracker progressTracker;
protected ByteArrayOutputStream nextElement;
protected DecompressingStreamFactory compressionStreamFactory;
FileBasedIterator(
CopyableSeekableByteChannel seeker,
long startOffset,
long offset,
@Nullable Long endOffset,
ProgressTracker progressTracker,
DecompressingStreamFactory compressionStreamFactory)
throws IOException {
this.seeker = checkNotNull(seeker);
this.seeker.position(startOffset);
this.compressionStreamFactory = compressionStreamFactory;
InputStream inputStream =
compressionStreamFactory.createInputStream(Channels.newInputStream(seeker));
BufferedInputStream bufferedStream =
useDefaultBufferSize
? new BufferedInputStream(inputStream)
: new BufferedInputStream(inputStream, BUF_SIZE);
this.stream = new PushbackInputStream(bufferedStream, BUF_SIZE);
long stopOffset = (endOffset == null) ? OffsetRangeTracker.OFFSET_INFINITY : endOffset;
this.rangeTracker = new OffsetRangeTracker(startOffset, stopOffset);
this.offset = offset;
this.progressTracker = checkNotNull(progressTracker);
}
/**
* Reads the next element.
*
* @return a {@code ByteArrayOutputStream} containing the contents
* of the element, or {@code null} if the end of the stream
* has been reached.
* @throws IOException if an I/O error occurs
*/
protected abstract ByteArrayOutputStream readElement() throws IOException;
@Override
protected boolean hasNextImpl() throws IOException {
long startOffset = offset;
ByteArrayOutputStream element = readElement(); // As a side effect, updates "offset"
if (element != null && rangeTracker.tryReturnRecordAt(true, startOffset)) {
nextElement = element;
progressTracker.saw((int) (offset - startOffset));
} else {
nextElement = null;
}
return nextElement != null;
}
@Override
protected T nextImpl() throws IOException {
return CoderUtils.decodeFromByteArray(coder, nextElement.toByteArray());
}
@Override
public Progress getProgress() {
// Currently we assume that only a offset position and fraction are reported as
// current progress. An implementor can override this method to update
// other metrics, e.g. report a different completion percentage or remaining time.
com.google.api.services.dataflow.model.Position currentPosition =
new com.google.api.services.dataflow.model.Position();
currentPosition.setByteOffset(offset);
ApproximateProgress progress = new ApproximateProgress();
progress.setPosition(currentPosition);
// If endOffset is unspecified, we don't know the fraction consumed.
if (rangeTracker.getStopPosition() != Long.MAX_VALUE) {
progress.setPercentComplete((float) rangeTracker.getFractionConsumed());
}
return cloudProgressToReaderProgress(progress);
}
@Override
public DynamicSplitResult requestDynamicSplit(DynamicSplitRequest splitRequest) {
checkNotNull(splitRequest);
// Currently, file-based Reader only supports split at a byte offset.
ApproximateProgress splitProgress = splitRequestToApproximateProgress(splitRequest);
com.google.api.services.dataflow.model.Position splitPosition = splitProgress.getPosition();
if (splitPosition == null) {
LOG.warn("FileBasedReader only supports split at a Position. Requested: {}",
splitRequest);
return null;
}
Long splitOffset = splitPosition.getByteOffset();
if (splitOffset == null) {
LOG.warn("FileBasedReader only supports split at byte offset. Requested: {}",
splitPosition);
return null;
}
if (rangeTracker.trySplitAtPosition(splitOffset)) {
return new DynamicSplitResultWithPosition(cloudPositionToReaderPosition(splitPosition));
} else {
return null;
}
}
/**
* Returns the end offset of the iterator or Long.MAX_VALUE if unspecified.
* The method is called for test ONLY.
*/
long getEndOffset() {
return rangeTracker.getStopPosition();
}
@Override
public void close() throws IOException {
stream.close();
}
}
/**
* Factory interface for creating a decompressing {@link InputStream}.
*/
public interface DecompressingStreamFactory {
/**
* Create a decompressing {@link InputStream} from an existing {@link InputStream}.
*
* @param inputStream the existing stream
* @return a stream that decompresses the contents of the existing stream
* @throws IOException
*/
public InputStream createInputStream(InputStream inputStream) throws IOException;
}
/**
* Factory for creating decompressing input streams based on a filename and
* a {@link TextIO.CompressionType}. If the compression mode is AUTO, the filename
* is checked against known extensions to determine a compression type to use.
*/
protected static class FilenameBasedStreamFactory
implements DecompressingStreamFactory {
private String filename;
private TextIO.CompressionType compressionType;
public FilenameBasedStreamFactory(String filename, TextIO.CompressionType compressionType) {
this.filename = filename;
this.compressionType = compressionType;
}
protected TextIO.CompressionType getCompressionTypeForAuto() {
for (TextIO.CompressionType type : TextIO.CompressionType.values()) {
if (type.matches(filename) && type != TextIO.CompressionType.AUTO
&& type != TextIO.CompressionType.UNCOMPRESSED) {
return type;
}
}
return TextIO.CompressionType.UNCOMPRESSED;
}
@Override
public InputStream createInputStream(InputStream inputStream) throws IOException {
if (compressionType == TextIO.CompressionType.AUTO) {
return getCompressionTypeForAuto().createInputStream(inputStream);
}
return compressionType.createInputStream(inputStream);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy