
com.google.cloud.dataflow.sdk.runners.worker.TextReader Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
******************************************************************************/
package com.google.cloud.dataflow.sdk.runners.worker;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.common.worker.ProgressTracker;
import com.google.cloud.dataflow.sdk.util.common.worker.ProgressTrackerGroup;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.Collection;
import java.util.Iterator;
import javax.annotation.Nullable;
/**
* A source that reads text files.
*
* @param the type of the elements read from the source
*/
public class TextReader extends FileBasedReader {
final boolean stripTrailingNewlines;
final TextIO.CompressionType compressionType;
public TextReader(String filename, boolean stripTrailingNewlines, @Nullable Long startPosition,
@Nullable Long endPosition, Coder coder, TextIO.CompressionType compressionType) {
this(filename, stripTrailingNewlines, startPosition, endPosition, coder, true,
compressionType);
}
protected TextReader(String filename, boolean stripTrailingNewlines, @Nullable Long startPosition,
@Nullable Long endPosition, Coder coder, boolean useDefaultBufferSize,
TextIO.CompressionType compressionType) {
super(filename, startPosition, endPosition, coder, useDefaultBufferSize);
this.stripTrailingNewlines = stripTrailingNewlines;
this.compressionType = compressionType;
}
@Override
protected ReaderIterator newReaderIteratorForRangeInFile(IOChannelFactory factory,
String oneFile, long startPosition, @Nullable Long endPosition) throws IOException {
// Position before the first record, so we can find the record beginning.
final long start = startPosition > 0 ? startPosition - 1 : 0;
TextFileIterator iterator = newReaderIteratorForRangeWithStrictStart(
factory, oneFile, stripTrailingNewlines, start, endPosition);
// Skip the initial record if start position was set.
if (startPosition > 0) {
iterator.hasNextImpl();
}
return iterator;
}
@Override
protected ReaderIterator newReaderIteratorForFiles(
IOChannelFactory factory, Collection files) throws IOException {
if (files.size() == 1) {
return newReaderIteratorForFile(factory, files.iterator().next(), stripTrailingNewlines);
}
return new TextFileMultiIterator(factory, files.iterator(), stripTrailingNewlines);
}
private TextFileIterator newReaderIteratorForFile(
IOChannelFactory factory, String input, boolean stripTrailingNewlines) throws IOException {
return newReaderIteratorForRangeWithStrictStart(factory, input, stripTrailingNewlines, 0, null);
}
/**
* Returns a new iterator for lines in the given range in the given
* file. Does NOT skip the first line if the range starts in the
* middle of a line (instead, the latter half that starts at
* startOffset will be returned as the first element).
*/
private TextFileIterator newReaderIteratorForRangeWithStrictStart(IOChannelFactory factory,
String input, boolean stripTrailingNewlines, long startOffset, @Nullable Long endOffset)
throws IOException {
ReadableByteChannel reader = factory.open(input);
if (!(reader instanceof SeekableByteChannel)) {
throw new UnsupportedOperationException("Unable to seek in stream for " + input);
}
SeekableByteChannel seeker = (SeekableByteChannel) reader;
return new TextFileIterator(
new CopyableSeekableByteChannel(seeker), stripTrailingNewlines, startOffset, endOffset,
new FileBasedReader.FilenameBasedStreamFactory(input, compressionType));
}
class TextFileMultiIterator extends LazyMultiReaderIterator {
private final IOChannelFactory factory;
private final boolean stripTrailingNewlines;
public TextFileMultiIterator(
IOChannelFactory factory, Iterator inputs, boolean stripTrailingNewlines) {
super(inputs);
this.factory = factory;
this.stripTrailingNewlines = stripTrailingNewlines;
}
@Override
protected ReaderIterator open(String input) throws IOException {
return newReaderIteratorForFile(factory, input, stripTrailingNewlines);
}
}
class TextFileIterator extends FileBasedIterator {
private final boolean stripTrailingNewlines;
private ScanState state;
TextFileIterator(CopyableSeekableByteChannel seeker, boolean stripTrailingNewlines,
long startOffset, @Nullable Long endOffset,
FileBasedReader.DecompressingStreamFactory compressionStreamFactory) throws IOException {
this(seeker, stripTrailingNewlines, startOffset, startOffset, endOffset,
new ProgressTrackerGroup() {
@Override
protected void report(Integer lineLength) {
notifyElementRead(lineLength.longValue());
}
}.start(),
new ScanState(BUF_SIZE, !stripTrailingNewlines),
compressionStreamFactory);
}
private TextFileIterator(CopyableSeekableByteChannel seeker, boolean stripTrailingNewlines,
long startOffset, long offset, @Nullable Long endOffset, ProgressTracker tracker,
ScanState state, FileBasedReader.DecompressingStreamFactory compressionStreamFactory)
throws IOException {
super(seeker, startOffset, offset, endOffset, tracker, compressionStreamFactory);
this.stripTrailingNewlines = stripTrailingNewlines;
this.state = state;
}
private TextFileIterator(TextFileIterator it) throws IOException {
// Correctly adjust the start position of the seeker given
// that it may hold bytes that have been read and now reside
// in the read buffer (that is copied during cloning).
this(
it.seeker.copy(),
it.stripTrailingNewlines,
it.rangeTracker.getStartPosition() + it.state.totalBytesRead,
it.offset,
it.rangeTracker.getStopPosition(),
it.progressTracker.copy(),
it.state.copy(),
it.compressionStreamFactory);
}
@Override
public ReaderIterator copy() throws IOException {
return new TextFileIterator(this);
}
/**
* Reads a line of text. A line is considered to be terminated by any
* one of a line feed ({@code '\n'}), a carriage return
* ({@code '\r'}), or a carriage return followed immediately by a linefeed
* ({@code "\r\n"}).
*
* @return a {@code ByteArrayOutputStream} containing the contents of the
* line, with any line-termination characters stripped if
* keepNewlines==false, or {@code null} if the end of the stream has
* been reached.
* @throws IOException if an I/O error occurs
*/
@Override
protected ByteArrayOutputStream readElement() throws IOException {
ByteArrayOutputStream buffer = new ByteArrayOutputStream(BUF_SIZE);
int charsConsumed = 0;
while (true) {
// Attempt to read blocks of data at a time
// until a separator is found.
if (!state.readBytes(stream)) {
break;
}
int consumed = state.consumeUntilSeparator(buffer);
charsConsumed += consumed;
if (consumed > 0 && state.separatorFound()) {
if (state.lastByteRead() == '\r') {
charsConsumed += state.copyCharIfLinefeed(buffer, stream);
}
break;
}
}
if (charsConsumed == 0) {
// Note that charsConsumed includes the size of any separators that may
// have been stripped off -- so if we didn't get anything, we're at the
// end of the file.
return null;
}
offset += charsConsumed;
return buffer;
}
}
/**
* ScanState encapsulates the state for the current buffer of text
* being scanned.
*/
private static class ScanState {
private int start; // Valid bytes in buf start at this index
private int pos; // Where the separator is in the buf (if one was found)
private int end; // the index of the end of bytes in buf
private byte[] buf;
private boolean keepNewlines;
private byte lastByteRead;
private long totalBytesRead;
public ScanState(int size, boolean keepNewlines) {
this.start = 0;
this.pos = 0;
this.end = 0;
this.buf = new byte[size];
this.keepNewlines = keepNewlines;
totalBytesRead = 0;
}
public ScanState copy() {
byte[] bufCopy = new byte[buf.length]; // copy :(
System.arraycopy(buf, start, bufCopy, start, end - start);
return new ScanState(
this.keepNewlines, this.start, this.pos, this.end, bufCopy, this.lastByteRead, 0);
}
private ScanState(boolean keepNewlines, int start, int pos, int end, byte[] buf,
byte lastByteRead, long totalBytesRead) {
this.start = start;
this.pos = pos;
this.end = end;
this.buf = buf;
this.keepNewlines = keepNewlines;
this.lastByteRead = lastByteRead;
this.totalBytesRead = totalBytesRead;
}
public boolean readBytes(PushbackInputStream stream) throws IOException {
if (start < end) {
return true;
}
assert end <= buf.length : end + " > " + buf.length;
int bytesRead = stream.read(buf, end, buf.length - end);
if (bytesRead == -1) {
return false;
}
totalBytesRead += bytesRead;
end += bytesRead;
return true;
}
/**
* Consumes characters until a separator character is found or the
* end of buffer is reached.
*
* Updates the state to indicate the position of the separator
* character. If pos==len, no separator was found.
*
* @return the number of characters consumed.
*/
public int consumeUntilSeparator(ByteArrayOutputStream out) {
for (pos = start; pos < end; ++pos) {
lastByteRead = buf[pos];
if (separatorFound()) {
int charsConsumed = (pos - start + 1); // The separator is consumed
copyToOutputBuffer(out);
start = pos + 1; // skip the separator
return charsConsumed;
}
}
// No separator found
assert pos == end;
int charsConsumed = (pos - start);
out.write(buf, start, charsConsumed);
start = 0;
end = 0;
pos = 0;
return charsConsumed;
}
public boolean separatorFound() {
return lastByteRead == '\n' || lastByteRead == '\r';
}
public byte lastByteRead() {
return buf[pos];
}
/**
* Copies data from the input buffer to the output buffer.
*
*
If keepNewlines==true, line-termination characters are included in the copy.
*/
private void copyToOutputBuffer(ByteArrayOutputStream out) {
int charsCopied = pos - start;
if (keepNewlines && separatorFound()) {
charsCopied++;
}
out.write(buf, start, charsCopied);
}
/**
* Scans the input buffer to determine if a matched carriage return
* has an accompanying linefeed and process the input buffer accordingly.
*
*
If keepNewlines==true and a linefeed character is detected,
* it is included in the copy.
*
* @return the number of characters consumed
*/
private int copyCharIfLinefeed(ByteArrayOutputStream out, PushbackInputStream stream)
throws IOException {
int charsConsumed = 0;
// Check to make sure we don't go off the end of the buffer
if ((pos + 1) < end) {
if (buf[pos + 1] == '\n') {
charsConsumed++;
pos++;
start++;
if (keepNewlines) {
out.write('\n');
}
}
} else {
// We are at the end of the buffer and need one more
// byte. Get it the slow but safe way.
int b = stream.read();
if (b == '\n') {
charsConsumed++;
totalBytesRead++;
if (keepNewlines) {
out.write(b);
}
} else if (b != -1) {
// Consider replacing unread() since it may be slow if
// iterators are cloned frequently.
stream.unread(b);
}
}
return charsConsumed;
}
}
}