All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.runners.worker.TextReader Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 ******************************************************************************/

package com.google.cloud.dataflow.sdk.runners.worker;

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.util.IOChannelFactory;
import com.google.cloud.dataflow.sdk.util.common.worker.ProgressTracker;
import com.google.cloud.dataflow.sdk.util.common.worker.ProgressTrackerGroup;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.Collection;
import java.util.Iterator;

import javax.annotation.Nullable;

/**
 * A source that reads text files.
 *
 * @param  the type of the elements read from the source
 */
public class TextReader extends FileBasedReader {
  final boolean stripTrailingNewlines;
  final TextIO.CompressionType compressionType;

  public TextReader(String filename, boolean stripTrailingNewlines, @Nullable Long startPosition,
      @Nullable Long endPosition, Coder coder, TextIO.CompressionType compressionType) {
    this(filename, stripTrailingNewlines, startPosition, endPosition, coder, true,
        compressionType);
  }

  protected TextReader(String filename, boolean stripTrailingNewlines, @Nullable Long startPosition,
      @Nullable Long endPosition, Coder coder, boolean useDefaultBufferSize,
      TextIO.CompressionType compressionType) {
    super(filename, startPosition, endPosition, coder, useDefaultBufferSize);
    this.stripTrailingNewlines = stripTrailingNewlines;
    this.compressionType = compressionType;
  }

  @Override
  protected ReaderIterator newReaderIteratorForRangeInFile(IOChannelFactory factory,
      String oneFile, long startPosition, @Nullable Long endPosition) throws IOException {
    // Position before the first record, so we can find the record beginning.
    final long start = startPosition > 0 ? startPosition - 1 : 0;

    TextFileIterator iterator = newReaderIteratorForRangeWithStrictStart(
        factory, oneFile, stripTrailingNewlines, start, endPosition);

    // Skip the initial record if start position was set.
    if (startPosition > 0) {
      iterator.hasNextImpl();
    }

    return iterator;
  }

  @Override
  protected ReaderIterator newReaderIteratorForFiles(
      IOChannelFactory factory, Collection files) throws IOException {
    if (files.size() == 1) {
      return newReaderIteratorForFile(factory, files.iterator().next(), stripTrailingNewlines);
    }

    return new TextFileMultiIterator(factory, files.iterator(), stripTrailingNewlines);
  }

  private TextFileIterator newReaderIteratorForFile(
      IOChannelFactory factory, String input, boolean stripTrailingNewlines) throws IOException {
    return newReaderIteratorForRangeWithStrictStart(factory, input, stripTrailingNewlines, 0, null);
  }

  /**
   * Returns a new iterator for lines in the given range in the given
   * file.  Does NOT skip the first line if the range starts in the
   * middle of a line (instead, the latter half that starts at
   * startOffset will be returned as the first element).
   */
  private TextFileIterator newReaderIteratorForRangeWithStrictStart(IOChannelFactory factory,
      String input, boolean stripTrailingNewlines, long startOffset, @Nullable Long endOffset)
      throws IOException {
    ReadableByteChannel reader = factory.open(input);
    if (!(reader instanceof SeekableByteChannel)) {
      throw new UnsupportedOperationException("Unable to seek in stream for " + input);
    }

    SeekableByteChannel seeker = (SeekableByteChannel) reader;

    return new TextFileIterator(
        new CopyableSeekableByteChannel(seeker), stripTrailingNewlines, startOffset, endOffset,
        new FileBasedReader.FilenameBasedStreamFactory(input, compressionType));
  }

  class TextFileMultiIterator extends LazyMultiReaderIterator {
    private final IOChannelFactory factory;
    private final boolean stripTrailingNewlines;

    public TextFileMultiIterator(
        IOChannelFactory factory, Iterator inputs, boolean stripTrailingNewlines) {
      super(inputs);
      this.factory = factory;
      this.stripTrailingNewlines = stripTrailingNewlines;
    }

    @Override
    protected ReaderIterator open(String input) throws IOException {
      return newReaderIteratorForFile(factory, input, stripTrailingNewlines);
    }
  }

  class TextFileIterator extends FileBasedIterator {
    private final boolean stripTrailingNewlines;
    private ScanState state;

    TextFileIterator(CopyableSeekableByteChannel seeker, boolean stripTrailingNewlines,
        long startOffset, @Nullable Long endOffset,
        FileBasedReader.DecompressingStreamFactory compressionStreamFactory) throws IOException {
      this(seeker, stripTrailingNewlines, startOffset, startOffset, endOffset,
          new ProgressTrackerGroup() {
            @Override
            protected void report(Integer lineLength) {
              notifyElementRead(lineLength.longValue());
            }
          }.start(),
          new ScanState(BUF_SIZE, !stripTrailingNewlines),
          compressionStreamFactory);
    }

    private TextFileIterator(CopyableSeekableByteChannel seeker, boolean stripTrailingNewlines,
        long startOffset, long offset, @Nullable Long endOffset, ProgressTracker tracker,
        ScanState state, FileBasedReader.DecompressingStreamFactory compressionStreamFactory)
            throws IOException {
      super(seeker, startOffset, offset, endOffset, tracker, compressionStreamFactory);

      this.stripTrailingNewlines = stripTrailingNewlines;
      this.state = state;
    }

    private TextFileIterator(TextFileIterator it) throws IOException {
      // Correctly adjust the start position of the seeker given
      // that it may hold bytes that have been read and now reside
      // in the read buffer (that is copied during cloning).
      this(
          it.seeker.copy(),
          it.stripTrailingNewlines,
          it.rangeTracker.getStartPosition() + it.state.totalBytesRead,
          it.offset,
          it.rangeTracker.getStopPosition(),
          it.progressTracker.copy(),
          it.state.copy(),
          it.compressionStreamFactory);
    }

    @Override
    public ReaderIterator copy() throws IOException {
      return new TextFileIterator(this);
    }

    /**
     * Reads a line of text. A line is considered to be terminated by any
     * one of a line feed ({@code '\n'}), a carriage return
     * ({@code '\r'}), or a carriage return followed immediately by a linefeed
     * ({@code "\r\n"}).
     *
     * @return a {@code ByteArrayOutputStream} containing the contents of the
     *     line, with any line-termination characters stripped if
     *     keepNewlines==false, or {@code null} if the end of the stream has
     *     been reached.
     * @throws IOException if an I/O error occurs
     */
    @Override
    protected ByteArrayOutputStream readElement() throws IOException {
      ByteArrayOutputStream buffer = new ByteArrayOutputStream(BUF_SIZE);

      int charsConsumed = 0;
      while (true) {
        // Attempt to read blocks of data at a time
        // until a separator is found.
        if (!state.readBytes(stream)) {
          break;
        }

        int consumed = state.consumeUntilSeparator(buffer);
        charsConsumed += consumed;
        if (consumed > 0 && state.separatorFound()) {
          if (state.lastByteRead() == '\r') {
            charsConsumed += state.copyCharIfLinefeed(buffer, stream);
          }
          break;
        }
      }

      if (charsConsumed == 0) {
        // Note that charsConsumed includes the size of any separators that may
        // have been stripped off -- so if we didn't get anything, we're at the
        // end of the file.
        return null;
      }

      offset += charsConsumed;
      return buffer;
    }
  }

  /**
   * ScanState encapsulates the state for the current buffer of text
   * being scanned.
   */
  private static class ScanState {
    private int start; // Valid bytes in buf start at this index
    private int pos; // Where the separator is in the buf (if one was found)
    private int end; // the index of the end of bytes in buf
    private byte[] buf;
    private boolean keepNewlines;
    private byte lastByteRead;
    private long totalBytesRead;

    public ScanState(int size, boolean keepNewlines) {
      this.start = 0;
      this.pos = 0;
      this.end = 0;
      this.buf = new byte[size];
      this.keepNewlines = keepNewlines;
      totalBytesRead = 0;
    }

    public ScanState copy() {
      byte[] bufCopy = new byte[buf.length]; // copy :(
      System.arraycopy(buf, start, bufCopy, start, end - start);
      return new ScanState(
          this.keepNewlines, this.start, this.pos, this.end, bufCopy, this.lastByteRead, 0);
    }

    private ScanState(boolean keepNewlines, int start, int pos, int end, byte[] buf,
        byte lastByteRead, long totalBytesRead) {
      this.start = start;
      this.pos = pos;
      this.end = end;
      this.buf = buf;
      this.keepNewlines = keepNewlines;
      this.lastByteRead = lastByteRead;
      this.totalBytesRead = totalBytesRead;
    }

    public boolean readBytes(PushbackInputStream stream) throws IOException {
      if (start < end) {
        return true;
      }
      assert end <= buf.length : end + " > " + buf.length;
      int bytesRead = stream.read(buf, end, buf.length - end);
      if (bytesRead == -1) {
        return false;
      }
      totalBytesRead += bytesRead;
      end += bytesRead;
      return true;
    }

    /**
     * Consumes characters until a separator character is found or the
     * end of buffer is reached.
     *
     * 

Updates the state to indicate the position of the separator * character. If pos==len, no separator was found. * * @return the number of characters consumed. */ public int consumeUntilSeparator(ByteArrayOutputStream out) { for (pos = start; pos < end; ++pos) { lastByteRead = buf[pos]; if (separatorFound()) { int charsConsumed = (pos - start + 1); // The separator is consumed copyToOutputBuffer(out); start = pos + 1; // skip the separator return charsConsumed; } } // No separator found assert pos == end; int charsConsumed = (pos - start); out.write(buf, start, charsConsumed); start = 0; end = 0; pos = 0; return charsConsumed; } public boolean separatorFound() { return lastByteRead == '\n' || lastByteRead == '\r'; } public byte lastByteRead() { return buf[pos]; } /** * Copies data from the input buffer to the output buffer. * *

If keepNewlines==true, line-termination characters are included in the copy. */ private void copyToOutputBuffer(ByteArrayOutputStream out) { int charsCopied = pos - start; if (keepNewlines && separatorFound()) { charsCopied++; } out.write(buf, start, charsCopied); } /** * Scans the input buffer to determine if a matched carriage return * has an accompanying linefeed and process the input buffer accordingly. * *

If keepNewlines==true and a linefeed character is detected, * it is included in the copy. * * @return the number of characters consumed */ private int copyCharIfLinefeed(ByteArrayOutputStream out, PushbackInputStream stream) throws IOException { int charsConsumed = 0; // Check to make sure we don't go off the end of the buffer if ((pos + 1) < end) { if (buf[pos + 1] == '\n') { charsConsumed++; pos++; start++; if (keepNewlines) { out.write('\n'); } } } else { // We are at the end of the buffer and need one more // byte. Get it the slow but safe way. int b = stream.read(); if (b == '\n') { charsConsumed++; totalBytesRead++; if (keepNewlines) { out.write(b); } } else if (b != -1) { // Consider replacing unread() since it may be slow if // iterators are cloned frequently. stream.unread(b); } } return charsConsumed; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy