/*
* Copyright 2017 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.sequencefiles;
import static com.google.common.base.Preconditions.checkState;
import com.google.common.collect.Sets;
import com.google.common.base.Preconditions;
import com.google.common.primitives.UnsignedBytes;
import java.io.EOFException;
import java.io.IOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.FileBasedSource.FileBasedReader;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.values.KV;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.util.ReflectionUtils;
/**
* A {@link FileBasedSource} that can read hadoop's {@link SequenceFile}s.
*
* @param <K> The type of the {@link SequenceFile} key.
* @param <V> The type of the {@link SequenceFile} value.
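*
* <p>A minimal usage sketch (the file pattern and pipeline variable are illustrative only),
* assuming {@code Text} keys and values serialized with Hadoop's {@code WritableSerialization}:
*
* <pre>{@code
* SequenceFileSource<Text, Text> source =
*     new SequenceFileSource<>(
*         StaticValueProvider.of("gs://my-bucket/my-sequence-files/*"),
*         Text.class, WritableSerialization.class,
*         Text.class, WritableSerialization.class,
*         SequenceFile.SYNC_INTERVAL);
* PCollection<KV<Text, Text>> records = pipeline.apply(Read.from(source));
* }</pre>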
*/
class SequenceFileSource<K, V> extends FileBasedSource<KV<K, V>> {
private static final Log LOG = LogFactory.getLog(SequenceFileSource.class);
private final Class<K> keyClass;
private final Class<V> valueClass;
private final Class<? extends Serialization<? super K>> keySerializationClass;
private final Class<? extends Serialization<? super V>> valueSerializationClass;
private final KvCoder<K, V> coder;
/**
* Constructs a new top level source.
*
* @param fileOrPatternSpec The path or pattern of the file(s) to read.
* @param keyClass The {@link Class} of the key.
* @param keySerialization The {@link Class} of the hadoop {@link org.apache.hadoop.io.serializer.Serialization}
* to use for the key.
* @param valueClass The {@link Class} of the value.
* @param valueSerialization The {@link Class} of the hadoop {@link
* org.apache.hadoop.io.serializer.Serialization} to use for the value.
* @param minBundleSize The minimum bundle size in bytes; must be at least {@link
* SequenceFile#SYNC_INTERVAL}.
*/
SequenceFileSource(ValueProvider<String> fileOrPatternSpec,
Class<K> keyClass,
Class<? extends Serialization<? super K>> keySerialization,
Class<V> valueClass,
Class<? extends Serialization<? super V>> valueSerialization,
long minBundleSize) {
super(fileOrPatternSpec, minBundleSize);
Preconditions.checkArgument(minBundleSize >= SequenceFile.SYNC_INTERVAL,
"minBundleSize must be at least " + SequenceFile.SYNC_INTERVAL);
this.keyClass = keyClass;
this.valueClass = valueClass;
this.keySerializationClass = keySerialization;
this.valueSerializationClass = valueSerialization;
this.coder = KvCoder.of(
new HadoopSerializationCoder<>(keyClass, keySerialization),
new HadoopSerializationCoder<>(valueClass, valueSerialization)
);
}
/**
* Constructs a subsource for a given range.
*
* @param fileMetadata specification of the file represented by the {@link SequenceFileSource}, in
* suitable form for use with {@link FileSystems#match(List)}.
* @param startOffset starting byte offset.
* @param endOffset ending byte offset. If the specified value is {@code >= #getMaxEndOffset()},
* it is treated as {@code #getMaxEndOffset()}.
* @param keyClass The {@link Class} of the key.
* @param keySerialization The {@link Class} of the hadoop {@link org.apache.hadoop.io.serializer.Serialization}
* to use for the key.
* @param valueClass The {@link Class} of the value.
* @param valueSerialization The {@link Class} of the hadoop {@link
* org.apache.hadoop.io.serializer.Serialization} to use for the value.
*/
private SequenceFileSource(Metadata fileMetadata,
long startOffset, long endOffset,
Class<K> keyClass,
Class<? extends Serialization<? super K>> keySerialization,
Class<V> valueClass,
Class<? extends Serialization<? super V>> valueSerialization,
long minBundleSize,
KvCoder<K, V> coder) {
super(fileMetadata, minBundleSize, startOffset, endOffset);
this.keyClass = keyClass;
this.valueClass = valueClass;
this.keySerializationClass = keySerialization;
this.valueSerializationClass = valueSerialization;
this.coder = coder;
}
/**
* {@inheritDoc}
*/
@Override
protected FileBasedSource<KV<K, V>> createForSubrangeOfFile(Metadata fileMetadata, long start,
long end) {
LOG.debug("Creating source for subrange: " + start + "-" + end);
return new SequenceFileSource<>(fileMetadata,
start, end,
keyClass,
keySerializationClass, valueClass,
valueSerializationClass,
getMinBundleSize(),
coder
);
}
/**
* {@inheritDoc}
*/
@Override
protected FileBasedReader<KV<K, V>> createSingleFileReader(PipelineOptions options) {
Set<String> serializationNames = Sets.newHashSet(
keySerializationClass.getName(),
valueSerializationClass.getName()
);
return new SeqFileReader<>(this, keyClass, valueClass,
serializationNames.toArray(new String[serializationNames.size()]));
}
/**
* {@inheritDoc}
*/
@Override
public Coder<KV<K, V>> getDefaultOutputCoder() {
return coder;
}
/**
* A {@link FileBasedReader} for reading records from a {@link SequenceFile}.
*
* @param <K> The type of the record keys.
* @param <V> The type of the record values.
*/
static class SeqFileReader<K, V> extends FileBasedReader<KV<K, V>> {
private final Class<K> keyClass;
private final Class<V> valueClass;
private final String[] serializationNames;
private SequenceFile.Reader reader;
// Sync is consumed during startReading(), so we need to track that for the first call of
// readNextRecord
private boolean isFirstRecord;
private boolean isAtSplitPoint;
private boolean eof;
private long startOfNextRecord;
private long startOfRecord;
private KV<K, V> record;
SeqFileReader(FileBasedSource<KV<K, V>> source, Class<K> keyClass, Class<V> valueClass,
String[] serializationNames) {
super(source);
this.keyClass = keyClass;
this.valueClass = valueClass;
this.serializationNames = serializationNames;
}
/**
* {@inheritDoc}
*/
@Override
protected void startReading(ReadableByteChannel channel) throws IOException {
checkState(channel instanceof SeekableByteChannel,
"%s only supports reading from a SeekableByteChannel",
SequenceFileSource.class.getSimpleName()
);
SeekableByteChannel seekableByteChannel = (SeekableByteChannel) channel;
FileStream fileStream = new FileStream(seekableByteChannel);
FSDataInputStream fsDataInputStream = new FSDataInputStream(fileStream);
// Construct the underlying SequenceFile.Reader
Configuration configuration = new Configuration(false);
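// Tell Hadoop's SerializationFactory which Serialization implementations may be used for the
// key and value types, falling back to Hadoop's defaults when none were supplied.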
if (serializationNames.length > 0) {
configuration.setStrings("io.serializations", serializationNames);
}
reader = new SequenceFile.Reader(configuration,
SequenceFile.Reader.stream(fsDataInputStream));
// Seek to the start of the next closest sync point
try {
reader.sync(getCurrentSource().getStartOffset());
} catch (EOFException e) {
LOG.debug("Found EOF when starting to read: " + getCurrentSource().getStartOffset());
eof = true;
}
// Prep for the next readNextRecord() call
startOfNextRecord = reader.getPosition();
isFirstRecord = true;
LOG.debug("startReading, offset: " + getCurrentSource().getStartOffset() + ", position: "
+ startOfNextRecord);
}
/**
* {@inheritDoc}
*/
@Override
public void close() throws IOException {
if (reader != null) {
reader.close();
}
super.close();
}
/**
* {@inheritDoc}
*/
@Override
protected boolean readNextRecord() throws IOException {
if (eof) {
return false;
}
K key = ReflectionUtils.newInstance(keyClass, null);
V value = ReflectionUtils.newInstance(valueClass, null);
startOfRecord = startOfNextRecord;
try {
eof = reader.next(key) == null;
} catch (EOFException e) {
eof = true;
}
if (eof) {
record = null;
} else {
value = readCurrentValueUnchecked(value);
record = KV.of(key, value);
}
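// A record is a valid split point if it is the first record read by this reader or if it
// immediately follows a sync marker in the file.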
isAtSplitPoint = isFirstRecord || reader.syncSeen();
isFirstRecord = false;
startOfNextRecord = reader.getPosition();
return record != null;
}
@SuppressWarnings("unchecked")
private V readCurrentValueUnchecked(V value) throws IOException {
return (V) reader.getCurrentValue(value);
}
/**
* {@inheritDoc}
*/
@Override
protected boolean isAtSplitPoint() throws NoSuchElementException {
return isAtSplitPoint;
}
/**
* {@inheritDoc}
*/
@Override
protected long getCurrentOffset() throws NoSuchElementException {
if (record == null) {
throw new NoSuchElementException();
}
return startOfRecord;
}
/**
* {@inheritDoc}
*/
@Override
public KV<K, V> getCurrent() throws NoSuchElementException {
if (record == null) {
throw new NoSuchElementException();
}
return record;
}
}
/**
* Adapter to convert a Beam {@link SeekableByteChannel} to hadoop's {@link FSDataInputStream}.
*/
static class FileStream extends FSInputStream {
private final SeekableByteChannel inner;
private final ByteBuffer singleByteBuffer = ByteBuffer.allocate(1);
FileStream(SeekableByteChannel inner) {
this.inner = inner;
}
/**
* {@inheritDoc}
*/
@Override
public void seek(long l) throws IOException {
inner.position(l);
}
/**
* {@inheritDoc}
*/
@Override
public long getPos() throws IOException {
return inner.position();
}
/**
* {@inheritDoc}
*/
@Override
public boolean seekToNewSource(long l) throws IOException {
return false;
}
/**
* {@inheritDoc}
*/
@Override
public int read(byte[] buffer, int offset, int length) throws IOException {
ByteBuffer byteBuffer = ByteBuffer.wrap(buffer, offset, length);
return inner.read(byteBuffer);
}
/**
* {@inheritDoc}
*/
@Override
public int read() throws IOException {
int numRead = 0;
// Cast to Buffer so the call binds to Buffer.clear(): code compiled against Java 9+ (where
// ByteBuffer.clear() has a covariant ByteBuffer return type) would otherwise fail with
// NoSuchMethodError on a Java 8 runtime.
((Buffer)singleByteBuffer).clear();
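// A channel read may return 0 bytes without reaching EOF, so keep reading until a byte
// arrives or EOF (-1) is signalled.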
while (numRead == 0) {
numRead = inner.read(singleByteBuffer);
}
if (numRead == -1) {
return -1;
}
return UnsignedBytes.toInt(singleByteBuffer.get(0));
}
}
}