
com.google.cloud.dataflow.sdk.io.hdfs.HDFSFileSource

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.dataflow.sdk.io.hdfs;

import com.google.cloud.dataflow.sdk.coders.*;
import com.google.cloud.dataflow.sdk.io.BoundedSource;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

import javax.annotation.Nullable;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.NoSuchElementException;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;

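/**
 * A {@code BoundedSource} for reading files resident in a Hadoop filesystem (HDFS) using a
 * Hadoop file-based {@link FileInputFormat}.
 *
 * <p>Usage sketch (illustrative only; the {@code pipeline} variable, the path, and the
 * {@code MyInputFormat}, {@code MyKey}, and {@code MyValue} types below are placeholders,
 * not part of this class):
 *
 * <pre>{@code
 * HDFSFileSource<KV<MyKey, MyValue>, MyKey, MyValue> source =
 *     HDFSFileSource.from("hdfs://namenode/path/to/data*", MyInputFormat.class,
 *         MyKey.class, MyValue.class);
 * PCollection<KV<MyKey, MyValue>> records = pipeline.apply(Read.from(source));
 * }</pre>
 */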
public class HDFSFileSource<T, K, V> extends BoundedSource<T> {
  private static final long serialVersionUID = 0L;

  private final String filepattern;
  private final Class<? extends FileInputFormat<K, V>> formatClass;
  private final Coder<T> coder;
  private final SerializableFunction<KV<K, V>, T> inputConverter;
  private final SerializableConfiguration serializableConfiguration;
  private final SerializableSplit serializableSplit;
  private final boolean validate;

  private HDFSFileSource(String filepattern,
                         Class<? extends FileInputFormat<?, ?>> formatClass,
                         Coder<T> coder,
                         SerializableFunction<KV<K, V>, T> inputConverter,
                         SerializableConfiguration serializableConfiguration,
                         SerializableSplit serializableSplit,
                         boolean validate) {
    this.filepattern = filepattern;
    this.formatClass = castClass(formatClass);
    this.coder = coder;
    this.inputConverter = inputConverter;
    this.serializableConfiguration = serializableConfiguration;
    this.serializableSplit = serializableSplit;
    this.validate = validate;
  }

  // =======================================================================
  // Factory methods
  // =======================================================================

  public static <T, K, V, W extends FileInputFormat<K, V>> HDFSFileSource<T, K, V>
  from(String filepattern,
       Class<W> formatClass,
       Coder<T> coder,
       SerializableFunction<KV<K, V>, T> inputConverter) {
    return new HDFSFileSource<>(filepattern, formatClass, coder, inputConverter, null, null, true);
  }

  public static <K, V, W extends FileInputFormat<K, V>> HDFSFileSource<KV<K, V>, K, V>
  from(String filepattern,
       Class<W> formatClass,
       Class<K> keyClass,
       Class<V> valueClass) {
    KvCoder<K, V> coder = KvCoder.of(getDefaultCoder(keyClass), getDefaultCoder(valueClass));
    SerializableFunction<KV<K, V>, KV<K, V>> inputConverter =
        new SerializableFunction<KV<K, V>, KV<K, V>>() {
          @Override
          public KV<K, V> apply(KV<K, V> input) {
            return input;
          }
        };
    return new HDFSFileSource<>(filepattern, formatClass, coder, inputConverter, null, null, true);
  }

  public static HDFSFileSource<String, LongWritable, Text>
  fromText(String filepattern) {
    SerializableFunction<KV<LongWritable, Text>, String> inputConverter =
        new SerializableFunction<KV<LongWritable, Text>, String>() {
      @Override
      public String apply(KV<LongWritable, Text> input) {
        return input.getValue().toString();
      }
    };
    return from(filepattern, TextInputFormat.class, StringUtf8Coder.of(), inputConverter);
  }

  public static <T> HDFSFileSource<T, AvroKey<T>, NullWritable>
  fromAvro(String filepattern, final AvroCoder<T> coder) {
    Class<AvroKeyInputFormat<T>> formatClass = castClass(AvroKeyInputFormat.class);
    SerializableFunction<KV<AvroKey<T>, NullWritable>, T> inputConverter =
        new SerializableFunction<KV<AvroKey<T>, NullWritable>, T>() {
          @Override
          public T apply(KV<AvroKey<T>, NullWritable> input) {
            try {
              return CoderUtils.clone(coder, input.getKey().datum());
            } catch (CoderException e) {
              throw new RuntimeException(e);
            }
          }
        };
    Configuration conf = new Configuration();
    conf.set("avro.schema.input.key", coder.getSchema().toString());
    return from(filepattern, formatClass, coder, inputConverter).withConfiguration(conf);
  }

  public static HDFSFileSource<GenericRecord, AvroKey<GenericRecord>, NullWritable>
  fromAvro(String filepattern, Schema schema) {
    return fromAvro(filepattern, AvroCoder.of(schema));
  }

  public static <T> HDFSFileSource<T, AvroKey<T>, NullWritable>
  fromAvro(String filepattern, Class<T> cls) {
    return fromAvro(filepattern, AvroCoder.of(cls));
  }
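
  // Illustrative sketch of reading Avro records with the factories above; "MyRecord", the path,
  // and the "pipeline" variable are placeholders (MyRecord stands for an Avro-generated class):
  //
  //   HDFSFileSource<MyRecord, AvroKey<MyRecord>, NullWritable> avroSource =
  //       HDFSFileSource.fromAvro("hdfs://namenode/events/*.avro", MyRecord.class);
  //   PCollection<MyRecord> events = pipeline.apply(Read.from(avroSource));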

  // =======================================================================
  // Builder methods
  // =======================================================================

  public HDFSFileSource<T, K, V> withCoder(Coder<T> coder) {
    return new HDFSFileSource<>(
        filepattern, formatClass, coder, inputConverter,
        serializableConfiguration, serializableSplit, validate);
  }

  public HDFSFileSource<T, K, V> withInputConverter(
      SerializableFunction<KV<K, V>, T> inputConverter) {
    return new HDFSFileSource<>(
        filepattern, formatClass, coder, inputConverter,
        serializableConfiguration, serializableSplit, validate);
  }

  public HDFSFileSource<T, K, V> withConfiguration(Configuration conf) {
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(conf);
    return new HDFSFileSource<>(
        filepattern, formatClass, coder, inputConverter,
        serializableConfiguration, serializableSplit, validate);
  }

  public HDFSFileSource<T, K, V> withInputSplit(InputSplit inputSplit) {
    SerializableSplit serializableSplit = new SerializableSplit(inputSplit);
    return new HDFSFileSource<>(
        filepattern, formatClass, coder, inputConverter,
        serializableConfiguration, serializableSplit, validate);
  }

  public HDFSFileSource<T, K, V> withoutValidation() {
    return new HDFSFileSource<>(
        filepattern, formatClass, coder, inputConverter,
        serializableConfiguration, serializableSplit, false);
  }
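
  // The builder methods above each return a new immutable copy, so they can be chained, e.g.
  // (illustrative sketch; "conf" stands for a hypothetical org.apache.hadoop.conf.Configuration):
  //
  //   HDFSFileSource<String, LongWritable, Text> source = HDFSFileSource
  //       .fromText("hdfs://namenode/logs/*")
  //       .withConfiguration(conf)
  //       .withoutValidation();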

  // =======================================================================
  // BoundedSource
  // =======================================================================

  @Override
  public List<? extends BoundedSource<T>> splitIntoBundles(
      long desiredBundleSizeBytes,
      PipelineOptions options) throws Exception {
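    // If this source has not yet been bound to a single Hadoop InputSplit, compute the splits
    // for the file pattern and wrap each one in its own sub-source; otherwise this source
    // already represents one split and cannot be subdivided further.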
    if (serializableSplit == null) {
      return Lists.transform(computeSplits(desiredBundleSizeBytes),
          new Function<InputSplit, BoundedSource<T>>() {
            @Override
            public BoundedSource<T> apply(@Nullable InputSplit inputSplit) {
              SerializableSplit serializableSplit = new SerializableSplit(inputSplit);
              return new HDFSFileSource<>(
                  filepattern, formatClass, coder, inputConverter,
                  serializableConfiguration, serializableSplit, validate);
            }
          });
    } else {
      return ImmutableList.of(this);
    }
  }

  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
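    // Sum the lengths of all files that the input format resolves for the file pattern.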
    long size = 0;
    Job job = Job.getInstance(); // new instance
    for (FileStatus st : listStatus(createFormat(job), job)) {
      size += st.getLen();
    }
    return size;
  }

  @Override
  public boolean producesSortedKeys(PipelineOptions options) throws Exception {
    return false;
  }

  @Override
  public BoundedReader<T> createReader(PipelineOptions options) throws IOException {
    this.validate();
    return new HDFSFileReader<>(this, filepattern, formatClass, serializableSplit);
  }

  @Override
  public void validate() {
    if (validate) {
      try {
        FileSystem fs = FileSystem.get(new URI(filepattern), Job.getInstance().getConfiguration());
        FileStatus[] fileStatuses = fs.globStatus(new Path(filepattern));
        checkState(
            fileStatuses != null && fileStatuses.length > 0,
            "Unable to find any files matching %s", filepattern);
      } catch (IOException | URISyntaxException e) {
        throw new RuntimeException(e);
      }
    }
  }

  @Override
  public Coder<T> getDefaultOutputCoder() {
    return coder;
  }

  // =======================================================================
  // Helpers
  // =======================================================================

  private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
      throws IOException, IllegalAccessException, InstantiationException {
    Job job = Job.getInstance();
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
  }

  private FileInputFormat<K, V> createFormat(Job job)
      throws IOException, IllegalAccessException, InstantiationException {
    Path path = new Path(filepattern);
    FileInputFormat.addInputPath(job, path);
    return formatClass.newInstance();
  }

  private List<FileStatus> listStatus(FileInputFormat<K, V> format, Job job)
      throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    // FileInputFormat#listStatus is protected, so call using reflection
    Method listStatus = FileInputFormat.class.getDeclaredMethod("listStatus", JobContext.class);
    listStatus.setAccessible(true);
    @SuppressWarnings("unchecked")
    List<FileStatus> stat = (List<FileStatus>) listStatus.invoke(format, job);
    return stat;
  }

  @SuppressWarnings("unchecked")
  private static <T> Coder<T> getDefaultCoder(Class<T> c) {
    if (Writable.class.isAssignableFrom(c)) {
      Class<? extends Writable> writableClass = (Class<? extends Writable>) c;
      return (Coder<T>) WritableCoder.of(writableClass);
    } else if (Void.class.equals(c)) {
      return (Coder<T>) VoidCoder.of();
    }
    // TODO: how to use registered coders here?
    throw new IllegalStateException("Cannot find coder for " + c);
  }

  @SuppressWarnings("unchecked")
  private static <T> Class<T> castClass(Class<?> aClass) {
    return (Class<T>) aClass;
  }

  // =======================================================================
  // BoundedReader
  // =======================================================================

  private static class HDFSFileReader<T, K, V> extends BoundedSource.BoundedReader<T> {

    private final HDFSFileSource<T, K, V> source;
    private final String filepattern;
    private final Class<? extends FileInputFormat<K, V>> formatClass;
    private final Job job;

    private List<InputSplit> splits;
    private ListIterator<InputSplit> splitsIterator;

    private Configuration conf;
    private FileInputFormat<?, ?> format;
    private TaskAttemptContext attemptContext;
    private RecordReader<K, V> currentReader;
    private KV<K, V> currentPair;

    HDFSFileReader(
        HDFSFileSource<T, K, V> source,
        String filepattern,
        Class<? extends FileInputFormat<K, V>> formatClass,
        SerializableSplit serializableSplit)
        throws IOException {
      this.source = source;
      this.filepattern = filepattern;
      this.formatClass = formatClass;
      this.job = Job.getInstance();

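      // Copy any user-supplied Hadoop configuration entries onto this reader's job.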
      if (source.serializableConfiguration != null) {
        for (Map.Entry<String, String> entry : source.serializableConfiguration.get()) {
          job.getConfiguration().set(entry.getKey(), entry.getValue());
        }
      }

      if (serializableSplit != null) {
        this.splits = ImmutableList.of(serializableSplit.getSplit());
        this.splitsIterator = splits.listIterator();
      }
    }

    @Override
    public boolean start() throws IOException {
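      // Bind the file pattern to the job, instantiate the input format, and, unless this
      // reader was created for a single pre-assigned split, compute the full list of splits.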
      Path path = new Path(filepattern);
      FileInputFormat.addInputPath(job, path);

      conf = job.getConfiguration();
      try {
        format = formatClass.newInstance();
      } catch (InstantiationException | IllegalAccessException e) {
        throw new IOException("Cannot instantiate file input format " + formatClass, e);
      }
      attemptContext = new TaskAttemptContextImpl(conf, new TaskAttemptID());

      if (splitsIterator == null) {
        splits = format.getSplits(job);
        splitsIterator = splits.listIterator();
      }

      return advance();
    }

    @Override
    public boolean advance() throws IOException {
      try {
        if (currentReader != null && currentReader.nextKeyValue()) {
          currentPair = nextPair();
          return true;
        } else {
          while (splitsIterator.hasNext()) {
            // advance the reader and see if it has records
            InputSplit nextSplit = splitsIterator.next();
            @SuppressWarnings("unchecked")
            RecordReader<K, V> reader =
                (RecordReader<K, V>) format.createRecordReader(nextSplit, attemptContext);
            if (currentReader != null) {
              currentReader.close();
            }
            currentReader = reader;
            currentReader.initialize(nextSplit, attemptContext);
            if (currentReader.nextKeyValue()) {
              currentPair = nextPair();
              return true;
            }
            currentReader.close();
            currentReader = null;
          }
          // either no next split or all readers were empty
          currentPair = null;
          return false;
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException(e);
      }
    }

    @Override
    public T getCurrent() throws NoSuchElementException {
      if (currentPair == null) {
        throw new NoSuchElementException();
      }
      return source.inputConverter.apply(currentPair);
    }

    @Override
    public void close() throws IOException {
      if (currentReader != null) {
        currentReader.close();
        currentReader = null;
      }
      currentPair = null;
    }

    @Override
    public BoundedSource<T> getCurrentSource() {
      return source;
    }

    @SuppressWarnings("unchecked")
    private KV<K, V> nextPair() throws IOException, InterruptedException {
      K key = currentReader.getCurrentKey();
      V value = currentReader.getCurrentValue();
      // clone Writable objects since they are reused between calls to RecordReader#nextKeyValue
      if (key instanceof Writable) {
        key = (K) WritableUtils.clone((Writable) key, conf);
      }
      if (value instanceof Writable) {
        value = (V) WritableUtils.clone((Writable) value, conf);
      }
      return KV.of(key, value);
    }

    // =======================================================================
    // Optional overrides
    // =======================================================================

    @Override
    public Double getFractionConsumed() {
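      // Estimate progress as the fraction of splits already consumed plus the current
      // reader's reported progress within its split, assuming splits of roughly equal size.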
      if (currentReader == null) {
        return 0.0;
      }
      if (splits.isEmpty()) {
        return 1.0;
      }
      int index = splitsIterator.previousIndex();
      int numReaders = splits.size();
      if (index == numReaders) {
        return 1.0;
      }
      double before = 1.0 * index / numReaders;
      double after = 1.0 * (index + 1) / numReaders;
      Double fractionOfCurrentReader = getProgress();
      if (fractionOfCurrentReader == null) {
        return before;
      }
      return before + fractionOfCurrentReader * (after - before);
    }

    private Double getProgress() {
      try {
        return (double) currentReader.getProgress();
      } catch (IOException | InterruptedException e) {
        return null;
      }
    }

  }

  // =======================================================================
  // SerializableSplit
  // =======================================================================

  /**
   * A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit}s to be
   * serialized using Java's standard serialization mechanisms. Note that the InputSplit
   * has to be Writable (which most are).
   */
  private static class SerializableSplit implements Externalizable {
    private static final long serialVersionUID = 0L;

    private InputSplit split;

    public SerializableSplit() {
    }

    public SerializableSplit(InputSplit split) {
      checkArgument(split instanceof Writable, "Split is not writable: %s", split);
      this.split = split;
    }

    public InputSplit getSplit() {
      return split;
    }

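    // The wire format is the split's class name followed by the split's own Writable
    // serialization, so readExternal can re-instantiate the concrete class and repopulate
    // it via readFields.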
    @Override
    public void writeExternal(ObjectOutput out) throws IOException {
      out.writeUTF(split.getClass().getCanonicalName());
      ((Writable) split).write(out);
    }

    @Override
    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
      String className = in.readUTF();
      try {
        split = (InputSplit) Class.forName(className).newInstance();
        ((Writable) split).readFields(in);
      } catch (InstantiationException | IllegalAccessException e) {
        throw new IOException(e);
      }
    }
  }

}



