com.google.cloud.dataflow.sdk.io.hdfs.HDFSFileSink
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.dataflow.sdk.io.hdfs;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.io.Sink;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.security.UserGroupInformation;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import static com.google.common.base.Preconditions.checkState;
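/**
 * A {@link Sink} that writes records to HDFS (or any Hadoop-compatible file system) through a
 * Hadoop {@link FileOutputFormat}. Each element is converted into a Hadoop key/value pair by the
 * supplied {@link SerializableFunction} before being written.
 *
 * <p>Usage sketch (editor's illustration, not part of the original source; assumes the SDK's
 * {@code Write.to} transform and an existing {@code PCollection<String> lines}):
 *
 * <pre>{@code
 * lines.apply(Write.to(HDFSFileSink.<String>toText("hdfs://namenode/output/dir")));
 * }</pre>
 */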
public class HDFSFileSink<T, K, V> extends Sink<T> {
private static final JobID jobId = new JobID(
Long.toString(System.currentTimeMillis()),
new Random().nextInt(Integer.MAX_VALUE));
private final String path;
private final Class<? extends FileOutputFormat<K, V>> formatClass;
private final SerializableFunction<T, KV<K, V>> outputConverter;
private final SerializableConfiguration serializableConfiguration;
private final String username;
private final boolean validate;
private HDFSFileSink(String path,
Class<? extends FileOutputFormat<K, V>> formatClass,
SerializableFunction<T, KV<K, V>> outputConverter,
SerializableConfiguration serializableConfiguration,
String username,
boolean validate) {
this.path = path;
this.formatClass = formatClass;
this.outputConverter = outputConverter;
this.serializableConfiguration = serializableConfiguration;
this.username = username;
this.validate = validate;
}
// =======================================================================
// Factory methods
// =======================================================================
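/**
 * Creates a sink that writes to {@code path} with the given {@link FileOutputFormat} class,
 * converting each element into a Hadoop key/value pair via {@code outputConverter}.
 */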
public static <T, K, V, W extends FileOutputFormat<K, V>> HDFSFileSink<T, K, V>
to(String path, Class<W> formatClass, SerializableFunction<T, KV<K, V>> outputConverter) {
return new HDFSFileSink<>(path, formatClass, outputConverter, null, null, true);
}
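/**
 * Creates a sink that writes each element's {@code toString()} value as a line of text
 * via {@link TextOutputFormat}.
 */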
public static <T> HDFSFileSink<T, NullWritable, Text> toText(String path) {
SerializableFunction<T, KV<NullWritable, Text>> outputConverter =
new SerializableFunction<T, KV<NullWritable, Text>>() {
@Override
public KV<NullWritable, Text> apply(T input) {
return KV.of(NullWritable.get(), new Text(input.toString()));
}
};
return to(path, TextOutputFormat.class, outputConverter);
}
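/** Creates a sink that writes elements as Avro records via {@link AvroKeyOutputFormat}. */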
public static <T> HDFSFileSink<T, AvroKey<T>, NullWritable> toAvro(String path) {
SerializableFunction<T, KV<AvroKey<T>, NullWritable>> outputConverter =
new SerializableFunction<T, KV<AvroKey<T>, NullWritable>>() {
@Override
public KV<AvroKey<T>, NullWritable> apply(T input) {
return KV.of(new AvroKey<>(input), NullWritable.get());
}
};
return to(path, AvroKeyOutputFormat.class, outputConverter);
}
// =======================================================================
// Builder methods
// =======================================================================
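/** Returns a copy of this sink that uses the given Hadoop {@link Configuration}. */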
public HDFSFileSink<T, K, V> withConfiguration(Configuration conf) {
SerializableConfiguration serializableConfiguration = new SerializableConfiguration(conf);
return new HDFSFileSink<>(
path, formatClass, outputConverter, serializableConfiguration, username, validate);
}
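/** Returns a copy of this sink that performs all HDFS operations as the given remote user. */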
public HDFSFileSink<T, K, V> withUsername(String username) {
return new HDFSFileSink<>(
path, formatClass, outputConverter, serializableConfiguration, username, validate);
}
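/** Returns a copy of this sink that skips the output-path check in {@link #validate}. */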
public HDFSFileSink<T, K, V> withoutValidation() {
return new HDFSFileSink<>(
path, formatClass, outputConverter, serializableConfiguration, username, false);
}
// =======================================================================
// Sink
// =======================================================================
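/** Fails if the output path already exists, unless validation has been disabled. */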
@Override
public void validate(PipelineOptions options) {
if (validate) {
try {
FileSystem fs = FileSystem.get(new URI(path), jobInstance().getConfiguration());
checkState(!fs.exists(new Path(path)), "Output path %s exists", path);
} catch (IOException | URISyntaxException e) {
throw new RuntimeException(e);
}
}
}
@Override
public Sink.WriteOperation<T, ?> createWriteOperation(PipelineOptions options) {
return new HDFSWriteOperation<>(this, path, formatClass);
}
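/** Builds a {@link Job} carrying the serialized configuration entries and this sink's job ID. */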
private Job jobInstance() throws IOException {
Job job = Job.getInstance();
if (serializableConfiguration != null) {
for (Map.Entry<String, String> entry : serializableConfiguration.get()) {
job.getConfiguration().set(entry.getKey(), entry.getValue());
}
}
job.setJobID(jobId);
return job;
}
// =======================================================================
// WriteOperation
// =======================================================================
/** {@link WriteOperation} for HDFS. */
private static class HDFSWriteOperation<T, K, V> extends WriteOperation<T, String> {
private final HDFSFileSink<T, K, V> sink;
private final String path;
private final Class<? extends FileOutputFormat<K, V>> formatClass;
HDFSWriteOperation(HDFSFileSink<T, K, V> sink,
String path,
Class<? extends FileOutputFormat<K, V>> formatClass) {
this.sink = sink;
this.path = path;
this.formatClass = formatClass;
}
@Override
public void initialize(PipelineOptions options) throws Exception {
Job job = sink.jobInstance();
FileOutputFormat.setOutputPath(job, new Path(path));
}
@Override
public void finalize(final Iterable<String> writerResults, PipelineOptions options)
throws Exception {
if (sink.username == null) {
doFinalize(writerResults);
} else {
UserGroupInformation.createRemoteUser(sink.username).doAs(
new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
doFinalize(writerResults);
return null;
}
});
}
}
private void doFinalize(Iterable<String> writerResults) throws Exception {
Job job = sink.jobInstance();
FileSystem fs = FileSystem.get(new URI(path), job.getConfiguration());
// If there are 0 output shards, just create output folder.
if (!writerResults.iterator().hasNext()) {
fs.mkdirs(new Path(path));
return;
}
// job successful
JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
FileOutputCommitter outputCommitter = new FileOutputCommitter(new Path(path), context);
outputCommitter.commitJob(context);
// get actual output shards
Set<String> actual = Sets.newHashSet();
FileStatus[] statuses = fs.listStatus(new Path(path), new PathFilter() {
@Override
public boolean accept(Path path) {
String name = path.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
});
// get expected output shards
Set<String> expected = Sets.newHashSet(writerResults);
checkState(
expected.size() == Lists.newArrayList(writerResults).size(),
"Data loss due to writer results hash collision");
for (FileStatus s : statuses) {
String name = s.getPath().getName();
int pos = name.indexOf('.');
actual.add(pos > 0 ? name.substring(0, pos) : name);
}
checkState(actual.equals(expected), "Writer results and output files do not match");
// rename output shards to Hadoop style, i.e. part-r-00000.txt
int i = 0;
for (FileStatus s : statuses) {
String name = s.getPath().getName();
int pos = name.indexOf('.');
String ext = pos > 0 ? name.substring(pos) : "";
fs.rename(
s.getPath(),
new Path(s.getPath().getParent(), String.format("part-r-%05d%s", i, ext)));
i++;
}
}
@Override
public Writer<T, String> createWriter(PipelineOptions options) throws Exception {
return new HDFSWriter<>(this, path, formatClass);
}
@Override
public Sink<T> getSink() {
return sink;
}
@Override
public Coder<String> getWriterResultCoder() {
return StringUtf8Coder.of();
}
}
// =======================================================================
// Writer
// =======================================================================
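/**
 * {@link Writer} for HDFS. Each instance writes one bundle of elements as a single Hadoop
 * reduce task attempt and commits its output through {@link FileOutputCommitter} on close.
 */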
private static class HDFSWriter<T, K, V> extends Writer<T, String> {
private final HDFSWriteOperation<T, K, V> writeOperation;
private final String path;
private final Class<? extends FileOutputFormat<K, V>> formatClass;
// unique hash for each task
private int hash;
private TaskAttemptContext context;
private RecordWriter<K, V> recordWriter;
private FileOutputCommitter outputCommitter;
HDFSWriter(HDFSWriteOperation<T, K, V> writeOperation,
String path,
Class<? extends FileOutputFormat<K, V>> formatClass) {
this.writeOperation = writeOperation;
this.path = path;
this.formatClass = formatClass;
}
@Override
public void open(final String uId) throws Exception {
if (writeOperation.sink.username == null) {
doOpen(uId);
} else {
UserGroupInformation.createRemoteUser(writeOperation.sink.username).doAs(
new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
doOpen(uId);
return null;
}
});
}
}
private void doOpen(String uId) throws Exception {
this.hash = uId.hashCode();
Job job = writeOperation.sink.jobInstance();
FileOutputFormat.setOutputPath(job, new Path(path));
// Each Writer is responsible for writing one bundle of elements and is represented by one
// unique Hadoop task based on uId/hash. All tasks share the same job ID. Since Dataflow
// handles retrying of failed bundles, each task has one attempt only.
JobID jobId = job.getJobID();
TaskID taskId = new TaskID(jobId, TaskType.REDUCE, hash);
context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID(taskId, 0));
FileOutputFormat<K, V> outputFormat = formatClass.newInstance();
recordWriter = outputFormat.getRecordWriter(context);
outputCommitter = (FileOutputCommitter) outputFormat.getOutputCommitter(context);
}
@Override
public void write(T value) throws Exception {
KV<K, V> kv = writeOperation.sink.outputConverter.apply(value);
recordWriter.write(kv.getKey(), kv.getValue());
}
@Override
public String close() throws Exception {
if (writeOperation.sink.username == null) {
return doClose();
} else {
return UserGroupInformation.createRemoteUser(writeOperation.sink.username).doAs(
new PrivilegedExceptionAction<String>() {
@Override
public String run() throws Exception {
return doClose();
}
});
}
}
private String doClose() throws Exception {
// task/attempt successful
recordWriter.close(context);
outputCommitter.commitTask(context);
// result is prefix of the output file name
return String.format("part-r-%d", hash);
}
@Override
public WriteOperation<T, String> getWriteOperation() {
return writeOperation;
}
}
}