/*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package gobblin.source.extractor.hadoop;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.extract.AbstractSource;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.util.HadoopUtils;
/**
 * An implementation of {@link gobblin.source.Source} that uses a Hadoop {@link FileInputFormat} to get a
 * {@link FileSplit} per {@link Extractor} returned by {@link #getExtractor(WorkUnitState)} and a
 * {@link RecordReader} to read the {@link FileSplit}.
 *
 * <p>
 *   This class can read either keys of type {@code K} or values of type {@code V} supported by the
 *   given {@link FileInputFormat}, depending on the property {@link #FILE_INPUT_READ_KEYS_KEY}. It reads keys
 *   if the property is set to {@code true}, otherwise it reads values. By default, it reads values
 *   through the given {@link FileInputFormat}.
 * </p>
 *
 * <p>
 *   A concrete implementation of this class should implement {@link #getFileInputFormat(State, Configuration)}
 *   and {@link #getExtractor(WorkUnitState, RecordReader, FileSplit, boolean)}, the latter of which returns a
 *   {@link HadoopFileInputExtractor} that also needs a concrete implementation.
 * </p>
 *
 * @param <S> output schema type
 * @param <D> output data record type
 * @param <K> key type expected by the {@link FileInputFormat}
 * @param <V> value type expected by the {@link FileInputFormat}
 *
 * @author Yinan Li
 */
public abstract class HadoopFileInputSource<S, D, K, V> extends AbstractSource<S, D> {
  private static final String HADOOP_SOURCE_KEY_PREFIX = "source.hadoop.";
  public static final String FILE_INPUT_FORMAT_CLASS_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.input.format.class";
  public static final String FILE_SPLITS_DESIRED_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.splits.desired";
  public static final int DEFAULT_FILE_SPLITS_DESIRED = 1;
  public static final String FILE_INPUT_PATHS_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.input.paths";
  public static final String FILE_INPUT_READ_KEYS_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.read.keys";
  public static final boolean DEFAULT_FILE_INPUT_READ_KEYS = false;
  public static final String FILE_SPLIT_PATH_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.split.path";
  static final String FILE_SPLIT_BYTES_STRING_KEY = HADOOP_SOURCE_KEY_PREFIX + "file.split.bytes.string";
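  /*
   * Illustrative (not authoritative) job configuration for a concrete subclass of this source,
   * using the property keys defined above. The input format and paths below are hypothetical
   * placeholders:
   *
   *   source.hadoop.file.input.format.class=org.apache.hadoop.mapreduce.lib.input.TextInputFormat
   *   source.hadoop.file.input.paths=/data/input/2016/01,/data/input/2016/02
   *   source.hadoop.file.read.keys=false
   */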
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    try {
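      // The Hadoop Job is used only as a carrier for the Configuration and the input paths
      // passed to FileInputFormat#getSplits; no MapReduce job is submitted.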
      Job job = Job.getInstance(new Configuration());
      if (state.contains(FILE_INPUT_PATHS_KEY)) {
        for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
          FileInputFormat.addInputPath(job, new Path(inputPath));
        }
      }
      FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
      List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
      if (fileSplits == null || fileSplits.isEmpty()) {
        return ImmutableList.of();
      }
      Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY) ?
          Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null;
      String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
      String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
      List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
      for (InputSplit inputSplit : fileSplits) {
        // Create one WorkUnit per InputSplit
        FileSplit fileSplit = (FileSplit) inputSplit;
        Extract extract = createExtract(tableType, tableNamespace, tableName);
        WorkUnit workUnit = WorkUnit.create(extract);
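        // Serialize the FileSplit into the WorkUnit so getExtractor(WorkUnitState) can
        // reconstruct it on the task side.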
        workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
        workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
        workUnits.add(workUnit);
      }
      return workUnits;
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to get workunits", ioe);
    }
  }
  @Override
  public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
      throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }
    Configuration configuration = new Configuration();
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);
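    // Reconstruct the FileSplit that getWorkunits(SourceState) serialized into the WorkUnit.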
    String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
    TaskAttemptContext taskAttemptContext =
        getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
    try {
      RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
      recordReader.initialize(fileSplit, taskAttemptContext);
      boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
      return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
  }
  @Override
  public void shutdown(SourceState state) {
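    // Nothing to do.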
  }
  /**
   * Get a {@link FileInputFormat} instance used to get {@link FileSplit}s and a {@link RecordReader}
   * for every {@link FileSplit}.
   *
   * <p>
   *   This default implementation simply creates a new instance of a {@link FileInputFormat} class
   *   specified using the configuration property {@link #FILE_INPUT_FORMAT_CLASS_KEY}.
   * </p>
   *
   * @param state a {@link State} object carrying configuration properties
   * @param configuration a Hadoop {@link Configuration} object carrying Hadoop configurations
   * @return a {@link FileInputFormat} instance
   */
  @SuppressWarnings("unchecked")
  protected FileInputFormat<K, V> getFileInputFormat(State state, Configuration configuration) {
    Preconditions.checkArgument(state.contains(FILE_INPUT_FORMAT_CLASS_KEY));
    try {
      return (FileInputFormat<K, V>) ReflectionUtils.newInstance(
          Class.forName(state.getProp(FILE_INPUT_FORMAT_CLASS_KEY)), configuration);
    } catch (ClassNotFoundException cnfe) {
      throw new RuntimeException(cnfe);
    }
  }
  /**
   * Get a {@link HadoopFileInputExtractor} instance.
   *
   * @param workUnitState a {@link WorkUnitState} object carrying Gobblin configuration properties
   * @param recordReader a Hadoop {@link RecordReader} object used to read input records
   * @param fileSplit the {@link FileSplit} to read input records from
   * @param readKeys whether the returned {@link HadoopFileInputExtractor} should read keys of type {@code K};
   *                 by default values of type {@code V} are read
   * @return a {@link HadoopFileInputExtractor} instance
   */
  protected abstract HadoopFileInputExtractor<S, D, K, V> getExtractor(WorkUnitState workUnitState,
      RecordReader<K, V> recordReader, FileSplit fileSplit, boolean readKeys);
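  /*
   * TaskAttemptContext is a concrete class in Hadoop 1.x but an interface in Hadoop 2.x, where
   * org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl is the implementation, so the
   * implementation class is loaded reflectively, presumably to avoid a compile-time dependency
   * on a specific Hadoop version. Note that this method only attempts the Hadoop 2.x class and
   * fails fast otherwise.
   */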
  private TaskAttemptContext getTaskAttemptContext(Configuration configuration, TaskAttemptID taskAttemptID) {
    Class<?> taskAttemptContextClass;
    try {
      // For Hadoop 2.x
      taskAttemptContextClass = Class.forName("org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl");
    } catch (ClassNotFoundException cnfe) {
      throw new RuntimeException(cnfe);
    }
    try {
      return (TaskAttemptContext) taskAttemptContextClass.getDeclaredConstructor(
          Configuration.class, TaskAttemptID.class).newInstance(configuration, taskAttemptID);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  /**
   * A factory class for creating new dummy {@link TaskAttemptID}s.
   *
   * <p>
   *   This class extends {@link TaskAttemptID} so it has access to some protected string constants
   *   in {@link TaskAttemptID}.
   * </p>
   */
  private static class DummyTaskAttemptIDFactory extends TaskAttemptID {
    /**
     * Create a new {@link TaskAttemptID} instance.
     *
     * @return a new {@link TaskAttemptID} instance
     */
    public static TaskAttemptID newTaskAttemptID() {
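      // Builds an attempt ID string of the form "attempt_<currentTimeMillis>_0_m_0_0"
      // ("attempt" and '_' come from the protected ATTEMPT and SEPARATOR constants) and
      // parses it back with TaskAttemptID.forName.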
      return TaskAttemptID.forName(ATTEMPT + SEPARATOR + Long.toString(System.currentTimeMillis()) +
          SEPARATOR + 0 + SEPARATOR + 'm' + SEPARATOR + 0 + SEPARATOR + 0);
    }
  }
}