All downloads are free. The search and download features use the official Maven repository.

io.cdap.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs Maven / Gradle / Ivy

There is a newer version: 6.10.1
Show newest version
/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.cdap.internal.app.runtime.batch.dataset.input;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Map;

/**
 * This utility class supports MapReduce jobs that have multiple inputs with
 * a different {@link InputFormat} and {@link Mapper} for each path
 */
public final class MultipleInputs {
  private static final String INPUT_CONFIGS = "mapreduce.input.multipleinputs.inputs";

  private static final Gson GSON = new GsonBuilder().enableComplexMapKeySerialization().create();
  private static final Type STRING_MAPPER_INPUT_MAP_TYPE = new TypeToken>() { }.getType();

  private MultipleInputs() {
  }

  /**
   * Add a {@link Path} with a custom {@link InputFormat} and
   * {@link Mapper} to the list of inputs for the map-reduce job.
   *
   * @param job The {@link Job}
   * @param namedInput name of the input
   * @param inputFormatClass the name of the InputFormat class to be used for this input
   * @param inputConfigs the configurations to be used for this input
   * @param mapperClass {@link Mapper} class to use for this path
   */
  @SuppressWarnings("unchecked")
  public static void addInput(Job job, String namedInput, String inputFormatClass, Map inputConfigs,
                              Class mapperClass) {
    Configuration conf = job.getConfiguration();

    Map map = getInputMap(conf);
    // this shouldn't happen, because it is already protected against in BasicMapReduceContext#addInput
    if (map.containsKey(namedInput)) {
      throw new IllegalArgumentException("Input already configured: " + namedInput);
    }
    map.put(namedInput, new MapperInput(inputFormatClass, inputConfigs, mapperClass));
    conf.set(INPUT_CONFIGS, GSON.toJson(map));

    job.setInputFormatClass(MultiInputFormat.class);
  }

  /**
   * @param conf the Configuration from which to deserialize the input configurations
   * @return a mapping from input name to the MapperInput for that input
   */
  public static Map getInputMap(Configuration conf) {
    String mapString = conf.get(INPUT_CONFIGS);
    if (mapString == null) {
      return new HashMap<>();
    }
    return GSON.fromJson(mapString, STRING_MAPPER_INPUT_MAP_TYPE);
  }

  /**
   * A simple POJO for encapsulating information for an input. We don't use
   * {@link io.cdap.cdap.internal.app.runtime.batch.dataset.input.MapperInput}, because that consists of an Interface,
   * which doesn't serialize well.
   */
  public static final class MapperInput {
    private final String inputFormatClassName;
    private final Map inputFormatConfiguration;
    private final String mapperClassName;

    public MapperInput(String inputFormatClassName, Map inputFormatConfiguration,
                       Class mapper) {
      this.inputFormatClassName = inputFormatClassName;
      this.inputFormatConfiguration = inputFormatConfiguration;
      this.mapperClassName = mapper.getName();
    }

    public String getInputFormatClassName() {
      return inputFormatClassName;
    }

    public Map getInputFormatConfiguration() {
      return inputFormatConfiguration;
    }

    public String getMapperClassName() {
      return mapperClassName;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy