All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.druid.indexer.path.StaticPathSpec Maven / Gradle / Ivy

There is a newer version: 31.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexer.path;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;


/**
 * A {@link PathSpec} that adds a fixed, caller-supplied path string to a Hadoop {@link Job} as input.
 *
 * <p>The path string is expanded with {@link HadoopGlobPathSplitter#splitGlob(String)} and every
 * resulting path is registered under {@link MultipleInputs#DIR_FORMATS} together with the input
 * format class. When no input format is configured, {@link CombineTextInputFormat} or
 * {@link TextInputFormat} is used depending on {@link HadoopDruidIndexerConfig#isCombineText()}.
 *
 * <p>Instances hold only two final fields and expose no mutators.
 */
public class StaticPathSpec implements PathSpec
{
  private static final Logger log = new Logger(StaticPathSpec.class);

  private final String paths;
  private final Class<? extends InputFormat> inputFormat;

  /**
   * Creates a path spec from its JSON properties.
   *
   * @param paths       the input path string; may be {@code null}, in which case
   *                    {@link #addInputPaths} adds nothing
   * @param inputFormat the Hadoop input format to read the paths with; may be {@code null} to let
   *                    the indexer config choose a text-based default
   */
  @JsonCreator
  public StaticPathSpec(
      @JsonProperty("paths") String paths,
      @JsonProperty("inputFormat") Class<? extends InputFormat> inputFormat
  )
  {
    this.paths = paths;
    this.inputFormat = inputFormat;
  }

  /**
   * Registers this spec's paths on the given job and returns that same job instance.
   *
   * @param config indexer config, consulted for the default input format
   * @param job    the job to add input paths to
   * @return {@code job}
   */
  @Override
  public Job addInputPaths(HadoopDruidIndexerConfig config, Job job)
  {
    log.info("Adding paths[%s]", paths);

    addToMultipleInputs(config, job, paths, inputFormat);

    return job;
  }

  /**
   * @return the configured input format class, or {@code null} if none was supplied
   */
  @JsonProperty
  public Class<? extends InputFormat> getInputFormat()
  {
    return inputFormat;
  }

  /**
   * @return the configured path string, or {@code null} if none was supplied
   */
  @JsonProperty
  public String getPaths()
  {
    return paths;
  }

  /**
   * Single-path convenience overload of
   * {@link #addToMultipleInputs(HadoopDruidIndexerConfig, Job, Set, Class)}.
   * Does nothing when {@code path} is {@code null}.
   *
   * @param config           indexer config, consulted for the default input format
   * @param job              the job to add the input path to
   * @param path             the path string to add; may be {@code null}
   * @param inputFormatClass the input format to use; may be {@code null} for the default
   */
  public static void addToMultipleInputs(
      HadoopDruidIndexerConfig config,
      Job job,
      String path,
      Class<? extends InputFormat> inputFormatClass
  )
  {
    if (path != null) {
      addToMultipleInputs(config, job, ImmutableSet.of(path), inputFormatClass);
    }
  }

  /**
   * Expands each path string with {@link HadoopGlobPathSplitter#splitGlob(String)} and registers the
   * resulting paths on the job, all with the same input format. Does nothing when {@code paths} is
   * {@code null} or empty, or when the expansion yields no paths.
   *
   * @param config           indexer config; only consulted when {@code inputFormatClass} is
   *                         {@code null}
   * @param job              the job to add the input paths to
   * @param paths            the path strings to add; may be {@code null} or empty
   * @param inputFormatClass the input format to use; when {@code null},
   *                         {@link CombineTextInputFormat} is used if
   *                         {@link HadoopDruidIndexerConfig#isCombineText()} is true, otherwise
   *                         {@link TextInputFormat}
   */
  public static void addToMultipleInputs(
      HadoopDruidIndexerConfig config,
      Job job,
      Set<String> paths,
      Class<? extends InputFormat> inputFormatClass
  )
  {
    if (paths == null || paths.isEmpty()) {
      return;
    }

    Class<? extends InputFormat> inputFormatClassToUse = inputFormatClass;
    if (inputFormatClassToUse == null) {
      if (config.isCombineText()) {
        inputFormatClassToUse = CombineTextInputFormat.class;
      } else {
        inputFormatClassToUse = TextInputFormat.class;
      }
    }

    // Due to https://issues.apache.org/jira/browse/MAPREDUCE-5061 we can't directly do
    // MultipleInputs.addInputPath(job, path, inputFormatClassToUse)
    // but have to handle hadoop glob path ourselves correctly
    // This change and HadoopGlobPathSplitter.java can be removed once the hadoop issue is fixed

    // LinkedHashSet: drops duplicate paths while keeping the order in which they were produced.
    Set<String> pathStrings = new LinkedHashSet<>();
    for (String path : paths) {
      Iterables.addAll(pathStrings, HadoopGlobPathSplitter.splitGlob(path));
    }
    if (!pathStrings.isEmpty()) {
      addInputPath(job, pathStrings, inputFormatClassToUse);
    }
  }

  /**
   * Appends {@code path;inputFormatClassName} entries, comma-separated, to the job's
   * {@link MultipleInputs#DIR_FORMATS} setting for all but the last path, then hands the last path
   * to {@link MultipleInputs#addInputPath(Job, Path, Class)}.
   *
   * <p>Copied from {@code MultipleInputs.addInputPath} with slight modifications.
   *
   * @param job              the job whose configuration is updated
   * @param pathStrings      the already-expanded path strings; nothing is done if it is empty
   * @param inputFormatClass the input format to associate with every path; must not be {@code null}
   */
  private static void addInputPath(
      Job job,
      Iterable<String> pathStrings,
      Class<? extends InputFormat> inputFormatClass
  )
  {
    String[] paths = Iterables.toArray(pathStrings, String.class);
    if (paths.length == 0) {
      // Nothing to register; also protects the paths[paths.length - 1] access below.
      return;
    }

    Configuration conf = job.getConfiguration();
    // Start from whatever is already configured so earlier registrations are preserved.
    StringBuilder inputFormats = new StringBuilder(
        StringUtils.nullToEmptyNonDruidDataString(conf.get(MultipleInputs.DIR_FORMATS))
    );

    for (int i = 0; i < paths.length - 1; i++) {
      if (inputFormats.length() > 0) {
        inputFormats.append(',');
      }
      inputFormats.append(paths[i]).append(';').append(inputFormatClass.getName());
    }
    if (inputFormats.length() > 0) {
      conf.set(MultipleInputs.DIR_FORMATS, inputFormats.toString());
    }
    // add last one separately for possible initialization in MultipleInputs
    MultipleInputs.addInputPath(job, new Path(paths[paths.length - 1]), inputFormatClass);
  }

  /**
   * Two specs are equal when they are of the same class and have equal {@code paths} and
   * {@code inputFormat} values (both {@code null} counts as equal).
   */
  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    StaticPathSpec that = (StaticPathSpec) o;
    return Objects.equals(paths, that.paths) && Objects.equals(inputFormat, that.inputFormat);
  }

  /**
   * Hash consistent with {@link #equals(Object)}; a {@code null} field contributes 0.
   */
  @Override
  public int hashCode()
  {
    return 31 * Objects.hashCode(paths) + Objects.hashCode(inputFormat);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy