All downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.druid.data.input.impl.LocalInputSource Maven / Gradle / Ivy

There is a newer version: 30.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.data.input.impl;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterators;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.AndFileFilter;
import org.apache.commons.io.filefilter.IOFileFilter;
import org.apache.commons.io.filefilter.NameFileFilter;
import org.apache.commons.io.filefilter.NotFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputFileAttribute;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.utils.CollectionUtils;
import org.apache.druid.utils.Streams;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.File;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class LocalInputSource extends AbstractInputSource implements SplittableInputSource>
{
  private static final Logger log = new Logger(LocalInputSource.class);
  public static final String TYPE_KEY = "local";

  @Nullable
  private final File baseDir;
  @Nullable
  private final String filter;
  private final List files;

  @JsonCreator
  public LocalInputSource(
      @JsonProperty("baseDir") @Nullable File baseDir,
      @JsonProperty("filter") @Nullable String filter,
      @JsonProperty("files") @Nullable List files
  )
  {
    this.baseDir = baseDir;
    this.filter = baseDir != null ? Preconditions.checkNotNull(filter, "filter") : filter;
    this.files = files == null ? Collections.emptyList() : files;

    if (baseDir == null && CollectionUtils.isNullOrEmpty(files)) {
      throw new IAE("At least one of baseDir or files should be specified");
    }
  }

  @JsonIgnore
  @Nonnull
  @Override
  public Set getTypes()
  {
    return Collections.singleton(TYPE_KEY);
  }

  public LocalInputSource(File baseDir, String filter)
  {
    this(baseDir, filter, null);
  }

  @Nullable
  public File getBaseDir()
  {
    return baseDir;
  }

  /**
   * Returns the base directory for serialization. This is better than returning {@link File} directly, because
   * Jackson serializes {@link File} using {@link File#getAbsolutePath()}, and we'd prefer to not force relative
   * path resolution as part of serialization.
   */
  @Nullable
  @JsonProperty("baseDir")
  @JsonInclude(JsonInclude.Include.NON_NULL)
  private String getBaseDirForSerialization()
  {
    if (baseDir == null) {
      return null;
    } else {
      return baseDir.getPath();
    }
  }

  @Nullable
  @JsonProperty
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public String getFilter()
  {
    return filter;
  }

  public List getFiles()
  {
    return files;
  }

  /**
   * Returns the list of file paths for serialization. This is better than returning {@link File} directly, because
   * Jackson serializes {@link File} using {@link File#getAbsolutePath()}, and we'd prefer to not force relative
   * path resolution as part of serialization.
   */
  @JsonProperty("files")
  @JsonInclude(JsonInclude.Include.NON_EMPTY)
  private List getFilesForSerialization()
  {
    return getFiles().stream().map(File::getPath).collect(Collectors.toList());
  }

  @Override
  public Stream>> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
  {
    return Streams.sequentialStreamFrom(getSplitFileIterator(inputFormat, getSplitHintSpecOrDefault(splitHintSpec)))
                  .map(InputSplit::new);
  }

  @Override
  public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
  {
    return Iterators.size(getSplitFileIterator(inputFormat, getSplitHintSpecOrDefault(splitHintSpec)));
  }

  private Iterator> getSplitFileIterator(final InputFormat inputFormat, SplitHintSpec splitHintSpec)
  {
    final Iterator fileIterator = getFileIterator();
    return splitHintSpec.split(
        fileIterator,
        file -> new InputFileAttribute(
            file.length(),
            inputFormat != null
            ? inputFormat.getWeightedSize(file.getName(), file.length())
            : file.length()
        )
    );
  }

  @VisibleForTesting
  Iterator getFileIterator()
  {
    return
        Iterators.filter(
            Iterators.concat(
                getDirectoryListingIterator(),
                getFilesListIterator()
            ),
            file -> file.length() > 0
        );
  }

  private Iterator getDirectoryListingIterator()
  {
    if (baseDir == null) {
      return Collections.emptyIterator();
    } else {
      final IOFileFilter fileFilter;
      if (files == null) {
        fileFilter = new WildcardFileFilter(filter);
      } else {
        fileFilter = new AndFileFilter(
            new WildcardFileFilter(filter),
            new NotFileFilter(
                new NameFileFilter(files.stream().map(File::getName).collect(Collectors.toList()), IOCase.SENSITIVE)
            )
        );
      }
      Iterator fileIterator = FileUtils.iterateFiles(
          baseDir.getAbsoluteFile(),
          fileFilter,
          TrueFileFilter.INSTANCE
      );
      if (!fileIterator.hasNext()) {
        // base dir & filter are guaranteed to be non-null here
        // (by construction and non-null check of baseDir a few lines above):
        log.info("Local inputSource filter [%s] for base dir [%s] did not match any files", filter, baseDir);
      }
      return fileIterator;
    }
  }

  private Iterator getFilesListIterator()
  {
    if (files == null) {
      return Collections.emptyIterator();
    } else {
      return files.iterator();
    }
  }

  @Override
  public SplittableInputSource> withSplit(InputSplit> split)
  {
    return new LocalInputSource(null, null, split.get());
  }

  @Override
  public boolean needsFormat()
  {
    return true;
  }

  @Override
  protected InputSourceReader formattableReader(
      InputRowSchema inputRowSchema,
      InputFormat inputFormat,
      @Nullable File temporaryDirectory
  )
  {
    //noinspection ConstantConditions
    return new InputEntityIteratingReader(
        inputRowSchema,
        inputFormat,
        Iterators.transform(getFileIterator(), FileEntity::new),
        temporaryDirectory
    );
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    LocalInputSource that = (LocalInputSource) o;
    return Objects.equals(baseDir, that.baseDir) &&
           Objects.equals(filter, that.filter) &&
           Objects.equals(files, that.files);
  }

  @Override
  public int hashCode()
  {
    return Objects.hash(baseDir, filter, files);
  }

  @Override
  public String toString()
  {
    return "LocalInputSource{" +
           "baseDir=\"" + baseDir +
           "\", filter=" + filter +
           ", files=" + files +
           "}";
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy