
io.druid.storage.hdfs.HdfsDataSegmentPuller Maven / Gradle / Ivy

There is a newer version: 0.12.3.
/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.storage.hdfs;

import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.io.ByteSource;
import com.google.inject.Inject;
import com.metamx.common.CompressionUtils;
import com.metamx.common.FileUtils;
import com.metamx.common.IAE;
import com.metamx.common.RetryUtils;
import com.metamx.common.UOE;
import com.metamx.common.logger.Logger;
import io.druid.segment.loading.DataSegmentPuller;
import io.druid.segment.loading.SegmentLoadingException;
import io.druid.segment.loading.URIDataPuller;
import io.druid.timeline.DataSegment;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

import javax.tools.FileObject;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Writer;
import java.net.URI;
import java.util.ArrayList;
import java.util.concurrent.Callable;

/**
 * A {@link DataSegmentPuller} that fetches segment files from HDFS, handling plain
 * directories as well as zip- and gzip-compressed segment files.
 */
public class HdfsDataSegmentPuller implements DataSegmentPuller, URIDataPuller
{
  public static final int DEFAULT_RETRY_COUNT = 3;

  /**
   * FileObject.getLastModified and FileObject.delete cannot declare IOException. This wrapper
   * lets those calls surface an IOException as an unchecked exception that can be unwrapped later.
   */
  public static class HdfsIOException extends RuntimeException
  {
    private final IOException cause;

    public HdfsIOException(IOException ex)
    {
      super(ex);
      this.cause = ex;
    }

    protected IOException getIOException()
    {
      return cause;
    }
  }


  public static FileObject buildFileObject(final URI uri, final Configuration config)
  {
    return buildFileObject(uri, config, false);
  }

  public static FileObject buildFileObject(final URI uri, final Configuration config, final Boolean overwrite)
  {
    return new FileObject()
    {
      final Path path = new Path(uri);

      @Override
      public URI toUri()
      {
        return uri;
      }

      @Override
      public String getName()
      {
        return path.getName();
      }

      @Override
      public InputStream openInputStream() throws IOException
      {
        final FileSystem fs = path.getFileSystem(config);
        return fs.open(path);
      }

      @Override
      public OutputStream openOutputStream() throws IOException
      {
        final FileSystem fs = path.getFileSystem(config);
        return fs.create(path, overwrite);
      }

      @Override
      public Reader openReader(boolean ignoreEncodingErrors) throws IOException
      {
        throw new UOE("HDFS Reader not supported");
      }

      @Override
      public CharSequence getCharContent(boolean ignoreEncodingErrors) throws IOException
      {
        throw new UOE("HDFS CharSequence not supported");
      }

      @Override
      public Writer openWriter() throws IOException
      {
        throw new UOE("HDFS Writer not supported");
      }

      @Override
      public long getLastModified()
      {
        try {
          final FileSystem fs = path.getFileSystem(config);
          return fs.getFileStatus(path).getModificationTime();
        }
        catch (IOException ex) {
          throw new HdfsIOException(ex);
        }
      }

      @Override
      public boolean delete()
      {
        try {
          final FileSystem fs = path.getFileSystem(config);
          return fs.delete(path, false);
        }
        catch (IOException ex) {
          throw new HdfsIOException(ex);
        }
      }
    };
  }

  private static final Logger log = new Logger(HdfsDataSegmentPuller.class);
  protected final Configuration config;

  @Inject
  public HdfsDataSegmentPuller(final Configuration config)
  {
    this.config = config;
  }


  @Override
  public void getSegmentFiles(DataSegment segment, File dir) throws SegmentLoadingException
  {
    getSegmentFiles(getPath(segment), dir);
  }

  public FileUtils.FileCopyResult getSegmentFiles(final Path path, final File outDir) throws SegmentLoadingException
  {
    final LocalFileSystem localFileSystem = new LocalFileSystem();
    try {
      final FileSystem fs = path.getFileSystem(config);
      if (fs.isDirectory(path)) {

        // --------    directory     ---------
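
        // The whole directory copy is retried as a unit: if listing or copying any child
        // throws an IOException (or a wrapped HdfsIOException), shouldRetryPredicate()
        // triggers another attempt, up to DEFAULT_RETRY_COUNT times.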

        try {
          return RetryUtils.retry(
              new Callable<FileUtils.FileCopyResult>()
              {
                @Override
                public FileUtils.FileCopyResult call() throws Exception
                {
                  if (!fs.exists(path)) {
                    throw new SegmentLoadingException("No files found at [%s]", path.toString());
                  }

                  final RemoteIterator<LocatedFileStatus> children = fs.listFiles(path, false);
                  final ArrayList<FileUtils.FileCopyResult> localChildren = new ArrayList<>();
                  final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
                  while (children.hasNext()) {
                    final LocatedFileStatus child = children.next();
                    final Path childPath = child.getPath();
                    final String fname = childPath.getName();
                    if (fs.isDirectory(childPath)) {
                      log.warn("[%s] is a child directory, skipping", childPath.toString());
                    } else {
                      final File outFile = new File(outDir, fname);

                      // Actual copy
                      fs.copyToLocalFile(childPath, new Path(outFile.toURI()));
                      result.addFile(outFile);
                    }
                  }
                  log.info(
                      "Copied %d bytes from [%s] to [%s]",
                      result.size(),
                      path.toString(),
                      outDir.getAbsolutePath()
                  );
                  return result;
                }

              },
              shouldRetryPredicate(),
              DEFAULT_RETRY_COUNT
          );
        }
        catch (Exception e) {
          throw Throwables.propagate(e);
        }
      } else if (CompressionUtils.isZip(path.getName())) {

        // --------    zip     ---------
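
        // The zip is streamed directly from HDFS via getInputStream(path) and unpacked
        // into outDir; CompressionUtils.unzip reuses the same retry predicate as above.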

        final FileUtils.FileCopyResult result = CompressionUtils.unzip(
            new ByteSource()
            {
              @Override
              public InputStream openStream() throws IOException
              {
                return getInputStream(path);
              }
            }, outDir, shouldRetryPredicate(), false
        );

        log.info(
            "Unzipped %d bytes from [%s] to [%s]",
            result.size(),
            path.toString(),
            outDir.getAbsolutePath()
        );

        return result;
      } else if (CompressionUtils.isGz(path.getName())) {

        // --------    gzip     ---------
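
        // A gzip stream holds a single file; CompressionUtils.getGzBaseName strips the
        // ".gz" suffix to derive the local file name.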

        final String fname = path.getName();
        final File outFile = new File(outDir, CompressionUtils.getGzBaseName(fname));
        final FileUtils.FileCopyResult result = CompressionUtils.gunzip(
            new ByteSource()
            {
              @Override
              public InputStream openStream() throws IOException
              {
                return getInputStream(path);
              }
            },
            outFile
        );

        log.info(
            "Gunzipped %d bytes from [%s] to [%s]",
            result.size(),
            path.toString(),
            outFile.getAbsolutePath()
        );
        return result;
      } else {
        throw new SegmentLoadingException("Do not know how to handle file type at [%s]", path.toString());
      }
    }
    catch (IOException e) {
      throw new SegmentLoadingException(e, "Error loading [%s]", path.toString());
    }
  }

  public FileUtils.FileCopyResult getSegmentFiles(URI uri, File outDir) throws SegmentLoadingException
  {
    if (!uri.getScheme().equalsIgnoreCase(HdfsStorageDruidModule.SCHEME)) {
      throw new SegmentLoadingException("Don't know how to load SCHEME for URI [%s]", uri.toString());
    }
    return getSegmentFiles(new Path(uri), outDir);
  }

  public InputStream getInputStream(Path path) throws IOException
  {
    return buildFileObject(path.toUri(), config).openInputStream();
  }

  @Override
  public InputStream getInputStream(URI uri) throws IOException
  {
    if (!uri.getScheme().equalsIgnoreCase(HdfsStorageDruidModule.SCHEME)) {
      throw new IAE("Don't know how to load SCHEME [%s] for URI [%s]", uri.getScheme(), uri.toString());
    }
    return buildFileObject(uri, config).openInputStream();
  }

  /**
   * Return the "version" (i.e. the last modified timestamp) of the URI.
   *
   * @param uri The URI of interest
   *
   * @return The last modified timestamp of the URI as a String
   *
   * @throws IOException if the last modified timestamp cannot be read from HDFS
   */
  @Override
  public String getVersion(URI uri) throws IOException
  {
    try {
      return String.format("%d", buildFileObject(uri, config).getLastModified());
    }
    catch (HdfsIOException ex) {
      throw ex.getIOException();
    }
  }

  @Override
  public Predicate<Throwable> shouldRetryPredicate()
  {
    return new Predicate<Throwable>()
    {
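      // Walk the cause chain: retry if any cause is an IOException (or our unchecked
      // HdfsIOException wrapper); hitting the end of the chain (null) means no retry.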
      @Override
      public boolean apply(Throwable input)
      {
        if (input == null) {
          return false;
        }
        if (input instanceof HdfsIOException) {
          return true;
        }
        if (input instanceof IOException) {
          return true;
        }
        return apply(input.getCause());
      }
    };
  }

  private Path getPath(DataSegment segment)
  {
    return new Path(String.valueOf(segment.getLoadSpec().get("path")));
  }
}
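
For context, here is a minimal usage sketch. It is not part of the class above: the HDFS path and output directory are hypothetical, and in a Druid server the puller is normally constructed via Guice injection rather than by hand.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.File;

public class HdfsPullerExample
{
  public static void main(String[] args) throws Exception
  {
    // Hadoop configuration; a real deployment would load the cluster's settings here.
    final Configuration conf = new Configuration();
    final HdfsDataSegmentPuller puller = new HdfsDataSegmentPuller(conf);

    // Pull a zipped segment into a local directory. The path is hypothetical; it would
    // normally come from the "path" entry of a segment's loadSpec.
    final File outDir = new File("/tmp/druid-segment");
    outDir.mkdirs();
    puller.getSegmentFiles(
        new Path("hdfs://namenode:8020/druid/segments/example/index.zip"),
        outDir
    );
  }
}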



