All downloads are free. The search and download functionality uses the official Maven repository.

com.spotify.scio.hdfs.HdfsFileStorage.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2016 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.hdfs

import java.io.{File, InputStream}
import java.net.URI
import java.nio.file.Path

import com.spotify.scio.io.FileStorage
import org.apache.avro.file.{SeekableFileInput, SeekableInput}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, PathFilter}
import org.apache.hadoop.io.compress.CompressionCodecFactory


/** Factory for [[FileStorage]] instances that read through the Hadoop file system API. */
object HdfsFileStorage {

  /**
   * Creates a storage handle for the given location.
   *
   * @param path location handed unchanged to the underlying `HdfsFileStorage` implementation
   * @return the storage handle, exposed only through the [[FileStorage]] interface
   */
  def apply(path: String): FileStorage = {
    val storage: FileStorage = new HdfsFileStorage(path)
    storage
  }

}


/**
 * [[FileStorage]] implementation that lists and reads files through the Hadoop
 * `FileSystem` API.
 *
 * The file system is always resolved from the root `path` this storage was created with,
 * and every file handed out by `listFiles` is interpreted relative to that file system.
 *
 * @param path root location (typically a directory) to list and read; it is parsed with
 *             `java.net.URI`, so it may carry a scheme and authority
 */
private class HdfsFileStorage(protected val path: String) extends FileStorage {

  // Local alias so Hadoop paths are never confused with java.nio.file.Path.
  import org.apache.hadoop.fs.{Path => HPath}

  /** Rejects entries whose name starts with `_` or `.`. */
  private val pathFilter = new PathFilter {
    override def accept(path: HPath): Boolean = {
      val name = path.getName
      !name.startsWith("_") && !name.startsWith(".")
    }
  }

  /** Resolves the file system that owns the root `path` of this storage. */
  private def fileSystem(conf: Configuration): FileSystem =
    FileSystem.get(new URI(path), conf)

  /** Converts a file produced by `listFiles` back into a Hadoop path. */
  private def toHadoopPath(file: Path): HPath = new HPath(file.toString)

  /**
   * Lists the entries directly under the root `path` that pass `pathFilter`.
   *
   * @return one `java.nio.file.Path` per accepted entry, holding only the path component
   *         of the entry's location
   */
  override protected def listFiles: Seq[Path] = {
    val fs = fileSystem(new Configuration())
    // Keep only the path component: a full URI such as `hdfs://host/dir/file` does not
    // survive the conversion to java.io.File (the `//` before the authority is collapsed),
    // so scheme and authority are re-supplied by `fileSystem` when the file is opened.
    fs
      .listStatus(new HPath(path), pathFilter)
      .map(status => new File(status.getPath.toUri.getPath).toPath)
      .toSeq
  }

  /**
   * Opens a file for sequential reading, wrapping it in a decompressing stream when
   * `CompressionCodecFactory` returns a codec for the file's path.
   *
   * @param path file to open, as returned by `listFiles`
   * @return the (possibly decompressing) input stream; the caller is responsible for closing it
   */
  override protected def getObjectInputStream(path: Path): InputStream = {
    val conf = new Configuration()
    val hPath = toHadoopPath(path)
    // getCodec returns null when no codec matches the path.
    val codec = Option(new CompressionCodecFactory(conf).getCodec(hPath))
    val raw = fileSystem(conf).open(hPath)
    codec match {
      case Some(c) =>
        try {
          c.createInputStream(raw)
        } catch {
          case scala.util.control.NonFatal(e) =>
            // Do not leak the underlying stream when the codec cannot wrap it.
            raw.close()
            throw e
        }
      case None => raw
    }
  }

  /**
   * Opens a file for random-access reading by Avro.
   *
   * The returned input is backed by the Hadoop stream of the file rather than a
   * `java.io.File`, so it is not limited to locations that can be expressed as a
   * `file:` URI.
   *
   * @param path file to open, as returned by `listFiles`
   * @return a seekable view of the file; closing it closes the underlying stream
   */
  override protected def getAvroSeekableInput(path: Path): SeekableInput = {
    val hPath = toHadoopPath(path)
    val fs = fileSystem(new Configuration())
    // The length is read once, up front, from the file status.
    val size = fs.getFileStatus(hPath).getLen
    val in = fs.open(hPath)
    new SeekableInput {
      override def seek(p: Long): Unit = in.seek(p)
      override def tell(): Long = in.getPos
      override def length(): Long = size
      override def read(b: Array[Byte], off: Int, len: Int): Int = in.read(b, off, len)
      override def close(): Unit = in.close()
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy