
org.broadinstitute.hellbender.tools.spark.ParallelCopyGCSDirectoryIntoHDFSSpark Maven / Gradle / Ivy

package org.broadinstitute.hellbender.tools.spark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.Utils;
import picard.cmdline.programgroups.OtherProgramGroup;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import scala.Tuple2;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Parallel copy a file or directory from Google Cloud Storage into the HDFS file system used by Spark
 *
 * <p>This tool uses a Spark cluster to do a parallel copy of either a single file or a directory from
 * Google Cloud Storage (GCS) into the HDFS file system used by Spark to support Resilient Distributed Datasets (RDDs).
 * Files are divided into chunks of size equal to the HDFS block size (with the exception of the final
 * chunk) and each Spark task is responsible for copying one chunk. To copy all of the files in a GCS directory,
 * provide the GCS directory path, including the trailing slash. Directory copies are non-recursive so
 * subdirectories will be skipped. Within directories each file is divided into chunks independently (so this will be
 * inefficient if you have lots of files smaller than the block size). After all chunks are copied, the HDFS
 * concat method is used to stitch together chunks into single files without re-copying them.</p>
 *
 * <p>This functionality is used by the structural variation workflow to copy reference data when a
 * Spark cluster is created, and may also be used to copy sample data to a Spark cluster.</p>
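 *
 * <p>For example, with a 128 MiB HDFS block size (a common default), a 300 MiB file would be copied as three
 * chunks of 128 MiB, 128 MiB and 44 MiB, each handled by its own Spark task; the per-file chunk count is the
 * ceiling division of file size by block size used below in {@code setupChunks}:</p>
 * <pre>
 *     final long chunks = fileSize / chunkSize + (fileSize % chunkSize == 0 ? 0 : 1);
 * </pre>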

 * <h3>Inputs</h3>
 * <ul>
 *     <li>A GCS input file or directory.</li>
 * </ul>
 *
 * <h3>Output</h3>
 * <ul>
 *     <li>A copy of the input as HDFS output.</li>
 * </ul>
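 *
 * <p>Each copied file keeps its basename: for a hypothetical input directory {@code gs://bucket_name/ref/}
 * containing {@code ref.fasta} and {@code ref.fasta.fai}, the output HDFS directory would end up holding files
 * named {@code ref.fasta} and {@code ref.fasta.fai}, assembled from intermediate {@code basename.chunk.N}
 * pieces by the HDFS concat call.</p>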

 * <h3>Usage example</h3>
 * <pre>
 *   gatk ParallelCopyGCSDirectoryIntoHDFSSpark \
 *     --input-gcs-path gs://bucket_name/input_reads.bam \
 *     --output-hdfs-directory hdfs://cluster-name-m:8020/directory/input_reads.bam \
 *     -- \
 *     --spark-runner GCS \
 *     --cluster my-dataproc-cluster
 * </pre>
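 *
 * <p>To copy a directory rather than a single file, point the tool at the directory path with a trailing slash,
 * optionally restricting the files with a glob. The bucket, directory, and cluster names below are placeholders
 * used for illustration:</p>
 * <pre>
 *   gatk ParallelCopyGCSDirectoryIntoHDFSSpark \
 *     --input-gcs-path gs://bucket_name/reference_dir/ \
 *     --input-file-glob "*.fasta" \
 *     --output-hdfs-directory hdfs://cluster-name-m:8020/reference_dir \
 *     -- \
 *     --spark-runner GCS \
 *     --cluster my-dataproc-cluster
 * </pre>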
 */
@DocumentedFeature
@CommandLineProgramProperties(
        oneLineSummary = "Parallel copy a file or directory from Google Cloud Storage into the HDFS file system used by Spark",
        summary = "This tool uses a Spark cluster to do a parallel copy of either a single file or a directory from" +
                " Google Cloud Storage (GCS) into the HDFS file system used by Spark to support Resilient Distributed Datasets (RDDs)." +
                " Files are divided into chunks of size equal to the HDFS block size (with the exception of the final" +
                " chunk) and each Spark task is responsible for copying one chunk. To copy all of the files in a GCS directory," +
                " provide the GCS directory path, including the trailing slash. Directory copies are non-recursive so" +
                " subdirectories will be skipped. Within directories each file is divided into chunks independently (so this will be" +
                " inefficient if you have lots of files smaller than the block size). After all chunks are copied, the HDFS" +
                " concat method is used to stitch together chunks into single files without re-copying them." +
                " This functionality is used by the structural variation workflow to copy reference data when a" +
                " Spark cluster is created, and may also be used to copy sample data to a Spark cluster.",
        programGroup = OtherProgramGroup.class)
@BetaFeature
public class ParallelCopyGCSDirectoryIntoHDFSSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    // default buffer size for reading chunks is 64MiB based on performance profiling and what appears to be
    // conventional wisdom to use a power of two for byte buffer sizes
    public static final int SIXTY_FOUR_MIB = 67108864;

    public static final String INPUT_GCS_PATH_LONG_NAME = "input-gcs-path";
    public static final String OUTPUT_HDFS_DIRECTORY_LONG_NAME = "output-hdfs-directory";
    public static final String INPUT_GLOB = "input-file-glob";
    public static final String INPUT_GLOB_ALL_FILES = "*";

    @Argument(doc = "input GCS file path (add trailing slash when specifying a directory)",
            fullName = INPUT_GCS_PATH_LONG_NAME)
    private String inputGCSPath = null;

    @Argument(doc = "optional wildcard glob to subset files in the input directory to copy",
            fullName = INPUT_GLOB)
    private String inputGlob = INPUT_GLOB_ALL_FILES;

    @Argument(doc = "output directory on HDFS into which to transfer the data (will be created by the tool)",
            fullName = OUTPUT_HDFS_DIRECTORY_LONG_NAME)
    private String outputHDFSDirectory;

    @Override
    protected void runTool(final JavaSparkContext ctx) {

        if (! BucketUtils.isGcsUrl(inputGCSPath)) {
            throw new UserException("Input path " + inputGCSPath + " is not a GCS URI");
        }

        if (! BucketUtils.isHadoopUrl(outputHDFSDirectory)) {
            throw new UserException("Output directory " + outputHDFSDirectory + " is not an HDFS URI");
        }

        final String inputGCSPathFinal = inputGCSPath;
        final String outputDirectoryFinal = outputHDFSDirectory;

        org.apache.hadoop.fs.Path outputHdfsDirectoryPath = new org.apache.hadoop.fs.Path(outputHDFSDirectory);

        try (FileSystem fs = outputHdfsDirectoryPath.getFileSystem(new Configuration())) {

            if (fs.exists(outputHdfsDirectoryPath)) {
                throw new UserException("Specified output directory " + outputHdfsDirectoryPath + " already exists. Please specify a new directory name.");
            }
            fs.mkdirs(outputHdfsDirectoryPath);

            final long chunkSize = getChunkSize(fs);

            final List<Path> gcsNIOPaths = getGCSFilePathsToCopy(inputGCSPathFinal, inputGlob);

            List<Tuple2<String, Integer>> chunkList = setupChunks(chunkSize, gcsNIOPaths);

            if (chunkList.size() == 0) {
                logger.info("no files found to copy");
                return;
            }

            final JavaPairRDD<String, Integer> chunkRDD = ctx.parallelizePairs(chunkList, chunkList.size());

            final JavaPairRDD<String, Tuple2<Integer, String>> chunkMappingRDD =
                    chunkRDD.mapToPair(p -> new Tuple2<>(p._1(), readChunkToHdfs(p._1(), chunkSize, p._2(), outputDirectoryFinal)));

            final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath = chunkMappingRDD.groupByKey().collectAsMap();

            concatenateChunks(outputDirectoryFinal, fs, gcsNIOPaths, chunksByFilePath);

        } catch (NoSuchFileException e) {
            throw new UserException("Could not locate input path " + e.getFile() + ". If you are trying to copy an entire directory, please include a trailing slash on your path.");
        } catch (IOException e) {
            throw new GATKException(e.getMessage(), e);
        }
    }
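
    // runTool (above) builds one (file URI, chunk index) pair per chunk, copies each chunk in its own Spark
    // task via readChunkToHdfs, groups the resulting chunk paths by source file, and then reassembles each
    // file with concatenateChunks below.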
    private void concatenateChunks(final String outputDirectoryFinal,
                                   final FileSystem fs,
                                   final List<Path> gcsNIOPaths,
                                   final Map<String, Iterable<Tuple2<Integer, String>>> chunksByFilePath) throws IOException {
        for (Path path : gcsNIOPaths) {
            if (Files.isDirectory(path)) {
                continue;
            }

            final String filePath = path.toUri().toString();
            final Iterable<Tuple2<Integer, String>> chunkListForFile = chunksByFilePath.get(filePath);
            final String basename = path.getName(path.getNameCount() - 1).toString();
            final org.apache.hadoop.fs.Path outFilePath = new org.apache.hadoop.fs.Path(outputDirectoryFinal + "/" + basename);
            fs.createNewFile(outFilePath);

            SortedMap<Integer, String> chunkMap = new TreeMap<>();
            for (Tuple2<Integer, String> entry : chunkListForFile) {
                chunkMap.put(entry._1(), entry._2());
            }

            org.apache.hadoop.fs.Path[] chunkPaths = new org.apache.hadoop.fs.Path[chunkMap.size()];

            final Iterator<Integer> iterator = chunkMap.keySet().iterator();
            while (iterator.hasNext()) {
                final Integer next = iterator.next();
                final String chunkPath = chunkMap.get(next);
                chunkPaths[next] = new org.apache.hadoop.fs.Path(chunkPath);
            }

            fs.concat(outFilePath, chunkPaths);
        }
    }

    private List<Tuple2<String, Integer>> setupChunks(final long chunkSize, final List<Path> gcsNIOPaths) throws IOException {
        List<Tuple2<String, Integer>> chunkList = new ArrayList<>();
        for (Path path : gcsNIOPaths) {
            if (Files.isDirectory(path)) {
                logger.info("skipping directory " + path);
                continue;
            }

            final long fileSize = Files.size(path);
            final long chunks = fileSize / chunkSize + (fileSize % chunkSize == 0 ? 0 : 1);
            logger.info("processing path " + path + ", size = " + fileSize + ", chunks = " + chunks);

            for (int i = 0; i < chunks; i++) {
                chunkList.add(new Tuple2<>(path.toUri().toString(), i));
            }
        }
        return chunkList;
    }

    private List<Path> getGCSFilePathsToCopy(final String inputGCSPathFinal, final String inputGlob) throws IOException {
        final List<Path> gcsNIOPaths;
        final Path inputGCSNIOPath = IOUtils.getPath(inputGCSPathFinal);

        if (Files.isDirectory(inputGCSNIOPath)) {
            // non-recursive directory listing, optionally restricted by the user-supplied glob
            logger.info("transferring input directory: " + inputGCSPathFinal);
            gcsNIOPaths = Utils.stream(Files.newDirectoryStream(inputGCSNIOPath, inputGlob)).collect(Collectors.toList());
        } else {
            logger.info("transferring single file: " + inputGCSNIOPath);
            if (! INPUT_GLOB_ALL_FILES.equals(inputGlob)) {
                logger.warn("Input glob " + inputGlob + " specified, but input argument was not a directory. Ignoring glob.");
            }
            gcsNIOPaths = Collections.singletonList(inputGCSNIOPath);
        }
        return gcsNIOPaths;
    }
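
    // The chunk size is the cluster's configured HDFS block size ("dfs.blocksize"); keeping chunks block-aligned
    // is what lets FileSystem.concat stitch the chunk files together without rewriting their data.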
Ignoring glob."); } gcsNIOPaths = Collections.singletonList(inputGCSNIOPath); } return gcsNIOPaths; } static long getChunkSize(final FileSystem fs) { return Long.parseLong(fs.getConf().get("dfs.blocksize")); } private static final Tuple2 readChunkToHdfs(final String inputGCSPathFinal, final long chunkSize, final Integer chunkNum, final String outputDirectory) { final Path gcsPath = IOUtils.getPath(inputGCSPathFinal); final String basename = gcsPath.getName(gcsPath.getNameCount() - 1).toString(); org.apache.hadoop.fs.Path outputPath = new org.apache.hadoop.fs.Path(outputDirectory); final String chunkPath = outputPath + "/" + basename + ".chunk." + chunkNum; try (SeekableByteChannel channel = Files.newByteChannel(gcsPath); final OutputStream outputStream = new BufferedOutputStream(BucketUtils.createFile(chunkPath))){ final long start = chunkSize * (long) chunkNum; channel.position(start); ByteBuffer byteBuffer = ByteBuffer.allocateDirect((int) Math.min(SIXTY_FOUR_MIB, chunkSize)); long bytesRead = 0; while(channel.read(byteBuffer) > 0) { byteBuffer.flip(); while (byteBuffer.hasRemaining() && bytesRead < chunkSize) { byte b = byteBuffer.get(); outputStream.write(b); bytesRead++; } if (bytesRead == chunkSize) { break; } if (bytesRead > chunkSize) { throw new GATKException("Encountered an unknown error condition and read too many bytes; output file may be corrupt"); } byteBuffer.clear(); } } catch (IOException e) { throw new GATKException(e.getMessage() + "; inputGCSPathFinal = " + inputGCSPathFinal, e); } return new Tuple2<>(chunkNum, chunkPath); } }



