package org.broadinstitute.hellbender.tools.genomicsdb;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.FeatureReader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFUtils;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.genomicsdb.Constants;
import org.genomicsdb.GenomicsDBUtils;
import org.genomicsdb.importer.GenomicsDBImporter;
import org.genomicsdb.model.BatchCompletionCallbackFunctionArgument;
import org.genomicsdb.model.Coordinates;
import org.genomicsdb.model.GenomicsDBCallsetsMapProto;
import org.genomicsdb.model.GenomicsDBImportConfiguration;
import org.genomicsdb.model.ImportConfig;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.function.Function;
import java.util.stream.Collectors;


/**
 * Import single-sample GVCFs into GenomicsDB before joint genotyping.
 *
 * <p>The GATK4 Best Practice Workflow for SNP and Indel calling uses GenomicsDBImport to merge GVCFs from multiple samples.
 * GenomicsDBImport offers the same functionality as CombineGVCFs and initially came from the Intel-Broad Center for Genomics.
 * The datastore transposes sample-centric variant information across genomic loci to make data more accessible to tools.</p>
 *
 * <p>To query the contents of the GenomicsDB datastore, use SelectVariants.
 * See Tutorial#11813 to get started.</p>
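 *
 * For example, a workspace created as in the examples below can be queried with SelectVariants roughly as follows
 * (paths and reference are illustrative; the workspace is passed with a gendb:// prefix):
 * <pre>
 *   gatk SelectVariants \
 *     -R reference.fasta \
 *     -V gendb://my_database \
 *     -O combined.g.vcf.gz
 * </pre>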

 *
 * <p>Details on GenomicsDB are at https://github.com/GenomicsDB/GenomicsDB/wiki.
 * In brief, GenomicsDB utilises a data storage system optimized for storing/querying sparse arrays.
 * Genomics data is typically sparse in that each sample has few variants with respect to the entire reference genome.
 * GenomicsDB contains specialized code for genomics applications, such as VCF parsing and INFO field annotation
 * calculation.</p>
 *
 * <h3>Input</h3>
 * <p>
 * One or more GVCFs produced by HaplotypeCaller with the `-ERC GVCF` or `-ERC BP_RESOLUTION` settings, containing
 * the samples to joint-genotype.
 * </p>
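 *
 * For example, a suitable per-sample GVCF can be produced with HaplotypeCaller roughly as follows (file names are
 * illustrative):
 * <pre>
 *   gatk --java-options "-Xmx4g" HaplotypeCaller \
 *     -R reference.fasta \
 *     -I mother.bam \
 *     -O data/gvcfs/mother.g.vcf.gz \
 *     -ERC GVCF
 * </pre>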

 *
 * <h3>Output</h3>
 * <p>
 * A GenomicsDB workspace
 * </p>
 *
 * <h3>Usage examples</h3>
 *
 * Provide each sample GVCF separately.
 * <pre>
 *    gatk --java-options "-Xmx4g -Xms4g" GenomicsDBImport \
 *      -V data/gvcfs/mother.g.vcf.gz \
 *      -V data/gvcfs/father.g.vcf.gz \
 *      -V data/gvcfs/son.g.vcf.gz \
 *      --genomicsdb-workspace-path my_database \
 *      --tmp-dir=/path/to/large/tmp \
 *      -L 20
 * </pre>
 *
 * Provide sample GVCFs in a map file.
 * <pre>
 *    gatk --java-options "-Xmx4g -Xms4g" \
 *       GenomicsDBImport \
 *       --genomicsdb-workspace-path my_database \
 *       --batch-size 50 \
 *       -L chr1:1000-10000 \
 *       --sample-name-map cohort.sample_map \
 *       --tmp-dir=/path/to/large/tmp \
 *       --reader-threads 5
 * </pre>
 *
 * The sample map is a tab-delimited text file with sample_name--tab--path_to_sample_vcf per line. Using a sample map
 * saves the tool from having to download the GVCF headers in order to determine the sample names. Sample names in
 * the sample name map file may have non-tab whitespace, but may not begin or end with whitespace.
 * <pre>
 *  sample1      sample1.vcf.gz
 *  sample2      sample2.vcf.gz
 *  sample3      sample3.vcf.gz
 * </pre>
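 *
 * One way to generate such a map for a directory of GVCFs, assuming each sample is named after its file basename
 * (a sketch only; adjust to your naming scheme):
 * <pre>
 *   for f in data/gvcfs/*.g.vcf.gz; do
 *     printf '%s\t%s\n' "$(basename "$f" .g.vcf.gz)" "$f"
 *   done > cohort.sample_map
 * </pre>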
 *
 * Add new samples to an existing genomicsdb workspace.
 * <pre>
 *    gatk --java-options "-Xmx4g -Xms4g" GenomicsDBImport \
 *      -V data/gvcfs/mother.g.vcf.gz \
 *      -V data/gvcfs/father.g.vcf.gz \
 *      -V data/gvcfs/son.g.vcf.gz \
 *      --genomicsdb-update-workspace-path my_database \
 *      --tmp-dir=/path/to/large/tmp
 * </pre>
 * In the incremental import case, no intervals are specified in the command because the tool will use the same
 * intervals used in the initial import. A sample map is also supported for incremental import.
 *
 * Get a Picard-style interval_list from an existing workspace.
 * <pre>
 *    gatk --java-options "-Xmx4g -Xms4g" GenomicsDBImport \
 *      --genomicsdb-update-workspace-path my_database \
 *      --output-interval-list-to-file /output/path/to/file
 * </pre>
 * The interval_list for the specified/existing workspace will be written to /output/path/to/file.
 * This will output a Picard-style interval_list (with a sequence dictionary header).
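 *
 * For reference, a Picard-style interval_list is a text file whose header is a SAM-style sequence dictionary
 * (@HD and @SQ lines) followed by one interval per line with five tab-separated fields: contig, 1-based start,
 * end, strand and name. A single, purely illustrative data line:
 * <pre>
 *   20    1    63025520    +    interval-1
 * </pre>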

 *
 * <h3>Caveats</h3>
 * <ul>
 *   <li>IMPORTANT: The -Xmx value the tool is run with should be less than the total amount of physical memory available
 *       by at least a few GB, as the native TileDB library requires additional memory on top of the Java memory. Failure
 *       to leave enough memory for the native code can result in confusing error messages!</li>
 *   <li>At least one interval must be provided, unless incrementally importing new samples, in which case the specified
 *       intervals are ignored in favor of the intervals recorded in the existing workspace.</li>
 *   <li>Input GVCFs cannot contain multiple entries for a single genomic position.</li>
 *   <li>The --genomicsdb-workspace-path must point to a non-existent or empty directory.</li>
 *   <li>The --genomicsdb-update-workspace-path must point to an existing genomicsdb workspace.</li>
 *   <li>Either --genomicsdb-workspace-path or --genomicsdb-update-workspace-path must be specified.</li>
 *   <li>GenomicsDBImport uses temporary disk storage during import. The amount of temporary disk storage required can
 *       exceed the space available, especially when specifying a large number of intervals. The command line argument
 *       `--tmp-dir` can be used to specify an alternate temporary storage location with sufficient space.</li>
 *   <li>It is recommended that users back up existing genomicsdb workspaces before adding new samples using
 *       --genomicsdb-update-workspace-path (see the example after this list). If the tool fails during incremental
 *       import for any reason, the workspace may be left in an inconsistent/corrupted state.</li>
 * </ul>
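 *
 * A local workspace can be backed up with a plain recursive copy before an incremental import, for example
 * (paths are illustrative):
 * <pre>
 *   cp -r my_database my_database.backup
 * </pre>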

 *
 * <h3>Developer Note</h3>
* To read data from GenomicsDB, use the query interface {@link org.genomicsdb.reader.GenomicsDBFeatureReader} */ @DocumentedFeature @CommandLineProgramProperties( summary = "Import VCFs to GenomicsDB", oneLineSummary = "Import VCFs to GenomicsDB", programGroup = ShortVariantDiscoveryProgramGroup.class ) public final class GenomicsDBImport extends GATKTool { private static final long DEFAULT_VCF_BUFFER_SIZE_PER_SAMPLE = 16*1024L; private static final long DEFAULT_SEGMENT_SIZE = 1048576L; private static final int DEFAULT_ZERO_BATCH_SIZE = 0; public static final String WORKSPACE_ARG_LONG_NAME = "genomicsdb-workspace-path"; public static final String INCREMENTAL_WORKSPACE_ARG_LONG_NAME = "genomicsdb-update-workspace-path"; public static final String SEGMENT_SIZE_ARG_LONG_NAME = "genomicsdb-segment-size"; public static final String OVERWRITE_WORKSPACE_LONG_NAME = "overwrite-existing-genomicsdb-workspace"; public static final String INTERVAL_LIST_LONG_NAME = "output-interval-list-to-file"; public static final String VCF_BUFFER_SIZE_ARG_NAME = "genomicsdb-vcf-buffer-size"; public static final String BATCHSIZE_ARG_LONG_NAME = "batch-size"; public static final String CONSOLIDATE_ARG_NAME = "consolidate"; public static final String SAMPLE_NAME_MAP_LONG_NAME = "sample-name-map"; public static final String VALIDATE_SAMPLE_MAP_LONG_NAME = "validate-sample-name-map"; public static final String MERGE_INPUT_INTERVALS_LONG_NAME = "merge-input-intervals"; public static final String VCF_INITIALIZER_THREADS_LONG_NAME = "reader-threads"; public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel"; public static final int INTERVAL_LIST_SIZE_WARNING_THRESHOLD = 100; public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS; @Argument(fullName = WORKSPACE_ARG_LONG_NAME, doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " + "Use this argument when creating a new GenomicsDB workspace. " + "Either this or "+INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" must be specified." + " Must be an empty or non-existent directory.", mutex = {INCREMENTAL_WORKSPACE_ARG_LONG_NAME,INTERVAL_LIST_LONG_NAME}) private String workspace; @Argument(fullName = INCREMENTAL_WORKSPACE_ARG_LONG_NAME, doc = "Workspace when updating GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " + "Use this argument when adding new samples to an existing GenomicsDB workspace or "+ "when using the "+INTERVAL_LIST_LONG_NAME+" option. " + "Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " + "Must point to an existing workspace.", mutex = {WORKSPACE_ARG_LONG_NAME}) private String incrementalImportWorkspace; @Argument(fullName = SEGMENT_SIZE_ARG_LONG_NAME, doc = "Buffer size in bytes allocated for GenomicsDB attributes during " + "import. Should be large enough to hold data from one site. ", optional = true) private long segmentSize = DEFAULT_SEGMENT_SIZE; @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc = "GVCF files to be imported to GenomicsDB. Each file must contain" + " data for only a single sample. Either this or " + SAMPLE_NAME_MAP_LONG_NAME + " must be specified.", optional = true, mutex = {SAMPLE_NAME_MAP_LONG_NAME}) private List variantPaths; @Argument(fullName = VCF_BUFFER_SIZE_ARG_NAME, doc = "Buffer size in bytes to store variant contexts." 
+ " Larger values are better as smaller values cause frequent disk writes." + " Defaults to " + DEFAULT_VCF_BUFFER_SIZE_PER_SAMPLE + " which was empirically determined to work" + " well for many inputs.", optional = true, minValue = 1024L, minRecommendedValue = 10 * 1024) private long vcfBufferSizePerSample = DEFAULT_VCF_BUFFER_SIZE_PER_SAMPLE; @Argument(fullName = OVERWRITE_WORKSPACE_LONG_NAME, doc = "Will overwrite given workspace if it exists. " + "Otherwise a new workspace is created. " + "Cannot be set to true if "+INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" is also set. " + "Defaults to false", optional = true) private Boolean overwriteExistingWorkspace = false; @Argument(fullName = BATCHSIZE_ARG_LONG_NAME, doc = "Batch size controls the number of samples for which readers are open at once " + "and therefore provides a way to minimize memory consumption. However, it can take longer to complete. " + "Use the consolidate flag if more than a hundred batches were used. This will improve feature read time. " + "batchSize=0 means no batching (i.e. readers for all samples will be opened at once) " + "Defaults to " + DEFAULT_ZERO_BATCH_SIZE, optional = true) private int batchSize = DEFAULT_ZERO_BATCH_SIZE; @Argument(fullName = CONSOLIDATE_ARG_NAME, doc = "Boolean flag to enable consolidation. If importing data in batches, a new fragment is created for " + "each batch. In case thousands of fragments are created, GenomicsDB feature readers will try " + "to open ~20x as many files. Also, internally GenomicsDB would consume more memory to maintain " + "bookkeeping data from all fragments. Use this flag to merge all fragments into one. " + "Merging can potentially improve read performance, however overall benefit might not be noticeable " + "as the top Java layers have significantly higher overheads. This flag has no effect if only one " + "batch is used. Defaults to false", optional = true) private Boolean doConsolidation = false; @Advanced @Argument(fullName = SAMPLE_NAME_MAP_LONG_NAME, doc = "Path to file containing a mapping of sample name to file uri in tab delimited format. If this is " + "specified then the header from the first sample will be treated as the merged header rather than " + "merging the headers, and the sample names will be taken from this file. This may be used to rename " + "input samples. This is a performance optimization that relaxes the normal checks for consistent " + "headers. Using vcfs with incompatible headers may result in silent data corruption.", optional = true, mutex = {StandardArgumentDefinitions.VARIANT_LONG_NAME}) private String sampleNameMapFile; @Argument(fullName = VALIDATE_SAMPLE_MAP_LONG_NAME, doc = "Boolean flag to enable checks on the sampleNameMap file. If true, tool checks whether" + "feature readers are valid and shows a warning if sample names do not match with the headers. " + "Defaults to false", optional = true) private Boolean validateSampleToReaderMap = false; @Argument(fullName = MERGE_INPUT_INTERVALS_LONG_NAME, doc = "Boolean flag to import all data in between intervals. Improves performance using large lists of " + "intervals, as in exome sequencing, especially if GVCF data only exists for specified intervals.") private boolean mergeInputIntervals = false; @Argument(fullName = INTERVAL_LIST_LONG_NAME, doc = "Path to output file where intervals from existing workspace should be written." 
+ "If this option is specified, the tools outputs the interval_list of the workspace pointed to " + "by "+INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" at the path specified here in a Picard-style interval_list " + "with a sequence dictionary header", optional= true, mutex = {WORKSPACE_ARG_LONG_NAME}) private String intervalListOutputPathString; @Advanced @Argument(fullName = VCF_INITIALIZER_THREADS_LONG_NAME, doc = "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance " + "when network latency is an issue. Multiple reader threads are not supported when running with multiple intervals.", optional = true, minValue = 1) private int vcfInitializerThreads = 1; @Advanced @Argument(fullName = MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL, doc = "Max number of intervals to import in parallel; higher values may improve performance, but require more" + " memory and a higher number of file descriptors open at the same time", optional = true, minValue = 1) private int maxNumIntervalsToImportInParallel = 1; @Argument(fullName = SHARED_POSIXFS_OPTIMIZATIONS, doc = "Allow for optimizations to improve the usability and performance for shared Posix Filesystems(e.g. NFS, Lustre). " + "If set, file level locking is disabled and file system writes are minimized by keeping a higher number of " + "file descriptors open for longer periods of time. Use with " + BATCHSIZE_ARG_LONG_NAME + " option if keeping a " + "large number of file descriptors open is an issue", optional = true) private boolean sharedPosixFSOptimizations = false; //executor service used when vcfInitializerThreads > 1 private ExecutorService inputPreloadExecutorService; /** * Get the largest interval per contig that contains the intervals specified on the command line. * @param getIntervals intervals to be transformed * @param sequenceDictionary used to validate intervals * @return a list of one interval per contig spanning the input intervals after processing and validation */ @Override protected List transformTraversalIntervals(final List getIntervals, final SAMSequenceDictionary sequenceDictionary) { if (mergeInputIntervals) { return IntervalUtils.getSpanningIntervals(getIntervals, sequenceDictionary); } else { return getIntervals; } } @Override public int getDefaultCloudPrefetchBufferSize() { // Empirical testing has shown that this tool performs best at scale with cloud buffering // disabled. With cloud buffering on and thousands of concurrent GenomicsDBImport tasks, // we do too many simultaneous GCS accesses (since the prefetcher spawns a new thread for each // reader upon a query) and start seeing intermittent failures, even with aggressive retries. return 0; } @Override public int getDefaultCloudIndexPrefetchBufferSize() { // Empirical testing has shown that this tool performs best at scale with cloud buffering // disabled. With cloud buffering on and thousands of concurrent GenomicsDBImport tasks, // we do too many simultaneous GCS accesses (since the prefetcher spawns a new thread for each // reader upon a query) and start seeing intermittent failures, even with aggressive retries. return 0; } @Override public String getProgressMeterRecordLabel() { return "batches"; } // Intervals from command line (merged if specified) private List intervals; // Sorted mapping between sample names and corresponding GVCF file name // // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database. 
// This happens because the callset json is generated independently from the import process // each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch // sorting. // We preemptively sort here so we will have consistent sorting. private SortedMap sampleNameToVcfPath = new TreeMap<>(); // Needed as smartMergeHeaders() returns a set of VCF header lines private Set mergedHeaderLines = null; // sequence dictionary created from the merged header private SAMSequenceDictionary mergedHeaderSequenceDictionary; // Path to vidmap file to be written by GenomicsDBImporter private String vidMapJSONFile; // Path to callsetmap file to be written by GenomicsDBImporter private String callsetMapJSONFile; // Path to combined VCF header file to be written by GenomicsDBImporter private String vcfHeaderFile; // GenomicsDB callset map protobuf structure containing all callset names // used to write the callset json file on traversal success private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB; //in-progress batchCount private int batchCount = 1; // true if genomicsdb-update-workspace-path is specified private Boolean doIncrementalImport = false; // true if --output-interval-list-file is specified private Boolean getIntervalsFromExistingWorkspace = false; /** * Before traversal starts, create the feature readers * for all the input GVCFs, create the merged header and * initialize the interval */ @Override public void onStartup() { initializeWorkspaceAndToolMode(); assertVariantPathsOrSampleNameFileWasSpecified(); assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive(); initializeHeaderAndSampleMappings(); initializeIntervals(); super.onStartup(); } private void initializeWorkspaceAndToolMode() { if (incrementalImportWorkspace != null && !incrementalImportWorkspace.isEmpty()) { doIncrementalImport = true; workspace = incrementalImportWorkspace; if (intervalListOutputPathString != null && !intervalListOutputPathString.isEmpty()) { getIntervalsFromExistingWorkspace = true; } } } private void assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive() { if (overwriteExistingWorkspace && doIncrementalImport) { throw new CommandLineException(OVERWRITE_WORKSPACE_LONG_NAME + " cannot be set to true when " + INCREMENTAL_WORKSPACE_ARG_LONG_NAME + " is set"); } } private void assertVariantPathsOrSampleNameFileWasSpecified(){ if ( (variantPaths == null || variantPaths.isEmpty()) && sampleNameMapFile == null && !getIntervalsFromExistingWorkspace) { throw new CommandLineException.MissingArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, "One of --" + StandardArgumentDefinitions.VARIANT_LONG_NAME + " or --" + SAMPLE_NAME_MAP_LONG_NAME + " must be specified" ); } } /** * sets the values of mergedHeaderLines, mergedHeaderSequenceDictionary, and sampleNameToVcfPath */ private void initializeHeaderAndSampleMappings() { // Only one of -V and --sampleNameMapFile may be specified if (variantPaths != null && variantPaths.size() > 0) { // -V was specified final List headers = new ArrayList<>(variantPaths.size()); for (final String variantPathString : variantPaths) { final Path variantPath = IOUtils.getPath(variantPathString); final VCFHeader header = getHeaderFromPath(variantPath); Utils.validate(header != null, "Null header was found in " + variantPath + "."); assertGVCFHasOnlyOneSample(variantPathString, header); headers.add(header); final String sampleName = header.getGenotypeSamples().get(0); try { final URI previousPath = 
                        sampleNameToVcfPath.put(sampleName, new URI(variantPathString));
                if (previousPath != null) {
                    throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
                            + variantPath.toUri() + " and " + previousPath + ".");
                }
            } catch (final URISyntaxException e) {
                throw new UserException("Malformed URI " + e.toString(), e);
            }
        }
        mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
        mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
        mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
    } else if (sampleNameMapFile != null) {
        // --sampleNameMap was specified
        //it's VERY IMPORTANT that this map is Sorted according to String's natural ordering, if it is not
        //the resulting database will have incorrect sample names
        //see https://github.com/broadinstitute/gatk/issues/3682 for more information
        sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile));
        final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString());
        final VCFHeader header = getHeaderFromPath(firstHeaderPath);
        //getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
        mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
        mergedHeaderSequenceDictionary = header.getSequenceDictionary();
        mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
    } else if (getIntervalsFromExistingWorkspace) {
        final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
        IOUtils.assertPathsAreReadable(vcfHeader);
        final String header = GenomicsDBUtils.readEntireFile(vcfHeader);
        try {
            File tempHeader = File.createTempFile("tempheader", ".vcf");
            tempHeader.deleteOnExit();
            Files.write(tempHeader.toPath(), header.getBytes(StandardCharsets.UTF_8));
            mergedHeaderSequenceDictionary = VCFFileReader.getSequenceDictionary(tempHeader);
        } catch (final IOException e) {
            throw new UserException("Unable to create temporary header file to get sequence dictionary");
        }
    } else {
        throw new UserException(StandardArgumentDefinitions.VARIANT_LONG_NAME + " or " +
                SAMPLE_NAME_MAP_LONG_NAME + " must be specified unless " +
                INTERVAL_LIST_LONG_NAME + " is specified");
    }
    if (mergedHeaderSequenceDictionary == null) {
        throw new UserException("The merged vcf header has no sequence dictionary. " +
                "Please provide a header that contains a sequence dictionary.");
    }
}

    private VCFHeader getHeaderFromPath(final Path variantPath) {
        try (final FeatureReader<VariantContext> reader = getReaderFromPath(variantPath)) {
            return (VCFHeader) reader.getHeader();
        } catch (final IOException e) {
            throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e);
        }
    }

    private static void assertGVCFHasOnlyOneSample(final String variantPath, final VCFHeader header) {
        // A GVCF file must contain only one sample, throw an exception otherwise
        final int numberOfSamples = header.getNGenotypeSamples();
        if (numberOfSamples != 1) {
            throw new UserException("Input GVCF: " + variantPath + " was expected to contain a single sample but actually contained "
                    + numberOfSamples + " samples.");
        }
    }

    /**
     * load a tab delimited new line separated file of sample name to URI mapping:
     * this maintains the keys in the same order that they appeared in the file
     *
     * this tool should only call {@link #loadSampleNameMapFileInSortedOrder(Path)},
     * this version is exposed for the benefit of {@link org.broadinstitute.hellbender.tools.FixCallSetSampleOrdering}
     *
     * ex:
     *
     * Sample1\tpathToSample1.vcf\n
     * Sample2\tpathTosample2.vcf\n
     * ...
     *
     * The sample names must be unique.
     * @param sampleToFileMapPath path to the mapping file
     * @return map of sample name to corresponding file, the map will be ordered according to the order in the input file
     */
    public static LinkedHashMap<String, URI> loadSampleNameMapFile(final Path sampleToFileMapPath) {
        try {
            final List<String> lines = Files.readAllLines(sampleToFileMapPath);
            if (lines.isEmpty()) {
                throw new UserException.BadInput(
                        "At least 1 sample is required but none were found in the sample mapping file");
            }

            final LinkedHashMap<String, URI> sampleToFilename = new LinkedHashMap<>();
            for (final String line : lines) {
                final String[] split = line.split("\\t", -1);
                if (split.length != 2) {
                    throw new UserException.BadInput("Expected a file with 2 fields per line in the format\nSample\tFile\n but found line: \""
                            + line + "\" with " + split.length + " fields");
                }
                if (!split[0].trim().equals(split[0]) || split[0].trim().isEmpty() || split[1].trim().isEmpty()) {
                    throw new UserException.BadInput("Expected a file of format\nSample\tFile\n but found line: '" + line
                            + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace");
                }
                final String sample = split[0];
                final String path = split[1].trim();
                try {
                    final URI oldPath = sampleToFilename.put(sample, new URI(path));
                    if (oldPath != null) {
                        throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath);
                    }
                } catch (final URISyntaxException e) {
                    throw new UserException("Malformed URI " + e.toString());
                }
            }
            return sampleToFilename;
        } catch (final IOException e) {
            throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e);
        }
    }

    /**
     * load a tab delimited new line separated file of sample name to URI mapping:
     *
     * ex:
     * Sample1\tpathToSample1.vcf\n
     * Sample2\tpathTosample2.vcf\n
     * ...
     *
     * The sample names must be unique.
     * @param sampleToFileMapPath path to the mapping file
     * @return map of sample name to corresponding file, sorted by sample name
     */
    public static SortedMap<String, URI> loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath) {
        return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath));
    }

    /**
     * Before traversal, fix configuration parameters and initialize
     * GenomicsDB.
     * Hard-coded to handle only VCF files and headers
     */
    @Override
    public void onTraversalStart() {
        if (getIntervalsFromExistingWorkspace) {
            final IntervalList outputList = new IntervalList(getBestAvailableSequenceDictionary());
            intervals.forEach(i -> outputList.add(new Interval(i.getContig(), i.getStart(), i.getEnd())));
            final Path intervalListOutputPath = IOUtils.getPath(intervalListOutputPathString);
            outputList.write(intervalListOutputPath.toFile());
            return;
        }
        String workspaceDir = BucketUtils.makeFilePathAbsolute(overwriteCreateOrCheckWorkspace());
        vidMapJSONFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
        callsetMapJSONFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
        vcfHeaderFile = IOUtils.appendPathToDir(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
        if (doIncrementalImport) {
            logger.info("Callset Map JSON file will be re-written to " + callsetMapJSONFile);
            logger.info("Incrementally importing to workspace - " + workspaceDir);
        } else {
            logger.info("Vid Map JSON file will be written to " + vidMapJSONFile);
            logger.info("Callset Map JSON file will be written to " + callsetMapJSONFile);
            logger.info("Complete VCF Header will be written to " + vcfHeaderFile);
            logger.info("Importing to workspace - " + workspaceDir);
        }
        initializeInputPreloadExecutorService();
    }

    private void initializeInputPreloadExecutorService() {
        if( vcfInitializerThreads > 1) {
            if( intervals.size() == 1) {
                final ThreadFactory threadFactory = new ThreadFactoryBuilder()
                        .setNameFormat("readerInitializer-thread-%d")
                        .setDaemon(true)
                        .build();
                this.inputPreloadExecutorService = Executors.newFixedThreadPool(vcfInitializerThreads, threadFactory);
            } else {
                logger.warn("GenomicsDBImport cannot use multiple VCF reader threads for initialization when the " +
                        "number of intervals is greater than 1. Falling back to serial VCF reader initialization.");
                inputPreloadExecutorService = null;
            }
        } else {
            inputPreloadExecutorService = null;
        }
    }

    private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap(
            final Map<String, URI> sampleNameToVcfPath, final int batchSize, final int index) {
        // TODO: fix casting since it's really ugly
        return inputPreloadExecutorService != null ?
                getFeatureReadersInParallel((SortedMap<String, URI>) sampleNameToVcfPath, batchSize, index) :
                getFeatureReadersSerially(sampleNameToVcfPath, batchSize, index);
    }

    private Void logMessageOnBatchCompletion(final BatchCompletionCallbackFunctionArgument arg) {
        progressMeter.update(intervals.get(0));
        logger.info("Done importing batch " + arg.batchCount + "/" + arg.totalBatchCount);
        this.batchCount = arg.batchCount + 1;
        return null;
    }

    private List<GenomicsDBImportConfiguration.Partition> generatePartitionListFromIntervals(List<SimpleInterval> chromosomeIntervals) {
        return chromosomeIntervals.stream().map(interval -> {
            GenomicsDBImportConfiguration.Partition.Builder partitionBuilder = GenomicsDBImportConfiguration.Partition.newBuilder();
            Coordinates.ContigPosition.Builder contigPositionBuilder = Coordinates.ContigPosition.newBuilder();
            Coordinates.GenomicsDBColumn.Builder columnBuilder = Coordinates.GenomicsDBColumn.newBuilder();
            //begin
            contigPositionBuilder.setContig(interval.getContig()).setPosition(interval.getStart());
            columnBuilder.setContigPosition(contigPositionBuilder.build());
            partitionBuilder.setBegin(columnBuilder.build());
            //end
            contigPositionBuilder.setPosition(interval.getEnd());
            columnBuilder.setContigPosition(contigPositionBuilder.build());
            partitionBuilder.setEnd(columnBuilder.build());
            partitionBuilder.setWorkspace(workspace);
            partitionBuilder.setGenerateArrayNameFromPartitionBounds(true);
            return partitionBuilder.build();
        }).collect(Collectors.toList());
    }

    private void generateIntervalListFromWorkspace() {
        String[] partitions = GenomicsDBUtils.listGenomicsDBArrays(workspace);
        intervals = new ArrayList<>();
        for (int i = 0; i < partitions.length; i++) {
            final String[] partitionInfo = partitions[i].split(Constants.CHROMOSOME_FOLDER_DELIMITER_SYMBOL_REGEX);
            if (partitionInfo.length != 3) {
                throw new UserException("Expected GenomicsDB array names in <contig><delimiter><start><delimiter><end> format:" +
                        partitions[i] + " where <delimiter> is " + Constants.CHROMOSOME_FOLDER_DELIMITER_SYMBOL_REGEX +
                        "\nWas the specified workspace created using GenomicsDBImport?");
            }
            final String contig = partitionInfo[0];
            final int start = Integer.parseInt(partitionInfo[1]);
            final int end = Integer.parseInt(partitionInfo[2]);
            intervals.add(new SimpleInterval(contig, start, end));
        }
    }

    private ImportConfig createImportConfig(final int batchSize) {
        final List<GenomicsDBImportConfiguration.Partition> partitions = generatePartitionListFromIntervals(intervals);
        GenomicsDBImportConfiguration.ImportConfiguration.Builder importConfigurationBuilder =
                GenomicsDBImportConfiguration.ImportConfiguration.newBuilder();
        importConfigurationBuilder.addAllColumnPartitions(partitions);
        importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample);
        importConfigurationBuilder.setFailIfUpdating(true && !doIncrementalImport);
        importConfigurationBuilder.setSegmentSize(segmentSize);
        importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
        importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations);
        ImportConfig importConfig = new ImportConfig(importConfigurationBuilder.build(), validateSampleToReaderMap, true,
                batchSize, mergedHeaderLines, sampleNameToVcfPath, this::createSampleToReaderMap, doIncrementalImport);
        importConfig.setOutputCallsetmapJsonFile(callsetMapJSONFile);
        importConfig.setOutputVidmapJsonFile(vidMapJSONFile);
        importConfig.setOutputVcfHeaderFile(vcfHeaderFile);
        importConfig.setUseSamplesInOrder(true);
        importConfig.setFunctionToCallOnBatchCompletion(this::logMessageOnBatchCompletion);
        return importConfig;
    }

    /**
     * A complete traversal from start to finish. This method will import all samples
     * specified in the input GVCF files.
*/ @Override public void traverse() { // if we're just trying to get interval list, nothing to be done if (getIntervalsFromExistingWorkspace) { return; } // Force the progress meter to update after every batch progressMeter.setRecordsBetweenTimeChecks(1L); final int sampleCount = sampleNameToVcfPath.size(); final int updatedBatchSize = (batchSize == DEFAULT_ZERO_BATCH_SIZE) ? sampleCount : batchSize; final ImportConfig importConfig = createImportConfig(updatedBatchSize); GenomicsDBImporter importer; try { importer = new GenomicsDBImporter(importConfig); // Modify importer directly from updateImportProtobufVidMapping. org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.updateImportProtobufVidMapping(importer); importer.executeImport(maxNumIntervalsToImportInParallel); } catch (final IOException e) { throw new UserException("Error initializing GenomicsDBImporter", e); } catch (final IllegalArgumentException iae) { throw new GATKException("Null feature reader found in sampleNameMap file: " + sampleNameMapFile, iae); } catch (final CompletionException ce) { throw (ce.getCause() instanceof RuntimeException ? (RuntimeException) ce.getCause() : ce); } } @Override public Object onTraversalSuccess() { if (getIntervalsFromExistingWorkspace) { logger.info("Interval list generated!"); } else if (batchSize == DEFAULT_ZERO_BATCH_SIZE) { logger.info("Import completed!"); } else { logger.info("Import of all batches to GenomicsDB completed!"); } return true; } /** * Method to create feature readers for input files or GCS URLs * in the current batch * * @param sampleNametoPath Sample name to file name mapping * @param batchSize Current batch size * @param lowerSampleIndex 0-based Lower bound of sample index -- inclusive * @return Feature readers to be imported in the current batch, sorted by sample name */ private SortedMap> getFeatureReadersInParallel( final SortedMap sampleNametoPath, final int batchSize, final int lowerSampleIndex) { final SortedMap> sampleToReaderMap = new TreeMap<>(); logger.info("Starting batch input file preload"); final Map>> futures = new LinkedHashMap<>(); final List sampleNames = new ArrayList<>(sampleNametoPath.keySet()); for(int i = lowerSampleIndex; i < sampleNametoPath.size() && i < lowerSampleIndex+batchSize; ++i) { final String sampleName = sampleNames.get(i); futures.put(sampleName, inputPreloadExecutorService.submit(() -> { final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString()); try { return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0)); } catch (final IOException e) { throw new UserException.CouldNotReadInputFile("Couldn't read file: " + variantPath.toUri(), e); } })); } futures.forEach((sampleName, future) -> { try { final FeatureReader reader = future.get(); sampleToReaderMap.put(sampleName, reader); } catch (InterruptedException | ExecutionException e) { throw new UserException.CouldNotReadInputFile("Failure while waiting for FeatureReader to initialize ", e); } }); logger.info("Finished batch preload"); logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples"); return sampleToReaderMap; } private SortedMap> getFeatureReadersSerially(final Map sampleNameToPath, final int batchSize, final int lowerSampleIndex){ final SortedMap> sampleToReaderMap = new TreeMap<>(); final List sampleNames = new ArrayList<>(sampleNameToPath.keySet()); for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) { final String sampleName 
= sampleNames.get(i); final FeatureReader reader = getReaderFromPath(IOUtils.getPath(sampleNameToPath.get(sampleName).toString())); sampleToReaderMap.put(sampleName, reader); } logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples"); return sampleToReaderMap; } /** * Creates a feature reader object from a given VCF URI (can also be * a local file path) and returns it * @return Feature reader * @param variantPath */ private FeatureReader getReaderFromPath(final Path variantPath) { final String variantURI = variantPath.toAbsolutePath().toUri().toString(); try { final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), true, BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer), BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer)); /* Anonymous FeatureReader subclass that wraps returned iterators to ensure that the GVCFs do not * contain MNPs. */ return new FeatureReader() { /** Iterator that asserts that variants are not MNPs. */ class NoMnpIterator implements CloseableTribbleIterator { private final CloseableTribbleIterator inner; NoMnpIterator(CloseableTribbleIterator inner) { this.inner = inner; } @Override public void close() { inner.close(); } @Override public Iterator iterator() { return this; } @Override public boolean hasNext() { return inner.hasNext(); } @Override public VariantContext next() { if (!hasNext()) throw new NoSuchElementException(); final VariantContext vc = inner.next(); if (GATKVariantContextUtils.isUnmixedMnpIgnoringNonRef(vc)) { throw new UserException.BadInput(String.format( "GenomicsDBImport does not support GVCFs with MNPs. MNP found at %1s:%2d in VCF %3s", vc.getContig(), vc.getStart(), variantPath.toAbsolutePath() )); } return vc; } } @Override public void close() throws IOException { reader.close(); } @Override public List getSequenceNames() { return reader.getSequenceNames(); } @Override public Object getHeader() { return reader.getHeader(); } @Override public boolean isQueryable() { return reader.isQueryable(); } @Override public CloseableTribbleIterator query(Locatable locus) throws IOException { return new NoMnpIterator(reader.query(locus)); } @Override public CloseableTribbleIterator query(String chr, int start, int end) throws IOException { return new NoMnpIterator(reader.query(chr, start, end)); } @Override public CloseableTribbleIterator iterator() throws IOException { return new NoMnpIterator(reader.iterator()); } }; } catch (final TribbleException e){ throw new UserException("Failed to create reader from " + variantURI, e); } } /** * Input argument "overwriteExistingWorkspace" defaults to false. * The tool creates a new workspace if it doesn't exist, or checks that * a workspace exists in the incremental import case. Deletes * an existing workspace if argument is true * * @return The workspace directory */ private String overwriteCreateOrCheckWorkspace() { String workspaceDir = BucketUtils.makeFilePathAbsolute(workspace); // From JavaDoc for GenomicsDBUtils.createTileDBWorkspace // returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first. 
// returnCode = -1 : path was not a directory // returnCode = -2 : failed to create workspace // returnCode = 1 : if overwriteExistingWorkspace is false, return 1 if directory already exists int returnCode = GenomicsDBUtils.createTileDBWorkspace(workspaceDir, overwriteExistingWorkspace); if (returnCode == -1) { throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace + " already exists and is not a directory"); } else if (returnCode < 0) { throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace); } else if (!overwriteExistingWorkspace && returnCode == 1) { if (doIncrementalImport) { return workspaceDir; } else { throw new UnableToCreateGenomicsDBWorkspace("Error creating GenomicsDB workspace: " + workspace + " already exists"); } } else { if (doIncrementalImport) { throw new UserException("We require an existing valid workspace when incremental import is set"); } return workspaceDir; } } static class UnableToCreateGenomicsDBWorkspace extends UserException { private static final long serialVersionUID = 1L; UnableToCreateGenomicsDBWorkspace(final String message){ super(message); } } /** * Loads our intervals using the best available sequence * dictionary (as returned by {@link #getBestAvailableSequenceDictionary}) * to parse/verify them. Does nothing if no intervals were specified. */ protected void initializeIntervals() { if (intervalArgumentCollection.intervalsSpecified()) { if (getIntervalsFromExistingWorkspace || doIncrementalImport) { logger.warn(INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" was set, so ignoring specified intervals." + "The tool will use the intervals specified by the initial import"); generateIntervalListFromWorkspace(); return; } final SAMSequenceDictionary intervalDictionary = getBestAvailableSequenceDictionary(); if (intervalDictionary == null) { throw new UserException("We require at least one input source that " + "has a sequence dictionary (reference or reads) when intervals are specified"); } intervals = new ArrayList<>(); final List simpleIntervalList = intervalArgumentCollection.getIntervals(intervalDictionary); if (!mergeInputIntervals && simpleIntervalList.size() > INTERVAL_LIST_SIZE_WARNING_THRESHOLD) { logger.warn(String.format( "A large number of intervals were specified. " + "Using more than %d intervals in a single import is not recommended and can cause performance to suffer. " + "If GVCF data only exists within those intervals, performance can be improved by aggregating intervals with the " + MERGE_INPUT_INTERVALS_LONG_NAME + " argument.", INTERVAL_LIST_SIZE_WARNING_THRESHOLD) ); } intervals = mergeInputIntervals ? 
IntervalUtils.getSpanningIntervals(simpleIntervalList, getBestAvailableSequenceDictionary()) : simpleIntervalList; } else if (getIntervalsFromExistingWorkspace || doIncrementalImport) { // in incremental import case, we don't care if intervals were specified generateIntervalListFromWorkspace(); } else { throw new UserException("No intervals specified"); } } @Override public void onShutdown(){ if(inputPreloadExecutorService != null) { inputPreloadExecutorService.shutdownNow(); } } /** * Overriding getBestAvailableSequenceDictionary() to prefer the mergedVCFHeader's * sequence directory, if present, over any other dictionaries * * @return Sequence directory from merged header, or super.getBestAvailableSequenceDictionary() * if none */ @Override public SAMSequenceDictionary getBestAvailableSequenceDictionary() { final SAMSequenceDictionary sequenceDictionary = mergedHeaderSequenceDictionary; if (sequenceDictionary == null) { return super.getBestAvailableSequenceDictionary(); } else { return sequenceDictionary; } } /** * This class is a hack to force parallel loading of the headers and indexes of remote gvcf files. * It initializes a feature reader and starts a query. This causes the header and index to be read, and also causes any * pre-fetching to begin if enabled. It is very narrowly crafted and should not be used for other purposes. */ private static final class InitializedQueryWrapper implements FeatureReader { private final FeatureReader reader; private final SimpleInterval interval; private CloseableTribbleIterator query; private InitializedQueryWrapper(final FeatureReader reader, final Locatable interval) throws IOException { this.reader = reader; this.interval = new SimpleInterval(interval); this.query = reader.query(interval.getContig(), interval.getStart(), interval.getEnd()); } @Override public CloseableTribbleIterator query(final String chr, final int start, final int end) { final SimpleInterval queryInterval = new SimpleInterval(chr, start, end); if( !interval.equals(queryInterval)){ throw new GATKException("Cannot call query with different interval, expected:" + this.interval + " queried with: " + queryInterval); } if( query != null ){ final CloseableTribbleIterator tmp = query; query = null; return tmp; } else { throw new GATKException("Cannot call query twice on this wrapper."); } } @Override public CloseableTribbleIterator iterator() { throw new UnsupportedOperationException("iterator() not supported, this should not have been called and indicates an issue with GenomicsDB integration"); } @Override public void close() throws IOException { reader.close(); } @Override public List getSequenceNames() { throw new UnsupportedOperationException("getSequenceNames() not supported, this should not have been called and indicates an issue with GenomicsDB integration"); } @Override public Object getHeader() { return reader.getHeader(); } } }



