All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.spark.pathseq.PathSeqBuildKmers Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.tools.spark.pathseq;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceFileSparkSource;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerShort;
import org.broadinstitute.hellbender.tools.spark.utils.LargeLongHopscotchSet;
import org.broadinstitute.hellbender.tools.spark.utils.LongBloomFilter;

import java.util.Collection;

/**
 * Produce a set of k-mers from the given host reference. The output file from this tool is required to run the PathSeq pipeline.
 *
 * 

The tool works by scanning the reference one position at a time. It takes the k-mer (k-base subsequence) starting at each * consecutive position and adds it to a set. By default, the set is stored as a * hash table.

* *

Users also have the option to represent the k-mers set using a Bloom * filter by specifying a non-zero value for the --bloom-false-positive-probability parameter. This uses less memory * than the default hash set but also can produce false positives. In other words, when * asked whether a non-host k-mer exists in the set, it will incorrectly say yes with a probability, p. The user can * specify p so that the probability of incorrectly subtracting a non-host read is negligibly * small. For p = 0.0001 and read length of 151 bases, the probability of the PathSeq incorrectly subtracting a non-host * read is < 1.5%, but the amount of memory used is reduced 4-fold compared to a hash table. For this reason, Bloom * filters are generally recommended.

* *

Note that the file formats used for storing these k-mer data structures are only readable by the PathSeq tools.

* *

Input

*
    *
  • An indexed host reference in FASTA format
  • *
* *

Output

*
    *
  • A set of the k-mers in the reference
  • *
* *

Usage examples

* *

Builds a hash table of every k-mer (k = 31) in the reference. Each k-mer is masked at the 16th position.

*
 * gatk PathSeqBuildKmers  \
 *   --reference host_reference.fasta \
 *   --output host_reference.hss \
 *   --kmer-mask 16 \
 *   --kmer-size 31
 * 
* *

Builds a Bloom filter with false positive probability p < 0.001.

*
 * gatk PathSeqBuildKmers  \
 *   --reference host_reference.fasta \
 *   --output host_reference.hss \
 *   --bloom-false-positive-probability 0.001 \
 *   --kmer-mask 16 \
 *   --kmer-size 31
 * 
* *

Notes

* *

For most references, the Java VM will run out of memory with the default settings. The Java heap size limit should * be set at least 20x the size of the reference (less if building a Bloom filter). For example, for a 3 GB reference set * the limit to 60 GB by adding --java-options "-Xmx60g" to the command.

* * @author Mark Walker <[email protected]> */ @DocumentedFeature @CommandLineProgramProperties(summary = "Produce a set of k-mers from the given host reference. The output file from this tool is required to run the PathSeq pipeline.", oneLineSummary = "Builds set of host reference k-mers", programGroup = MetagenomicsProgramGroup.class) public final class PathSeqBuildKmers extends CommandLineProgram { public static final String REFERENCE_LONG_NAME = StandardArgumentDefinitions.REFERENCE_LONG_NAME; public static final String REFERENCE_SHORT_NAME = StandardArgumentDefinitions.REFERENCE_SHORT_NAME; public static final String BLOOM_FILTER_FALSE_POSITIVE_P_LONG_NAME = "bloom-false-positive-probability"; public static final String BLOOM_FILTER_FALSE_POSITIVE_P_SHORT_NAME = "P"; public static final String KMER_SIZE_LONG_NAME = "kmer-size"; public static final String KMER_SIZE_SHORT_NAME = "SZ"; public static final String KMER_MASK_LONG_NAME = "kmer-mask"; public static final String KMER_MASK_SHORT_NAME = "M"; public static final String KMER_SPACING_LONG_NAME = "kmer-spacing"; public static final String KMER_SPACING_SHORT_NAME = "SP"; @Argument(doc = "File for k-mer set output. Extension will be automatically added if not present (" + PSKmerUtils.HOPSCOTCH_SET_EXTENSION + " for hash set or " + PSKmerUtils.BLOOM_FILTER_EXTENSION + " for Bloom filter)", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME) public String outputFile; @Argument(doc = "Reference FASTA file path on local disk", fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME, shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME) public GATKPath reference; /** *

Note that the provided argument is used as an upper limit on the probability, and the actual false positive * probability may be less.

*/ @Argument(doc = "If non-zero, creates a Bloom filter with this false positive probability", fullName = BLOOM_FILTER_FALSE_POSITIVE_P_LONG_NAME, shortName = BLOOM_FILTER_FALSE_POSITIVE_P_SHORT_NAME, minValue = 0.0, maxValue = 1.0, maxRecommendedValue = 0.001, optional = true) public double bloomFpp = 0; /** * Reducing the k-mer length will increase the number of host reads subtracted in the * filtering phase of the pipeline, but it may also increase the number of non-host (i.e. microbial) * reads that are incorrectly subtracted. Note that changing the length of the k-mer does not affect memory usage. */ @Argument(doc = "K-mer size, must be odd and less than 32", fullName = KMER_SIZE_LONG_NAME, shortName = KMER_SIZE_SHORT_NAME, minValue = 1, maxValue = 31, optional = true) public int kmerSize = 31; /** * K-mer masking allows mismatches to occur at one or more specified positions. Masking the middle base is recommended * to enhance host read detection. */ @Argument(doc = "Comma-delimited list of base indices (starting with 0) to mask in each k-mer", fullName = KMER_MASK_LONG_NAME, shortName = KMER_MASK_SHORT_NAME, optional = true) public String kmerMaskString = ""; /** * The k-mer set size can be reduced by only storing k-mers starting at every n bases in the reference. By default * every k-mer, starting at consecutive bases in the reference, is stored. */ @Argument(doc = "Spacing between successive k-mers", fullName = KMER_SPACING_LONG_NAME, shortName = KMER_SPACING_SHORT_NAME, minValue = 1, optional = true) public int kmerSpacing = 1; /** * Get the list of distinct kmers in the reference, and write them to a file as a HopScotch set or Bloom filter. */ @Override protected Object doWork() { final ReferenceFileSparkSource reference = new ReferenceFileSparkSource(this.reference); final byte[] maskBytes = PSUtils.parseMask(kmerMaskString, kmerSize); final SVKmerShort kmerMask = SVKmerShort.getMask(maskBytes, kmerSize); logger.info("Loading reference kmers..."); final Collection maskedKmerCollection = PSKmerUtils.getMaskedKmersFromLocalReference(reference, kmerSize, kmerSpacing, kmerMask); final long numLongs = PSKmerUtils.longArrayCollectionSize(maskedKmerCollection); if (bloomFpp > 0) { logger.info("Building Bloom filter with false positive probability " + bloomFpp + "..."); final LongBloomFilter bloomFilter = PSKmerUtils.longArrayCollectionToBloomFilter(maskedKmerCollection, numLongs, bloomFpp); final PSKmerBloomFilter kmerBloomFilter = new PSKmerBloomFilter(bloomFilter, kmerSize, kmerMask, numLongs); logger.info("Theoretical Bloom filter false positive probability: " + kmerBloomFilter.getFalsePositiveProbability()); PSKmerUtils.writeKmerBloomFilter(outputFile, kmerBloomFilter); } else { logger.info("Building kmer hash set..."); final LargeLongHopscotchSet kmerHopscotchSet = PSKmerUtils.longArrayCollectionToSet(maskedKmerCollection, numLongs); final PSKmerSet kmerSet = new PSKmerSet(kmerHopscotchSet, kmerSize, kmerMask); PSKmerUtils.writeKmerSet(outputFile, kmerSet); } return null; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy