 
                        
        
                        
        org.broadinstitute.hellbender.tools.spark.pathseq.PathSeqBuildKmers Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.spark.pathseq;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.datasources.ReferenceFileSparkSource;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerShort;
import org.broadinstitute.hellbender.tools.spark.utils.LargeLongHopscotchSet;
import org.broadinstitute.hellbender.tools.spark.utils.LongBloomFilter;
import java.util.Collection;
/**
 * Produce a set of k-mers from the given host reference. The output file from this tool is required to run the PathSeq pipeline.
 *
 * The tool works by scanning the reference one position at a time. It takes the k-mer (k-base subsequence) starting at each
 * consecutive position and adds it to a set. By default, the set is stored as a
 * hash table.
 *
 * Users also have the option to represent the k-mers set using a Bloom
 * filter by specifying a non-zero value for the --bloom-false-positive-probability parameter. This uses less memory
 * than the default hash set but also can produce false positives. In other words, when
 * asked whether a non-host k-mer exists in the set, it will incorrectly say yes with a probability, p. The user can
 * specify p so that the probability of incorrectly subtracting a non-host read is negligibly
 * small. For p = 0.0001 and read length of 151 bases, the probability of the PathSeq incorrectly subtracting a non-host
 * read is < 1.5%, but the amount of memory used is reduced 4-fold compared to a hash table. For this reason, Bloom
 * filters are generally recommended.
 *
 * Note that the file formats used for storing these k-mer data structures are only readable by the PathSeq tools.
 *
 * Input
 * 
 *     - An indexed host reference in FASTA format*
*
 *Output
 * 
 *     - A set of the k-mers in the reference*
*
 *Usage examples
 *
 * Builds a hash table of every k-mer (k = 31) in the reference. Each k-mer is masked at the 16th position.
 * 
 * gatk PathSeqBuildKmers  \
 *   --reference host_reference.fasta \
 *   --output host_reference.hss \
 *   --kmer-mask 16 \
 *   --kmer-size 31
 * 
 *
 * Builds a Bloom filter with false positive probability p < 0.001.
 * 
 * gatk PathSeqBuildKmers  \
 *   --reference host_reference.fasta \
 *   --output host_reference.hss \
 *   --bloom-false-positive-probability 0.001 \
 *   --kmer-mask 16 \
 *   --kmer-size 31
 * 
 *
 * Notes
 *
 * For most references, the Java VM will run out of memory with the default settings. The Java heap size limit should
 * be set at least 20x the size of the reference (less if building a Bloom filter). For example, for a 3 GB reference set
 * the limit to 60 GB by adding --java-options "-Xmx60g" to the command.
 *
 * @author Mark Walker <[email protected]>
 */
@DocumentedFeature
@CommandLineProgramProperties(summary = "Produce a set of k-mers from the given host reference. The output file from this tool is required to run the PathSeq pipeline.",
        oneLineSummary = "Builds set of host reference k-mers",
        programGroup = MetagenomicsProgramGroup.class)
public final class PathSeqBuildKmers extends CommandLineProgram {
    public static final String REFERENCE_LONG_NAME = StandardArgumentDefinitions.REFERENCE_LONG_NAME;
    public static final String REFERENCE_SHORT_NAME = StandardArgumentDefinitions.REFERENCE_SHORT_NAME;
    public static final String BLOOM_FILTER_FALSE_POSITIVE_P_LONG_NAME = "bloom-false-positive-probability";
    public static final String BLOOM_FILTER_FALSE_POSITIVE_P_SHORT_NAME = "P";
    public static final String KMER_SIZE_LONG_NAME = "kmer-size";
    public static final String KMER_SIZE_SHORT_NAME = "SZ";
    public static final String KMER_MASK_LONG_NAME = "kmer-mask";
    public static final String KMER_MASK_SHORT_NAME = "M";
    public static final String KMER_SPACING_LONG_NAME = "kmer-spacing";
    public static final String KMER_SPACING_SHORT_NAME = "SP";
    @Argument(doc = "File for k-mer set output. Extension will be automatically added if not present ("
            + PSKmerUtils.HOPSCOTCH_SET_EXTENSION + " for hash set or "
            + PSKmerUtils.BLOOM_FILTER_EXTENSION + " for Bloom filter)",
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
    public String outputFile;
    @Argument(doc = "Reference FASTA file path on local disk",
            fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
            shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME)
    public GATKPath reference;
    /**
     * Note that the provided argument is used as an upper limit on the probability, and the actual false positive
     * probability may be less.
     */
    @Argument(doc = "If non-zero, creates a Bloom filter with this false positive probability",
            fullName = BLOOM_FILTER_FALSE_POSITIVE_P_LONG_NAME,
            shortName = BLOOM_FILTER_FALSE_POSITIVE_P_SHORT_NAME,
            minValue = 0.0,
            maxValue = 1.0,
            maxRecommendedValue = 0.001,
            optional = true)
    public double bloomFpp = 0;
    /**
     * Reducing the k-mer length will increase the number of host reads subtracted in the
     * filtering phase of the pipeline, but it may also increase the number of non-host (i.e. microbial)
     * reads that are incorrectly subtracted. Note that changing the length of the k-mer does not affect memory usage.
     */
    @Argument(doc = "K-mer size, must be odd and less than 32",
            fullName = KMER_SIZE_LONG_NAME,
            shortName = KMER_SIZE_SHORT_NAME,
            minValue = 1,
            maxValue = 31,
            optional = true)
    public int kmerSize = 31;
    /**
     * K-mer masking allows mismatches to occur at one or more specified positions. Masking the middle base is recommended
     * to enhance host read detection.
     */
    @Argument(doc = "Comma-delimited list of base indices (starting with 0) to mask in each k-mer",
            fullName = KMER_MASK_LONG_NAME,
            shortName = KMER_MASK_SHORT_NAME,
            optional = true)
    public String kmerMaskString = "";
    /**
     * The k-mer set size can be reduced by only storing k-mers starting at every n bases in the reference. By default
     * every k-mer, starting at consecutive bases in the reference, is stored.
     */
    @Argument(doc = "Spacing between successive k-mers",
            fullName = KMER_SPACING_LONG_NAME,
            shortName = KMER_SPACING_SHORT_NAME,
            minValue = 1,
            optional = true)
    public int kmerSpacing = 1;
    /**
     * Get the list of distinct kmers in the reference, and write them to a file as a HopScotch set or Bloom filter.
     */
    @Override
    protected Object doWork() {
        final ReferenceFileSparkSource reference = new ReferenceFileSparkSource(this.reference);
        final byte[] maskBytes = PSUtils.parseMask(kmerMaskString, kmerSize);
        final SVKmerShort kmerMask = SVKmerShort.getMask(maskBytes, kmerSize);
        logger.info("Loading reference kmers...");
        final Collection maskedKmerCollection = PSKmerUtils.getMaskedKmersFromLocalReference(reference, kmerSize, kmerSpacing, kmerMask);
        final long numLongs = PSKmerUtils.longArrayCollectionSize(maskedKmerCollection);
        if (bloomFpp > 0) {
            logger.info("Building Bloom filter with false positive probability " + bloomFpp + "...");
            final LongBloomFilter bloomFilter = PSKmerUtils.longArrayCollectionToBloomFilter(maskedKmerCollection, numLongs, bloomFpp);
            final PSKmerBloomFilter kmerBloomFilter = new PSKmerBloomFilter(bloomFilter, kmerSize, kmerMask, numLongs);
            logger.info("Theoretical Bloom filter false positive probability: " + kmerBloomFilter.getFalsePositiveProbability());
            PSKmerUtils.writeKmerBloomFilter(outputFile, kmerBloomFilter);
        } else {
            logger.info("Building kmer hash set...");
            final LargeLongHopscotchSet kmerHopscotchSet = PSKmerUtils.longArrayCollectionToSet(maskedKmerCollection, numLongs);
            final PSKmerSet kmerSet = new PSKmerSet(kmerHopscotchSet, kmerSize, kmerMask);
            PSKmerUtils.writeKmerSet(outputFile, kmerSet);
        }
        return null;
    }
}