org.broadinstitute.hellbender.tools.spark.pathseq.PathSeqBuildReferenceTaxonomy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.ReferenceInputArgumentCollection;
import org.broadinstitute.hellbender.cmdline.argumentcollections.RequiredReferenceInputArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import scala.Tuple2;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;

/**
 * Build an annotated taxonomy datafile for a given microbe reference. The output file from this tool is required to run the PathSeq pipeline.
 *
 * The tool reads the list of sequence accessions from the given reference. For each accession, it looks up the NCBI
 * taxonomic ID of the corresponding organism and builds a taxonomic tree containing only organisms that are
 * represented in the reference. The reference should only contain sequences from NCBI RefSeq and/or Genbank databases.
 *
 * Input
 * 
 *     An indexed microbe reference in FASTA format (NCBI RefSeq/Genbank sequences)
 *     Downloaded NCBI RefSeq (and/or GenBank) catalog archive file(s)
 *     Downloaded NCBI taxonomy archive file
 * 
 *
 * See argument documentation for information about where to download the archive files.
 *
 * Output
 * 
 *     A binary file containing reference taxonomy information
 * 
 *
 * Usage examples
 *
 *  * gatk PathSeqBuildReferenceTaxonomy \
 *   --reference microbe_reference.fasta \
 *   --output taxonomy.db \
 *   --refseq-catalog RefSeq-releaseXX.catalog.gz \
 *   --tax-dump taxdump.tar.gz \
 *   --min-non-virus-contig-length 2000
 * 
 *
 * Notes
 *
 * Often there are inconsistencies between the reference sequences, NCBI catalog, and taxonomy archive. To
 * minimize this issue, ensure that the input files are retrieved on the same date.
 *
 * @author Mark Walker <[email protected]>
 */
@DocumentedFeature
@CommandLineProgramProperties(summary = "Build an annotated taxonomy datafile for a given microbe reference. The output file from this tool is required to run the PathSeq pipeline.",
        oneLineSummary = "Builds a taxonomy datafile of the microbe reference",
        programGroup = MetagenomicsProgramGroup.class)
public class PathSeqBuildReferenceTaxonomy extends CommandLineProgram {

    public static final String REFSEQ_CATALOG_LONG_NAME = "refseq-catalog";
    public static final String REFSEQ_CATALOG_SHORT_NAME = "RC";
    public static final String GENBANK_CATALOG_LONG_NAME = "genbank-catalog";
    public static final String GENBANK_CATALOG_SHORT_NAME = "GC";
    public static final String TAX_DUMP_LONG_NAME = "tax-dump";
    public static final String TAX_DUMP_SHORT_NAME = "TD";
    public static final String MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME = "min-non-virus-contig-length";
    public static final String MIN_NON_VIRUS_CONTIG_LENGTH_SHORT_NAME = MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME;

    @ArgumentCollection
    protected final ReferenceInputArgumentCollection referenceArguments = new RequiredReferenceInputArgumentCollection();

    @Argument(doc = "Local path for the output file. By convention, the extension should be \".db\"",
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
    public String outputPath;

    @Argument(doc = "Local path to catalog file " +
            "(RefSeq-releaseXX.catalog.gz available at ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/)",
            fullName = REFSEQ_CATALOG_LONG_NAME,
            shortName = REFSEQ_CATALOG_SHORT_NAME,
            optional = true)
    public String refseqCatalogPath = null;

    /**
     * This may be supplied alone or in addition to the RefSeq catalog in the case that sequences from GenBank are
     * present in the reference.
     */
    @Argument(doc = "Local path to Genbank catalog file " +
            "(gbXXX.catalog.XXX.txt.gz at ftp://ftp.ncbi.nlm.nih.gov/genbank/catalog/)",
            fullName = GENBANK_CATALOG_LONG_NAME,
            shortName = GENBANK_CATALOG_SHORT_NAME,
            optional = true)
    public String genbankCatalogPath = null;

    @Argument(doc = "Local path to taxonomy dump tarball (taxdump.tar.gz available at ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/)",
            fullName = TAX_DUMP_LONG_NAME,
            shortName = TAX_DUMP_SHORT_NAME)
    public String taxdumpPath;

    /**
     * Sequences from non-virus organisms less than this length will be filtered out such that any reads aligning to them will
     * be ignored. This is a quality control measure to remove shorter sequences from draft genomes that are likely to
     * contain sequencing artifacts such as cross-species contamination or sequencing adapters. Note this may
     * remove some bacteria plasmid sequences.
     */
    @Argument(doc = "Minimum reference contig length for non-viruses",
            fullName = MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME,
            shortName = MIN_NON_VIRUS_CONTIG_LENGTH_SHORT_NAME,
            minValue = 0,
            minRecommendedValue = 500,
            maxRecommendedValue = 10000)
    public int minNonVirusContigLength = 0;

    @Override
    public Object doWork() {

        if (refseqCatalogPath == null && genbankCatalogPath == null) {
            throw new UserException.BadInput("At least one of --refseq-catalog or --genbank-catalog must be specified");
        }

        logger.info("Parsing reference and files... (this may take a few minutes)");
        final ReferenceDataSource reference = ReferenceDataSource.of(referenceArguments.getReferencePath());
        if (reference.getSequenceDictionary() == null) {
            throw new UserException.BadInput("Reference sequence dictionary not found. Please build one using CreateSequenceDictionary.");
        }
        final List referenceRecords = reference.getSequenceDictionary().getSequences();

        //Parse reference index, filling in data to taxIdToProperties and accessionToNameAndLength where possible
        final Map taxIdToProperties = new HashMap<>();
        final Map> accessionToNameAndLength = PSBuildReferenceTaxonomyUtils.parseReferenceRecords(referenceRecords, taxIdToProperties);

        //Parse RefSeq catalog to determine taxonomic ID's of accession keys in accessionToNameAndLength
        Set accessionsNotFound = null;
        if (refseqCatalogPath != null) {
            try (final BufferedReader refseqCatalogStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderGz(refseqCatalogPath)) {
                accessionsNotFound = PSBuildReferenceTaxonomyUtils.parseCatalog(refseqCatalogStreamReader, accessionToNameAndLength, taxIdToProperties, false, null);
            } catch (IOException e) {
                throw new GATKException("Error reading RefSeq catalog", e);
            }
        }

        //Parse Genbank catalog
        if (genbankCatalogPath != null) {
            try (final BufferedReader genbankCatalogStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderGz(genbankCatalogPath)) {
                accessionsNotFound = PSBuildReferenceTaxonomyUtils.parseCatalog(genbankCatalogStreamReader, accessionToNameAndLength, taxIdToProperties, true, accessionsNotFound);
            } catch (IOException e) {
                throw new GATKException("Error reading GenBank catalog", e);
            }
        }
        if (accessionsNotFound != null && !accessionsNotFound.isEmpty()) {
            PSUtils.logItemizedWarning(logger, accessionsNotFound, "Did not find entries in the catalog for the following reference accessions");
        }

        //Get the scientific name of every taxonomic node (even the ones not in the reference)
        try (final BufferedReader namesStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderTarGz(taxdumpPath, "names.dmp")) {
            PSBuildReferenceTaxonomyUtils.parseNamesFile(namesStreamReader, taxIdToProperties);
        } catch (IOException e) {
            throw new GATKException("Error reading taxdump names files", e);
        }

        //Gets the taxonomic rank (e.g. family, order, genus, species, etc.) and parent of each node
        try (final BufferedReader nodesStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderTarGz(taxdumpPath, "nodes.dmp")) {
            final Collection taxNotFound = PSBuildReferenceTaxonomyUtils.parseNodesFile(nodesStreamReader, taxIdToProperties);
            PSUtils.logItemizedWarning(logger, taxNotFound, "Did not find entry from reference sequence names or the names file for following some tax ID's. Setting name to tax_");
        } catch (IOException e) {
            throw new GATKException("Error reading taxdump names files", e);
        }

        //Build the taxonomic tree and a map from accession ID's to taxonomic ID's
        logger.info("Building taxonomic database...");
        final PSTree tree = PSBuildReferenceTaxonomyUtils.buildTaxonomicTree(taxIdToProperties);
        PSBuildReferenceTaxonomyUtils.removeUnusedTaxIds(taxIdToProperties, tree);
        final Map accessionToTaxId = PSBuildReferenceTaxonomyUtils.buildAccessionToTaxIdMap(taxIdToProperties, tree, minNonVirusContigLength);

        //Write output
        PSBuildReferenceTaxonomyUtils.writeTaxonomyDatabase(outputPath, new PSTaxonomyDatabase(tree, accessionToTaxId));

        return null;
    }

}