All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.spark.pathseq.PathSeqBuildReferenceTaxonomy Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.ReferenceInputArgumentCollection;
import org.broadinstitute.hellbender.cmdline.argumentcollections.RequiredReferenceInputArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.MetagenomicsProgramGroup;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import scala.Tuple2;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;

/**
 * Build an annotated taxonomy datafile for a given microbe reference. The output file from this tool is required to run the PathSeq pipeline.
 *
 * 

The tool reads the list of sequence accessions from the given reference. For each accession, it looks up the NCBI * taxonomic ID of the corresponding organism and builds a taxonomic tree containing only organisms that are * represented in the reference. The reference should only contain sequences from NCBI RefSeq and/or Genbank databases.

* *

Input

*
    *
  • An indexed microbe reference in FASTA format (NCBI RefSeq/Genbank sequences)
  • *
  • Downloaded NCBI RefSeq (and/or GenBank) catalog archive file(s)
  • *
  • Downloaded NCBI taxonomy archive file
  • *
* *

See argument documentation for information about where to download the archive files.

* *

Output

*
    *
  • A binary file containing reference taxonomy information
  • *
* *

Usage examples

* *
 * gatk PathSeqBuildReferenceTaxonomy \
 *   --reference microbe_reference.fasta \
 *   --output taxonomy.db \
 *   --refseq-catalog RefSeq-releaseXX.catalog.gz \
 *   --tax-dump taxdump.tar.gz \
 *   --min-non-virus-contig-length 2000
 * 
* *

Notes

* *

Often there are inconsistencies between the reference sequences, NCBI catalog, and taxonomy archive. To * minimize this issue, ensure that the input files are retrieved on the same date.

* * @author Mark Walker <[email protected]> */ @DocumentedFeature @CommandLineProgramProperties(summary = "Build an annotated taxonomy datafile for a given microbe reference. The output file from this tool is required to run the PathSeq pipeline.", oneLineSummary = "Builds a taxonomy datafile of the microbe reference", programGroup = MetagenomicsProgramGroup.class) public class PathSeqBuildReferenceTaxonomy extends CommandLineProgram { public static final String REFSEQ_CATALOG_LONG_NAME = "refseq-catalog"; public static final String REFSEQ_CATALOG_SHORT_NAME = "RC"; public static final String GENBANK_CATALOG_LONG_NAME = "genbank-catalog"; public static final String GENBANK_CATALOG_SHORT_NAME = "GC"; public static final String TAX_DUMP_LONG_NAME = "tax-dump"; public static final String TAX_DUMP_SHORT_NAME = "TD"; public static final String MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME = "min-non-virus-contig-length"; public static final String MIN_NON_VIRUS_CONTIG_LENGTH_SHORT_NAME = MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME; @ArgumentCollection protected final ReferenceInputArgumentCollection referenceArguments = new RequiredReferenceInputArgumentCollection(); @Argument(doc = "Local path for the output file. By convention, the extension should be \".db\"", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME) public String outputPath; @Argument(doc = "Local path to catalog file " + "(RefSeq-releaseXX.catalog.gz available at ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/)", fullName = REFSEQ_CATALOG_LONG_NAME, shortName = REFSEQ_CATALOG_SHORT_NAME, optional = true) public String refseqCatalogPath = null; /** * This may be supplied alone or in addition to the RefSeq catalog in the case that sequences from GenBank are * present in the reference. */ @Argument(doc = "Local path to Genbank catalog file " + "(gbXXX.catalog.XXX.txt.gz at ftp://ftp.ncbi.nlm.nih.gov/genbank/catalog/)", fullName = GENBANK_CATALOG_LONG_NAME, shortName = GENBANK_CATALOG_SHORT_NAME, optional = true) public String genbankCatalogPath = null; @Argument(doc = "Local path to taxonomy dump tarball (taxdump.tar.gz available at ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/)", fullName = TAX_DUMP_LONG_NAME, shortName = TAX_DUMP_SHORT_NAME) public String taxdumpPath; /** * Sequences from non-virus organisms less than this length will be filtered out such that any reads aligning to them will * be ignored. This is a quality control measure to remove shorter sequences from draft genomes that are likely to * contain sequencing artifacts such as cross-species contamination or sequencing adapters. Note this may * remove some bacteria plasmid sequences. */ @Argument(doc = "Minimum reference contig length for non-viruses", fullName = MIN_NON_VIRUS_CONTIG_LENGTH_LONG_NAME, shortName = MIN_NON_VIRUS_CONTIG_LENGTH_SHORT_NAME, minValue = 0, minRecommendedValue = 500, maxRecommendedValue = 10000) public int minNonVirusContigLength = 0; @Override public Object doWork() { if (refseqCatalogPath == null && genbankCatalogPath == null) { throw new UserException.BadInput("At least one of --refseq-catalog or --genbank-catalog must be specified"); } logger.info("Parsing reference and files... (this may take a few minutes)"); final ReferenceDataSource reference = ReferenceDataSource.of(referenceArguments.getReferencePath()); if (reference.getSequenceDictionary() == null) { throw new UserException.BadInput("Reference sequence dictionary not found. Please build one using CreateSequenceDictionary."); } final List referenceRecords = reference.getSequenceDictionary().getSequences(); //Parse reference index, filling in data to taxIdToProperties and accessionToNameAndLength where possible final Map taxIdToProperties = new HashMap<>(); final Map> accessionToNameAndLength = PSBuildReferenceTaxonomyUtils.parseReferenceRecords(referenceRecords, taxIdToProperties); //Parse RefSeq catalog to determine taxonomic ID's of accession keys in accessionToNameAndLength Set accessionsNotFound = null; if (refseqCatalogPath != null) { try (final BufferedReader refseqCatalogStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderGz(refseqCatalogPath)) { accessionsNotFound = PSBuildReferenceTaxonomyUtils.parseCatalog(refseqCatalogStreamReader, accessionToNameAndLength, taxIdToProperties, false, null); } catch (IOException e) { throw new GATKException("Error reading RefSeq catalog", e); } } //Parse Genbank catalog if (genbankCatalogPath != null) { try (final BufferedReader genbankCatalogStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderGz(genbankCatalogPath)) { accessionsNotFound = PSBuildReferenceTaxonomyUtils.parseCatalog(genbankCatalogStreamReader, accessionToNameAndLength, taxIdToProperties, true, accessionsNotFound); } catch (IOException e) { throw new GATKException("Error reading GenBank catalog", e); } } if (accessionsNotFound != null && !accessionsNotFound.isEmpty()) { PSUtils.logItemizedWarning(logger, accessionsNotFound, "Did not find entries in the catalog for the following reference accessions"); } //Get the scientific name of every taxonomic node (even the ones not in the reference) try (final BufferedReader namesStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderTarGz(taxdumpPath, "names.dmp")) { PSBuildReferenceTaxonomyUtils.parseNamesFile(namesStreamReader, taxIdToProperties); } catch (IOException e) { throw new GATKException("Error reading taxdump names files", e); } //Gets the taxonomic rank (e.g. family, order, genus, species, etc.) and parent of each node try (final BufferedReader nodesStreamReader = PSBuildReferenceTaxonomyUtils.getBufferedReaderTarGz(taxdumpPath, "nodes.dmp")) { final Collection taxNotFound = PSBuildReferenceTaxonomyUtils.parseNodesFile(nodesStreamReader, taxIdToProperties); PSUtils.logItemizedWarning(logger, taxNotFound, "Did not find entry from reference sequence names or the names file for following some tax ID's. Setting name to tax_"); } catch (IOException e) { throw new GATKException("Error reading taxdump names files", e); } //Build the taxonomic tree and a map from accession ID's to taxonomic ID's logger.info("Building taxonomic database..."); final PSTree tree = PSBuildReferenceTaxonomyUtils.buildTaxonomicTree(taxIdToProperties); PSBuildReferenceTaxonomyUtils.removeUnusedTaxIds(taxIdToProperties, tree); final Map accessionToTaxId = PSBuildReferenceTaxonomyUtils.buildAccessionToTaxIdMap(taxIdToProperties, tree, minNonVirusContigLength); //Write output PSBuildReferenceTaxonomyUtils.writeTaxonomyDatabase(outputPath, new PSTaxonomyDatabase(tree, accessionToTaxId)); return null; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy