
package org.broadinstitute.hellbender.tools.spark.pathseq;
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Output;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import scala.Tuple2;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
public final class PSBuildReferenceTaxonomyUtils {
protected static final Logger logger = LogManager.getLogger(PSBuildReferenceTaxonomyUtils.class);
private static final String VERTICAL_BAR_DELIMITER_REGEX = "\\s*\\|\\s*";
/**
* Build set of accessions contained in the reference.
* Returns: a map from accession to the name and length of the record. If the sequence name contains the
* taxonomic ID, it instead gets added to taxIdToProperties. Later we merge both results into taxIdToProperties.
* Method: First, look for either "taxid|<taxid>|" or "ref|<accession>|" in the sequence name. If neither of
* those are found, use the first word of the name as the accession.
*/
protected static Map<String, Tuple2<String, Long>> parseReferenceRecords(final List<SAMSequenceRecord> dictionaryList,
final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {
final Map<String, Tuple2<String, Long>> accessionToNameAndLength = new HashMap<>();
for (final SAMSequenceRecord record : dictionaryList) {
final String recordName = record.getSequenceName();
final long recordLength = record.getSequenceLength();
final String[] tokens = recordName.split(VERTICAL_BAR_DELIMITER_REGEX);
String recordAccession = null;
int recordTaxId = PSTree.NULL_NODE;
for (int i = 0; i < tokens.length - 1 && recordTaxId == PSTree.NULL_NODE; i++) {
if (tokens[i].equals("ref")) {
recordAccession = tokens[i + 1];
} else if (tokens[i].equals("taxid")) {
recordTaxId = parseTaxonId(tokens[i + 1]);
}
}
if (recordTaxId == PSTree.NULL_NODE) {
if (recordAccession == null) {
final String[] tokens2 = tokens[0].split(" "); //Default accession to first word in the name
recordAccession = tokens2[0];
}
accessionToNameAndLength.put(recordAccession, new Tuple2<>(recordName, recordLength));
} else {
addReferenceAccessionToTaxon(recordTaxId, recordName, recordLength, taxIdToProperties);
}
}
return accessionToNameAndLength;
}
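/*
 * Illustrative usage sketch (not part of the original class): collecting the reference's accessions and
 * taxon IDs from its sequence dictionary. "dict" is an assumed htsjdk SAMSequenceDictionary loaded elsewhere.
 *
 *   final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties = new HashMap<>();
 *   final Map<String, Tuple2<String, Long>> accessionToNameAndLength =
 *       parseReferenceRecords(dict.getSequences(), taxIdToProperties);
 *   // Records whose names carried "taxid|<taxid>|" are already in taxIdToProperties; the returned
 *   // accessions still need to be resolved to taxon IDs via parseCatalog().
 */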
private static int parseTaxonId(final String taxonId) {
try {
return Integer.valueOf(taxonId);
} catch (final NumberFormatException e) {
throw new UserException.BadInput("Expected taxonomy ID to be an integer but found \"" + taxonId + "\"", e);
}
}
/**
* Helper classes for defining RefSeq and GenBank catalog formats. Columns should be given as 0-based indices.
*/
private interface AccessionCatalogFormat {
int getTaxIdColumn();
int getAccessionColumn();
}
private static final class RefSeqCatalogFormat implements AccessionCatalogFormat {
private static final int TAX_ID_COLUMN = 0;
private static final int ACCESSION_COLUMN = 2;
public int getTaxIdColumn() {
return TAX_ID_COLUMN;
}
public int getAccessionColumn() {
return ACCESSION_COLUMN;
}
}
private static final class GenBankCatalogFormat implements AccessionCatalogFormat {
private static final int TAX_ID_COLUMN = 6;
private static final int ACCESSION_COLUMN = 1;
public int getTaxIdColumn() {
return TAX_ID_COLUMN;
}
public int getAccessionColumn() {
return ACCESSION_COLUMN;
}
}
/**
* Builds maps of reference contig accessions to their taxonomic ids and vice versa.
* Input can be a RefSeq or GenBank catalog file. accessionsNotFoundIn is an initial set of
* accessions from the reference that have not yet been successfully looked up; if null,
* it is initialized to the accessionToNameAndLength key set by default.
*
* Returns a collection of reference accessions that could not be found, if any.
*/
protected static Set<String> parseCatalog(final BufferedReader reader,
final Map<String, Tuple2<String, Long>> accessionToNameAndLength,
final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties,
final boolean bGenBank,
final Set<String> accessionsNotFoundIn) {
final Set<String> accessionsNotFoundOut;
try {
String line;
final AccessionCatalogFormat catalogFormat = bGenBank ? new GenBankCatalogFormat() : new RefSeqCatalogFormat();
final int taxIdColumnIndex = catalogFormat.getTaxIdColumn();
final int accessionColumnIndex = catalogFormat.getAccessionColumn();
if (accessionsNotFoundIn == null) {
//If accessionsNotFoundIn is null, this is the first call to parseCatalog, so initialize the set to all accessions
accessionsNotFoundOut = new HashSet<>(accessionToNameAndLength.keySet());
} else {
//Otherwise this is a subsequent call and we continue to look for any remaining accessions
accessionsNotFoundOut = new HashSet<>(accessionsNotFoundIn);
}
final int minColumns = Math.max(taxIdColumnIndex, accessionColumnIndex) + 1;
long lineNumber = 1;
while ((line = reader.readLine()) != null && !line.isEmpty()) {
final String[] tokens = line.trim().split("\t", minColumns + 1);
if (tokens.length >= minColumns) {
final int taxId = parseTaxonId(tokens[taxIdColumnIndex]);
final String accession = tokens[accessionColumnIndex];
if (accessionToNameAndLength.containsKey(accession)) {
final Tuple2<String, Long> nameAndLength = accessionToNameAndLength.get(accession);
addReferenceAccessionToTaxon(taxId, nameAndLength._1, nameAndLength._2, taxIdToProperties);
accessionsNotFoundOut.remove(accession);
}
} else {
throw new UserException.BadInput("Expected at least " + minColumns + " tab-delimited columns in " +
"GenBank catalog file, but only found " + tokens.length + " on line " + lineNumber);
}
lineNumber++;
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile("Error reading from catalog file", e);
}
return accessionsNotFoundOut;
}
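/*
 * Illustrative usage sketch (not part of the original class): resolve accessions against a RefSeq catalog
 * first, then a GenBank catalog, carrying the unresolved set forward. The catalog paths are placeholders.
 *
 *   Set<String> accessionsNotFound = parseCatalog(getBufferedReaderGz("RefSeq-release.catalog.gz"),
 *       accessionToNameAndLength, taxIdToProperties, false, null);
 *   accessionsNotFound = parseCatalog(getBufferedReaderGz("gb_catalog.tsv.gz"),
 *       accessionToNameAndLength, taxIdToProperties, true, accessionsNotFound);
 *   PSUtils.logItemizedWarning(logger, accessionsNotFound,
 *       "Could not find catalog entries for the following reference accessions");
 */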
/**
* Parses scientific name of each taxon and puts it in taxIdToProperties
*/
protected static void parseNamesFile(final BufferedReader reader, final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {
try {
String line;
while ((line = reader.readLine()) != null) {
//Split into columns delimited by |
final String[] tokens = line.split(VERTICAL_BAR_DELIMITER_REGEX);
if (tokens.length < 4) {
throw new UserException.BadInput("Expected at least 4 columns in tax dump names file but found " + tokens.length);
}
final String nameType = tokens[3];
if (nameType.equals("scientific name")) {
final int taxId = parseTaxonId(tokens[0]);
final String name = tokens[1];
if (taxIdToProperties.containsKey(taxId)) {
taxIdToProperties.get(taxId).setName(name);
} else {
taxIdToProperties.put(taxId, new PSPathogenReferenceTaxonProperties(name));
}
}
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile("Error reading from taxonomy dump names file", e);
}
}
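/*
 * Illustrative usage sketch (not part of the original class): the names file is normally "names.dmp"
 * from the NCBI taxonomy dump archive (the archive path is a placeholder).
 *
 *   parseNamesFile(getBufferedReaderTarGz("taxdump.tar.gz", "names.dmp"), taxIdToProperties);
 */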
/**
* Gets the rank and parent of each taxon.
* Returns a Collection of tax IDs found in the nodes file that are not in taxIdToProperties (i.e. were not found in
* a reference sequence name using the taxid|<taxid>| tag or in the catalog file).
*/
protected static Collection<Integer> parseNodesFile(final BufferedReader reader, final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {
try {
final Collection<Integer> taxIdsNotFound = new ArrayList<>();
String line;
while ((line = reader.readLine()) != null) {
final String[] tokens = line.split(VERTICAL_BAR_DELIMITER_REGEX);
if (tokens.length < 3) {
throw new UserException.BadInput("Expected at least 3 columns in tax dump nodes file but found " + tokens.length);
}
final int taxId = parseTaxonId(tokens[0]);
final int parent = parseTaxonId(tokens[1]);
final String rank = tokens[2];
final PSPathogenReferenceTaxonProperties taxonProperties;
if (taxIdToProperties.containsKey(taxId)) {
taxonProperties = taxIdToProperties.get(taxId);
} else {
taxonProperties = new PSPathogenReferenceTaxonProperties("tax_" + taxId);
taxIdsNotFound.add(taxId);
}
taxonProperties.setRank(rank);
if (taxId != PSTaxonomyConstants.ROOT_ID) { //keep root's parent set to null
taxonProperties.setParent(parent);
}
taxIdToProperties.put(taxId, taxonProperties);
}
return taxIdsNotFound;
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile("Error reading from taxonomy dump nodes file", e);
}
}
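/*
 * Illustrative usage sketch (not part of the original class), using "nodes.dmp" from the same NCBI
 * taxonomy dump archive (the path is a placeholder):
 *
 *   final Collection<Integer> taxIdsNotFound =
 *       parseNodesFile(getBufferedReaderTarGz("taxdump.tar.gz", "nodes.dmp"), taxIdToProperties);
 *   PSUtils.logItemizedWarning(logger, taxIdsNotFound,
 *       "Did not find sequence or catalog entries for the following taxonomic IDs");
 */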
/**
* Helper function for building the map from tax id to reference contig accession
*/
private static void addReferenceAccessionToTaxon(final int taxId, final String accession, final long length, final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {
taxIdToProperties.putIfAbsent(taxId, new PSPathogenReferenceTaxonProperties());
taxIdToProperties.get(taxId).addAccession(accession, length);
}
/**
* Removes nodes not in the tree from the tax_id-to-properties map
*/
static void removeUnusedTaxIds(final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties,
final PSTree tree) {
taxIdToProperties.keySet().retainAll(tree.getNodeIDs());
}
/**
* Creates the accession-to-tax-ID map (an inversion of taxIdToProperties). Non-virus contigs shorter than minNonVirusContigLength are omitted.
*/
protected static Map<String, Integer> buildAccessionToTaxIdMap(final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties,
final PSTree tree,
final int minNonVirusContigLength) {
final Map<String, Integer> accessionToTaxId = new HashMap<>();
for (final int taxId : taxIdToProperties.keySet()) {
final boolean isVirus = tree.getPathOf(taxId).contains(PSTaxonomyConstants.VIRUS_ID);
final PSPathogenReferenceTaxonProperties taxonProperties = taxIdToProperties.get(taxId);
for (final String name : taxonProperties.getAccessions()) {
if (isVirus || taxonProperties.getAccessionLength(name) >= minNonVirusContigLength) {
accessionToTaxId.put(name, taxId);
}
}
}
return accessionToTaxId;
}
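/*
 * Illustrative usage sketch (not part of the original class). The length threshold of 0 is a placeholder
 * meaning that no non-virus contig is filtered out:
 *
 *   final Map<String, Integer> accessionToTaxId = buildAccessionToTaxIdMap(taxIdToProperties, tree, 0);
 */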
/**
* Returns a PSTree representing a reduced taxonomic tree containing only taxa present in the reference
*/
protected static PSTree buildTaxonomicTree(final Map<Integer, PSPathogenReferenceTaxonProperties> taxIdToProperties) {
//Build tree of all taxa
final PSTree tree = new PSTree(PSTaxonomyConstants.ROOT_ID);
final Collection<Integer> invalidIds = new HashSet<>(taxIdToProperties.size());
for (final int taxId : taxIdToProperties.keySet()) {
if (taxId != PSTaxonomyConstants.ROOT_ID) {
final PSPathogenReferenceTaxonProperties taxonProperties = taxIdToProperties.get(taxId);
if (taxonProperties.getName() != null && taxonProperties.getParent() != PSTree.NULL_NODE && taxonProperties.getRank() != null) {
tree.addNode(taxId, taxonProperties.getName(), taxonProperties.getParent(), taxonProperties.getTotalLength(), taxonProperties.getRank());
} else {
invalidIds.add(taxId);
}
}
}
PSUtils.logItemizedWarning(logger, invalidIds, "The following taxonomic IDs did not have name/taxonomy information (this may happen when the catalog and taxdump files are inconsistent)");
final Set<Integer> unreachableNodes = tree.removeUnreachableNodes();
if (!unreachableNodes.isEmpty()) {
PSUtils.logItemizedWarning(logger, unreachableNodes, "Removed " + unreachableNodes.size() + " unreachable tree nodes");
}
tree.checkStructure();
//Trim tree down to nodes corresponding only to reference taxa (and their ancestors)
final Set<Integer> relevantNodes = new HashSet<>();
for (final int taxonId : taxIdToProperties.keySet()) {
if (!taxIdToProperties.get(taxonId).getAccessions().isEmpty() && tree.hasNode(taxonId)) {
relevantNodes.addAll(tree.getPathOf(taxonId));
}
}
if (relevantNodes.isEmpty()) {
throw new UserException.BadInput("Did not find any taxa corresponding to reference sequence names.\n\n"
+ "Check that reference names follow one of the required formats:\n\n"
+ "\t...|ref||...\n"
+ "\t...|taxid||...\n"
+ "\t...");
}
tree.retainNodes(relevantNodes);
return tree;
}
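/*
 * Illustrative usage sketch (not part of the original class): build the tree once the reference records,
 * catalogs, names, and nodes have all been folded into taxIdToProperties, then drop map entries for taxa
 * that did not end up in the tree.
 *
 *   final PSTree tree = buildTaxonomicTree(taxIdToProperties);
 *   removeUnusedTaxIds(taxIdToProperties, tree);
 */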
/**
* Gets a buffered reader for a plain-text or gzipped file
* @param path File path
* @return Reader for the file
*/
public static BufferedReader getBufferedReaderGz(final String path) {
try {
return new BufferedReader(IOUtils.makeReaderMaybeGzipped(IOUtils.getPath(path)));
} catch (final IOException e) {
throw new UserException.BadInput("Could not open file " + path, e);
}
}
/**
* Gets a Reader for a file in a gzipped tarball
* @param tarPath Path to the tarball
* @param fileName File within the tarball
* @return The file's reader
*/
public static BufferedReader getBufferedReaderTarGz(final String tarPath, final String fileName) {
try {
InputStream result = null;
final TarArchiveInputStream tarStream = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(tarPath)));
TarArchiveEntry entry = tarStream.getNextEntry();
while (entry != null) {
if (entry.getName().equals(fileName)) {
result = tarStream;
break;
}
entry = tarStream.getNextEntry();
}
if (result == null) {
throw new UserException.BadInput("Could not find file " + fileName + " in tarball " + tarPath);
}
return new BufferedReader(new InputStreamReader(result));
} catch (final IOException e) {
throw new UserException.BadInput("Could not open compressed tarball file " + fileName + " in " + tarPath, e);
}
}
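/*
 * Illustrative usage sketch (not part of the original class): the taxonomy dump can be supplied either as
 * the full tarball or as an individually gzipped member file. "useTarball" and the paths are placeholders.
 *
 *   final BufferedReader namesReader = useTarball
 *       ? getBufferedReaderTarGz("taxdump.tar.gz", "names.dmp")
 *       : getBufferedReaderGz("names.dmp.gz");
 */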
/**
* Writes the taxonomy database to the specified local file path using Kryo serialization.
* NOTE: uses setReferences(false), which must also be set when reading the file. Does not work with nested
* objects that reference their parents.
*/
public static void writeTaxonomyDatabase(final String filePath, final PSTaxonomyDatabase taxonomyDatabase) {
try {
final Kryo kryo = new Kryo();
kryo.setReferences(false);
Output output = new Output(new FileOutputStream(filePath));
kryo.writeObject(output, taxonomyDatabase);
output.close();
} catch (final FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile("Could not serialize objects to file", e);
}
}
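/*
 * Illustrative usage sketch (not part of the original class). The PSTaxonomyDatabase constructor shown
 * here (tree plus accession-to-tax-ID map) is an assumption about that class's API, and the output path
 * is a placeholder:
 *
 *   final PSTaxonomyDatabase taxonomyDatabase = new PSTaxonomyDatabase(tree, accessionToTaxId);
 *   writeTaxonomyDatabase("pathseq_taxonomy.db", taxonomyDatabase);
 *   // Note: the Kryo instance used to deserialize this file must also call setReferences(false).
 */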
}