/*
 * de.julielab.genemapper.resources.ncbigene.GeneXMLUtils (artifact: gene-mapper-resources)
 *
 * This project assembles the code and files required to build the dictionaries and indexes
 * used by the JCoRe Gene Mapper.
 */
package de.julielab.genemapper.resources.ncbigene;

import com.google.common.collect.Sets;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Static helpers that extract gene meta information (summaries, protein names, RefSeq / gene track
 * status, EC numbers, taxonomy IDs) from NCBI Gene XML and write it to tab-separated meta files.
 */
public class GeneXMLUtils {

    public static final String TAXIDS_FILENAME = "genexmldownloader.taxids.gz";
    public static final String EG2ENTREZGENE_PROT_FILENAME = "eg2entrezgene_prot-genexmldownloader.gz";
    public static final String EG2SUMMARY_FILENAME = "eg2summary-genexmldownloader.gz";
    private static final String EG2REFSEQ_AND_TRACK_STATUS_FILENAME = "eg2refseq_genetrack_status-genexmldownloader.gz";
    private static final String EG2EC_FILENAME = "eg2ecnumber-genexmldownloader.gz";
    private static final Logger log = LoggerFactory.getLogger(GeneXMLUtils.class);
    /** Character encoding used for every meta file line written or read by this class. */
    private static final String ENCODING = "UTF-8";

    /**
     * Writes the information of one batch of gene extracts as tab-separated lines to the given streams.
     * <ul>
     * <li>{@code osSummaries}: {@code geneId<TAB>summary}, only for extracts that have a summary</li>
     * <li>{@code osProtnames}: one {@code geneId<TAB>name} line per protein name and one for the
     * protein description, if present</li>
     * <li>{@code osRefSeqAndTrackStatus}: one line per extract with gene ID, RefSeq status (empty if
     * absent), gene track status value and gene track status; absent track values are written as the
     * literal {@code null} produced by string concatenation</li>
     * <li>{@code osECNumbers}: {@code geneId<TAB>ecNumber}, only for extracts that have an EC number</li>
     * </ul>
     * Each stream is locked via {@code synchronized} while its part of the batch is written. This
     * method does not close the streams.
     *
     * @param geneExtractList        The extracts of the current XML batch.
     * @param osSummaries            Target for gene summaries.
     * @param osProtnames            Target for protein names and descriptions.
     * @param osRefSeqAndTrackStatus Target for RefSeq and gene track status entries.
     * @param osECNumbers            Target for EC numbers.
     * @throws IOException If writing to one of the streams fails.
     */
    public static void writeGeneInfoToFile(List<GeneXmlExtract> geneExtractList, OutputStream osSummaries,
                                           OutputStream osProtnames, OutputStream osRefSeqAndTrackStatus, OutputStream osECNumbers) throws IOException {
        log.trace("Writing gene summaries of current XML batch.");
        synchronized (osSummaries) {
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.summary != null) {
                    writeLine(osSummaries, extract.geneId + "\t" + extract.summary);
                }
            }
        }

        synchronized (osProtnames) {
            log.trace("Writing entrezgene_prot names of current XML batch.");
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.entrezgeneProt == null) {
                    continue;
                }
                if (extract.entrezgeneProt.protrefName != null) {
                    for (String protName : extract.entrezgeneProt.protrefName) {
                        writeLine(osProtnames, extract.geneId + "\t" + protName);
                    }
                }
                if (extract.entrezgeneProt.protrefDesc != null) {
                    writeLine(osProtnames, extract.geneId + "\t" + extract.entrezgeneProt.protrefDesc);
                }
            }
        }

        synchronized (osRefSeqAndTrackStatus) {
            log.trace("Writing entrez gene RefSeq status entries of current XML batch");
            for (GeneXmlExtract extract : geneExtractList) {
                String refSeqStatus = extract.refSeqStatus != null ? extract.refSeqStatus : "";
                writeLine(osRefSeqAndTrackStatus, extract.geneId + "\t" + refSeqStatus + "\t"
                        + extract.geneTrackStatusValue + "\t" + extract.geneTrackStatus);
            }
        }

        synchronized (osECNumbers) {
            log.trace("Writing EC numbers of current XML batch");
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.ecNumber != null) {
                    writeLine(osECNumbers, extract.geneId + "\t" + extract.ecNumber);
                }
            }
        }
    }

    /** Writes {@code line} followed by a newline to {@code os} using the class-wide encoding. */
    private static void writeLine(OutputStream os, String line) throws IOException {
        IOUtils.write(line + "\n", os, ENCODING);
    }

    /**
     * Parses NCBI Gene XML from the given stream and collects one {@link GeneXmlExtract} per
     * {@code Entrezgene} element. The stream is always closed before this method returns, whether
     * parsing succeeds or fails.
     *
     * @param openStream The XML input; closed by this method.
     * @return The extracts in document order.
     * @throws XMLStreamException If the XML cannot be parsed.
     * @throws IOException        If closing the stream fails.
     */
    public static List<GeneXmlExtract> extractGeneInfoFromXml(InputStream openStream)
            throws XMLStreamException, IOException {
        List<GeneXmlExtract> geneExtractList = new ArrayList<>();

        // try-with-resources closes the stream even when parsing aborts with an exception.
        try (InputStream in = openStream) {
            // NOTE(review): the factory is used with its default settings (DTD / external entity
            // handling untouched) - confirm this is acceptable for the XML sources fed in here.
            XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(in);
            try {
                GeneXmlExtract currentXmlExtract = null;
                boolean inEntrezGeneSource = false;
                boolean inTaxonDbtag = false;
                boolean inRefSeqStatusCommentary = false;
                boolean inEntrezgene_comments = false;
                boolean inECNumber = false;
                while (parser.hasNext()) {
                    switch (parser.getEventType()) {
                        case XMLStreamConstants.START_ELEMENT:
                            switch (parser.getLocalName()) {
                                case "Entrezgene":
                                    currentXmlExtract = new GeneXmlExtract();
                                    // Every record starts with a clean state so that a flag left
                                    // dangling by the previous record cannot leak into this one.
                                    inEntrezGeneSource = false;
                                    inTaxonDbtag = false;
                                    inRefSeqStatusCommentary = false;
                                    inEntrezgene_comments = false;
                                    inECNumber = false;
                                    break;
                                case "Entrezgene_prot":
                                    currentXmlExtract.entrezgeneProt = readEntrezgeneProtFromXml(parser);
                                    break;
                                case "Gene-track_geneid":
                                    currentXmlExtract.geneId = parser.getElementText();
                                    break;
                                case "Entrezgene_summary":
                                    currentXmlExtract.summary = parser.getElementText();
                                    break;
                                case "Entrezgene_source":
                                    inEntrezGeneSource = true;
                                    break;
                                case "Dbtag_db":
                                    if (parser.getElementText().equals("taxon")) {
                                        inTaxonDbtag = true;
                                    }
                                    break;
                                case "Object-id_id":
                                    // Only the ID of a "taxon" Dbtag within the source section is the tax ID.
                                    if (inEntrezGeneSource && inTaxonDbtag) {
                                        currentXmlExtract.taxId = parser.getElementText();
                                    }
                                    break;
                                case "Gene-commentary_heading":
                                    if (parser.getElementText().equals("RefSeq Status")) {
                                        inRefSeqStatusCommentary = true;
                                    }
                                    break;
                                case "Entrezgene_comments":
                                    inEntrezgene_comments = true;
                                    break;
                                case "Gene-commentary_label": {
                                    String label = parser.getElementText();
                                    // The first label after a "RefSeq Status" heading carries the status.
                                    if (inEntrezgene_comments && inRefSeqStatusCommentary) {
                                        currentXmlExtract.refSeqStatus = label;
                                        inRefSeqStatusCommentary = false;
                                    }
                                    if (label.equals("EC")) {
                                        inECNumber = true;
                                    }
                                    break;
                                }
                                case "Gene-commentary_text":
                                    if (inECNumber) {
                                        currentXmlExtract.ecNumber = parser.getElementText();
                                    }
                                    break;
                                case "Gene-track_status":
                                    // The attribute must be read before getElementText() advances the
                                    // parser. A null namespace means "do not check the namespace".
                                    currentXmlExtract.geneTrackStatusValue = parser.getAttributeValue(null, "value");
                                    currentXmlExtract.geneTrackStatus = parser.getElementText();
                                    break;
                                default:
                                    break;
                            }
                            break;
                        case XMLStreamConstants.END_ELEMENT:
                            switch (parser.getLocalName()) {
                                case "Entrezgene":
                                    geneExtractList.add(currentXmlExtract);
                                    break;
                                case "Entrezgene_source":
                                    inEntrezGeneSource = false;
                                    break;
                                case "Dbtag":
                                    inTaxonDbtag = false;
                                    break;
                                case "Entrezgene_comments":
                                    inEntrezgene_comments = false;
                                    break;
                                case "Gene-commentary":
                                    inECNumber = false;
                                    break;
                                default:
                                    break;
                            }
                            break;
                        default:
                            break;
                    }
                    parser.next();
                }
            } finally {
                parser.close();
            }
        }
        return geneExtractList;
    }

    /**
     * Reads the protein names and the protein description from an {@code Entrezgene_prot} element.
     * The parser must be positioned on the start tag of that element; on return it is positioned on
     * the matching end tag.
     *
     * @param parser The parser, positioned on the {@code Entrezgene_prot} start tag.
     * @return The collected protein information.
     * @throws XMLStreamException    If the XML cannot be parsed.
     * @throws IllegalStateException If the parser is not positioned on {@code Entrezgene_prot}.
     */
    private static EntrezgeneProt readEntrezgeneProtFromXml(XMLStreamReader parser) throws XMLStreamException {
        EntrezgeneProt prot = new EntrezgeneProt();

        String currentTag = parser.getLocalName();
        if (!currentTag.equals("Entrezgene_prot")) {
            throw new IllegalStateException(
                    "Expected the tag Entrezgene_prot to begin reading protein names but got " + currentTag);
        }
        do {
            parser.next();
            switch (parser.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    currentTag = parser.getLocalName();
                    switch (currentTag) {
                        case "Prot-ref_name_E":
                            prot.addProtrefName(parser.getElementText());
                            break;
                        case "Prot-ref_desc":
                            prot.protrefDesc = parser.getElementText();
                            break;
                        default:
                            break;
                    }
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    currentTag = parser.getLocalName();
                    break;
                default:
                    break;
            }
            // Stop as soon as the end tag of Entrezgene_prot itself has been reached.
        } while (parser.getEventType() != XMLStreamConstants.END_ELEMENT || !currentTag.equals("Entrezgene_prot"));
        return prot;
    }

    /**
     * Extracts the gene information from the given XML stream and immediately writes it to the given
     * output streams.
     *
     * @param osSummaries           Target for gene summaries.
     * @param osProtnames           Target for protein names and descriptions.
     * @param ofRefSeqAndTracStatus Target for RefSeq and gene track status entries.
     * @param osECNumbers           Target for EC numbers.
     * @param is                    The XML input; closed after parsing.
     * @return The extracts that were read and written.
     * @throws XMLStreamException If the XML cannot be parsed.
     * @throws IOException        If reading or writing fails.
     */
    public static List<GeneXmlExtract> extractAndWriteGeneInfoToFile(OutputStream osSummaries, OutputStream osProtnames,
                                                                     OutputStream ofRefSeqAndTracStatus, OutputStream osECNumbers, InputStream is) throws XMLStreamException, IOException {
        List<GeneXmlExtract> geneExtractList = extractGeneInfoFromXml(is);
        writeGeneInfoToFile(geneExtractList, osSummaries, osProtnames, ofRefSeqAndTracStatus, osECNumbers);
        return geneExtractList;
    }

    /**
     * Determines the requested taxonomy IDs for which meta data still has to be downloaded.
     * <p>
     * If {@code dbFile} does not exist or was modified after {@code downloadedTaxIdsFile}, all
     * requested IDs are returned. Otherwise the IDs listed in {@code downloadedTaxIdsFile} are
     * subtracted from the requested ones.
     * <p>
     * <b>Note:</b> if nothing is missing, this method terminates the JVM via {@code System.exit(0)}.
     *
     * @param taxIdFile            File with the requested taxonomy IDs, one per line; blank lines are ignored.
     * @param storageDirectory     Directory of the meta data; only used for log messages.
     * @param dbFile               The gene_info file the meta data is compared against.
     * @param downloadedTaxIdsFile File listing the taxonomy IDs that were already downloaded.
     * @return The taxonomy IDs that still need to be processed.
     * @throws IOException If one of the files cannot be read.
     */
    public static Set<String> determineMissingTaxIds(File taxIdFile, File storageDirectory, File dbFile,
                                                     File downloadedTaxIdsFile) throws IOException {
        boolean dbFileIsNewer = !dbFile.exists() || downloadedTaxIdsFile.lastModified() < dbFile.lastModified();
        Set<String> taxIds = FileUtils.readLines(taxIdFile, ENCODING).stream()
                .filter(line -> line.trim().length() != 0)
                .collect(Collectors.toSet());

        if (dbFileIsNewer) {
            String reason = dbFile.exists()
                    ? "is newer than the existing meta data in " + storageDirectory.getAbsolutePath()
                    : "does not exist";
            log.info("The given gene_info file {}. The data will be downloaded and created from scratch.", reason);
            return taxIds;
        }

        Set<String> downloadedTaxIds = readDownloadedTaxIds(downloadedTaxIdsFile);
        log.debug("already created: {}", downloadedTaxIds);
        log.debug("requested: {}", taxIds);
        Set<String> missingTaxIds = Sets.difference(taxIds, downloadedTaxIds);
        log.debug("difference: {}", missingTaxIds);
        if (missingTaxIds.isEmpty()) {
            log.info("Files for given taxonomy IDs have already been created in {}", storageDirectory);
            // Kept for backward compatibility: callers rely on the process ending here.
            System.exit(0);
        }
        log.info(
                "Got {} taxonomy IDs for which gene meta information need to be downloaded and {} requested IDs already downloaded",
                missingTaxIds.size(), taxIds.size() - missingTaxIds.size());
        return missingTaxIds;
    }

    /** Reads the non-blank lines of the given file, or returns an empty set if it does not exist. */
    private static Set<String> readDownloadedTaxIds(File downloadedTaxIdsFile) throws IOException {
        if (!downloadedTaxIdsFile.exists()) {
            return Collections.emptySet();
        }
        // The reader is closed as soon as its lines have been collected.
        try (BufferedReader reader = FileUtilities.getReaderFromFile(downloadedTaxIdsFile)) {
            return reader.lines().filter(line -> line.trim().length() != 0).collect(Collectors.toSet());
        }
    }

    /**
     * Returns the file objects for the meta information files retrieved from
     * NCBI Gene XML data. The list of files contains, in this order,
     *
     * <ol>
     * <li>gene summaries file</li>
     * <li>gene protein names file</li>
     * <li>taxonomy ID list of organisms for which the other files contain
     * records</li>
     * <li>gene RefSeq and gene track status file</li>
     * <li>gene EC numbers file</li>
     * </ol>
     *
     * @param storageDirectory The base directory where to find/store the meta data.
     * @return An ordered list of gene meta data files.
     */
    public static List<File> getMetaFiles(File storageDirectory) {
        File baseDirectory = storageDirectory.getAbsoluteFile();
        File summariesFile = new File(baseDirectory, GeneXMLUtils.EG2SUMMARY_FILENAME);
        File proteinNamesFile = new File(baseDirectory, GeneXMLUtils.EG2ENTREZGENE_PROT_FILENAME);
        File downloadedTaxIdsFile = new File(baseDirectory, GeneXMLUtils.TAXIDS_FILENAME);
        File refseqStatusFile = new File(baseDirectory, GeneXMLUtils.EG2REFSEQ_AND_TRACK_STATUS_FILENAME);
        File ecNumbersFile = new File(baseDirectory, GeneXMLUtils.EG2EC_FILENAME);
        return Arrays.asList(summariesFile, proteinNamesFile, downloadedTaxIdsFile, refseqStatusFile, ecNumbersFile);
    }

}