All downloads are free. Search and download functionality uses the official Maven repository.

de.julielab.genemapper.resources.GeneXMLFromASN1Extractor Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import com.google.common.collect.Sets;
import de.julielab.genemapper.resources.ncbigene.GeneXMLUtils;
import de.julielab.genemapper.resources.util.UncheckedGeneMapperResourcesException;
import de.julielab.java.utilities.CLIInteractionUtilities;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * 

* Reads ASN.1 files obtainable from * ftp://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/ and extracts the required gene * information for the JCoRe Gene Mapper. *

*

* This approach is much more robust than downloading everything from the * Internet and is most likely also much quicker. However, it relies on an * external tool, gene2xml, which is written in C and thus platform dependent. * This might cause issues depending on the environment. The available gene2xml * programs are located at * ftp://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/gene2xml/. The path to * the program to use is given as the last parameter when calling this class. * This class was used with the linux64 program. *

* * @author faessler */ public class GeneXMLFromASN1Extractor { private static final Logger log = LoggerFactory.getLogger(GeneXMLFromASN1Extractor.class); /** * The path to the gene2xml program to use. */ public static void main(String[] args) throws Exception { if (args.length != 4) { System.err.println("Usage: " + GeneXMLFromASN1Extractor.class.getSimpleName() + " "); System.exit(0); } File taxIdFile = new File(args[0]); File storageDirectory = new File(args[1]); File asnFile = new File(args[2]); File gene2xml = new File(args[3]); log.info("Taxonomy ID file: {}", taxIdFile); log.info("Storage directory for created resource files: {}", storageDirectory); log.info("ASN.1 file to extract data from: {}", asnFile); log.info("gene2xml executable path: {}", gene2xml); if (!gene2xml.exists()) throw new IllegalArgumentException( "The gene2xml executable path " + gene2xml.getAbsolutePath() + " does not exist."); else if (!gene2xml.canExecute()) { throw new IllegalStateException("The gene2xml executable at " + gene2xml.getAbsolutePath() + " is not allowed to be executed by the current user. Set executable rights (the 'x' flag un *nix systems) and try again."); } List geneXmlDownloaderFiles = GeneXMLUtils.getMetaFiles(storageDirectory); File downloadedTaxIdsFile = geneXmlDownloaderFiles.get(2); // Check if the gene meta information cache at the given directory is // deprecated. boolean dbFileIsNewer = !asnFile.exists() || !downloadedTaxIdsFile.exists() || downloadedTaxIdsFile.lastModified() < asnFile.lastModified(); if (dbFileIsNewer && downloadedTaxIdsFile.exists()) { if (!CLIInteractionUtilities.readYesNoFromStdInWithMessage("The ASN.1 file at " + asnFile.getAbsolutePath() + " is newer than the meta cache files at " + storageDirectory.getAbsolutePath() + ". By continuing, the old cache will completely deleted and built from scratch for the taxonomy IDs given by " + taxIdFile.getAbsolutePath() + ". 
Do you wish to proceed?", true)) { log.info("Aborting due to user wish."); System.exit(2); } FileUtils.deleteDirectory(storageDirectory); } if (!storageDirectory.exists()) { log.info("Creating directory {}", storageDirectory); storageDirectory.mkdirs(); } // Determine the tax IDs for which we need to extract gene meta information. Set missingTaxIds = GeneXMLUtils.determineMissingTaxIds(taxIdFile, storageDirectory, asnFile, downloadedTaxIdsFile); if (dbFileIsNewer || !missingTaxIds.isEmpty()) { if (!missingTaxIds.isEmpty()) log.info("There are missing taxonomy IDs for which gene meta information needs to be extracted."); if (dbFileIsNewer) log.info( "The {} file has been updated and is newer than the existing gene meta information. The old gene meta information cache has been deleted and is now built again.", asnFile); log.info("Extracting gene meta information from {}. This will take a few hours.", asnFile); extractGeneInfoFromASN1(asnFile, geneXmlDownloaderFiles, taxIdFile, gene2xml); log.info("Finished gene meta information extraction."); } else { log.info( "Gene meta information for all given taxonomy IDs has already been extracted. Nothing to do, exiting."); } } /** * This method decompresses the input file and extracts xml data for each of the * given organisms. 
* * @param inputFile * @param geneXmlDownloaderFiles * @param requestedTaxIdsFile * @param gene2xml * @throws IOException * @throws XMLStreamException */ private static void extractGeneInfoFromASN1(File inputFile, List geneXmlDownloaderFiles, File requestedTaxIdsFile, File gene2xml) throws IOException, XMLStreamException { // -i - input // -c - compressed // -b - binary Process proc = Runtime.getRuntime().exec(gene2xml.getAbsolutePath() + " -i " + inputFile + " -c -b"); try (InputStream is = proc.getInputStream(); OutputStream osSummaries = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(0)); OutputStream osProtnames = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(1)); OutputStream osRefSeqStatus = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(3)); OutputStream osECNumbers = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(4))) { GeneXMLUtils.extractAndWriteGeneInfoToFile(osSummaries, osProtnames, osRefSeqStatus, osECNumbers, is); File alreadyExtractedTaxIdList = geneXmlDownloaderFiles.get(2); Set alreadyExtractedTaxIds = alreadyExtractedTaxIdList.exists() ? FileUtilities.getReaderFromFile(alreadyExtractedTaxIdList).lines() .filter(line -> line.trim().length() != 0).collect(Collectors.toSet()) : Collections.emptySet(); Set requestedTaxIds = FileUtils.readLines(requestedTaxIdsFile, "UTF-8").stream() .filter(line -> line.trim().length() != 0).collect(Collectors.toSet()); try (BufferedWriter bw = FileUtilities.getWriterToFile(alreadyExtractedTaxIdList)) { Sets.union(alreadyExtractedTaxIds, requestedTaxIds).stream().forEach(t -> { try { bw.write(t); bw.newLine(); } catch (IOException e) { throw new UncheckedGeneMapperResourcesException(e); } }); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy