de.julielab.genemapper.resources.GeneXMLFromASN1Extractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import com.google.common.collect.Sets;
import de.julielab.genemapper.resources.ncbigene.GeneXMLUtils;
import de.julielab.genemapper.resources.util.UncheckedGeneMapperResourcesException;
import de.julielab.java.utilities.CLIInteractionUtilities;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
*
* Reads ASN.1 files obtainable from
* ftp://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/ and extracts the required gene
* information for the JCoRe Gene Mapper.
*
*
* This approach is much more robust than downloading everything from the
* Internet and is most likely also much quicker. However, it relies on an
* external tool, gene2xml, which is written in C and thus platform dependent.
* This might cause issues depending on the environment. The available gene2xml
* programs are located at
* ftp://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/gene2xml/. The path to
* the program to use is given as the last parameter when calling this class.
* This class was used with the linux64 program.
*
*
* @author faessler
*/
public class GeneXMLFromASN1Extractor {
    private static final Logger log = LoggerFactory.getLogger(GeneXMLFromASN1Extractor.class);

    /**
     * Command line entry point. Validates the arguments, decides whether the gene
     * meta information cache in the storage directory has to be (re)built and, if
     * so, runs the extraction.
     * <p>
     * The cache is considered outdated when the file listing the already extracted
     * taxonomy IDs does not exist or is older than the ASN.1 file. If an outdated
     * cache exists, the user is asked for confirmation on standard input before the
     * storage directory is deleted; declining terminates the JVM with exit code 2.
     *
     * @param args exactly four arguments in this order: the taxonomy ID file, the
     *             storage directory for the created resource files, the ASN.1 file
     *             to extract data from and the path to the gene2xml executable
     * @throws IllegalArgumentException if the gene2xml executable or the ASN.1 file
     *                                  does not exist
     * @throws IllegalStateException    if the gene2xml executable may not be
     *                                  executed by the current user
     * @throws Exception                if reading, extracting or writing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 4) {
            System.err.println("Usage: " + GeneXMLFromASN1Extractor.class.getSimpleName()
                    + " <taxonomy ID file> <storage directory> <ASN.1 file> <gene2xml executable>");
            // Wrong invocation is an error; signal it with a non-zero exit status.
            System.exit(1);
        }
        File taxIdFile = new File(args[0]);
        File storageDirectory = new File(args[1]);
        File asnFile = new File(args[2]);
        File gene2xml = new File(args[3]);
        log.info("Taxonomy ID file: {}", taxIdFile);
        log.info("Storage directory for created resource files: {}", storageDirectory);
        log.info("ASN.1 file to extract data from: {}", asnFile);
        log.info("gene2xml executable path: {}", gene2xml);
        if (!gene2xml.exists()) {
            throw new IllegalArgumentException(
                    "The gene2xml executable path " + gene2xml.getAbsolutePath() + " does not exist.");
        } else if (!gene2xml.canExecute()) {
            throw new IllegalStateException("The gene2xml executable at " + gene2xml.getAbsolutePath()
                    + " is not allowed to be executed by the current user. Set executable rights (the 'x' flag un *nix systems) and try again.");
        }
        // Fail before touching the cache: without the input file nothing can be
        // extracted, so an existing cache must not be deleted on its behalf.
        if (!asnFile.exists()) {
            throw new IllegalArgumentException("The ASN.1 file " + asnFile.getAbsolutePath() + " does not exist.");
        }

        List<File> geneXmlDownloaderFiles = GeneXMLUtils.getMetaFiles(storageDirectory);
        // Index 2 is the file listing the taxonomy IDs that have already been extracted.
        File downloadedTaxIdsFile = geneXmlDownloaderFiles.get(2);
        // Check if the gene meta information cache at the given directory is
        // deprecated.
        boolean dbFileIsNewer = !downloadedTaxIdsFile.exists()
                || downloadedTaxIdsFile.lastModified() < asnFile.lastModified();
        if (dbFileIsNewer && downloadedTaxIdsFile.exists()) {
            boolean proceed = CLIInteractionUtilities.readYesNoFromStdInWithMessage("The ASN.1 file at "
                    + asnFile.getAbsolutePath() + " is newer than the meta cache files at "
                    + storageDirectory.getAbsolutePath()
                    + ". By continuing, the old cache will completely deleted and built from scratch for the taxonomy IDs given by "
                    + taxIdFile.getAbsolutePath() + ". Do you wish to proceed?", true);
            if (!proceed) {
                log.info("Aborting due to user wish.");
                System.exit(2);
            }
            FileUtils.deleteDirectory(storageDirectory);
        }
        if (!storageDirectory.exists()) {
            log.info("Creating directory {}", storageDirectory);
            if (!storageDirectory.mkdirs() && !storageDirectory.isDirectory()) {
                throw new IOException(
                        "Could not create the storage directory " + storageDirectory.getAbsolutePath());
            }
        }

        // Determine the tax IDs for which we need to extract gene meta information.
        // Only the emptiness of the result is inspected here.
        Set<?> missingTaxIds = GeneXMLUtils.determineMissingTaxIds(taxIdFile, storageDirectory, asnFile,
                downloadedTaxIdsFile);
        if (dbFileIsNewer || !missingTaxIds.isEmpty()) {
            if (!missingTaxIds.isEmpty()) {
                log.info("There are missing taxonomy IDs for which gene meta information needs to be extracted.");
            }
            if (dbFileIsNewer) {
                log.info(
                        "The {} file has been updated and is newer than the existing gene meta information. The old gene meta information cache has been deleted and is now built again.",
                        asnFile);
            }
            log.info("Extracting gene meta information from {}. This will take a few hours.", asnFile);
            extractGeneInfoFromASN1(asnFile, geneXmlDownloaderFiles, taxIdFile, gene2xml);
            log.info("Finished gene meta information extraction.");
        } else {
            log.info(
                    "Gene meta information for all given taxonomy IDs has already been extracted. Nothing to do, exiting.");
        }
    }

    /**
     * Runs gene2xml on the given ASN.1 file, passes its standard output to
     * {@link GeneXMLUtils#extractAndWriteGeneInfoToFile} and afterwards records the
     * requested taxonomy IDs as extracted.
     * <p>
     * The taxonomy IDs are only recorded after the output files have been closed
     * and gene2xml has terminated with exit code 0, so a failed conversion is not
     * mistaken for a complete cache.
     *
     * @param inputFile              the ASN.1 file handed to gene2xml via {@code -i}
     * @param geneXmlDownloaderFiles the cache files; indices 0, 1, 3 and 4 are
     *                               opened as extraction output (summaries, protein
     *                               names, RefSeq status, EC numbers), index 2 is
     *                               the list of already extracted taxonomy IDs
     * @param requestedTaxIdsFile    UTF-8 file with one requested taxonomy ID per line
     * @param gene2xml               the gene2xml executable
     * @throws IOException        if gene2xml cannot be started, terminates with a
     *                            non-zero exit code, or reading/writing a file fails
     * @throws XMLStreamException if the extraction reports an XML streaming error
     */
    private static void extractGeneInfoFromASN1(File inputFile, List<File> geneXmlDownloaderFiles,
            File requestedTaxIdsFile, File gene2xml) throws IOException, XMLStreamException {
        // -i - input
        // -c - compressed
        // -b - binary
        // An explicit argument list keeps paths containing whitespace intact, which
        // the tokenizing Runtime.exec(String) would split into several arguments.
        ProcessBuilder builder = new ProcessBuilder(gene2xml.getAbsolutePath(), "-i", inputFile.getPath(), "-c",
                "-b");
        // Nobody reads the child's stderr pipe; inherit it so that gene2xml cannot
        // block on a full pipe and its diagnostics remain visible.
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);
        Process proc = builder.start();
        try (InputStream is = proc.getInputStream()) {
            try (OutputStream osSummaries = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(0));
                 OutputStream osProtnames = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(1));
                 OutputStream osRefSeqStatus = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(3));
                 OutputStream osECNumbers = FileUtilities.getOutputStreamToFile(geneXmlDownloaderFiles.get(4))) {
                GeneXMLUtils.extractAndWriteGeneInfoToFile(osSummaries, osProtnames, osRefSeqStatus, osECNumbers,
                        is);
            }
            awaitSuccessfulExit(proc, inputFile);
        } finally {
            // No effect if the process has already terminated; otherwise makes sure
            // that a failed extraction does not leave gene2xml running.
            proc.destroy();
        }
        recordExtractedTaxIds(geneXmlDownloaderFiles.get(2), requestedTaxIdsFile);
    }

    /**
     * Waits for the gene2xml process to terminate and fails if it reports an error.
     *
     * @param proc      the running gene2xml process
     * @param inputFile the file being converted, used for the error message only
     * @throws IOException if the process exits with a non-zero code or the wait is
     *                     interrupted
     */
    private static void awaitSuccessfulExit(Process proc, File inputFile) throws IOException {
        int exitCode;
        try {
            exitCode = proc.waitFor();
        } catch (InterruptedException e) {
            // Restore the interrupt flag for code further up the stack.
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for gene2xml to finish converting " + inputFile, e);
        }
        if (exitCode != 0) {
            throw new IOException("gene2xml terminated with exit code " + exitCode + " while converting "
                    + inputFile.getAbsolutePath());
        }
    }

    /**
     * Rewrites the list of already extracted taxonomy IDs as the union of its
     * current non-blank lines and the non-blank lines of the requested taxonomy ID
     * file.
     *
     * @param alreadyExtractedTaxIdList the list file to update; may not exist yet
     * @param requestedTaxIdsFile       UTF-8 file with one requested taxonomy ID per line
     * @throws IOException if reading or writing either file fails
     */
    private static void recordExtractedTaxIds(File alreadyExtractedTaxIdList, File requestedTaxIdsFile)
            throws IOException {
        Set<String> alreadyExtractedTaxIds;
        if (alreadyExtractedTaxIdList.exists()) {
            // The reader has to be closed before the same file is opened for writing.
            try (BufferedReader br = FileUtilities.getReaderFromFile(alreadyExtractedTaxIdList)) {
                alreadyExtractedTaxIds = br.lines().filter(line -> line.trim().length() != 0)
                        .collect(Collectors.toSet());
            }
        } else {
            alreadyExtractedTaxIds = Collections.emptySet();
        }
        Set<String> requestedTaxIds = FileUtils.readLines(requestedTaxIdsFile, "UTF-8").stream()
                .filter(line -> line.trim().length() != 0).collect(Collectors.toSet());
        try (BufferedWriter bw = FileUtilities.getWriterToFile(alreadyExtractedTaxIdList)) {
            for (String taxId : Sets.union(alreadyExtractedTaxIds, requestedTaxIds)) {
                bw.write(taxId);
                bw.newLine();
            }
        }
    }
}