de.julielab.genemapper.resources.ncbigene.GeneXMLUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources.ncbigene;
import com.google.common.collect.Sets;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Static helpers that extract gene meta information (summaries, protein names,
 * RefSeq / gene track status, EC numbers, taxonomy IDs) from NCBI Gene XML and
 * write it to tab-separated output streams.
 */
public class GeneXMLUtils {
    public static final String TAXIDS_FILENAME = "genexmldownloader.taxids.gz";
    public static final String EG2ENTREZGENE_PROT_FILENAME = "eg2entrezgene_prot-genexmldownloader.gz";
    public static final String EG2SUMMARY_FILENAME = "eg2summary-genexmldownloader.gz";
    private static final String EG2REFSEQ_AND_TRACK_STATUS_FILENAME = "eg2refseq_genetrack_status-genexmldownloader.gz";
    private static final String EG2EC_FILENAME = "eg2ecnumber-genexmldownloader.gz";
    /** Character encoding used for everything this class reads or writes. */
    private static final String ENCODING = "UTF-8";
    private static final Logger log = LoggerFactory.getLogger(GeneXMLUtils.class);

    /**
     * Writes the extracted gene information as tab-separated lines to the four
     * given streams. Each stream is written while holding its own monitor, so
     * the lines of one batch are not interleaved with those of another thread
     * that synchronizes on the same stream object.
     *
     * @param geneExtractList        the gene records to write
     * @param osSummaries            receives {@code geneId<TAB>summary} for records with a summary
     * @param osProtnames            receives {@code geneId<TAB>name} for each protein name and for the
     *                               protein description, if present
     * @param osRefSeqAndTrackStatus receives
     *                               {@code geneId<TAB>refSeqStatus<TAB>trackStatusValue<TAB>trackStatus}
     *                               for every record
     * @param osECNumbers            receives {@code geneId<TAB>ecNumber} for records with an EC number
     * @throws IOException if writing to any of the streams fails
     */
    public static void writeGeneInfoToFile(List<GeneXmlExtract> geneExtractList, OutputStream osSummaries,
            OutputStream osProtnames, OutputStream osRefSeqAndTrackStatus, OutputStream osECNumbers)
            throws IOException {
        log.trace("Writing gene summaries of current XML batch.");
        synchronized (osSummaries) {
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.summary != null) {
                    writeLine(osSummaries, extract.geneId + "\t" + extract.summary);
                }
            }
        }
        synchronized (osProtnames) {
            log.trace("Writing entrezgene_prot names of current XML batch.");
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.entrezgeneProt == null) {
                    continue;
                }
                if (extract.entrezgeneProt.protrefName != null) {
                    for (String protName : extract.entrezgeneProt.protrefName) {
                        writeLine(osProtnames, extract.geneId + "\t" + protName);
                    }
                }
                if (extract.entrezgeneProt.protrefDesc != null) {
                    writeLine(osProtnames, extract.geneId + "\t" + extract.entrezgeneProt.protrefDesc);
                }
            }
        }
        synchronized (osRefSeqAndTrackStatus) {
            log.trace("Writing entrez gene RefSeq status entries of current XML batch");
            for (GeneXmlExtract extract : geneExtractList) {
                String refSeqStatus = extract.refSeqStatus != null ? extract.refSeqStatus : "";
                // NOTE(review): a missing track status is written as the literal "null";
                // kept as is because readers of this file may rely on the column layout.
                writeLine(osRefSeqAndTrackStatus, extract.geneId + "\t" + refSeqStatus + "\t"
                        + extract.geneTrackStatusValue + "\t" + extract.geneTrackStatus);
            }
        }
        synchronized (osECNumbers) {
            log.trace("Writing EC numbers of current XML batch");
            for (GeneXmlExtract extract : geneExtractList) {
                if (extract.ecNumber != null) {
                    writeLine(osECNumbers, extract.geneId + "\t" + extract.ecNumber);
                }
            }
        }
    }

    /** Writes {@code line} followed by a newline to {@code os} in UTF-8. */
    private static void writeLine(OutputStream os, String line) throws IOException {
        IOUtils.write(line + "\n", os, ENCODING);
    }

    /**
     * Parses NCBI Gene XML from the given stream and returns one
     * {@link GeneXmlExtract} per {@code Entrezgene} element.
     * <p>
     * The stream is always closed by this method, whether parsing succeeds or
     * fails.
     *
     * @param openStream the XML input; closed before this method returns
     * @return the extracted gene records in document order
     * @throws XMLStreamException if the XML cannot be parsed
     * @throws IOException        if closing the stream fails
     */
    public static List<GeneXmlExtract> extractGeneInfoFromXml(InputStream openStream)
            throws XMLStreamException, IOException {
        List<GeneXmlExtract> geneExtractList = new ArrayList<>();
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The input is an arbitrary stream; do not resolve external entities (XXE).
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try (InputStream in = openStream) {
            XMLStreamReader parser = factory.createXMLStreamReader(in);
            try {
                GeneXmlExtract currentXmlExtract = null;
                // Context flags recording which enclosing elements the parser is currently inside.
                boolean inEntrezGeneSource = false;
                boolean inTaxonDbtag = false;
                boolean inRefSeqStatusCommentary = false;
                boolean inEntrezgeneComments = false;
                boolean inECNumber = false;
                while (parser.hasNext()) {
                    switch (parser.getEventType()) {
                        case XMLStreamConstants.START_ELEMENT:
                            // getElementText() leaves the parser on the matching END_ELEMENT,
                            // which the parser.next() at the bottom of the loop then skips.
                            switch (parser.getLocalName()) {
                                case "Entrezgene":
                                    currentXmlExtract = new GeneXmlExtract();
                                    break;
                                case "Entrezgene_prot":
                                    currentXmlExtract.entrezgeneProt = readEntrezgeneProtFromXml(parser);
                                    break;
                                case "Gene-track_geneid":
                                    currentXmlExtract.geneId = parser.getElementText();
                                    break;
                                case "Entrezgene_summary":
                                    currentXmlExtract.summary = parser.getElementText();
                                    break;
                                case "Entrezgene_source":
                                    inEntrezGeneSource = true;
                                    break;
                                case "Dbtag_db":
                                    if (parser.getElementText().equals("taxon")) {
                                        inTaxonDbtag = true;
                                    }
                                    break;
                                case "Object-id_id":
                                    if (inEntrezGeneSource && inTaxonDbtag) {
                                        currentXmlExtract.taxId = parser.getElementText();
                                    }
                                    break;
                                case "Gene-commentary_heading":
                                    if (parser.getElementText().equals("RefSeq Status")) {
                                        inRefSeqStatusCommentary = true;
                                    }
                                    break;
                                case "Entrezgene_comments":
                                    inEntrezgeneComments = true;
                                    break;
                                case "Gene-commentary_label": {
                                    String label = parser.getElementText();
                                    // The first label after a "RefSeq Status" heading carries the status.
                                    if (inEntrezgeneComments && inRefSeqStatusCommentary) {
                                        currentXmlExtract.refSeqStatus = label;
                                        inRefSeqStatusCommentary = false;
                                    }
                                    if (label.equals("EC")) {
                                        inECNumber = true;
                                    }
                                    break;
                                }
                                case "Gene-commentary_text":
                                    if (inECNumber) {
                                        currentXmlExtract.ecNumber = parser.getElementText();
                                    }
                                    break;
                                case "Gene-track_status":
                                    // Read the attribute first: getElementText() advances the parser.
                                    currentXmlExtract.geneTrackStatusValue = parser.getAttributeValue("", "value");
                                    currentXmlExtract.geneTrackStatus = parser.getElementText();
                                    break;
                                default:
                                    break;
                            }
                            break;
                        case XMLStreamConstants.END_ELEMENT:
                            switch (parser.getLocalName()) {
                                case "Entrezgene":
                                    geneExtractList.add(currentXmlExtract);
                                    break;
                                case "Entrezgene_source":
                                    inEntrezGeneSource = false;
                                    break;
                                case "Dbtag":
                                    inTaxonDbtag = false;
                                    break;
                                case "Entrezgene_comments":
                                    inEntrezgeneComments = false;
                                    break;
                                case "Gene-commentary":
                                    inECNumber = false;
                                    break;
                                default:
                                    break;
                            }
                            break;
                        default:
                            break;
                    }
                    parser.next();
                }
            } finally {
                // XMLStreamReader.close() does not close the underlying stream;
                // the enclosing try-with-resources takes care of that.
                parser.close();
            }
        }
        return geneExtractList;
    }

    /**
     * Reads the protein names and description from an {@code Entrezgene_prot}
     * element. The parser must be positioned on the start tag of that element;
     * on return it is positioned on the matching end tag.
     *
     * @param parser the reader positioned on {@code Entrezgene_prot}
     * @return the collected protein reference names and description
     * @throws XMLStreamException    if the XML cannot be parsed
     * @throws IllegalStateException if the parser is not on an {@code Entrezgene_prot} tag
     */
    private static EntrezgeneProt readEntrezgeneProtFromXml(XMLStreamReader parser) throws XMLStreamException {
        EntrezgeneProt prot = new EntrezgeneProt();
        String currentTag = parser.getLocalName();
        if (!currentTag.equals("Entrezgene_prot")) {
            throw new IllegalStateException(
                    "Expected the tag Entrezgene_prot to begin reading protein names but got " + currentTag);
        }
        boolean reachedEnd = false;
        while (!reachedEnd) {
            int event = parser.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                switch (parser.getLocalName()) {
                    case "Prot-ref_name_E":
                        prot.addProtrefName(parser.getElementText());
                        break;
                    case "Prot-ref_desc":
                        prot.protrefDesc = parser.getElementText();
                        break;
                    default:
                        break;
                }
            }
            // Re-read the event type: getElementText() above moves the parser to an END_ELEMENT.
            reachedEnd = parser.getEventType() == XMLStreamConstants.END_ELEMENT
                    && parser.getLocalName().equals("Entrezgene_prot");
        }
        return prot;
    }

    /**
     * Extracts the gene information from the XML stream and immediately writes
     * it to the given output streams.
     *
     * @param osSummaries           target for gene summaries
     * @param osProtnames           target for protein names
     * @param ofRefSeqAndTracStatus target for RefSeq and gene track status
     * @param osECNumbers           target for EC numbers
     * @param is                    the NCBI Gene XML input; closed by this method
     * @return the extracted gene records
     * @throws XMLStreamException if the XML cannot be parsed
     * @throws IOException        if reading or writing fails
     */
    public static List<GeneXmlExtract> extractAndWriteGeneInfoToFile(OutputStream osSummaries,
            OutputStream osProtnames, OutputStream ofRefSeqAndTracStatus, OutputStream osECNumbers, InputStream is)
            throws XMLStreamException, IOException {
        List<GeneXmlExtract> geneExtractList = extractGeneInfoFromXml(is);
        writeGeneInfoToFile(geneExtractList, osSummaries, osProtnames, ofRefSeqAndTracStatus, osECNumbers);
        return geneExtractList;
    }

    /**
     * Determines the taxonomy IDs for which gene meta information still has to
     * be downloaded.
     * <p>
     * If {@code dbFile} does not exist or was modified after
     * {@code downloadedTaxIdsFile}, all requested IDs are returned. Otherwise
     * the requested IDs minus the already downloaded ones are returned.
     * <p>
     * <strong>Note:</strong> when nothing is missing, this method terminates
     * the JVM via {@code System.exit(0)} instead of returning an empty set.
     *
     * @param taxIdFile            file with one requested taxonomy ID per line; blank lines are ignored
     * @param storageDirectory     directory of the meta data files; only used for log messages
     * @param dbFile               the gene_info file the existing meta data is compared against
     * @param downloadedTaxIdsFile file listing the taxonomy IDs already downloaded
     * @return the taxonomy IDs that still need to be processed
     * @throws IOException if one of the files cannot be read
     */
    public static Set<String> determineMissingTaxIds(File taxIdFile, File storageDirectory, File dbFile,
            File downloadedTaxIdsFile) throws IOException {
        boolean dbFileIsNewer = !dbFile.exists() || downloadedTaxIdsFile.lastModified() < dbFile.lastModified();
        Set<String> taxIds = FileUtils.readLines(taxIdFile, ENCODING).stream()
                .filter(line -> line.trim().length() != 0)
                .collect(Collectors.toSet());
        if (dbFileIsNewer) {
            String reason = dbFile.exists()
                    ? "is newer than the existing meta data in " + storageDirectory.getAbsolutePath()
                    : "does not exist";
            log.info("The given gene_info file {}. The data will be downloaded and created from scratch.", reason);
            return taxIds;
        }
        Set<String> downloadedTaxIds = readDownloadedTaxIds(downloadedTaxIdsFile);
        log.debug("already created: {}", downloadedTaxIds);
        log.debug("requested: {}", taxIds);
        Set<String> missingTaxIds = Sets.difference(taxIds, downloadedTaxIds);
        log.debug("difference: {}", missingTaxIds);
        if (missingTaxIds.isEmpty()) {
            log.info("Files for given taxonomy IDs have already been created in {}", storageDirectory);
            // Kept for backward compatibility: existing callers rely on the process ending here.
            System.exit(0);
        }
        log.info(
                "Got {} taxonomy IDs for which gene meta information need to be downloaded and {} requested IDs already downloaded",
                missingTaxIds.size(), taxIds.size() - missingTaxIds.size());
        return missingTaxIds;
    }

    /** Reads the non-blank lines of the given file, or returns an empty set if it does not exist. */
    private static Set<String> readDownloadedTaxIds(File downloadedTaxIdsFile) throws IOException {
        if (!downloadedTaxIdsFile.exists()) {
            return Collections.emptySet();
        }
        try (BufferedReader reader = FileUtilities.getReaderFromFile(downloadedTaxIdsFile)) {
            return reader.lines()
                    .filter(line -> line.trim().length() != 0)
                    .collect(Collectors.toSet());
        }
    }

    /**
     * Returns the file objects for the meta information files retrieved from
     * NCBI Gene XML data. The list of files contains, in this order,
     * <ol>
     * <li>gene summaries file</li>
     * <li>gene protein names file</li>
     * <li>taxonomy ID list of organisms for which the other files contain
     * records</li>
     * <li>gene RefSeq and gene track status file</li>
     * <li>gene EC numbers file</li>
     * </ol>
     *
     * @param storageDirectory The base directory where to find/store the meta data.
     * @return An ordered list of gene meta data files.
     */
    public static List<File> getMetaFiles(File storageDirectory) {
        File summariesFile = metaFile(storageDirectory, EG2SUMMARY_FILENAME);
        File proteinNamesFile = metaFile(storageDirectory, EG2ENTREZGENE_PROT_FILENAME);
        File downloadedTaxIdsFile = metaFile(storageDirectory, TAXIDS_FILENAME);
        File refseqStatusFile = metaFile(storageDirectory, EG2REFSEQ_AND_TRACK_STATUS_FILENAME);
        File ecNumbersFile = metaFile(storageDirectory, EG2EC_FILENAME);
        return Arrays.asList(summariesFile, proteinNamesFile, downloadedTaxIdsFile, refseqStatusFile,
                ecNumbersFile);
    }

    /** Builds the absolute path of the meta data file {@code fileName} inside {@code storageDirectory}. */
    private static File metaFile(File storageDirectory, String fileName) {
        return new File(storageDirectory.getAbsolutePath() + File.separator + fileName);
    }
}