// de.julielab.genemapper.resources.UniprotDictCreator
// (part of the gene-mapper-resources artifact; available via Maven / Gradle / Ivy)
package de.julielab.genemapper.resources;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/**
 * Extracts the content of the UniProt dictionary from a UniProt XML dump, as
 * used by the shell script makeGeneDictionary.sh for the creation of the
 * resources.
 *
 * <p>Note that this solution was reverse-engineered by looking at the already
 * existing uniprot.all.dict and searching uniprot.sprot.xml for the elements
 * that have to be retrieved. A later change was made by faessler to use the
 * primary accession ID instead of the UniProt mnemonic ID (IDs like IL2_MOUSE).
 *
 * <p>For every {@code entry} element that has a gene name of type
 * {@code primary}, the following names are collected in document order, each
 * followed by a tab and a priority:
 * <ul>
 * <li>children of {@code recommendedName} whose tag contains {@code Name}: priority -1</li>
 * <li>children of {@code alternativeName} whose tag contains {@code Name}: priority 2</li>
 * <li>the primary gene name: priority 0</li>
 * </ul>
 * Processing of an entry stops at its primary gene name. Entries that have no
 * primary gene name are not added to the dictionary.
 *
 * <p>The output file has one line per name:
 * {@code name TAB priority TAB accession}.
 *
 * @author engelmann
 * @author faessler
 */
public class UniprotDictCreator {

    /** Priority suffix of recommended names (like the official symbol from NCBI gene). */
    private static final String PRIORITY_RECOMMENDED_NAME = "-1";
    /** Priority suffix of alternative names (like the synonyms from NCBI gene). */
    private static final String PRIORITY_ALTERNATIVE_NAME = "2";
    /** Priority suffix of the primary gene name. */
    private static final String PRIORITY_GENE_NAME = "0";

    /**
     * Maps a primary accession to its "name TAB priority" strings. The map is
     * static, i.e. shared by all instances of this class in the JVM.
     */
    private static final Map<String, List<String>> dictContent = new HashMap<>();

    /**
     * Command line entry point.
     *
     * @param args exactly two elements: the UniProt XML input file (optionally
     *             gzipped, recognized by the {@code .gz} suffix) and the
     *             dictionary output file; otherwise a usage message is printed
     *             and the JVM exits with status -1
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err
                    .println("usage:\nUniProtDictCreator <UniProt XML file (optionally .gz)> <output dictionary file>");
            System.exit(-1);
            return;
        }
        File inputFile = new File(args[0]);
        File outputFile = new File(args[1]);
        System.out.println("Reading UniProt XML from " + inputFile.getAbsolutePath() + " and writing dictionary to " + outputFile.getAbsolutePath());
        UniprotDictCreator dictCreator = new UniprotDictCreator();
        dictCreator.readEntries(inputFile);
        dictCreator.writeEntries(outputFile);
    }

    /**
     * Parses the given UniProt XML file and stores the names of every entry
     * that has a primary gene name in the shared dictionary map.
     *
     * <p>I/O and XML errors are printed to standard error and end the parsing;
     * entries read up to that point stay in the map.
     *
     * @param inputFile the UniProt XML file; it is decompressed on the fly when
     *                  its name ends with {@code .gz}
     */
    public void readEntries(File inputFile) {
        // Both streams are closed by try-with-resources, also when parsing fails.
        // For uncompressed input 'in' is the same object as 'raw'; closing it twice is harmless.
        try (InputStream raw = new FileInputStream(inputFile);
             InputStream in = inputFile.getName().endsWith(".gz") ? new GZIPInputStream(raw) : raw) {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            // The dictionary only needs element text; never resolve external entities (XXE).
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            XMLStreamReader reader = factory.createXMLStreamReader(in);
            try {
                while (reader.hasNext()) {
                    if (reader.next() == XMLStreamReader.START_ELEMENT
                            && "entry".equals(reader.getLocalName())) {
                        readEntry(reader);
                    }
                }
            } finally {
                // XMLStreamReader is not AutoCloseable and does not close the underlying stream.
                reader.close();
            }
        } catch (IOException | XMLStreamException | FactoryConfigurationError e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the shared dictionary map to the given file as UTF-8, one line
     * {@code name TAB priority TAB accession} per collected name.
     *
     * <p>I/O errors are printed to standard error.
     *
     * @param outputFile the dictionary file to create or overwrite
     */
    public void writeEntries(File outputFile) {
        // Explicit UTF-8: the charset of the output must not depend on the platform default.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            for (Map.Entry<String, List<String>> entry : dictContent.entrySet()) {
                String accession = entry.getKey();
                // Debug trace kept from the original implementation.
                if (accession.equals("P04578"))
                    System.out.println("Got key P04578 and writing its names to file");
                for (String nameAndPriority : entry.getValue()) {
                    writer.write(nameAndPriority + "\t" + accession + "\n");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads one entry; the reader must be positioned on the start tag of the
     * {@code entry} element. The entry is stored as soon as its primary gene
     * name has been read, the rest of the entry is then left unread. When the
     * end tag of the entry is reached without a primary gene name, the names
     * collected so far are dropped so that they cannot leak into the following
     * entry.
     */
    private void readEntry(XMLStreamReader reader) throws XMLStreamException {
        String accession = "";
        boolean retrievedAccession = false;
        List<String> names = new ArrayList<>();
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && "entry".equals(reader.getLocalName())) {
                // No primary gene name in this entry: nothing is stored for it.
                return;
            }
            if (event != XMLStreamReader.START_ELEMENT) {
                continue;
            }
            switch (reader.getLocalName()) {
                case "accession":
                    // Only the first accession element is the primary accession.
                    if (!retrievedAccession) {
                        // get the primary accession (or the only one)
                        accession = reader.getElementText().split(",", 2)[0];
                        retrievedAccession = true;
                        // Debug trace kept from the original implementation.
                        if (accession.equals("P04578"))
                            System.out.println("Found accession P04578");
                    }
                    break;
                case "recommendedName":
                    collectNames(reader, "recommendedName", PRIORITY_RECOMMENDED_NAME, names);
                    break;
                case "alternativeName":
                    collectNames(reader, "alternativeName", PRIORITY_ALTERNATIVE_NAME, names);
                    break;
                case "gene":
                    String geneName = readPrimaryGeneName(reader);
                    if (geneName != null) {
                        names.add(geneName + "\t" + PRIORITY_GENE_NAME);
                        dictContent.put(accession, names);
                        return;
                    }
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Adds the text of every element whose tag contains {@code Name}, followed
     * by a tab and the given priority, to {@code sink} until the end tag
     * {@code enclosingTag} is reached.
     */
    private static void collectNames(XMLStreamReader reader, String enclosingTag, String priority,
                                     List<String> sink) throws XMLStreamException {
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && enclosingTag.equals(reader.getLocalName())) {
                return;
            }
            if (event == XMLStreamReader.START_ELEMENT && reader.getLocalName().contains("Name")) {
                sink.add(reader.getElementText() + "\t" + priority);
            }
        }
    }

    /**
     * Reads up to the end of the current {@code gene} element and returns the
     * text of its first {@code name} child of type {@code primary}, or
     * {@code null} if there is none. Names without a {@code type} attribute
     * are skipped.
     */
    private static String readPrimaryGeneName(XMLStreamReader reader) throws XMLStreamException {
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && "gene".equals(reader.getLocalName())) {
                return null;
            }
            if (event == XMLStreamReader.START_ELEMENT
                    && "name".equals(reader.getLocalName())
                    // Constant first: the type attribute may be absent (null).
                    && "primary".equals(reader.getAttributeValue(null, "type"))) {
                return reader.getElementText();
            }
        }
        return null;
    }
}