All downloads are free. Search and download functionality uses the official Maven repository.

de.julielab.genemapper.resources.UniprotDictCreator Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.zip.GZIPInputStream;

/**
 * Builds the UniProt part of the gene dictionary from a UniProt XML dump
 * (e.g. {@code uniprot_sprot.xml}, optionally gzip-compressed).
 * <p>
 * The UniprotDictCreator is used to retrieve relevant content for the
 * uniprot-dictionary as found in the shell script makeGeneDictionary.sh
 * for the creation of the resources. Note that this solution was
 * reverse-engineered looking at the already existing uniprot.all.dict
 * and searching for the elements in question to retrieve from
 * uniprot.sprot.xml.
 * <p>
 * A later change was made by faessler to use the primary accession ID instead
 * of the UniProt mnemonic ID (IDs like IL2_MOUSE).
 * <p>
 * Each output line has the form {@code name<TAB>priority<TAB>accession}, where
 * the priority is {@code -1} for recommended names, {@code 2} for alternative
 * names and {@code 0} for the primary gene name.
 *
 * @author engelmann, faessler
 */
public class UniprotDictCreator {

    /**
     * Maps the primary accession of an entry to its dictionary names, each
     * already suffixed with {@code "\t<priority>"}. The map is static, so it is
     * shared by all instances and accumulates over multiple
     * {@link #readEntries(File)} calls.
     */
    private static final HashMap<String, ArrayList<String>> dictContent = new HashMap<>();

    /**
     * Command line entry point.
     *
     * @param args exactly two arguments: the UniProt XML input file (may end in
     *             {@code .gz}) and the dictionary output file
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println(
                    "usage:\nUniProtDictCreator <UniProt XML file (optionally .gz)> <output dictionary file>");
            System.exit(-1);
            return;
        }
        File inputFile = new File(args[0]);
        File outputFile = new File(args[1]);
        System.out.println("Reading UniProt XML from " + inputFile.getAbsolutePath()
                + " and writing dictionary to " + outputFile.getAbsolutePath());
        UniprotDictCreator dictCreator = new UniprotDictCreator();
        dictCreator.readEntries(inputFile);
        dictCreator.writeEntries(outputFile);
    }

    /**
     * Streams through the given UniProt XML file and collects, for every
     * {@code entry} element, the primary accession together with the
     * recommended names, alternative names and the primary gene name.
     * <p>
     * Read errors are reported on standard error and do not propagate; entries
     * collected before the error remain available to
     * {@link #writeEntries(File)}.
     *
     * @param inputFile the UniProt XML file; decompressed on the fly when its
     *                  name ends with {@code .gz}
     */
    public void readEntries(File inputFile) {
        try (InputStream input = openInput(inputFile)) {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            // The dictionary only needs element text; never resolve external entities.
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            XMLStreamReader reader = factory.createXMLStreamReader(input);
            try {
                while (reader.hasNext()) {
                    if (reader.next() == XMLStreamReader.START_ELEMENT
                            && "entry".equals(reader.getLocalName())) {
                        readEntry(reader);
                    }
                }
            } finally {
                reader.close();
            }
        } catch (IOException | XMLStreamException | FactoryConfigurationError e) {
            System.err.println("Could not read UniProt XML from " + inputFile.getAbsolutePath() + ": " + e);
            e.printStackTrace();
        }
    }

    /**
     * Writes all collected names to the given file as UTF-8, one
     * {@code name<TAB>priority<TAB>accession} line per name.
     *
     * @param outputFile the dictionary file to create or overwrite
     */
    public void writeEntries(File outputFile) {
        // Explicit UTF-8: protein names may contain non-ASCII characters and the
        // platform default charset is not guaranteed to represent them.
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputFile), java.nio.charset.StandardCharsets.UTF_8))) {
            for (java.util.Map.Entry<String, ArrayList<String>> entry : dictContent.entrySet()) {
                String accession = entry.getKey();
                for (String nameWithPriority : entry.getValue()) {
                    writer.write(nameWithPriority + "\t" + accession + "\n");
                }
            }
        } catch (IOException e) {
            System.err.println("Could not write dictionary to " + outputFile.getAbsolutePath() + ": " + e);
            e.printStackTrace();
        }
    }

    /** Opens the input file, transparently decompressing {@code .gz} files. */
    private static InputStream openInput(File inputFile) throws IOException {
        InputStream raw = new FileInputStream(inputFile);
        if (!inputFile.getName().endsWith(".gz")) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        } catch (IOException e) {
            // The GZIP header could not be read; do not leak the file handle.
            raw.close();
            throw e;
        }
    }

    /**
     * Processes one {@code entry} element; the reader must be positioned on its
     * start tag. All state is local to this call so that nothing can leak from
     * one entry into the next.
     * <p>
     * As soon as the primary gene name has been read the entry is stored and the
     * remainder of the entry is left for the caller to skip. Entries without a
     * primary gene name are stored when their end tag is reached.
     */
    private static void readEntry(XMLStreamReader reader) throws XMLStreamException {
        String accession = "";
        boolean retrievedAccession = false;
        ArrayList<String> names = new ArrayList<>();

        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && "entry".equals(reader.getLocalName())) {
                break;
            }
            if (event != XMLStreamReader.START_ELEMENT) {
                continue;
            }
            String localName = reader.getLocalName();
            if (localName.equals("accession")) {
                if (!retrievedAccession) {
                    // get the primary accession (or the only one); secondary
                    // accessions follow in further accession elements and are ignored
                    accession = reader.getElementText().split(",", 2)[0];
                    retrievedAccession = true;
                }
            } else if (localName.equals("recommendedName")) {
                // The -1 is the name priority: the recommended name gets -1
                // (like the official symbol from NCBI gene)
                collectNames(reader, "recommendedName", "-1", names);
            } else if (localName.equals("alternativeName")) {
                // The 2 is the name priority: the alternative name gets 2
                // (like the synonyms from NCBI gene)
                collectNames(reader, "alternativeName", "2", names);
            } else if (localName.equals("gene")) {
                if (addPrimaryGeneName(reader, names)) {
                    storeEntry(accession, names);
                    return;
                }
            }
        }
        storeEntry(accession, names);
    }

    /**
     * Adds the text of every child element whose local name contains
     * {@code "Name"} (e.g. {@code fullName}, {@code shortName}) to
     * {@code names}, suffixed with a tab and the given priority, until the end
     * tag of {@code containerName} is reached.
     */
    private static void collectNames(XMLStreamReader reader, String containerName, String priority,
                                     ArrayList<String> names) throws XMLStreamException {
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && containerName.equals(reader.getLocalName())) {
                return;
            }
            if (event == XMLStreamReader.START_ELEMENT && reader.getLocalName().contains("Name")) {
                names.add(reader.getElementText() + "\t" + priority);
            }
        }
    }

    /**
     * Scans a {@code gene} element for a {@code name} child with
     * {@code type="primary"} and adds it with priority 0.
     *
     * @return {@code true} if a primary gene name was found and added (the
     *         reader then stands on that name's end tag), {@code false} if the
     *         end of the gene element was reached without one
     */
    private static boolean addPrimaryGeneName(XMLStreamReader reader, ArrayList<String> names)
            throws XMLStreamException {
        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT && "gene".equals(reader.getLocalName())) {
                return false;
            }
            // Constant-first equals: the type attribute may be absent (null).
            if (event == XMLStreamReader.START_ELEMENT && "name".equals(reader.getLocalName())
                    && "primary".equals(reader.getAttributeValue(null, "type"))) {
                // the gene name gets priority 0
                names.add(reader.getElementText() + "\t0");
                return true;
            }
        }
        return false;
    }

    /** Records the names of one entry; entries lacking an accession or names are skipped. */
    private static void storeEntry(String accession, ArrayList<String> names) {
        if (accession.isBlank() || names.isEmpty()) {
            return;
        }
        dictContent.put(accession, names);
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy