
org.biojava.nbio.genome.parsers.geneid.GeneIDXMLReader Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.genome.parsers.geneid;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
/**
 * Reads a GeneID XML prediction file and exposes the protein and cDNA
 * sequences found under {@code prediction/gene}, keyed by the {@code idGene}
 * attribute of the enclosing {@code gene} element.
 *
 * @author Scooter Willis
 */
public class GeneIDXMLReader {

    private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class);

    /** Parsed GeneID XML document; assigned by the constructor. */
    Document geneidDoc;

    /**
     * Loads and parses the given GeneID XML file.
     *
     * @param geneidXMLFile path of the GeneID XML file to load
     * @throws Exception if {@link XMLHelper#loadXML(String)} fails to load the file
     */
    public GeneIDXMLReader(String geneidXMLFile) throws Exception {
        logger.info("Start read of {}", geneidXMLFile);
        geneidDoc = XMLHelper.loadXML(geneidXMLFile);
        logger.info("Read finished");
    }

    /**
     * Collects every {@code prediction/gene/protein} element as a protein sequence.
     * <p>
     * Entries are stored in the order the elements are returned by the XPath
     * selection. If two genes share the same {@code idGene}, the later sequence
     * replaces the earlier one.
     *
     * @return map from {@code idGene} to the protein sequence of that gene
     * @throws Exception if the XPath selection or the sequence construction fails
     */
    public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
        LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<String, ProteinSequence>();
        ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
        logger.info("{} hits", elementList.size());

        for (Element proteinElement : elementList) {
            String idGene = parentGeneId(proteinElement);
            ProteinSequence proteinSequence = new ProteinSequence(sequenceText(proteinElement));
            proteinSequence.setAccession(new AccessionID(idGene));
            proteinSequenceList.put(idGene, proteinSequence);
        }
        return proteinSequenceList;
    }

    /**
     * Collects every {@code prediction/gene/cDNA} element as a DNA sequence.
     * <p>
     * Entries are stored in the order the elements are returned by the XPath
     * selection. If two genes share the same {@code idGene}, the later sequence
     * replaces the earlier one.
     *
     * @return map from {@code idGene} to the coding DNA sequence of that gene
     * @throws Exception if the XPath selection or the sequence construction fails
     */
    public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception {
        LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<String, DNASequence>();
        ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA");
        logger.info("{} hits", elementList.size());

        for (Element dnaElement : elementList) {
            String idGene = parentGeneId(dnaElement);
            DNASequence dnaSequence = new DNASequence(sequenceText(dnaElement));
            dnaSequence.setAccession(new AccessionID(idGene));
            dnaSequenceList.put(idGene, dnaSequence);
        }
        return dnaSequenceList;
    }

    /**
     * Returns the element's text with every non-word character removed
     * (regex {@code \W}), which drops the whitespace and line breaks that
     * surround the sequence letters in the XML.
     */
    private static String sequenceText(Element sequenceElement) {
        return sequenceElement.getTextContent().replaceAll("\\W", "");
    }

    /** Returns the {@code idGene} attribute of the element's parent element. */
    private static String parentGeneId(Element sequenceElement) {
        Element geneElement = (Element) sequenceElement.getParentNode();
        return geneElement.getAttribute("idGene");
    }

    /**
     * Converts a GeneID XML file into a protein FASTA file and a nucleotide
     * FASTA file. Any failure is logged rather than propagated.
     *
     * @param args optional positional overrides: {@code [0]} input GeneID XML
     *             file, {@code [1]} protein FASTA output file, {@code [2]}
     *             nucleotide FASTA output file; omitted positions fall back to
     *             the built-in default paths
     */
    public static void main(String[] args) {
        // Defaults are the original hard-coded locations, kept so a run without arguments behaves as before.
        String xmlFile = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml";
        String proteinFile = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa";
        String dnaFile = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna";
        if (args != null) {
            if (args.length > 0) {
                xmlFile = args[0];
            }
            if (args.length > 1) {
                proteinFile = args[1];
            }
            if (args.length > 2) {
                dnaFile = args[2];
            }
        }

        try {
            GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader(xmlFile);
            LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences();
            FastaWriterHelper.writeProteinSequence(new File(proteinFile), proteinSequenceHashMap.values());
            LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences();
            FastaWriterHelper.writeNucleotideSequence(new File(dnaFile), dnaSequenceHashMap.values());
        } catch (Exception e) {
            logger.error("Exception: ", e);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy