All downloads are free. The search and download features use the official Maven repository.

org.biojava.nbio.genome.parsers.geneid.GeneIDXMLReader Maven / Gradle / Ivy

/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.parsers.geneid;

import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
import org.biojava.nbio.core.util.XMLHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;

/**
 * Loads a geneid XML document and extracts the sequences stored under its
 * {@code prediction/gene} elements.
 * <p>
 * Protein sequences are read from {@code prediction/gene/protein} elements and
 * coding DNA sequences from {@code prediction/gene/cDNA} elements. In both
 * cases the sequence is identified by the {@code idGene} attribute of the
 * enclosing {@code gene} element.
 *
 * @author Scooter Willis
 */
public class GeneIDXMLReader {

	private static final Logger logger = LoggerFactory.getLogger(GeneIDXMLReader.class);

	/** Input file used by {@link #main(String[])} when no first argument is given. */
	private static final String DEFAULT_XML_FILE = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.xml";

	/** Protein FASTA output used by {@link #main(String[])} when no second argument is given. */
	private static final String DEFAULT_PROTEIN_FASTA_FILE = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.faa";

	/** Nucleotide FASTA output used by {@link #main(String[])} when no third argument is given. */
	private static final String DEFAULT_DNA_FASTA_FILE = "/Users/Scooter/scripps/dyadic/geneid/geneid/c1_geneid.fna";

	/** Parsed XML document; populated by the constructor. */
	Document geneidDoc = null;

	/**
	 * Parses the given XML file and keeps the resulting document for later queries.
	 *
	 * @param geneidXMLFile path of the geneid XML file to load
	 * @throws Exception if {@code XMLHelper.loadXML} fails to load the file
	 */
	public GeneIDXMLReader(String geneidXMLFile) throws Exception {
		logger.info("Start read of {}", geneidXMLFile);
		geneidDoc = XMLHelper.loadXML(geneidXMLFile);
		logger.info("Read finished");
	}

	/**
	 * Collects the protein sequence of every {@code prediction/gene/protein} element.
	 *
	 * @return map from the {@code idGene} attribute of the parent gene element to
	 *         the protein sequence, in document order; a later element with the
	 *         same {@code idGene} replaces the earlier entry
	 * @throws Exception if the elements cannot be selected or a sequence cannot be built
	 */
	public LinkedHashMap<String, ProteinSequence> getProteinSequences() throws Exception {
		LinkedHashMap<String, ProteinSequence> proteinSequenceList = new LinkedHashMap<String, ProteinSequence>();
		// The element type must be declared: iterating a raw ArrayList as Element does not compile.
		ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/protein");
		logger.info("{} hits", elementList.size());

		for (Element proteinElement : elementList) {
			Element geneElement = (Element) proteinElement.getParentNode();
			String sequence = stripNonWordCharacters(proteinElement.getTextContent());
			ProteinSequence proteinSequence = new ProteinSequence(sequence);
			String idGene = geneElement.getAttribute("idGene");
			proteinSequence.setAccession(new AccessionID(idGene));
			proteinSequenceList.put(idGene, proteinSequence);
		}

		return proteinSequenceList;
	}

	/**
	 * Collects the DNA sequence of every {@code prediction/gene/cDNA} element.
	 *
	 * @return map from the {@code idGene} attribute of the parent gene element to
	 *         the DNA sequence, in document order; a later element with the
	 *         same {@code idGene} replaces the earlier entry
	 * @throws Exception if the elements cannot be selected or a sequence cannot be built
	 */
	public LinkedHashMap<String, DNASequence> getDNACodingSequences() throws Exception {
		LinkedHashMap<String, DNASequence> dnaSequenceList = new LinkedHashMap<String, DNASequence>();
		// The element type must be declared: iterating a raw ArrayList as Element does not compile.
		ArrayList<Element> elementList = XMLHelper.selectElements(geneidDoc.getDocumentElement(), "prediction/gene/cDNA");
		logger.info("{} hits", elementList.size());

		for (Element dnaElement : elementList) {
			Element geneElement = (Element) dnaElement.getParentNode();
			String sequence = stripNonWordCharacters(dnaElement.getTextContent());
			DNASequence dnaSequence = new DNASequence(sequence);
			String idGene = geneElement.getAttribute("idGene");
			dnaSequence.setAccession(new AccessionID(idGene));
			dnaSequenceList.put(idGene, dnaSequence);
		}

		return dnaSequenceList;
	}

	/**
	 * Removes every character that is not a regex word character
	 * ({@code [a-zA-Z_0-9]}) so that whitespace and line breaks inside the XML
	 * text content do not end up in the sequence string.
	 */
	private static String stripNonWordCharacters(String rawText) {
		return rawText.replaceAll("\\W","");
	}

	/** Returns {@code args[index]} when that argument was supplied, otherwise {@code fallback}. */
	private static String argOrDefault(String[] args, int index, String fallback) {
		return (args != null && args.length > index) ? args[index] : fallback;
	}

	/**
	 * Reads a geneid XML file and writes its protein and coding DNA sequences
	 * as FASTA files. Any failure is logged rather than propagated.
	 *
	 * @param args optional overrides, in order: input XML file, protein FASTA
	 *             output file, nucleotide FASTA output file; each missing
	 *             argument falls back to the built-in default path
	 */
	public static void main(String[] args) {
		String xmlFile = argOrDefault(args, 0, DEFAULT_XML_FILE);
		String proteinFastaFile = argOrDefault(args, 1, DEFAULT_PROTEIN_FASTA_FILE);
		String dnaFastaFile = argOrDefault(args, 2, DEFAULT_DNA_FASTA_FILE);

		try {
			GeneIDXMLReader geneIDXMLReader = new GeneIDXMLReader(xmlFile);
			LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = geneIDXMLReader.getProteinSequences();
			FastaWriterHelper.writeProteinSequence(new File(proteinFastaFile), proteinSequenceHashMap.values());

			LinkedHashMap<String, DNASequence> dnaSequenceHashMap = geneIDXMLReader.getDNACodingSequences();
			FastaWriterHelper.writeNucleotideSequence(new File(dnaFastaFile), dnaSequenceHashMap.values());

		} catch (Exception e) {
			logger.error("Exception: ", e);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy