All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.bio.program.sax.BlastSAXParser Maven / Gradle / Ivy

There is a newer version: 1.9.7
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.bio.program.sax;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * A SAX-like parser for dealing with native NCBI-Blast,Wu-Blastoutput,
 * and HMMER output (a single dataset).  That is this class allows
 * native BLAST-like format files to be processed  as if they were in
 * an XML format i.e. it sends messages to a user-written XML Handler.
 *
 * This class has package-level visibility, and is used
 * by generic BlastLikeSAXParser objects.
 *
 * Some functionality is delegated to Part objects within the class
 * whose implementations are selected on-the-fly according
 * the the program/version that produced the output.
 *  SummaryLineHelperIF
 *
 * NB Support for HMMER is currently only partial and likely to change
 * without notice as more functionality is added.
 *
 * Copyright 2000 Cambridge Antibody Technology Group plc.
 *
 *
 * This code released to the biojava project, May 2000
 * under the LGPL license.
 *
 * @author Simon Brocklehurst (CAT)
 * @author Tim Dilks          (CAT)
 * @author Colin Hardman      (CAT)
 * @author Stuart Johnston    (CAT)
 * @author Mathieu Wiepert    (Mayo Foundation)
 * @author Keith James        (Sanger Institute)
 * @author Mark Schreiber     (NITD)
 * @author Travis Banks		  (AAFC)
 * @version 0.2
 *
 * @see BlastLikeSAXParser
 */
final class BlastSAXParser extends AbstractNativeAppSAXParser {

	private BufferedReader       oContents;
	private AttributesImpl       oAtts              = new AttributesImpl();
	private QName                oAttQName          = new QName(this);     
	private ArrayList            oBuffer            = new ArrayList();
	private char[]               aoChars;
	private char[]               aoLineSeparator;
	private String[]             aoKeys;
	private String[]             aoArrayType        = new String[1];
	private HashMap              oMap               = new HashMap();
	private int                  iVer;

	private BlastLikeVersionSupport      oVersion;
	private HitSectionSAXParser          oHits;
	private SummaryLineHelperIF          oSummaryLineHelper;

	private String                       oQueryId;
	private String                       oDatabaseId;
	private String 					     oQueryLength; // patch from Michael Gang
	private static final int STARTUP             = 0;
	private static final int IN_TRAILER          = 1;
	private static final int AT_END              = 2;
	private static final int IN_HEADER           = 3;
	private static final int IN_SUMMARY          = 4;
	private static final int FINISHED_HITS       = 5;

	private boolean          tDoneSummary        = false;

	/**
	 * Creates a new BlastSAXParser instance.
	 * @param poNamespacePrefix the namespace prefix to use
	 * @param poVersion a BlastLikeVersionSupport value.
	 * @exception SAXException if an error occurs
	 */
	BlastSAXParser(BlastLikeVersionSupport poVersion, 
			String poNamespacePrefix) throws SAXException {

		oVersion = poVersion;
		this.setNamespacePrefix(poNamespacePrefix);

		this.addPrefixMapping("biojava","http://www.biojava.org");

		oHits = new HitSectionSAXParser(oVersion,this.getNamespacePrefix());

		this.changeState(STARTUP);
		aoLineSeparator = System.getProperty("line.separator").toCharArray();

		//Beginnings of using a Builder type pattern to create
		//parser from part objects. Only for done
		//Summary sections at present according to program type
		//at present. Significant benefit would be
		//for detail, allowing choice of part object optimised
		//for speed of processing etc.
		this.choosePartImplementations();
	}
	/**
	 * Parse the blast data and emit SAX events.
	 * @param poContents The BufferedReader that will read the BLAST 
	 * output
	 * @param poLine The first line of the BLAST record
	 * @throws org.xml.sax.SAXException If the input is malformed
	 * @return The last line of the BLAST data
	 */
	/*
	 * ALL REAL PARSING OF THE DATA OCCURS HERE IE CREATION OF THE BLASTDATASET ELEMENTS
	 */
	public String parse(BufferedReader poContents, String poLine)
	throws SAXException {
		oHits = new HitSectionSAXParser(oVersion,this.getNamespacePrefix());
		String oLine = null;

		oQueryId    = "";
		oDatabaseId = "";
		oContents = poContents;

		//First deal with first line which must be the start of 
		//a new Blast output
		//For a brand new collection, check for the start of a 
		//new BlastDataSet

		//look for characteristic of start of dataset
		if (oVersion.isStartOfDataSet(poLine)) {
			//just figure out whether it's a new DataSet
			//or not. i The onNewBlastDatSet method
			//takes care of the rest...
			this.onNewBlastDataSet(poLine);
		//	this.changeState(STARTUP);
		} else {
			throw new SAXException("unexpected poLine parameter, expecting start of BLAST like record");
			//return poLine;
		}
		//now parse stream...
		try {
			oLine = oContents.readLine();
			while ((oLine != null) &&
					(!checkNewBlastLikeDataSet(oLine))) {

				//System.out.println(oLine);\
				//interpret line and send messages accordingly
				this.interpret(oLine);
				oLine = oContents.readLine();

			} // end while
		} catch (java.io.IOException x) {
			System.out.println(x.getMessage());
			System.out.println("File read interrupted");
		} // end try/catch


		//Now close open elements...
		if (iState == IN_TRAILER) {
			this.emitRawOutput(oBuffer);
			this.endElement(new QName(this,this.prefix("Trailer")));
			this.changeState(AT_END);
		}

		this.endElement(new QName(this,this.prefix("BlastLikeDataSet")));

		return oLine;
	}
	/**
	 * Deal with line according to state parser is in.
	 *
	 * @param poLine     A line of Blast output
	 */
	private void interpret(String poLine) throws SAXException {

		if (iState == IN_HEADER) {
			//accumulate header information

			//things that can end header section
			//start of summmary section
			//start of detail section
			//start of trailer when there are no hits

			if (poLine.startsWith("Query=")) {
				StringTokenizer st = new StringTokenizer(poLine);
				// Skip the first token
				st.nextToken();

				if (st.hasMoreTokens())
					oQueryId = st.nextToken();
			}

			if (poLine.matches("^\\s+\\(\\d+\\sletters\\)\\s*$")) {           
				StringTokenizer st = new StringTokenizer(poLine);
				oQueryLength = st.nextToken().substring(1);
			}

			if (poLine.startsWith("Database:")) {
				int i = poLine.indexOf(":");
				oDatabaseId = poLine.substring(i + 1);
				while(true){
					try {
						poLine = oContents.readLine();
						if (poLine.startsWith("Searching")) {
							break;
						} else if (poLine.startsWith("Results of")) {
							// in PSI-blast is this line...
							System.err.println("this looks like a PSI-blast file, this is currently not supported, yet!");
							break;

						} else {
							oDatabaseId = oDatabaseId.concat(poLine);
						}
					} catch(java.io.IOException x){
						System.err.println(x.getMessage());
						System.err.println("File read interrupted");
					}
				}
			}

			if ((poLine.startsWith("Sequences producing significant alignments")) ||
					(poLine.startsWith("Sequences producing High-scoring Segment Pairs")) ||
					(poLine.startsWith(" ***** No hits found ******")) ||
					(poLine.startsWith("-------- "))) {
				this.emitRawOutput(oBuffer);
				this.emitHeaderIds();

				oAtts.clear();
				this.endElement(new QName(this,this.prefix("Header")));

				if (poLine.startsWith(" ***** No hits found ******")) {

					oAtts.clear();
					this.startElement(new QName(this,this.prefix("Trailer")),
							(Attributes) oAtts);
					this.changeState(IN_TRAILER);
					oBuffer.clear();
					return;
				}

				//change state
				this.changeState(IN_SUMMARY);
				oAtts.clear();
				this.startElement(new QName(this,this.prefix("Summary")),
						(Attributes)oAtts);

				//eat a blank line if there is one...
				//          this.interpret(oLine);
				//read next line

				try {
					poLine = oContents.readLine();
				} catch (java.io.IOException x) {
					System.err.println(x.getMessage());
					System.err.println("File read interrupted");
				} // end try/catch

				if (poLine.trim().equals("")) {
					//System.out.println("BLANK LINE");
				} else {
					//recursively interpret it...
					this.interpret(poLine);
				}

				return;
			} //end check start of summary

			//Currently doesn't handle output that starts
			//with a detail section i.e. no summary.
			//This is a BUG/FEATURE.

			oBuffer.add(poLine);
		} //end if inHeader state

		//Deal with summary state
		if (iState == IN_SUMMARY) {
			//check to see if end of summary
			//has been reached...

			//(signal is a blank line for NCBIBlast and Wu-Blast, 
			//(signal is either a blank line or 
			//Signal is \\End of List for GCG

			// HMMR has a longer summary section to include the
			// domain summary and the check for a blank line
			// will prematurely end the summary section so
			// skip this check for HMMR


			int iProgram = oVersion.getProgram();

			if (iProgram == BlastLikeVersionSupport.HMMER) {

				if (poLine.trim().equals("")) {
					return; // skip
				}

				//HMMER-specific
				if (poLine.startsWith("Parsed for domains:")) {
					//signifies domain summary info
					//		System.err.println( "Last-->" + oAfterHmmr + "<--" );
					return;
				}
			} else if ((poLine.trim().equals("")) ||
					(poLine.trim().startsWith("[no more scores")) ||
					(poLine.trim().startsWith("\\")) ) {
				//Can't change state, because we still want to
				//check for start of detail... 

				//Forgive non-standard white-space
				//between end of summary and start of detail
				if (!tDoneSummary) {
					tDoneSummary = true;
					this.endElement(new QName(this,this.prefix("Summary")));
				}
				return; //return before attempting to parse Summary Line
			}

			if (poLine.startsWith(">")) {
				//signifies start of detail section
				this.hitsSectionReached(poLine);
				return;
			}
			//need to check that we've end of summary
			//'cos could be spurious data between end of
			//summary and start of detail e.g. multi-line WARNINGs
			//at end of Summary section in Wu-Blast.
			if (!tDoneSummary) {
				this.parseSummaryLine(poLine);
			}
			return;
		}
		//State set to this when parsing of Hits finished
		if (iState == FINISHED_HITS) {
			//check end of detail section

			this.endElement(new QName(this,this.prefix("Detail")));
			oAtts.clear();
			this.startElement(new QName(this,this.prefix("Trailer")),
					(Attributes)oAtts);

			//change state to Trailer and initialse Buffer
			this.changeState(IN_TRAILER);
			oBuffer.clear();
			return;

		} //end if finishedHists

		if (iState == IN_TRAILER) {
			oBuffer.add(poLine);
		}
	}
	/**
	 * This method is called when a line indicating that
	 * that a new BlastLikeDataSet has been reached.
	 *
	 * NB This class deals NCBI-BLAST WU-BLAST and HMMER:
	 *
	 *  o flavours of NCBI Blast (e.g. blastn, blastp etc)
	 *  o flavours of WU Blast (e.g. blastn, blastp etc)
	 *
	 * When this method is called, the line will look something line:
	 *
	 * BLASTN 2.0.11 [Jan-20-2000] for NCBI Blast
	 *
	 * The above would be parsed to program ncbi-blastn, and version number
	 * 2.0.11
	 *
	 * @param poLine     -
	 */
	private void onNewBlastDataSet(String poLine) throws SAXException {
		if (!oVersion.isSupported()) {
			throw (new SAXException(
					"Program " + 
					oVersion.getProgramString() + " Version " +
					oVersion.getVersionString() +
			" is not supported by the biojava blast-like parsing framework"));

		}

		oAtts.clear();
		oAttQName.setQName("program");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA",oVersion.getProgramString());

		oAttQName.setQName("version");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA",oVersion.getVersionString());

		this.startElement(new QName(this,this.prefix("BlastLikeDataSet")),
				(Attributes)oAtts);

		//change state to reflect the fact we're in the Header
		iState = IN_HEADER;
		oBuffer.clear();

		oAtts.clear();
		this.startElement(new QName(this,this.prefix("Header")),
				(Attributes)oAtts);
	}
	/**
	 * Describe constructor here.
	 *
	 * @param oArrayList     -
	 */
	private void emitRawOutput(ArrayList poList) throws SAXException {

		oAtts.clear();
		oAttQName.setQName("xml:space");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"NMTOKEN","preserve");
		this.startElement(new QName (this,this.prefix("RawOutput")),
				(Attributes)oAtts);

		//Cycle through ArrayList and send character array data to
		//XML ContentHandler.

		int iTmpListSize = poList.size();
		for (int i = 0; i < iTmpListSize; i++) {
			//System.out.println("RAW:" + (String)poList.get(i));
			aoChars = ((String)poList.get(i)).toCharArray();
			this.characters(aoLineSeparator,0,1);
			this.characters(aoChars,0,aoChars.length);
		}

		this.endElement(new QName(this,this.prefix("RawOutput")));
	}

	/**
	 * Parses a summary line.  Actually parsing functionality
	 * is delegated to static method of a reusable Helper Class.
	 *
	 * For NCBI Blast, a summary line looks something like:
	 *
	 * U00431 Mus musculus HMG-1 mRNA, complete cds.        353  7e-95
	 *
	 * UO0431 is typically a database accession code
	 * Mus musculs.. is a description of the hit (this is optional)
	 *
	 * 353 is a bit score
	 *
	 * 7e-95 is an E Value
	 *
	 */
	private void parseSummaryLine(String poLine) throws SAXException {

		//Also remember in header add query attribute and database type?

		//Should split this out into different implementations
		//according to program and version

		oSummaryLineHelper.parse(poLine,oMap,oVersion);
		//Eat a line for GCG, which has two lines for a summary.
		//The first line becomes the beginning of the description
		if (iVer == BlastLikeVersionSupport.GCG_BLASTN) {
			try {
				poLine = oContents.readLine();
				oSummaryLineHelper.parse(poLine,oMap,oVersion);
			} catch (IOException x) {
				System.out.println(x.getMessage());
				System.out.println("GCG File read interrupted");
			} // end try/catch
		}

		/* Note have to do this check a hmmer can have empty summary
		 * sections and oMap.keySet().toArray will return an array
		 * of size 1 with a null first element if the map is empty.
		 */
		if ( oMap.size() == 0 ) {
			return;
		}

		aoKeys = (String[])(oMap.keySet().toArray(aoArrayType));

		oAtts.clear();

		//output contents of Map as either elements or
		//attribute lists

		//two passes first get HitsSummary element started
		for (int i = 0; i < aoKeys.length; i++) {

			if ((aoKeys[i].equals("hitId")) || 
					(aoKeys[i].equals("hitDescription")) ) {
				//do nothing
			} else {

				oAttQName.setQName(aoKeys[i]);
				oAtts.addAttribute(oAttQName.getURI(),
						oAttQName.getLocalName(),
						oAttQName.getQName(),
						"CDATA",(String)oMap.get(aoKeys[i]));
			}
		}

		this.startElement(new QName(this,this.prefix("HitSummary")),
				(Attributes)oAtts);

		for (int i = 0; i < aoKeys.length; i++) {

			if (aoKeys[i].equals("hitId")) {
				oAtts.clear();
				oAttQName.setQName("id");
				oAtts.addAttribute(oAttQName.getURI(),
						oAttQName.getLocalName(),
						oAttQName.getQName(),
						"CDATA",(String)oMap.get(aoKeys[i]));

				oAttQName.setQName("metaData");
				oAtts.addAttribute(oAttQName.getURI(),
						oAttQName.getLocalName(),
						oAttQName.getQName(),
						"CDATA","none");
				this.startElement(new QName(this,this.prefix("HitId")),
						(Attributes)oAtts);
				this.endElement(new QName(this,this.prefix("HitId")));

			} else if (aoKeys[i].equals("hitDescription")) {
				oAtts.clear();
				this.startElement(new QName(this,this.prefix("HitDescription")),
						(Attributes)oAtts);
				aoChars = ((String)oMap.get(aoKeys[i])).toCharArray();
				this.characters(aoChars,0,aoChars.length);
				this.endElement(new QName(this,this.prefix("HitDescription")));
			}
			//System.out.print(aoKeys[i] + ": ");
			//System.out.println(oMap.get(aoKeys[i]));
		}
		this.endElement(new QName(this,this.prefix("HitSummary")));

		oMap.clear();
	}

	/**
	 * Fires the QueryId and DatabaseId events.
	 */
	private void emitHeaderIds() throws SAXException {
		// Set attributes for QueryId element
		oAtts.clear();
		oAttQName.setQName("id");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA", oQueryId);
		oAttQName.setQName("metaData");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA", "none");

		oAttQName.setQName("queryLength");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA", oQueryLength);


		// Fire the QueryId element
		this.startElement(new QName(this,this.prefix("QueryId")),
				(Attributes) oAtts);
		this.endElement(new QName(this,this.prefix("QueryId")));

		// Set attributes for DatabaseId element
		oAtts.clear();
		oAttQName.setQName("id");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA", oDatabaseId);
		oAttQName.setQName("metaData");
		oAtts.addAttribute(oAttQName.getURI(),
				oAttQName.getLocalName(),
				oAttQName.getQName(),
				"CDATA", "none");

		// Fire the DatabaseId element
		this.startElement(new QName(this,this.prefix("DatabaseId")),
				(Attributes) oAtts);
		this.endElement(new QName(this,this.prefix("DatabaseId")));
	}

	/**
	 * From the specified line, hand over
	 * parsing of stream to a helperclass
	 *
	 * @param poLine     -
	 * @exception SAXException thrown if
	 * @exception  thrown if
	 */
	private void hitsSectionReached(String poLine)
	throws SAXException {

		//Parse Contents stream up to end of Hits
		oHits.setContentHandler(oHandler);
		//this returns when end of hits section reached...

		oAtts.clear();
		this.startElement(new QName(this,this.prefix("Detail")),
				(Attributes)oAtts);

		int iProgram = oVersion.getProgram();

		if ((iProgram == BlastLikeVersionSupport.NCBI_BLASTN) ||
				(iProgram == BlastLikeVersionSupport.NCBI_BLASTX) ||
				(iProgram == BlastLikeVersionSupport.NCBI_BLASTP) ||
				(iProgram == BlastLikeVersionSupport.NCBI_TBLASTN) ||
				(iProgram == BlastLikeVersionSupport.NCBI_TBLASTX)) {
			ArrayList hitEndSymbols=new ArrayList();
			hitEndSymbols.add("Database");
			hitEndSymbols.add("TBLAST");
			hitEndSymbols.add("BLAST");
			oHits.parse(oContents,poLine,hitEndSymbols);
		}

		if ((iProgram == BlastLikeVersionSupport.WU_BLASTN) ||
				(iProgram == BlastLikeVersionSupport.WU_BLASTX) ||
				(iProgram == BlastLikeVersionSupport.WU_BLASTP) ||
				(iProgram == BlastLikeVersionSupport.WU_TBLASTN) ||
				(iProgram == BlastLikeVersionSupport.WU_TBLASTX)) {
			ArrayList hitEndSymbols=new ArrayList();
			hitEndSymbols.add("Parameters:");
			oHits.parse(oContents,poLine,hitEndSymbols);
		}

		//Same as NCBI, left here for organization I suppose
		if (iProgram == BlastLikeVersionSupport.GCG_BLASTN) {
			ArrayList hitEndSymbols=new ArrayList();
			hitEndSymbols.add("Database:");
			oHits.parse(oContents,poLine,hitEndSymbols);
		}

		this.changeState(FINISHED_HITS);
	}

	/**
	 * Checks to see if a line of Blast like output 
	 * represents the start of a new BlastLike data set.
	 * Current supports:
	 *   o ncbi-blast all variants
	 *   o wu-blast all variants
	 *
	 * @param poLine     A String representation of the line
	 * @return boolean   true if it is a new dataset, false if not.
	 */
	private boolean checkNewBlastLikeDataSet(String poLine) {
		if ((poLine.startsWith("BLAST")) ||(poLine.startsWith("TBLAST"))) {
			return true;
		} else {
			return false;
		}
	}
	/**
	 * Choose particular implementations of Part objects
	 * that comprise the aggregate object according
	 * to version/type of program
	 *
	 * @param nil    -
	 */
	private void choosePartImplementations() throws SAXException {

		iVer = oVersion.getProgram();

		if ((iVer == BlastLikeVersionSupport.NCBI_BLASTN) ||
				(iVer == BlastLikeVersionSupport.NCBI_BLASTX) ||
				(iVer == BlastLikeVersionSupport.NCBI_BLASTP) ||
				(iVer == BlastLikeVersionSupport.NCBI_TBLASTN) ||
				(iVer == BlastLikeVersionSupport.NCBI_TBLASTX)) {
			oSummaryLineHelper =
				(SummaryLineHelperIF) new NcbiBlastSummaryLineHelper();
			return;
		}

		if ((iVer == BlastLikeVersionSupport.WU_BLASTN) ||
				(iVer == BlastLikeVersionSupport.WU_BLASTX) ||
				(iVer == BlastLikeVersionSupport.WU_BLASTP) ||
				(iVer == BlastLikeVersionSupport.WU_TBLASTN) ||
				(iVer == BlastLikeVersionSupport.WU_TBLASTX)) {
			oSummaryLineHelper = 
				(SummaryLineHelperIF) new WuBlastSummaryLineHelper();
			return;
		}

		if (iVer == BlastLikeVersionSupport.HMMER) {
			oSummaryLineHelper = 
				(SummaryLineHelperIF) new HmmerSummaryLineHelper();
			return;
		}

		if (iVer == BlastLikeVersionSupport.GCG_BLASTN) {
			oSummaryLineHelper = 
				(SummaryLineHelperIF) new GCGBlastSummaryLineHelper();
			return;
		}

		//If get to here, no implementation available

		//Exception to help SAX parser writers track down
		//problems writing software to support the framework
		throw (new SAXException("Could not choose a suitable implementation of the ".
				concat("SummaryLineHelperIF for program ").
				concat(oVersion.getProgramString()).
				concat(" version ").
				concat(oVersion.getVersionString())));
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy