![JAR search and dependency download from the Maven repository](/logo.png)
org.biojava.bio.program.sax.NeedleAlignmentSAXParser Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.program.sax;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* A SAX2 parser for dealing with a pairwise sequence
* alignment as produced by Needle (EMBOSS) outputting .needle format.
* For example,
*
TGFBG4_frame2
TGFB1 136 NTSELREAVPEPVLLSRAELRLLRLKLKVEQHVELYQKYSNNSWR 180
TGFBG4_frame2
TGFB1 181 YLSNRLLAPSDSPEWLSFDVTGVVRQWLSRG.............. 211
|
TGFBG4_frame2 1 LGELHSQTGFPLATPT 16
TGFB1 212 GEIEGFRLSAHCSCDSRDNTLQVDIN................... 237
||||||||||||||||||||||||||
TGFBG4_frame2 17 GEIEGFRLSAHCSCDSRDNTLQVDINGEACFPGHAQLRVCVCVFP 61
TGFB1 238 ........................GFTTGRRGDLATIHGMNRPFL 258
|||||||||||||||||||||
TGFBG4_frame2 62 SAPRPTYLSLECVCMSPIPLPHKAGFTTGRRGDLATIHGMNRPFL 106
TGFB1 259 LLMATPLERAQHLQSSRHRRALDTNYCFSSTEKNCCVRQLYIDFR 303
|||||||||||||||||||||||||||| :
TGFBG4_frame2 107 LLMATPLERAQHLQSSRHRRALDTNYCFRALP............. 138
TGFB1 304 KDLGWKWIHEPKGYHANFCLGPCPYIWSLDTQYSKVLALYNQHNP 348
||: | |
TGFBG4_frame2 139 ...GWR....PSRLGAL 148
TGFB1 349 GASAAPCCVPQALEPLPIVYYVGRKPKVEQLSNMIVRSCKCS 390
TGFBG4_frame2
*
*
* Please note, this parser reads the whole alignment in to
* core memory and thus does not scale to work with very large
* alignments on low-end hardware.
*
* Please also note that this class has not been tested with
* many versions of Needle.
*
* WARNING: The parser currently assumes an alignment line length of
* 45 residues. This restriction is easy to remove, but that has not
* been done yet.
*
* Copyright © 2001,2002 Cambridge Antibody Technology.
*
*
* Primary author -
* - Simon Brocklehurst (CAT)
*
* Other authors -
* - Neil Benn (CAT)
* - Lawrence Bower (CAT)
* - Derek Crockford (CAT)
* - Tim Dilks (CAT)
* - Colin Hardman (CAT)
* - Stuart Johnston (CAT)
*
*
* @author Cambridge Antibody Technology (CAT)
* @author Greg Cox
* @version 1.0
*
*/
class NeedleAlignmentSAXParser extends AbstractNativeAppSAXParser {

    /** State before any input line has been interpreted. */
    private static final int STARTUP = 0;
    /** State once the SequenceCollection element has been opened. */
    private static final int IN_STREAM = 1;

    /** Zero-based column at which residues start on an alignment line. */
    private static final int SEQ_START_COLUMN = 25;
    /** Zero-based column just past the last residue on an alignment line. */
    private static final int SEQ_END_COLUMN = 70;
    /** Number of alignment columns contributed by every alignment line. */
    private static final int SEQ_LINE_WIDTH =
        SEQ_END_COLUMN - SEQ_START_COLUMN;
    /** Character used for gaps in the emitted alignment. */
    private static final char GAP = '-';

    /** Line prefixes that mark summary lines rather than alignment lines. */
    private static final String[] IRRELEVANT_PREFIXES = {
        "Global: ",
        "Score: ",
        "%id = ",
        "Overall %id = "
    };

    private AttributesImpl oAtts = new AttributesImpl();
    private QName oAttQName = new QName(this);
    /** Maps sequence name (String) to its aligned residues (StringBuffer). */
    private HashMap oAlignment = new HashMap();
    /** Sequence names in order of first appearance in the input. */
    private ArrayList oSeqNameList = new ArrayList();

    /**
     * Initialises internal state.
     * Sets namespace prefix to "biojava".
     */
    public NeedleAlignmentSAXParser() {
        iState = STARTUP;
        this.setNamespacePrefix("biojava");
    }

    /**
     * Reads the whole alignment from the given source into memory and
     * then emits it as one SequenceCollection element containing one
     * Sequence element per sequence name, in order of first appearance.
     *
     * State left over from a previous call is discarded first, so one
     * instance can parse several sources in turn. Empty input produces
     * an empty, but balanced, SequenceCollection element.
     *
     * @param poSource the source to read the alignment from
     * @exception IOException if reading or closing the stream fails;
     *            no partial alignment is emitted in that case
     * @exception SAXException if emitting an element fails
     */
    public void parse(InputSource poSource)
        throws IOException, SAXException {

        // Forget anything a previous parse left behind.
        iState = STARTUP;
        oAlignment.clear();
        oSeqNameList.clear();

        // Use method from superclass
        BufferedReader oContents = this.getContentStream(poSource);
        try {
            String oLine = oContents.readLine();
            while (oLine != null) {
                this.interpret(oLine);
                oLine = oContents.readLine();
            }

            // No line was read at all: open the collection here so the
            // endElement below has a matching startElement.
            if (iState == STARTUP) {
                this.openCollection();
            }

            // Alignment is complete - remove gap padding shared by the
            // ends of both sequences, then emit.
            this.nibbleEndGaps();

            for (int i = 0; i < oSeqNameList.size(); i++) {
                String oName = (String) oSeqNameList.get(i);
                this.emitSequence(oName, oAlignment.get(oName).toString());
            }

            this.endElement(new QName(this,
                                      this.prefix("SequenceCollection")));
        } finally {
            // Release the stream on failure as well as on success.
            oContents.close();
        }
    }

    /**
     * Handles one input line: opens the SequenceCollection element if
     * that has not happened yet, then stores the line if it is an
     * alignment line.
     *
     * @param poLine the line just read
     * @exception SAXException if opening the collection element fails
     */
    private void interpret(String poLine) throws SAXException {
        if (iState == STARTUP) {
            this.openCollection();
        }
        if ((iState == IN_STREAM) && this.lineIsRelevant(poLine)) {
            // build alignment in memory
            this.appendToAlignment(poLine);
        }
    }

    /**
     * Emits the start of the SequenceCollection element, without
     * attributes, and switches to the IN_STREAM state.
     *
     * @exception SAXException if emitting the element fails
     */
    private void openCollection() throws SAXException {
        oAtts.clear();
        this.startElement(
            new QName(this, this.prefix("SequenceCollection")),
            (Attributes) oAtts);
        this.changeState(IN_STREAM);
    }

    /**
     * Parses an alignment line and appends its residues to the
     * sequence named by the first whitespace-delimited token.
     *
     * Residues are taken from the fixed columns SEQ_START_COLUMN
     * (inclusive) to SEQ_END_COLUMN (exclusive), with '.' and ' '
     * converted to the gap character. A line that ends before
     * SEQ_END_COLUMN is padded with gaps, so every line contributes
     * exactly SEQ_LINE_WIDTH columns and the sequences stay in step.
     *
     * @param poLine an alignment line, as accepted by lineIsRelevant
     */
    private void appendToAlignment(String poLine) {
        // First token is sequence name. A relevant line is never blank,
        // so at least one token exists.
        StringTokenizer oSt = new StringTokenizer(poLine, "\n\t\r ");
        String oName = oSt.nextToken();

        StringBuffer oSegment = new StringBuffer(SEQ_LINE_WIDTH);
        if (poLine.length() > SEQ_START_COLUMN) {
            int iEnd = Math.min(poLine.length(), SEQ_END_COLUMN);
            oSegment.append(poLine.substring(SEQ_START_COLUMN, iEnd)
                                  .replace('.', GAP)
                                  .replace(' ', GAP));
        }
        // Short (e.g. right-trimmed) lines: missing columns are gaps.
        while (oSegment.length() < SEQ_LINE_WIDTH) {
            oSegment.append(GAP);
        }

        StringBuffer oSoFar = (StringBuffer) oAlignment.get(oName);
        if (oSoFar == null) {
            // First occurrence of this sequence: add to the alignment
            // and to the ordered list of sequence names.
            oAlignment.put(oName, oSegment);
            oSeqNameList.add(oName);
        } else {
            // Building up an existing sequence.
            oSoFar.append(oSegment.toString());
        }
    }

    /**
     * Only interested in lines that are part of the alignment.
     * Blank lines, lines starting with a space (the consensus line)
     * and lines starting with one of IRRELEVANT_PREFIXES are not.
     *
     * @param poLine the line to classify
     * @return true if the line is treated as part of the alignment,
     *         false if not
     */
    private boolean lineIsRelevant(String poLine) {
        if (poLine.trim().equals("")) {
            return false;
        }
        if (poLine.startsWith(" ")) {
            return false;
        }
        for (int i = 0; i < IRRELEVANT_PREFIXES.length; i++) {
            if (poLine.startsWith(IRRELEVANT_PREFIXES[i])) {
                return false;
            }
        }
        // if here, line is part of alignment
        return true;
    }

    /**
     * Emits a Sequence element carrying a sequenceName attribute and
     * the aligned residues as character content.
     *
     * @param poSequenceName value of the sequenceName attribute
     * @param poSequence the aligned residues to emit
     * @exception SAXException if emitting the element fails
     */
    private void emitSequence(String poSequenceName, String poSequence)
        throws SAXException {

        oAtts.clear();
        oAttQName.setQName("sequenceName");
        oAtts.addAttribute(oAttQName.getURI(),
                           oAttQName.getLocalName(),
                           oAttQName.getQName(),
                           "CDATA", poSequenceName);

        QName oElement = new QName(this, this.prefix("Sequence"));
        this.startElement(oElement, (Attributes) oAtts);

        char[] aoResidues = poSequence.toCharArray();
        this.characters(aoResidues, 0, aoResidues.length);

        this.endElement(new QName(this, this.prefix("Sequence")));
    }

    /**
     * After the alignment is parsed, the ends of the sequences are
     * padded out with gaps to the end of the last line:
     *
     * e.g.
     * GCACACA---------------------------------
     * CAC-------------------------------------
     *
     * This method removes the trailing columns that are gaps in both
     * of the first two sequences, to produce
     *
     * GCACACA
     * CAC----
     *
     * Does nothing when fewer than two sequences were read. If the two
     * sequences differ in length, the shorter one is first padded with
     * gaps so that both cover the same columns.
     */
    private void nibbleEndGaps() {
        if (oSeqNameList.size() < 2) {
            return;
        }

        StringBuffer oSeq1 =
            (StringBuffer) oAlignment.get(oSeqNameList.get(0));
        StringBuffer oSeq2 =
            (StringBuffer) oAlignment.get(oSeqNameList.get(1));

        // Bring both sequences to a common length before comparing.
        while (oSeq1.length() < oSeq2.length()) {
            oSeq1.append(GAP);
        }
        while (oSeq2.length() < oSeq1.length()) {
            oSeq2.append(GAP);
        }

        // Walk back from the end; the bound also covers the case where
        // both sequences consist of gaps only.
        int iKeep = oSeq1.length();
        while ((iKeep > 0) &&
               (oSeq1.charAt(iKeep - 1) == GAP) &&
               (oSeq2.charAt(iKeep - 1) == GAP)) {
            iKeep--;
        }

        oSeq1.setLength(iKeep);
        oSeq2.setLength(iKeep);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy