org.biojava.bio.program.ssbind.BlastLikeSearchBuilder Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.program.ssbind;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.biojava.bio.Annotation;
import org.biojava.bio.BioException;
import org.biojava.bio.alignment.SimpleAlignment;
import org.biojava.bio.search.SearchBuilder;
import org.biojava.bio.search.SeqSimilaritySearchHit;
import org.biojava.bio.search.SeqSimilaritySearchResult;
import org.biojava.bio.search.SeqSimilaritySearchSubHit;
import org.biojava.bio.search.SimpleSeqSimilaritySearchHit;
import org.biojava.bio.search.SimpleSeqSimilaritySearchResult;
import org.biojava.bio.search.SimpleSeqSimilaritySearchSubHit;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.seq.StrandedFeature.Strand;
import org.biojava.bio.seq.db.SequenceDB;
import org.biojava.bio.seq.db.SequenceDBInstallation;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.utils.SmallMap;
/**
* BlastLikeSearchBuilder
will create
* SeqSimilaritySearchResult
s from SAX events via a
* SeqSimilarityAdapter
. The SAX events should describe
* elements conforming to the BioJava BlastLikeDataSetCollection
* DTD. Suitable sources are BlastLikeSAXParser
or
* FastaSearchSAXParser
. The result objects are placed in
* the List
supplied to the constructor.
*
* The start/end/strand of SeqSimilaritySearchHit
s are
* calculated from their constituent
* SeqSimilaritySearchSubHit
s as follows:
*
*
* - The query start is the lowest query start coordinate of its
* sub-hits, regardless of strand
* - The query end is the highest query end coordinate of its sub-hits,
* regardless of strand
* - The hit start is the lowest hit start coordinate of its sub-hits,
* regardless of strand
* - The hit end is the highest hit end coordinate of its sub-hits,
* regardless of strand
* - The query strand is null for protein sequences. Otherwise it is
* equal to the query strand of its sub-hits if they are all on the
* same strand, or
StrandedFeature.UNKNOWN
if the sub-hits
* have mixed query strands
* - The hit strand is null for protein sequences. Otherwise it is
* equal to the hit strand of its sub-hits if they are all on the same
* strand, or
StrandedFeature.UNKNOWN
if the sub-hits have
* mixed hit strands
*
*
*
* This class has special meanings for particular keys: if you want to
* adapt this class for another parser, you will need to be aware of
* this. These originate from and are fully described in the
* BlastLikeDataSetCollection DTD.
*
*
*
* Key
* Meaning
*
*
* program
* either this value or the subjectSequenceType value must be set. This can take values
* acceptable to AlphabetResolver. These are BLASTN, BLASTP, BLASTX, TBLASTN,
* TBLASTX, DNA and PROTEIN.
*
*
*
* databaseId
* Identifier of database searched (in SequenceDBInstallation).
*
*
* subjectSequenceType
* type of sequence that hit is. Can be DNA or PROTEIN.
*
*
* subjectId
* id of sequence that is hit
*
*
* subjectDescription
* description of sequence that is hit
*
*
* queryStrand
* Strandedness of query in alignment. Takes values of "plus" and "minus"
*
*
* subjectStrand
* Strandedness of query in alignment. Takes values of "plus" and "minus"
*
*
* queryFrame
* self-evident
*
*
* subjectFrame
* self-evident
*
*
* querySequenceStart
* self-evident
*
*
* querySequenceEnd
* self-evident
*
*
* subjectSequenceStart
* self-evident
*
*
* subjectSequenceEnd
* self-evident
*
*
* score
* self-evident
*
*
* expectValue
* self-evident
*
*
* pValue
* self-evident
*
*
*
* @author Keith James
* @author Greg Cox
* @since 1.2
*/
public class BlastLikeSearchBuilder implements SearchBuilder
{
// Supplier of instances of searched databases
private SequenceDBInstallation subjectDBs;
// Holder for all query sequences
private SequenceDB querySeqHolder;
// The ID of the database searched
private String databaseID;
// The ID of the query sequence
private String queryID;
// Hit and Result annotation
private Annotation resultAnnotation;
// Data holders for search result properties
private Map resultPreAnnotation;
private Map searchParameters;
private Map hitData;
private Map subHitData;
private SymbolTokenization tokenParser;
private List hits;
private List subHits;
private SeqSimilaritySearchSubHit [] subs;
// Flag indicating whether there are more results in the stream
private boolean moreSearchesAvailable = false;
// List to accept all results in the stream
private List target;
/**
* Creates a new BlastLikeSearchBuilder
which will
* instantiate results into the List
target.
*
* @param target a List
.
*/
public BlastLikeSearchBuilder(List target)
{
this.target = target;
resultPreAnnotation = new HashMap();
searchParameters = new HashMap();
hitData = new HashMap();
subHitData = new HashMap();
}
/**
* Creates a new BlastLikeSearchBuilder
which will
* instantiate results into the List
target.
*
* @param target a List
.
* @param querySeqHolder a SequenceDB
of query
* sequences.
* @param subjectDBs a SequenceDBInstallation
of
* databases searched.
*/
public BlastLikeSearchBuilder(List target,
SequenceDB querySeqHolder,
SequenceDBInstallation subjectDBs)
{
this(target);
this.querySeqHolder = querySeqHolder;
this.subjectDBs = subjectDBs;
}
public SeqSimilaritySearchResult makeSearchResult()
throws BioException
{
if (querySeqHolder == null)
throw new BioException("Running BlastLikeSearchBuilder with null query SequenceDB");
if (subjectDBs == null)
throw new BioException("Running BlastLikeSearchBuilder with null subject SequenceDB installation");
Sequence query = querySeqHolder.getSequence(queryID);
if (query == null)
throw new BioException("Failed to retrieve query sequence from queryDB using ID '"
+ queryID
+ "' (sequence was null)");
SequenceDB subjectDB = (SequenceDB) subjectDBs.getSequenceDB(databaseID);
if (subjectDB == null)
throw new BioException("Failed to retrieve database from installation using ID '"
+ databaseID
+ "' (database was null)");
return new SimpleSeqSimilaritySearchResult(query,
subjectDB,
searchParameters,
hits,
resultAnnotation);
}
/**
* setQuerySeqHolder
sets the query sequence holder
* to a specific database.
*
* @param querySeqHolder a SequenceDB
containing the
* query sequence(s).
*/
public void setQuerySeqHolder(SequenceDB querySeqHolder)
{
this.querySeqHolder = querySeqHolder;
}
/**
* setSubjectDBInstallation
sets the subject database
* holder to a specific installation.
*
* @param subjectDBs a SequenceDBInstallation
* containing the subject database(s)
*/
public void setSubjectDBInstallation(SequenceDBInstallation subjectDBs)
{
this.subjectDBs = subjectDBs;
}
public void setQueryID(String queryID)
{
this.queryID = queryID;
addSearchProperty("queryId", queryID);
}
public void setDatabaseID(String databaseID)
{
this.databaseID = databaseID;
addSearchProperty("databaseId", databaseID);
}
public boolean getMoreSearches()
{
return moreSearchesAvailable;
}
public void setMoreSearches(boolean value)
{
moreSearchesAvailable = value;
}
public void startSearch()
{
hits = new ArrayList();
}
public void endSearch()
{
try
{
resultAnnotation = AnnotationFactory.makeAnnotation(resultPreAnnotation);
target.add(makeSearchResult());
}
catch (BioException be)
{
System.err.println("Failed to build SeqSimilaritySearchResult:");
be.printStackTrace();
}
}
public void startHeader()
{
resultPreAnnotation.clear();
searchParameters.clear();
}
public void endHeader() { }
public void startHit()
{
hitData.clear();
subHits = new ArrayList();
}
public void endHit()
{
hits.add(makeHit());
}
public void startSubHit()
{
subHitData.clear();
}
public void endSubHit()
{
try
{
subHits.add(makeSubHit());
}
catch (BioException be)
{
be.printStackTrace();
}
}
public void addSearchProperty(Object key, Object value)
{
resultPreAnnotation.put(key, value);
}
public void addHitProperty(Object key, Object value)
{
hitData.put(key, value);
}
public void addSubHitProperty(Object key, Object value)
{
subHitData.put(key, value);
}
/**
* makeHit
creates a new hit. The hit's strand data
* is the same as that of the highest-scoring sub-hit. The hit's
* start/end data are the same as the extent of the sub-hits on
* that strand.
*
* @return a SeqSimilaritySearchHit
.
*/
private SeqSimilaritySearchHit makeHit()
{
double sc = Double.NaN;
double ev = Double.NaN;
double pv = Double.NaN;
subs = (SeqSimilaritySearchSubHit []) subHits
.toArray(new SeqSimilaritySearchSubHit [subHits.size() - 1]);
// Sort to get highest score
Arrays.sort(subs, SeqSimilaritySearchSubHit.byScore);
sc = subs[subs.length - 1].getScore();
ev = subs[subs.length - 1].getEValue();
pv = subs[subs.length - 1].getPValue();
// Check for any mixed or null strands
boolean mixQueryStrand = false;
boolean mixSubjectStrand = false;
boolean nullQueryStrand = false;
boolean nullSubjectStrand = false;
// Start with index 0 value (arbitrarily)
Strand qStrand = subs[0].getQueryStrand();
Strand sStrand = subs[0].getSubjectStrand();
int qStart = subs[0].getQueryStart();
int qEnd = subs[0].getQueryEnd();
int sStart = subs[0].getSubjectStart();
int sEnd = subs[0].getSubjectEnd();
if (qStrand == null)
nullQueryStrand = true;
if (sStrand == null)
nullSubjectStrand = true;
// Compare all other values
for (int i = subs.length; --i > 0;)
{
Strand qS = subs[i].getQueryStrand();
Strand sS = subs[i].getSubjectStrand();
if (qS == null)
nullQueryStrand = true;
if (sS == null)
nullSubjectStrand = true;
if (qS != qStrand)
mixQueryStrand = true;
if (sS != sStrand)
mixSubjectStrand = true;
qStart = Math.min(qStart, subs[i].getQueryStart());
qEnd = Math.max(qEnd, subs[i].getQueryEnd());
sStart = Math.min(sStart, subs[i].getSubjectStart());
sEnd = Math.max(sEnd, subs[i].getSubjectEnd());
}
// Note any mixed strand hits as unknown strand
if (mixQueryStrand)
qStrand = StrandedFeature.UNKNOWN;
if (mixSubjectStrand)
sStrand = StrandedFeature.UNKNOWN;
// Any null strands from protein sequences
if (nullQueryStrand)
qStrand = null;
if (nullSubjectStrand)
sStrand = null;
String subjectID = (String) hitData.get("subjectId");
return new SimpleSeqSimilaritySearchHit(sc, ev, pv,
qStart, qEnd, qStrand,
sStart, sEnd, sStrand,
subjectID,
AnnotationFactory.makeAnnotation(hitData),
subHits);
}
/**
* makeSubHit
creates a new sub-hit.
*
* @return a SeqSimilaritySearchSubHit
.
*
* @exception BioException if an error occurs.
*/
private SeqSimilaritySearchSubHit makeSubHit() throws BioException
{
// Try to get a valid TokenParser
if (tokenParser == null)
{
String identifier;
// Try explicit sequence type first
if (subHitData.containsKey("subjectSequenceType"))
identifier = (String) subHitData.get("subjectSequenceType");
// Otherwise try to resolve from the program name (only
// works for Blast)
else if (resultPreAnnotation.containsKey("program"))
identifier = (String) resultPreAnnotation.get("program");
else
throw new BioException("Failed to determine sequence type");
FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
tokenParser = alpha.getTokenization("token");
}
// BLASTP output has the strands set null (protein sequences)
Strand qStrand = null;
Strand sStrand = null;
// Override where an explicit strand is given (FASTA DNA,
// BLASTN)
if (subHitData.containsKey("queryStrand"))
if (subHitData.get("queryStrand").equals("plus"))
qStrand = StrandedFeature.POSITIVE;
else
qStrand = StrandedFeature.NEGATIVE;
if (subHitData.containsKey("subjectStrand"))
if (subHitData.get("subjectStrand").equals("plus"))
sStrand = StrandedFeature.POSITIVE;
else
sStrand = StrandedFeature.NEGATIVE;
// Override where a frame is given as this contains strand
// information (BLASTX for query, TBLASTN for hit, TBLASTX for
// both)
if (subHitData.containsKey("queryFrame"))
if (((String) subHitData.get("queryFrame")).startsWith("plus"))
qStrand = StrandedFeature.POSITIVE;
else
qStrand = StrandedFeature.NEGATIVE;
if (subHitData.containsKey("subjectFrame"))
if (((String) subHitData.get("subjectFrame")).startsWith("plus"))
sStrand = StrandedFeature.POSITIVE;
else
sStrand = StrandedFeature.NEGATIVE;
// Get start/end
int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
int qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
int sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));
// The start/end coordinates from BioJava XML don't follow the
// BioJava paradigm of start < end, with orientation given by
// the strand property. Rather, they present start/end as
// displayed in BLAST output, with the coordinates being
// inverted on the reverse strand. We account for this here.
if (qStrand == StrandedFeature.NEGATIVE)
{
int swap = qStart;
qStart = qEnd;
qEnd = swap;
}
if (sStrand == StrandedFeature.NEGATIVE)
{
int swap = sStart;
sStart = sEnd;
sEnd = swap;
}
// Get scores
double sc = Double.NaN;
double ev = Double.NaN;
double pv = Double.NaN;
if (subHitData.containsKey("score"))
sc = Double.parseDouble((String) subHitData.get("score"));
if (subHitData.containsKey("expectValue"))
{
String val = (String) subHitData.get("expectValue");
// Blast sometimes uses invalid formatting such as 'e-156'
// rather than '1e-156'
if (val.startsWith("e"))
ev = Double.parseDouble("1" + val);
else
ev = Double.parseDouble(val);
}
if (subHitData.containsKey("pValue"))
pv = Double.parseDouble((String) subHitData.get("pValue"));
Map labelMap = new SmallMap();
// Note that the following is removing the raw sequences
StringBuffer tokenBuffer = new StringBuffer(1024);
tokenBuffer.append((String) subHitData.remove("querySequence"));
labelMap.put(SeqSimilaritySearchSubHit.QUERY_LABEL,
new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
tokenBuffer = new StringBuffer(1024);
tokenBuffer.append((String) subHitData.remove("subjectSequence"));
labelMap.put(hitData.get("subjectId"),
new SimpleSymbolList(tokenParser, tokenBuffer.substring(0)));
return new SimpleSeqSimilaritySearchSubHit(sc, ev, pv,
qStart, qEnd, qStrand,
sStart, sEnd, sStrand,
new SimpleAlignment(labelMap),
AnnotationFactory.makeAnnotation(subHitData));
}
}