// com.hfg.bio.seq.pattern.BruteForceMatcher
// Part of com.hfg, an XML, HTML, SVG, and bioinformatics utility library.
package com.hfg.bio.seq.pattern;
import java.util.ArrayList;
import java.util.List;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.collection.CollectionUtil;
//------------------------------------------------------------------------------
/**
Brute force string matching algorithm. Use when mismatches are allowed
and there is ambiguity within positions or range specifications.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
class BruteForceMatcher implements SeqPatternMatcher
{
   // Display name of this matching algorithm.
   private static final String sAlgorithmName = "Brute Force";

   // Pattern to search for. Assigned once in the constructor.
   private final SeqPattern mPattern;

   // Sequence that is searched. Assigned once in the constructor.
   private final S mTarget;

   // Optional default search region (a private copy of the constructor argument); may be null.
   private final SeqLocation mSeqLocation;

   // Pattern positions obtained from mPattern.getPrositePatternPositions() at construction time.
   private final List mPatternPositions;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 * Creates a matcher with no default search region.
 * Delegates to the three-argument constructor with a null location.
 *
 * @param inPattern the pattern to search for
 * @param inTarget  the sequence to search
 */
protected BruteForceMatcher(SeqPattern inPattern, S inTarget)
{
   this(inPattern, inTarget, null);
}

//---------------------------------------------------------------------------
/**
 * Creates a matcher with an optional default search region.
 *
 * @param inPattern     the pattern to search for; its Prosite pattern positions are
 *                      fetched once here and kept for matching
 * @param inTarget      the sequence to search
 * @param inSeqLocation default region to search, or null for none; a clone is stored
 *                      so later changes to the caller's object do not affect this matcher
 */
protected BruteForceMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation)
{
   SeqLocation locationCopy = null;
   if (inSeqLocation != null)
   {
      locationCopy = inSeqLocation.clone();
   }

   mPattern = inPattern;
   mTarget = inTarget;
   mSeqLocation = locationCopy;
   mPatternPositions = inPattern.getPrositePatternPositions();
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 * Searches the target for the pattern and returns a single match.
 * <p>
 * The region searched is {@code inSeqLocation} if given, otherwise the location
 * supplied at construction, otherwise the full target (1 .. target length).
 * When the pattern is not case-sensitive the target text is upper-cased first.
 * If the pattern is restricted to the sequence start, only a region starting at
 * position 1 is evaluated; otherwise each index of the region is tried in order
 * and scanning stops at the first index that yields a match.
 *
 * @param inSeqLocation region of the target to search, or null to use the defaults
 * @return the first element of the match list produced by the evaluation,
 *         or null if nothing matched
 */
public T find(SeqLocation inSeqLocation)
{
   SeqLocation searchRegion = (inSeqLocation != null ? inSeqLocation : mSeqLocation);
   if (null == searchRegion)
   {
      searchRegion = new SeqLocation(1, getTarget().length());
   }

   String targetString = mTarget.getSubSequence(searchRegion);
   if (! getPattern().isCaseSensitive())
   {
      targetString = targetString.toUpperCase();
   }

   // Typed as List<T> so that the element can be returned as T without a cast.
   List<T> matches = null;

   if (getPattern().isRestrictedToSeqStart())
   {
      // A start-anchored pattern can only match when the region begins at residue 1.
      if (searchRegion.getStart().equals(1))
      {
         matches = eval(targetString, 0, 0, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps());
      }
   }
   else
   {
      for (int i = 0; i < targetString.length(); i++)
      {
         // The offset (region start - 1) converts string indices into sequence locations.
         matches = eval(targetString, i, searchRegion.getStart() - 1, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps());
         if (matches != null)
         {
            break;
         }
      }
   }

   return CollectionUtil.hasValues(matches) ? matches.get(0) : null;
}
//---------------------------------------------------------------------------
/**
 * Finds every match using the default search region: the location supplied at
 * construction, or the full target when none was given.
 *
 * @return the list of matches, or null if nothing matched
 */
@Override
public List findAll()
{
   return findAll(null);
}

//---------------------------------------------------------------------------
/**
 * Finds every match of the pattern within a region of the target.
 * <p>
 * The search repeatedly locates the next match, then restarts one residue past
 * the start of the last match found, until no further match is found or the
 * remaining region is empty.
 * <p>
 * The region is always a private copy. Neither the caller's {@code inSeqLocation}
 * nor the location stored in this matcher is modified, so repeated calls search
 * the same region.
 *
 * @param inSeqLocation region of the target to search, or null to use the location
 *                      supplied at construction (or the full target if there is none)
 * @return the list of matches, or null if nothing matched
 */
@Override
public List findAll(SeqLocation inSeqLocation)
{
   // The window's start is advanced below, so it must never alias mSeqLocation
   // or the caller's object.
   SeqLocation window;
   if (inSeqLocation != null)
   {
      window = inSeqLocation.clone();
   }
   else if (mSeqLocation != null)
   {
      window = mSeqLocation.clone();
   }
   else
   {
      window = new SeqLocation(1, getTarget().length());
   }

   List<T> allMatches = null;

   while (true)
   {
      String targetString = mTarget.getSubSequence(window);
      if (! getPattern().isCaseSensitive())
      {
         targetString = targetString.toUpperCase();
      }

      List<T> matches = null;

      if (getPattern().isRestrictedToSeqStart())
      {
         // A start-anchored pattern can only match while the window begins at residue 1.
         if (window.getStart().equals(1))
         {
            matches = eval(targetString, 0, 0, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps());
         }
      }
      else
      {
         for (int i = 0; i < targetString.length(); i++)
         {
            matches = eval(targetString, i, window.getStart() - 1, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps());
            if (matches != null)
            {
               break;
            }
         }
      }

      if (! CollectionUtil.hasValues(matches))
      {
         break;
      }

      if (null == allMatches)
      {
         allMatches = matches;
      }
      else
      {
         allMatches.addAll(matches);
      }

      // Resume one residue after the start of the last match found.
      window.setStart(matches.get(matches.size() - 1).getSeqLocation().getStart() + 1);
      if (window.length() <= 0)
      {
         break;
      }
   }

   return allMatches;
}
//---------------------------------------------------------------------------
/**
 * Returns the target sequence that was passed to the constructor.
 *
 * @return the sequence this matcher searches
 */
public S getTarget()
{
return mTarget;
}
//---------------------------------------------------------------------------
/**
 * Returns the default search region held by this matcher.
 * The stored instance itself is returned (not a copy).
 *
 * @return the stored location, or null if none was supplied at construction
 */
public SeqLocation getSeqLocation()
{
return mSeqLocation;
}
//###########################################################################
// PROTECTED METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 * Returns the pattern that was passed to the constructor.
 *
 * @return the pattern this matcher searches for
 */
protected SeqPattern getPattern()
{
return mPattern;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 * Tries to match the pattern positions, starting at inPatternIndex, against
 * inTargetString, starting at inTargetIndex. A position with a count range is
 * handled by consuming its minimum count and then recursing once per additional
 * allowed count to match the remainder of the pattern.
 *
 * @param inTargetString  the text being searched
 * @param inTargetIndex   0-based index in inTargetString at which this call starts matching
 * @param inTargetOffset  value added to string indices when building the match location
 * @param inPatternIndex  index into the pattern positions at which this call starts
 * @param inRangeSize     count-range residues already consumed by enclosing calls; subtracted
 *                        (together with inPatternIndex) from inTargetIndex to compute the match start
 * @param inMismatchCount mismatches accumulated before this call
 * @param inMaxMismatches maximum number of mismatches tolerated
 * @param inIgnoreGaps    whether a '-' in the target triggers the gap-skipping step
 * @return the matches found, or null if there were none
 */
private List eval(String inTargetString, int inTargetIndex, int inTargetOffset, int inPatternIndex, int inRangeSize,
int inMismatchCount, int inMaxMismatches, boolean inIgnoreGaps)
{
List matches = null;
int mismatchCount = inMismatchCount;
int targetIndex = inTargetIndex;
int patternPositionIndex = inPatternIndex;
PrositePatternPosition position;
// Walk the pattern and the target in step until either one is exhausted.
while (patternPositionIndex < mPatternPositions.size()
&& targetIndex < inTargetString.length())
{
position = mPatternPositions.get(patternPositionIndex);
char targetResidue = inTargetString.charAt(targetIndex);
// NOTE(review): targetIndex is advanced past the gap here, but targetResidue is not
// re-read and the loop does not restart, so the '-' is still compared against the
// position below and targetIndex is incremented again at the bottom of the loop.
// Confirm this is the intended gap handling.
if ('-' == targetResidue
&& inIgnoreGaps)
{
targetIndex++;
}
if (position.hasCountRange())
{
// Repeat the position until we get to the min count for the range
for (int i = 0; i < position.getCountRange().getStart() - 1; i++)
{
if (! position.matchesResidue(targetResidue))
{
mismatchCount++;
if (mismatchCount > inMaxMismatches
|| position.mismatchNotAllowed())
{
break;
}
}
// Stop consuming when the last character of the target has been reached.
if (targetIndex == inTargetString.length() - 1)
{
break;
}
targetResidue = inTargetString.charAt(++targetIndex);
}
// NOTE(review): only the mismatch budget is re-checked here; a break above caused by
// position.mismatchNotAllowed() alone falls through to the loop below.
if (mismatchCount > inMaxMismatches)
{
break;
}
// One iteration per value in the count range: test the current residue, then
// recurse to match the remaining pattern positions from the next target index.
for (int i = 0; i < position.getCountRange().length(); i++)
{
if (! position.matchesResidue(targetResidue))
{
mismatchCount++;
if (mismatchCount > inMaxMismatches
|| position.mismatchNotAllowed())
{
break;
}
}
// The range size passed down records how many residues beyond one this position
// has consumed, so the innermost call can compute the overall match start.
List innerMatches = eval(inTargetString, targetIndex + 1, inTargetOffset,
patternPositionIndex + 1, inRangeSize + i + position.getCountRange().getStart() - 1, mismatchCount, inMaxMismatches, inIgnoreGaps);
if (CollectionUtil.hasValues(innerMatches))
{
if (null == matches)
{
matches = innerMatches;
}
else
{
matches.addAll(innerMatches);
}
if (position.useLazyMatchMode())
{
// When lazy matching an interval quantifier (range), stop at the first range value that produces a match
break;
}
}
if (targetIndex == inTargetString.length() - 1)
{
// We're at the end of the string.
break;
}
targetResidue = inTargetString.charAt(++targetIndex);
}
// We just finished recursively searching. No need to continue the loop.
break;
}
else if (! position.matchesResidue(targetResidue))
{
mismatchCount++;
if (mismatchCount > inMaxMismatches
|| position.mismatchNotAllowed())
{
break;
}
}
targetIndex++;
patternPositionIndex++;
}
// Record a match only when no recursive matches were collected, the mismatch budget
// holds, and either every pattern position was consumed or only the final position
// remains and it is a ONE_OF position that matches '>' (presumably the PROSITE
// end-of-sequence anchor -- confirm).
if (null == matches
&& mismatchCount <= inMaxMismatches
&& (patternPositionIndex == mPatternPositions.size()
|| (patternPositionIndex == mPatternPositions.size() - 1
&& mPatternPositions.get(mPatternPositions.size() - 1).getType().equals(PrositePatternPositionType.ONE_OF)
&& mPatternPositions.get(mPatternPositions.size() - 1).matchesResidue('>'))))
{
// Start: the index where the outermost call began, shifted by inTargetOffset and made 1-based.
// End: the current target index shifted by inTargetOffset.
SeqLocation matchLoc = new SeqLocation(inTargetOffset + inTargetIndex - inPatternIndex - inRangeSize + 1, inTargetOffset + targetIndex);
// Was the pattern restricted to the end of the sequence?
if (! getPattern().isRestrictedToSeqEnd()
|| matchLoc.getEnd().equals(getTarget().length()))
{
T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
match.setNumMismatches(mismatchCount);
matches = new ArrayList<>(1);
matches.add(match);
}
}
return matches;
}
}