com.hfg.bio.seq.pattern.BYPMatcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.pattern;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringUtil;
import com.hfg.util.io.StreamUtil;
//------------------------------------------------------------------------------
/**
Baeza-Yates, Perleberg string matching algorithm. Use when mismatches are allowed
but there is no ambiguity within positions or range specifications.
See: Baeza-Yates, R. A., & Perleberg, C. H. (1992).
"Fast and Practical Approximate String Matching."
In Annual Symposium on Combinatorial Pattern Matching (pp. 1–9). Springer.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// NOTE(review): the type parameters <S, T> were reconstructed from how S and T are used in
// this class (S is the type of the target sequence, T the type of match returned by find()).
// The bounds and the SeqPattern<S, T> / SeqPatternMatcher<S, T> arities are assumptions
// -- confirm them against the declarations of those types.
class BYPMatcher<S extends BioSequence, T extends SeqPatternMatch> implements SeqPatternMatcher<S, T>
{
   private static final String sAlgorithmName = "Baeza-Yates, Perleberg";

   // Residues are used directly as indexes into the per-character offset table, and the
   // mismatch counters form a circular buffer of the same size (hence the 0xff mask).
   private static final int ALPHABET_SIZE = 256;
   private static final int MOD256        = 0xff;

   private SeqPattern<S, T> mPattern;
   private String           mBYPPattern;
   private S                mTarget;
   private SeqLocation      mSeqLocation;

   // Circular buffer: slot (p & MOD256) holds the number of mismatches still outstanding
   // for the pattern alignment that ends at scan position p.
   private int[]            mMismatchCount = new int[ALPHABET_SIZE];

   // One chain head per character code. A head whose offset is -1 means the character
   // does not occur in the pattern.
   private List<IndexNode>  mCharacterOffsetList;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   private BYPMatcher()
   {
   }

   //---------------------------------------------------------------------------
   /**
    Creates a matcher for the specified pattern and target with no default search region.

    @param inPattern the pattern to search for
    @param inTarget  the sequence to search in
    */
   protected BYPMatcher(SeqPattern<S, T> inPattern, S inTarget)
   {
      this(inPattern, inTarget, null);
   }

   //---------------------------------------------------------------------------
   /**
    Creates a matcher for the specified pattern and target.

    @param inPattern     the pattern to search for
    @param inTarget      the sequence to search in
    @param inSeqLocation default region to search when find() is given null; it is cloned,
                         and may itself be null
    @throws ProgrammingException if the pattern contains position ambiguity or ranges, is
                         empty or longer than the counter buffer, or contains a character
                         outside the offset table
    */
   protected BYPMatcher(SeqPattern<S, T> inPattern, S inTarget, SeqLocation inSeqLocation)
   {
      mPattern     = inPattern;
      mTarget      = inTarget;
      mSeqLocation = inSeqLocation != null ? inSeqLocation.clone() : null;

      verifyPatternCompatibility();
      setup();
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Scans the target for the first occurrence of the pattern with no more than the
    pattern's maximum number of mismatches.

    @param inSeqLocation region of the target to scan. If null, the location given at
                         construction is used; if that is also null, the whole target
                         (1 through its length) is scanned.
    @return the first match found, with its mismatch count set, or null if there is none
    @throws ProgrammingException wrapping any IOException raised while reading the target
    */
   public T find(SeqLocation inSeqLocation)
   {
      T match = null;

      SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation);
      if (null == seqLocation)
      {
         seqLocation = new SeqLocation(1, getTarget().length());
      }

      int     patternLength = getBYPPattern().length();
      int     maxMismatches = getPattern().getMaxMismatches();
      boolean caseSensitive = mPattern.isCaseSensitive();

      Reader reader = null;
      try
      {
         reader = getTarget().getSubSequenceReader(seqLocation);

         // Every alignment starts out with all of its positions unmatched.
         for (int i = 0; i < ALPHABET_SIZE; i++)
         {
            mMismatchCount[i] = patternLength;
         }

         for (int i = 0; i < seqLocation.length(); i++)
         {
            int residue = reader.read();
            if (residue < 0)
            {
               // The reader ran dry before the declared length; there is nothing left to scan.
               break;
            }

            if (! caseSensitive)
            {
               residue = Character.toUpperCase((char) residue);
            }

            // A residue outside the table cannot equal any pattern character, so it
            // counts as a mismatch for every alignment that covers it.
            if (residue < ALPHABET_SIZE)
            {
               IndexNode indexNode = mCharacterOffsetList.get(residue);
               if (indexNode.getOffset() >= 0)
               {
                  // Each occurrence of this residue in the pattern is a hit for the
                  // alignment ending 'offset' positions further along the target.
                  for (; indexNode != null; indexNode = indexNode.nextIndexNode())
                  {
                     mMismatchCount[(i + indexNode.getOffset()) & MOD256]--;
                  }
               }
            }

            int slot          = i & MOD256;
            int numMismatches = mMismatchCount[slot];

            // Alignments ending before position (patternLength - 1) would begin in front
            // of the search region, so they can never be reported.
            if (i >= patternLength - 1
                && numMismatches <= maxMismatches
                && numMismatches >= 0)
            {
               SeqLocation matchLoc = new SeqLocation(seqLocation.getStart() + i - patternLength + 1, seqLocation.getStart() + i);
               if (matchLoc.getStart() >= seqLocation.getStart()
                   && matchLoc.length() == patternLength)
               {
                  match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
                  match.setNumMismatches(numMismatches);
                  break;
               }
            }

            // Recycle the slot for the alignment that ends ALPHABET_SIZE positions later.
            mMismatchCount[slot] = patternLength;
         }
      }
      catch (IOException e)
      {
         throw new ProgrammingException(e);
      }
      finally
      {
         StreamUtil.close(reader);
      }

      return match;
   }

   //---------------------------------------------------------------------------
   /**
    @return the sequence being searched
    */
   public S getTarget()
   {
      return mTarget;
   }

   //---------------------------------------------------------------------------
   /**
    @return the default search region given at construction (may be null)
    */
   public SeqLocation getSeqLocation()
   {
      return mSeqLocation;
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    @return the pattern being searched for
    */
   protected SeqPattern<S, T> getPattern()
   {
      return mPattern;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Returns the pattern's PROSITE string with all '-' and '.' characters removed.
   // The result is computed on first use and cached.
   private String getBYPPattern()
   {
      if (null == mBYPPattern)
      {
         String prositePattern = getPattern().getPrositePattern();
         mBYPPattern = StringUtil.replaceAllRegexp(prositePattern, "[\\-\\.]", "");
      }

      return mBYPPattern;
   }

   //---------------------------------------------------------------------------
   // Rejects patterns this implementation cannot handle: those with position ambiguity
   // or ranges, and those whose length does not fit the circular counter buffer.
   private void verifyPatternCompatibility()
   {
      if (getPattern().containsPositionAmbiguity())
      {
         throw new ProgrammingException("Patterns containing ambiguity cannot be used by the " + sAlgorithmName + " algorithm!");
      }

      if (getPattern().containsRanges())
      {
         throw new ProgrammingException("Patterns containing ranges cannot be used by the " + sAlgorithmName + " algorithm!");
      }

      // Pattern offsets run from 0 to (length - 1), so up to 'length' alignments are being
      // counted at once and each needs its own slot in the ALPHABET_SIZE-entry buffer.
      int patternLength = getBYPPattern().length();
      if (patternLength < 1
          || patternLength > ALPHABET_SIZE)
      {
         throw new ProgrammingException("The " + sAlgorithmName + " algorithm requires a pattern of 1 to "
                                        + ALPHABET_SIZE + " positions but this pattern has " + patternLength + "!");
      }
   }

   //---------------------------------------------------------------------------
   // Builds the per-character offset chains: for every character of the pattern, the
   // distances from each of its occurrences to the end of the pattern.
   // The mismatch counters are not touched here because find() re-initializes all of
   // them before every scan.
   private void setup()
   {
      String  bypPattern    = getBYPPattern();
      int     patternLength = bypPattern.length();
      boolean caseSensitive = mPattern.isCaseSensitive();

      mCharacterOffsetList = new ArrayList<>(ALPHABET_SIZE);
      for (int i = 0; i < ALPHABET_SIZE; i++)
      {
         mCharacterOffsetList.add(new IndexNode());
      }

      for (int i = 0; i < patternLength; i++)
      {
         char patternChar = bypPattern.charAt(i);
         if (! caseSensitive)
         {
            patternChar = Character.toUpperCase(patternChar);
         }

         if (patternChar >= ALPHABET_SIZE)
         {
            throw new ProgrammingException("Pattern character '" + patternChar + "' at position " + (i + 1)
                                           + " is outside the " + ALPHABET_SIZE + "-character alphabet supported by the "
                                           + sAlgorithmName + " algorithm!");
         }

         int offsetFromEnd = patternLength - i - 1;

         IndexNode indexNode = mCharacterOffsetList.get((int) patternChar);
         if (indexNode.getOffset() == -1)
         {
            // First occurrence of this character: fill in the chain head.
            indexNode.setOffset(offsetFromEnd);
         }
         else
         {
            // Repeated character: append a new node at the tail of its chain.
            while (indexNode.nextIndexNode() != null)
            {
               indexNode = indexNode.nextIndexNode();
            }

            IndexNode tailNode = new IndexNode();
            tailNode.setOffset(offsetFromEnd);
            indexNode.setNextIndexNode(tailNode);
         }
      }
   }

   //---------------------------------------------------------------------------
   // Singly-linked list node holding one pattern offset for a character.
   // An offset of -1 marks an unused chain head.
   private static final class IndexNode
   {
      private int       mOffset = -1;
      private IndexNode mNextIndexNode;

      public int getOffset()
      {
         return mOffset;
      }

      public void setOffset(int inValue)
      {
         mOffset = inValue;
      }

      public IndexNode nextIndexNode()
      {
         return mNextIndexNode;
      }

      public void setNextIndexNode(IndexNode inValue)
      {
         mNextIndexNode = inValue;
      }

      @Override
      public String toString()
      {
         return "Offset: " + mOffset;
      }
   }
}