com.hfg.bio.seq.pattern.BYPMatcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.pattern;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringUtil;
import com.hfg.util.io.StreamUtil;
//------------------------------------------------------------------------------
/**
Baeza-Yates, Perleberg string matching algorithm. Use when mismatches are allowed
but there is no ambiguity within positions or range specifications.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// Approximate (mismatch-tolerant) matcher for fixed-length patterns.
//
// How it works:
//   * setup() builds a table that maps each residue code (0-255) to the list of
//     "offsets" at which that residue occurs in the pattern, where
//     offset = (pattern length - position - 1), i.e. the distance to the pattern end.
//   * find() streams the target once. For the residue at index i, every offset o
//     registered for it decrements the counter of the window that ends at index i + o.
//     Each counter starts at the pattern length, so when index i is reached the counter
//     for slot i holds the number of mismatched positions of the window ending at i.
//   * The counters live in a 256-slot ring buffer (index & MOD256), which is why the
//     pattern may not be longer than 256 residues.
//
// NOTE(review): the type names S and T are used below but are not declared anywhere in
// the visible source; presumably they are type parameters of this class (and of
// SeqPatternMatcher / SeqPattern) that were lost before this file was captured - TODO confirm.
class BYPMatcher implements SeqPatternMatcher
{
   private static final String sAlgorithmName = "Baeza-Yates, Perleberg";

   // Number of distinct residue codes that get their own head entry in the offset table.
   // Also the size of the mismatch counter ring buffer.
   private static final int ALPHABET_SIZE = 256;

   // Bit mask used to wrap an index into the 256-slot mismatch counter ring buffer.
   private static final int MOD256 = 0xff;

   private final SeqPattern  mPattern;
   private String            mBYPPattern;
   private final S           mTarget;
   private final SeqLocation mSeqLocation;

   // Ring buffer of mismatch counters. Slot (e & MOD256) belongs to the window ending at index e.
   private final int[]       mMismatchCount = new int[ALPHABET_SIZE];

   // Entries [0, ALPHABET_SIZE) are the head nodes, indexed by residue code.
   // Entries from ALPHABET_SIZE onwards are chain nodes for residues that occur
   // more than once in the pattern.
   private List<IndexNode>   mOffsetList;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a matcher with no default search region.
    @param inPattern the pattern to search for
    @param inTarget  the sequence to search in
    */
   protected BYPMatcher(SeqPattern inPattern, S inTarget)
   {
      this(inPattern, inTarget, null);
   }

   //---------------------------------------------------------------------------
   /**
    Creates a matcher and precomputes the offset table for the pattern.
    @param inPattern     the pattern to search for
    @param inTarget      the sequence to search in
    @param inSeqLocation the default region to search when find() is given null; may be null
    @throws ProgrammingException if the pattern contains position ambiguity or ranges,
            is empty, is longer than 256 residues, or contains a character code above 255
    */
   protected BYPMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation)
   {
      mPattern = inPattern;
      mTarget = inTarget;
      mSeqLocation = inSeqLocation;

      verifyPatternCompatibility();
      setup();
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Finds the first occurrence of the pattern within a region of the target that has
    no more than the pattern's maximum number of mismatches. Exact occurrences
    (zero mismatches) are reported as well.
    <p>
    Every call starts from a clean counter state, so the result of one call does not
    depend on any earlier call.
    </p>
    @param inSeqLocation the region to search. If null, the location given at construction
                         is used; if that is also null, the whole target (1 to its length) is searched.
    @return the first match found, or null if the region contains no match
    @throws ProgrammingException if reading the target fails with an IOException
    */
   public T find(SeqLocation inSeqLocation)
   {
      T match = null;

      SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation);
      if (null == seqLocation)
      {
         seqLocation = new SeqLocation(1, getTarget().length());
      }

      int patternLength = getBYPPattern().length();
      int maxMismatches = getPattern().getMaxMismatches();

      // Counters are mutated while scanning and are left dirty when a match ends the scan early.
      // Start every search from a clean state so that repeated calls do not interfere.
      resetMismatchCounts(patternLength);

      Reader reader = null;
      try
      {
         reader = getTarget().getSubSequenceReader(seqLocation);

         for (int i = 0; i < seqLocation.length(); i++)
         {
            int residue = reader.read();
            if (residue < 0)
            {
               // The reader ran out of data before the end of the requested region.
               break;
            }

            recordResidue(residue, i);

            int slot = i & MOD256;
            int numMismatches = mMismatchCount[slot];

            // A full pattern-length window can only end at index (patternLength - 1) or later.
            if (i >= patternLength - 1
                && numMismatches <= maxMismatches)
            {
               SeqLocation matchLoc = new SeqLocation(seqLocation.getStart() + i - patternLength + 1, seqLocation.getStart() + i);
               if (matchLoc.getStart() >= seqLocation.getStart()
                   && matchLoc.length() == patternLength)
               {
                  match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
                  match.setNumMismatches(numMismatches);
                  break;
               }
            }

            // Hand the slot over to the window that ends 256 positions later.
            mMismatchCount[slot] = patternLength;
         }
      }
      catch (IOException e)
      {
         throw new ProgrammingException(e);
      }
      finally
      {
         StreamUtil.close(reader);
      }

      return match;
   }

   //---------------------------------------------------------------------------
   /**
    @return the sequence this matcher searches
    */
   public S getTarget()
   {
      return mTarget;
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    @return the pattern this matcher searches for
    */
   protected SeqPattern getPattern()
   {
      return mPattern;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Returns the pattern's PROSITE string with all '-' and '.' characters removed.
   // The value is computed on first use and cached.
   private String getBYPPattern()
   {
      if (null == mBYPPattern)
      {
         String prositePattern = getPattern().getPrositePattern();
         mBYPPattern = StringUtil.replaceAllRegexp(prositePattern, "[\\-\\.]", "");
      }

      return mBYPPattern;
   }

   //---------------------------------------------------------------------------
   // Rejects patterns that this algorithm cannot handle (position ambiguity or ranges).
   private void verifyPatternCompatibility()
   {
      if (getPattern().containsPositionAmbiguity())
      {
         throw new ProgrammingException("Patterns containing ambiguity cannot by used by the " + sAlgorithmName + " algorithm!");
      }

      if (getPattern().containsRanges())
      {
         throw new ProgrammingException("Patterns containing ranges cannot by used by the " + sAlgorithmName + " algorithm!");
      }
   }

   //---------------------------------------------------------------------------
   // Builds the residue -> pattern offsets table used by find().
   private void setup()
   {
      String bypPattern = getBYPPattern();
      int patternLength = bypPattern.length();

      // The counter ring buffer has ALPHABET_SIZE slots, so longer patterns would make
      // windows overwrite each other. An empty pattern has no window to report.
      if (patternLength < 1
          || patternLength > ALPHABET_SIZE)
      {
         throw new ProgrammingException("The " + sAlgorithmName + " algorithm requires a pattern length between 1 and "
                                        + ALPHABET_SIZE + " but the pattern length is " + patternLength + "!");
      }

      // One head node per residue code, plus room for one chain node per repeated pattern residue.
      mOffsetList = new ArrayList<>(ALPHABET_SIZE + patternLength);
      for (int i = 0; i < ALPHABET_SIZE; i++)
      {
         mOffsetList.add(new IndexNode());
      }

      for (int i = 0; i < patternLength; i++)
      {
         char patternChar = bypPattern.charAt(i);
         if (patternChar >= ALPHABET_SIZE)
         {
            throw new ProgrammingException("Pattern character code " + (int) patternChar + " at index " + i
                                           + " cannot be used by the " + sAlgorithmName + " algorithm!");
         }

         // Distance from this pattern position to the end of the pattern.
         int offset = patternLength - i - 1;

         IndexNode headNode = mOffsetList.get((int) patternChar);
         if (headNode.getOffset() == -1)
         {
            // First occurrence of this residue in the pattern.
            headNode.setOffset(offset);
         }
         else
         {
            // Repeated residue: insert a new chain node directly behind the head node.
            // Chain nodes are appended after the head entries so that they can never
            // be mistaken for the head node of a residue.
            IndexNode chainNode = new IndexNode();
            chainNode.setOffset(offset);
            chainNode.setNextIndex(headNode.nextIndex());

            headNode.setNextIndex((short) mOffsetList.size());
            mOffsetList.add(chainNode);
         }
      }
   }

   //---------------------------------------------------------------------------
   // Sets every mismatch counter to the pattern length (i.e. "nothing matched yet").
   private void resetMismatchCounts(int inPatternLength)
   {
      for (int i = 0; i < mMismatchCount.length; i++)
      {
         mMismatchCount[i] = inPatternLength;
      }
   }

   //---------------------------------------------------------------------------
   // Accounts for the target residue read at index inIndex: for every position at which
   // the residue occurs in the pattern, the window that would align that position with
   // inIndex has one mismatch less.
   private void recordResidue(int inResidue, int inIndex)
   {
      if (inResidue >= ALPHABET_SIZE)
      {
         // setup() only accepts pattern characters below ALPHABET_SIZE, so this residue matches nothing.
         return;
      }

      IndexNode indexNode = mOffsetList.get(inResidue);
      if (indexNode.getOffset() < 0)
      {
         // The residue does not occur in the pattern.
         return;
      }

      while (true)
      {
         mMismatchCount[(inIndex + indexNode.getOffset()) & MOD256]--;

         int nextIndex = indexNode.nextIndex();
         if (nextIndex < 0)
         {
            break;
         }

         indexNode = mOffsetList.get(nextIndex);
      }
   }

   //###########################################################################
   // INNER CLASS
   //###########################################################################

   // One entry of the offset table: a pattern offset plus the table index of the next
   // entry for the same residue. A value of -1 means "not set" for both fields.
   private static final class IndexNode
   {
      private int   mOffset = -1;
      private short mNext   = -1;

      //------------------------------------------------------------------------
      public int getOffset()
      {
         return mOffset;
      }

      //------------------------------------------------------------------------
      public void setOffset(int inValue)
      {
         mOffset = inValue;
      }

      //------------------------------------------------------------------------
      public short nextIndex()
      {
         return mNext;
      }

      //------------------------------------------------------------------------
      public void setNextIndex(short inValue)
      {
         mNext = inValue;
      }

      //------------------------------------------------------------------------
      @Override
      public String toString()
      {
         return "Offset: " + mOffset + "; Next: " + mNext;
      }
   }
}