All downloads are free. The search and download functionality uses the official Maven repository.

com.hfg.bio.seq.pattern.BYPMatcher Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringUtil;
import com.hfg.util.io.StreamUtil;

//------------------------------------------------------------------------------
/**
 Baeza-Yates, Perleberg string matching algorithm. Use when mismatches are allowed
 but there is no ambiguity within positions or range specifications.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ class BYPMatcher implements SeqPatternMatcher { private static String sAlgorithmName = "Baeza-Yates, Perleberg"; private static int ALPHABET_SIZE = 256; private static int MOD256 = 0xff; private SeqPattern mPattern; private String mBYPPattern; private S mTarget; private SeqLocation mSeqLocation; private int[] mMismatchCount = new int[ALPHABET_SIZE]; private List mOffsetList; //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- private BYPMatcher() { } //--------------------------------------------------------------------------- protected BYPMatcher(SeqPattern inPattern, S inTarget) { this(inPattern, inTarget, null); } //--------------------------------------------------------------------------- protected BYPMatcher(SeqPattern inPattern, S inTarget, SeqLocation 
inSeqLocation) { mPattern = inPattern; mTarget = inTarget; mSeqLocation = inSeqLocation; verifyPatternCompatibility(); setup(); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public T find(SeqLocation inSeqLocation) { T match = null; SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation); if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } int patternLength = getBYPPattern().length(); int maxMismatches = getPattern().getMaxMismatches(); Reader reader = null; try { reader = getTarget().getSubSequenceReader(seqLocation); for (int i = 0; i < seqLocation.length(); i++) { int residue = reader.read(); IndexNode indexNode = mOffsetList.get(residue); int offset; if ((offset = indexNode.getOffset()) >= 0) { mMismatchCount[(i + offset)&MOD256]--; if (indexNode.nextIndex() >= 0) { for (indexNode = mOffsetList.get(indexNode.nextIndex()); indexNode != null; indexNode = mOffsetList.get(indexNode.nextIndex())) { mMismatchCount[(i + indexNode.getOffset()) & MOD256]--; if (indexNode.nextIndex() < 0) { break; } } } } if (mMismatchCount[i&MOD256] <= maxMismatches && mMismatchCount[i&MOD256] > 0) { // System.out.println(String.format("Match in position %d with %d mismatches", i - patternLength + 1, mMismatchCount[i&MOD256])); SeqLocation matchLoc = new SeqLocation(seqLocation.getStart() + i - patternLength + 1, seqLocation.getStart() + i); if (matchLoc.getStart() >= seqLocation.getStart() && matchLoc.length() == patternLength) { match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); match.setNumMismatches(mMismatchCount[i&MOD256]); break; } } mMismatchCount[i&MOD256] = patternLength; } } catch (IOException e) { throw new ProgrammingException(e); } finally { StreamUtil.close(reader); } return 
match; } //--------------------------------------------------------------------------- public S getTarget() { return mTarget; } //########################################################################### // PROTECTED METHODS //########################################################################### //--------------------------------------------------------------------------- protected SeqPattern getPattern() { return mPattern; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private String getBYPPattern() { if (null == mBYPPattern) { String prositePattern = getPattern().getPrositePattern(); mBYPPattern = StringUtil.replaceAllRegexp(prositePattern, "[\\-\\.]", ""); } return mBYPPattern; } //--------------------------------------------------------------------------- private void verifyPatternCompatibility() { if (getPattern().containsPositionAmbiguity()) { throw new ProgrammingException("Patterns containing ambiguity cannot by used by the " + sAlgorithmName + " algorithm!"); } if (getPattern().containsRanges()) { throw new ProgrammingException("Patterns containing ranges cannot by used by the " + sAlgorithmName + " algorithm!"); } } //--------------------------------------------------------------------------- private void setup() { String bypPattern = getBYPPattern(); int patternLength = bypPattern.length(); mOffsetList = new ArrayList<>(ALPHABET_SIZE); for (int i = 0; i < ALPHABET_SIZE; i++) { mOffsetList.add(new BYPMatcher(). 
new IndexNode()); mMismatchCount[i] = patternLength; } for (int i = 0, j = 128; i < patternLength; i++) { mMismatchCount[i] = ALPHABET_SIZE; char patternChar = bypPattern.charAt(i); IndexNode indexNode = mOffsetList.get((int)patternChar); if (indexNode.getOffset() == -1) { indexNode.setOffset(patternLength - i - 1); } else { short nextIndex = indexNode.nextIndex(); indexNode.setNextIndex((short)j++); indexNode = mOffsetList.get(indexNode.nextIndex()); indexNode.setOffset(patternLength - i - 1); indexNode.setNextIndex(nextIndex); } } mMismatchCount[patternLength - 1] = patternLength; } private class IndexNode { int mOffset = -1; short mNext = -1; public int getOffset() { return mOffset; } public void setOffset(int inValue) { mOffset = inValue; } public short nextIndex() { return mNext; } public void setNextIndex(short inValue) { mNext = inValue; } public String toString() { return "Offset: " + mOffset + "; Next: " + mNext; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy