All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.pattern.BYPMatcher Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringUtil;
import com.hfg.util.io.StreamUtil;

//------------------------------------------------------------------------------
/**
 Baeza-Yates, Perleberg string matching algorithm. Use when mismatches are allowed
 but there is no ambiguity within positions or range specifications.
 
See: Baeza-Yates, R. A., & Perleberg, C. H. (1992). "Fast and Practical Approximate String Matching." In Annual Symposium on Combinatorial Pattern Matching (pp. 1–9). Springer.
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ class BYPMatcher implements SeqPatternMatcher { private static String sAlgorithmName = "Baeza-Yates, Perleberg"; private static int ALPHABET_SIZE = 256; private static int MOD256 = 0xff; private SeqPattern mPattern; private String mBYPPattern; private S mTarget; private SeqLocation mSeqLocation; private int[] mMismatchCount = new int[ALPHABET_SIZE]; private List mCharacterOffsetList; //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- private BYPMatcher() { } //--------------------------------------------------------------------------- protected BYPMatcher(SeqPattern inPattern, S inTarget) { this(inPattern, inTarget, null); } //--------------------------------------------------------------------------- protected BYPMatcher(SeqPattern inPattern, S inTarget, SeqLocation 
inSeqLocation) { mPattern = inPattern; mTarget = inTarget; mSeqLocation = inSeqLocation != null ? inSeqLocation.clone() : null; verifyPatternCompatibility(); setup(); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public T find(SeqLocation inSeqLocation) { T match = null; SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation); if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } int patternLength = getBYPPattern().length(); int maxMismatches = getPattern().getMaxMismatches(); Reader reader = null; try { reader = getTarget().getSubSequenceReader(seqLocation); for (int i = 0; i < ALPHABET_SIZE; i++) { mMismatchCount[i] = patternLength; } for (int i = 0; i < seqLocation.length(); i++) { int residue = reader.read(); if (! mPattern.isCaseSensitive()) { residue = Character.toUpperCase((char)residue); } IndexNode indexNode = mCharacterOffsetList.get(residue); int offset; if ((offset = indexNode.getOffset()) >= 0) { mMismatchCount[(i + offset)&MOD256]--; while (indexNode.nextIndexNode() != null) { indexNode = indexNode.nextIndexNode(); mMismatchCount[(i + indexNode.getOffset()) & MOD256]--; } } int numMismatches = mMismatchCount[i&MOD256]; if (numMismatches <= maxMismatches && numMismatches >= 0) { // System.out.println(String.format("Match in position %d with %d mismatches", i - patternLength + 1, numMismatches)); SeqLocation matchLoc = new SeqLocation(seqLocation.getStart() + i - patternLength + 1, seqLocation.getStart() + i); if (matchLoc.getStart() >= seqLocation.getStart() && matchLoc.length() == patternLength) { match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); match.setNumMismatches(numMismatches); break; } } mMismatchCount[i&MOD256] = patternLength; } } catch (IOException 
e) { throw new ProgrammingException(e); } finally { StreamUtil.close(reader); } return match; } //--------------------------------------------------------------------------- public S getTarget() { return mTarget; } //--------------------------------------------------------------------------- public SeqLocation getSeqLocation() { return mSeqLocation; } //########################################################################### // PROTECTED METHODS //########################################################################### //--------------------------------------------------------------------------- protected SeqPattern getPattern() { return mPattern; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private String getBYPPattern() { if (null == mBYPPattern) { String prositePattern = getPattern().getPrositePattern(); mBYPPattern = StringUtil.replaceAllRegexp(prositePattern, "[\\-\\.]", ""); } return mBYPPattern; } //--------------------------------------------------------------------------- private void verifyPatternCompatibility() { if (getPattern().containsPositionAmbiguity()) { throw new ProgrammingException("Patterns containing ambiguity cannot by used by the " + sAlgorithmName + " algorithm!"); } if (getPattern().containsRanges()) { throw new ProgrammingException("Patterns containing ranges cannot by used by the " + sAlgorithmName + " algorithm!"); } } //--------------------------------------------------------------------------- private void setup() { String bypPattern = getBYPPattern(); int patternLength = bypPattern.length(); mCharacterOffsetList = new ArrayList<>(ALPHABET_SIZE); for (int i = 0; i < ALPHABET_SIZE; i++) { mCharacterOffsetList.add(new IndexNode()); mMismatchCount[i] = patternLength; } for (int i = 0, j = 128; i < patternLength; i++) { 
mMismatchCount[i] = ALPHABET_SIZE; char patternChar = bypPattern.charAt(i); if (! mPattern.isCaseSensitive()) { patternChar = Character.toUpperCase(patternChar); } IndexNode indexNode = mCharacterOffsetList.get((int)patternChar); if (indexNode.getOffset() == -1) { indexNode.setOffset(patternLength - i - 1); } else { IndexNode nextIndexNode = indexNode.nextIndexNode(); while (nextIndexNode != null) { indexNode = nextIndexNode; nextIndexNode = indexNode.nextIndexNode(); } nextIndexNode = new IndexNode(); nextIndexNode.setOffset(patternLength - i - 1); indexNode.setNextIndexNode(nextIndexNode); } } mMismatchCount[patternLength - 1] = patternLength; } private class IndexNode { int mOffset = -1; IndexNode mNextIndexNode; public int getOffset() { return mOffset; } public void setOffset(int inValue) { mOffset = inValue; } public IndexNode nextIndexNode() { return mNextIndexNode; } public void setNextIndexNode(IndexNode inValue) { mNextIndexNode = inValue; } public String toString() { return "Offset: " + mOffset; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy