All downloads are free. The search and download functionality uses the official Maven repository.

com.hfg.bio.seq.pattern.BruteForceMatcher Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.util.List;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.OrderedSet;

//------------------------------------------------------------------------------
/**
 Brute force string matching algorithm. Use when mismatches are allowed
 and there is ambiguity within positions or range specifications.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ class BruteForceMatcher implements SeqPatternMatcher { private static String sAlgorithmName = "Brute Force"; private SeqPattern mPattern; private S mTarget; private SeqLocation mSeqLocation; private List mPatternPositions; //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- protected BruteForceMatcher(SeqPattern inPattern, S inTarget) { this(inPattern, inTarget, null); } //--------------------------------------------------------------------------- protected BruteForceMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation) { mPattern = inPattern; mTarget = inTarget; mSeqLocation = inSeqLocation != null ? 
inSeqLocation.clone() : null; mPatternPositions = mPattern.getPrositePatternPositions(); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public T find(SeqLocation inSeqLocation) { List matches = null; SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation); if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } String targetString = mTarget.getSubSequence(seqLocation); if (! getPattern().isCaseSensitive()) { targetString = targetString.toUpperCase(); } if (getPattern().isRestrictedToSeqStart()) { if (seqLocation.getStart().equals(1)) { matches = eval(targetString, 0, 0, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps()); } } else { for (int i = 0; i < targetString.length(); i++) { matches = eval(targetString, i, seqLocation.getStart() - 1, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps()); if (matches != null) { break; } } } return CollectionUtil.hasValues(matches) ? matches.get(0) : null; } //--------------------------------------------------------------------------- @Override public List findAll() { return findAll(null); } //--------------------------------------------------------------------------- @Override public List findAll(SeqLocation inSeqLocation) { SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation.clone() : mSeqLocation); if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } List allMatches = null; boolean matchFound = false; do { List matches = null; String targetString = mTarget.getSubSequence(seqLocation); if (! 
getPattern().isCaseSensitive()) { targetString = targetString.toUpperCase(); } if (getPattern().isRestrictedToSeqStart()) { if (seqLocation.getStart().equals(1)) { matches = eval(targetString, 0, 0, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps()); } } else { for (int i = 0; i < targetString.length(); i++) { matches = eval(targetString, i, seqLocation.getStart() - 1, 0, 0, 0, getPattern().getMaxMismatches(), getPattern().getIgnoreGaps()); if (matches != null) { break; } } } if (CollectionUtil.hasValues(matches)) { matchFound = true; if (null == allMatches) { allMatches = matches; } else { allMatches.addAll(matches); } seqLocation.setStart(matches.get(matches.size() - 1).getSeqLocation().getStart() + 1); if (seqLocation.length() <= 0) { break; } } else { break; } } while (matchFound); return allMatches; } //--------------------------------------------------------------------------- public S getTarget() { return mTarget; } //--------------------------------------------------------------------------- public SeqLocation getSeqLocation() { return mSeqLocation; } //########################################################################### // PROTECTED METHODS //########################################################################### //--------------------------------------------------------------------------- protected SeqPattern getPattern() { return mPattern; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private List eval(String inTargetString, int inTargetIndex, int inTargetOffset, int inPatternIndex, int inRangeSize, int inMismatchCount, int inMaxMismatches, boolean inIgnoreGaps) { OrderedSet matches = null; int mismatchCount = inMismatchCount; int targetIndex = inTargetIndex; int patternPositionIndex = inPatternIndex; 
PrositePatternPosition position; while (patternPositionIndex < mPatternPositions.size() && targetIndex < inTargetString.length()) { position = mPatternPositions.get(patternPositionIndex); char targetResidue = inTargetString.charAt(targetIndex); if ('-' == targetResidue && inIgnoreGaps) { targetIndex++; } if (position.hasCountRange()) { // Repeat the position until we get to the min count for the range for (int i = 0; i < position.getCountRange().getStart() - 1; i++) { if (! position.matchesResidue(targetResidue)) { mismatchCount++; if (mismatchCount > inMaxMismatches || position.mismatchNotAllowed()) { break; } } if (targetIndex == inTargetString.length() - 1) { break; } targetResidue = inTargetString.charAt(++targetIndex); } if (mismatchCount > inMaxMismatches) { break; } for (int i = 0; i < position.getCountRange().length(); i++) { if (! position.matchesResidue(targetResidue)) { mismatchCount++; if (mismatchCount > inMaxMismatches || position.mismatchNotAllowed()) { break; } } List innerMatches = eval(inTargetString, targetIndex + 1, inTargetOffset, patternPositionIndex + 1, inRangeSize + i + position.getCountRange().getStart() - 1, mismatchCount, inMaxMismatches, inIgnoreGaps); if (CollectionUtil.hasValues(innerMatches)) { if (null == matches) { matches = new OrderedSet<>(innerMatches); } else { matches.addAll(innerMatches); } if (position.useLazyMatchMode()) { // When lazy matching an interval quantifier (range), stop at the first range value that produces a match break; } } if (targetIndex == inTargetString.length() - 1) { // We're at the end of the string. break; } targetResidue = inTargetString.charAt(++targetIndex); } // We just finished recursively searching. No need to continue the loop. break; } else if (! 
position.matchesResidue(targetResidue)) { mismatchCount++; if (mismatchCount > inMaxMismatches || position.mismatchNotAllowed()) { break; } } targetIndex++; patternPositionIndex++; } if (null == matches && mismatchCount <= inMaxMismatches && (patternPositionIndex == mPatternPositions.size() || (patternPositionIndex == mPatternPositions.size() - 1 && mPatternPositions.get(mPatternPositions.size() - 1).getType().equals(PrositePatternPositionType.ONE_OF) && mPatternPositions.get(mPatternPositions.size() - 1).matchesResidue('>')))) { SeqLocation matchLoc = new SeqLocation(inTargetOffset + inTargetIndex - inPatternIndex - inRangeSize + 1, inTargetOffset + targetIndex); // Was the pattern restricted to the end of the sequence? if (! getPattern().isRestrictedToSeqEnd() || matchLoc.getEnd().equals(getTarget().length())) { T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); match.setNumMismatches(mismatchCount); matches = new OrderedSet<>(1); matches.add(match); } } return matches; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy