All downloads are free. The search and download functionality uses the official Maven repository.

com.hfg.bio.seq.pattern.RegExpMatcher Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Regular Expression string matching algorithm. Use when mismatches are not allowed.
 Ambiguity within positions and range specifications are allowed.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ class RegExpMatcher implements SeqPatternMatcher { private static String sAlgorithmName = "Regular Expression"; private SeqPattern mPattern; private Pattern mRegExpPattern; private Pattern mRevStrandRegExpPattern; private S mTarget; private SeqLocation mSeqLocation; protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)\\)$"); //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- protected RegExpMatcher(SeqPattern inPattern, S inTarget) { this(inPattern, inTarget, null); } //--------------------------------------------------------------------------- protected RegExpMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation) { mPattern = inPattern; mTarget = inTarget; mSeqLocation = inSeqLocation; verifyPatternCompatibility(); } 
//########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public T find(SeqLocation inSeqLocation) { T match = null; SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation); if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } Pattern pattern = getRegExpPattern(); Matcher m = pattern.matcher(getTarget().getSubSequence(seqLocation)); if (m.find()) { int start = m.start() + seqLocation.getStart(); SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID()); match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); } return match; } //--------------------------------------------------------------------------- @Override public List findAll(SeqLocation inSeqLocation) { SeqLocation seqLocation = inSeqLocation; if (null == seqLocation) { seqLocation = new SeqLocation(1, getTarget().length()); } List matches = new ArrayList<>(25); Pattern pattern = getRegExpPattern(); String sequence = getTarget().getSubSequence(seqLocation); Matcher m = pattern.matcher(sequence); int index = 0; while (m.find(index)) { index = m.start() + 1; int start = index + seqLocation.getStart() - 1; SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID()); T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); if (getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) { ((NucleotidePatternMatch) match).setStrand(Strand.FORWARD); } matches.add(match); } if (getPattern() instanceof NucleotidePattern && ((NucleotidePattern) getPattern()).getStrandsToSearch() != null && ((NucleotidePattern) getPattern()).getStrandsToSearch().contains(Strand.REVERSE)) { m = 
getReverseStandRegExpPattern().matcher(sequence); index = 0; while (m.find(index)) { index = m.start() + 1; int start = index + seqLocation.getStart() - 1; SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID()); T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc); ((NucleotidePatternMatch) match).setStrand(Strand.REVERSE); matches.add(match); } } return matches; } //--------------------------------------------------------------------------- public S getTarget() { return mTarget; } //########################################################################### // PROTECTED METHODS //########################################################################### //--------------------------------------------------------------------------- protected SeqPattern getPattern() { return mPattern; } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private void verifyPatternCompatibility() { if (getPattern().getMaxMismatches() > 0) { throw new ProgrammingException("Patterns allowing mismatches cannot by used by the " + sAlgorithmName + " algorithm!"); } if ((getTarget() instanceof Protein && ! getPattern().getBioSequenceType().equals(BioSequenceType.PROTEIN)) || (getTarget() instanceof NucleicAcid && ! getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))) { throw new RuntimeException("Target / Pattern Mismatch! Target is a " + getTarget().getClass().getSimpleName() + " but the pattern type is " + getPattern().getBioSequenceType() + "!"); } if (getTarget() instanceof Protein && ((Protein)getTarget()).hasChains()) { throw new RuntimeException("Pattern matching can only be done with single-chain proteins! 
Please pass the chains of multi-chain proteins to the pattern matcher individually."); } } //--------------------------------------------------------------------------- // Turns the Prosite pattern into a regular expression. private Pattern getRegExpPattern() { if (null == mRegExpPattern) { if (getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) { mRegExpPattern = buildDNARegExp(); } else { mRegExpPattern = buildProteinRegExp(); } } return mRegExpPattern; } //--------------------------------------------------------------------------- private Pattern buildDNARegExp() { String prositePattern = getPattern().getPrositePattern(); // Remove the period at the end if (prositePattern.endsWith(".")) { prositePattern = prositePattern.substring(0, prositePattern.length() - 1); } StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getPattern().getIgnoreGaps() ? "\\-*" : ""); // Dashes separate positions String[] positions = prositePattern.split("\\-"); for (String position : positions) { boolean nTerm = false; boolean cTerm = false; if (position.startsWith("<")) // 5'? { nTerm = true; position = position.substring(1); } if (position.endsWith(">")) // 3'? 
{ cTerm = true; position = position.substring(0, position.length() - 1); } // Extract the count spec if present String countSpec = null; Matcher m = PROSITE_COUNT_PATTERN.matcher(position); if (m.find()) { countSpec = "{" + m.group(1) + "}"; position = position.substring(0, m.start(1) - 1); } if (position.startsWith("{") && position.endsWith("}")) { StringBuilder positionBuffer = new StringBuilder("[^"); for (int i = 1; i < position.length() - 1; i++) { char theChar = position.charAt(i); Nucleotide base = Nucleotide.valueOf(theChar); if (null == base && theChar != '<' && theChar != '>') { throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); } positionBuffer.append(theChar); if (base != null && base.isAmbiguous()) { for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } } } if (getPattern().getIgnoreGaps()) { positionBuffer.append("\\-"); } positionBuffer.append("]"); if (getPattern().getIgnoreGaps() && countSpec != null) { positionBuffer.insert(0, "(?:\\-*"); positionBuffer.append(")"); } position = positionBuffer.toString(); } else if (position.startsWith("[") && position.endsWith("]")) { StringBuilder positionBuffer = new StringBuilder("["); for (int i = 1; i < position.length() - 1; i++) { char theChar = position.charAt(i); Nucleotide base = Nucleotide.valueOf(theChar); if (null == base && theChar != '<' && theChar != '>') { throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); } positionBuffer.append(theChar); if (base != null && base.isAmbiguous()) { for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } } } positionBuffer.append("]"); if (getPattern().getIgnoreGaps() && countSpec != null) { positionBuffer.insert(0, "(?:\\-*"); positionBuffer.append(")"); } position = positionBuffer.toString(); } 
else { if (position.length() > 1) { throw new SeqPatternConfigurationException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(position) + "!"); } char residue = position.charAt(0); Nucleotide base = Nucleotide.valueOf(residue); if (null == base && ! position.equals("<") && ! position.equals(">")) { throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!"); } if (base != null && base.isAmbiguous()) { if (position.equalsIgnoreCase("N")) { if (getPattern().getIgnoreGaps()) { position = "[^\\-]"; } else { position = "."; } } else { StringBuilder positionBuffer = new StringBuilder("["); positionBuffer.append(residue); for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } positionBuffer.append("]"); position = positionBuffer.toString(); } } if (getPattern().getIgnoreGaps() && countSpec != null) { position = "(?:\\-*" + position + ")"; } } if (position.contains("<")) { position = "(?:" + position.replace("<", "") + "|\\A)"; } else if (position.contains(">")) { position = "(?:" + position.replace(">", "") + "|\\Z)"; } if (getPattern().getIgnoreGaps() && countSpec != null) { regexp.append((nTerm ? "^" : "") + position + countSpec + (cTerm ? "$" : "")); } else { regexp.delimitedAppend((nTerm ? "^" : "") + position + (countSpec != null ? countSpec : "") + (cTerm ? "$" : "")); } } return Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE); } //--------------------------------------------------------------------------- private Pattern buildProteinRegExp() { String prositePattern = getPattern().getPrositePattern(); // Remove the period at the end if (prositePattern.endsWith(".")) { prositePattern = prositePattern.substring(0, prositePattern.length() - 1); } StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getPattern().getIgnoreGaps() ? 
"\\-*" : ""); // Dashes separate positions String[] positions = prositePattern.split("\\-"); for (String position : positions) { boolean nTerm = false; boolean cTerm = false; if (position.startsWith("<")) // N-terminus? { nTerm = true; position = position.substring(1); } if (position.endsWith(">")) // C-terminus? { cTerm = true; position = position.substring(0, position.length() - 1); } // Extract the count spec if present String countSpec = null; Matcher m = PROSITE_COUNT_PATTERN.matcher(position); if (m.find()) { countSpec = "{" + m.group(1) + "}"; position = position.substring(0, m.start(1) - 1); } if (position.equalsIgnoreCase("x")) { if (getPattern().getIgnoreGaps()) { position = "(?:[^\\-]\\-*)"; } else { position = "."; } } else if (position.startsWith("{") && position.endsWith("}")) { position = "[^" + position.substring(1, position.length() - 1) + (getPattern().getIgnoreGaps() ? "\\-" : "") + "]"; } if (position.contains("<")) { position = "(?:" + position.replace("<", "") + "|\\A)"; } else if (position.contains(">")) { position = "(?:" + position.replace(">", "") + "|\\Z)"; } regexp.delimitedAppend((nTerm ? "^" : "") + position + (countSpec != null ? countSpec : "") + (cTerm ? 
"$" : "")); } return Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE); } //--------------------------------------------------------------------------- private Pattern getReverseStandRegExpPattern() { Pattern fwdRegExp = getRegExpPattern(); if (null == mRevStrandRegExpPattern) { StringBuilderPlus regexp = new StringBuilderPlus(); for (int i = fwdRegExp.pattern().length() - 1; i >= 0; i--) { char theChar = fwdRegExp.pattern().charAt(i); if (Character.isLetter(theChar)) { regexp.append(Nucleotide.valueOf(theChar).getComplement().getOneLetterCode()); } else if (theChar == '[') { regexp.append(']'); } else if (theChar == ']') { regexp.append('['); } else { regexp.append(theChar); } } mRevStrandRegExpPattern = Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE); } return mRevStrandRegExpPattern; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy