All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.pattern.SeqPattern Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.math.Range;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;

//------------------------------------------------------------------------------
/**
 Abstract container for a sequence pattern (motif).
 

The pattern syntax is based on PROSITE with one addition - the symbol '!' can be added to end of a position specification to indicate that mismatches are not allowed at that position.

From the PROSITE user manual:

The patterns are described using the following conventions:

  • The standard IUPAC one-letter codes for the amino acids are used.
  • The symbol 'x' is used for a position where any amino acid is accepted.
  • Ambiguities are indicated by listing the acceptable amino acids for a given position, between square parentheses '[ ]'. For example: [ALT] stands for Ala or Leu or Thr.
  • Ambiguities are also indicated by listing between a pair of curly brackets '{ }' the amino acids that are not accepted at a given position. For example: {AM} stands for any amino acid except Ala and Met.
  • Each element in a pattern is separated from its neighbor by a '-'.
  • Repetition of an element of the pattern can be indicated by following that element with a numerical value or a numerical range between parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to x-x or x-x-x or x-x-x-x.
  • When a pattern is restricted to either the N- or C-terminal of a sequence, that pattern either starts with a '<' symbol or respectively ends with a '>' symbol. In some rare cases (e.g. PS00267 or PS00539), '>' can also occur inside square brackets for the C-terminal element. 'F-[GSTV]-P-R-L-[G>]' means that either 'F-[GSTV]-P-R-L-G' or 'F-[GSTV]-P-R-L>' are considered.
  • A period ends the pattern.
Examples:
    PA   [AC]-x-V-x(4)-{ED}.
   
This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp}
    PA   <A-x-[ST](2)-x(0,1)-V.
   
This pattern, which must be in the N-terminal of the sequence ('<'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
@author J. Alex Taylor, hairyfatguy.com */ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public abstract class SeqPattern implements Cloneable { private String mName; private boolean mLocked; private String mPrositePattern; private List mPatternPositions; private boolean mIsCaseSensitive = false; private boolean mIgnoreGaps = false; private boolean mContainsPositionAmbiguity; private boolean mContainsRanges; private boolean mIsRestrictedToSeqStart; private boolean mIsRestrictedToSeqEnd; private int mMaxMismatches = 0; protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)(\\?)?\\)$"); //########################################################################### // CONSTRUCTORS //########################################################################### //-------------------------------------------------------------------------- protected SeqPattern() { } //-------------------------------------------------------------------------- public SeqPattern(String inPrositePattern) { setPrositePattern(inPrositePattern); } //########################################################################### // PUBLIC METHODS //########################################################################### //-------------------------------------------------------------------------- public SeqPattern clone() { SeqPattern cloneObj; try { cloneObj = (SeqPattern) super.clone(); } catch (CloneNotSupportedException e) { throw new ProgrammingException(e); } if (mPatternPositions != null) { cloneObj.mPatternPositions = new ArrayList(mPatternPositions); } return cloneObj; } //-------------------------------------------------------------------------- @Override public String toString() { return mPrositePattern; } //--------------------------------------------------------------------------- public SeqPattern setName(String inValue) { mName = inValue; return this; } //--------------------------------------------------------------------------- public String name() { return mName; } //-------------------------------------------------------------------------- public boolean isLocked() { return mLocked; } //-------------------------------------------------------------------------- public SeqPattern lock() { mLocked = true; return this; } //-------------------------------------------------------------------------- public abstract BioSequenceType getBioSequenceType(); //-------------------------------------------------------------------------- public SeqPattern setIgnoreGaps(boolean inValue) { if (inValue != mIgnoreGaps) { mIgnoreGaps = inValue; } return this; } //-------------------------------------------------------------------------- public SeqPattern setIsCaseSensitive(boolean inValue) { mIsCaseSensitive = inValue; return this; } //-------------------------------------------------------------------------- public boolean isCaseSensitive() { return mIsCaseSensitive; } //-------------------------------------------------------------------------- public boolean getIgnoreGaps() { return mIgnoreGaps; } //-------------------------------------------------------------------------- protected void setPrositePattern(String inValue) { if (inValue != null) { mPrositePattern = inValue.trim(); if (! isCaseSensitive()) { mPrositePattern = mPrositePattern.toUpperCase(); } } evaluate(mPrositePattern); } //-------------------------------------------------------------------------- public String getPrositePattern() { return mPrositePattern; } //-------------------------------------------------------------------------- public SeqPattern setMaxMismatches(int inValue) { // TODO: check number against the number of pattern positions mMaxMismatches = inValue; return this; } //-------------------------------------------------------------------------- public int getMaxMismatches() { return mMaxMismatches; } //-------------------------------------------------------------------------- public boolean containsMismatchRestrictions() { boolean result = false; if (CollectionUtil.hasValues(getPrositePatternPositions())) { for (PrositePatternPosition position : getPrositePatternPositions()) { if (position.mismatchNotAllowed()) { result = true; break; } } } return result; } //-------------------------------------------------------------------------- public boolean containsPositionAmbiguity() { return mContainsPositionAmbiguity; } //-------------------------------------------------------------------------- public boolean containsRanges() { return mContainsRanges; } //-------------------------------------------------------------------------- public boolean isRestrictedToSeqStart() { return mIsRestrictedToSeqStart; } //-------------------------------------------------------------------------- public boolean isRestrictedToSeqEnd() { return mIsRestrictedToSeqEnd; } //--------------------------------------------------------------------------- public int getMaxLength() { int maxLength = 0; List positions = getPrositePatternPositions(); if (CollectionUtil.hasValues(positions)) { maxLength = positions.size(); for (PrositePatternPosition position : positions) { if (position.hasCountRange()) { maxLength += position.getCountRange().getEnd() - 1; } } } return maxLength; } //--------------------------------------------------------------------------- public List getPrositePatternPositions() { if (null == mPatternPositions) { String prositePattern = getPrositePattern(); // Remove the period at the end if (prositePattern.endsWith(".")) { prositePattern = prositePattern.substring(0, prositePattern.length() - 1); } // Dashes separate positions String[] positions = prositePattern.split("\\-"); List patternPositions = new ArrayList<>(positions.length); int positionIndex = 0; for (String positionString : positions) { positionIndex++; PrositePatternPosition position = new PrositePatternPosition(); if (positionString.endsWith("!")) // Mismatch not allowed? { position.setMismatchNotAllowed(true); positionString = positionString.substring(0, positionString.length() - 1); } if (positionString.startsWith("<")) // N-terminus? { positionString = positionString.substring(1); } if (positionString.endsWith(">")) // C-terminus? { positionString = positionString.substring(0, positionString.length() - 1); } Matcher m = SeqPattern.PROSITE_COUNT_PATTERN.matcher(positionString); if (m.find()) { int min = -1; int max = -1; String[] pieces = m.group(1).split(","); if (1 == pieces.length) { min = max = Integer.parseInt(pieces[0]); } else { min = Integer.parseInt(pieces[0]); max = Integer.parseInt(pieces[1]); } position.setCountRange(new Range<>(min, max)); positionString = positionString.substring(0, m.start(1) - 1); position.setUseLazyMatchMode(m.group(2) != null); } if (positionString.startsWith("[")) { position.setType(PrositePatternPositionType.ONE_OF); // TODO: Handle ambiguous protein residues StringBuilder buffer = new StringBuilder(); for (int i = 1; i < positionString.length() - 1; i++) { char residue = Character.toUpperCase(positionString.charAt(i)); buffer.append(residue); if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) { Nucleotide base = Nucleotide.valueOf(residue); if (null == base) { throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(positions[positionIndex - 1]) + " contains an invalid nucleotide value!"); } if (base.isAmbiguous()) { buffer.append(base.getDegeneracyAsString()); } } } position.setResidues(buffer.toString()); } else if (positionString.startsWith("{")) { position.setType(PrositePatternPositionType.NOT); // TODO: Handle ambiguous protein residues StringBuilder buffer = new StringBuilder(); for (int i = 1; i < positionString.length() - 1; i++) { char residue = Character.toUpperCase(positionString.charAt(i)); buffer.append(residue); if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) { Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase()); if (base.isAmbiguous()) { buffer.append(base.getDegeneracyAsString()); } } } position.setResidues(buffer.toString()); } else if ((positionString.equalsIgnoreCase("X") && getBioSequenceType().equals(BioSequenceType.PROTEIN)) || (positionString.equalsIgnoreCase("N") && getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))) { position.setType(PrositePatternPositionType.IS_ANY); } else { PrositePatternPositionType type = PrositePatternPositionType.IS; // TODO: Handle ambiguous protein residues if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) { Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase()); if (base.isAmbiguous()) { type = PrositePatternPositionType.ONE_OF; positionString += base.getDegeneracyAsString(); } } position.setType(type); position.setResidues(positionString); } if (position.getCountRange() != null && 1 == position.getCountRange().length()) { // Fixed number of identical positions. Unroll rather than treating as a range int count = position.getCountRange().getStart(); position.setCountRange(null); for (int i = 0; i < count; i++) { patternPositions.add(position); } } else { patternPositions.add(position); } } mPatternPositions = patternPositions; } return mPatternPositions; } //-------------------------------------------------------------------------- public SeqPatternMatcher matcher(S inTarget) { return matcher(inTarget, null); } //-------------------------------------------------------------------------- public SeqPatternMatcher matcher(S inTarget, SeqLocation inSeqLocation) { SeqPatternMatcher matcher; if (0 == getMaxMismatches()) { matcher = new RegExpMatcher<>(this, inTarget, inSeqLocation); } else if (! containsPositionAmbiguity() && ! containsRanges() && ! containsMismatchRestrictions()) { matcher = new BYPMatcher<>(this, inTarget, inSeqLocation); } else { // Default to brute force matcher = new BruteForceMatcher<>(this, inTarget, inSeqLocation); } return matcher; } //########################################################################### // PROTECTED METHODS //########################################################################### //-------------------------------------------------------------------------- protected abstract T createMatch(String inSeq, SeqLocation inLocation); //--------------------------------------------------------------------------- private void evaluate(String inPrositePattern) { String prositePattern = inPrositePattern; // Remove the period at the end if (prositePattern.endsWith(".")) { prositePattern = prositePattern.substring(0, prositePattern.length() - 1); } // Dashes separate positions String[] positions = prositePattern.split("\\-"); int positionIndex = 0; for (String position : positions) { positionIndex++; position = position.trim(); if (position.endsWith("!")) // Mismatch not allowed? { // Remove it so we can detect C-terminal restriction position = position.substring(0, position.length() - 1); } if (position.startsWith("<")) // N-terminus? { mIsRestrictedToSeqStart = true; position = position.substring(1); } if (position.endsWith(">")) // C-terminus? { mIsRestrictedToSeqEnd = true; position = position.substring(0, position.length() - 1); } // Extract the count spec if present String countSpec = null; Matcher m = PROSITE_COUNT_PATTERN.matcher(position); if (m.find()) { mContainsRanges = true; } if (position.startsWith("{")) { if (! position.endsWith("}")) { throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern " + StringUtil.singleQuote(inPrositePattern) + " starts with '{' but doesn't end with '}'!"); } mContainsPositionAmbiguity = true; } else if (position.startsWith("[")) { if (! position.endsWith("]")) { throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern " + StringUtil.singleQuote(inPrositePattern) + " starts with '[' but doesn't end with ']'!"); } mContainsPositionAmbiguity = true; } else if (getBioSequenceType().equals(BioSequenceType.PROTEIN) && position.equalsIgnoreCase("x")) { mContainsPositionAmbiguity = true; } else if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID) && Nucleotide.valueOf(position.charAt(0)).isAmbiguous()) { mContainsPositionAmbiguity = true; } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy