/*
 Residue from the artifact-listing web page this source was copied from (not part of the Java source):
 com.hfg.bio.seq.pattern.SeqPattern Maven / Gradle / Ivy
 Go to download
 Show more of this group Show more artifacts with this name
 Show all versions of com_hfg Show documentation
 Show all versions of com_hfg Show documentation
 com.hfg xml, html, svg, and bioinformatics utility library
*/
package com.hfg.bio.seq.pattern;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.Nucleotide;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.math.Range;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
//------------------------------------------------------------------------------
/**
Abstract container for a sequence pattern (motif).
From the PROSITE user manual:
The patterns are described using the following conventions:
- The standard IUPAC one-letter codes for the amino acids are used.
- The symbol 'x' is used for a position where any amino acid is accepted.
- Ambiguities are indicated by listing the acceptable amino acids for a given position, between square parentheses '[ ]'.
For example: [ALT] stands for Ala or Leu or Thr.
- Ambiguities are also indicated by listing between a pair of curly brackets '{ }' the amino acids that are not accepted
at a given position. For example: {AM} stands for any amino acid except Ala and Met.
- Each element in a pattern is separated from its neighbor by a '-'.
- Repetition of an element of the pattern can be indicated by following that element with a numerical value or a numerical
range between parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to x-x or x-x-x or x-x-x-x.
- When a pattern is restricted to either the N- or C-terminal of a sequence, that pattern either starts with a '<' symbol
or respectively ends with a '>' symbol. In some rare cases (e.g. PS00267 or PS00539), '>' can also occur inside square
brackets for the C-terminal element. 'F-[GSTV]-P-R-L-[G>]' means that either 'F-[GSTV]-P-R-L-G' or 'F-[GSTV]-P-R-L>' are considered.
- A period ends the pattern.
Examples:
PA [AC]-x-V-x(4)-{ED}.
This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp}
PA <A-x-[ST](2)-x(0,1)-V.
This pattern, which must be in the N-terminal of the sequence ('<'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public abstract class SeqPattern
{
// Optional name for this pattern; assigned via setName() and null until then.
private String mName;
// Set to true by lock(). No other code visible in this class consults the flag.
private boolean mLocked;
// The PROSITE-style pattern string, stored trimmed (and upper-cased unless the pattern is case-sensitive).
private String mPrositePattern;
// Number of mismatches tolerated; 0 makes matcher() choose the RegExpMatcher.
private int mMaxMismatches = 0;
// Lazily-built cache of the parsed positions; populated by getPrositePatternPositions().
private List mPatternPositions;
// Defaults to false; no setter is visible in this class.
private boolean mIsCaseSensitive = false;
private boolean mIgnoreGaps = false;
// The four flags below are derived from the pattern string by evaluate().
private boolean mContainsPositionAmbiguity;
private boolean mContainsRanges;
private boolean mIsRestrictedToSeqStart;
private boolean mIsRestrictedToSeqEnd;
// Matches a trailing repetition spec on a pattern position: "(n)" or "(n,m)". Group 1 is the text between the parentheses.
protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)\\)$");
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//--------------------------------------------------------------------------
/**
 Creates a pattern that has no PROSITE pattern string assigned yet.
 Subclasses are expected to supply one later via setPrositePattern().
 */
protected SeqPattern()
{
   super();
}
//--------------------------------------------------------------------------
/**
 Creates a pattern from the specified PROSITE-style pattern string.
 <p>
 Note that construction runs setPrositePattern(), which evaluates the pattern and in doing so
 calls the abstract getBioSequenceType(). Subclass implementations of getBioSequenceType()
 therefore must not rely on state that is initialized in the subclass constructor.
 </p>
 @param inPrositePattern the PROSITE-style pattern string
 */
public SeqPattern(String inPrositePattern)
{
   super();
   this.setPrositePattern(inPrositePattern);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//--------------------------------------------------------------------------
/**
 Returns the stored PROSITE pattern string as the textual form of this object.
 @return the stored pattern string (null if none has been set)
 */
@Override
public String toString()
{
   return this.mPrositePattern;
}
//---------------------------------------------------------------------------
/**
 Assigns a name to this pattern.
 @param inValue the name to store
 @return this pattern object to enable method chaining
 */
public SeqPattern setName(String inValue)
{
   this.mName = inValue;
   return this;
}
//---------------------------------------------------------------------------
/**
 Returns the name assigned to this pattern.
 @return the value last passed to setName(), or null if no name was ever set
 */
public String name()
{
   return this.mName;
}
//--------------------------------------------------------------------------
/**
 Reports whether lock() has been called on this pattern.
 @return true once lock() has been invoked; false otherwise
 */
public boolean isLocked()
{
   return this.mLocked;
}
//--------------------------------------------------------------------------
/**
 Marks this pattern as locked. The flag is only recorded here; the setters visible
 in this class do not check it.
 @return this pattern object to enable method chaining
 */
public SeqPattern lock()
{
   this.mLocked = true;
   return this;
}
//--------------------------------------------------------------------------
/**
 Returns the type of sequence this pattern applies to. Pattern parsing in this class
 compares the result against BioSequenceType.PROTEIN and BioSequenceType.NUCLEIC_ACID
 to decide how the 'x' / 'n' wildcards and ambiguous nucleotide codes are interpreted.
 Because it is reached from the String constructor, implementations should not depend
 on subclass instance state.
 @return the sequence type for this pattern
 */
public abstract BioSequenceType getBioSequenceType();
//--------------------------------------------------------------------------
/**
 Sets the ignore-gaps flag for this pattern.
 @param inValue the new flag value
 @return this pattern object to enable method chaining
 */
public SeqPattern setIgnoreGaps(boolean inValue)
{
   // Assigning an unchanged value is a no-op, so no inequality guard is needed.
   mIgnoreGaps = inValue;
   return this;
}
//--------------------------------------------------------------------------
/**
 Reports whether the pattern is case-sensitive. When it is not, setPrositePattern()
 upper-cases the pattern string before storing it.
 @return the case-sensitivity flag (false by default)
 */
public boolean isCaseSensitive()
{
   return this.mIsCaseSensitive;
}
//--------------------------------------------------------------------------
/**
 Returns the ignore-gaps flag.
 @return the value last passed to setIgnoreGaps() (false by default)
 */
public boolean getIgnoreGaps()
{
   return this.mIgnoreGaps;
}
//--------------------------------------------------------------------------
/**
 Stores the specified PROSITE-style pattern string and re-derives the pattern flags from it.
 <p>
 The value is trimmed and, unless the pattern is case-sensitive, upper-cased using
 locale-independent rules (so that e.g. 'i' always becomes 'I'). All state derived from a
 previously set pattern (the cached parsed positions and the ambiguity / range / terminus
 flags) is discarded before the pattern is evaluated, so a pattern can safely be replaced.
 </p>
 <p>
 A null value leaves the stored pattern string unchanged. If no pattern string has been
 stored at all, evaluation is skipped rather than failing with a NullPointerException.
 </p>
 @param inValue the PROSITE-style pattern string (may be null)
 */
protected void setPrositePattern(String inValue)
{
   if (inValue != null)
   {
      String pattern = inValue.trim();
      mPrositePattern = isCaseSensitive() ? pattern : pattern.toUpperCase(java.util.Locale.ROOT);
   }

   // evaluate() only ever raises these flags, so they (and the parsed-position cache)
   // must be reset here or values from a previous pattern would leak through.
   mPatternPositions = null;
   mContainsPositionAmbiguity = false;
   mContainsRanges = false;
   mIsRestrictedToSeqStart = false;
   mIsRestrictedToSeqEnd = false;

   if (mPrositePattern != null)
   {
      evaluate(mPrositePattern);
   }
}
//--------------------------------------------------------------------------
/**
 Returns the stored PROSITE pattern string.
 @return the pattern string as stored by setPrositePattern(), or null if none has been set
 */
public String getPrositePattern()
{
   return this.mPrositePattern;
}
//--------------------------------------------------------------------------
/**
 Sets the maximum number of mismatches to tolerate. The value is stored without validation.
 matcher() uses it to choose a matcher implementation (0 selects the regular-expression matcher).
 @param inValue the maximum number of mismatches
 @return this pattern object to enable method chaining
 */
public SeqPattern setMaxMismatches(int inValue)
{
   // TODO: check number against the number of pattern positions
   this.mMaxMismatches = inValue;
   return this;
}
//--------------------------------------------------------------------------
/**
 Returns the maximum number of mismatches to tolerate.
 @return the value last passed to setMaxMismatches() (0 by default)
 */
public int getMaxMismatches()
{
   return this.mMaxMismatches;
}
//--------------------------------------------------------------------------
/**
 Reports whether evaluation of the pattern string flagged at least one position
 as accepting more than one residue.
 @return the position-ambiguity flag derived from the pattern string
 */
public boolean containsPositionAmbiguity()
{
   return this.mContainsPositionAmbiguity;
}
//--------------------------------------------------------------------------
/**
 Reports whether evaluation of the pattern string found at least one position
 carrying a trailing repetition spec of the form "(n)" or "(n,m)".
 @return the range flag derived from the pattern string
 */
public boolean containsRanges()
{
   return this.mContainsRanges;
}
//--------------------------------------------------------------------------
/**
 Reports whether the pattern string contained a position starting with the '&lt;' marker.
 @return the sequence-start restriction flag derived from the pattern string
 */
public boolean isRestrictedToSeqStart()
{
   return this.mIsRestrictedToSeqStart;
}
//--------------------------------------------------------------------------
/**
 Reports whether the pattern string contained a position ending with the '&gt;' marker.
 @return the sequence-end restriction flag derived from the pattern string
 */
public boolean isRestrictedToSeqEnd()
{
   return this.mIsRestrictedToSeqEnd;
}
//---------------------------------------------------------------------------
/**
 Computes the maximum number of residues a match to this pattern can span.
 <p>
 Every parsed position contributes one residue. A position that still carries a count range
 (fixed counts are already unrolled into separate positions by getPrositePatternPositions())
 additionally contributes the upper bound of that range minus the one already counted.
 </p>
 @return the maximum match length, or 0 if the pattern has no positions
 */
public int getMaxLength()
{
   // Typed as List<PrositePatternPosition> so that the enhanced for loop below is type-safe.
   List<PrositePatternPosition> positions = getPrositePatternPositions();
   if (! CollectionUtil.hasValues(positions))
   {
      return 0;
   }

   int maxLength = positions.size();
   for (PrositePatternPosition position : positions)
   {
      if (position.hasCountRange())
      {
         // The position itself was already counted once via positions.size().
         maxLength += position.getCountRange().getEnd() - 1;
      }
   }

   return maxLength;
}
//---------------------------------------------------------------------------
/**
 Parses the PROSITE pattern string into its individual positions. The result is built on
 first use and cached; the cached list itself is returned, so callers should not modify it.
 <p>
 Parsing steps for each dash-separated position:
 </p>
 <ul>
   <li>surrounding whitespace and the '&lt;' / '&gt;' terminus markers are removed,</li>
   <li>a trailing "(n)" or "(n,m)" spec is converted into a count range,</li>
   <li>"[...]" becomes a ONE_OF position and "{...}" a NOT position,</li>
   <li>'x' (protein) or 'n' (nucleic acid) becomes an IS_ANY position,</li>
   <li>anything else becomes an IS position (or ONE_OF for an ambiguous nucleotide code,
       in which case the residues it stands for are appended).</li>
 </ul>
 <p>
 A position with a fixed count such as "x(3)" is unrolled into that many entries (all
 referring to the same position object) instead of being kept as a range.
 </p>
 @return the parsed positions; an empty list if no pattern string has been set
 @throws SeqPatternConfigurationException if a nucleic acid pattern contains a value that is not a valid nucleotide
 */
public List<PrositePatternPosition> getPrositePatternPositions()
{
   if (null == mPatternPositions)
   {
      String prositePattern = getPrositePattern();
      if (null == prositePattern)
      {
         // Nothing to parse yet. Deliberately not cached so a later pattern is still picked up.
         return new ArrayList<>();
      }

      // Remove the period at the end
      if (prositePattern.endsWith("."))
      {
         prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
      }

      boolean isProtein     = getBioSequenceType().equals(BioSequenceType.PROTEIN);
      boolean isNucleicAcid = getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);

      // Dashes separate positions
      String[] positions = prositePattern.split("\\-");

      List<PrositePatternPosition> patternPositions = new ArrayList<>(positions.length);

      for (int positionIndex = 1; positionIndex <= positions.length; positionIndex++)
      {
         String originalPosition = positions[positionIndex - 1];

         // Trim for consistency with evaluate(), which validates the trimmed form
         String positionString = originalPosition.trim();

         if (positionString.startsWith("<")) // N-terminus?
         {
            positionString = positionString.substring(1);
         }

         if (positionString.endsWith(">")) // C-terminus?
         {
            positionString = positionString.substring(0, positionString.length() - 1);
         }

         PrositePatternPosition position = new PrositePatternPosition();

         Matcher m = SeqPattern.PROSITE_COUNT_PATTERN.matcher(positionString);
         if (m.find())
         {
            String[] pieces = m.group(1).split(",");
            int min = Integer.parseInt(pieces[0]);
            int max = (1 == pieces.length ? min : Integer.parseInt(pieces[1]));

            position.setCountRange(new Range<>(min, max));

            // Drop the count spec, including its opening parenthesis
            positionString = positionString.substring(0, m.start());
         }

         if (positionString.startsWith("["))
         {
            position.setType(PrositePatternPositionType.ONE_OF);
            position.setResidues(expandBracketedResidues(positionString, positionIndex, originalPosition));
         }
         else if (positionString.startsWith("{"))
         {
            position.setType(PrositePatternPositionType.NOT);
            // Each residue is validated / expanded individually (previously the whole
            // "{...}" string was handed to Nucleotide.valueOf(), which cannot resolve it).
            position.setResidues(expandBracketedResidues(positionString, positionIndex, originalPosition));
         }
         else if ((positionString.equalsIgnoreCase("X") && isProtein)
                  || (positionString.equalsIgnoreCase("N") && isNucleicAcid))
         {
            position.setType(PrositePatternPositionType.IS_ANY);
         }
         else
         {
            PrositePatternPositionType type = PrositePatternPositionType.IS;

            // TODO: Handle ambiguous protein residues
            if (isNucleicAcid)
            {
               Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase());
               if (null == base)
               {
                  throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(originalPosition) + " contains an invalid nucleotide value!");
               }

               if (base.isAmbiguous())
               {
                  type = PrositePatternPositionType.ONE_OF;
                  positionString += base.getDegeneracyAsString();
               }
            }

            position.setType(type);
            position.setResidues(positionString);
         }

         if (position.getCountRange() != null
             && 1 == position.getCountRange().length())
         {
            // Fixed number of identical positions. Unroll rather than treating as a range
            int count = position.getCountRange().getStart();
            position.setCountRange(null);
            for (int i = 0; i < count; i++)
            {
               patternPositions.add(position);
            }
         }
         else
         {
            patternPositions.add(position);
         }
      }

      mPatternPositions = patternPositions;
   }

   return mPatternPositions;
}

//--------------------------------------------------------------------------
/**
 Collects the residues listed between the enclosing brackets of a "[...]" or "{...}" position.
 The first and last characters of the position string are taken to be the brackets and are skipped.
 Residues are upper-cased. For nucleic acid patterns each residue must be a valid nucleotide,
 and an ambiguous nucleotide code is followed by the residues it stands for.
 @param inPositionString the position with terminus markers and count spec already removed
 @param inPositionIndex the 1-based index of the position (used in error messages)
 @param inOriginalPosition the unmodified position text (used in error messages)
 @return the residues for the position
 @throws SeqPatternConfigurationException if a nucleic acid position contains an invalid nucleotide
 */
private String expandBracketedResidues(String inPositionString, int inPositionIndex, String inOriginalPosition)
{
   boolean isNucleicAcid = getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);

   // TODO: Handle ambiguous protein residues
   StringBuilder buffer = new StringBuilder();
   for (int i = 1; i < inPositionString.length() - 1; i++)
   {
      char residue = Character.toUpperCase(inPositionString.charAt(i));
      buffer.append(residue);

      if (isNucleicAcid)
      {
         Nucleotide base = Nucleotide.valueOf(residue);
         if (null == base)
         {
            throw new SeqPatternConfigurationException("Position " + inPositionIndex + " " + StringUtil.singleQuote(inOriginalPosition) + " contains an invalid nucleotide value!");
         }

         if (base.isAmbiguous())
         {
            buffer.append(base.getDegeneracyAsString());
         }
      }
   }

   return buffer.toString();
}
//--------------------------------------------------------------------------
/**
 Creates a matcher for this pattern against the specified target, without specifying a
 sequence location (null is passed for the location).
 @param inTarget the target to match against
 @return a matcher for the target
 */
public SeqPatternMatcher matcher(S inTarget)
{
   final SeqLocation unspecifiedLocation = null;
   return matcher(inTarget, unspecifiedLocation);
}
//--------------------------------------------------------------------------
/**
 Creates a matcher for this pattern against the specified target and location.
 The implementation is chosen as follows:
 <ul>
   <li>no mismatches allowed: RegExpMatcher</li>
   <li>mismatches allowed and the pattern has neither ambiguous positions nor ranges: BYPMatcher</li>
   <li>otherwise: BruteForceMatcher</li>
 </ul>
 @param inTarget the target to match against
 @param inSeqLocation the location passed through to the matcher (may be null)
 @return a matcher for the target
 */
public SeqPatternMatcher matcher(S inTarget, SeqLocation inSeqLocation)
{
   if (0 == getMaxMismatches())
   {
      return new RegExpMatcher<>(this, inTarget, inSeqLocation);
   }

   if (containsPositionAmbiguity()
       || containsRanges())
   {
      // Default to brute force
      return new BruteForceMatcher<>(this, inTarget, inSeqLocation);
   }

   return new BYPMatcher<>(this, inTarget, inSeqLocation);
}
//###########################################################################
// PROTECTED METHODS
//###########################################################################
//--------------------------------------------------------------------------
/**
 Factory hook for the subclass-specific match object. It is not called from within this
 class; presumably the matcher implementations invoke it for each match found (confirm
 against the matcher classes).
 @param inSeq the sequence string for the match
 @param inLocation the location for the match
 @return the match object
 */
protected abstract T createMatch(String inSeq, SeqLocation inLocation);
//---------------------------------------------------------------------------
/**
 Validates the specified PROSITE pattern string and derives the pattern flags from it:
 restriction to the sequence start ('&lt;') or end ('&gt;'), presence of a repetition spec
 ("(n)" or "(n,m)"), and presence of a position that accepts more than one residue
 ("[...]", "{...}", 'x' for proteins, or an ambiguous nucleotide code for nucleic acids).
 <p>
 Flags are only ever raised here, never cleared. A null pattern is ignored.
 </p>
 @param inPrositePattern the pattern string to evaluate
 @throws SeqPatternConfigurationException if a bracketed position is not closed, or if a
         nucleic acid position does not start with a valid nucleotide
 */
private void evaluate(String inPrositePattern)
{
   if (null == inPrositePattern)
   {
      return;
   }

   String prositePattern = inPrositePattern;

   // Remove the period at the end
   if (prositePattern.endsWith("."))
   {
      prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
   }

   // Dashes separate positions
   String[] positions = prositePattern.split("\\-");

   int positionIndex = 0;
   for (String position : positions)
   {
      positionIndex++;
      position = position.trim();

      if (position.startsWith("<")) // N-terminus?
      {
         mIsRestrictedToSeqStart = true;
         position = position.substring(1);
      }

      if (position.endsWith(">")) // C-terminus?
      {
         mIsRestrictedToSeqEnd = true;
         position = position.substring(0, position.length() - 1);
      }

      // Strip the count spec if present. Without this, a valid position such as "[ST](2)"
      // would fail the closing-bracket checks below and "x(3)" would not be seen as 'x'.
      // NOTE: a fixed count such as "(3)" also raises the range flag, as it always has.
      Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
      if (m.find())
      {
         mContainsRanges = true;
         position = position.substring(0, m.start());
      }

      if (position.startsWith("{"))
      {
         if (! position.endsWith("}"))
         {
            throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern "
                                                       + StringUtil.singleQuote(inPrositePattern) + " starts with '{' but doesn't end with '}'!");
         }

         mContainsPositionAmbiguity = true;
      }
      else if (position.startsWith("["))
      {
         if (! position.endsWith("]"))
         {
            throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern "
                                                       + StringUtil.singleQuote(inPrositePattern) + " starts with '[' but doesn't end with ']'!");
         }

         mContainsPositionAmbiguity = true;
      }
      else if (getBioSequenceType().equals(BioSequenceType.PROTEIN)
               && position.equalsIgnoreCase("x"))
      {
         mContainsPositionAmbiguity = true;
      }
      else if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))
      {
         // An empty position or an unknown code is reported as a configuration problem
         // instead of surfacing as an index-out-of-bounds or null-pointer failure.
         Nucleotide base = (position.isEmpty() ? null : Nucleotide.valueOf(position.charAt(0)));
         if (null == base)
         {
            throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern "
                                                       + StringUtil.singleQuote(inPrositePattern) + " contains an invalid nucleotide value!");
         }

         if (base.isAmbiguous())
         {
            mContainsPositionAmbiguity = true;
         }
      }
   }
}
}