// com.hfg.bio.seq.pattern.RegExpMatcher (Maven / Gradle / Ivy)
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of com_hfg. Show documentation.
// com.hfg: xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.pattern;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
Regular Expression string matching algorithm. Use when mismatches are not allowed.
Ambiguity within positions and range specifications are allowed.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
class RegExpMatcher implements SeqPatternMatcher
{
// Human-readable name of this matching algorithm; used when composing error messages.
private static String sAlgorithmName = "Regular Expression";
// The sequence pattern that this matcher searches for.
private SeqPattern mPattern;
// Forward-strand regular expression; built on first use by getRegExpPattern().
private Pattern mRegExpPattern;
// Reverse-strand regular expression; built on first use by getReverseStandRegExpPattern().
private Pattern mRevStrandRegExpPattern;
// The sequence being searched.
// NOTE(review): the type variable S is not declared in the visible class header;
// presumably the generic type parameters were lost in extraction - confirm against the original source.
private S mTarget;
// Optional search region supplied at construction time; may be null.
private SeqLocation mSeqLocation;
// Matches a trailing Prosite repeat count such as "(3)" or "(2,4)".
// Group 1 captures the count text without the surrounding parentheses.
protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)\\)$");
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Creates a matcher for the specified pattern and target with no preset search region.
 @param inPattern the pattern to search for; must not be null
 @param inTarget the sequence to be searched; must not be null
 @throws NullPointerException if inPattern or inTarget is null
 */
protected RegExpMatcher(SeqPattern inPattern, S inTarget)
{
   this(inPattern, inTarget, null);
}

//---------------------------------------------------------------------------
/**
 Creates a matcher for the specified pattern and target and verifies that the
 two are compatible with this algorithm.
 @param inPattern the pattern to search for; must not be null
 @param inTarget the sequence to be searched; must not be null
 @param inSeqLocation region of the target to search when a search method is
                      called without a location; may be null
 @throws NullPointerException if inPattern or inTarget is null
 */
protected RegExpMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation)
{
   // Fail fast with a descriptive message. Without these checks a null pattern
   // surfaces as an anonymous NullPointerException from the compatibility check
   // and a null target is not noticed until the first search is attempted.
   if (null == inPattern)
   {
      throw new NullPointerException("The pattern cannot be null!");
   }

   if (null == inTarget)
   {
      throw new NullPointerException("The target cannot be null!");
   }

   mPattern = inPattern;
   mTarget = inTarget;
   mSeqLocation = inSeqLocation;

   verifyPatternCompatibility();
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Searches for the first occurrence of the pattern using the forward regular expression.
 The region searched is inSeqLocation if given, otherwise the location supplied
 at construction, otherwise the entire target. No strand is set on the match
 by this method.
 @param inSeqLocation region of the target to search; may be null
 @return the first match, or null if the pattern was not found in the region
 */
public T find(SeqLocation inSeqLocation)
{
   SeqLocation region = inSeqLocation;
   if (null == region)
   {
      region = mSeqLocation;
   }

   if (null == region)
   {
      region = new SeqLocation(1, getTarget().length());
   }

   Matcher regexpMatcher = getRegExpPattern().matcher(getTarget().getSubSequence(region));
   if (! regexpMatcher.find())
   {
      return null;
   }

   // Translate the offset within the searched subsequence back into target coordinates.
   int matchStart = regexpMatcher.start() + region.getStart();
   int matchEnd   = matchStart + regexpMatcher.group().length() - 1;

   SeqLocation matchLoc = new SeqLocation(matchStart, matchEnd).setChainId(getTarget().getID());

   return getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
}
//---------------------------------------------------------------------------
/**
 Finds all (possibly overlapping) occurrences of the pattern.
 The region searched is inSeqLocation if given, otherwise the location supplied
 at construction (the same fallback find() uses), otherwise the entire target.
 The forward regular expression is always searched; for nucleotide patterns
 the forward matches are flagged with Strand.FORWARD. If the pattern is a
 NucleotidePattern whose strands-to-search include Strand.REVERSE, the
 reverse-strand regular expression is searched as well and those matches are
 flagged with Strand.REVERSE.
 @param inSeqLocation region of the target to search; may be null
 @return the matches found, forward matches first; empty if there are none
 */
@Override
public List<T> findAll(SeqLocation inSeqLocation)
{
   SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation);
   if (null == seqLocation)
   {
      seqLocation = new SeqLocation(1, getTarget().length());
   }

   List<T> matches = new ArrayList<>(25);

   String sequence = getTarget().getSubSequence(seqLocation);

   boolean nucleotideSearch = getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);

   collectMatches(getRegExpPattern(), sequence, seqLocation, (nucleotideSearch ? Strand.FORWARD : null), matches);

   if (getPattern() instanceof NucleotidePattern
       && ((NucleotidePattern) getPattern()).getStrandsToSearch() != null
       && ((NucleotidePattern) getPattern()).getStrandsToSearch().contains(Strand.REVERSE))
   {
      collectMatches(getReverseStandRegExpPattern(), sequence, seqLocation, Strand.REVERSE, matches);
   }

   return matches;
}

//---------------------------------------------------------------------------
/**
 Appends to inMatches every match of inRegExp within inSequence, restarting the
 search one character after the start of the previous match so that
 overlapping matches are reported.
 @param inRegExp the regular expression to apply
 @param inSequence the subsequence text being searched
 @param inSeqLocation the location of inSequence within the target
 @param inStrand strand to set on each match, or null to leave the strand unset
 @param inMatches the list to which matches are added
 */
private void collectMatches(Pattern inRegExp, String inSequence, SeqLocation inSeqLocation, Strand inStrand, List<T> inMatches)
{
   Matcher m = inRegExp.matcher(inSequence);

   int index = 0;
   // Matcher.find(int) throws IndexOutOfBoundsException if the index is past
   // the end of the input, which happens after a zero-length match at the
   // very end of the sequence. Hence the explicit bounds check.
   while (index <= inSequence.length()
          && m.find(index))
   {
      index = m.start() + 1;

      int start = m.start() + inSeqLocation.getStart();
      SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID());

      T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
      if (inStrand != null)
      {
         ((NucleotidePatternMatch) match).setStrand(inStrand);
      }

      inMatches.add(match);
   }
}
//---------------------------------------------------------------------------
/**
 Returns the sequence that this matcher searches.
 @return the target given to the constructor
 */
public S getTarget()
{
   return this.mTarget;
}
//###########################################################################
// PROTECTED METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Returns the pattern that this matcher searches for.
 @return the pattern given to the constructor
 */
protected SeqPattern getPattern()
{
   return this.mPattern;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Checks that the pattern and target can be handled by this algorithm.
 @throws ProgrammingException if the pattern allows mismatches
 @throws RuntimeException if the target class and the pattern's sequence type
         disagree, or if the target is a Protein that has chains
 */
private void verifyPatternCompatibility()
{
   if (getPattern().getMaxMismatches() > 0)
   {
      throw new ProgrammingException("Patterns allowing mismatches cannot by used by the " + sAlgorithmName + " algorithm!");
   }

   final Object target = getTarget();
   final boolean proteinTarget = target instanceof Protein;

   // The pattern's sequence type is only consulted for the target kind that applies.
   boolean typeMismatch = proteinTarget
                          && ! getPattern().getBioSequenceType().equals(BioSequenceType.PROTEIN);
   if (! typeMismatch
       && target instanceof NucleicAcid)
   {
      typeMismatch = ! getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);
   }

   if (typeMismatch)
   {
      throw new RuntimeException("Target / Pattern Mismatch! Target is a " + target.getClass().getSimpleName() + " but the pattern type is " + getPattern().getBioSequenceType() + "!");
   }

   if (proteinTarget
       && ((Protein) target).hasChains())
   {
      throw new RuntimeException("Pattern matching can only be done with single-chain proteins! Please pass the chains of multi-chain proteins to the pattern matcher individually.");
   }
}
//---------------------------------------------------------------------------
/**
 Returns the forward regular expression for the Prosite pattern, building it
 on the first call and reusing the cached instance afterwards.
 @return the compiled forward regular expression
 */
private Pattern getRegExpPattern()
{
   if (mRegExpPattern != null)
   {
      return mRegExpPattern;
   }

   boolean nucleotidePattern = getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);

   mRegExpPattern = (nucleotidePattern ? buildDNARegExp() : buildProteinRegExp());

   return mRegExpPattern;
}
//---------------------------------------------------------------------------
/**
 Translates the nucleotide Prosite pattern into a case-insensitive regular expression.
 Positions are separated by dashes. Each position may be a single base, a
 bracketed set of allowed bases ("[AC]"), or a braced set of excluded bases
 ("{AC}"), optionally followed by a repeat count ("(3)" or "(2,4)"). A leading
 '<' or trailing '>' on a position anchors it to the start or end of the
 sequence. Ambiguous bases are expanded with their degeneracy. When the
 pattern ignores gaps, runs of '-' are permitted between positions and
 between the repeats of a counted position.
 @return the compiled regular expression
 @throws SeqPatternConfigurationException if a position is empty, holds several
         required residues, or contains a character that is not a nucleotide
 */
private Pattern buildDNARegExp()
{
   String prositePattern = getPattern().getPrositePattern();

   // Remove the period at the end
   if (prositePattern.endsWith("."))
   {
      prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
   }

   final boolean ignoreGaps = getPattern().getIgnoreGaps();

   StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(ignoreGaps ? "\\-*" : "");

   // Dashes separate positions
   for (String position : prositePattern.split("\\-"))
   {
      boolean nTerm = false;
      boolean cTerm = false;

      if (position.startsWith("<")) // 5'?
      {
         nTerm = true;
         position = position.substring(1);
      }

      if (position.endsWith(">")) // 3'?
      {
         cTerm = true;
         position = position.substring(0, position.length() - 1);
      }

      // Extract the count spec if present
      String countSpec = null;
      Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
      if (m.find())
      {
         countSpec = "{" + m.group(1) + "}";
         position = position.substring(0, m.start(1) - 1);
      }

      String element;
      if (position.startsWith("{")
          && position.endsWith("}"))
      {
         element = buildDNACharClass(position, true);
      }
      else if (position.startsWith("[")
               && position.endsWith("]"))
      {
         element = buildDNACharClass(position, false);
      }
      else
      {
         element = buildDNASingleBase(position);
      }

      // A counted position carries its own leading gap allowance so that gaps
      // are tolerated between the repeats as well as in front of them.
      boolean gapTolerantRepeat = (ignoreGaps && countSpec != null);
      if (gapTolerantRepeat)
      {
         element = "(?:\\-*" + element + ")";
      }

      // A '<' or '>' left inside the element means "or the sequence start / end".
      if (element.contains("<"))
      {
         element = "(?:" + element.replace("<", "") + "|\\A)";
      }
      else if (element.contains(">"))
      {
         element = "(?:" + element.replace(">", "") + "|\\Z)";
      }

      String piece = (nTerm ? "^" : "")
                     + element
                     + (countSpec != null ? countSpec : "")
                     + (cTerm ? "$" : "");

      if (gapTolerantRepeat)
      {
         // The gap allowance is already inside the group, so no delimiter is added.
         regexp.append(piece);
      }
      else
      {
         regexp.delimitedAppend(piece);
      }
   }

   return Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE);
}

//---------------------------------------------------------------------------
/**
 Converts a bracketed ("[...]") or braced ("{...}") Prosite position into a
 regular expression character class, expanding ambiguous bases.
 @param inPosition the position text including its enclosing brackets or braces
 @param inNegate true to build an excluding class ("[^...]"); when the pattern
                 ignores gaps the excluding class also excludes '-'
 @return the character class text
 @throws SeqPatternConfigurationException if a character other than a
         nucleotide, '<', or '>' is found between the delimiters
 */
private String buildDNACharClass(String inPosition, boolean inNegate)
{
   StringBuilder buffer = new StringBuilder(inNegate ? "[^" : "[");

   for (int i = 1; i < inPosition.length() - 1; i++)
   {
      char theChar = inPosition.charAt(i);
      Nucleotide base = Nucleotide.valueOf(theChar);
      if (null == base
          && theChar != '<'
          && theChar != '>')
      {
         throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
      }

      buffer.append(theChar);
      appendDegenerateBases(buffer, base);
   }

   if (inNegate
       && getPattern().getIgnoreGaps())
   {
      buffer.append("\\-");
   }

   buffer.append("]");

   return buffer.toString();
}

//---------------------------------------------------------------------------
/**
 Converts a single-residue Prosite position into its regular expression form.
 @param inPosition the position text with anchors and count already removed
 @return "." (or "[^\-]" when gaps are ignored) for N, a character class for
         any other ambiguous base, or the position text itself otherwise
 @throws SeqPatternConfigurationException if the position is empty, longer
         than one character, or not a nucleotide
 */
private String buildDNASingleBase(String inPosition)
{
   if (inPosition.isEmpty())
   {
      // Arises from patterns with a leading dash, a doubled dash, or a bare "<>".
      throw new SeqPatternConfigurationException("Empty position found in the nucleotide pattern!");
   }

   if (inPosition.length() > 1)
   {
      throw new SeqPatternConfigurationException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(inPosition) + "!");
   }

   char residue = inPosition.charAt(0);
   Nucleotide base = Nucleotide.valueOf(residue);
   if (null == base
       && residue != '<'
       && residue != '>')
   {
      throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!");
   }

   if (null == base
       || ! base.isAmbiguous())
   {
      return inPosition;
   }

   if (inPosition.equalsIgnoreCase("N"))
   {
      return (getPattern().getIgnoreGaps() ? "[^\\-]" : ".");
   }

   StringBuilder buffer = new StringBuilder("[");
   buffer.append(residue);
   appendDegenerateBases(buffer, base);
   buffer.append("]");

   return buffer.toString();
}

//---------------------------------------------------------------------------
/**
 Appends the one-letter codes of the bases represented by an ambiguous base.
 Does nothing if inBase is null or not ambiguous.
 @param inBuffer the buffer to append to
 @param inBase the base whose degeneracy should be appended; may be null
 */
private void appendDegenerateBases(StringBuilder inBuffer, Nucleotide inBase)
{
   if (inBase != null
       && inBase.isAmbiguous())
   {
      for (Nucleotide baseOption : inBase.getDegeneracy())
      {
         inBuffer.append(baseOption.getOneLetterCode());
      }
   }
}
//---------------------------------------------------------------------------
/**
 Translates the protein Prosite pattern into a case-insensitive regular expression.
 Positions are separated by dashes. "x" stands for any residue, "{...}" for
 any residue except those listed, and a trailing "(n)" or "(n,m)" is a repeat
 count. A leading '<' or trailing '>' on a position anchors it to the start
 or end of the sequence. Position text is otherwise passed through unchanged.
 When the pattern ignores gaps, runs of '-' are permitted between positions.
 @return the compiled regular expression
 */
private Pattern buildProteinRegExp()
{
   final boolean gapsIgnored = getPattern().getIgnoreGaps();

   String spec = getPattern().getPrositePattern();

   // Drop the terminating period, if any
   if (spec.endsWith("."))
   {
      spec = spec.substring(0, spec.length() - 1);
   }

   StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(gapsIgnored ? "\\-*" : "");

   // Positions are dash-separated
   String[] positions = spec.split("\\-");
   for (int i = 0; i < positions.length; i++)
   {
      String element = positions[i];

      final boolean nTerm = element.startsWith("<"); // N-terminus?
      if (nTerm)
      {
         element = element.substring(1);
      }

      final boolean cTerm = element.endsWith(">"); // C-terminus?
      if (cTerm)
      {
         element = element.substring(0, element.length() - 1);
      }

      // Peel off the repeat count, if there is one
      String countSpec = "";
      Matcher countMatcher = PROSITE_COUNT_PATTERN.matcher(element);
      if (countMatcher.find())
      {
         countSpec = "{" + countMatcher.group(1) + "}";
         element = element.substring(0, countMatcher.start(1) - 1);
      }

      // NOTE(review): when gaps are ignored only the "x" form carries its own gap
      // allowance; a repeat count on any other position does not permit gaps
      // between the repeats - confirm that this is intended.
      if (element.equalsIgnoreCase("x"))
      {
         element = (gapsIgnored ? "(?:[^\\-]\\-*)" : ".");
      }
      else if (element.startsWith("{")
               && element.endsWith("}"))
      {
         element = "[^" + element.substring(1, element.length() - 1) + (gapsIgnored ? "\\-" : "") + "]";
      }

      // A '<' or '>' remaining inside the element means "or the sequence start / end".
      if (element.contains("<"))
      {
         element = "(?:" + element.replace("<", "") + "|\\A)";
      }
      else if (element.contains(">"))
      {
         element = "(?:" + element.replace(">", "") + "|\\Z)";
      }

      StringBuilder piece = new StringBuilder();
      if (nTerm)
      {
         piece.append("^");
      }

      piece.append(element).append(countSpec);

      if (cTerm)
      {
         piece.append("$");
      }

      regexp.delimitedAppend(piece.toString());
   }

   return Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE);
}
//---------------------------------------------------------------------------
/**
 Returns the regular expression that finds reverse-strand occurrences of the
 pattern when applied to the forward sequence text, building it on the first
 call and reusing the cached instance afterwards.
 The expression is derived from the forward regular expression by reversing
 the order of its units and complementing its bases. A unit is an atom (a
 base, '.', an escape, a character class, or a group) together with any
 quantifier that follows it, so quantifiers, escapes, groups, and negated
 classes stay intact. Start and end anchors are exchanged.
 @return the compiled reverse-strand regular expression
 */
private Pattern getReverseStandRegExpPattern()
{
   if (null == mRevStrandRegExpPattern)
   {
      String fwdRegExp = getRegExpPattern().pattern();

      mRevStrandRegExpPattern = Pattern.compile(reverseComplementRegExp(fwdRegExp), Pattern.CASE_INSENSITIVE);
   }

   return mRevStrandRegExpPattern;
}

//---------------------------------------------------------------------------
/**
 Reverse-complements a regular expression of the form produced by buildDNARegExp().
 Only the constructs that method emits are understood: literal bases, '.',
 two-character escapes, character classes, plain and "(?:" groups, top-level
 alternation within a group, the anchors ^ $ \A \Z, and the quantifiers
 * + ? {n} {n,m}. Alternatives keep their relative order; the units inside
 each alternative are reversed.
 @param inRegExp the forward regular expression text
 @return the reverse-complemented regular expression text
 */
private String reverseComplementRegExp(String inRegExp)
{
   StringBuilder result = new StringBuilder(inRegExp.length());
   List<String> units = new ArrayList<>();

   int pos = 0;
   while (pos < inRegExp.length())
   {
      char theChar = inRegExp.charAt(pos);

      if (theChar == '|')
      {
         // End of an alternative: flush its units in reverse order.
         appendReversed(result, units);
         result.append('|');
         pos++;
         continue;
      }

      String atom;
      int atomEnd; // exclusive

      if (theChar == '\\'
          && pos + 1 < inRegExp.length())
      {
         atomEnd = pos + 2;
         char escaped = inRegExp.charAt(pos + 1);
         if (escaped == 'A')
         {
            atom = "\\Z";
         }
         else if (escaped == 'Z')
         {
            atom = "\\A";
         }
         else
         {
            // Escaped literal such as "\-"; must not be complemented or split.
            atom = inRegExp.substring(pos, atomEnd);
         }
      }
      else if (theChar == '[')
      {
         atomEnd = findCharClassEnd(inRegExp, pos) + 1;
         atom = complementCharClass(inRegExp.substring(pos, atomEnd));
      }
      else if (theChar == '(')
      {
         atomEnd = findGroupEnd(inRegExp, pos) + 1;
         int contentStart = pos + (inRegExp.startsWith("(?:", pos) ? 3 : 1);
         atom = inRegExp.substring(pos, contentStart)
                + reverseComplementRegExp(inRegExp.substring(contentStart, atomEnd - 1))
                + ")";
      }
      else if (theChar == '^')
      {
         atomEnd = pos + 1;
         atom = "$";
      }
      else if (theChar == '$')
      {
         atomEnd = pos + 1;
         atom = "^";
      }
      else
      {
         atomEnd = pos + 1;
         atom = complementBase(theChar);
      }

      // Keep any quantifier attached to the atom it applies to.
      int unitEnd = findQuantifierEnd(inRegExp, atomEnd);
      units.add(atom + inRegExp.substring(atomEnd, unitEnd));

      pos = unitEnd;
   }

   appendReversed(result, units);

   return result.toString();
}

//---------------------------------------------------------------------------
/**
 Appends the units to the buffer in reverse order and then empties the list.
 @param inBuffer the buffer to append to
 @param inUnits the units collected for one alternative
 */
private void appendReversed(StringBuilder inBuffer, List<String> inUnits)
{
   for (int i = inUnits.size() - 1; i >= 0; i--)
   {
      inBuffer.append(inUnits.get(i));
   }

   inUnits.clear();
}

//---------------------------------------------------------------------------
/**
 Complements the bases inside a character class. The order of the members is
 retained so that a leading '^' continues to negate the class, and escaped
 characters are copied through untouched.
 @param inCharClass the character class text including its brackets
 @return the complemented character class text
 */
private String complementCharClass(String inCharClass)
{
   StringBuilder buffer = new StringBuilder(inCharClass.length());

   for (int i = 0; i < inCharClass.length(); i++)
   {
      char theChar = inCharClass.charAt(i);
      if (theChar == '\\'
          && i + 1 < inCharClass.length())
      {
         buffer.append(theChar).append(inCharClass.charAt(++i));
      }
      else
      {
         buffer.append(complementBase(theChar));
      }
   }

   return buffer.toString();
}

//---------------------------------------------------------------------------
/**
 Returns the one-letter code of the complement of a base.
 @param inChar the character to complement
 @return the complement's one-letter code, or the character itself if it is
         not a letter or is not recognized as a nucleotide
 */
private String complementBase(char inChar)
{
   if (! Character.isLetter(inChar))
   {
      return String.valueOf(inChar);
   }

   Nucleotide base = Nucleotide.valueOf(inChar);

   return (null == base ? String.valueOf(inChar) : String.valueOf(base.getComplement().getOneLetterCode()));
}

//---------------------------------------------------------------------------
/**
 Locates the ']' that closes the character class opening at inStart.
 @param inRegExp the regular expression text
 @param inStart index of the opening '['
 @return index of the closing ']'
 @throws ProgrammingException if the class is not terminated
 */
private int findCharClassEnd(String inRegExp, int inStart)
{
   for (int i = inStart + 1; i < inRegExp.length(); i++)
   {
      char theChar = inRegExp.charAt(i);
      if (theChar == '\\')
      {
         i++; // Skip the escaped character
      }
      else if (theChar == ']')
      {
         return i;
      }
   }

   throw new ProgrammingException("Unterminated character class in the regular expression " + StringUtil.singleQuote(inRegExp) + "!");
}

//---------------------------------------------------------------------------
/**
 Locates the ')' that closes the group opening at inStart, allowing for
 nested groups, character classes, and escapes.
 @param inRegExp the regular expression text
 @param inStart index of the opening '('
 @return index of the matching ')'
 @throws ProgrammingException if the group is not terminated
 */
private int findGroupEnd(String inRegExp, int inStart)
{
   int depth = 0;
   for (int i = inStart; i < inRegExp.length(); i++)
   {
      char theChar = inRegExp.charAt(i);
      if (theChar == '\\')
      {
         i++; // Skip the escaped character
      }
      else if (theChar == '[')
      {
         i = findCharClassEnd(inRegExp, i);
      }
      else if (theChar == '(')
      {
         depth++;
      }
      else if (theChar == ')')
      {
         depth--;
         if (0 == depth)
         {
            return i;
         }
      }
   }

   throw new ProgrammingException("Unterminated group in the regular expression " + StringUtil.singleQuote(inRegExp) + "!");
}

//---------------------------------------------------------------------------
/**
 Determines where the quantifier (if any) that begins at inStart ends.
 @param inRegExp the regular expression text
 @param inStart index immediately after an atom
 @return index immediately after the quantifier, or inStart if there is none
 */
private int findQuantifierEnd(String inRegExp, int inStart)
{
   int end = inStart;
   if (end < inRegExp.length())
   {
      char theChar = inRegExp.charAt(end);
      if (theChar == '*'
          || theChar == '+'
          || theChar == '?')
      {
         end++;
      }
      else if (theChar == '{')
      {
         int close = inRegExp.indexOf('}', end);
         if (close >= 0)
         {
            end = close + 1;
         }
      }
   }

   // A reluctant ('?') or possessive ('+') modifier belongs to the quantifier.
   if (end > inStart
       && end < inRegExp.length()
       && (inRegExp.charAt(end) == '?' || inRegExp.charAt(end) == '+'))
   {
      end++;
   }

   return end;
}
}