// Source listing of com.hfg.bio.seq.pattern.NucleotidePattern, from the com_hfg artifact
// (available via Maven / Gradle / Ivy).
// com.hfg is an XML, HTML, SVG, and bioinformatics utility library.
package com.hfg.bio.seq.pattern;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
//------------------------------------------------------------------------------
/**
Container for a nucleotide pattern (motif).
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// A nucleotide motif written in IUPAC one-letter codes. The pattern string may also carry
// cut-site markup, which convertStringToRegExp() strips while building the search regexp:
//   - a '/' inside the motif marks a cut position,
//   - a leading and/or trailing "(a/b)" marks cut positions outside the motif.
// T is the match type produced by createMatch().
public class NucleotidePattern<T extends NucleotidePatternMatch>
{
   // Regexp fragment inserted between bases when gaps are ignored: zero or more '-' characters.
   private static final String sGapRegExp = "\\-*";

   private static final Pattern s5PrimeNonPalindromicCutSpecPattern = Pattern.compile("^\\((\\-?\\d+)/(\\-?\\d+)\\)");
   private static final Pattern s3PrimeNonPalindromicCutSpecPattern = Pattern.compile("\\((\\-?\\d+)/(\\-?\\d+)\\)$");

   private String  mPatternString;
   // Everything below is derived lazily from mPatternString (and mIgnoreGaps).
   private String  mRegExp;
   private Pattern mPattern;
   private boolean mIsPalindromic;
   private String  mAlternateStrandRegExp;
   private Pattern mAlternateStrandPattern;
   private int[]   mFwdStrandCutSiteIndices;
   private int[]   mRevStrandCutSiteIndices;
   private boolean mIgnoreGaps = false;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Creates a pattern with no pattern string; subclasses are expected to call
    {@link #setPatternString(String)} before the pattern is used.
    */
   protected NucleotidePattern()
   {
   }

   //--------------------------------------------------------------------------
   /**
    Creates a pattern from the specified pattern string.
    * @param inPatternString the motif (optionally with cut-site markup). It is trimmed and upper-cased.
    */
   public NucleotidePattern(String inPatternString)
   {
      setPatternString(inPatternString);
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Returns the pattern string.
    * @return the same value as {@link #getPatternString()}
    */
   @Override
   public String toString()
   {
      return getPatternString();
   }

   //--------------------------------------------------------------------------
   /**
    Returns the normalized (trimmed, upper-cased) pattern string, including any cut-site markup.
    * @return the pattern string or null if none has been set
    */
   public String getPatternString()
   {
      return mPatternString;
   }

   //--------------------------------------------------------------------------
   /**
    Specifies whether runs of '-' between motif positions should be tolerated when matching.
    Changing the value discards every cached regexp / Pattern (both strands) so that they are
    rebuilt with the new setting on next use.
    * @param inValue whether gap characters should be ignored
    * @return this pattern object to allow method chaining
    */
   public NucleotidePattern<T> setIgnoreGaps(boolean inValue)
   {
      if (inValue != mIgnoreGaps)
      {
         mIgnoreGaps = inValue;
         clearCachedRegExps();
      }

      return this;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the forward-strand regular expression for the motif, building it on first use.
    Building it also determines the cut site indices and the palindromic flag.
    * @return the regular expression string
    */
   public String getRegExp()
   {
      if (null == mRegExp)
      {
         mRegExp = convertStringToRegExp();
      }

      return mRegExp;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the compiled, case-insensitive forward-strand Pattern, compiling it on first use.
    * @return the forward-strand Pattern
    */
   public Pattern getPattern()
   {
      if (null == mPattern)
      {
         mPattern = Pattern.compile(getRegExp(), Pattern.CASE_INSENSITIVE);
      }

      return mPattern;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the compiled, case-insensitive Pattern for the reverse-complement of the motif,
    building it on first use. The forward regexp is obtained via {@link #getRegExp()} so this
    method is safe to call before any other accessor.
    * @return the alternate-strand Pattern
    */
   public Pattern getAlternateStrandPattern()
   {
      if (null == mAlternateStrandPattern)
      {
         mAlternateStrandRegExp  = reverseComplementRegExp(getRegExp());
         mAlternateStrandPattern = Pattern.compile(mAlternateStrandRegExp, Pattern.CASE_INSENSITIVE);
      }

      return mAlternateStrandPattern;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the forward strand cut site indices relative to the start of the motif.
    The pattern string is parsed first if that has not happened yet.
    * @return Usually one but possibly two cut site indices on the forward strand.
    *         An empty array if the pattern has no cut sites; null if no pattern string is set.
    */
   public int[] getFwdStrandCutSiteIndices()
   {
      ensureParsed();
      return mFwdStrandCutSiteIndices;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the reverse strand cut site indices relative to the start of the motif.
    The pattern string is parsed first if that has not happened yet.
    * @return Usually one but possibly two cut site indices on the reverse strand.
    *         Null if the pattern has no reverse strand cut sites or no pattern string is set.
    */
   public int[] getRevStrandCutSiteIndices()
   {
      ensureParsed();
      return mRevStrandCutSiteIndices;
   }

   //---------------------------------------------------------------------------
   /**
    Reports whether the pattern is treated as palindromic. The flag is derived from the markup
    only: it is true unless the pattern string has a leading or trailing "(a/b)" cut spec.
    The pattern string is parsed first if that has not happened yet.
    * @return whether only the forward-strand pattern needs to be searched
    */
   public boolean isPalindromic()
   {
      ensureParsed();
      return mIsPalindromic;
   }

   //--------------------------------------------------------------------------
   /**
    Finds all (possibly overlapping) occurrences of the motif in the query sequence.
    Forward-strand matches are listed first. If the pattern is not palindromic the
    reverse-complement is searched as well and those matches are flagged Strand.REVERSE.
    * @param inQuery the nucleic acid whose sequence should be searched
    * @return the matches found; empty if there are none
    */
   public List<T> match(NucleicAcid inQuery)
   {
      List<T> matches = new ArrayList<>(25);

      CharSequence sequence = inQuery.getSequence();

      // getPattern() triggers parsing, which is what sets mIsPalindromic.
      collectMatches(getPattern(), sequence, null, matches);

      // If the recognition site is palindromic, we are done.
      // If it isn't, we need to search with the reverse-complement of the recognition site.
      if (! mIsPalindromic)
      {
         collectMatches(getAlternateStrandPattern(), sequence, Strand.REVERSE, matches);
      }

      return matches;
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Sets the pattern string (trimmed and upper-cased) and discards everything that was
    derived from a previous pattern string. A null value is ignored.
    * @param inValue the new pattern string
    */
   protected void setPatternString(String inValue)
   {
      if (inValue != null)
      {
         mPatternString = inValue.trim().toUpperCase();

         clearCachedRegExps();
         mFwdStrandCutSiteIndices = null;
         mRevStrandCutSiteIndices = null;
      }
   }

   //--------------------------------------------------------------------------
   /**
    Factory hook for the match objects returned by {@link #match(NucleicAcid)}.
    Subclasses using a more specific T should override this to build that type.
    * @param inPattern  the pattern that produced the match
    * @param inSeq      the matched sequence text
    * @param inLocation the location of the match (match() passes a 1-based start)
    * @return the new match object
    */
   @SuppressWarnings("unchecked") // Safe for the default T; subclasses with a narrower T must override.
   protected T createMatch(NucleotidePattern<T> inPattern, String inSeq, SeqLocation inLocation)
   {
      return (T) new NucleotidePatternMatch(inPattern, inSeq, inLocation);
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   // Drops the cached regexps / Patterns for both strands so they are rebuilt on next use.
   private void clearCachedRegExps()
   {
      mRegExp                 = null;
      mPattern                = null;
      mAlternateStrandRegExp  = null;
      mAlternateStrandPattern = null;
   }

   //--------------------------------------------------------------------------
   // Makes sure the pattern string has been parsed so that the palindromic flag and the
   // cut site indices are populated. Does nothing when no pattern string has been set.
   private void ensureParsed()
   {
      if (null == mRegExp
          && mPatternString != null)
      {
         getRegExp();
      }
   }

   //--------------------------------------------------------------------------
   // Appends a match for every (possibly overlapping) hit of inPattern in inSequence.
   // A non-null inStrand is set on each match that is created.
   private void collectMatches(Pattern inPattern, CharSequence inSequence, Strand inStrand, List<T> inMatches)
   {
      Matcher m = inPattern.matcher(inSequence);

      // 'index' does double duty: it is the 1-based start of the current hit and also the
      // 0-based offset from which the next search starts (one past the current hit's start).
      int index = 0;
      // Matcher.find(int) throws if the offset exceeds the input length, which would happen
      // after a zero-length hit at the very end of the sequence.
      while (index <= inSequence.length()
             && m.find(index))
      {
         index = m.start() + 1;

         SeqLocation seqLoc = new SeqLocation(index, index + m.group().length() - 1);
         T match = createMatch(this, m.group(), seqLoc);
         if (inStrand != null)
         {
            match.setStrand(inStrand);
         }

         inMatches.add(match);
      }
   }

   //--------------------------------------------------------------------------
   // Builds the regexp for the opposite strand. The forward regexp is split into tokens
   // (gap delimiter, bracketed character class, or single character) and the tokens are
   // emitted in reverse order. Reversing token-wise rather than character-wise keeps the
   // multi-character gap delimiter intact.
   private static String reverseComplementRegExp(String inRegExp)
   {
      List<String> tokens = new ArrayList<>(inRegExp.length());

      int pos = 0;
      while (pos < inRegExp.length())
      {
         int tokenEnd;
         if (inRegExp.startsWith(sGapRegExp, pos))
         {
            tokenEnd = pos + sGapRegExp.length();
         }
         else if (inRegExp.charAt(pos) == '['
                  && inRegExp.indexOf(']', pos) > pos)
         {
            tokenEnd = inRegExp.indexOf(']', pos) + 1;
         }
         else
         {
            tokenEnd = pos + 1;
         }

         tokens.add(inRegExp.substring(pos, tokenEnd));
         pos = tokenEnd;
      }

      StringBuilder buffer = new StringBuilder(inRegExp.length());
      for (int i = tokens.size() - 1; i >= 0; i--)
      {
         String token = tokens.get(i);
         if (token.equals(sGapRegExp))
         {
            // Gaps are strand-independent.
            buffer.append(token);
         }
         else
         {
            appendReverseComplement(token, buffer);
         }
      }

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   // Appends the characters of a single token in reverse order, replacing each letter with
   // the one-letter code of its complement and swapping '[' and ']' so that a reversed
   // character class is still well-formed. Other characters (such as '.') are copied.
   private static void appendReverseComplement(String inToken, StringBuilder inBuffer)
   {
      for (int i = inToken.length() - 1; i >= 0; i--)
      {
         char theChar = inToken.charAt(i);
         if (Character.isLetter(theChar))
         {
            inBuffer.append(Nucleotide.valueOf(theChar).getComplement().getOneLetterCode());
         }
         else if (theChar == '[')
         {
            inBuffer.append(']');
         }
         else if (theChar == ']')
         {
            inBuffer.append('[');
         }
         else
         {
            inBuffer.append(theChar);
         }
      }
   }

   //--------------------------------------------------------------------------
   // Parses the pattern string: strips the cut-site markup, records the cut site indices and
   // the palindromic flag as a side effect, and returns the forward-strand regexp.
   //
   // Cut site bookkeeping:
   //   leading  "(a/b)" : forward index -a - 1,       reverse index -b - 1
   //   trailing "(a/b)" : forward index length + a,   reverse index length + b
   //                      (length = what is left of the pattern string after removing both specs)
   //   '/' at index i   : forward index i,            reverse index length - i - 1
   //
   // Throws a RuntimeException if a character is not recognized by Nucleotide.valueOf().
   private String convertStringToRegExp()
   {
      List<Integer> forwardCutSiteIndices = new ArrayList<>(2);
      List<Integer> reverseCutSiteIndices = new ArrayList<>(2);

      String patternString = mPatternString;

      mIsPalindromic = true;

      Matcher m = s5PrimeNonPalindromicCutSpecPattern.matcher(patternString);
      if (m.find())
      {
         forwardCutSiteIndices.add(- Integer.parseInt(m.group(1)) - 1);
         reverseCutSiteIndices.add(- Integer.parseInt(m.group(2)) - 1);
         patternString = patternString.substring(m.group().length());
         mIsPalindromic = false;
      }

      m = s3PrimeNonPalindromicCutSpecPattern.matcher(patternString);
      if (m.find())
      {
         patternString = patternString.substring(0, patternString.length() - m.group().length());
         forwardCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(1)));
         reverseCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(2)));
         mIsPalindromic = false;
      }

      // When gaps are ignored, the gap fragment is placed between consecutive motif positions.
      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(mIgnoreGaps ? sGapRegExp : "");
      for (int i = 0; i < patternString.length(); i++)
      {
         char theChar = patternString.charAt(i);
         if (theChar == '/')
         {
            // The marker itself is not part of the regexp. The mirrored position goes to the
            // reverse strand list (the '/' still counts toward the length, hence the - 1).
            forwardCutSiteIndices.add(i);
            reverseCutSiteIndices.add(patternString.length() - i - 1);
         }
         else if (theChar == 'N')
         {
            regexp.delimitedAppend(".");
         }
         else
         {
            Nucleotide base = Nucleotide.valueOf(theChar);
            if (null == base)
            {
               throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
            }

            if (base.isAmbiguous())
            {
               // Character class of the ambiguity code itself plus every code it stands for.
               StringBuilder degeneracyString = new StringBuilder(theChar + "");
               for (Nucleotide baseOption : base.getDegeneracy())
               {
                  degeneracyString.append(baseOption.getOneLetterCode());
               }

               regexp.delimitedAppend("[" + degeneracyString + "]");
            }
            else
            {
               regexp.delimitedAppend(theChar);
            }
         }
      }

      mFwdStrandCutSiteIndices = toIntArray(forwardCutSiteIndices);

      // The reverse strand array is only created when there is something to put in it.
      if (CollectionUtil.hasValues(reverseCutSiteIndices))
      {
         mRevStrandCutSiteIndices = toIntArray(reverseCutSiteIndices);
      }

      return regexp.toString();
   }

   //--------------------------------------------------------------------------
   // Copies a list of Integers into a primitive int array of the same length.
   private static int[] toIntArray(List<Integer> inValues)
   {
      int[] result = new int[inValues.size()];
      for (int i = 0; i < result.length; i++)
      {
         result[i] = inValues.get(i);
      }

      return result;
   }
}