com.hfg.bio.seq.pattern.NucleotidePattern Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.pattern;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
Container for a nucleotide pattern (motif).
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// NOTE(review): this source appears to have lost its generic type arguments during
// extraction (the NucleicAcid import is otherwise unused and createMatch() refers to
// an undeclared 'T'). The extends clause is left exactly as found - confirm against
// the declaration of SeqPattern before compiling.
public class NucleotidePattern extends SeqPattern
{
   // Strands on which this pattern is to be searched. Defaults to every Strand value
   // (i.e. both strands).
   private List<Strand> mStrandsToSearch = Arrays.asList(Strand.values());

   // Not referenced by any code visible in this class.
   private Pattern mAlternateStrandPattern;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    No-argument constructor reserved for subclasses.
    */
   protected NucleotidePattern()
   {
   }

   //--------------------------------------------------------------------------
   /**
    Creates a pattern from the specified pattern string. The string is upper-cased
    before being handed to the superclass constructor.
    <p>
    The conversion uses {@link Locale#ROOT} so that the result does not depend on the
    JVM's default locale (under a Turkish locale, for example, a lower-case 'i' would
    otherwise become a dotted capital I instead of 'I').
    </p>
    @param inPatternString the pattern specification; must not be null
    @throws NullPointerException if inPatternString is null
    */
   public NucleotidePattern(String inPatternString)
   {
      super(inPatternString.toUpperCase(Locale.ROOT));
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Returns a copy of this pattern that has its own, independent list of strands
    to search.
    @return the cloned pattern
    */
   @Override
   public NucleotidePattern clone()
   {
      NucleotidePattern cloneObj = (NucleotidePattern) super.clone();

      if (mStrandsToSearch != null)
      {
         // Give the clone its own list so later strand changes do not leak between copies.
         cloneObj.mStrandsToSearch = new ArrayList<>(mStrandsToSearch);
      }

      return cloneObj;
   }

   //--------------------------------------------------------------------------
   /**
    @return always {@link BioSequenceType#NUCLEIC_ACID}
    */
   public BioSequenceType getBioSequenceType()
   {
      return BioSequenceType.NUCLEIC_ACID;
   }

   //--------------------------------------------------------------------------
   /**
    Delegates to the superclass setter, narrowing the return type so that calls can
    be chained on a NucleotidePattern.
    @param inValue value passed through to the superclass setter
    @return the superclass result cast to NucleotidePattern
    */
   @Override
   public NucleotidePattern setIgnoreGaps(boolean inValue)
   {
      return (NucleotidePattern) super.setIgnoreGaps(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Delegates to the superclass setter, narrowing the return type so that calls can
    be chained on a NucleotidePattern.
    @param inValue value passed through to the superclass setter
    @return the superclass result cast to NucleotidePattern
    */
   @Override
   public NucleotidePattern setMaxMismatches(int inValue)
   {
      return (NucleotidePattern) super.setMaxMismatches(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Restricts searching to a single strand.
    @param inValue the only strand to search
    @return this pattern, to allow method chaining
    */
   public NucleotidePattern setStrandsToSearch(Strand inValue)
   {
      mStrandsToSearch = new ArrayList<>(1);
      mStrandsToSearch.add(inValue);
      return this;
   }

   //--------------------------------------------------------------------------
   /**
    Specifies the strands to search.
    <p>
    The values are copied, so later changes to the caller's array do not alter this
    pattern's configuration.
    </p>
    @param inValues the strands to search; must not be null
    @return this pattern, to allow method chaining
    @throws NullPointerException if inValues is null
    */
   public NucleotidePattern setStrandsToSearch(Strand[] inValues)
   {
      // Arrays.asList() alone is a fixed-size view backed by the caller's array;
      // wrap it in an ArrayList to detach from that array.
      mStrandsToSearch = new ArrayList<>(Arrays.asList(inValues));
      return this;
   }

   //--------------------------------------------------------------------------
   /**
    @return the strands that will be searched. This is the pattern's internal list,
            not a copy.
    */
   public Collection<Strand> getStrandsToSearch()
   {
      return mStrandsToSearch;
   }

   //--------------------------------------------------------------------------
   // Creates the NucleotidePatternMatch for a hit of this pattern on the given sequence
   // string at the given location.
   // NOTE(review): the declaration of the type variable 'T' is not visible in this
   // source (presumably lost in extraction along with the other generics); the
   // signature is left untouched - confirm against SeqPattern.
   @Override
   protected T createMatch(String inSeq, SeqLocation inLocation)
   {
      return (T) new NucleotidePatternMatch(this, inSeq, inLocation);
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Converts a PROSITE-style nucleotide pattern into a regular expression string.
    <p>
    A single trailing period is dropped and the remainder is split on '-' into
    positions. For each position:
    </p>
    <ul>
      <li>a leading '&lt;' / trailing '&gt;' is removed and emitted as a '^' / '$' anchor;</li>
      <li>a count specification matched by PROSITE_COUNT_PATTERN is removed and emitted
          as a "{...}" quantifier;</li>
      <li>"{...}" becomes a negated character class, "[...]" a character class, and
          anything else must be a single base;</li>
      <li>ambiguous bases are expanded with the codes returned by their degeneracy,
          except a lone 'N' which becomes "any character" (or "any non-gap character"
          when gaps are ignored).</li>
    </ul>
    <p>
    When gaps are ignored, positions are joined with an optional run of '-' and a
    counted position carries that optional gap run inside its repeated group.
    </p>
    @param inPrositePattern the pattern to convert; must not be null
    @return the regular expression string
    @throws RuntimeException if a position is empty, contains more than one required
            residue, or contains a character that is not a recognized base
    */
   protected String convertStringToRegExp(String inPrositePattern)
   {
      // Remove the period at the end
      if (inPrositePattern.endsWith("."))
      {
         inPrositePattern = inPrositePattern.substring(0, inPrositePattern.length() - 1);
      }

      final boolean ignoreGaps = getIgnoreGaps();

      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(ignoreGaps ? "\\-*" : "");

      // Dashes separate positions
      for (String position : inPrositePattern.split("\\-"))
      {
         boolean nTerm = false;
         boolean cTerm = false;

         if (position.startsWith("<")) // 5'?
         {
            nTerm = true;
            position = position.substring(1);
         }

         if (position.endsWith(">")) // 3'?
         {
            cTerm = true;
            position = position.substring(0, position.length() - 1);
         }

         // Extract the count spec if present
         String countSpec = null;
         Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
         if (m.find())
         {
            countSpec = "{" + m.group(1) + "}";
            // Keep only what precedes the character just before the captured count
            // (presumably the opening delimiter of the count spec).
            position = position.substring(0, m.start(1) - 1);
         }

         if (position.startsWith("{")
             && position.endsWith("}"))
         {
            position = buildNucleotideCharClass(position, true, ignoreGaps);
         }
         else if (position.startsWith("[")
                  && position.endsWith("]"))
         {
            position = buildNucleotideCharClass(position, false, ignoreGaps);
         }
         else
         {
            position = convertSingleBasePosition(position, ignoreGaps);
         }

         final boolean countedWithGaps = (ignoreGaps && countSpec != null);
         if (countedWithGaps)
         {
            // The quantifier must repeat "optional gaps + base" as a unit.
            position = "(?:\\-*" + position + ")";
         }

         // An anchor character that was listed as one of the alternatives of a position
         // turns into an alternation with the corresponding input boundary.
         if (position.contains("<"))
         {
            position = "(?:" + position.replace("<", "") + "|\\A)";
         }
         else if (position.contains(">"))
         {
            position = "(?:" + position.replace(">", "") + "|\\Z)";
         }

         String piece = (nTerm ? "^" : "")
                        + position
                        + (countSpec != null ? countSpec : "")
                        + (cTerm ? "$" : "");

         if (countedWithGaps)
         {
            // The gap allowance is already inside the group, so skip the delimiter.
            regexp.append(piece);
         }
         else
         {
            regexp.delimitedAppend(piece);
         }
      }

      return regexp.toString();
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   // Builds a regexp character class from a bracketed position ("[...]" or "{...}").
   // The first and last characters of inPosition are the brackets and are skipped.
   // Each listed character is validated and copied; ambiguous bases are followed by
   // their degenerate codes. A negated class also excludes '-' when gaps are ignored.
   private String buildNucleotideCharClass(String inPosition, boolean inNegated, boolean inIgnoreGaps)
   {
      StringBuilder buffer = new StringBuilder(inNegated ? "[^" : "[");

      for (int i = 1; i < inPosition.length() - 1; i++)
      {
         char theChar = inPosition.charAt(i);

         Nucleotide base = Nucleotide.valueOf(theChar);
         if (null == base
             && theChar != '<'
             && theChar != '>')
         {
            throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
         }

         buffer.append(theChar);

         if (base != null
             && base.isAmbiguous())
         {
            for (Nucleotide baseOption : base.getDegeneracy())
            {
               buffer.append(baseOption.getOneLetterCode());
            }
         }
      }

      if (inNegated
          && inIgnoreGaps)
      {
         buffer.append("\\-");
      }

      buffer.append("]");

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   // Converts an unbracketed position, which must be exactly one character, into its
   // regexp form. Unambiguous bases (and a bare '<' or '>') are returned unchanged.
   private String convertSingleBasePosition(String inPosition, boolean inIgnoreGaps)
   {
      if (inPosition.isEmpty())
      {
         // Previously surfaced as a StringIndexOutOfBoundsException from charAt(0).
         throw new RuntimeException("Empty position found in the nucleotide pattern!");
      }

      if (inPosition.length() > 1)
      {
         throw new RuntimeException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(inPosition) + "!");
      }

      char residue = inPosition.charAt(0);

      Nucleotide base = Nucleotide.valueOf(residue);
      if (null == base
          && ! inPosition.equals("<")
          && ! inPosition.equals(">"))
      {
         throw new RuntimeException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!");
      }

      if (null == base
          || ! base.isAmbiguous())
      {
         return inPosition;
      }

      if (inPosition.equalsIgnoreCase("N"))
      {
         // 'N' matches anything; when gaps are ignored it must still not consume a gap.
         return inIgnoreGaps ? "[^\\-]" : ".";
      }

      StringBuilder buffer = new StringBuilder("[");
      buffer.append(residue);
      for (Nucleotide baseOption : base.getDegeneracy())
      {
         buffer.append(baseOption.getOneLetterCode());
      }
      buffer.append("]");

      return buffer.toString();
   }
}