com.hfg.bio.seq.pattern.NucleotidePattern Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
There is a newer version: 20240423
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;

//------------------------------------------------------------------------------
/**
 Container for a nucleotide pattern (motif).

 
 @author J. Alex Taylor, hairyfatguy.com
 
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

/**
 Container for a nucleotide pattern (motif) that can be searched for in a nucleic acid sequence.
 <p>
 The pattern string is trimmed and upper-cased when set. It is lazily translated into a
 case-insensitive regular expression the first time the regexp, the compiled pattern,
 the cut site indices, or the palindromic flag is requested.
 </p>
 <p>
 Pattern string syntax as implemented by this class:
 </p>
 <ul>
   <li>'N' matches any character.</li>
   <li>Ambiguous bases are expanded to a character class containing the ambiguity code
       itself plus each base of its degeneracy.</li>
   <li>'/' marks a cut site within the motif and is not part of the regexp.</li>
   <li>A leading and/or trailing "(fwd/rev)" specification gives cut site offsets outside
       of the motif. A pattern with such a specification is flagged as non-palindromic
       and is also searched for on the reverse strand.</li>
 </ul>

 @param <T> the type of match object produced by createMatch()
 @author J. Alex Taylor, hairyfatguy.com
 */
public class NucleotidePattern<T extends NucleotidePatternMatch>
{
   private String  mPatternString;
   private String  mRegExp;
   private Pattern mPattern;
   private boolean mIsPalindromic;
   private String  mAlternateStrandRegExp;
   private Pattern mAlternateStrandPattern;

   private int[]   mFwdStrandCutSiteIndices;
   private int[]   mRevStrandCutSiteIndices;

   private boolean mIgnoreGaps = false;

   // Regexp fragment placed between pattern positions when gaps are being ignored.
   private static final String sGapRegExp = "\\-*";

   private static final Pattern s5PrimeNonPalindromicCutSpecPattern = Pattern.compile("^\\((\\-?\\d+)/(\\-?\\d+)\\)");
   private static final Pattern s3PrimeNonPalindromicCutSpecPattern = Pattern.compile("\\((\\-?\\d+)/(\\-?\\d+)\\)$");


   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Creates a pattern without a pattern string. setPatternString() is expected
    to be called before the pattern is used.
    */
   protected NucleotidePattern()
   {
   }

   //--------------------------------------------------------------------------
   /**
    Creates a pattern from the specified pattern string.
    * @param inPatternString the motif definition (see the class description for the syntax)
    */
   public NucleotidePattern(String inPatternString)
   {
      setPatternString(inPatternString);
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Returns the normalized pattern string.
    */
   @Override
   public String toString()
   {
      return getPatternString();
   }

   //--------------------------------------------------------------------------
   /**
    Returns the pattern string as stored (trimmed and upper-cased).
    * @return the pattern string or null if none has been set
    */
   public String getPatternString()
   {
      return mPatternString;
   }

   //--------------------------------------------------------------------------
   /**
    Specifies whether runs of '-' characters between pattern positions should be tolerated.
    Changing the value discards every cached regexp / Pattern so that they are rebuilt
    with the new setting.
    * @param inValue whether gaps should be ignored when matching
    * @return this pattern object to enable method chaining
    */
   public NucleotidePattern<T> setIgnoreGaps(boolean inValue)
   {
      if (inValue != mIgnoreGaps)
      {
         mIgnoreGaps = inValue;
         clearCachedPatterns();
      }

      return this;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the regular expression string for the forward strand, building it if necessary.
    * @return the forward strand regexp
    */
   public String getRegExp()
   {
      if (null == mRegExp)
      {
         mRegExp = convertStringToRegExp();
      }

      return mRegExp;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the compiled, case-insensitive forward strand Pattern, building it if necessary.
    * @return the forward strand Pattern
    */
   public Pattern getPattern()
   {
      if (null == mPattern)
      {
         mPattern = Pattern.compile(getRegExp(), Pattern.CASE_INSENSITIVE);
      }

      return mPattern;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the compiled, case-insensitive Pattern for the reverse-complement of the motif,
    building it if necessary.
    * @return the alternate strand Pattern
    */
   public Pattern getAlternateStrandPattern()
   {
      if (null == mAlternateStrandPattern)
      {
         // Go through getRegExp() so that the forward regexp is built on demand
         // instead of dereferencing a possibly null field.
         mAlternateStrandRegExp = reverseComplementRegExp(getRegExp());
         mAlternateStrandPattern = Pattern.compile(mAlternateStrandRegExp, Pattern.CASE_INSENSITIVE);
      }

      return mAlternateStrandPattern;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the forward strand cut site indices relative to the start of the motif.
    * @return Usually one but possibly two cut site indices on the forward strand.
    */
   public int[] getFwdStrandCutSiteIndices()
   {
      ensureParsed();
      return mFwdStrandCutSiteIndices;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the reverse strand cut site indices relative to the start of the motif.
    * @return Usually one but possibly two cut site indices on the reverse strand
    *         or null if the pattern doesn't define any.
    */
   public int[] getRevStrandCutSiteIndices()
   {
      ensureParsed();
      return mRevStrandCutSiteIndices;
   }

   //---------------------------------------------------------------------------
   /**
    Returns whether the pattern is treated as palindromic. The flag is false only when
    the pattern string carries a leading or trailing "(fwd/rev)" cut specification;
    the motif sequence itself is not inspected.
    */
   public boolean isPalindromic()
   {
      ensureParsed();
      return mIsPalindromic;
   }


   //--------------------------------------------------------------------------
   /**
    Finds all (possibly overlapping) occurrences of the pattern in the specified sequence.
    Forward strand matches are listed first. If the pattern isn't flagged as palindromic,
    matches of the reverse-complement pattern follow with their strand set to Strand.REVERSE.
    * @param inQuery the nucleic acid to search
    * @return the matches found, with 1-based locations. Empty if there are no matches.
    */
   public List<T> match(NucleicAcid inQuery)
   {
      List<T> matches = new ArrayList<>(25);

      CharSequence sequence = inQuery.getSequence();

      // getPattern() has to run before mIsPalindromic is consulted since the flag
      // is established while the regexp is being built.
      collectMatches(getPattern(), sequence, null, matches);

      // If the recognition site is palindromic, we are done.
      // If it isn't, we need to search with the reverse-complement of the recognition site.
      if (! mIsPalindromic)
      {
         collectMatches(getAlternateStrandPattern(), sequence, Strand.REVERSE, matches);
      }

      return matches;
   }

   //--------------------------------------------------------------------------
   /**
    Stores the trimmed, upper-cased pattern string and discards anything cached for a
    previous pattern string. A null value is ignored.
    * @param inValue the motif definition
    */
   protected void setPatternString(String inValue)
   {
      if (inValue != null)
      {
         mPatternString = inValue.trim().toUpperCase();
         clearCachedPatterns();
      }
   }

   //--------------------------------------------------------------------------
   /**
    Factory hook for the match objects returned by match().
    * @param inPattern  the pattern that produced the match
    * @param inSeq      the matched text
    * @param inLocation the 1-based location of the match
    * @return the new match object
    */
   @SuppressWarnings("unchecked")
   protected T createMatch(NucleotidePattern inPattern, String inSeq, SeqLocation inLocation)
   {
      return (T) new NucleotidePatternMatch(inPattern, inSeq, inLocation);
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   // Drops every lazily built regexp / Pattern so that it gets rebuilt on next use.
   private void clearCachedPatterns()
   {
      mRegExp = null;
      mPattern = null;
      mAlternateStrandRegExp = null;
      mAlternateStrandPattern = null;
   }

   //--------------------------------------------------------------------------
   // Makes sure the values that are derived while building the regexp (cut site
   // indices, palindromic flag) are available. Does nothing if no pattern string is set.
   private void ensureParsed()
   {
      if (null == mRegExp
          && mPatternString != null)
      {
         getRegExp();
      }
   }

   //--------------------------------------------------------------------------
   // Appends a match for every hit of inPattern in inSequence. If inStrand is non-null
   // it is set on each match. The search restarts one position after the start of the
   // previous hit so that overlapping hits are found.
   private void collectMatches(Pattern inPattern, CharSequence inSequence, Strand inStrand, List<T> inMatches)
   {
      Matcher m = inPattern.matcher(inSequence);
      int searchStart = 0;
      // Matcher.find(int) throws if the start index is beyond the end of the input,
      // which could otherwise happen after a zero-length hit at the very end.
      while (searchStart <= inSequence.length()
             && m.find(searchStart))
      {
         int start = m.start() + 1; // Convert to a 1-based position
         SeqLocation seqLoc = new SeqLocation(start, start + m.group().length() - 1);
         T match = createMatch(this, m.group(), seqLoc);
         if (inStrand != null)
         {
            match.setStrand(inStrand);
         }

         inMatches.add(match);

         searchStart = start;
      }
   }

   //--------------------------------------------------------------------------
   // Builds the regexp for the opposite strand by walking the forward regexp backwards,
   // complementing each letter and swapping '[' and ']'. The gap delimiter is copied
   // as a unit since reversing its characters would corrupt it.
   private static String reverseComplementRegExp(String inRegExp)
   {
      StringBuilder buffer = new StringBuilder(inRegExp.length());
      int i = inRegExp.length() - 1;
      while (i >= 0)
      {
         // Does the gap delimiter end at this position?
         if (inRegExp.startsWith(sGapRegExp, i - sGapRegExp.length() + 1))
         {
            buffer.append(sGapRegExp);
            i -= sGapRegExp.length();
            continue;
         }

         char theChar = inRegExp.charAt(i);
         if (Character.isLetter(theChar))
         {
            buffer.append(Nucleotide.valueOf(theChar).getComplement().getOneLetterCode());
         }
         else if (theChar == '[')
         {
            buffer.append(']');
         }
         else if (theChar == ']')
         {
            buffer.append('[');
         }
         else
         {
            buffer.append(theChar);
         }

         i--;
      }

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   // Copies a list of Integers into an int[].
   private static int[] toIntArray(List<Integer> inValues)
   {
      int[] values = new int[inValues.size()];
      for (int i = 0; i < values.length; i++)
      {
         values[i] = inValues.get(i);
      }

      return values;
   }

   //--------------------------------------------------------------------------
   // Translates the pattern string into a regexp. As side effects the palindromic flag
   // and the forward / reverse strand cut site indices are (re)established.
   private String convertStringToRegExp()
   {
      if (null == mPatternString)
      {
         throw new IllegalStateException("No pattern string has been set!");
      }

      List<Integer> forwardCutSiteIndices = new ArrayList<>(2);
      List<Integer> reverseCutSiteIndices = new ArrayList<>(2);

      String patternString = mPatternString;

      mIsPalindromic = true;

      // Leading "(fwd/rev)" cut specification
      Matcher m = s5PrimeNonPalindromicCutSpecPattern.matcher(patternString);
      if (m.find())
      {
         forwardCutSiteIndices.add(- Integer.parseInt(m.group(1)) - 1);
         reverseCutSiteIndices.add(- Integer.parseInt(m.group(2)) - 1);
         patternString = patternString.substring(m.group().length());
         mIsPalindromic = false;
      }

      // Trailing "(fwd/rev)" cut specification. The offsets are relative to the end
      // of what remains of the pattern string once the specification is removed.
      m = s3PrimeNonPalindromicCutSpecPattern.matcher(patternString);
      if (m.find())
      {
         patternString = patternString.substring(0, patternString.length() - m.group().length());
         forwardCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(1)));
         reverseCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(2)));
         mIsPalindromic = false;
      }

      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(mIgnoreGaps ? sGapRegExp : "");

      for (int i = 0; i < patternString.length(); i++)
      {
         char theChar = patternString.charAt(i);

         if (theChar == '/')
         {
            // In-motif cut site marker. The forward strand is cut at the marker position;
            // the reverse strand position is the marker's distance from the other end.
            forwardCutSiteIndices.add(i);
            reverseCutSiteIndices.add(patternString.length() - i - 1);
         }
         else if (theChar == 'N')
         {
            regexp.delimitedAppend(".");
         }
         else
         {
            Nucleotide base = Nucleotide.valueOf(theChar);
            if (null == base)
            {
               throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
            }

            if (base.isAmbiguous())
            {
               // The character class holds the ambiguity code itself plus each base it stands for.
               StringBuilder degeneracyString = new StringBuilder(theChar + "");
               for (Nucleotide baseOption : base.getDegeneracy())
               {
                  degeneracyString.append(baseOption.getOneLetterCode());
               }

               regexp.delimitedAppend("[" + degeneracyString + "]");
            }
            else
            {
               regexp.delimitedAppend(theChar);
            }
         }
      }

      mFwdStrandCutSiteIndices = toIntArray(forwardCutSiteIndices);

      // The reverse strand indices are left as null when the pattern doesn't define any.
      mRevStrandCutSiteIndices = CollectionUtil.hasValues(reverseCutSiteIndices) ? toIntArray(reverseCutSiteIndices) : null;

      return regexp.toString();
   }
}