// com.hfg.bio.seq.pattern.NucleotidePattern (Maven / Gradle / Ivy)
//
// Go to download
// Show more of this group | Show more artifacts with this name
// Show all versions of com_hfg | Show documentation
// com.hfg: xml, html, svg, and bioinformatics utility library
// There is a newer version: 20240423
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Container for a nucleotide pattern (motif).

 
 @author J. Alex Taylor, hairyfatguy.com
 
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class NucleotidePattern extends SeqPattern
{
   // NOTE(review): createMatch() below refers to a type variable T that is not declared
   // anywhere in this class as shown; generic type parameters appear to have been lost
   // from this copy of the file - confirm against the original source.

   // Strands that should be searched. Defaults to every Strand value (i.e. both strands).
   private List<Strand> mStrandsToSearch = Arrays.asList(Strand.values());

   // Not referenced by any code in this class as shown.
   private Pattern  mAlternateStrandPattern;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    No-argument constructor available to subclasses only.
    */
   protected NucleotidePattern()
   {
   }

   //--------------------------------------------------------------------------
   /**
    Creates a pattern from the specified pattern string. The string is upper-cased
    before being handed to the SeqPattern constructor.
    @param inPatternString the pattern string. Must not be null.
    */
   public NucleotidePattern(String inPatternString)
   {
      // Locale.ROOT keeps the case conversion independent of the JVM's default locale
      // (ex: under a Turkish locale 'i' would otherwise become a dotted capital I).
      super(inPatternString.toUpperCase(Locale.ROOT));
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Returns a copy of this pattern which has its own list of strands to search.
    @return the cloned pattern
    */
   @Override
   public NucleotidePattern clone()
   {
      NucleotidePattern cloneObj = (NucleotidePattern) super.clone();

      if (mStrandsToSearch != null)
      {
         // Give the clone its own list so that changing one pattern's strands
         // does not affect the other.
         cloneObj.mStrandsToSearch = new ArrayList<>(mStrandsToSearch);
      }

      return cloneObj;
   }

   //--------------------------------------------------------------------------
   /**
    @return BioSequenceType.NUCLEIC_ACID
    */
   public BioSequenceType getBioSequenceType()
   {
      return BioSequenceType.NUCLEIC_ACID;
   }

   //--------------------------------------------------------------------------
   /**
    Passes the value to the superclass setter and narrows the return type to allow
    method chaining.
    @param inValue whether or not gap characters should be ignored
    @return the result of the superclass call cast to NucleotidePattern
    */
   @Override
   public NucleotidePattern setIgnoreGaps(boolean inValue)
   {
      return (NucleotidePattern) super.setIgnoreGaps(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Passes the value to the superclass setter and narrows the return type to allow
    method chaining.
    @param inValue the maximum number of mismatches
    @return the result of the superclass call cast to NucleotidePattern
    */
   @Override
   public NucleotidePattern setMaxMismatches(int inValue)
   {
      return (NucleotidePattern) super.setMaxMismatches(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Restricts the search to a single strand.
    @param inValue the strand to search
    @return this pattern object to enable method chaining
    */
   public NucleotidePattern setStrandsToSearch(Strand inValue)
   {
      mStrandsToSearch = new ArrayList<>(1);
      mStrandsToSearch.add(inValue);
      return this;
   }

   //--------------------------------------------------------------------------
   /**
    Specifies the strands to search.
    @param inValues the strands to search. Must not be null. The array is copied so
                    later changes to it do not alter this pattern.
    @return this pattern object to enable method chaining
    */
   public NucleotidePattern setStrandsToSearch(Strand[] inValues)
   {
      // Arrays.asList() alone would return a fixed-size view backed by the caller's array.
      mStrandsToSearch = new ArrayList<>(Arrays.asList(inValues));
      return this;
   }

   //--------------------------------------------------------------------------
   /**
    @return the strands to search. This is the pattern's internal list, not a copy.
    */
   public Collection<Strand> getStrandsToSearch()
   {
      return mStrandsToSearch;
   }


   //--------------------------------------------------------------------------
   /**
    Creates the match object for a hit of this pattern.
    @param inSeq      passed through to the NucleotidePatternMatch constructor
    @param inLocation passed through to the NucleotidePatternMatch constructor
    @return a new NucleotidePatternMatch for this pattern
    */
   @Override
   protected T createMatch(String inSeq, SeqLocation inLocation)
   {
      return (T) new NucleotidePatternMatch(this, inSeq, inLocation);
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Converts a Prosite-style nucleotide pattern string into a regular expression string.
    <ul>
      <li>A single trailing period is dropped.</li>
      <li>Dashes separate positions.</li>
      <li>A leading '&lt;' on a position adds a '^' anchor and a trailing '&gt;' adds a '$' anchor.</li>
      <li>A count specification recognized by PROSITE_COUNT_PATTERN (defined outside of this
          class) becomes a '{...}' quantifier.</li>
      <li>'{...}' positions become negated character classes and '[...]' positions become
          character classes.</li>
      <li>Ambiguous bases are expanded with the bases from their degeneracy. A single 'N'
          position becomes '.' (or '[^\-]' when gaps are being ignored).</li>
      <li>When gaps are being ignored, positions are joined with '\-*' and positions with a
          count are wrapped so that each repeat can be preceded by gap characters.</li>
    </ul>
    @param inPrositePattern the pattern string to convert. Must not be null.
    @return the regular expression string
    @throws RuntimeException if a position is empty, contains an invalid base, or is a
            multi-character position that is not enclosed in '[]' or '{}'
    */
   protected String convertStringToRegExp(String inPrositePattern)
   {
      // Remove the period at the end
      if (inPrositePattern.endsWith("."))
      {
         inPrositePattern = inPrositePattern.substring(0, inPrositePattern.length() - 1);
      }

      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getIgnoreGaps() ? "\\-*" : "");

      // Dashes separate positions
      String[] positions = inPrositePattern.split("\\-");
      for (String position : positions)
      {
         boolean nTerm = false;
         boolean cTerm = false;
         if (position.startsWith("<")) // 5'?
         {
            nTerm = true;
            position = position.substring(1);
         }

         if (position.endsWith(">")) // 3'?
         {
            cTerm = true;
            position = position.substring(0, position.length() - 1);
         }

         // Extract the count spec if present
         String countSpec = null;
         Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
         if (m.find())
         {
            countSpec = "{" + m.group(1) + "}";
            position = position.substring(0, m.start(1) - 1);
         }

         // Fail with a descriptive message instead of a StringIndexOutOfBoundsException
         // when nothing is left of the position (ex: "A--T" or a pattern of just ">").
         if (position.isEmpty())
         {
            throw new RuntimeException("Empty position found in the nucleotide pattern " + StringUtil.singleQuote(inPrositePattern) + "!");
         }

         // With gaps ignored, a repeated position carries its own leading gap allowance
         // and so is appended without the gap delimiter.
         boolean gapTolerantRepeat = (getIgnoreGaps() && countSpec != null);

         if (position.startsWith("{")
                  && position.endsWith("}"))
         {
            position = buildBaseClassRegExp(position, true, gapTolerantRepeat);
         }
         else if (position.startsWith("[")
                  && position.endsWith("]"))
         {
            position = buildBaseClassRegExp(position, false, gapTolerantRepeat);
         }
         else
         {
            position = buildSingleBaseRegExp(position, gapTolerantRepeat);
         }


         // A terminus marker that was inside of the position becomes an alternative
         // that matches the start / end of the input.
         if (position.contains("<"))
         {
            position = "(?:" + position.replace("<", "") + "|\\A)";
         }
         else if (position.contains(">"))
         {
            position = "(?:" + position.replace(">", "") + "|\\Z)";
         }


         String positionRegExp = (nTerm ? "^" : "")
                                 + position
                                 + (countSpec != null ? countSpec : "")
                                 + (cTerm ? "$" : "");

         if (gapTolerantRepeat)
         {
            regexp.append(positionRegExp);
         }
         else
         {
            regexp.delimitedAppend(positionRegExp);
         }
      }

      return regexp.toString();
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   /**
    Builds the regular expression character class for a '[...]' or '{...}' position.
    @param inPosition          the position string including its enclosing brackets
    @param inNegated           true for a '{...}' position (builds a '[^...]' class)
    @param inGapTolerantRepeat whether to wrap the class in a group that allows leading gap characters
    @return the regular expression fragment for the position
    */
   private String buildBaseClassRegExp(String inPosition, boolean inNegated, boolean inGapTolerantRepeat)
   {
      StringBuilder buffer = new StringBuilder(inNegated ? "[^" : "[");

      // Skip the enclosing bracket characters
      for (int i = 1; i < inPosition.length() - 1; i++)
      {
         char theChar = inPosition.charAt(i);

         Nucleotide base = Nucleotide.valueOf(theChar);
         if (null == base
             && theChar != '<'
             && theChar != '>')
         {
            throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
         }

         buffer.append(theChar);
         if (base != null
             && base.isAmbiguous())
         {
            appendDegeneracy(buffer, base);
         }
      }

      // An excluded-base position must not match a gap character when gaps are ignored
      if (inNegated
          && getIgnoreGaps())
      {
         buffer.append("\\-");
      }

      buffer.append("]");

      if (inGapTolerantRepeat)
      {
         buffer.insert(0, "(?:\\-*");
         buffer.append(")");
      }

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   /**
    Builds the regular expression for a position that is not enclosed in '[]' or '{}'.
    @param inPosition          the non-empty position string
    @param inGapTolerantRepeat whether to wrap the result in a group that allows leading gap characters
    @return the regular expression fragment for the position
    */
   private String buildSingleBaseRegExp(String inPosition, boolean inGapTolerantRepeat)
   {
      if (inPosition.length() > 1)
      {
         throw new RuntimeException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(inPosition) + "!");
      }

      char residue = inPosition.charAt(0);
      Nucleotide base = Nucleotide.valueOf(residue);
      if (null == base
          && ! inPosition.equals("<")
          && ! inPosition.equals(">"))
      {
         throw new RuntimeException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!");
      }

      String result = inPosition;

      if (base != null
          && base.isAmbiguous())
      {
         if (inPosition.equalsIgnoreCase("N"))
         {
            // N matches any character (any non-gap character if gaps are being ignored)
            result = (getIgnoreGaps() ? "[^\\-]" : ".");
         }
         else
         {
            StringBuilder buffer = new StringBuilder("[");
            buffer.append(residue);
            appendDegeneracy(buffer, base);
            buffer.append("]");

            result = buffer.toString();
         }
      }

      if (inGapTolerantRepeat)
      {
         result = "(?:\\-*" + result + ")";
      }

      return result;
   }

   //--------------------------------------------------------------------------
   /**
    Appends the one-letter code of each base in the specified base's degeneracy to the buffer.
    @param inBuffer the buffer to append to
    @param inBase   the (ambiguous) base whose degeneracy should be appended
    */
   private void appendDegeneracy(StringBuilder inBuffer, Nucleotide inBase)
   {
      for (Nucleotide baseOption : inBase.getDegeneracy())
      {
         inBuffer.append(baseOption.getOneLetterCode());
      }
   }
}