com.hfg.bio.seq.pattern.RegExpMatcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
There is a newer version: 20240423
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Regular expression string matching algorithm. Use when mismatches are not allowed.
 Ambiguity within positions and range specifications are both allowed.
 
 @author J. Alex Taylor, hairyfatguy.com
 
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

class RegExpMatcher implements SeqPatternMatcher
{

   private static String sAlgorithmName = "Regular Expression";

   private SeqPattern  mPattern;
   private Pattern     mRegExpPattern;
   private Pattern     mRevStrandRegExpPattern;
   private S           mTarget;
   private SeqLocation mSeqLocation;

   protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)\\)$");

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a matcher for the specified pattern and target without a preset
    search location. Delegates to the three-argument constructor.

    @param inPattern the sequence pattern to search for
    @param inTarget  the sequence to be searched
    */
   protected RegExpMatcher(SeqPattern inPattern, S inTarget)
   {
      this(inPattern, inTarget, (SeqLocation) null);
   }

   //---------------------------------------------------------------------------
   /**
    Creates a matcher for the specified pattern and target, remembering an
    optional default search location, and then checks that the pattern can be
    handled by this algorithm.

    @param inPattern     the sequence pattern to search for
    @param inTarget      the sequence to be searched
    @param inSeqLocation the default region to search; may be null
    */
   protected RegExpMatcher(SeqPattern inPattern, S inTarget, SeqLocation inSeqLocation)
   {
      this.mPattern     = inPattern;
      this.mTarget      = inTarget;
      this.mSeqLocation = inSeqLocation;

      // Fails fast if the pattern / target combination is unsupported.
      this.verifyPatternCompatibility();
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Finds the first occurrence of the pattern within a region of the target.

    The region searched is the specified location if it is non-null, otherwise
    the location given at construction time, otherwise the span from position 1
    through the target's length.

    @param inSeqLocation the region to search; may be null
    @return the first match, or null if the pattern does not occur in the region
    */
   public T find(SeqLocation inSeqLocation)
   {
      SeqLocation searchRegion = inSeqLocation;
      if (null == searchRegion)
      {
         searchRegion = mSeqLocation;
      }

      if (null == searchRegion)
      {
         searchRegion = new SeqLocation(1, getTarget().length());
      }

      Matcher regexMatcher = getRegExpPattern().matcher(getTarget().getSubSequence(searchRegion));
      if (! regexMatcher.find())
      {
         return null;
      }

      // The regex offset is relative to the searched sub-sequence, so shift it by the region's start.
      int matchStart = regexMatcher.start() + searchRegion.getStart();
      int matchEnd   = matchStart + regexMatcher.group().length() - 1;
      SeqLocation matchLoc = new SeqLocation(matchStart, matchEnd).setChainId(getTarget().getID());

      T firstMatch = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);

      return firstMatch;
   }

   //---------------------------------------------------------------------------
   /**
    Finds all (possibly overlapping) occurrences of the pattern within a region
    of the target.

    The region searched is the specified location if it is non-null, otherwise
    the location given at construction time (the same fallback used by find()),
    otherwise the span from position 1 through the target's length.

    For nucleic acid patterns each forward match is tagged with Strand.FORWARD.
    If the pattern is a NucleotidePattern whose strands to search include
    Strand.REVERSE, the region is also scanned with the reverse strand regular
    expression and those matches are tagged with Strand.REVERSE. Reverse strand
    matches are appended after all forward strand matches.

    @param inSeqLocation the region to search; may be null
    @return the list of matches; empty if there were none
    */
   @Override
   public List findAll(SeqLocation inSeqLocation)
   {
      // Fall back to the location supplied at construction, as find() does.
      SeqLocation seqLocation = (inSeqLocation != null ? inSeqLocation : mSeqLocation);
      if (null == seqLocation)
      {
         seqLocation = new SeqLocation(1, getTarget().length());
      }

      List matches = new ArrayList<>(25);

      Pattern pattern = getRegExpPattern();

      String sequence = getTarget().getSubSequence(seqLocation);

      Matcher m = pattern.matcher(sequence);

      // Restarting one position past the previous match start allows overlapping matches.
      // Matcher.find(int) throws IndexOutOfBoundsException for an index beyond the input
      // length, which can happen after a zero-length match at the end of the sequence.
      int index = 0;
      while (index <= sequence.length()
             && m.find(index))
      {
         index = m.start() + 1;
         int start = index + seqLocation.getStart() - 1;
         SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID());
         T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);

         if (getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))
         {
            ((NucleotidePatternMatch) match).setStrand(Strand.FORWARD);
         }

         matches.add(match);
      }

      if (getPattern() instanceof NucleotidePattern
          && ((NucleotidePattern) getPattern()).getStrandsToSearch() != null
          && ((NucleotidePattern) getPattern()).getStrandsToSearch().contains(Strand.REVERSE))
      {
         // The reverse strand expression is applied to the same (forward) sequence text,
         // so the reported coordinates are computed exactly as for forward matches.
         m = getReverseStandRegExpPattern().matcher(sequence);
         index = 0;
         while (index <= sequence.length()
                && m.find(index))
         {
            index = m.start() + 1;
            int start = index + seqLocation.getStart() - 1;
            SeqLocation matchLoc = new SeqLocation(start, start + m.group().length() - 1).setChainId(getTarget().getID());
            T match = getPattern().createMatch(getTarget().getSubSequence(matchLoc), matchLoc);
            ((NucleotidePatternMatch) match).setStrand(Strand.REVERSE);
            matches.add(match);
         }
      }

      return matches;
   }

   //---------------------------------------------------------------------------
   /**
    Returns the sequence being searched.

    @return the target supplied at construction time
    */
   public S getTarget()
   {
      return this.mTarget;
   }

   //###########################################################################
   // PROTECTED METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Returns the sequence pattern being searched for.

    @return the pattern supplied at construction time
    */
   protected SeqPattern getPattern()
   {
      return this.mPattern;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Checks that the pattern and target can be handled by this algorithm.

    @throws ProgrammingException if the pattern allows mismatches
    @throws RuntimeException if the target's sequence type differs from the
            pattern's sequence type, or if the target is a protein with chains
    */
   private void verifyPatternCompatibility()
   {
      // A regular expression cannot express "up to n mismatches".
      if (getPattern().getMaxMismatches() > 0)
      {
         throw new ProgrammingException("Patterns allowing mismatches cannot be used by the " + sAlgorithmName + " algorithm!");
      }

      if ((getTarget() instanceof Protein
           && ! getPattern().getBioSequenceType().equals(BioSequenceType.PROTEIN))
          || (getTarget() instanceof NucleicAcid
              && ! getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)))
      {
         throw new RuntimeException("Target / Pattern Mismatch! Target is a " + getTarget().getClass().getSimpleName() + " but the pattern type is " + getPattern().getBioSequenceType() + "!");
      }

      if (getTarget() instanceof Protein
          && ((Protein)getTarget()).hasChains())
      {
         throw new RuntimeException("Pattern matching can only be done with single-chain proteins! Please pass the chains of multi-chain proteins to the pattern matcher individually.");
      }
   }

   //---------------------------------------------------------------------------
   /**
    Returns the forward strand regular expression for the Prosite pattern,
    building it on first use and reusing the stored instance afterwards.

    @return the compiled regular expression
    */
   private Pattern getRegExpPattern()
   {
      if (mRegExpPattern != null)
      {
         return mRegExpPattern;
      }

      boolean isNucleicAcid = getPattern().getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID);
      mRegExpPattern = (isNucleicAcid ? buildDNARegExp() : buildProteinRegExp());

      return mRegExpPattern;
   }

   //---------------------------------------------------------------------------
   /**
    Translates the pattern's Prosite-style nucleotide pattern into a
    case-insensitive regular expression.

    The Prosite string is split on dashes into positions. Each position may
    carry a leading '<' (emitted as '^'), a trailing '>' (emitted as '$') and a
    trailing count such as "(3)" or "(2,4)" (emitted as "{3}" / "{2,4}").
    The remaining text of a position is one of:
      - "{...}" : bases that are not allowed; becomes a negated character class,
      - "[...]" : bases that are allowed; becomes a character class,
      - a single base; ambiguous bases are expanded to a character class and
        'N' becomes "." (or "[^\-]" when gaps are ignored).
    Ambiguous bases inside classes are followed by the one-letter codes of
    their degeneracy.

    When the pattern ignores gaps, "\-*" is used as the delimiter between
    positions, and positions with a count are wrapped as "(?:\-*X)" so that
    the gap allowance is repeated along with the position.

    NOTE(review): an empty position (for example from two consecutive dashes,
    or a position consisting only of '<' or '>') reaches position.charAt(0)
    in the single-base branch, which would throw StringIndexOutOfBoundsException.

    @return the compiled regular expression
    @throws SeqPatternConfigurationException if a position contains a character
            that Nucleotide.valueOf() does not recognize, or an unbracketed
            position holds more than one residue
    */
   private Pattern buildDNARegExp()
   {
      String prositePattern = getPattern().getPrositePattern();

      // Remove the period at the end
      if (prositePattern.endsWith("."))
      {
         prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
      }

      // With gap-ignoring enabled, any run of '-' is tolerated between positions.
      // (presumably delimitedAppend() inserts this delimiter between appended items — confirm in StringBuilderPlus)
      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getPattern().getIgnoreGaps() ? "\\-*" : "");

      // Dashes separate positions
      String[] positions = prositePattern.split("\\-");
      for (String position : positions)
      {
         // Terminal anchors are stripped here and re-emitted as '^' / '$' when the position is appended.
         boolean nTerm = false;
         boolean cTerm = false;
         if (position.startsWith("<")) // 5'?
         {
            nTerm = true;
            position = position.substring(1);
         }

         if (position.endsWith(">")) // 3'?
         {
            cTerm = true;
            position = position.substring(0, position.length() - 1);
         }

         // Extract the count spec if present
         String countSpec = null;
         Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
         if (m.find())
         {
            // "(n)" or "(n,m)" becomes the regex quantifier "{n}" or "{n,m}".
            countSpec = "{" + m.group(1) + "}";
            // m.start(1) - 1 is the index of the opening parenthesis.
            position = position.substring(0, m.start(1) - 1);
         }

         if (position.startsWith("{")
             && position.endsWith("}"))
         {
            // Exclusion list -> negated character class.
            StringBuilder positionBuffer = new StringBuilder("[^");
            for (int i = 1; i < position.length() - 1; i++)
            {
               char theChar = position.charAt(i);

               // '<' and '>' are tolerated inside the braces; they are converted further below.
               Nucleotide base = Nucleotide.valueOf(theChar);
               if (null == base
                   && theChar != '<'
                   && theChar != '>')
               {
                  throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
               }

               positionBuffer.append(theChar);
               // Also list the bases covered by an ambiguity code.
               if (base != null
                   && base.isAmbiguous())
               {
                  for (Nucleotide baseOption : base.getDegeneracy())
                  {
                     positionBuffer.append(baseOption.getOneLetterCode());
                  }
               }
            }

            // A gap character must not satisfy the negated class.
            if (getPattern().getIgnoreGaps())
            {
               positionBuffer.append("\\-");
            }

            positionBuffer.append("]");

            // Repeated positions carry their own leading gap allowance.
            if (getPattern().getIgnoreGaps()
                && countSpec != null)
            {
               positionBuffer.insert(0, "(?:\\-*");
               positionBuffer.append(")");
            }

            position = positionBuffer.toString();
         }
         else if (position.startsWith("[")
                  && position.endsWith("]"))
         {
            // Inclusion list -> character class.
            StringBuilder positionBuffer = new StringBuilder("[");
            for (int i = 1; i < position.length() - 1; i++)
            {
               char theChar = position.charAt(i);

               // '<' and '>' are tolerated inside the brackets; they are converted further below.
               Nucleotide base = Nucleotide.valueOf(theChar);
               if (null == base
                   && theChar != '<'
                   && theChar != '>')
               {
                  throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
               }

               positionBuffer.append(theChar);
               // Also list the bases covered by an ambiguity code.
               if (base != null
                   && base.isAmbiguous())
               {
                  for (Nucleotide baseOption : base.getDegeneracy())
                  {
                     positionBuffer.append(baseOption.getOneLetterCode());
                  }
               }
            }

            positionBuffer.append("]");

            // Repeated positions carry their own leading gap allowance.
            if (getPattern().getIgnoreGaps()
                && countSpec != null)
            {
               positionBuffer.insert(0, "(?:\\-*");
               positionBuffer.append(")");
            }

            position = positionBuffer.toString();
         }
         else
         {
            // Unbracketed position: must be exactly one residue.
            if (position.length() > 1)
            {
               throw new SeqPatternConfigurationException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(position) + "!");
            }

            // NOTE(review): throws StringIndexOutOfBoundsException if position is empty here.
            char residue = position.charAt(0);
            Nucleotide base = Nucleotide.valueOf(residue);
            if (null == base
                && ! position.equals("<")
                && ! position.equals(">"))
            {
               throw new SeqPatternConfigurationException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!");
            }

            if (base != null
                && base.isAmbiguous())
            {
               if (position.equalsIgnoreCase("N"))
               {
                  // 'N' matches any character, except a gap when gaps are being ignored.
                  if (getPattern().getIgnoreGaps())
                  {
                     position = "[^\\-]";
                  }
                  else
                  {
                     position = ".";
                  }
               }
               else
               {
                  // Other ambiguity codes match the code itself or any base in its degeneracy.
                  StringBuilder positionBuffer = new StringBuilder("[");
                  positionBuffer.append(residue);

                  for (Nucleotide baseOption : base.getDegeneracy())
                  {
                     positionBuffer.append(baseOption.getOneLetterCode());
                  }

                  positionBuffer.append("]");

                  position = positionBuffer.toString();
               }
            }


            // Repeated positions carry their own leading gap allowance.
            if (getPattern().getIgnoreGaps()
                && countSpec != null)
            {
               position = "(?:\\-*" + position + ")";
            }
         }


         // A '<' or '>' remaining inside the position becomes an alternative:
         // either the position itself (marker removed) or the start (\A) / end (\Z) of the input.
         if (position.contains("<"))
         {
            position = "(?:" + position.replace("<", "") + "|\\A)";
         }
         else if (position.contains(">"))
         {
            position = "(?:" + position.replace(">", "") + "|\\Z)";
         }


         if (getPattern().getIgnoreGaps()
             && countSpec != null)
         {
            // The gap allowance is already inside the repeated group, so no delimiter is added.
            regexp.append((nTerm ? "^" : "")
                          + position
                          + countSpec
                          + (cTerm ? "$" : ""));
         }
         else
         {
            regexp.delimitedAppend((nTerm ? "^" : "")
                                   + position
                                   + (countSpec != null ? countSpec : "")
                                   + (cTerm ? "$" : ""));
         }
      }

      return Pattern.compile(regexp.toString(), Pattern.CASE_INSENSITIVE);
   }

   //---------------------------------------------------------------------------
   /**
    Translates the pattern's Prosite-style protein pattern into a
    case-insensitive regular expression.

    The Prosite string is split on dashes into positions. For each position a
    leading '<' is emitted as '^', a trailing '>' is emitted as '$' and a
    trailing count such as "(3)" or "(2,4)" is emitted as "{3}" / "{2,4}".
    The wildcard 'x' becomes "." (or a non-gap character followed by optional
    gaps when gaps are ignored) and an exclusion list "{...}" becomes a negated
    character class. Any other position text is passed through unchanged.

    @return the compiled regular expression
    */
   private Pattern buildProteinRegExp()
   {
      String prosite = getPattern().getPrositePattern();

      // Drop the terminating period, if any.
      if (prosite.endsWith("."))
      {
         prosite = prosite.substring(0, prosite.length() - 1);
      }

      final boolean ignoreGaps = getPattern().getIgnoreGaps();

      StringBuilderPlus assembled = new StringBuilderPlus().setDelimiter(ignoreGaps ? "\\-*" : "");

      // Positions are separated by dashes.
      for (String element : prosite.split("\\-"))
      {
         // N-terminus anchor?
         final boolean atNTerm = element.startsWith("<");
         if (atNTerm)
         {
            element = element.substring(1);
         }

         // C-terminus anchor?
         final boolean atCTerm = element.endsWith(">");
         if (atCTerm)
         {
            element = element.substring(0, element.length() - 1);
         }

         // Pull a trailing "(n)" / "(n,m)" off the position and turn it into a quantifier.
         String quantifier = "";
         Matcher countMatcher = PROSITE_COUNT_PATTERN.matcher(element);
         if (countMatcher.find())
         {
            quantifier = "{" + countMatcher.group(1) + "}";
            element = element.substring(0, countMatcher.start(1) - 1);
         }

         if (element.equalsIgnoreCase("x"))
         {
            // Wildcard position.
            element = (ignoreGaps ? "(?:[^\\-]\\-*)" : ".");
         }
         else if (element.startsWith("{")
                  && element.endsWith("}"))
         {
            // Exclusion list; a gap must not satisfy it when gaps are being ignored.
            String excluded = element.substring(1, element.length() - 1);
            element = "[^" + excluded + (ignoreGaps ? "\\-" : "") + "]";
         }

         // A '<' or '>' left inside the position becomes an alternative with the input start / end.
         if (element.contains("<"))
         {
            element = "(?:" + element.replace("<", "") + "|\\A)";
         }
         else if (element.contains(">"))
         {
            element = "(?:" + element.replace(">", "") + "|\\Z)";
         }

         StringBuilder piece = new StringBuilder();
         if (atNTerm)
         {
            piece.append("^");
         }

         piece.append(element).append(quantifier);

         if (atCTerm)
         {
            piece.append("$");
         }

         assembled.delimitedAppend(piece.toString());
      }

      return Pattern.compile(assembled.toString(), Pattern.CASE_INSENSITIVE);
   }

   //---------------------------------------------------------------------------
   /**
    Returns the regular expression used to find reverse strand matches in the
    forward sequence text, building it on first use.

    The expression is derived from the forward strand expression by reversing
    the order of its units and complementing every base. A unit is one atom
    (a literal, an escape, a character class or a group) together with the
    quantifier that follows it, so quantifiers, negated classes, escapes and
    groups stay intact. Reversing the expression text character by character
    would corrupt all of those (for example "[^AG]{2}" would become "}2{[CT^]").

    @return the compiled reverse strand regular expression
    */
   private Pattern getReverseStandRegExpPattern()
   {
      if (null == mRevStrandRegExpPattern)
      {
         String fwdRegExp = getRegExpPattern().pattern();

         mRevStrandRegExpPattern = Pattern.compile(reverseComplementRegExp(fwdRegExp), Pattern.CASE_INSENSITIVE);
      }

      return mRevStrandRegExpPattern;
   }

   //---------------------------------------------------------------------------
   // Reverses the order of the units within each top-level alternative of the
   // expression and reverse-complements every atom. Alternatives keep their order.
   private static String reverseComplementRegExp(String inRegExp)
   {
      StringBuilder result = new StringBuilder();
      List<String> units = new ArrayList<>();

      int index = 0;
      while (index < inRegExp.length())
      {
         if (inRegExp.charAt(index) == '|')
         {
            // End of an alternative: flush its units in reverse order.
            appendInReverseOrder(result, units);
            result.append('|');
            units.clear();
            index++;
            continue;
         }

         int atomEnd = findAtomEnd(inRegExp, index);
         int unitEnd = findQuantifierEnd(inRegExp, atomEnd);

         // The quantifier stays attached to (and after) its atom.
         units.add(reverseComplementAtom(inRegExp.substring(index, atomEnd)) + inRegExp.substring(atomEnd, unitEnd));

         index = unitEnd;
      }

      appendInReverseOrder(result, units);

      return result.toString();
   }

   //---------------------------------------------------------------------------
   // Appends the units to the buffer from last to first.
   private static void appendInReverseOrder(StringBuilder inBuffer, List<String> inUnits)
   {
      for (int i = inUnits.size() - 1; i >= 0; i--)
      {
         inBuffer.append(inUnits.get(i));
      }
   }

   //---------------------------------------------------------------------------
   // Returns the index just past the atom starting at inStart. An atom is an escape
   // sequence, a character class, a parenthesized group or a single character.
   private static int findAtomEnd(String inRegExp, int inStart)
   {
      char first = inRegExp.charAt(inStart);
      if (first == '\\')
      {
         return Math.min(inStart + 2, inRegExp.length());
      }

      if (first != '['
          && first != '(')
      {
         return inStart + 1;
      }

      int groupDepth = 0;
      boolean inCharClass = false;
      for (int i = inStart; i < inRegExp.length(); i++)
      {
         char theChar = inRegExp.charAt(i);
         if (theChar == '\\')
         {
            i++; // Skip the escaped character
         }
         else if (inCharClass)
         {
            if (theChar == ']')
            {
               inCharClass = false;
               if (0 == groupDepth)
               {
                  return i + 1;
               }
            }
         }
         else if (theChar == '[')
         {
            inCharClass = true;
         }
         else if (theChar == '(')
         {
            groupDepth++;
         }
         else if (theChar == ')')
         {
            groupDepth--;
            if (0 == groupDepth)
            {
               return i + 1;
            }
         }
      }

      // Unbalanced expression: treat the remainder as a single atom.
      return inRegExp.length();
   }

   //---------------------------------------------------------------------------
   // Returns the index just past a quantifier ('*', '+', '?' or "{...}") starting at
   // inStart, or inStart itself if there is no quantifier at that position.
   private static int findQuantifierEnd(String inRegExp, int inStart)
   {
      if (inStart >= inRegExp.length())
      {
         return inStart;
      }

      char theChar = inRegExp.charAt(inStart);
      if (theChar == '*'
          || theChar == '+'
          || theChar == '?')
      {
         return inStart + 1;
      }

      if (theChar == '{')
      {
         int closeIndex = inRegExp.indexOf('}', inStart);
         return (closeIndex >= 0 ? closeIndex + 1 : inRegExp.length());
      }

      return inStart;
   }

   //---------------------------------------------------------------------------
   // Converts a single atom of the forward expression into its reverse strand form.
   private static String reverseComplementAtom(String inAtom)
   {
      char first = inAtom.charAt(0);

      if (first == '\\')
      {
         // The start and end of the input trade places; other escapes (such as "\-") are kept.
         if (inAtom.equals("\\A"))
         {
            return "\\Z";
         }

         if (inAtom.equals("\\Z")
             || inAtom.equals("\\z"))
         {
            return "\\A";
         }

         return inAtom;
      }

      // The anchors trade places as well.
      if (first == '^')
      {
         return "$";
      }

      if (first == '$')
      {
         return "^";
      }

      if (first == '(')
      {
         // Keep the group's opening and closing text and process its content recursively.
         int bodyStart = (inAtom.startsWith("(?:") ? 3 : 1);
         int bodyEnd   = (inAtom.endsWith(")") ? inAtom.length() - 1 : inAtom.length());

         return inAtom.substring(0, bodyStart)
                + reverseComplementRegExp(inAtom.substring(bodyStart, bodyEnd))
                + inAtom.substring(bodyEnd);
      }

      if (first == '['
          || Character.isLetter(first))
      {
         // The order within a character class is irrelevant; only the bases change.
         return complementLetters(inAtom);
      }

      return inAtom;
   }

   //---------------------------------------------------------------------------
   // Replaces every unescaped letter with the one-letter code of its complement.
   private static String complementLetters(String inText)
   {
      StringBuilder buffer = new StringBuilder();
      for (int i = 0; i < inText.length(); i++)
      {
         char theChar = inText.charAt(i);
         if (theChar == '\\'
             && i + 1 < inText.length())
         {
            // Copy escape sequences through untouched.
            buffer.append(theChar);
            buffer.append(inText.charAt(++i));
         }
         else if (Character.isLetter(theChar))
         {
            Nucleotide base = Nucleotide.valueOf(theChar);
            if (null == base
                || null == base.getComplement())
            {
               throw new ProgrammingException("Unable to complement " + StringUtil.singleQuote(theChar) + " while building the reverse strand regular expression!");
            }

            buffer.append(base.getComplement().getOneLetterCode());
         }
         else
         {
            buffer.append(theChar);
         }
      }

      return buffer.toString();
   }


}
Related Artifacts

mysql-connector-java mysql
facebook-messenger com.github.codedrinker
selenium-java org.seleniumhq.selenium
instagram-java com.github.sola92
gson com.google.code.gson
poi org.apache.poi
httpclient org.apache.httpcomponents
json org.json
facebook-java-api com.google.code.facebook-java-api
poi-ooxml org.apache.poi
jackson-databind com.fasterxml.jackson.core
junit junit
primefaces org.primefaces
ojdbc7 com.github.noraui
jfoenix com.jfoenix
testng org.testng
json-simple com.googlecode.json-simple
selenium-server org.seleniumhq.selenium
itextpdf com.itextpdf
spring-core org.springframework

Related Groups

org.springframework
org.apache.poi
org.hibernate
org.springframework.boot
com.fasterxml.jackson.core
com.itextpdf
org.seleniumhq.selenium
mysql
org.finos.legend.engine
org.apache.httpcomponents
org.apache.logging.log4j
org.openjfx
org.apache.commons
org.json
com.google.guava
com.google.zxing
net.sf.jasperreports
javax.xml.bind
ojdbc
com.google.code.facebook-java-api