All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.pattern.NucleotidePattern Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;

//------------------------------------------------------------------------------
/**
 Container for a nucleotide pattern (motif).

 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class NucleotidePattern { private String mPatternString; private String mRegExp; private Pattern mPattern; private boolean mIsPalindromic; private String mAlternateStrandRegExp; private Pattern mAlternateStrandPattern; private int[] mFwdStrandCutSiteIndices; private int[] mRevStrandCutSiteIndices; private boolean mIgnoreGaps = false; private static final Pattern s5PrimeNonPalindromicCutSpecPattern = Pattern.compile("^\\((\\-?\\d+)/(\\-?\\d+)\\)"); private static final Pattern s3PrimeNonPalindromicCutSpecPattern = Pattern.compile("\\((\\-?\\d+)/(\\-?\\d+)\\)$"); //########################################################################### // CONSTRUCTORS //########################################################################### //-------------------------------------------------------------------------- protected NucleotidePattern() { } //-------------------------------------------------------------------------- public NucleotidePattern(String inPatternString) { setPatternString(inPatternString); } //########################################################################### // PUBLIC METHODS //########################################################################### //-------------------------------------------------------------------------- @Override public String toString() { return getPatternString(); } //-------------------------------------------------------------------------- public String getPatternString() { return mPatternString; } //-------------------------------------------------------------------------- public NucleotidePattern setIgnoreGaps(boolean inValue) { if (inValue != mIgnoreGaps) { mIgnoreGaps = inValue; mRegExp = null; // Clear the regexp mPattern = null; // Clear the Pattern } return this; } //-------------------------------------------------------------------------- public String getRegExp() { if (null == mRegExp) { mRegExp = convertStringToRegExp(); } return mRegExp; } //-------------------------------------------------------------------------- public Pattern getPattern() { if (null == mPattern) { mPattern = Pattern.compile(getRegExp(), Pattern.CASE_INSENSITIVE); } return mPattern; } //-------------------------------------------------------------------------- public Pattern getAlternateStrandPattern() { if (null == mAlternateStrandPattern) { StringBuilder buffer = new StringBuilder(); for (int i = mRegExp.length() - 1; i >= 0; i--) { char theChar = mRegExp.charAt(i); if (Character.isLetter(theChar)) { buffer.append(Nucleotide.valueOf(theChar).getComplement().getOneLetterCode()); } else if (theChar == '[') { buffer.append(']'); } else if (theChar == ']') { buffer.append('['); } else { buffer.append(theChar); } } mAlternateStrandPattern = Pattern.compile(buffer.toString(), Pattern.CASE_INSENSITIVE); } return mAlternateStrandPattern; } //-------------------------------------------------------------------------- /** Returns the forward strand cut site indices relative to the start of the motif. * @return Usually one but possibly two cut site indices on the forward strand. */ public int[] getFwdStrandCutSiteIndices() { return mFwdStrandCutSiteIndices; } //-------------------------------------------------------------------------- /** Returns the reverse strand cut site indices relative to the start of the motif. * @return Usually one but possibly two cut site indices on the reverse strand. */ public int[] getRevStrandCutSiteIndices() { return mRevStrandCutSiteIndices; } //--------------------------------------------------------------------------- public boolean isPalindromic() { return mIsPalindromic; } //-------------------------------------------------------------------------- public List match(NucleicAcid inQuery) { List matches = new ArrayList<>(25); Matcher m = getPattern().matcher(inQuery.getSequence()); int index = 0; while (m.find(index)) { index = m.start() + 1; SeqLocation seqLoc = new SeqLocation(index, index + m.group().length() - 1); matches.add(createMatch(this, m.group(), seqLoc)); } // If the recognition site is palindromic, we are done. // If it isn't, we need to search with the reverse-complement of the recognition site. if (! mIsPalindromic) { m = getAlternateStrandPattern().matcher(inQuery.getSequence()); index = 0; while (m.find(index)) { index = m.start() + 1; SeqLocation seqLoc = new SeqLocation(index, index + m.group().length() - 1); T match = createMatch(this, m.group(), seqLoc); match.setStrand(Strand.REVERSE); matches.add(match); } } return matches; } //-------------------------------------------------------------------------- protected void setPatternString(String inValue) { if (inValue != null) { mPatternString = inValue.trim().toUpperCase(); } } //-------------------------------------------------------------------------- protected T createMatch(NucleotidePattern inPattern, String inSeq, SeqLocation inLocation) { return (T) new NucleotidePatternMatch(inPattern, inSeq, inLocation); } //########################################################################### // PRIVATE METHODS //########################################################################### //-------------------------------------------------------------------------- private String convertStringToRegExp() { List forwardCutSiteIndices = new ArrayList<>(2); List reverseCutSiteIndices = new ArrayList<>(2); String patternString = mPatternString; mIsPalindromic = true; Matcher m = s5PrimeNonPalindromicCutSpecPattern.matcher(patternString); if (m.find()) { forwardCutSiteIndices.add(- Integer.parseInt(m.group(1)) - 1); reverseCutSiteIndices.add(- Integer.parseInt(m.group(2)) - 1); patternString = patternString.substring(m.group().length()); mIsPalindromic = false; } m = s3PrimeNonPalindromicCutSpecPattern.matcher(patternString); if (m.find()) { patternString = patternString.substring(0, patternString.length() - m.group().length()); forwardCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(1))); reverseCutSiteIndices.add(patternString.length() + Integer.parseInt(m.group(2))); mIsPalindromic = false; } StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(mIgnoreGaps ? "\\-*" : ""); for (int i = 0; i < patternString.length(); i++) { char theChar = patternString.charAt(i); if (theChar == '/') { forwardCutSiteIndices.add(i); forwardCutSiteIndices.add(patternString.length() - i - 1); } else { if (theChar == 'N') { regexp.delimitedAppend("."); } else { Nucleotide base = Nucleotide.valueOf(theChar); if (null == base) { throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); } if (base.isAmbiguous()) { StringBuilder degeneracyString = new StringBuilder(theChar + ""); for (Nucleotide baseOption : base.getDegeneracy()) { degeneracyString.append(baseOption.getOneLetterCode()); } regexp.delimitedAppend("[" + degeneracyString + "]"); } else { regexp.delimitedAppend(theChar); } } } } mFwdStrandCutSiteIndices = new int[forwardCutSiteIndices.size()]; for (int i = 0; i < forwardCutSiteIndices.size(); i++) { mFwdStrandCutSiteIndices[i] = forwardCutSiteIndices.get(i); } if (CollectionUtil.hasValues(reverseCutSiteIndices)) { mRevStrandCutSiteIndices = new int[reverseCutSiteIndices.size()]; for (int i = 0; i < reverseCutSiteIndices.size(); i++) { mRevStrandCutSiteIndices[i] = reverseCutSiteIndices.get(i); } } return regexp.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy