All downloads are free. The search and download functionality uses the official Maven repository.

com.hfg.bio.seq.pattern.NucleotidePattern Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.pattern;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.Nucleotide;
import com.hfg.bio.Strand;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqLocation;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Container for a nucleotide pattern (motif).

 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class NucleotidePattern extends SeqPattern { private List mStrandsToSearch = Arrays.asList(Strand.values()); // Default to searching both strands private Pattern mAlternateStrandPattern; //########################################################################### // CONSTRUCTORS //########################################################################### //-------------------------------------------------------------------------- protected NucleotidePattern() { } //-------------------------------------------------------------------------- public NucleotidePattern(String inPatternString) { super(inPatternString.toUpperCase()); } //########################################################################### // PUBLIC METHODS //########################################################################### //-------------------------------------------------------------------------- public BioSequenceType getBioSequenceType() { return BioSequenceType.NUCLEIC_ACID; } 
//-------------------------------------------------------------------------- @Override public NucleotidePattern setIgnoreGaps(boolean inValue) { return (NucleotidePattern) super.setIgnoreGaps(inValue); } //-------------------------------------------------------------------------- @Override public NucleotidePattern setMaxMismatches(int inValue) { return (NucleotidePattern) super.setMaxMismatches(inValue); } //-------------------------------------------------------------------------- public NucleotidePattern setStrandsToSearch(Strand inValue) { mStrandsToSearch = new ArrayList<>(1); mStrandsToSearch.add(inValue); return this; } //-------------------------------------------------------------------------- public NucleotidePattern setStrandsToSearch(Strand[] inValues) { mStrandsToSearch = Arrays.asList(inValues); return this; } //-------------------------------------------------------------------------- public Collection getStrandsToSearch() { return mStrandsToSearch; } //-------------------------------------------------------------------------- @Override protected T createMatch(String inSeq, SeqLocation inLocation) { return (T) new NucleotidePatternMatch(this, inSeq, inLocation); } //########################################################################### // PROTECTED METHODS //########################################################################### //-------------------------------------------------------------------------- protected String convertStringToRegExp(String inPrositePattern) { // Remove the period at the end if (inPrositePattern.endsWith(".")) { inPrositePattern = inPrositePattern.substring(0, inPrositePattern.length() - 1); } StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getIgnoreGaps() ? "\\-*" : ""); // Dashes separate positions String[] positions = inPrositePattern.split("\\-"); for (String position : positions) { boolean nTerm = false; boolean cTerm = false; if (position.startsWith("<")) // 5'? 
{ nTerm = true; position = position.substring(1); } if (position.endsWith(">")) // 3'? { cTerm = true; position = position.substring(0, position.length() - 1); } // Extract the count spec if present String countSpec = null; Matcher m = PROSITE_COUNT_PATTERN.matcher(position); if (m.find()) { countSpec = "{" + m.group(1) + "}"; position = position.substring(0, m.start(1) - 1); } if (position.startsWith("{") && position.endsWith("}")) { StringBuilder positionBuffer = new StringBuilder("[^"); for (int i = 1; i < position.length() - 1; i++) { char theChar = position.charAt(i); Nucleotide base = Nucleotide.valueOf(theChar); if (null == base && theChar != '<' && theChar != '>') { throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); } positionBuffer.append(theChar); if (base != null && base.isAmbiguous()) { for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } } } if (getIgnoreGaps()) { positionBuffer.append("\\-"); } positionBuffer.append("]"); if (getIgnoreGaps() && countSpec != null) { positionBuffer.insert(0, "(?:\\-*"); positionBuffer.append(")"); } position = positionBuffer.toString(); } else if (position.startsWith("[") && position.endsWith("]")) { StringBuilder positionBuffer = new StringBuilder("["); for (int i = 1; i < position.length() - 1; i++) { char theChar = position.charAt(i); Nucleotide base = Nucleotide.valueOf(theChar); if (null == base && theChar != '<' && theChar != '>') { throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); } positionBuffer.append(theChar); if (base != null && base.isAmbiguous()) { for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } } } positionBuffer.append("]"); if (getIgnoreGaps() && countSpec != null) { positionBuffer.insert(0, "(?:\\-*"); positionBuffer.append(")"); } position = 
positionBuffer.toString(); } else { if (position.length() > 1) { throw new RuntimeException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(position) + "!"); } char residue = position.charAt(0); Nucleotide base = Nucleotide.valueOf(residue); if (null == base && ! position.equals("<") && ! position.equals(">")) { throw new RuntimeException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!"); } if (base != null && base.isAmbiguous()) { if (position.equalsIgnoreCase("N")) { if (getIgnoreGaps()) { position = "[^\\-]"; } else { position = "."; } } else { StringBuilder positionBuffer = new StringBuilder("["); positionBuffer.append(residue); for (Nucleotide baseOption : base.getDegeneracy()) { positionBuffer.append(baseOption.getOneLetterCode()); } positionBuffer.append("]"); position = positionBuffer.toString(); } } if (getIgnoreGaps() && countSpec != null) { position = "(?:\\-*" + position + ")"; } } if (position.contains("<")) { position = "(?:" + position.replace("<", "") + "|\\A)"; } else if (position.contains(">")) { position = "(?:" + position.replace(">", "") + "|\\Z)"; } if (getIgnoreGaps() && countSpec != null) { regexp.append((nTerm ? "^" : "") + position + countSpec + (cTerm ? "$" : "")); } else { regexp.delimitedAppend((nTerm ? "^" : "") + position + (countSpec != null ? countSpec : "") + (cTerm ? "$" : "")); } } return regexp.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy