All downloads are free. Search and download functionality uses the official Maven repository.

com.hfg.bio.seq.SeqUtil Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq;


import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedMap;

import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;

//------------------------------------------------------------------------------
/**
 Utility class of generic sequence functions.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class SeqUtil { private static final Pattern NON_NUC_PATTERN = Pattern.compile("[^ATUGCN]", Pattern.CASE_INSENSITIVE); //-------------------------------------------------------------------------- /** Retains only a single copy of each distinct sequence. @param inSequences the collection of (potentially redundant) input sequences @param BioSequence-implementing class @return the non-redundant collection of sequences */ public static Collection unique(Collection inSequences) { Map uniqueMap = new OrderedMap(); for (T seq : inSequences) { byte[] checksum = seq.getSHA1Checksum(); String checksumString = new String(checksum); if (! 
uniqueMap.containsKey(checksumString)) { uniqueMap.put(checksumString, seq); } // else // { // System.out.println(seq.getID() + " is a duplicate of " + uniqueMap.get(checksumString).getID()); // } } return uniqueMap.values(); } //-------------------------------------------------------------------------- /** Guesses the BioSequenceType for the specified sequence. If the sequence is greater than 80% ATUGCN, it will be called as NUCLEIC_ACID. Otherwise it will be called as PROTEIN. @param inSequence the sequence to guess the sequence type from @return the BioSequenceType of the input sequence */ public static BioSequenceType guessBioSequenceType(String inSequence) { BioSequenceType guessedType = BioSequenceType.PROTEIN; // Is the sequence greater that 80% ATGCN? if (StringUtil.replaceAllRegexp(inSequence, NON_NUC_PATTERN, "").length() > (0.8 * inSequence.length())) { guessedType = BioSequenceType.NUCLEIC_ACID; } return guessedType; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy