com.hfg.bio.seq.SeqUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedMap;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
//------------------------------------------------------------------------------
/**
Utility class of generic sequence functions.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class SeqUtil
{
   /** Matches any character that is not one of the nucleotide codes A, T, U, G, C, or N (case-insensitive). */
   private static final Pattern NON_NUC_PATTERN = Pattern.compile("[^ATUGCN]", Pattern.CASE_INSENSITIVE);

   /** Fraction of nucleotide characters a sequence must exceed to be called as NUCLEIC_ACID. */
   private static final double NUCLEIC_ACID_THRESHOLD = 0.8;

   /** Lookup table used to hex-encode checksum bytes. */
   private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

   //--------------------------------------------------------------------------
   /**
    Retains only a single copy of each distinct sequence. Two sequences are treated
    as identical when their SHA-1 checksums (as returned by getSHA1Checksum()) are
    byte-for-byte equal. When duplicates are present, the first one encountered while
    iterating over the input is the one that is kept.
    <p>
    The result is the values view of an OrderedMap; presumably this preserves the
    order in which the retained sequences were first seen (TODO confirm against OrderedMap).
    </p>
    @param inSequences the collection of (potentially redundant) input sequences
    @param <T> BioSequence-implementing class
    @return the non-redundant collection of sequences
    */
   public static <T extends BioSequence> Collection<T> unique(Collection<T> inSequences)
   {
      Map<String, T> uniqueMap = new OrderedMap<String, T>();

      for (T seq : inSequences)
      {
         // Hex-encode the raw digest to build the map key. Decoding the bytes with
         // new String(byte[]) is lossy: it depends on the platform default charset and
         // malformed byte sequences are replaced, so distinct checksums could collide.
         String checksumString = toHexString(seq.getSHA1Checksum());

         // First occurrence wins; later duplicates are silently dropped.
         if (! uniqueMap.containsKey(checksumString))
         {
            uniqueMap.put(checksumString, seq);
         }
      }

      return uniqueMap.values();
   }

   //--------------------------------------------------------------------------
   /**
    Guesses the BioSequenceType for the specified sequence. If strictly more than 80% of the
    sequence's characters are A, T, U, G, C, or N (in either case), it will be called as
    NUCLEIC_ACID. Otherwise it will be called as PROTEIN. Every other character in the
    string (including any whitespace or gap characters) counts against the nucleotide fraction.
    @param inSequence the sequence to guess the sequence type from
    @return the BioSequenceType of the input sequence
    */
   public static BioSequenceType guessBioSequenceType(String inSequence)
   {
      BioSequenceType guessedType = BioSequenceType.PROTEIN;

      // Strip everything that is not a nucleotide code; what remains is the nucleotide count.
      int nucleotideCount = StringUtil.replaceAllRegexp(inSequence, NON_NUC_PATTERN, "").length();

      // Is the sequence greater than 80% ATUGCN?
      if (nucleotideCount > (NUCLEIC_ACID_THRESHOLD * inSequence.length()))
      {
         guessedType = BioSequenceType.NUCLEIC_ACID;
      }

      return guessedType;
   }

   //--------------------------------------------------------------------------
   /**
    Losslessly encodes the specified bytes as a lowercase hexadecimal string
    (two characters per byte), so that distinct byte arrays always produce distinct strings.
    @param inBytes the bytes to encode
    @return the hexadecimal representation of the input bytes
    */
   private static String toHexString(byte[] inBytes)
   {
      StringBuilder buffer = new StringBuilder(inBytes.length * 2);
      for (byte b : inBytes)
      {
         buffer.append(HEX_DIGITS[(b >> 4) & 0x0F]);
         buffer.append(HEX_DIGITS[b & 0x0F]);
      }

      return buffer.toString();
   }
}