com.hfg.bio.seq.SeqUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedMap;
import com.hfg.util.io.StreamUtil;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
//------------------------------------------------------------------------------
/**
Utility class of generic sequence functions.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class SeqUtil
{
   /** Matches any single character that is not A, T, U, G, C, or N (case-insensitive). */
   private static final Pattern NON_NUC_PATTERN = Pattern.compile("[^ATUGCN]", Pattern.CASE_INSENSITIVE);

   /** Digits used to render checksum bytes as a lowercase hexadecimal map key. */
   private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

   //--------------------------------------------------------------------------
   /**
    Retains only a single copy of each distinct sequence.
    <p>
    Two sequences are treated as the same when their {@code getSHA1Checksum()} bytes are identical.
    When a checksum is seen more than once, the first sequence encountered is the one that is kept.
    The result is the value view of an {@code OrderedMap} (presumably in first-seen order; confirm
    against the OrderedMap implementation).
    </p>
    @param inSequences the collection of (potentially redundant) input sequences
    @param <T> BioSequence-implementing class
    @return the non-redundant collection of sequences
    */
   public static <T extends BioSequence> Collection<T> unique(Collection<T> inSequences)
   {
      Map<String, T> uniqueMap = new OrderedMap<String, T>();

      for (T seq : inSequences)
      {
         // Hex-encode the digest instead of decoding it as text: decoding arbitrary bytes with the
         // platform charset is lossy, so different checksums could otherwise end up as the same key.
         String checksumKey = toHexString(seq.getSHA1Checksum());

         if (! uniqueMap.containsKey(checksumKey))
         {
            uniqueMap.put(checksumKey, seq);
         }
      }

      return uniqueMap.values();
   }

   //--------------------------------------------------------------------------
   /**
    Guesses the BioSequenceType for the specified sequence. If strictly more than 80% of the
    characters are A, T, U, G, C, or N (in either case), the sequence is called as NUCLEIC_ACID.
    Otherwise it is called as PROTEIN.
    @param inSequence the sequence to guess the sequence type from
    @return the BioSequenceType of the input sequence
    */
   public static BioSequenceType guessBioSequenceType(String inSequence)
   {
      // Strip everything that is not a nucleotide code; what remains is the nucleotide character count.
      int nucleotideCharCount = StringUtil.replaceAllRegexp(inSequence, NON_NUC_PATTERN, "").length();

      // Is the sequence greater than 80% ATUGCN?
      if (nucleotideCharCount > (0.8 * inSequence.length()))
      {
         return BioSequenceType.NUCLEIC_ACID;
      }

      return BioSequenceType.PROTEIN;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the reverse complement of the specified nucleic acid sequence.
    <p>
    The input is reversed, converted to bytes using the platform default charset, passed through a
    {@code NucleicAcidComplementFilterInputStream}, and read back into a String via
    {@code StreamUtil.inputStreamToString()}.
    </p>
    @param inNucleicAcidSeq the nucleic acid sequence to reverse complement
    @return the reverse complement sequence
    @throws RuntimeException wrapping any IOException raised while reading the filtered stream
    */
   public static String getReverseComplementSequence(String inNucleicAcidSeq)
   {
      String reversedSeq = new StringBuilder(inNucleicAcidSeq).reverse().toString();

      try
      {
         // The filter stream performs the complementing; the reversal was already done above.
         return StreamUtil.inputStreamToString(
               new NucleicAcidComplementFilterInputStream(new ByteArrayInputStream(reversedSeq.getBytes())));
      }
      catch (IOException e)
      {
         throw new RuntimeException(e);
      }
   }

   //--------------------------------------------------------------------------
   /**
    * Creates a new SeqLocation that is relative to the opposite strand.
    * Ex: [2, 6] on the reverse strand of a nucleotide sequence of length 10,
    * would be converted to a forward strand relative location of [9, 5].
    * Each coordinate is mapped as (length - coordinate + 1). Only the chain id, start, and end
    * are set on the new location; the input location is not modified by this method.
    * @param inSeqLoc the initial sequence location
    * @param inNucleotideSeqLength the length of the nucleotide sequence containing the location
    * @return a SeqLocation relative to the opposite strand
    */
   public static SeqLocation flipStrandSeqLocation(SeqLocation inSeqLoc, int inNucleotideSeqLength)
   {
      int flippedStart = inNucleotideSeqLength - inSeqLoc.getStart() + 1;
      int flippedEnd   = inNucleotideSeqLength - inSeqLoc.getEnd() + 1;

      return new SeqLocation()
            .setChainId(inSeqLoc.getChainId())
            .setStart(flippedStart)
            .setEnd(flippedEnd);
   }

   //--------------------------------------------------------------------------
   /**
    Renders the specified bytes as a lowercase hexadecimal string (two characters per byte),
    giving a lossless, charset-independent text form of a checksum.
    @param inBytes the bytes to encode
    @return the hexadecimal representation of the bytes
    */
   private static String toHexString(byte[] inBytes)
   {
      StringBuilder buffer = new StringBuilder(inBytes.length * 2);
      for (byte b : inBytes)
      {
         buffer.append(HEX_DIGITS[(b >> 4) & 0x0F]);
         buffer.append(HEX_DIGITS[b & 0x0F]);
      }

      return buffer.toString();
   }
}