com.hfg.bio.seq.BioSequenceImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import java.lang.reflect.Constructor;
import java.util.*;
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.*;
import com.hfg.bio.seq.format.SeqIOException;
import com.hfg.chem.Element;
import com.hfg.chem.Molecule;
import com.hfg.chem.OrganicMatter;
import com.hfg.chem.OrganicMatterImpl;
import com.hfg.exception.ProgrammingException;
import com.hfg.exception.UnimplementedMethodException;
import com.hfg.util.ChecksumUtil;
import com.hfg.util.io.GZIP;
import com.hfg.util.io.SegmentReader;
import com.hfg.util.io.StreamUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.StringUtil;
import com.hfg.xml.XMLNode;
import com.hfg.xml.XMLTag;
import com.hfg.xml.XMLizable;
//------------------------------------------------------------------------------
/**
Biological Protein, DNA, or RNA sequence.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
// TODO:
// - Create a composition filter that will track the composition as the sequence is set.
// - Restrict the sequence chars to the residue set.
public class BioSequenceImpl implements Cloneable, OrganicMatter, BioSequence
{
//##########################################################################
// PRIVATE FIELDS
//##########################################################################
private String mID;
private String mDescription;
private String mUncompressedSequence;
private List mCompressedSeqChunks;
private int mSeqLength;
private Integer mNumGaps;
private Integer mTotalGapLength;
private Map mComposition;
private Map mAttributes;
private byte[] mMD5Checksum;
private byte[] mSHA1Checksum;
// Tracks elemental composition and mass
private OrganicMatterImpl mMatter = new OrganicMatterImpl();
// How long the sequence should be before compression is used.
private static int sCompressionThreshold = 4 * 1024;
// Pattern of the characters that should get removed when setting the sequence
// private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[\\s\\d;_'\\\"]");
private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[^a-z\\*\\-]", Pattern.CASE_INSENSITIVE);
protected static final Pattern GAP_PATTERN = Pattern.compile("-+");
//##########################################################################
// CONSTRUCTORS
//##########################################################################
//--------------------------------------------------------------------------
public BioSequenceImpl()
{
}
//--------------------------------------------------------------------------
public BioSequenceImpl(XMLNode inXML)
{
setID(inXML.getAttributeValue(HfgBioXML.ID_ATT));
XMLNode descriptionTag = inXML.getOptionalSubtagByName(HfgBioXML.DESCRIPTION_TAG);
if (descriptionTag != null) setDescription(descriptionTag.getContent());
setSequence(inXML.getContent());
XMLNode attributesTag = inXML.getOptionalSubtagByName(HfgBioXML.ATTRIBUTES_TAG);
if (attributesTag != null)
{
List extends XMLNode> attributeTags = attributesTag.getSubtagsByName(HfgBioXML.ATTRIBUTE_TAG);
for (XMLNode attrTag : attributeTags)
{
String name = attrTag.getAttributeValue(HfgBioXML.NAME_ATT);
String className = attrTag.getAttributeValue(HfgBioXML.CLASS_ATT);
if (attrTag.getAttributeValue(HfgBioXML.IS_NULL_ATT) != null)
{
setAttribute(name, null);
}
else
{
String value = attrTag.getUnescapedContent();
try
{
Class clazz = Class.forName(className);
if (String.class.getName().equals(className))
{
setAttribute(name, value);
}
else if (Integer.class.getName().equals(className))
{
setAttribute(name, Integer.valueOf(value));
}
else if (Long.class.getName().equals(className))
{
setAttribute(name, Long.valueOf(value));
}
else if (Float.class.getName().equals(className))
{
setAttribute(name, Float.valueOf(value));
}
else if (Double.class.getName().equals(className))
{
setAttribute(name, Double.valueOf(value));
}
else if (Boolean.class.getName().equals(className))
{
setAttribute(name, Boolean.valueOf(value));
}
else
{
try
{
Constructor constructor = clazz.getConstructor(XMLTag.class);
setAttribute(name, constructor.newInstance(new XMLTag(new ByteArrayInputStream(value.getBytes()))));
}
catch (NoSuchMethodException e)
{
Constructor constructor = clazz.getConstructor(String.class);
setAttribute(name, constructor.newInstance(value));
}
}
}
catch (Exception e)
{
throw new RuntimeException("Problem extracting the value for seq attribute " + StringUtil.singleQuote(name) + "!", e);
}
}
}
}
}
//##########################################################################
// PUBLIC METHODS
//##########################################################################
//--------------------------------------------------------------------------
public BioSequenceType getType()
{
return null;
}
//--------------------------------------------------------------------------
@Override
public String toString()
{
return getSequence();
}
//--------------------------------------------------------------------------
@Override
public BioSequence clone()
{
clearCalculatedProperties();
BioSequenceImpl theClone;
try
{
theClone = (BioSequenceImpl) super.clone();
}
catch (CloneNotSupportedException e)
{
throw new RuntimeException(e);
}
if (mCompressedSeqChunks != null)
{
theClone.mCompressedSeqChunks = new ArrayList<>(mCompressedSeqChunks.size());
for (byte[] chunk : mCompressedSeqChunks)
{
theClone.mCompressedSeqChunks.add(chunk);
}
}
if (mComposition != null)
{
theClone.mComposition = new HashMap<>(mComposition.size());
for (String residue : mComposition.keySet())
{
theClone.mComposition.put(residue, mComposition.get(residue));
}
}
theClone.mMatter = mMatter.clone();
if (mAttributes != null)
{
// For right now this is a shallow clone of the attributes
theClone.mAttributes = new HashMap<>(mAttributes);
}
return theClone;
}
//--------------------------------------------------------------------------
public BioSequence setID(String inValue)
{
mID = inValue;
return this;
}
//--------------------------------------------------------------------------
public String getID()
{
return mID;
}
//--------------------------------------------------------------------------
public BioSequence setDescription(CharSequence inValue)
{
mDescription = (inValue != null ? inValue.toString() : null);
return this;
}
//--------------------------------------------------------------------------
public String getDescription()
{
return mDescription;
}
//--------------------------------------------------------------------------
public BioSequence setSequence(CharSequence inValue)
{
if (inValue != null)
{
setSequence(new StringReader(inValue.toString()));
}
else
{
mCompressedSeqChunks = null;
mUncompressedSequence = null;
clearCalculatedProperties();
}
return this;
}
//--------------------------------------------------------------------------
public BioSequence setSequence(Reader inReader)
throws SeqIOException
{
Reader bufferedReader = new BufferedReader(inReader);
boolean compressFlag = false;
int seqLength = 0;
StringBuilder uncompressedSeq = new StringBuilder();
try
{
char[] seqBuffer = new char[16 * 1024];
int readSize = 0;
while ((readSize = bufferedReader.read(seqBuffer)) >= 0)
{
// Remove whitespace and numbers
String rawSeqChunk = StringUtil.replaceAllRegexp(new String(seqBuffer, 0, readSize), SEQUENCE_PURIFICATION_PATTERN, "");
if (! compressFlag
&& uncompressedSeq.length() + rawSeqChunk.length() > sCompressionThreshold)
{
mCompressedSeqChunks = new ArrayList<>();
if (uncompressedSeq.length() > 0)
{
mCompressedSeqChunks.add(GZIP.compress(uncompressedSeq.toString()));
}
uncompressedSeq = null;
compressFlag = true;
}
if (compressFlag)
{
mCompressedSeqChunks.add(GZIP.compress(rawSeqChunk));
seqLength += rawSeqChunk.length();
}
else
{
uncompressedSeq.append(rawSeqChunk);
}
}
}
catch (Exception e)
{
if (e instanceof SeqIOException)
{
throw (SeqIOException) e;
}
else
{
throw new SeqIOException(e);
}
}
if (uncompressedSeq != null)
{
mUncompressedSequence = uncompressedSeq.toString();
mSeqLength = mUncompressedSequence.length();
}
else
{
mSeqLength = seqLength;
mUncompressedSequence = null;
}
// Calculate checksums TODO: Calculate on the stream as the sequence is set
clearCalculatedProperties();
return this;
}
//--------------------------------------------------------------------------
public String getSequence()
{
String outSeq = null;
if (mUncompressedSequence != null)
{
outSeq = mUncompressedSequence;
}
else if (CollectionUtil.hasValues(mCompressedSeqChunks))
{
try
{
outSeq = StreamUtil.inputStreamToString(getSequenceStream());
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
return outSeq;
}
//--------------------------------------------------------------------------
public Reader getSequenceReader()
{
Reader seqReader = null;
InputStream seqStream = getSequenceStream();
if (seqStream != null)
{
seqReader = new InputStreamReader(seqStream);
}
return seqReader;
}
//--------------------------------------------------------------------------
public Reader getSubSequenceReader(SeqLocation inSeqLocation)
{
SeqLocation seqLocation = inSeqLocation;
if (null == seqLocation)
{
seqLocation = new SeqLocation(1, length());
}
return new SegmentReader(getSequenceReader(), seqLocation);
}
//--------------------------------------------------------------------------
public InputStream getSequenceStream()
{
InputStream seqStream = null;
if (mUncompressedSequence != null)
{
seqStream = new ByteArrayInputStream(mUncompressedSequence.getBytes());
}
else if (CollectionUtil.hasValues(mCompressedSeqChunks))
{
seqStream = new SeqStreamer(Direction.FORWARD);
}
return seqStream;
}
//--------------------------------------------------------------------------
public String getSubSequence(SeqLocation inSeqLocation)
{
String sequence = null;
Reader reader = null;
try
{
reader = getSubSequenceReader(inSeqLocation);
sequence = StreamUtil.readerToString(reader);
}
catch (IOException e)
{
throw new ProgrammingException(e);
}
finally
{
StreamUtil.close(reader);
}
return sequence;
}
//--------------------------------------------------------------------------
public byte[] getMD5Checksum()
{
if (null == mMD5Checksum
&& getSequence() != null)
{
mMD5Checksum = ChecksumUtil.calculateMD5(getSequence());
}
return mMD5Checksum;
}
//--------------------------------------------------------------------------
public byte[] getSHA1Checksum()
{
if (null == mSHA1Checksum
&& getSequence() != null)
{
mSHA1Checksum = ChecksumUtil.calculateSHA1(getSequence());
}
return mSHA1Checksum;
}
//--------------------------------------------------------------------------
public boolean containsGaps()
{
return (getNumGaps() > 0);
}
//--------------------------------------------------------------------------
public Integer getTotalGapLength()
{
if (null == mTotalGapLength)
{
getNumGaps();
}
return mTotalGapLength;
}
//--------------------------------------------------------------------------
protected void setNumGaps(Integer inValue)
{
mNumGaps = inValue;
}
//--------------------------------------------------------------------------
protected void setTotalGapLength(Integer inValue)
{
mTotalGapLength = inValue;
}
//--------------------------------------------------------------------------
public Integer getNumGaps()
{
if (null == mNumGaps)
{
countGaps();
}
return mNumGaps;
}
//--------------------------------------------------------------------------
protected void countGaps()
{
Matcher m = GAP_PATTERN.matcher(getSequence());
int count = 0;
int totalGapLength = 0;
while (m.find())
{
count++;
totalGapLength += m.group(0).length();
}
mNumGaps = count;
mTotalGapLength = totalGapLength;
}
//--------------------------------------------------------------------------
public void setAttribute(String inName, Object inValue)
{
if (null == mAttributes)
{
mAttributes = new HashMap<>();
}
mAttributes.put(inName, inValue);
}
//--------------------------------------------------------------------------
public boolean hasAttribute(String inName)
{
return mAttributes != null && mAttributes.containsKey(inName);
}
//--------------------------------------------------------------------------
public Object getAttribute(String inName)
{
Object attr = null;
if (mAttributes != null)
{
attr = mAttributes.get(inName);
}
return attr;
}
//--------------------------------------------------------------------------
public Collection getAttributeNames()
{
Collection attrNames = null;
if (mAttributes != null)
{
attrNames = mAttributes.keySet();
}
return attrNames;
}
//--------------------------------------------------------------------------
public void clearAttributes()
{
if (mAttributes != null)
{
mAttributes.clear();
}
}
//--------------------------------------------------------------------------
public Object removeAttribute(String inName)
{
Object attr = null;
if (mAttributes != null)
{
attr = mAttributes.remove(inName);
}
return attr;
}
//--------------------------------------------------------------------------
public int length()
{
return mSeqLength;
}
//--------------------------------------------------------------------------
/**
Returns the residue character at the specified (1-based) sequence location.
@param inIndex the 1-based residue position
@return the residue at the specified position
*/
public char residueAt(int inIndex)
{
if (inIndex <= 0
|| inIndex > length())
{
throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!");
}
Reader reader = getSequenceReader();
try
{
reader.skip(inIndex - 1);
return (char) reader.read();
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
//--------------------------------------------------------------------------
/**
Specifies the residue to use at the specified (1-based) sequence location.
* @param inIndex the 1-based residue position
* @param inResidue the residue to place at the specified position
*/
public void setResidueAt(int inIndex, char inResidue)
{
if (inIndex <= 0
|| inIndex > length())
{
throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!");
}
StringBuilder buffer = new StringBuilder(getSequence());
buffer.setCharAt(inIndex - 1, inResidue);
setSequence(buffer.toString());
}
//--------------------------------------------------------------------------
public XMLNode toXMLNode()
{
XMLNode node = new XMLTag(HfgBioXML.HFGBIOSEQ_TAG);
node.setAttribute(HfgBioXML.ID_ATT, getID());
node.setAttribute(HfgBioXML.CLASS_ATT, getClass().getName());
node.setContent(getSequence());
if (StringUtil.isSet(getDescription()))
{
XMLNode descTag = new XMLTag(HfgBioXML.DESCRIPTION_TAG);
descTag.setContent(getDescription());
node.addSubtag(descTag);
}
if (mAttributes != null)
{
XMLNode attributesTag = new XMLTag(HfgBioXML.ATTRIBUTES_TAG);
node.addSubtag(attributesTag);
for (String attrName : mAttributes.keySet())
{
XMLNode attributeTag = new XMLTag(HfgBioXML.ATTRIBUTE_TAG);
attributeTag.setAttribute(HfgBioXML.NAME_ATT, attrName);
Object value = mAttributes.get(attrName);
if (value != null)
{
attributeTag.setAttribute(HfgBioXML.CLASS_ATT, value.getClass().getName());
if (value instanceof XMLizable)
{
attributeTag.addContentWithoutEscaping(((XMLizable)value).toXML());
}
else
{
attributeTag.setContent(value.toString());
}
}
else
{
attributeTag.setAttribute(HfgBioXML.IS_NULL_ATT, 1);
}
attributesTag.addSubtag(attributeTag);
}
}
return node;
}
//--------------------------------------------------------------------------
/**
Returns a Map with 1-letter code Strings as keys and Integers as the values.
If a residue isn't present in the sequence, there won't be an entry for it in the map.
@return Map with 1-letter code Strings as keys and Integers as the values
*/
protected Map getComposition()
{
if (null == mComposition)
{
Map map = new HashMap<>();
if (length() < sCompressionThreshold)
{
// This should be more performant when the sequence is short
String seq = getSequence();
for (int i = 0; i < seq.length(); i++)
{
char seqChar = seq.charAt(i);
int[] counter = map.get(seqChar + "");
if (null == counter)
{
map.put(seqChar + "", new int[]{1});
}
else
{
counter[0]++;
}
}
}
else
{
try
{
Reader seqReader = null;
try
{
seqReader = getSequenceReader();
if (seqReader != null)
{
int seqChar;
while ((seqChar = seqReader.read()) != -1)
{
int[] counter = map.get((char) seqChar + "");
if (null == counter)
{
map.put((char) seqChar + "", new int[]{1});
}
else
{
counter[0]++;
}
}
}
}
finally
{
if (seqReader != null) seqReader.close();
}
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
mComposition = new HashMap<>();
for (String residue : map.keySet())
{
mComposition.put(residue, map.get(residue)[0]);
}
}
return mComposition;
}
//--------------------------------------------------------------------------
/**
Returns an unmodifiable copy of the elemental composition Map. The keys are
Element objects and the values are Floats. Why Floats instead of Integers you
ask? Because some amino acid codes such as B and Z are ambiguous averages.
@return the elemental composition map
*/
public Map getElementalComposition()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getElementalComposition();
}
//--------------------------------------------------------------------------
/**
Returns a chemical formula String like 'C5H11NO'. If carbon is present, it
is listed first followed by the other elements in ascending mass order.
Symbols for pure elements are enclosed in square brackets such as '[2H]2O'
for deuterated water.
@return the un-formatted chemical formula string
*/
public String getChemicalFormula()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getChemicalFormula();
}
//--------------------------------------------------------------------------
/**
Returns a chemical formula String like 'C₅H₁₁NO'. If carbon is present, it
is listed first followed by the other elements in ascending mass order.
Symbols for isotopes are enclosed in square brackets such as '[²H]₂O'
for deuterated water.
@return the chemical formula string
*/
public String getChemicalFormulaWithSubscripts()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getChemicalFormulaWithSubscripts();
}
//--------------------------------------------------------------------------
public Double getMonoisotopicMass()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getMonoisotopicMass();
}
//--------------------------------------------------------------------------
public Double getAverageMass()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getAverageMass();
}
//--------------------------------------------------------------------------
/**
Organic mass values used are from:
Zhang Z, Pan H, Chen X. 2009. Mass spectrometry for structural characterization
of therapeutic antibodies. Mass Spectrom Rev 28:147-176.
@return the organic average mass for the sequence
*/
public Double getOrganicAverageMass()
{
if (! CollectionUtil.hasValues(mMatter.getElementalComposition()))
{
recalculateElementalComposition();
}
return mMatter.getOrganicAverageMass();
}
//--------------------------------------------------------------------------
public void clearCalculatedProperties()
{
mMatter.clearCalculatedProperties();
mMatter.clearElementalComposition();
mComposition = null;
mNumGaps = null;
mTotalGapLength = null;
}
//##########################################################################
// PROTECTED METHODS
//##########################################################################
//--------------------------------------------------------------------------
// Needs to be implemented by extending classes.
protected Map getResidueComposition()
{
throw new UnimplementedMethodException("getResidueComposition() not implemented for this class!");
}
//--------------------------------------------------------------------------
// Needs to be implemented by extending classes.
protected Map getTerminiComposition()
{
throw new UnimplementedMethodException("getTerminiComposition() not implemented for this class!");
}
//--------------------------------------------------------------------------
// Needs to be implemented by extending classes.
protected Map extends Molecule, Integer> getXLinkComposition()
{
throw new UnimplementedMethodException("getXLinkComposition() not implemented for this class!");
}
//--------------------------------------------------------------------------
protected OrganicMatterImpl getOrganicMatter()
{
return mMatter;
}
//--------------------------------------------------------------------------
protected void recalculateElementalComposition()
{
mMatter.clearElementalComposition();
// Residues
Map residueComposition = getResidueComposition();
if (CollectionUtil.hasValues(residueComposition))
{
for (Molecule residue : residueComposition.keySet())
{
int count = residueComposition.get(residue);
if (count > 0)
{
mMatter.addElementalComposition(residue.getElementalComposition(), count);
}
}
}
// Termini
Map terminiComposition = getTerminiComposition();
if (CollectionUtil.hasValues(terminiComposition))
{
for (Molecule terminus : terminiComposition.keySet())
{
int count = terminiComposition.get(terminus);
if (count > 0)
{
mMatter.addElementalComposition(terminus.getElementalComposition(), count);
}
}
}
// X-Links?
Map extends Molecule, Integer> xLinkComposition = getXLinkComposition();
if (CollectionUtil.hasValues(xLinkComposition))
{
for (Molecule xlink : xLinkComposition.keySet())
{
int count = xLinkComposition.get(xlink);
if (count > 0)
{
mMatter.addElementalComposition(xlink.getElementalComposition(), count);
}
}
}
}
//--------------------------------------------------------------------------
protected InputStream getReverseSequenceStream()
{
InputStream seqStream = null;
if (mUncompressedSequence != null)
{
seqStream = new ByteArrayInputStream(new StringBuilder(mUncompressedSequence).reverse().toString().getBytes());
}
else if (CollectionUtil.hasValues(mCompressedSeqChunks))
{
seqStream = new SeqStreamer(Direction.REVERSE);
}
return seqStream;
}
//##########################################################################
// INNER CLASSES
//##########################################################################
private enum Direction
{
FORWARD, REVERSE;
}
private class SeqStreamer extends InputStream
{
private Direction mDirection;
private String mCurrentChunk;
private int mCurrentChunkIndex;
private int mCharIndex;
private boolean mDone = false;
//-----------------------------------------------------------------------
public SeqStreamer(Direction inDirection)
{
mDirection = inDirection;
mCurrentChunkIndex = (mDirection == Direction.FORWARD ? 0 : mCompressedSeqChunks.size() - 1);
}
//-----------------------------------------------------------------------
public int read()
{
return (mDone ? -1 : getNextChar());
}
//-----------------------------------------------------------------------
private char getNextChar()
{
if (null == mCurrentChunk)
{
if (mDirection == Direction.FORWARD)
{
mCurrentChunk = GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex));
}
else
{
mCurrentChunk = new StringBuilder(GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex))).reverse().toString();
}
mCharIndex = 0;
}
char nextChar = mCurrentChunk.charAt(mCharIndex++);
if (mCharIndex >= mCurrentChunk.length())
{
// This is the last char in this chunk.
mCurrentChunk = null;
mCurrentChunkIndex = mCurrentChunkIndex + (mDirection == Direction.FORWARD ? 1 : - 1);
if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedSeqChunks.size())
{
// This was the last chunk.
mDone = true;
}
}
return nextChar;
}
}
}