All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.BioSequenceImpl Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq;

import java.lang.reflect.Constructor;
import java.util.*;
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.*;
import com.hfg.bio.seq.format.SeqIOException;
import com.hfg.chem.Element;
import com.hfg.chem.Molecule;
import com.hfg.exception.ProgrammingException;
import com.hfg.exception.UnimplementedMethodException;
import com.hfg.util.ChecksumUtil;
import com.hfg.util.io.GZIP;
import com.hfg.util.io.SegmentReader;
import com.hfg.util.io.StreamUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.StringUtil;
import com.hfg.xml.XMLNode;
import com.hfg.xml.XMLTag;
import com.hfg.xml.XMLizable;


//------------------------------------------------------------------------------
/**
 Biological Protein, DNA, or RNA sequence.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg XML/HTML Coding Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ // TODO: // - Create a composition filter that will track the composition as the sequence is set. // - Restrict the sequence chars to the residue set. public abstract class BioSequenceImpl extends Molecule implements Cloneable, BioSequence { //########################################################################## // PRIVATE FIELDS //########################################################################## private String mID; private String mDescription; private String mUncompressedSequence; private List mCompressedSeqChunks; private int mSeqLength; private Integer mNumGaps; private Integer mTotalGapLength; private Map mComposition; private byte[] mMD5Checksum; private byte[] mSHA1Checksum; // How long the sequence should be before compression is used. private int mCompressionThreshold = sDefaultCompressionThreshold; // Default sequence compression threshold. private static int sDefaultCompressionThreshold = 4 * 1024; // Pattern of the characters that should get removed when setting the sequence // private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[\\s\\d;_'\\\"]"); private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[^a-z\\*\\-]", Pattern.CASE_INSENSITIVE); protected static final Pattern GAP_PATTERN = Pattern.compile("-+"); //########################################################################## // CONSTRUCTORS //########################################################################## //-------------------------------------------------------------------------- public BioSequenceImpl() { } //-------------------------------------------------------------------------- public BioSequenceImpl(XMLNode inXML) { setID(inXML.getAttributeValue(HfgBioXML.ID_ATT)); XMLNode descriptionTag = inXML.getOptionalSubtagByName(HfgBioXML.DESCRIPTION_TAG); if (descriptionTag != null) setDescription(descriptionTag.getContent()); setSequence(inXML.getContent()); XMLNode attributesTag = inXML.getOptionalSubtagByName(HfgBioXML.ATTRIBUTES_TAG); if (attributesTag != null) { List attributeTags = attributesTag.getSubtagsByName(HfgBioXML.ATTRIBUTE_TAG); for (XMLNode attrTag : attributeTags) { String name = attrTag.getAttributeValue(HfgBioXML.NAME_ATT); String className = attrTag.getAttributeValue(HfgBioXML.CLASS_ATT); if (attrTag.getAttributeValue(HfgBioXML.IS_NULL_ATT) != null) { setAttribute(name, null); } else { String value = attrTag.getUnescapedContent(); try { Class clazz = Class.forName(className); if (String.class.getName().equals(className)) { setAttribute(name, value); } else if (Integer.class.getName().equals(className)) { setAttribute(name, Integer.valueOf(value)); } else if (Long.class.getName().equals(className)) { setAttribute(name, Long.valueOf(value)); } else if (Float.class.getName().equals(className)) { setAttribute(name, Float.valueOf(value)); } else if (Double.class.getName().equals(className)) { setAttribute(name, Double.valueOf(value)); } else if (Boolean.class.getName().equals(className)) { setAttribute(name, Boolean.valueOf(value)); } else { try { Constructor constructor = clazz.getConstructor(XMLTag.class); setAttribute(name, constructor.newInstance(new XMLTag(new ByteArrayInputStream(value.getBytes())))); } catch (NoSuchMethodException e) { Constructor constructor = clazz.getConstructor(String.class); setAttribute(name, constructor.newInstance(value)); } } } catch (Exception e) { throw new RuntimeException("Problem extracting the value for seq attribute " + StringUtil.singleQuote(name) + "!", e); } } } } } //########################################################################## // PUBLIC METHODS //########################################################################## //--------------------------------------------------------------------------- public BioSequence setCompressionThreshold(int inNumBytes) { mCompressionThreshold = inNumBytes; return this; } //-------------------------------------------------------------------------- public BioSequenceType getType() { return null; } //-------------------------------------------------------------------------- @Override public String toString() { return getSequence(); } //-------------------------------------------------------------------------- @Override public BioSequenceImpl clone() { clearCalculatedProperties(); BioSequenceImpl theClone = (BioSequenceImpl) super.clone(); if (mCompressedSeqChunks != null) { theClone.mCompressedSeqChunks = new ArrayList<>(mCompressedSeqChunks.size()); for (byte[] chunk : mCompressedSeqChunks) { theClone.mCompressedSeqChunks.add(chunk); } } if (mComposition != null) { theClone.mComposition = new HashMap<>(mComposition.size()); for (String residue : mComposition.keySet()) { theClone.mComposition.put(residue, mComposition.get(residue)); } } return theClone; } //--------------------------------------------------------------------------- @Override public boolean equals(Object inObj2) { return (0 == compareTo(inObj2)); } //-------------------------------------------------------------------------- public BioSequence setID(String inValue) { mID = inValue; return this; } //-------------------------------------------------------------------------- public String getID() { return mID; } //-------------------------------------------------------------------------- public BioSequence setDescription(CharSequence inValue) { mDescription = (inValue != null ? inValue.toString() : null); return this; } //-------------------------------------------------------------------------- public String getDescription() { return mDescription; } //-------------------------------------------------------------------------- public BioSequence setSequence(CharSequence inValue) { if (inValue != null) { setSequence(new StringReader(inValue.toString())); } else { mCompressedSeqChunks = null; mUncompressedSequence = null; clearElementalCompositionAndCalculatedProperties(); } return this; } //-------------------------------------------------------------------------- public BioSequence setSequence(Reader inReader) throws SeqIOException { // Remove whitespace and numbers Reader bufferedReader = new BufferedReader(new SeqFilterReader(inReader)); boolean compressFlag = false; int seqLength = 0; StringBuilder uncompressedSeq = null; try { char[] readBuffer = new char[1024]; int readSize = 0; while ((readSize = bufferedReader.read(readBuffer)) >= 0) { if (readSize > mCompressionThreshold) { mCompressedSeqChunks = new ArrayList<>(4); compressFlag = true; } if (! compressFlag && uncompressedSeq != null && uncompressedSeq.length() + readSize > mCompressionThreshold) { // Compress what has already accumulated in the uncompressed seq mCompressedSeqChunks = new ArrayList<>(4); if (uncompressedSeq.length() > 0) { mCompressedSeqChunks.add(GZIP.compress(uncompressedSeq.toString())); seqLength = uncompressedSeq.length(); } uncompressedSeq = null; mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize))); seqLength += readSize; compressFlag = true; // Use a bigger read buffer moving forward readBuffer = new char[mCompressionThreshold]; } else if (compressFlag) { mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize))); seqLength += readSize; } else { if (null == uncompressedSeq) { uncompressedSeq = new StringBuilder(readSize); } uncompressedSeq.append(readBuffer, 0, readSize); } } } catch (SeqIOException e) { throw e; } catch (Exception e) { throw new SeqIOException(e); } if (uncompressedSeq != null) { mUncompressedSequence = uncompressedSeq.toString(); mSeqLength = mUncompressedSequence.length(); } else { mSeqLength = seqLength; mUncompressedSequence = null; } // Calculate checksums TODO: Calculate on the stream as the sequence is set clearElementalCompositionAndCalculatedProperties(); return this; } //-------------------------------------------------------------------------- public String getSequence() { String outSeq = null; if (mUncompressedSequence != null) { outSeq = mUncompressedSequence; } else if (CollectionUtil.hasValues(mCompressedSeqChunks)) { try { outSeq = StreamUtil.inputStreamToString(getSequenceStream()); } catch (IOException e) { throw new RuntimeException(e); } } return outSeq; } //-------------------------------------------------------------------------- public Reader getSequenceReader() { Reader seqReader = null; InputStream seqStream = getSequenceStream(); if (seqStream != null) { seqReader = new InputStreamReader(seqStream); } return seqReader; } //-------------------------------------------------------------------------- public Reader getSubSequenceReader(SeqLocation inSeqLocation) { SeqLocation seqLocation = inSeqLocation; if (null == seqLocation) { seqLocation = new SeqLocation(1, length()); } return new SegmentReader(getSequenceReader(), seqLocation); } //-------------------------------------------------------------------------- public InputStream getSequenceStream() { InputStream seqStream = null; if (mUncompressedSequence != null) { seqStream = new ByteArrayInputStream(mUncompressedSequence.getBytes()); } else if (CollectionUtil.hasValues(mCompressedSeqChunks)) { seqStream = new SeqStreamer(Direction.FORWARD); } return seqStream; } //-------------------------------------------------------------------------- public String getSubSequence(SeqLocation inSeqLocation) { String sequence = null; Reader reader = null; try { reader = getSubSequenceReader(inSeqLocation); sequence = StreamUtil.readerToString(reader); } catch (IOException e) { throw new ProgrammingException(e); } finally { StreamUtil.close(reader); } return sequence; } //-------------------------------------------------------------------------- public byte[] getMD5Checksum() { if (null == mMD5Checksum && getSequence() != null) { mMD5Checksum = ChecksumUtil.calculateMD5(getSequence()); } return mMD5Checksum; } //-------------------------------------------------------------------------- public byte[] getSHA1Checksum() { if (null == mSHA1Checksum && getSequence() != null) { mSHA1Checksum = ChecksumUtil.calculateSHA1(getSequence()); } return mSHA1Checksum; } //-------------------------------------------------------------------------- public boolean containsGaps() { return (getNumGaps() > 0); } //-------------------------------------------------------------------------- public Integer getTotalGapLength() { if (null == mTotalGapLength) { getNumGaps(); } return mTotalGapLength; } //-------------------------------------------------------------------------- protected void setNumGaps(Integer inValue) { mNumGaps = inValue; } //-------------------------------------------------------------------------- protected void setTotalGapLength(Integer inValue) { mTotalGapLength = inValue; } //-------------------------------------------------------------------------- public Integer getNumGaps() { if (null == mNumGaps) { countGaps(); } return mNumGaps; } //-------------------------------------------------------------------------- protected void countGaps() { Matcher m = GAP_PATTERN.matcher(getSequence()); int count = 0; int totalGapLength = 0; while (m.find()) { count++; totalGapLength += m.group(0).length(); } mNumGaps = count; mTotalGapLength = totalGapLength; } //-------------------------------------------------------------------------- public int length() { return mSeqLength; } //-------------------------------------------------------------------------- /** Returns the residue character at the specified (1-based) sequence location. @param inIndex the 1-based residue position @return the residue at the specified position */ public char residueAt(int inIndex) { if (inIndex <= 0 || inIndex > length()) { throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!"); } Reader reader = getSequenceReader(); try { reader.skip(inIndex - 1); return (char) reader.read(); } catch (IOException e) { throw new RuntimeException(e); } } //-------------------------------------------------------------------------- /** Specifies the residue to use at the specified (1-based) sequence location. * @param inIndex the 1-based residue position * @param inResidue the residue to place at the specified position */ public void setResidueAt(int inIndex, char inResidue) { if (inIndex <= 0 || inIndex > length()) { throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!"); } StringBuilder buffer = new StringBuilder(getSequence()); buffer.setCharAt(inIndex - 1, inResidue); setSequence(buffer.toString()); } //-------------------------------------------------------------------------- @Override public XMLNode toXMLNode() { XMLNode node = new XMLTag(HfgBioXML.HFGBIOSEQ_TAG); node.setAttribute(HfgBioXML.ID_ATT, getID()); node.setAttribute(HfgBioXML.CLASS_ATT, getClass().getName()); node.setContent(getSequence()); if (StringUtil.isSet(getDescription())) { XMLNode descTag = new XMLTag(HfgBioXML.DESCRIPTION_TAG); descTag.setContent(getDescription()); node.addSubtag(descTag); } if (hasAttributes()) { XMLNode attributesTag = new XMLTag(HfgBioXML.ATTRIBUTES_TAG); node.addSubtag(attributesTag); for (String attrName : getAttributeNames()) { XMLNode attributeTag = new XMLTag(HfgBioXML.ATTRIBUTE_TAG); attributeTag.setAttribute(HfgBioXML.NAME_ATT, attrName); Object value = getAttribute(attrName); if (value != null) { attributeTag.setAttribute(HfgBioXML.CLASS_ATT, value.getClass().getName()); if (value instanceof XMLizable) { attributeTag.addContentWithoutEscaping(((XMLizable)value).toXML()); } else { attributeTag.setContent(value.toString()); } } else { attributeTag.setAttribute(HfgBioXML.IS_NULL_ATT, 1); } attributesTag.addSubtag(attributeTag); } } return node; } //-------------------------------------------------------------------------- /** Returns a Map with 1-letter code Strings as keys and Integers as the values. If a residue isn't present in the sequence, there won't be an entry for it in the map. @return Map with 1-letter code Strings as keys and Integers as the values */ protected Map getComposition() { if (null == mComposition) { Map map = new HashMap<>(); if (length() < mCompressionThreshold) { // This should be more performant when the sequence is short String seq = getSequence(); for (int i = 0; i < seq.length(); i++) { char seqChar = seq.charAt(i); int[] counter = map.get(seqChar + ""); if (null == counter) { map.put(seqChar + "", new int[]{1}); } else { counter[0]++; } } } else { try { Reader seqReader = null; try { seqReader = getSequenceReader(); if (seqReader != null) { int seqChar; while ((seqChar = seqReader.read()) != -1) { int[] counter = map.get((char) seqChar + ""); if (null == counter) { map.put((char) seqChar + "", new int[]{1}); } else { counter[0]++; } } } } finally { if (seqReader != null) seqReader.close(); } } catch (IOException e) { throw new RuntimeException(e); } } mComposition = new HashMap<>(); for (String residue : map.keySet()) { mComposition.put(residue, map.get(residue)[0]); } } return mComposition; } //-------------------------------------------------------------------------- /** Returns an unmodifiable copy of the elemental composition Map. The keys are Element objects and the values are Floats. Why Floats instead of Integers you ask? Because some amino acid codes such as B and Z are ambiguous averages. @return the elemental composition map */ @Override public Map getElementalComposition() { Map elementalComposition = super.getElementalComposition(); if (! CollectionUtil.hasValues(elementalComposition)) { recalculateElementalComposition(); elementalComposition = super.getElementalComposition(); } return elementalComposition; } //-------------------------------------------------------------------------- /** Returns a chemical formula String like 'C5H11NO'. If carbon is present, it is listed first followed by the other elements in ascending mass order. Symbols for pure elements are enclosed in square brackets such as '[2H]2O' for deuterated water. @return the un-formatted chemical formula string */ @Override public String getChemicalFormula() { String chemicalFormula = super.getChemicalFormula(); if (null == chemicalFormula) { getElementalComposition(); chemicalFormula = super.getChemicalFormula(); } return chemicalFormula; } //-------------------------------------------------------------------------- /** Returns a chemical formula String like 'C₅H₁₁NO'. If carbon is present, it is listed first followed by the other elements in ascending mass order. Symbols for isotopes are enclosed in square brackets such as '[²H]₂O' for deuterated water. @return the chemical formula string */ @Override public String getChemicalFormulaWithSubscripts() { String chemicalFormula = super.getChemicalFormulaWithSubscripts(); if (null == chemicalFormula) { getElementalComposition(); chemicalFormula = super.getChemicalFormulaWithSubscripts(); } return chemicalFormula; } //-------------------------------------------------------------------------- @Override public Double getMonoisotopicMass() { Double mass = super.getMonoisotopicMass(); if (null == mass) { getElementalComposition(); mass = super.getMonoisotopicMass(); } return mass; } //-------------------------------------------------------------------------- @Override public Double getAverageMass() { Double mass = super.getAverageMass(); if (null == mass) { getElementalComposition(); mass = super.getAverageMass(); } return mass; } //-------------------------------------------------------------------------- /** Organic mass values used are from:
      Zhang Z, Pan H, Chen X. 2009. Mass spectrometry for structural characterization
      of therapeutic antibodies. Mass Spectrom Rev 28:147-176.
    
@return the organic average mass for the sequence */ @Override public Double getOrganicAverageMass() { Double mass = super.getOrganicAverageMass(); if (null == mass) { getElementalComposition(); mass = super.getOrganicAverageMass(); } return mass; } //-------------------------------------------------------------------------- @Override public void clearCalculatedProperties() { super.clearCalculatedProperties(); mComposition = null; mNumGaps = null; mTotalGapLength = null; } //########################################################################## // PROTECTED METHODS //########################################################################## //-------------------------------------------------------------------------- // Setup this way to avoid a stackoverflow if clearElementalComposition() is called within clearCalculatedProperties(). public void clearElementalCompositionAndCalculatedProperties() { super.clearElementalComposition(); mComposition = null; mNumGaps = null; mTotalGapLength = null; } //-------------------------------------------------------------------------- // Needs to be implemented by extending classes. protected Map getResidueComposition() { throw new UnimplementedMethodException("getResidueComposition() not implemented for this class!"); } //-------------------------------------------------------------------------- // Needs to be implemented by extending classes. protected Map getTerminiComposition() { throw new UnimplementedMethodException("getTerminiComposition() not implemented for this class!"); } //-------------------------------------------------------------------------- // Needs to be implemented by extending classes. protected Map getXLinkComposition() { throw new UnimplementedMethodException("getXLinkComposition() not implemented for this class!"); } //-------------------------------------------------------------------------- protected void recalculateElementalComposition() { clearElementalComposition(); // Residues Map residueComposition = getResidueComposition(); if (CollectionUtil.hasValues(residueComposition)) { for (Molecule residue : residueComposition.keySet()) { int count = residueComposition.get(residue); if (count > 0) { addElementalComposition(residue.getElementalComposition(), count); } } } // Termini Map terminiComposition = getTerminiComposition(); if (CollectionUtil.hasValues(terminiComposition)) { for (Molecule terminus : terminiComposition.keySet()) { int count = terminiComposition.get(terminus); if (count > 0) { addElementalComposition(terminus.getElementalComposition(), count); } } } // X-Links? Map xLinkComposition = getXLinkComposition(); if (CollectionUtil.hasValues(xLinkComposition)) { for (Molecule xlink : xLinkComposition.keySet()) { int count = xLinkComposition.get(xlink); if (count > 0) { addElementalComposition(xlink.getElementalComposition(), count); } } } } //-------------------------------------------------------------------------- protected InputStream getReverseSequenceStream() { InputStream seqStream = null; if (mUncompressedSequence != null) { seqStream = new ByteArrayInputStream(new StringBuilder(mUncompressedSequence).reverse().toString().getBytes()); } else if (CollectionUtil.hasValues(mCompressedSeqChunks)) { seqStream = new SeqStreamer(Direction.REVERSE); } return seqStream; } //########################################################################## // INNER CLASSES //########################################################################## private enum Direction { FORWARD, REVERSE; } private class SeqStreamer extends InputStream { private Direction mDirection; private String mCurrentChunk; private int mCurrentChunkIndex; private int mCharIndex; private boolean mDone = false; //----------------------------------------------------------------------- public SeqStreamer(Direction inDirection) { mDirection = inDirection; mCurrentChunkIndex = (mDirection == Direction.FORWARD ? 0 : mCompressedSeqChunks.size() - 1); } //----------------------------------------------------------------------- public int read() { return (mDone ? -1 : getNextChar()); } //----------------------------------------------------------------------- private char getNextChar() { if (null == mCurrentChunk) { if (mDirection == Direction.FORWARD) { mCurrentChunk = GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex)); } else { mCurrentChunk = new StringBuilder(GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex))).reverse().toString(); } mCharIndex = 0; } char nextChar = mCurrentChunk.charAt(mCharIndex++); if (mCharIndex >= mCurrentChunk.length()) { // This is the last char in this chunk. mCurrentChunk = null; mCurrentChunkIndex = mCurrentChunkIndex + (mDirection == Direction.FORWARD ? 1 : - 1); if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedSeqChunks.size()) { // This was the last chunk. mDone = true; } } return nextChar; } } public class SeqFilterReader extends FilterReader { private char[] mBuffer = new char[8196]; private int mBufferLimit; private int mBufferIndex; private boolean mEndOfStreamReached; //--------------------------------------------------------------------------- public SeqFilterReader(Reader inReader) { super(inReader); } //--------------------------------------------------------------------------- public int read() throws IOException { int returnChar; /* do { if (mBufferIndex >= mBufferLimit) { fillBuffer(); } returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); } while (returnChar >= 0 && returnChar != '*' && returnChar != '-' && (returnChar < 65 || returnChar > 90) // Not upper case letter && (returnChar < 97 || returnChar > 122) // Not lower case letter ); */ while (true) { if (mBufferIndex >= mBufferLimit) { fillBuffer(); } returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); if ((returnChar > 64 && returnChar < 91) // Upper-case letter || (returnChar > 96 && returnChar < 123) // Lower-case letter || returnChar == '-' // Gap || returnChar == '*' // Stop codon || returnChar < 0) // End of stream { break; } }; return returnChar; } //--------------------------------------------------------------------------- @Override public int read(char[] inBuffer, int inOffset, int inMaxReadLength) throws IOException { int theChar; int numCharsRead = 0; do { while (true) { if (mBufferIndex >= mBufferLimit) { fillBuffer(); } theChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); if ((theChar > 64 && theChar < 91) // Upper-case letter || (theChar > 96 && theChar < 123) // Lower-case letter || theChar == '-' // Gap || theChar == '*' // Stop codon || theChar < 0) // End of stream { break; } }; if (theChar > 0) { inBuffer[inOffset++] = (char) theChar; numCharsRead++; } } while (theChar >= 0 && numCharsRead < inMaxReadLength); return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead); } //--------------------------------------------------------------------------- private void fillBuffer() throws IOException { mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length); if (-1 == mBufferLimit) { mEndOfStreamReached = true; } // Reset the index mBufferIndex = 0; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy