com.hfg.bio.seq.ProteinDigest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.hfg.bio.AminoAcidSet;
import com.hfg.bio.CTerminalGroup;
import com.hfg.bio.DigestFragment;
import com.hfg.bio.DigestSettings;
import com.hfg.bio.NTerminalGroup;
import com.hfg.bio.Protease;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
//------------------------------------------------------------------------------
/**
Performs a digestion of a protein via protease(s).
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class ProteinDigest
{
private DigestSettings mSettings;
private Protein mProtein;
private Set mProteases;
private static final int NUM_N_TERMINAL_POSITIONS = 6;
private static final int NUM_C_TERMINAL_POSITIONS = 3;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public ProteinDigest(Protein inProtein, DigestSettings inSettings)
{
mProtein = inProtein;
mSettings = inSettings;
mProteases = inSettings.getProteases();
if (! CollectionUtil.hasValues(mProteases))
{
throw new RuntimeException("No protease(s) specified!");
}
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public DigestSettings getSettings()
{
return mSettings;
}
//---------------------------------------------------------------------------
public Protein getProtein()
{
return mProtein;
}
//---------------------------------------------------------------------------
public List getFragments()
{
List outFrags;
DigestSettings settings = getSettings();
if (settings.getAlkylatedCys() != null)
{
Protein proteinCopy = getProtein().clone();
proteinCopy.removeXLinks(ProteinXLinkType.DISULFIDE);
// Reflect that the cysteines in the protein are alkylated.
AminoAcidSet aaSet = new AminoAcidSet(getProtein().getAminoAcidSet());
aaSet.setMapping('c', settings.getAlkylatedCys());
aaSet.setMapping('C', settings.getAlkylatedCys());
proteinCopy.setAminoAcidSet(aaSet);
// Are there other types of x-links?
if (CollectionUtil.hasValues(proteinCopy.getXLinks()))
{
outFrags = complexDigestion(proteinCopy, settings);
}
else
{
outFrags = simpleDigestion(proteinCopy, settings);
}
}
else
{
// Native digest. Deal with X-links
outFrags = complexDigestion(getProtein(), settings);
}
/* TODO: What was this for?
Set fragSet = new OrderedSet<>(outFrags);
outFrags.clear();
outFrags.addAll(fragSet);
*/
if (settings.getUniqueSeqs())
{
reduceToUniqueFragments(outFrags);
}
return outFrags;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//--------------------------------------------------------------------------
public boolean isCleavageSite(CharSequence inNTerminalResidues, CharSequence inCTerminalResidues)
{
boolean isCleavageSite = false;
for (Protease protease : mProteases)
{
isCleavageSite = protease.isCleavageSite(inNTerminalResidues, inCTerminalResidues);
if (isCleavageSite) break;
}
return isCleavageSite;
}
//--------------------------------------------------------------------------
private void addPrecedingAndTrailingResidues(List inFragments, Protein inProtein)
{
for (DigestFragment fragment : inFragments)
{
int position = fragment.getBegin() - 1;
if (position > 0)
{
fragment.setPrecedingResidue(inProtein.residueAt(position));
}
position = fragment.getEnd() + 1;
if (position <= inProtein.length())
{
fragment.setTrailingResidue(inProtein.residueAt(position));
}
}
}
//--------------------------------------------------------------------------
private List simpleDigestion(Protein inProtein, DigestSettings inSettings)
{
List outFrags = new ArrayList<>();
if (CollectionUtil.hasValues(inProtein.getChains()))
{
for (Protein chain : inProtein.getChains())
{
outFrags.addAll(simpleDigestion(chain, inSettings));
}
}
else
{
AminoAcidSet aaSet;
if (inSettings.getAlkylatedCys() != null)
{
aaSet = new AminoAcidSet(inProtein.getAminoAcidSet());
aaSet.setMapping('c', inSettings.getAlkylatedCys());
aaSet.setMapping('C', inSettings.getAlkylatedCys());
}
else
{
aaSet = inProtein.getAminoAcidSet().clone();
}
// We don't want for custom N- or C-terminal groups to get used on anything but the terminal peptides
NTerminalGroup nTerminalGroup = inProtein.getAminoAcidSet().getNTerminalGroup();
if (! nTerminalGroup.equals(NTerminalGroup.UNMODIFIED_N_TERMINUS))
{
aaSet.setNTerminalGroup(NTerminalGroup.UNMODIFIED_N_TERMINUS);
}
CTerminalGroup cTerminalGroup = inProtein.getAminoAcidSet().getCTerminalGroup();
if (! cTerminalGroup.equals(CTerminalGroup.UNMODIFIED_C_TERMINUS))
{
aaSet.setCTerminalGroup(CTerminalGroup.UNMODIFIED_C_TERMINUS);
}
try
{
Reader seqReader = null;
try
{
seqReader = inProtein.getSequenceReader();
SlidingWindow fragWindow = new SlidingWindow(inProtein.getID(), aaSet, inSettings);
int bufferSize = NUM_N_TERMINAL_POSITIONS + NUM_C_TERMINAL_POSITIONS;
int[] site = new int[bufferSize];
for (int i = 0; i < bufferSize; i++)
{
site[i] = -1;
}
StringBuilder nTerminalResidues = new StringBuilder();
StringBuilder cTerminalResidues = new StringBuilder();
int residue = seqReader.read();
if (residue != -1)
{
StringBuilder frag = new StringBuilder((char) residue + "");
site[bufferSize - 1] = residue; // Drops in on the right side and slides left
while ((residue = seqReader.read()) != -1)
{
for (int i = 0; i < bufferSize - 1; i++)
{
site[i] = site[i + 1];
}
site[bufferSize - 1] = residue;
if (-1 == site[NUM_N_TERMINAL_POSITIONS - 1])
{
continue;
}
nTerminalResidues.setLength(0);
for (int i = 0; i < NUM_N_TERMINAL_POSITIONS; i++)
{
nTerminalResidues.append((char) site[i]);
}
cTerminalResidues.setLength(0);
for (int i = NUM_N_TERMINAL_POSITIONS; i < bufferSize; i++)
{
cTerminalResidues.append((char) site[i]);
}
if (isCleavageSite(nTerminalResidues, cTerminalResidues))
{
List fragments = fragWindow.push(frag.toString());
if (fragments != null)
{
outFrags.addAll(fragments);
}
frag.setLength(0);
frag.append((char)site[NUM_N_TERMINAL_POSITIONS]);
}
else
{
frag.append((char)site[NUM_N_TERMINAL_POSITIONS]);
}
}
// Process residues remaining in the site buffer
while (hasUnprocessedBufferContent(site))
{
for (int i = 0; i < bufferSize - 1; i++)
{
site[i] = site[i + 1];
}
site[bufferSize - 1] = -1;
if (-1 == site[NUM_N_TERMINAL_POSITIONS - 1]
|| -1 == site[NUM_N_TERMINAL_POSITIONS])
{
continue;
}
nTerminalResidues.setLength(0);
for (int i = 0; i < NUM_N_TERMINAL_POSITIONS; i++)
{
int aa = site[i];
nTerminalResidues.append(aa != -1 ? (char) aa : "");
}
cTerminalResidues.setLength(0);
for (int i = NUM_N_TERMINAL_POSITIONS; i < bufferSize; i++)
{
int aa = site[i];
cTerminalResidues.append(aa != -1 ? (char) aa : "");
}
if (isCleavageSite(nTerminalResidues, cTerminalResidues))
{
List fragments = fragWindow.push(frag.toString());
if (fragments != null)
{
outFrags.addAll(fragments);
}
frag.setLength(0);
frag.append((char)site[NUM_N_TERMINAL_POSITIONS]);
}
else
{
frag.append((char)site[NUM_N_TERMINAL_POSITIONS]);
}
}
List fragments = fragWindow.lastPush(frag.toString());
if (fragments != null)
{
outFrags.addAll(fragments);
}
}
addPrecedingAndTrailingResidues(outFrags, inProtein);
// Restore N-terminal group
if (! nTerminalGroup.equals(NTerminalGroup.UNMODIFIED_N_TERMINUS))
{
for (DigestFragment frag : outFrags)
{
if (1 == frag.getBegin())
{
frag.setAminoAcidSet(frag.getAminoAcidSet().clone().setNTerminalGroup(nTerminalGroup));
}
}
}
// Restore C-terminal group
if (! cTerminalGroup.equals(CTerminalGroup.UNMODIFIED_C_TERMINUS))
{
for (DigestFragment frag : outFrags)
{
if (inProtein.length() == frag.getEnd())
{
frag.setAminoAcidSet(frag.getAminoAcidSet().clone().setCTerminalGroup(cTerminalGroup));
}
}
}
}
finally
{
if (seqReader != null) seqReader.close();
}
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
// Apply the DigestSettings limits.
for (int i = 0; i < outFrags.size(); i++)
{
if (! inSettings.meetsCriteria(outFrags.get(i)))
{
outFrags.remove(i--);
}
}
return outFrags;
}
//--------------------------------------------------------------------------
private static boolean hasUnprocessedBufferContent(int[] inBuffer)
{
boolean hasUnprocessedContent = false;
for (int i = NUM_N_TERMINAL_POSITIONS - 1; i < NUM_N_TERMINAL_POSITIONS + NUM_C_TERMINAL_POSITIONS; i++)
{
if (inBuffer[i] != -1)
{
hasUnprocessedContent = true;
break;
}
}
return hasUnprocessedContent;
}
//--------------------------------------------------------------------------
private List complexDigestion(Protein inProtein, DigestSettings inSettings)
{
// Initially disable limits when finding fragments.
// We'll apply the desired settings after all the fragments have been linked up.
DigestSettings settingsWithoutLimits = inSettings.clone();
settingsWithoutLimits.setMinFragmentLength(null);
settingsWithoutLimits.setMaxFragmentLength(null);
settingsWithoutLimits.setMinFragmentMass(null);
settingsWithoutLimits.setMaxFragmentMass(null);
List rawFrags = simpleDigestion(inProtein, settingsWithoutLimits);
if (CollectionUtil.hasValues(inProtein.getXLinks()))
{
// For ea. x-link, bind it to the combinations of raw fragments
List orderedXLinks = new ArrayList<>(inProtein.getXLinks());
Collections.sort(orderedXLinks);
for (ProteinXLink xlink : orderedXLinks)
{
List donorFragIndexes = new ArrayList<>(5);
List acceptorFragIndexes = new ArrayList<>(5);
Set fragsToAdd = new HashSet<>(10);
Set fragsIndexesToRemove = new HashSet<>(10);
for (int fragIdx = 0; fragIdx < rawFrags.size(); fragIdx++)
{
DigestFragment frag = rawFrags.get(fragIdx);
boolean hasDonorSite = containsDonorSite(frag, xlink);
boolean hasAcceptorSite = containsAcceptorSite(frag, xlink);
if (hasDonorSite && hasAcceptorSite)
{
// This X-link is fully contained within this fragment
DigestFragment linkedFrag;
if (CollectionUtil.hasValues(frag.getChains()))
{
linkedFrag = frag;
}
else
{
linkedFrag = new DigestFragment();
linkedFrag.addChain(frag.clone());
// Add it back to the pool
fragsToAdd.add(linkedFrag);
// Flag the original unlinked frag for removal
fragsIndexesToRemove.add(fragIdx);
}
String donorFragId = getFragIdForDonorSite(linkedFrag, xlink);
String acceptorFragId = getFragIdForAcceptorSite(linkedFrag, xlink);
linkedFrag.addXLink(xlink.clone()
.setDonorChainId(donorFragId)
.setAcceptorChainId(acceptorFragId));
}
else
{
if (hasDonorSite) donorFragIndexes.add(fragIdx);
if (hasAcceptorSite) acceptorFragIndexes.add(fragIdx);
}
}
// Link the donors & acceptors in all possible combinations
for (int donorFragIndex : donorFragIndexes)
{
DigestFragment donorFrag = rawFrags.get(donorFragIndex);
String donorFragId = null;
if (CollectionUtil.hasValues(donorFrag.getChains()))
{
donorFragId = getFragIdForDonorSite(donorFrag, xlink);
}
else
{
donorFragId = donorFrag.getID();
}
for (int acceptorFragIndex : acceptorFragIndexes)
{
DigestFragment acceptorFrag = rawFrags.get(acceptorFragIndex);
String acceptorFragId = null;
DigestFragment linkedFrag;
if (CollectionUtil.hasValues(donorFrag.getChains()))
{
linkedFrag = donorFrag.clone();
}
else
{
linkedFrag = new DigestFragment();
donorFrag = (DigestFragment) donorFrag.clone()
.setID(donorFrag.getID() + ":" + donorFrag.getBeginFragIndex() + (donorFrag.getEndFragIndex() == donorFrag.getBeginFragIndex() ? "" : ".." + donorFrag.getEndFragIndex()));
donorFragId = donorFrag.getID();
linkedFrag.addChain(donorFrag);
}
// Add the acceptor chain (if it isn't already present)
acceptorFragId = getFragIdForAcceptorSite(linkedFrag, xlink);
if (! StringUtil.isSet(acceptorFragId))
{
acceptorFrag = (DigestFragment) acceptorFrag.clone()
.setID(acceptorFrag.getID() + ":" + acceptorFrag.getBeginFragIndex() + (acceptorFrag.getEndFragIndex() == acceptorFrag.getBeginFragIndex() ? "" : ".." + acceptorFrag.getEndFragIndex()));
acceptorFragId = getFragIdForAcceptorSite(acceptorFrag, xlink);
if (acceptorFrag.hasChains())
{
linkedFrag.addChains(acceptorFrag.getChains());
if (CollectionUtil.hasValues(acceptorFrag.getXLinks()))
{
for (ProteinXLink acceptorXlink : acceptorFrag.getXLinks())
{
linkedFrag.addXLink(acceptorXlink);
}
}
}
else
{
linkedFrag.addChain(acceptorFrag);
}
}
linkedFrag.addXLink(xlink.clone()
.setDonorChainId(donorFragId)
.setAcceptorChainId(acceptorFragId));
// Add it back to the pool
rawFrags.add(linkedFrag);
}
}
// Now remove the raw frags that were linked.
// We have to remove them in reverse order for all the indices to be valid
fragsIndexesToRemove.addAll(donorFragIndexes);
fragsIndexesToRemove.addAll(acceptorFragIndexes);
if (CollectionUtil.hasValues(fragsIndexesToRemove))
{
List orderedFragIndices = new ArrayList<>(fragsIndexesToRemove);
Collections.sort(orderedFragIndices);
Collections.reverse(orderedFragIndices);
for (int fragIndex : orderedFragIndices)
{
rawFrags.remove(fragIndex);
}
}
rawFrags.addAll(fragsToAdd);
}
}
// Apply the DigestSettings limits.
for (int i = 0; i < rawFrags.size(); i++)
{
if (! inSettings.meetsCriteria(rawFrags.get(i)))
{
rawFrags.remove(i--);
}
}
return rawFrags;
}
//--------------------------------------------------------------------------
private boolean containsDonorSite(DigestFragment inFrag, ProteinXLink inXLink)
{
DigestFragment donorFrag = getFragWithSite(inFrag, new SeqLocation(inXLink.getDonorPosition(), inXLink.getDonorPosition()).setChainId(inXLink.getDonorChainId()));
return (donorFrag != null);
}
//--------------------------------------------------------------------------
private boolean containsAcceptorSite(DigestFragment inFrag, ProteinXLink inXLink)
{
DigestFragment acceptorFrag = getFragWithSite(inFrag, new SeqLocation(inXLink.getAcceptorPosition(), inXLink.getAcceptorPosition()).setChainId(inXLink.getAcceptorChainId()));
return (acceptorFrag != null);
}
//--------------------------------------------------------------------------
private String getFragIdForDonorSite(DigestFragment inFrag, ProteinXLink inXLink)
{
DigestFragment donorFrag = getFragWithSite(inFrag, new SeqLocation(inXLink.getDonorPosition(), inXLink.getDonorPosition()).setChainId(inXLink.getDonorChainId()));
return (donorFrag != null ? donorFrag.getID() : null);
}
//--------------------------------------------------------------------------
private String getFragIdForAcceptorSite(DigestFragment inFrag, ProteinXLink inXLink)
{
DigestFragment acceptorFrag = getFragWithSite(inFrag, new SeqLocation(inXLink.getAcceptorPosition(), inXLink.getAcceptorPosition()).setChainId(inXLink.getAcceptorChainId()));
return (acceptorFrag != null ? acceptorFrag.getID() : null);
}
//--------------------------------------------------------------------------
// Recursive function needed to unroll complex fragments.
private DigestFragment getFragWithSite(DigestFragment inFrag, SeqLocation inLoc)
{
DigestFragment fragWithSite = null;
if (CollectionUtil.hasValues(inFrag.getChains()))
{
for (DigestFragment fragChain : (List) (Object) inFrag.getChains())
{
fragWithSite = getFragWithSite(fragChain, inLoc);
if (fragWithSite != null)
{
break;
}
}
}
else if (inFrag.getSrcChainId().equals(inLoc.getChainId())
&& inLoc.getStart() >= inFrag.getBegin()
&& inLoc.getStart() <= inFrag.getEnd())
{
fragWithSite = inFrag;
}
return fragWithSite;
}
//--------------------------------------------------------------------------
// TODO: Get this to work for complex digests.
private static void reduceToUniqueFragments(List inFrags)
{
Map uniqueMap = new HashMap<>(inFrags.size());
for (int i = 0; i < inFrags.size(); i++)
{
DigestFragment frag = inFrags.get(i);
String seqString;
if (frag.hasChains())
{
StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("/");
for (DigestFragment chain : (List) (Object) frag.getChains())
{
buffer.delimitedAppend(chain.getSequence());
}
seqString = buffer.toString();
}
else
{
seqString = frag.getSequence();
}
DigestFragment firstFrag = uniqueMap.get(seqString);
if (null == firstFrag)
{
uniqueMap.put(seqString, frag);
}
else
{
firstFrag.addIdenticalFragment(frag);
inFrags.remove(i--);
}
}
}
//##########################################################################
// INNER CLASS
//##########################################################################
protected class SlidingWindow
{
private String mChainId;
private AminoAcidSet mAminoAcidSet;
private StringBuilder[] mFrags;
private DigestSettings mDigestSettings;
private int mIndex = 1;
private int mLength;
private int mCurrentFragIndex = 0;
//-----------------------------------------------------------------------
public SlidingWindow(String inChainId, AminoAcidSet inAASet, DigestSettings inSettings)
{
mChainId = inChainId;
mAminoAcidSet = inAASet;
mDigestSettings = inSettings;
mFrags = new StringBuilder[inSettings.getMaxMissedCleavages() + 3];
mFrags[0] = new StringBuilder();
mFrags[1] = new StringBuilder();
mLength = 2;
}
//-----------------------------------------------------------------------
public List push(String inFrag)
{
List outFrags = null;
if (mLength < mFrags.length)
{
// Still filling the window
mFrags[mLength++] = new StringBuilder(inFrag);
}
else
{
StringBuilder tmp = mFrags[0];
for (int i = 1; i < mFrags.length; i++)
{
mFrags[i - 1] = mFrags[i];
}
mFrags[mFrags.length - 1] = tmp;
mFrags[mFrags.length - 1].setLength(0);
mFrags[mFrags.length - 1].append(inFrag);
mIndex += mFrags[0].length();
mCurrentFragIndex++;
List frags = evaluateCurrentFrag();
if (frags != null)
{
outFrags = new ArrayList<>(frags);
}
}
return outFrags;
}
//-----------------------------------------------------------------------
public List lastPush(String inFrag)
{
List outFrags = null;
List frags = push(inFrag);
if (frags != null)
{
outFrags = new ArrayList<>(frags);
}
for (int i = 0; i <= mDigestSettings.getMaxMissedCleavages(); i++)
{
frags = push("");
if (frags != null)
{
if (null == outFrags)
{
outFrags = new ArrayList<>(frags.size());
}
outFrags.addAll(frags);
}
}
return outFrags;
}
//-----------------------------------------------------------------------
private List evaluateCurrentFrag()
{
List fragments = null;
int maxMissedCleavages = 0;
if (mDigestSettings != null
&& mDigestSettings.getMaxMissedCleavages() != null)
{
maxMissedCleavages = mDigestSettings.getMaxMissedCleavages();
}
if (null == mDigestSettings
|| null == mDigestSettings.getMaxFragmentLength()
|| mFrags[1].length() <= mDigestSettings.getMaxFragmentLength())
{
StringBuilder frag = new StringBuilder();
for (int i = 1; i < mFrags.length - 1; i++)
{
if (mFrags[i].length() == 0) break;
frag.append(mFrags[i]);
if (null == mDigestSettings
|| null == mDigestSettings.getMinFragmentLength()
|| frag.length() >= mDigestSettings.getMinFragmentLength())
{
// Don't get too big
if (mDigestSettings != null
&& mDigestSettings.getMaxFragmentLength() != null
&& frag.length() > mDigestSettings.getMaxFragmentLength())
{
break;
}
DigestFragment digestFrag = allocateNewDigestFragment();
digestFrag.setSequence(frag.toString());
digestFrag.setBegin(mIndex);
digestFrag.setEnd(mIndex + frag.length() - 1);
digestFrag.setNumUncleavedSites(i - 1);
digestFrag.setBeginFragIndex(mCurrentFragIndex);
digestFrag.setEndFragIndex(mCurrentFragIndex + (i - 1));
if (null == fragments)
{
fragments = new ArrayList<>(maxMissedCleavages);
}
fragments.add(digestFrag);
}
}
}
return fragments;
}
//-----------------------------------------------------------------------
private DigestFragment allocateNewDigestFragment()
{
DigestFragment frag = new DigestFragment();
frag.setID(mChainId);
frag.setSrcChainId(mChainId);
frag.setAminoAcidSet(mAminoAcidSet);
return frag;
}
}
}