com.hfg.bio.seq.Protein Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import java.io.Reader;
import java.math.BigDecimal;
import java.math.MathContext;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import com.hfg.bio.*;
import com.hfg.bio.glyco.Glycan;
import com.hfg.bio.proteinproperty.IsoelectricPoint;
import com.hfg.bio.proteinproperty.ProteinAnalysisMode;
import com.hfg.bio.proteinproperty.ReducedAnalysisMode;
import com.hfg.bio.proteinproperty.SimpleProteinPropertyCalcSettings;
import com.hfg.bio.taxonomy.NCBITaxon;
import com.hfg.chem.Element;
import com.hfg.chem.IonizableGroup;
import com.hfg.chem.Molecule;
import com.hfg.chem.OrganicMatterImpl;
import com.hfg.util.ChecksumUtil;
import com.hfg.util.CompareUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedMap;
import com.hfg.xml.XMLNode;
import com.hfg.xml.XMLTag;
//------------------------------------------------------------------------------
/**
Biological protein sequence.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class Protein extends BioSequencePlusImpl
{
//##########################################################################
// PRIVATE FIELDS
//##########################################################################
// Amino acid set used to interpret residues; resolved lazily by getAminoAcidSet().
private AminoAcidSet mAASet;
// Properties
// Cached amino acid composition; built on first use by getAminoAcidComposition().
private AminoAcidComposition mAAComposition;
// NOTE(review): the two isoelectric point fields are not referenced in the visible part of the file — presumably a cache; confirm.
private Float mIsoelectricPoint;
private KaSet mIsoelectricPointKaSet;
// Cached results of getExtinctionCoeff() and getPercentExtinctionCoeff().
private Integer mExtinctionCoeff;
private Float mPercentExtinctionCoeff;
// Child chains. A Protein holds either chains or a sequence, never both (enforced by the setters).
private List mChains;
// Explicit disulfide bond count for this level; only used when no cross-links are defined.
private Integer mNumDisulfideBonds;
// Cross-links added via addXLink().
private Set mXLinks;
// Glycans attached directly to this Protein (chain glycans are gathered by getGlycans()).
private List mGlycans;
// The Protein that contains this one as a chain, or null.
private Protein mParent;
// Chains keyed by id; setID() and addChain() consult the map of the top-level Protein.
private Map mChainIdMap = new OrderedMap<>(4);
// Matches an underscore followed by a single word character at the end of a string.
private static Pattern sChainIdPattern = Pattern.compile("_(\\w)$");
//##########################################################################
// CONSTRUCTORS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Default constructor. Performs no initialization beyond the field defaults.
 */
public Protein()
{
   super();
}
//--------------------------------------------------------------------------
/**
 Constructs a Protein from its XML representation.
 Reads the optional amino acid set subtag, the optional chains subtag (each
 child is built recursively as a Protein chain), the optional disulfide count
 attribute and the optional cross-links subtag.
 @param inXML the XML node to read; its tag name must equal HfgBioXML.HFGBIOSEQ_TAG
 @throws RuntimeException if the tag name is not HfgBioXML.HFGBIOSEQ_TAG
 */
public Protein(XMLNode inXML)
{
   super(inXML);

   if (! inXML.getTagName().equals(HfgBioXML.HFGBIOSEQ_TAG))
   {
      throw new RuntimeException("Cannot construct an " + this.getClass().getSimpleName() + " from a " + inXML.getTagName() + " tag!");
   }

   XMLNode aminoAcidSetTag = inXML.getOptionalSubtagByName(HfgBioXML.AASET_TAG);
   if (aminoAcidSetTag != null)
   {
      mAASet = AminoAcidSet.instantiate(aminoAcidSetTag);
   }

   XMLNode chainsTag = inXML.getOptionalSubtagByName(HfgBioXML.CHAINS_TAG);
   if (chainsTag != null)
   {
      for (XMLNode subtag : chainsTag.getXMLNodeSubtags())
      {
         addChain(new Protein(subtag));
      }
   }

   // This needs to be set AFTER the chains have been added
   // (setNumDisulfideBonds() validates against the cysteine count).
   if (inXML.hasAttribute(HfgBioXML.DISULFIDE_CNT_ATT))
   {
      setNumDisulfideBonds(Integer.parseInt(inXML.getAttributeValue(HfgBioXML.DISULFIDE_CNT_ATT)));
   }

   XMLNode xlinksTag = inXML.getOptionalSubtagByName(HfgBioXML.XLINKS_TAG);
   if (xlinksTag != null)
   {
      // Iterate the children of the cross-links tag. The chains tag may be absent
      // and, when present, holds chain tags rather than cross-link tags.
      for (XMLNode subtag : xlinksTag.getXMLNodeSubtags())
      {
         addXLink(new ProteinXLink(subtag));
      }
   }
}
//##########################################################################
// PUBLIC METHODS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Identifies the kind of biological sequence this class represents.
 @return BioSequenceType.PROTEIN
 */
@Override
public BioSequenceType getType()
{
   return BioSequenceType.PROTEIN;
}
//--------------------------------------------------------------------------
/**
 Creates a copy of this Protein. The amino acid composition, the chains and the
 cross-links are each cloned individually; everything else is whatever
 super.clone() produces.
 Note that clearCalculatedProperties() is invoked on this (the original) object first.
 @return the cloned Protein
 */
@Override
public Protein clone()
{
   clearCalculatedProperties();

   Protein theClone = (Protein) super.clone();

   if (mAAComposition != null)
   {
      theClone.mAAComposition = mAAComposition.clone();
   }

   if (mChains != null)
   {
      theClone.mChains = new ArrayList<>(mChains.size());
      // Use the same map type as the field initializer so that the clone's chain id
      // map behaves like the original's (a plain HashMap was used here before).
      theClone.mChainIdMap = new OrderedMap<>(Math.max(mChains.size(), 4));
      for (Protein chain : mChains)
      {
         // NOTE(review): the cloned chain keeps whatever mParent super.clone() copied —
         // presumably still the original parent; confirm whether it should be re-parented.
         Protein chainClone = chain.clone();
         theClone.mChains.add(chainClone);
         theClone.mChainIdMap.put(chainClone.getID(), chainClone);
      }
   }

   if (mXLinks != null)
   {
      theClone.mXLinks = new HashSet<>(mXLinks.size());
      for (ProteinXLink xlink : mXLinks)
      {
         theClone.mXLinks.add(xlink.clone());
      }
   }

   return theClone;
}
//---------------------------------------------------------------------------
/**
 The hashcode is based on the sequence (via its MD5 checksum) and not the id.
 @return a hash of the MD5 digest bytes, or 0 if no checksum is available
 */
@Override
public int hashCode()
{
   byte[] md5 = getMD5Checksum();
   // Hash the raw digest bytes. Decoding them with new String(byte[]) depends on the
   // platform charset and replaces undecodable bytes, which throws information away.
   return md5 != null ? Arrays.hashCode(md5) : 0;
}
//---------------------------------------------------------------------------
/**
 Orders Proteins by total length, then by number of chains, then by the
 sequence data string.
 @param inObj2 the object to compare against
 @return 0 for the same instance; -1 for null or non-Protein arguments;
 otherwise the result of the first comparison that is non-zero (or 0 if all tie)
 */
@Override
public int compareTo(Object inObj2)
{
   if (this == inObj2)
   {
      return 0;
   }

   // instanceof is false for null, so this also covers the null case.
   if (! (inObj2 instanceof Protein))
   {
      return -1;
   }

   Protein other = (Protein) inObj2;

   // First compare the lengths
   int outcome = CompareUtil.compare(length(), other.length());
   if (outcome != 0)
   {
      return outcome;
   }

   // Second compare the number of chains
   outcome = CompareUtil.compare(getNumChains(), other.getNumChains());
   if (outcome != 0)
   {
      return outcome;
   }

   // Third compare the sequences themselves. (A checksum could be used first
   // but that causes calculation of the seq data string anyway.)
   return CompareUtil.compare(getSeqDataString(), other.getSeqDataString());
}
//--------------------------------------------------------------------------
/**
 Returns the MD5 checksum for this Protein.
 @return the superclass checksum if a sequence is present; otherwise the MD5 of
 the sequence data string if there are chains; otherwise null
 */
@Override
public byte[] getMD5Checksum()
{
   if (getSequence() != null)
   {
      return super.getMD5Checksum();
   }

   if (CollectionUtil.hasValues(getChains()))
   {
      return ChecksumUtil.calculateMD5(getSeqDataString());
   }

   return null;
}
//--------------------------------------------------------------------------
/**
 Returns the SHA-1 checksum for this Protein.
 @return the superclass checksum if a sequence is present; otherwise the SHA-1 of
 the sequence data string if there are chains; otherwise null
 */
@Override
public byte[] getSHA1Checksum()
{
   if (getSequence() != null)
   {
      return super.getSHA1Checksum();
   }

   if (CollectionUtil.hasValues(getChains()))
   {
      return ChecksumUtil.calculateSHA1(getSeqDataString());
   }

   return null;
}
//---------------------------------------------------------------------------
/**
 Recursively counts the chains in the protein. A Protein that holds a sequence
 counts as a single chain; one that holds chains contributes the sum of its
 chains' counts; one with neither counts as zero.
 @return the number of chains in the protein.
 */
public int getNumChains()
{
   if (getSequence() != null)
   {
      return 1;
   }

   int total = 0;
   if (CollectionUtil.hasValues(getChains()))
   {
      for (Protein chain : getChains())
      {
         total += chain.getNumChains();
      }
   }

   return total;
}
//--------------------------------------------------------------------------
/**
Specifies the protein's name / identifier.
When this Protein has a parent, the old id is removed from the top-level Protein's
chain id map and assignChainId() is called on the top-level Protein; if that call
returns an id different from the current one, the returned id is stored instead.
@param inValue the name / identifier for the protein
@return this Protein object to enable method chaining
*/
@Override
public Protein setID(String inValue)
{
// Remember the previous id so the change can be propagated and the old map entry removed.
String oldId = getID();
super.setID(inValue);
propogateIdChange(oldId, inValue);
if (mParent != null)
{
// Chain ids are tracked in the map of the top-level Protein, not the direct parent.
Protein topProtein = getTopProtein();
topProtein.mChainIdMap.remove(oldId);
String newId = topProtein.assignChainId(this);
// assignChainId() can hand back an id that differs from the requested one; adopt it if so.
// NOTE(review): nothing is adjusted when getID() is null — confirm that is intended.
if (getID() != null
&& !getID().equals(newId))
{
super.setID(newId);
// NOTE(review): oldId (rather than inValue) is passed although the first propagation already ran — confirm.
propogateIdChange(oldId, newId);
}
}
return this;
}
//--------------------------------------------------------------------------
/**
 Sets the description of the protein by delegating to the superclass.
 Overridden so that the call can be chained on a Protein reference.
 @param inValue the description of the protein
 @return this Protein object to enable method chaining
 */
@Override
public Protein setDescription(CharSequence inValue)
{
   super.setDescription(inValue);

   return this;
}
//--------------------------------------------------------------------------
/**
 Sets the protein's sequence. A Protein may hold either a sequence or other
 Protein objects as chains, never both.
 @param inValue the sequence of the protein
 @return the value returned by the superclass setter, cast to Protein
 @throws RuntimeException if this Protein already has chains
 */
@Override
public Protein setSequence(CharSequence inValue)
{
   boolean alreadyHasChains = CollectionUtil.hasValues(mChains);
   if (alreadyHasChains)
   {
      throw new RuntimeException("A Protein cannot have both chains and a sequence!");
   }

   Protein result = (Protein) super.setSequence(inValue);
   return result;
}
//--------------------------------------------------------------------------
/**
 Sets the NCBI taxon by delegating to the superclass.
 @param inValue the taxon to associate with this protein
 @return the value returned by the superclass setter, cast to Protein
 */
@Override
public Protein setNCBITaxon(NCBITaxon inValue)
{
   Protein result = (Protein) super.setNCBITaxon(inValue);
   return result;
}
//--------------------------------------------------------------------------
/**
 Sets the protein's sequence from a Reader. A Protein may hold either a sequence
 or other Protein objects as chains, never both.
 @param inReader the sequence of the protein specified via a Reader
 @return the value returned by the superclass setter, cast to Protein
 @throws RuntimeException if this Protein already has chains
 */
@Override
public Protein setSequence(Reader inReader)
{
   boolean alreadyHasChains = CollectionUtil.hasValues(mChains);
   if (alreadyHasChains)
   {
      throw new RuntimeException("A Protein cannot have both chains and a sequence!");
   }

   Protein result = (Protein) super.setSequence(inReader);
   return result;
}
//--------------------------------------------------------------------------
/**
 Replaces the protein's chains. The current chain list is discarded and each
 supplied chain is added through addChain(). A Protein may hold either a sequence
 or other Protein objects as chains, never both.
 @param inChains Protein objects that are chains of this Protein object
 @throws RuntimeException if this Protein already has a sequence
 */
public void setChains(Collection inChains)
{
   boolean hasSequence = (super.length() > 0);
   if (hasSequence)
   {
      throw new RuntimeException("A Protein cannot have both chains and a sequence!");
   }

   mChains = null;

   if (! CollectionUtil.hasValues(inChains))
   {
      return;
   }

   for (Protein chain : inChains)
   {
      addChain(chain);
   }
}
//--------------------------------------------------------------------------
/**
Adds a specified protein chain to this Protein object (with a stoichiometry of one).
A Protein can contain either a sequence or other Protein objects as chains but not both.
A chain that already has a parent is cloned and the clone is added instead.
A null chain is ignored.
@param inChain Protein object that is a chain of this Protein object
@throws RuntimeException if this Protein already has a sequence
*/
public void addChain(Protein inChain)
{
// super.length() is the length of this object's own sequence; the override in this class would sum the chains.
if (super.length() > 0)
{
throw new RuntimeException("A Protein cannot have both chains and a sequence!");
}
if (inChain != null)
{
if (null == mChains) mChains = new ArrayList<>(5);
// A chain that already belongs to a Protein is not shared; a clone is added in its place.
if (inChain.mParent != null)
{
inChain = inChain.clone();
}
mChains.add(inChain);
inChain.mParent = this;
clearCalculatedProperties();
// checkId() is defined outside the visible part of the file — presumably it registers / validates the chain's id; confirm.
inChain.checkId();
// If the added chain is itself composed of chains, any sub-chain that is already
// present in the top-level chain id map is cloned before being used.
if (CollectionUtil.hasValues(inChain.getChains()))
{
List newList = new ArrayList<>(inChain.getChains().size());
boolean duplicatesDetected = false;
for (Protein chain : inChain.getChains())
{
if (getTopProtein().mChainIdMap.values().contains(chain))
{
chain = chain.clone();
newList.add(chain);
duplicatesDetected = true;
}
else
{
newList.add(chain);
}
chain.checkId();
}
// Only rebuild the sub-chain list when at least one sub-chain was replaced by a clone.
if (duplicatesDetected) inChain.setChains(newList);
}
}
}
//--------------------------------------------------------------------------
/**
 Adds each of the specified protein chains to this Protein object (with a
 stoichiometry of one) by calling addChain() for every element.
 A Protein may hold either a sequence or other Protein objects as chains, never both.
 @param inChains Protein objects that are chains of this Protein object
 */
public void addChains(Collection inChains)
{
   if (! CollectionUtil.hasValues(inChains))
   {
      return;
   }

   // Not the most efficient way to do it, but it keeps things simple.
   for (Protein chain : inChains)
   {
      addChain(chain);
   }
}
//--------------------------------------------------------------------------
/**
Adds a specified number of copies of a specified protein chain to this Protein object.
(Ex. 2 heavy or light chains in an antibody.)
Every copy is a clone of inChain whose id is set to the base id followed by "_" and the
1-based copy number. The base id is the id of the first copy after it has been added.
A Protein can contain either a sequence or other Protein objects as chains but not both.
@param inChain Protein object that is a chain of this Protein object
@param inNumCopies the number of copies of the specified chain that should be added
*/
public void addChains(Protein inChain, int inNumCopies)
{
// Determined from the first copy once it has been added (see below).
String baseId = null;
for (int i = 0; i < inNumCopies; i++)
{
Protein chain = inChain.clone();
// Second and later copies: the base id is known, so number the copy before adding it.
if (baseId != null)
{
chain.setID(baseId + "_" + (i + 1));
}
addChain(chain);
// First copy: take the base id from the chain AFTER addChain() has run, then number it.
if (null == baseId)
{
baseId = chain.getID();
chain.setID(baseId + "_" + (i + 1));
}
}
}
//--------------------------------------------------------------------------
/**
 Reports whether this Protein is composed of chains.
 @return the result of CollectionUtil.hasValues() for the chain list
 */
public boolean hasChains()
{
   return CollectionUtil.hasValues(mChains);
}
//--------------------------------------------------------------------------
/**
 Returns the chains of this Protein.
 @return the internal chain list itself (not a copy); may be null
 */
public Collection getChains()
{
   return mChains;
}
//--------------------------------------------------------------------------
/**
 Looks up a chain by id in this Protein's chain id map.
 @param inChainId the id of the chain to return
 @return the value mapped to the specified id
 */
public Protein getChain(String inChainId)
{
   return mChainIdMap.get(inChainId);
}
//--------------------------------------------------------------------------
/**
 Collects the leaf chains of this Protein into a set, descending recursively
 into chains that are themselves composed of chains.
 @return a set of the leaf chains, or null if this Protein has no chains
 */
public Collection getDistinctChains()
{
   if (! CollectionUtil.hasValues(mChains))
   {
      return null;
   }

   Set<Protein> distinct = new HashSet<>(3);
   for (Protein chain : mChains)
   {
      if (! chain.hasChains())
      {
         distinct.add(chain);
         continue;
      }

      distinct.addAll(chain.getDistinctChains());
   }

   return distinct;
}
//--------------------------------------------------------------------------
/**
Returns chains of this Protein object organized into groups that are identical sequences.
@return the protein's chains grouped by sequence
*/
public Collection> getChainStoichiometryGroups()
{
Collection> stoichiometryGroups = new ArrayList<>();
if (CollectionUtil.hasValues(getChains()))
{
for (Protein chain : getChains())
{
boolean added = false;
for (Collection existingGroup : stoichiometryGroups)
{
Protein comparisonChain = existingGroup.iterator().next();
if (chain.length() == comparisonChain.length()
&& new String(chain.getMD5Checksum()).equals(new String(comparisonChain.getMD5Checksum()))
&& new String(chain.getSHA1Checksum()).equals(new String(comparisonChain.getSHA1Checksum())))
{
existingGroup.add(chain);
added = true;
break;
}
}
if (! added)
{
Collection newGroup = new ArrayList<>(5);
newGroup.add(chain);
stoichiometryGroups.add(newGroup);
}
}
}
return stoichiometryGroups;
}
//--------------------------------------------------------------------------
/**
 Returns the ids under which chains are registered in this Protein's chain id map.
 @return the key set of the chain id map, or null if the map is null
 */
public Set getChainIds()
{
   if (null == mChainIdMap)
   {
      return null;
   }

   return mChainIdMap.keySet();
}
//--------------------------------------------------------------------------
/**
 Returns the total sequence length. For a Protein composed of chains this is the
 sum of the chain lengths; otherwise it is the length reported by the superclass.
 @return the total sequence length
 */
@Override
public int length()
{
   if (! CollectionUtil.hasValues(mChains))
   {
      return super.length();
   }

   int total = 0;
   for (Protein chain : mChains)
   {
      total += chain.length();
   }

   return total;
}
//--------------------------------------------------------------------------
/**
 Sets the amino acid set for this Protein and, recursively, for all of its
 chains, then clears the calculated properties.
 @param inValue the amino acid set to use
 */
public void setAminoAcidSet(AminoAcidSet inValue)
{
   mAASet = inValue;

   boolean hasSubChains = CollectionUtil.hasValues(mChains);
   if (hasSubChains)
   {
      for (Protein subChain : mChains)
      {
         subChain.setAminoAcidSet(inValue);
      }
   }

   clearCalculatedProperties();
}
//--------------------------------------------------------------------------
/**
 Returns the amino acid set for this Protein. If none has been set, the set of the
 top-level Protein is used when this Protein has a parent; failing that,
 AminoAcidSet.STANDARD is used. The resolved value is stored for later calls.
 @return the amino acid set in effect for this Protein
 */
public AminoAcidSet getAminoAcidSet()
{
   if (mAASet != null)
   {
      return mAASet;
   }

   // Only chains (Proteins with a parent) inherit from the top-level Protein.
   Protein top = (getParent() != null ? getTopProtein() : null);
   if (top != null)
   {
      mAASet = top.getAminoAcidSet();
   }

   if (null == mAASet)
   {
      mAASet = AminoAcidSet.STANDARD;
   }

   return mAASet;
}
//--------------------------------------------------------------------------
/**
 Protein objects can be recursively composed of other Protein objects that
 represent chains or subunits. This accessor returns the Protein that directly
 contains this one.
 @return the containing Protein, or null if this Protein has no parent
 */
public Protein getParent()
{
   return mParent;
}
//--------------------------------------------------------------------------
/**
 Counts the gaps and the total gap length and stores both via setNumGaps() and
 setTotalGapLength(). Overridden in order to work with chains: for a Protein
 composed of chains the chain values are summed; otherwise GAP_PATTERN is matched
 against the sequence.
 */
@Override
protected void countGaps()
{
   int gapCount = 0;
   int gapLengthSum = 0;

   if (! CollectionUtil.hasValues(mChains))
   {
      Matcher gapMatcher = GAP_PATTERN.matcher(getSequence());
      while (gapMatcher.find())
      {
         gapCount++;
         gapLengthSum += gapMatcher.group().length();
      }
   }
   else
   {
      for (Protein chain : mChains)
      {
         gapCount += chain.getNumGaps();
         gapLengthSum += chain.getTotalGapLength();
      }
   }

   setNumGaps(gapCount);
   setTotalGapLength(gapLengthSum);
}
//--------------------------------------------------------------------------
/**
 Returns the amino acid composition of the protein including any subchains.
 The result is computed on first use and cached. Gap characters ("-") are
 ignored, and residues not found in the amino acid set are counted as
 AminoAcid.UNDEFINED.
 @return the amino acid composition
 */
public AminoAcidComposition getAminoAcidComposition()
{
   if (mAAComposition != null)
   {
      return mAAComposition;
   }

   AminoAcidComposition tally = new AminoAcidComposition();

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         tally.addAll(chain.getAminoAcidComposition());
      }
   }
   else
   {
      Map<String, Integer> residueCounts = getComposition();
      for (Map.Entry<String, Integer> entry : residueCounts.entrySet())
      {
         String residue = entry.getKey();
         if (residue.equals("-"))
         {
            // Ignore gaps
            continue;
         }

         AminoAcid aa = getAminoAcidSet().getAA(residue.charAt(0));
         if (null == aa)
         {
            aa = AminoAcid.UNDEFINED;
         }

         tally.increment(aa, entry.getValue());
      }
   }

   mAAComposition = tally;

   return mAAComposition;
}
//--------------------------------------------------------------------------
/**
 Returns the elemental composition map adjusted (if necessary) for the specified
 protein analysis mode. For any mode that is not a ReducedAnalysisMode the
 superclass map is returned unchanged. For a ReducedAnalysisMode:
 the current cysteine form is swapped for the mode's alkylated cysteine (when one
 is defined), and two hydrogens are added per disulfide bond - counted from the
 cross-links if any exist, otherwise from the explicit disulfide count.
 (Per the original documentation the values are Floats rather than Integers
 because some amino acid codes such as B and Z are ambiguous averages.)
 @param inMode the analysis mode conditions to apply
 @return the elemental composition map
 */
public Map getElementalComposition(ProteinAnalysisMode inMode)
{
   Map composition = super.getElementalComposition();

   if (! (inMode instanceof ReducedAnalysisMode))
   {
      return composition;
   }

   ReducedAnalysisMode reducedMode = (ReducedAnalysisMode) inMode;
   OrganicMatterImpl organicMatter = new OrganicMatterImpl(composition);

   AminoAcid alkylatedCys = reducedMode.getAlkylatedCysteine();
   if (alkylatedCys != null)
   {
      AminoAcid currentCysForm = getAminoAcidSet().getAA('C');
      organicMatter.remove(currentCysForm, getAminoAcidComposition().get(currentCysForm));
      organicMatter.add(alkylatedCys, getAminoAcidComposition().get(currentCysForm));
   }

   if (! CollectionUtil.hasValues(mXLinks))
   {
      if (mNumDisulfideBonds != null)
      {
         organicMatter.add(Element.HYDROGEN, 2 * mNumDisulfideBonds);
      }
   }
   else
   {
      for (ProteinXLink xlink : mXLinks)
      {
         if (xlink.getType() == ProteinXLinkType.DISULFIDE)
         {
            organicMatter.add(Element.HYDROGEN, 2);
         }
      }
   }

   return organicMatter.getElementalComposition();
}
//--------------------------------------------------------------------------
/**
 Returns the monoisotopic mass using ProteinAnalysisMode.NATIVE.
 @return the monoisotopic mass
 */
@Override
public Double getMonoisotopicMass()
{
   return getMonoisotopicMass(ProteinAnalysisMode.NATIVE);
}
//--------------------------------------------------------------------------
/**
 Returns the monoisotopic mass for the specified analysis mode. For
 ProteinAnalysisMode.NATIVE the superclass value is used; for any other mode the
 mass is derived from the mode-adjusted elemental composition.
 @param inMode the analysis mode conditions to apply
 @return the monoisotopic mass
 */
public Double getMonoisotopicMass(ProteinAnalysisMode inMode)
{
   if (inMode.equals(ProteinAnalysisMode.NATIVE))
   {
      return super.getMonoisotopicMass();
   }

   OrganicMatterImpl adjustedMatter = new OrganicMatterImpl(getElementalComposition(inMode));
   return adjustedMatter.getMonoisotopicMass();
}
//--------------------------------------------------------------------------
/**
 Returns the average mass using ProteinAnalysisMode.NATIVE.
 @return the average mass
 */
@Override
public Double getAverageMass()
{
   return getAverageMass(ProteinAnalysisMode.NATIVE);
}
//--------------------------------------------------------------------------
/**
 Returns the average mass for the specified analysis mode. For
 ProteinAnalysisMode.NATIVE the superclass value is used; for any other mode the
 mass is derived from the mode-adjusted elemental composition.
 @param inMode the analysis mode conditions to apply
 @return the average mass
 */
public Double getAverageMass(ProteinAnalysisMode inMode)
{
   if (inMode.equals(ProteinAnalysisMode.NATIVE))
   {
      return super.getAverageMass();
   }

   OrganicMatterImpl adjustedMatter = new OrganicMatterImpl(getElementalComposition(inMode));
   return adjustedMatter.getAverageMass();
}
//--------------------------------------------------------------------------
/**
 Returns the organic average mass using ProteinAnalysisMode.NATIVE.
 @return the organic average mass
 */
@Override
public Double getOrganicAverageMass()
{
   return getOrganicAverageMass(ProteinAnalysisMode.NATIVE);
}
//--------------------------------------------------------------------------
/**
 Returns the organic average mass for the specified analysis mode. For
 ProteinAnalysisMode.NATIVE the superclass value is used; for any other mode the
 mass is derived from the mode-adjusted elemental composition.
 @param inMode the analysis mode conditions to apply
 @return the organic average mass
 */
public Double getOrganicAverageMass(ProteinAnalysisMode inMode)
{
   if (inMode.equals(ProteinAnalysisMode.NATIVE))
   {
      return super.getOrganicAverageMass();
   }

   OrganicMatterImpl adjustedMatter = new OrganicMatterImpl(getElementalComposition(inMode));
   return adjustedMatter.getOrganicAverageMass();
}
//--------------------------------------------------------------------------
/**
 Determines the isoelectric point (the pH at which the net charge is zero) for
 the protein using KaSet.BJELLQVIST.
 @return the isoelectric point for the protein
 */
public Float getIsoelectricPoint()
{
   KaSet defaultKaSet = KaSet.BJELLQVIST;
   return getIsoelectricPoint(defaultKaSet);
}
//--------------------------------------------------------------------------
/**
 Determines the isoelectric point (the pH at which the net charge is zero) for
 the protein using the default protein analysis mode of the supplied KaSet.
 @param inKaSet the specific set of pKa values to use in calculating the isoelectric point
 @return the isoelectric point for the protein
 */
public Float getIsoelectricPoint(KaSet inKaSet)
{
   ProteinAnalysisMode defaultMode = inKaSet.getDefaultProteinAnalysisMode();
   return getIsoelectricPoint(inKaSet, defaultMode);
}
//--------------------------------------------------------------------------
/**
 Determines the isoelectric point (the pH at which the net charge is zero) for
 the protein. The calculation is delegated to the IsoelectricPoint property
 obtained for the supplied KaSet.
 @param inKaSet the specific set of pKa values to use in calculating the isoelectric point
 @param inMode the analysis mode conditions to apply to the calculation
 @return the isoelectric point for the protein
 */
public Float getIsoelectricPoint(KaSet inKaSet, ProteinAnalysisMode inMode)
{
   return IsoelectricPoint
         .valueOf(inKaSet)
         .calculate(this,
                    new SimpleProteinPropertyCalcSettings().setProteinAnalysisMode(inMode));
}
//--------------------------------------------------------------------------
/**
 Estimates the protein's net charge at the specified pH using the default protein
 analysis mode of the supplied KaSet.
 @param pH the specific pH value at which to calculate the protein's net charge
 @param inKaSet the specific set of pKa values to use in the calculation
 @return the net charge of the protein at the specified pH
 */
public Double getNetCharge(double pH, KaSet inKaSet)
{
   ProteinAnalysisMode defaultMode = inKaSet.getDefaultProteinAnalysisMode();
   return getNetCharge(pH, inKaSet, defaultMode);
}
//--------------------------------------------------------------------------
/**
 Estimates the protein's net charge at the specified pH. The ionizable group map
 is built from the KaSet and analysis mode and passed to the map-based overload.
 @param pH the specific pH value at which to calculate the protein's net charge
 @param inKaSet the specific set of pKa values to use in the calculation
 @param inMode the analysis mode conditions to apply to the calculation
 @return the net charge of the protein at the specified pH
 */
public Double getNetCharge(double pH, KaSet inKaSet, ProteinAnalysisMode inMode)
{
   return getNetCharge(pH,
                       constructIonizableGroupMap(inKaSet, inMode));
}
//--------------------------------------------------------------------------
/**
 Returns the total number of specified disulfide bonds, or null if the number of
 disulfides has not been set at any chain level.
 The total is the sum of the chain totals plus this level's contribution: the
 number of DISULFIDE cross-links if any cross-links exist, otherwise the explicit
 disulfide count (if set).
 @return the total number of specified disulfide bonds, or null
 */
public Integer getTotalNumDisulfideBonds()
{
   // Possibilities: - set for 'parent' protein, null in individual chains
   //                - set for 'parent' protein and individual chains
   //                - null for 'parent' protein and individual chains
   //                - wouldn't really make sense for it to be null for the 'parent' protein and set for individual chains
   int total = 0;
   boolean anySpecified = false;

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         Integer chainTotal = chain.getTotalNumDisulfideBonds();
         if (null == chainTotal)
         {
            continue;
         }

         total += chainTotal;
         anySpecified = true;
      }
   }

   if (! CollectionUtil.hasValues(mXLinks))
   {
      if (mNumDisulfideBonds != null)
      {
         total += mNumDisulfideBonds;
         anySpecified = true;
      }
   }
   else
   {
      for (ProteinXLink xlink : mXLinks)
      {
         if (xlink.getType() == ProteinXLinkType.DISULFIDE)
         {
            total++;
            anySpecified = true;
         }
      }
   }

   if (! anySpecified)
   {
      return null;
   }

   return total;
}
//--------------------------------------------------------------------------
/**
 Sets the number of disulfide bonds at this level and clears the calculated
 properties.
 @param inValue the number of disulfide bonds
 @return this Protein object to enable method chaining
 @throws RuntimeException if the value exceeds half the number of cysteines
 */
public Protein setNumDisulfideBonds(int inValue)
{
   // Possibilities: - set for 'parent' protein, null in individual chains
   //                - set for 'parent' protein and individual chains
   //                - null for 'parent' protein and individual chains
   //                - wouldn't really make sense for it to be null for the 'parent' protein and set for individual chains
   // TODO: If it has already been set for chains below this protein what should I do?

   // Each disulfide bond consumes two cysteines.
   int maxDisulfides = getAminoAcidComposition().get(AminoAcid.CYSTEINE) / 2;
   if (inValue > maxDisulfides)
   {
      throw new RuntimeException("There are not enough cysteines for " + inValue + " disulfide bonds!");
   }

   mNumDisulfideBonds = inValue;
   clearCalculatedProperties();

   return this;
}
//--------------------------------------------------------------------------
/**
 Returns the total number of free cysteines: the cysteine count minus two for
 every disulfide bond. If the number of disulfides has not been set at any chain
 level, the total number of cysteines is returned.
 @return the total number of free cysteines
 */
public int getTotalNumFreeCysteines()
{
   int cysteines = getAminoAcidComposition().get(AminoAcid.CYSTEINE);
   Integer disulfides = getTotalNumDisulfideBonds();

   if (null == disulfides)
   {
      return cysteines;
   }

   return cysteines - (disulfides * 2);
}
//--------------------------------------------------------------------------
/**
 Returns the estimated molar extinction coefficient at A280. If the number of
 disulfide bonds has not been specified, it assumes that all cysteines are
 disulfide-linked. Values whose integer form is longer than three characters are
 rounded to three significant figures. The result is cached.
 This method utilizes the coefficients derived by Pace et al. (1995) Protein Science 4:2411-2423.
 @return the estimated molar extinction coefficient for the protein
 */
public int getExtinctionCoeff()
{
   if (mExtinctionCoeff != null)
   {
      return mExtinctionCoeff.intValue();
   }

   double coeff = getRawExtinctionCoeff();

   // To get to 3 sig. figs. ...
   int numChars = String.valueOf((int) coeff).length();
   if (numChars > 3)
   {
      coeff = new BigDecimal(coeff).round(new MathContext(3)).doubleValue();
   }

   mExtinctionCoeff = (int) coeff;

   return mExtinctionCoeff.intValue();
}
//--------------------------------------------------------------------------
/**
 Returns the estimated mass attenuation coefficient (ml mg-1 cm-1) at A280,
 computed as the raw extinction coefficient divided by the average mass. If the
 number of disulfide bonds has not been specified, it assumes that all cysteines
 are disulfide-linked. Values whose string form is longer than three characters
 are rounded to three significant figures. The result is cached.
 This method utilizes the coefficients derived by Pace et al. (1995) Protein Science 4:2411-2423.
 @return the estimated percent molar extinction coefficient for the protein
 */
public float getPercentExtinctionCoeff()
{
   if (mPercentExtinctionCoeff != null)
   {
      return mPercentExtinctionCoeff;
   }

   double coeff = getRawExtinctionCoeff() / getAverageMass();

   // To get to 3 sig. figs. ...
   int numChars = String.valueOf(coeff).length();
   if (numChars > 3)
   {
      coeff = new BigDecimal(coeff).round(new MathContext(3)).doubleValue();
   }

   mPercentExtinctionCoeff = (float) coeff;

   return mPercentExtinctionCoeff;
}
//--------------------------------------------------------------------------
/**
 Computes the unrounded extinction coefficient as
 5500 per tryptophan + 1490 per tyrosine + 125 per disulfide bond.
 @return the unrounded extinction coefficient
 */
private int getRawExtinctionCoeff()
{
   AminoAcidComposition composition = getAminoAcidComposition();
   Integer specifiedDisulfides = getTotalNumDisulfideBonds();

   int tryptophanTerm = composition.get(AminoAcid.TRYPTOPHAN) * 5500;
   int tyrosineTerm   = composition.get(AminoAcid.TYROSINE) * 1490;

   // If the number of disulfide bonds has not been specified, assume that all cysteines are disulfide-linked.
   int disulfides;
   if (specifiedDisulfides != null)
   {
      disulfides = specifiedDisulfides;
   }
   else
   {
      disulfides = composition.get(AminoAcid.CYSTEINE) / 2;
   }

   return tryptophanTerm + tyrosineTerm + disulfides * 125;
}
//--------------------------------------------------------------------------
/**
 Returns the concentration (mM) of the protein solution by using the Beer Lambert Law.
   Abs = PCE
 Where: Abs = Absorbance at a specific wavelength
        P   = path length of the cell (assumed to be 1 cm)
        C   = concentration in moles / liter
        E   = Molar extinction coeff at a specific wavelength
 @param inAbsorbance the observed absorbance at 280nm
 @return the estimated protein concentration in mM
 */
public float getMillimolarConcFromAbsorbance280(float inAbsorbance)
{
   // Add a tiny amount to avoid divide by zero errors
   double denominator = getExtinctionCoeff() + 0.0000001;

   return (float) (1000 * inAbsorbance / denominator);
}
//--------------------------------------------------------------------------
/**
 Looks up the residue at the specified position in this Protein's amino acid set.
 @param inPosition the position passed through to residueAt()
 @return the AminoAcid the amino acid set returns for that residue
 */
public AminoAcid aminoAcidAt(int inPosition)
{
   AminoAcidSet aaSet = getAminoAcidSet();
   return aaSet.getAA(residueAt(inPosition));
}
//--------------------------------------------------------------------------
/**
 Convenience method for setting the N-terminal group as pyro-glu based on
 whether the N-terminal residue is Glu or Gln. Does nothing for a Protein of
 length zero. If the amino acid set is locked, a clone of it is installed before
 the N-terminal group is changed.
 @throws RuntimeException if the N-terminal residue is neither Glu nor Gln
 (including when it cannot be resolved in the amino acid set)
 */
public void createNTerminalPyroGlu()
{
   if (length() <= 0)
   {
      return;
   }

   // The lookup can come back null for a residue the amino acid set does not know,
   // so compare from the constant side to get the descriptive exception below
   // rather than a bare NullPointerException.
   AminoAcid nTerminalResidue = aminoAcidAt(1);

   NTerminalGroup nTerminalGroup;
   if (AminoAcid.GLUTAMIC_ACID.equals(nTerminalResidue))
   {
      nTerminalGroup = NTerminalGroup.PYRO_GLU_N_TERM_GLU;
   }
   else if (AminoAcid.GLUTAMINE.equals(nTerminalResidue))
   {
      nTerminalGroup = NTerminalGroup.PYRO_GLU_N_TERM_GLN;
   }
   else
   {
      throw new RuntimeException("The N-Terminal residue must be Glu or Gln in order to form pyro-glutamic acid!");
   }

   // A locked amino acid set must not be modified; work on a private copy instead.
   if (getAminoAcidSet().isLocked())
   {
      setAminoAcidSet(getAminoAcidSet().clone());
   }

   getAminoAcidSet().setNTerminalGroup(nTerminalGroup);
}
//--------------------------------------------------------------------------
/**
 Converts the asparagine residue of each putative N-link site into aspartic
 acid, mimicking enzymatic treatment with PNGase F to remove N-linked
 carbohydrates. This method is not reversible and assumes that all putative
 N-link sites have attached carbohydrate structures.
 @throws RuntimeException if the residue at a site's start position is not asparagine
 */
public void treatWithPNGaseF()
{
   List<SeqLocation> nLinkSites = findNLinkedSites();
   if (! CollectionUtil.hasValues(nLinkSites))
   {
      return;
   }

   for (SeqLocation site : nLinkSites)
   {
      // A site whose chain id is set and differs from this Protein's id lives on a sub-chain.
      Protein target = this;
      if (site.getChainId() != null
          && ! site.getChainId().equals(getID()))
      {
         target = getChain(site.getChainId());
      }

      if (! target.aminoAcidAt(site.getStart()).equals(AminoAcid.ASPARAGINE))
      {
         throw new RuntimeException("The residue at position " + site.getStart() + " isn't an asparagine as expected!");
      }

      target.setResidueAt(site.getStart(), AminoAcid.ASPARTIC_ACID.getOneLetterCode());
   }
}
//--------------------------------------------------------------------------
/**
 Finds putative N-linked glycosylation sites: an N, followed by any residue other
 than P, followed by S or T, followed by either a non-P residue or the end of
 the sequence (matched case-insensitively). Overlapping sites are reported.
 For a Protein composed of chains, the sites of all chains are combined.
 @return the sites as SeqLocations carrying the owning Protein's id and the 1-based
 positions of the three motif residues; empty if there are none or if this
 Protein has neither chains nor a sequence
 */
public List findNLinkedSites()
{
   List<SeqLocation> sites = new ArrayList<>();

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         sites.addAll(chain.findNLinkedSites());
      }
   }
   else
   {
      // A Protein with neither chains nor a sequence has no sites; without this
      // guard Pattern.matcher(null) would throw a NullPointerException.
      CharSequence sequence = this.getSequence();
      if (sequence != null)
      {
         Pattern pattern = Pattern.compile("N[^P][ST]([^P]|$)", Pattern.CASE_INSENSITIVE);
         Matcher m = pattern.matcher(sequence);
         int start = 0;
         // Restart one residue past each hit so that overlapping motifs are found too.
         while (m.find(start))
         {
            sites.add(new SeqLocation(getID(), m.start() + 1, m.start() + 3));
            start = m.start() + 1;
         }
      }
   }

   return sites;
}
//--------------------------------------------------------------------------
/**
 Builds the XML representation of this Protein on top of the superclass node.
 Adds the disulfide count attribute (if set), the amino acid set (for a Protein
 without a parent, or when its set is not the same instance as the top-level
 Protein's), the chains and the cross-links.
 @return the XML node for this Protein
 */
public XMLNode toXMLNode()
{
   XMLNode xml = super.toXMLNode();
   //node.setTagName(HfgBioXML.PROTEIN_TAG);

   if (mNumDisulfideBonds != null)
   {
      xml.setAttribute(HfgBioXML.DISULFIDE_CNT_ATT, mNumDisulfideBonds);
   }

   // Chains that share the top-level amino acid set instance don't repeat it.
   boolean writeAminoAcidSet = (null == mParent
                                || getAminoAcidSet() != getTopProtein().getAminoAcidSet());
   if (writeAminoAcidSet)
   {
      xml.addSubtag(getAminoAcidSet().toXMLNode());
   }

   if (CollectionUtil.hasValues(mChains))
   {
      XMLNode chainsXml = new XMLTag(HfgBioXML.CHAINS_TAG);
      xml.addSubtag(chainsXml);

      for (Protein chain : mChains)
      {
         chainsXml.addSubtag(chain.toXMLNode());
      }
   }

   if (CollectionUtil.hasValues(mXLinks))
   {
      XMLNode xlinksXml = new XMLTag(HfgBioXML.XLINKS_TAG);
      xml.addSubtag(xlinksXml);

      for (ProteinXLink xlink : mXLinks)
      {
         xlinksXml.addSubtag(xlink.toXMLNode());
      }
   }

   return xml;
}
// TODO: Site-specific glycan attachment?
//--------------------------------------------------------------------------
/**
 Attaches the specified glycan to this protein the given number of times.
 Nothing is added when the glycan is null or the count is not positive.

 @param inValue the glycan to attach
 @param inCount how many copies of the glycan to attach
 @return this protein, to allow method chaining
 */
public Protein addGlycans(Glycan inValue, int inCount)
{
   if (null == inValue
       || inCount <= 0)
   {
      return this;
   }

   int remaining = inCount;
   while (remaining > 0)
   {
      addGlycan(inValue);
      remaining--;
   }

   return this;
}
//--------------------------------------------------------------------------
/**
 Attaches a single glycan to this protein and clears the calculated properties.
 A null glycan is ignored.

 @param inValue the glycan to attach
 @return this protein, to allow method chaining
 */
public Protein addGlycan(Glycan inValue)
{
   if (null == inValue)
   {
      return this;
   }

   // The glycan list is created lazily on first use.
   if (mGlycans == null)
   {
      mGlycans = new ArrayList<>(5);
   }

   mGlycans.add(inValue);
   clearCalculatedProperties();

   return this;
}
//--------------------------------------------------------------------------
/**
 Returns the glycans attached to this protein together with those reported by
 each of its chains.

 @return a new list holding the glycans, or null if neither this protein nor
         any of its chains has glycans
 */
public List getGlycans()
{
   List<Glycan> allGlycans = null;

   if (CollectionUtil.hasValues(mGlycans))
   {
      allGlycans = new ArrayList<>(mGlycans);
   }

   if (! CollectionUtil.hasValues(mChains))
   {
      return allGlycans;
   }

   for (Protein eachChain : mChains)
   {
      List<Glycan> fromChain = eachChain.getGlycans();
      if (! CollectionUtil.hasValues(fromChain))
      {
         continue;
      }

      if (allGlycans != null)
      {
         allGlycans.addAll(fromChain);
      }
      else
      {
         allGlycans = new ArrayList<>(fromChain);
      }
   }

   return allGlycans;
}
//--------------------------------------------------------------------------
/**
 Registers a cross-link on this protein. The link's parent protein is set to
 this protein and the calculated properties are cleared. A null link is ignored.

 @param inXLink the cross-link to add
 */
public void addXLink(ProteinXLink inXLink)
{
   if (null == inXLink)
   {
      return;
   }

   // The cross-link set is created lazily on first use.
   if (mXLinks == null)
   {
      mXLinks = new HashSet<>();
   }

   inXLink.setParentProtein(this);
   mXLinks.add(inXLink);

   clearCalculatedProperties();
}
//--------------------------------------------------------------------------
/**
 Returns the cross-links held by this protein plus the cross-links held
 directly by each of its chains.

 @return a new set containing the cross-links; empty (never null) if there are none
 */
public Set getXLinks()
{
   Set<ProteinXLink> collected = new HashSet<>(10);

   if (CollectionUtil.hasValues(mXLinks))
   {
      collected.addAll(mXLinks);
   }

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein eachChain : mChains)
      {
         // Each chain appends its own links to the shared set.
         eachChain.getXLinks(collected);
      }
   }

   return collected;
}
//--------------------------------------------------------------------------
/**
 Removes the first cross-link equal to the specified one from this protein and
 asks every chain to do the same. The calculated properties are cleared
 afterwards, mirroring what addXLink() does when a link is added.

 @param inXLink the cross-link to remove; null matches nothing
 */
public void removeXLink(ProteinXLink inXLink)
{
   boolean mayHaveChanged = false;

   if (CollectionUtil.hasValues(mXLinks))
   {
      // Removal goes through the iterator so that the set is never modified behind
      // the loop's back and no second hash-based lookup of the element is needed.
      Iterator<ProteinXLink> xlinkIterator = mXLinks.iterator();
      while (xlinkIterator.hasNext())
      {
         if (xlinkIterator.next().equals(inXLink))
         {
            xlinkIterator.remove();
            mayHaveChanged = true;
            break;
         }
      }
   }

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         chain.removeXLink(inXLink);
      }

      // The chains do not report whether they dropped a link, so assume one may have.
      mayHaveChanged = true;
   }

   if (mayHaveChanged)
   {
      clearCalculatedProperties();
   }
}
//--------------------------------------------------------------------------
/**
 Removes every cross-link, regardless of type, by delegating to
 removeXLinks(ProteinXLinkType) with a null type.

 @return the set of cross-links that were removed
 */
public Set removeXLinks()
{
   // A null type acts as the "match any type" wildcard in the typed overload.
   final ProteinXLinkType anyType = null;

   return removeXLinks(anyType);
}
//--------------------------------------------------------------------------
/**
 Removes the cross-links of the specified type from this protein and from all
 of its chains. If anything was removed, the calculated properties are cleared,
 mirroring what addXLink() does when a link is added.

 @param inXLinkType the type of cross-link to remove, or null to remove all cross-links
 @return the set of cross-links that were removed; empty (never null) if none matched
 */
public Set removeXLinks(ProteinXLinkType inXLinkType)
{
   Set<ProteinXLink> removedXLinks = new HashSet<>(10);

   if (CollectionUtil.hasValues(mXLinks))
   {
      // Single pass: record and remove via the iterator instead of collecting
      // first and then looking each element up again for removal.
      Iterator<ProteinXLink> xlinkIterator = mXLinks.iterator();
      while (xlinkIterator.hasNext())
      {
         ProteinXLink xlink = xlinkIterator.next();
         if (null == inXLinkType
             || xlink.getType().equals(inXLinkType))
         {
            removedXLinks.add(xlink);
            xlinkIterator.remove();
         }
      }
   }

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         removedXLinks.addAll(chain.removeXLinks(inXLinkType));
      }
   }

   if (! removedXLinks.isEmpty())
   {
      clearCalculatedProperties();
   }

   return removedXLinks;
}
//--------------------------------------------------------------------------
/**
 Resets the calculated values held by this class to null after letting the
 superclass clear its own (presumably so that they are recomputed on next access).
 */
@Override
public void clearCalculatedProperties()
{
   super.clearCalculatedProperties();

   // Extinction coefficient values
   mPercentExtinctionCoeff = null;
   mExtinctionCoeff        = null;

   // Isoelectric point and the Ka set stored alongside it
   mIsoelectricPointKaSet  = null;
   mIsoelectricPoint       = null;

   // Amino acid composition
   mAAComposition          = null;
}
//##########################################################################
// PROTECTED METHODS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Appends the cross-links held directly by this protein to the supplied set.
 The set is left untouched when this protein has no cross-links.

 @param inXLinkList the set to which this protein's cross-links are added
 */
protected void getXLinks(Set inXLinkList)
{
   if (! CollectionUtil.hasValues(mXLinks))
   {
      return;
   }

   inXLinkList.addAll(mXLinks);
}
//--------------------------------------------------------------------------
/**
 Returns a map of composition keys to Integer counts. When this protein has
 chains, the counts of all chains are summed per key; otherwise the superclass
 composition is returned as is.
 NOTE(review): this comment previously described the keys as AminoAcids, but the
 code iterates them as Strings - confirm against the superclass.
 */
@Override
protected Map getComposition()
{
   if (! CollectionUtil.hasValues(mChains))
   {
      return super.getComposition();
   }

   Map<String, Integer> map = new HashMap<>();
   for (Protein chain : mChains)
   {
      // Typed so that the String keys and Integer counts can be used directly.
      Map<String, Integer> chainMap = chain.getComposition();
      for (Map.Entry<String, Integer> entry : chainMap.entrySet())
      {
         Integer oldValue = map.get(entry.getKey());
         map.put(entry.getKey(), (oldValue != null ? oldValue : 0) + entry.getValue());
      }
   }

   return map;
}
//--------------------------------------------------------------------------
/**
 Copies the amino acid composition of this protein into a plain map.

 @return a new map of each amino acid in the composition to its count
 */
@Override
protected Map getResidueComposition()
{
   AminoAcidComposition composition = getAminoAcidComposition();

   Map<AminoAcid, Integer> countsByResidue = new HashMap<>(25);
   for (AminoAcid residue : composition.keySet())
   {
      Integer residueCount = composition.get(residue);
      countsByResidue.put(residue, residueCount);
   }

   return countsByResidue;
}
//--------------------------------------------------------------------------
/**
 Counts the terminal groups of this protein. When this protein has chains, the
 N- and C-terminal groups of every chain are tallied; otherwise this protein's
 own N- and C-terminal groups are each recorded with a count of one.

 @return a new map of terminal group to the number of times it occurs
 */
@Override
protected Map getTerminiComposition()
{
   // Typed so that the stored counts can be read back as Integers.
   Map<Molecule, Integer> terminiComposition = new HashMap<>(5);

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         Molecule nTerminus = chain.getAminoAcidSet().getNTerminalGroup();
         Integer oldCount = terminiComposition.get(nTerminus);
         terminiComposition.put(nTerminus, 1 + (oldCount != null ? oldCount : 0));

         Molecule cTerminus = chain.getAminoAcidSet().getCTerminalGroup();
         oldCount = terminiComposition.get(cTerminus);
         terminiComposition.put(cTerminus, 1 + (oldCount != null ? oldCount : 0));
      }
   }
   else
   {
      terminiComposition.put(getAminoAcidSet().getNTerminalGroup(), 1);
      terminiComposition.put(getAminoAcidSet().getCTerminalGroup(), 1);
   }

   return terminiComposition;
}
//--------------------------------------------------------------------------
/**
 Counts the cross-links of this protein by type. The counts reported by each
 chain are summed first. Then, if this protein holds explicit cross-links, each
 of them adds one to the count of its type; otherwise the value of
 getTotalNumDisulfideBonds(), when not null, is added to the disulfide count.

 @return a new map of cross-link type to count; empty if there are no cross-links
 */
@Override
protected Map getXLinkComposition()
{
   // Typed so that the stored counts can be read back as Integers.
   Map<ProteinXLinkType, Integer> xLinkComposition = new HashMap<>(5);

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         Map<ProteinXLinkType, Integer> chainXLinkComposition = chain.getXLinkComposition();
         if (CollectionUtil.hasValues(chainXLinkComposition))
         {
            for (Map.Entry<ProteinXLinkType, Integer> entry : chainXLinkComposition.entrySet())
            {
               Integer oldCount = xLinkComposition.get(entry.getKey());
               int newCount = entry.getValue() + (oldCount != null ? oldCount : 0);
               xLinkComposition.put(entry.getKey(), newCount);
            }
         }
      }
   }

   if (CollectionUtil.hasValues(mXLinks))
   {
      for (ProteinXLink xlink : mXLinks)
      {
         Integer oldCount = xLinkComposition.get(xlink.getType());
         int newCount = 1 + (oldCount != null ? oldCount : 0);
         xLinkComposition.put(xlink.getType(), newCount);
      }
   }
   else
   {
      // No explicit cross-links on this protein: fall back to the disulfide bond count.
      Integer disulfideCount = getTotalNumDisulfideBonds();
      if (disulfideCount != null)
      {
         Integer oldCount = xLinkComposition.get(ProteinXLinkType.DISULFIDE);
         int newCount = disulfideCount + (oldCount != null ? oldCount : 0);
         xLinkComposition.put(ProteinXLinkType.DISULFIDE, newCount);
      }
   }

   return xLinkComposition;
}
//--------------------------------------------------------------------------
/**
 Recalculates the elemental composition via the superclass and then adds the
 elemental composition of every glycan returned by getGlycans(), removing one
 water per glycan.
 */
@Override
protected void recalculateElementalComposition()
{
   super.recalculateElementalComposition();

   // The element type has to be declared for the typed for-each below to compile.
   List<Glycan> glycans = getGlycans();
   if (CollectionUtil.hasValues(glycans))
   {
      for (Glycan glycan : glycans)
      {
         getOrganicMatter().addElementalComposition(glycan.getElementalComposition());
         getOrganicMatter().remove(Molecule.H2O); // Subtract a water lost in the bonding
      }
   }
}
//##########################################################################
// PRIVATE METHODS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Walks up the parent links to the outermost protein.

 @return the protein at the top of the parent chain; this protein if it has no parent
 */
private Protein getTopProtein()
{
   Protein top = this;
   while (top.mParent != null)
   {
      top = top.mParent;
   }

   return top;
}
//--------------------------------------------------------------------------
/**
 Asks the top-level protein to assign a chain id to this protein and, if that id
 is not the one this protein currently has, applies it and propagates the change.
 Does nothing for a protein without a parent.
 */
private void checkId()
{
   if (null == mParent)
   {
      return;
   }

   String assignedId = getTopProtein().assignChainId(this);
   String previousId = getID();

   // Nothing to update when the assigned id matches the current one.
   if (previousId != null
       && previousId.equals(assignedId))
   {
      return;
   }

   super.setID(assignedId);
   propogateIdChange(previousId, assignedId);
}
//--------------------------------------------------------------------------
/**
 Rewrites references to this protein's old id after its id has changed.
 Cross-links held by this protein, and by each ancestor below the top-level
 protein, have their donor / acceptor chain ids switched from the old id to the
 new one. Each of those ancestors' chain id maps is also re-keyed so that the
 entry for this protein is stored under its current id.

 @param inOldId the id this protein had before the change
 @param inNewId the id this protein has now
 */
private void propogateIdChange(String inOldId, String inNewId)
{
// Update the cross-links held directly by this protein.
if (mXLinks != null)
{
for (ProteinXLink xlink : mXLinks)
{
if (xlink.getDonorChainId() != null
&& xlink.getDonorChainId().equals(inOldId))
{
xlink.setDonorChainId(inNewId);
}
if (xlink.getAcceptorChainId() != null
&& xlink.getAcceptorChainId().equals(inOldId))
{
xlink.setAcceptorChainId(inNewId);
}
}
}
// Walk up the ancestors; the loop stops before the top-level protein, which is not processed here.
Protein parent = mParent;
Protein topParent = getTopProtein();
while (parent != null
&& parent != topParent)
{
if (parent.mChainIdMap != null)
{
for (String id : parent.mChainIdMap.keySet())
{
if (parent.mChainIdMap.get(id).equals(this))
{
// Re-key the entry for this protein under its current id.
parent.mChainIdMap.remove(id);
parent.mChainIdMap.put(getID(), this);
// The map was just modified while its key set is being iterated, so the iteration must end here.
break;
}
}
}
// Same donor / acceptor rewrite as above, applied to the ancestor's cross-links.
if (parent.mXLinks != null)
{
for (ProteinXLink xlink : parent.mXLinks)
{
if (xlink.getDonorChainId() != null
&& xlink.getDonorChainId().equals(inOldId))
{
xlink.setDonorChainId(inNewId);
}
if (xlink.getAcceptorChainId() != null
&& xlink.getAcceptorChainId().equals(inOldId))
{
xlink.setAcceptorChainId(inNewId);
}
}
}
parent = parent.mParent;
}
}
//--------------------------------------------------------------------------
/**
 Determines an id for the specified chain that is not yet a key in this
 protein's chain id map and registers the chain under that id. The chain's own
 id is not modified here. A chain without a parent is not registered and its
 current id is returned unchanged.

 @param inChain the chain needing an id
 @return the id under which the chain was registered, or the chain's current id
         if it has no parent
 */
private String assignChainId(Protein inChain)
{
String chainId = inChain.getID();
if (inChain.mParent != null)
{
if (! StringUtil.isSet(chainId))
{
// No id yet: start from a letter picked by the number of ids already registered ('A' when there are none).
chainId = "" + (char)('A' + mChainIdMap.size());
}
// Keep adjusting the candidate until it no longer collides with a registered id.
while (mChainIdMap.containsKey(chainId))
{
if (chainId.length() == 1
&& Character.isLetter(chainId.charAt(0)))
{
if (StringUtil.isSet(inChain.mParent.getID())
&& inChain.mParent != this)
{
// The chain's direct parent is some other protein that has an id: qualify the letter with that id.
chainId = inChain.mParent.getID() + " chain_" + chainId;
}
else
{
// Otherwise advance to the next character.
chainId = "" + (char)((int)chainId.charAt(0) + 1);
}
}
else
{
// Not a single letter: if sChainIdPattern matches, replace the first match with "_" plus
// the character following the first character of group 1; otherwise append "_B".
Matcher matcher = sChainIdPattern.matcher(chainId);
if (matcher.find())
{
chainId = matcher.replaceFirst("_" + (char) ((int)matcher.group(1).charAt(0) + 1));
}
else
{
chainId += "_B";
}
}
}
// We'll go into an infinite loop if we try inChain.setID() here.
mChainIdMap.put(chainId, inChain);
}
return chainId;
}
//--------------------------------------------------------------------------
/**
 Builds a map of ionizable groups to the number of times each occurs in this
 protein, using the groups defined by the specified Ka set.
 <p>
 When this protein has chains, every chain is tallied in REDUCED mode and the
 counts are summed; in NATIVE mode the count of the first cysteine group is then
 replaced with getTotalNumFreeCysteines(). When there are no chains and the
 sequence is not empty, the map is built from the amino acid composition plus
 the N- and C-terminal groups.

 @param inKaSet the Ka set supplying the ionizable groups
 @param inMode the analysis mode; NATIVE excludes disulfide-linked cysteines
 @return a new map of ionizable group to count; empty if there are no chains and no residues
 */
private Map<IonizableGroup, Integer> constructIonizableGroupMap(KaSet inKaSet, ProteinAnalysisMode inMode)
{
   Map<IonizableGroup, Integer> ionizableGroupMap = new HashMap<>();

   if (CollectionUtil.hasValues(mChains))
   {
      for (Protein chain : mChains)
      {
         Map<IonizableGroup, Integer> chainMap = chain.constructIonizableGroupMap(inKaSet, ProteinAnalysisMode.REDUCED);
         for (Map.Entry<IonizableGroup, Integer> entry : chainMap.entrySet())
         {
            Integer oldValue = ionizableGroupMap.get(entry.getKey());
            int newValue = (oldValue != null ? oldValue : 0) + entry.getValue();
            ionizableGroupMap.put(entry.getKey(), newValue);
         }
      }

      if (inMode == ProteinAnalysisMode.NATIVE)
      {
         // Exclude disulfide-linked cysteines
         List<IonizableGroup> cysGroups = inKaSet.getIonizableGroups(AminoAcid.CYSTEINE);
         // An empty list has no first group to update.
         if (cysGroups != null
             && ! cysGroups.isEmpty())
         {
            ionizableGroupMap.put(cysGroups.get(0), getTotalNumFreeCysteines());
         }
      }
   }
   else if (length() > 0)
   {
      AminoAcid cTerminalResidue = aminoAcidAt(length());

      AminoAcidComposition aaComposition = getAminoAcidComposition();
      for (AminoAcid aa : aaComposition.keySet())
      {
         Integer aaCount = aaComposition.get(aa);
         if (aaCount != null && aaCount > 0)
         {
            // The sidechain of the C-terminal residue may have its own Ka. It is only looked up
            // for that residue, and only applied when the C-terminus is unmodified.
            IonizableGroup cTerminalSidechainGroup = (aa == cTerminalResidue ? inKaSet.getCTerminalSidechainKa(cTerminalResidue) : null);
            if (cTerminalSidechainGroup != null
                && getAminoAcidSet().getCTerminalGroup().equals(CTerminalGroup.UNMODIFIED_C_TERMINUS))
            {
               ionizableGroupMap.put(cTerminalSidechainGroup, 1);
               // That one residue is accounted for separately, so it is left out of the general count.
               aaCount--;
            }

            List<IonizableGroup> groups = inKaSet.getIonizableGroups(aa);
            if (groups != null)
            {
               if (inMode == ProteinAnalysisMode.NATIVE
                   && aa.equals(AminoAcid.CYSTEINE))
               {
                  // Exclude disulfide-linked cysteines
                  aaCount = getTotalNumFreeCysteines();
               }

               for (IonizableGroup group : groups)
               {
                  ionizableGroupMap.put(group, aaCount);
               }
            }
         }
      }

      IonizableGroup group = inKaSet.getNTerminalKa(getAminoAcidSet().getNTerminalGroup(), aminoAcidAt(1));
      if (group != null) ionizableGroupMap.put(group, 1);

      group = inKaSet.getCTerminalKa(getAminoAcidSet().getCTerminalGroup(), aminoAcidAt(length()));
      if (group != null) ionizableGroupMap.put(group, 1);
   }

   return ionizableGroupMap;
}
//--------------------------------------------------------------------------
/**
 Estimates the protein's net charge at the specified pH by summing the charge
 reported by each ionizable group for its count at the corresponding hydrogen
 ion concentration (10 to the power of -pH).

 @param pH the pH at which to estimate the charge
 @param inIonizableGroupMap map of ionizable group to the number of times it occurs; may be null
 @return the summed charge; 0 if the map is null or empty
 */
private double getNetCharge(double pH, Map<IonizableGroup, Integer> inIonizableGroupMap)
{
   double netCharge = 0;

   if (inIonizableGroupMap != null)
   {
      double concOfHIons = Math.pow(10, -pH);

      for (Map.Entry<IonizableGroup, Integer> entry : inIonizableGroupMap.entrySet())
      {
         netCharge += entry.getKey().getCharge(entry.getValue(), concOfHIons);
      }
   }

   return netCharge;
}
//---------------------------------------------------------------------------
/**
 Tallies how many times each distinct sequence occurs. If this protein has a
 sequence, its upper-cased form (minus one trailing '*') is counted; otherwise
 the call is passed on to each of its chains.

 @param inSequenceInstanceMap map of sequence to occurrence count, updated in place
 */
private void recursivelyBuildSequenceInstanceMap(Map<String, Integer> inSequenceInstanceMap)
{
   if (getSequence() != null)
   {
      // Locale.ROOT keeps the result independent of the default locale
      // (in a Turkish locale 'i' would otherwise upper-case to a dotted capital I).
      String refinedChain = getSequence().toUpperCase(Locale.ROOT);

      // Remove a trailing stop
      if (refinedChain.endsWith("*"))
      {
         refinedChain = refinedChain.substring(0, refinedChain.length() - 1);
      }

      Integer oldCount = inSequenceInstanceMap.get(refinedChain);
      inSequenceInstanceMap.put(refinedChain, (oldCount != null ? oldCount : 0) + 1);
   }
   else if (CollectionUtil.hasValues(getChains()))
   {
      for (Protein chain : getChains())
      {
         chain.recursivelyBuildSequenceInstanceMap(inSequenceInstanceMap);
      }
   }
}
//---------------------------------------------------------------------------
/**
 Builds a string summarizing the distinct sequences of this protein. The
 sequences are sorted and each is written as its occurrence count, an
 underscore, and the sequence itself. Entries are added through the
 StringBuilderPlus delimiter mechanism with "/" as the delimiter.

 @return the sequence summary string
 */
private String getSeqDataString()
{
   // Build a chain map
   Map<String, Integer> sequenceInstanceMap = new HashMap<>(5);
   recursivelyBuildSequenceInstanceMap(sequenceInstanceMap);

   // Sorting makes the result independent of the map's iteration order.
   List<String> sortedChains = new ArrayList<>(sequenceInstanceMap.keySet());
   Collections.sort(sortedChains);

   StringBuilderPlus seqData = new StringBuilderPlus().setDelimiter("/");
   for (String chain : sortedChains)
   {
      seqData.delimitedAppend(sequenceInstanceMap.get(chain));
      seqData.append("_");
      seqData.append(chain);
   }

   return seqData.toString();
}
}