org.biojava.nbio.protmod.structure.ProteinModificationIdentifier Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created on Jun 12, 2010
* Author: Jianjiong Gao
*
*/
package org.biojava.nbio.protmod.structure;
import org.biojava.nbio.protmod.*;
import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
* Identify attachment modification in a 3-D structure.
*
* @author Jianjiong Gao
* @since 3.0
*/
public class ProteinModificationIdentifier {
/** Class-wide SLF4J logger. */
private static final Logger logger = LoggerFactory.getLogger(ProteinModificationIdentifier.class);

/**
 * Tolerance (in Angstroms) handed to the StructureUtil atom-linkage
 * searches; the constructor initialises it to 0.4.
 */
private double bondLengthTolerance;

/**
 * Whether linkages and modified residues that no identified compound covers
 * are recorded; the constructor initialises it to false.
 */
private boolean recordUnidentifiableModifiedCompounds;

/**
 * Whether identify() also searches the ligands for additional attachments
 * of each identified compound; the constructor initialises it to true.
 */
private boolean recordAdditionalAttachments;

/** Compounds found by the most recent identify() call. */
private Set<ModifiedCompound> identifiedModifiedCompounds = null;

/** Atom linkages recorded as unidentifiable; only created while recording is enabled. */
private Set<StructureAtomLinkage> unidentifiableAtomLinkages = null;

/** Modified residues recorded as unidentifiable; only created while recording is enabled. */
private Set<StructureGroup> unidentifiableModifiedResidues = null;

/**
 * Temporarily holds the amino acids collected during each call of identify().
 */
private List<Group> residues;
/**
 * Creates an identifier with the default settings: a bond length tolerance
 * of 0.4, no recording of unidentifiable compounds, and recording of
 * additional attachments switched on.
 */
public ProteinModificationIdentifier() {
    this.bondLengthTolerance = 0.4;
    this.recordAdditionalAttachments = true;
    this.recordUnidentifiableModifiedCompounds = false;
    // reset() reads the recording flag, so it has to run after the defaults are in place.
    reset();
}
/**
 * Empties and releases all three result sets held by this identifier.
 * <p>
 * After this call the result getters report that no result is available
 * until identify() has been run again.
 */
public void destroy() {
    if (identifiedModifiedCompounds != null) {
        identifiedModifiedCompounds.clear();
    }
    if (unidentifiableAtomLinkages != null) {
        unidentifiableAtomLinkages.clear();
    }
    if (unidentifiableModifiedResidues != null) {
        unidentifiableModifiedResidues.clear();
    }

    // The original nulled unidentifiableAtomLinkages twice and never
    // released identifiedModifiedCompounds.
    identifiedModifiedCompounds = null;
    unidentifiableAtomLinkages = null;
    unidentifiableModifiedResidues = null;
}
/**
 * Sets the tolerance used when deciding whether two atoms are bonded.
 *
 * @param bondLengthTolerance tolerance of error (in Angstroms) of the
 * covalent bond length, when calculating the atom distance threshold;
 * zero is accepted.
 * @throws IllegalArgumentException if the value is negative or NaN.
 */
public void setbondLengthTolerance(final double bondLengthTolerance) {
    // "!(x >= 0)" also rejects NaN, which a plain "x < 0" test lets through.
    if (!(bondLengthTolerance >= 0)) {
        throw new IllegalArgumentException("bondLengthTolerance " +
                "must not be negative: " + bondLengthTolerance);
    }
    this.bondLengthTolerance = bondLengthTolerance;
}
/**
 * Switches the recording of unidentifiable modified compounds on or off.
 * The flag is read when results are reset and at the end of identify().
 *
 * @param recordUnidentifiableModifiedCompounds true to record them;
 * false to skip that step.
 * @see #getRecordUnidentifiableCompounds
 * @see #getUnidentifiableModifiedResidues
 * @see #getUnidentifiableAtomLinkages
 */
public void setRecordUnidentifiableCompounds(boolean recordUnidentifiableModifiedCompounds) {
    this.recordUnidentifiableModifiedCompounds = recordUnidentifiableModifiedCompounds;
}
/**
 * Reports whether unidentifiable modified compounds are being recorded.
 *
 * @return the current value of the recording flag.
 * @see #setRecordUnidentifiableCompounds
 * @see #getUnidentifiableModifiedResidues
 * @see #getUnidentifiableAtomLinkages
 */
public boolean getRecordUnidentifiableCompounds() {
    return this.recordUnidentifiableModifiedCompounds;
}
/**
 * Switches the search for additional attachments on or off.
 *
 * @param recordAdditionalAttachments true to also record attachments
 * that are not directly attached to a modified residue; false to skip them.
 * @see #getRecordAdditionalAttachments
 */
public void setRecordAdditionalAttachments(boolean recordAdditionalAttachments) {
    this.recordAdditionalAttachments = recordAdditionalAttachments;
}
/**
 * Reports whether additional attachments, i.e. those not directly attached
 * to a modified residue, are being recorded.
 *
 * @return the current value of the additional-attachment flag.
 * @see #setRecordAdditionalAttachments
 */
public boolean getRecordAdditionalAttachments() {
    return this.recordAdditionalAttachments;
}
/**
 * Returns the compounds found by the most recent identify() call.
 * The internal set itself is returned, not a copy.
 *
 * @return a set of identified {@link ModifiedCompound}s.
 * @throws IllegalStateException if no result set is currently held.
 * @see ModifiedCompound
 */
public Set<ModifiedCompound> getIdentifiedModifiedCompound() {
    if (identifiedModifiedCompounds == null) {
        // The entry point of this class is identify(); there is no parse() method.
        throw new IllegalStateException("No result available. Please call identify() first.");
    }
    return identifiedModifiedCompounds;
}
/**
 * Returns the atom linkages that were recorded as unidentifiable by the
 * most recent identify() call. The internal set itself is returned, not a copy.
 *
 * @return a set of {@link StructureAtomLinkage}s.
 * @throws UnsupportedOperationException if recording of unidentifiable
 * compounds is switched off.
 * @throws IllegalStateException if no recorded result is currently held,
 * e.g. because recording was only switched on after the last identify() call.
 * @see StructureAtomLinkage
 * @see #setRecordUnidentifiableCompounds
 */
public Set<StructureAtomLinkage> getUnidentifiableAtomLinkages() {
    if (!recordUnidentifiableModifiedCompounds) {
        // The original concatenation lacked the space before "is".
        throw new UnsupportedOperationException("Recording unidentified atom linkages " +
                "is not supported. Please setRecordUnidentifiableCompounds(true) first.");
    }
    // The set is only created while recording is enabled, so it can still be
    // null here; fail explicitly instead of handing null to the caller.
    if (identifiedModifiedCompounds == null || unidentifiableAtomLinkages == null) {
        throw new IllegalStateException("No result available. Please call identify() first.");
    }
    return unidentifiableAtomLinkages;
}
/**
 * Returns the modified residues that were recorded as unidentifiable by the
 * most recent identify() call. The internal set itself is returned, not a copy.
 *
 * @return a set of {@link StructureGroup}s.
 * @throws UnsupportedOperationException if recording of unidentifiable
 * compounds is switched off.
 * @throws IllegalStateException if no recorded result is currently held,
 * e.g. because recording was only switched on after the last identify() call.
 * @see StructureGroup
 * @see #setRecordUnidentifiableCompounds
 * @see #getIdentifiedModifiedCompound
 */
public Set<StructureGroup> getUnidentifiableModifiedResidues() {
    if (!recordUnidentifiableModifiedCompounds) {
        // The original message was copied from the linkage getter and
        // lacked the space before "is".
        throw new UnsupportedOperationException("Recording unidentified modified residues " +
                "is not supported. Please setRecordUnidentifiableCompounds(true) first.");
    }
    // The set is only created while recording is enabled, so it can still be
    // null here; fail explicitly instead of handing null to the caller.
    if (identifiedModifiedCompounds == null || unidentifiableModifiedResidues == null) {
        throw new IllegalStateException("No result available. Please call identify() first.");
    }
    return unidentifiableModifiedResidues;
}
/**
 * Runs the identification on a whole structure, using every modification
 * returned by {@link ProteinModificationRegistry#allModifications()}.
 *
 * @param structure query {@link Structure}.
 */
public void identify(final Structure structure) {
    this.identify(structure, ProteinModificationRegistry.allModifications());
}
/**
 * Identify a set of modifications in a structure. The work is delegated to
 * the chain-list overload with the chains of the structure.
 *
 * @param structure query {@link Structure}.
 * @param potentialModifications query {@link ProteinModification}s.
 * @throws IllegalArgumentException if the structure is null.
 */
public void identify(final Structure structure,
        final Set<ProteinModification> potentialModifications) {
    if (structure == null) {
        throw new IllegalArgumentException("Null structure.");
    }

    identify(structure.getChains(), potentialModifications);
}
/**
 * Runs the identification on one chain by wrapping it in a
 * single-element list and delegating to the chain-list overload.
 *
 * @param chain query {@link Chain}.
 */
public void identify(final Chain chain) {
    final List<Chain> singleChain = Collections.singletonList(chain);
    identify(singleChain);
}
/**
 * Identify all registered modifications in chains, i.e. every modification
 * returned by {@link ProteinModificationRegistry#allModifications()}.
 *
 * @param chains query {@link Chain}s.
 */
public void identify(final List<Chain> chains) {
    identify(chains, ProteinModificationRegistry.allModifications());
}
/**
 * Identify a set of modifications in a single chain. The chain is wrapped
 * in a single-element list and handed to the chain-list overload.
 *
 * @param chain query {@link Chain}.
 * @param potentialModifications query {@link ProteinModification}s.
 */
public void identify(final Chain chain,
        final Set<ProteinModification> potentialModifications) {
    identify(Collections.singletonList(chain), potentialModifications);
}
/**
* Identify a set of modifications in a a list of chains.
* @param chains query {@link Chain}s.
* @param potentialModifications query {@link ProteinModification}s.
*/
public void identify(final List chains,
final Set potentialModifications) {
if (chains==null) {
throw new IllegalArgumentException("Null structure.");
}
if (potentialModifications==null) {
throw new IllegalArgumentException("Null potentialModifications.");
}
reset();
if (potentialModifications.isEmpty()) {
return;
}
residues = new ArrayList();
List ligands = new ArrayList();
Map> mapCompGroups = new HashMap>();
for (Chain chain : chains) {
List ress = StructureUtil.getAminoAcids(chain);
//List ligs = chain.getAtomLigands();
List ligs = StructureTools.filterLigands(chain.getAtomGroups());
residues.addAll(ress);
residues.removeAll(ligs);
ligands.addAll(ligs);
addModificationGroups(potentialModifications, ress, ligs, mapCompGroups);
}
if (residues.isEmpty()) {
String pdbId = "?";
if ( chains.size() > 0) {
Structure struc = chains.get(0).getStructure();
if ( struc != null)
pdbId = struc.getPDBCode();
}
logger.warn("No amino acids found for {}. Either you did not parse the PDB file with alignSEQRES records, or this record does not contain any amino acids.", pdbId);
}
List modComps = new ArrayList();
for (ProteinModification mod : potentialModifications) {
ModificationCondition condition = mod.getCondition();
List components = condition.getComponents();
if (!mapCompGroups.keySet().containsAll(components)) {
// not all components exist for this mod.
continue;
}
int sizeComps = components.size();
if (sizeComps==1) {
processCrosslink1(mapCompGroups, modComps, mod, components);
} else {
processMultiCrosslink(mapCompGroups, modComps, mod, condition);
}
}
if (recordAdditionalAttachments) {
// identify additional groups that are not directly attached to amino acids.
for (ModifiedCompound mc : modComps) {
identifyAdditionalAttachments(mc, ligands, chains);
}
}
mergeModComps(modComps);
identifiedModifiedCompounds.addAll(modComps);
// record unidentifiable linkage
if (recordUnidentifiableModifiedCompounds) {
recordUnidentifiableAtomLinkages(modComps, ligands);
recordUnidentifiableModifiedResidues(modComps);
}
}
/**
 * Starts a fresh result set for identified compounds and, only while
 * recording of unidentifiable compounds is enabled, fresh sets for the
 * unidentifiable linkages and residues. When recording is disabled the
 * two unidentifiable sets are left untouched.
 */
private void reset() {
    identifiedModifiedCompounds = new LinkedHashSet<ModifiedCompound>();
    if (!recordUnidentifiableModifiedCompounds) {
        return;
    }
    unidentifiableAtomLinkages = new LinkedHashSet<StructureAtomLinkage>();
    unidentifiableModifiedResidues = new LinkedHashSet<StructureGroup>();
}
private void processMultiCrosslink(
Map> mapCompGroups,
List modComps, ProteinModification mod,
ModificationCondition condition) {
// for multiple components
// find linkages first
List> matchedAtomsOfLinkages =
getMatchedAtomsOfLinkages(condition, mapCompGroups);
if (matchedAtomsOfLinkages.size() != condition.getLinkages().size()) {
return;
}
assembleLinkages(matchedAtomsOfLinkages, mod, modComps);
}
private void processCrosslink1(Map> mapCompGroups,
List modComps, ProteinModification mod,
List components) {
// modified residue
// TODO: is this the correct logic for CROSS_LINK_1?
Set modifiedResidues = mapCompGroups.get(components.get(0));
if (modifiedResidues != null) {
for (Group residue : modifiedResidues) {
StructureGroup strucGroup = StructureUtil.getStructureGroup(residue, true);
ModifiedCompound modRes = new ModifiedCompoundImpl(mod, strucGroup);
modComps.add(modRes);
}
}
}
/**
* Identify additional groups that are not directly attached to amino acids.
* Linkages that are found are added to the given compound; nothing is returned.
* Does nothing when there are no ligands.
* @param mc {@link ModifiedCompound} to extend
* @param ligands candidate {@link Group}s
* @param chains List of {@link Chain}s used to look up the groups of the compound
*/
private void identifyAdditionalAttachments(ModifiedCompound mc,
List ligands, List chains) {
if (ligands.isEmpty()) {
return;
}
// TODO: should the additional groups only be allowed to the identified
// ligands or both amino acids and ligands? Currently only on ligands
// ligands to amino acid bonds for same modification of unknown category
// will be combined in mergeModComps()
// TODO: how about chain-chain links?
// Resolve the structure groups of the compound to Group objects.
List identifiedGroups = new ArrayList();
for (StructureGroup num : mc.getGroups(false)) {
Group group;
try {
//String numIns = "" + num.getResidueNumber();
//if (num.getInsCode() != null) {
// numIns += num.getInsCode();
//}
// NOTE(review): resNum is filled in but not used by the visible code;
// getGroup(num, chains) builds its own ResidueNumber.
ResidueNumber resNum = new ResidueNumber();
resNum.setChainName(num.getChainId());
resNum.setSeqNum(num.getResidueNumber());
resNum.setInsCode(num.getInsCode());
//group = chain.getGroupByPDB(numIns);
group = getGroup(num,chains);
//group = mapChainIdChain.get(num.getChainId()).getGroupByPDB(resNum);
} catch (StructureException e) {
logger.error("Exception: ", e);
// should not happen
continue;
}
identifiedGroups.add(group);
}
// start and n are updated after every pass; the outer loop stops once a
// pass has added no new group (n == start).
int start = 0;
int n = identifiedGroups.size();
while (n > start) {
for (Group group1 : ligands) {
// NOTE(review): the next loop header looks truncated in this copy of the
// file: the loop condition, the declaration of group2 and the opening
// braces matching the three closing braces below are missing. Restore it
// from version control before compiling.
for (int i=start; i linkedAtoms = StructureUtil.findAtomLinkages(
group1, group2, false, bondLengthTolerance);
if (!linkedAtoms.isEmpty()) {
// Record every linkage found, then remember the ligand as identified.
for (Atom[] atoms : linkedAtoms) {
mc.addAtomLinkage(StructureUtil.getStructureAtomLinkage(atoms[0],
false, atoms[1], false));
}
identifiedGroups.add(group1);
break;
}
}
}
}
start = n;
n = identifiedGroups.size();
}
}
/**
 * Looks up the group described by a {@link StructureGroup}. The first chain
 * whose id equals the chain id of {@code num} is queried by residue number
 * and insertion code.
 *
 * @param num the structure group to resolve.
 * @param chains the chains to search.
 * @return the group returned by that chain.
 * @throws StructureException if no chain has a matching id, or if the
 * chain lookup itself throws.
 */
private Group getGroup(StructureGroup num, List<Chain> chains) throws StructureException {
    for (Chain c : chains) {
        if (!c.getId().equals(num.getChainId())) {
            continue;
        }

        ResidueNumber resNum = new ResidueNumber();
        resNum.setSeqNum(num.getResidueNumber());
        resNum.setInsCode(num.getInsCode());

        return c.getGroupByPDB(resNum);
    }

    throw new StructureException("Could not find residue " + num);
}
/**
* Merge identified modified compounds if linked. Entries of the given list
* whose indices end up in the "remove" set are deleted from the list.
*
* NOTE(review): two loop headers in this method look truncated in this copy
* of the file (see below); the merging logic itself is not visible here.
*/
private void mergeModComps(List modComps) {
// Indices of the list entries to delete, kept sorted.
TreeSet remove = new TreeSet();
int n = modComps.size();
// NOTE(review): truncated loop header; the condition, the increment and
// the start of the loop body are missing.
for (int icurr=1; icurr merging = new ArrayList();
int ipre = 0;
// NOTE(review): truncated again; everything between this inner loop header
// and the declaration of the iterator is missing.
for (; ipre it = remove.descendingIterator();
// Delete by index from the highest index down, so that the remaining
// indices stay valid.
while (it.hasNext()) {
modComps.remove(it.next().intValue());
}
}
/**
* Record unidentifiable atom linkages in a chain. Only linkages between two
* residues or one residue and one ligand will be recorded. The linkages are
* added to the unidentifiableAtomLinkages field.
*
* NOTE(review): both loop headers below look truncated in this copy of the
* file; the declarations of group1/group2 and any filtering against the
* identified linkages are not visible here.
*
* @param modComps the identified compounds
* @param ligands the ligand groups
*/
private void recordUnidentifiableAtomLinkages(List modComps,
List ligands) {
// first collect the identified linkages in a set for fast query
// NOTE(review): the set is not consulted in the visible code; presumably
// it was used in the truncated parts. Confirm.
Set identifiedLinkages = new HashSet();
for (ModifiedCompound mc : modComps) {
identifiedLinkages.addAll(mc.getAtomLinkages());
}
// record
// cross link
int nRes = residues.size();
// NOTE(review): truncated loop header.
for (int i=0; i linkages = StructureUtil.findAtomLinkages(
group1, group2, true, bondLengthTolerance);
for (Atom[] atoms : linkages) {
StructureAtomLinkage link = StructureUtil.getStructureAtomLinkage(atoms[0],
true, atoms[1], true);
unidentifiableAtomLinkages.add(link);
}
}
}
// attachment
int nLig = ligands.size();
// NOTE(review): truncated loop header.
for (int i=0; i linkages = StructureUtil.findAtomLinkages(
group1, group2, false, bondLengthTolerance);
for (Atom[] atoms : linkages) {
StructureAtomLinkage link = StructureUtil.getStructureAtomLinkage(atoms[0],
true, atoms[1], false);
unidentifiableAtomLinkages.add(link);
}
}
}
}
/**
 * Records every residue of type HETATM that is not part of any identified
 * compound into the unidentifiableModifiedResidues field.
 *
 * @param modComps the identified compounds.
 */
private void recordUnidentifiableModifiedResidues(List<ModifiedCompound> modComps) {
    Set<StructureGroup> identifiedComps = new HashSet<StructureGroup>();
    for (ModifiedCompound mc : modComps) {
        identifiedComps.addAll(mc.getGroups(true));
    }

    // TODO: use the ModifiedAminoAcid after Andreas add that.
    for (Group group : residues) {
        // Constant-first comparison, so a null type is skipped rather than
        // raising a NullPointerException.
        if (!GroupType.HETATM.equals(group.getType())) {
            continue;
        }

        StructureGroup strucGroup = StructureUtil.getStructureGroup(group, true);
        strucGroup.setChainId(group.getChainId());

        if (!identifiedComps.contains(strucGroup)) {
            unidentifiableModifiedResidues.add(strucGroup);
        }
    }
}
/**
* Registers the given ligands and residues in saveTo under every
* modification component they can stand for. Nothing is returned; the
* result is written into saveTo.
*
* NOTE(review): the line that ends the N-terminal loop looks truncated in
* this copy of the file, and the generic type arguments in the signature
* and in the mapSingleMultiComps declaration are garbled.
*
* @param modifications a set of {@link ProteinModification}s.
* @param residues residues of the chain
* @param ligands ligands of the chain
* @param saveTo map from component to the corresponding groups; entries
* are created on demand.
* @throws IllegalArgumentException if modifications, residues or ligands is null.
*/
private void addModificationGroups(
final Set modifications,
final List residues,
final List ligands,
final Map> saveTo) {
if (residues==null || ligands==null || modifications==null) {
throw new IllegalArgumentException("Null argument(s).");
}
// Index: for each single PDBCC id (with the terminal flags of its
// component), the components of the conditions that list that id.
Map> mapSingleMultiComps = new HashMap>();
for (ProteinModification mod : modifications) {
ModificationCondition condition = mod.getCondition();
for (Component comp : condition.getComponents()) {
for (String pdbccId : comp.getPdbccIds()) {
Component single = Component.of(Collections.singleton(pdbccId),
comp.isNTerminal(), comp.isCTerminal());
Set mult = mapSingleMultiComps.get(single);
if (mult == null) {
mult = new HashSet();
mapSingleMultiComps.put(single, mult);
}
mult.add(comp);
}
}
}
{
// ligands: each ligand is registered under the components matching the
// wildcard "*" and those matching its own trimmed PDB name.
Set ligandsWildCard = mapSingleMultiComps.get(
Component.of("*"));
for (Group group : ligands) {
String pdbccId = group.getPDBName().trim();
Set comps = mapSingleMultiComps.get(
Component.of(pdbccId));
for (Component comp : unionComponentSet(ligandsWildCard, comps)) {
Set gs = saveTo.get(comp);
if (gs==null) {
gs = new LinkedHashSet();
saveTo.put(comp, gs);
}
gs.add(group);
}
}
}
{
// residues
if (residues.isEmpty()) {
return;
}
Set residuesWildCard = mapSingleMultiComps.get(
Component.of("*"));
// for all residues: same registration as for the ligands above
for (Group group : residues) {
String pdbccId = group.getPDBName().trim();
Set comps = mapSingleMultiComps.get(
Component.of(pdbccId));
for (Component comp : unionComponentSet(residuesWildCard, comps)) {
Set gs = saveTo.get(comp);
if (gs==null) {
gs = new LinkedHashSet();
saveTo.put(comp, gs);
}
gs.add(group);
}
}
// for N-terminal: walk forward from the first residue
int nRes = residues.size();
int iRes = 0;
Group res;
do {
// for all ligands on N terminal and the first residue
res = residues.get(iRes++);
Set nTermWildCard = mapSingleMultiComps.get(
Component.of("*", true, false));
// NOTE(review): unlike the lookups above, the PDB name is not trimmed here.
Set comps = mapSingleMultiComps.get(
Component.of(res.getPDBName(), true, false));
for (Component comp : unionComponentSet(nTermWildCard, comps)) {
Set gs = saveTo.get(comp);
if (gs==null) {
gs = new LinkedHashSet();
saveTo.put(comp, gs);
}
gs.add(res);
}
// NOTE(review): the next line looks truncated. The end of the N-terminal
// loop condition and the start of a second do-loop are missing; judging by
// the (false, true) flags and the "iRes>=0" condition below, that loop
// presumably handles the C-terminal end by walking backwards. Confirm
// against version control.
} while (iRes cTermWildCard = mapSingleMultiComps.get(
Component.of("*", false, true));
Set comps = mapSingleMultiComps.get(
Component.of(res.getPDBName(), false, true));
for (Component comp : unionComponentSet(cTermWildCard, comps)) {
Set gs = saveTo.get(comp);
if (gs==null) {
gs = new LinkedHashSet();
saveTo.put(comp, gs);
}
gs.add(res);
}
// Continue only while the residue just handled is also one of the ligands.
} while (iRes>=0 && ligands.contains(res));
}
}
/**
 * Returns the union of two component sets, either of which may be null.
 * <p>
 * A new set is only built when both arguments are non-null. If exactly one
 * is non-null, that very instance is returned, not a copy. If both are
 * null, an immutable empty set is returned.
 *
 * @param set1 first set, may be null.
 * @param set2 second set, may be null.
 * @return the union as described above; never null.
 */
private Set<Component> unionComponentSet(Set<Component> set1, Set<Component> set2) {
    if (set1 == null) {
        if (set2 == null) {
            return Collections.emptySet();
        }
        return set2;
    }

    if (set2 == null) {
        return set1;
    }

    Set<Component> union = new HashSet<Component>(set1.size() + set2.size());
    union.addAll(set1);
    union.addAll(set2);
    return union;
}
/**
* Get matched atoms for all linkages of a condition. For each linkage, the
* groups registered for its two components are paired and the nearest atom
* linkage of every pair that has one is collected. Collection stops at the
* first linkage without any match, so the returned list can hold fewer
* entries than the condition has linkages.
*
* NOTE(review): the loop header over the linkages looks truncated in this
* copy of the file; the declarations of linkage, comp1 and comp2 are not
* visible, and the generic type arguments are garbled.
*
* @param condition the condition whose linkages are matched
* @param mapCompGroups groups available for each component
* @return one list of matched atom pairs per linkage processed
*/
private List> getMatchedAtomsOfLinkages(
ModificationCondition condition, Map> mapCompGroups) {
List linkages = condition.getLinkages();
int nLink = linkages.size();
List> matchedAtomsOfLinkages =
new ArrayList>(nLink);
// NOTE(review): truncated loop header.
for (int iLink=0; iLink groups1 = mapCompGroups.get(comp1);
Set groups2 = mapCompGroups.get(comp2);
// Atom pairs matched for the current linkage.
List list = new ArrayList();
List potentialNamesOfAtomOnGroup1 = linkage.getPDBNameOfPotentialAtomsOnComponent1();
for (String name : potentialNamesOfAtomOnGroup1) {
if (name.equals("*")) {
// wildcard
potentialNamesOfAtomOnGroup1 = null; // search all atoms
break;
}
}
List potentialNamesOfAtomOnGroup2 = linkage.getPDBNameOfPotentialAtomsOnComponent2();
for (String name : potentialNamesOfAtomOnGroup2) {
if (name.equals("*")) {
// wildcard
potentialNamesOfAtomOnGroup2 = null; // search all atoms
break;
}
}
for (Group g1 : groups1) {
for (Group g2 : groups2) {
// A group is never linked to itself.
if (g1.equals(g2)) {
continue;
}
// only for wildcard match of two residues
boolean ignoreNCLinkage =
potentialNamesOfAtomOnGroup1 == null &&
potentialNamesOfAtomOnGroup2 == null &&
residues.contains(g1) &&
residues.contains(g2);
Atom[] atoms = StructureUtil.findNearestAtomLinkage(
g1, g2,
potentialNamesOfAtomOnGroup1,
potentialNamesOfAtomOnGroup2,
ignoreNCLinkage,
bondLengthTolerance);
if (atoms!=null) {
list.add(atoms);
}
}
}
if (list.isEmpty()) {
// broken linkage: stop here, leaving the result shorter than the linkage list
break;
}
matchedAtomsOfLinkages.add(list);
}
return matchedAtomsOfLinkages;
}
/** Assemble the matched linkages.
*
* @param matchedAtomsOfLinkages
* @param mod
* @param ret ModifiedCompound will be stored here
*/
private void assembleLinkages(List> matchedAtomsOfLinkages,
ProteinModification mod, List ret) {
ModificationCondition condition = mod.getCondition();
List modLinks = condition.getLinkages();
int nLink = matchedAtomsOfLinkages.size();
int[] indices = new int[nLink];
Set identifiedCompounds = new HashSet();
while (indices[0] atomLinkages = new ArrayList(nLink);
for (int iLink=0; iLink linkages = new ArrayList(n);
for (int i=0; i[0,0,2]=>[1,2,0])
int i = nLink-1;
while (i>=0) {
if (i==0 || indices[i] linkages,
List atomLinkages) {
int nLink = linkages.size();
if (nLink != atomLinkages.size()) {
return false;
}
for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy