org.biojava.nbio.protmod.structure.ProteinModificationIdentifier Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-modfinder Show documentation
There is a newer version: 7.1.3
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jun 12, 2010
 * Author: Jianjiong Gao
 *
 */

package org.biojava.nbio.protmod.structure;

import org.biojava.nbio.protmod.*;
import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 * Identifies protein modifications (attachments and cross-links) in a 3-D structure.
 *
 * @author Jianjiong Gao
 * @since 3.0
 */
public class ProteinModificationIdentifier {

	private static final Logger logger = LoggerFactory.getLogger(ProteinModificationIdentifier.class);

	// Tolerance (in Angstroms) applied to covalent bond lengths when deciding
	// whether two atoms are linked; the constructor sets it to 0.4.
	private double bondLengthTolerance ;
	// When true, identify() also records linkages and modified residues that no
	// identified compound covers; the constructor sets it to false.
	private boolean recordUnidentifiableModifiedCompounds ;
	// When true, identify() also looks for ligands attached to already identified
	// groups; the constructor sets it to true.
	private boolean recordAdditionalAttachments ;

	// Results of the most recent identify() call. They are (re)allocated by
	// reset(); the two "unidentifiable" sets are only allocated while
	// recordUnidentifiableModifiedCompounds is true.
	// NOTE(review): the declarations are raw here; from their use the sets appear to
	// hold ModifiedCompound, StructureAtomLinkage and StructureGroup respectively.
	private Set identifiedModifiedCompounds = null;
	private Set unidentifiableAtomLinkages = null;
	private Set unidentifiableModifiedResidues = null;

	/**
	 * Amino-acid groups collected during the current call of identify().
	 * Replaced with a fresh list on every call that gets past argument checks.
	 */
	private List residues;


	/**
	 * Creates an identifier with the default settings: a bond length
	 * tolerance of 0.4, no recording of unidentifiable compounds, and
	 * recording of additional attachments switched on.
	 */
	public ProteinModificationIdentifier(){
		this.bondLengthTolerance = 0.4;
		this.recordUnidentifiableModifiedCompounds = false;
		this.recordAdditionalAttachments = true;

		// Start out with an empty result set.
		reset();
	}


	/**
	 * Releases the results of the last identification run.
	 * <p>
	 * Every result set that exists is emptied and then dropped. Afterwards the
	 * result getters report that no result is available until identify() is
	 * called again.
	 */
	public void destroy(){
		if (identifiedModifiedCompounds != null) {
			identifiedModifiedCompounds.clear();
		}
		if (unidentifiableAtomLinkages != null) {
			unidentifiableAtomLinkages.clear();
		}
		if (unidentifiableModifiedResidues != null) {
			unidentifiableModifiedResidues.clear();
		}

		// Drop all three references. Previously unidentifiableAtomLinkages was
		// nulled twice and identifiedModifiedCompounds was never released.
		identifiedModifiedCompounds = null;
		unidentifiableAtomLinkages = null;
		unidentifiableModifiedResidues = null;
	}

	/**
	 * Sets the tolerance used when deciding whether two atoms are bonded.
	 *
	 * @param bondLengthTolerance tolerance of error (in Angstroms) of the
	 *  covalent bond length, when calculating the atom distance threshold.
	 *  Zero is allowed.
	 * @throws IllegalArgumentException if the value is negative or NaN.
	 */
	public void setbondLengthTolerance(final double bondLengthTolerance) {
		// "!(x >= 0)" rejects NaN as well, which a plain "x < 0" lets through.
		if (!(bondLengthTolerance >= 0)) {
			throw new IllegalArgumentException("bondLengthTolerance " +
					"must not be negative.");
		}
		this.bondLengthTolerance = bondLengthTolerance;
	}

	/**
	 * Switches the recording of unidentifiable atom linkages and modified
	 * residues on or off. This setter only stores the flag; results that
	 * were already computed are left untouched.
	 *
	 * @param recordUnidentifiableModifiedCompounds true to record what could
	 *  not be identified; false to skip that step.
	 * @see #getRecordUnidentifiableCompounds
	 * @see #getUnidentifiableModifiedResidues
	 * @see #getUnidentifiableAtomLinkages
	 */
	public void setRecordUnidentifiableCompounds(final boolean recordUnidentifiableModifiedCompounds) {
		this.recordUnidentifiableModifiedCompounds = recordUnidentifiableModifiedCompounds;
	}

	/**
	 * Reports whether unidentifiable atom linkages and modified residues
	 * are being recorded.
	 *
	 * @return the flag last stored by
	 *  {@link #setRecordUnidentifiableCompounds}.
	 * @see #getUnidentifiableModifiedResidues
	 * @see #getUnidentifiableAtomLinkages
	 */
	public boolean getRecordUnidentifiableCompounds() {
		return this.recordUnidentifiableModifiedCompounds;
	}

	/**
	 * Switches the search for additional attachments on or off.
	 *
	 * @param recordAdditionalAttachments true to also record attachments
	 *  that are not directly attached to a modified residue; false to
	 *  skip them.
	 * @see #getRecordAdditionalAttachments
	 */
	public void setRecordAdditionalAttachments(final boolean recordAdditionalAttachments) {
		this.recordAdditionalAttachments = recordAdditionalAttachments;
	}

	/**
	 * Reports whether attachments that are not directly attached to a
	 * modified residue are being recorded.
	 *
	 * @return the flag last stored by
	 *  {@link #setRecordAdditionalAttachments}.
	 */
	public boolean getRecordAdditionalAttachments() {
		return this.recordAdditionalAttachments;
	}

	/**
	 * Returns the compounds found by the most recent identification run.
	 * The internal set itself is returned, not a copy.
	 *
	 * @return a set of identified {@link ModifiedCompound}s from
	 *  the last identify() result.
	 * @throws IllegalStateException if no result is available.
	 * @see ModifiedCompound
	 */
	public Set getIdentifiedModifiedCompound() {
		if (identifiedModifiedCompounds==null) {
			// The message used to point at parse(); the entry point of this class is identify().
			throw new IllegalStateException("No result available. Please call identify() first.");
		}

		return identifiedModifiedCompounds;
	}

	/**
	 * Returns the atom linkages that the identified compounds of the most
	 * recent run did not cover. The internal set itself is returned, not a
	 * copy.
	 *
	 * @return a set of atom linkages, which represent the
	 *  atom bonds that were not covered by the identified
	 *  {@link ModifiedCompound}s from the last identify() result.
	 *  Each element is a {@link StructureAtomLinkage}.
	 * @throws UnsupportedOperationException if recording of unidentifiable
	 *  compounds is switched off.
	 * @throws IllegalStateException if no result is available, for example
	 *  because recording was only switched on after the last identify() call.
	 * @see StructureAtomLinkage
	 * @see #setRecordUnidentifiableCompounds
	 */
	public Set getUnidentifiableAtomLinkages() {
		if (!recordUnidentifiableModifiedCompounds) {
			throw new UnsupportedOperationException("Recording unidentified atom linkages " +
					"is not supported. Please setRecordUnidentifiableCompounds(true) first.");
		}

		// The linkage set only exists if recording was already enabled when the
		// last run started; fail clearly instead of handing out null.
		if (identifiedModifiedCompounds==null || unidentifiableAtomLinkages==null) {
			throw new IllegalStateException("No result available. Please call identify() first.");
		}

		return unidentifiableAtomLinkages;
	}

	/**
	 * Returns the modified residues that the identified compounds of the most
	 * recent run did not cover. The internal set itself is returned, not a
	 * copy.
	 *
	 * @return a set of modified residues that were not covered by
	 *  the identified ModifiedCompounds from the last identify()
	 *  result.
	 * @throws UnsupportedOperationException if recording of unidentifiable
	 *  compounds is switched off.
	 * @throws IllegalStateException if no result is available, for example
	 *  because recording was only switched on after the last identify() call.
	 * @see StructureGroup
	 * @see #setRecordUnidentifiableCompounds
	 * @see #getIdentifiedModifiedCompound
	 */
	public Set getUnidentifiableModifiedResidues() {
		if (!recordUnidentifiableModifiedCompounds) {
			throw new UnsupportedOperationException("Recording unidentified modified residues " +
					"is not supported. Please setRecordUnidentifiableCompounds(true) first.");
		}

		// The residue set only exists if recording was already enabled when the
		// last run started; fail clearly instead of handing out null.
		if (identifiedModifiedCompounds==null || unidentifiableModifiedResidues==null) {
			throw new IllegalStateException("No result available. Please call identify() first.");
		}

		return unidentifiableModifiedResidues;
	}

	/**
	 * Searches a whole structure for every modification returned by
	 * {@link ProteinModificationRegistry#allModifications()}.
	 *
	 * @param structure query {@link Structure}.
	 */
	public void identify(final Structure structure) {
		// Same as the two-argument form with all registered modifications as candidates.
		identify(structure, ProteinModificationRegistry.allModifications());
	}

	/**
	 * Searches the chains of a structure for the given modifications.
	 *
	 * @param structure query {@link Structure}; must not be null.
	 * @param potentialModifications query {@link ProteinModification}s.
	 * @throws IllegalArgumentException if {@code structure} is null.
	 */
	public void identify(final Structure structure, final Set potentialModifications) {
		if (null == structure) {
			throw new IllegalArgumentException("Null structure.");
		}

		// The chain-based overload does the actual work.
		identify(structure.getChains(), potentialModifications);
	}

	/**
	 * Searches a single chain for every registered modification.
	 *
	 * @param chain query {@link Chain}.
	 */
	public void identify(final Chain chain) {
		// Wrap the chain in a one-element list and reuse the list-based overload.
		identify(Collections.singletonList(chain));
	}

	/**
	 * Searches several chains for every modification returned by
	 * {@link ProteinModificationRegistry#allModifications()}.
	 *
	 * @param chains query {@link Chain}s.
	 */
	public void identify(final List chains) {
		// Same as the two-argument form with all registered modifications as candidates.
		identify(chains, ProteinModificationRegistry.allModifications());
	}

	/**
	 * Searches a single chain for the given modifications.
	 *
	 * @param chain query {@link Chain}.
	 * @param potentialModifications query {@link ProteinModification}s.
	 */
	public void identify(final Chain chain, final Set potentialModifications)  {
		// Wrap the chain in a one-element list and reuse the list-based overload.
		identify(Collections.singletonList(chain), potentialModifications);
	}

	/**
	 * Identify a set of modifications in a list of chains. This is the
	 * overload that does the actual work; the other identify() variants
	 * delegate to it. Results of any earlier call are discarded first.
	 *
	 * @param chains query {@link Chain}s.
	 * @param potentialModifications query {@link ProteinModification}s.
	 * @throws IllegalArgumentException if either argument is null.
	 */
	public void identify(final List chains,
			final Set potentialModifications) {

		// NOTE(review): the message below says "structure" although the null
		// argument is the chain list.
		if (chains==null) {
			throw new IllegalArgumentException("Null structure.");
		}

		if (potentialModifications==null) {
			throw new IllegalArgumentException("Null potentialModifications.");
		}


		// Discard the results of any previous call.
		reset();

		// Nothing to look for: leave the (now empty) results as they are.
		if (potentialModifications.isEmpty()) {
			return;
		}


		residues = new ArrayList();
		List ligands = new ArrayList();
		// NOTE(review): the type arguments on the next line look truncated
		// ("Map>" / "HashMap>" does not parse). From its use below the map appears
		// to go from a Component to the set of Groups matching it - confirm
		// against the intact file.
		Map> mapCompGroups = new HashMap>();

		// Collect amino acids and ligands of every chain and index them by the
		// modification components they can stand for.
		for (Chain chain : chains) {

			List ress = StructureUtil.getAminoAcids(chain);

			//List ligs = chain.getAtomLigands();
			List ligs = StructureTools.filterLigands(chain.getAtomGroups());
			residues.addAll(ress);
			// Groups reported as ligands are taken out of the residue list again.
			residues.removeAll(ligs);
			ligands.addAll(ligs);
			addModificationGroups(potentialModifications, ress, ligs, mapCompGroups);
		}

		// Only a warning: processing continues even without amino acids.
		if (residues.isEmpty()) {
			String pdbId = "?";
			if ( chains.size() > 0) {
				Structure struc = chains.get(0).getStructure();
				if ( struc != null)
					pdbId = struc.getPDBCode();
			}
			logger.warn("No amino acids found for {}. Either you did not parse the PDB file with alignSEQRES records, or this record does not contain any amino acids.", pdbId);
		}
		List modComps = new ArrayList();

		for (ProteinModification mod : potentialModifications) {
			ModificationCondition condition = mod.getCondition();
			List components = condition.getComponents();
			if (!mapCompGroups.keySet().containsAll(components)) {
				// not all components exist for this mod.
				continue;
			}

			// Single-component modifications need no linkage search;
			// everything else is matched through its linkages.
			int sizeComps = components.size();
			if (sizeComps==1) {

				processCrosslink1(mapCompGroups, modComps, mod, components);

			} else {

				processMultiCrosslink(mapCompGroups, modComps, mod, condition);
			}
		}

		if (recordAdditionalAttachments) {
			// identify additional groups that are not directly attached to amino acids.
			for (ModifiedCompound mc : modComps) {
				identifyAdditionalAttachments(mc, ligands, chains);
			}
		}

		mergeModComps(modComps);

		identifiedModifiedCompounds.addAll(modComps);


		// record unidentifiable linkage
		if (recordUnidentifiableModifiedCompounds) {
			recordUnidentifiableAtomLinkages(modComps, ligands);
			recordUnidentifiableModifiedResidues(modComps);
		}
	}

	/**
	 * Prepares fresh result containers for a new identification run.
	 * The set of identified compounds is always replaced; the two
	 * "unidentifiable" sets are replaced only while recording of
	 * unidentifiable compounds is switched on and are otherwise left
	 * as they were.
	 */
	private void reset() {
		identifiedModifiedCompounds = new LinkedHashSet();

		if (!recordUnidentifiableModifiedCompounds) {
			return;
		}

		unidentifiableAtomLinkages = new LinkedHashSet();
		unidentifiableModifiedResidues = new LinkedHashSet();
	}

	/**
	 * Handles a modification whose condition has more than one component:
	 * looks up matching atoms for each linkage of the condition and, if every
	 * linkage was matched, lets assembleLinkages() add the resulting compounds
	 * to {@code modComps}.
	 * <p>
	 * NOTE(review): the type arguments of {@code mapCompGroups} and
	 * {@code matchedAtomsOfLinkages} look truncated in this copy
	 * ("Map>" / "List>") - confirm against the intact file.
	 *
	 * @param mapCompGroups groups available for each component.
	 * @param modComps list that receives the identified compounds.
	 * @param mod the modification being looked for.
	 * @param condition the condition of {@code mod}.
	 */
	private void processMultiCrosslink(
			Map> mapCompGroups,
			List modComps, ProteinModification mod,
			ModificationCondition condition) {
		// for multiple components

		// find linkages first
		List> matchedAtomsOfLinkages =
				getMatchedAtomsOfLinkages(condition, mapCompGroups);

		// Fewer matches than linkages means at least one linkage could not be
		// matched, so this modification is not present.
		if (matchedAtomsOfLinkages.size() != condition.getLinkages().size()) {
			return;
		}

		assembleLinkages(matchedAtomsOfLinkages, mod, modComps);

	}

	/**
	 * Handles a modification whose condition has exactly one component: every
	 * group registered for that component becomes its own modified compound
	 * and is appended to {@code modComps}.
	 * <p>
	 * NOTE(review): the type arguments of {@code mapCompGroups} look truncated
	 * in this copy ("Map>") - confirm against the intact file.
	 *
	 * @param mapCompGroups groups available for each component.
	 * @param modComps list that receives the identified compounds.
	 * @param mod the modification being looked for.
	 * @param components the components of the modification's condition; only
	 *  the first one is read.
	 */
	private void processCrosslink1(Map> mapCompGroups,
			List modComps, ProteinModification mod,
			List components) {
		// modified residue
		// TODO: is this the correct logic for CROSS_LINK_1?
		Set modifiedResidues = mapCompGroups.get(components.get(0));
		if (modifiedResidues != null) {
			for (Group residue : modifiedResidues) {
				StructureGroup strucGroup = StructureUtil.getStructureGroup(residue, true);
				ModifiedCompound modRes = new ModifiedCompoundImpl(mod, strucGroup);
				modComps.add(modRes);
			}
		}
	}

	/**
	 * Identify additional groups that are not directly attached to amino acids.
	 * Atom linkages that are found are added to {@code mc} itself; nothing is
	 * returned.
	 *
	 * @param mc {@link ModifiedCompound} to extend.
	 * @param ligands candidate ligand {@link Group}s; nothing is done if empty.
	 * @param chains List of {@link Chain}s used to resolve the groups of
	 *  {@code mc}.
	 */
	private void identifyAdditionalAttachments(ModifiedCompound mc,
			List ligands, List chains) {
		if (ligands.isEmpty()) {
			return;
		}

		// TODO: should the additional groups only be allowed to the identified
		// ligands or both amino acids and ligands? Currently only on ligands
		// ligands to amino acid bonds for same modification of unknown category
		// will be combined in mergeModComps()
		// TODO: how about chain-chain links?
		// Resolve the groups of the compound (getGroups(false)) to Group objects.
		List identifiedGroups = new ArrayList();
		for (StructureGroup num : mc.getGroups(false)) {
			Group group;
			try {
				//String numIns = "" + num.getResidueNumber();
				//if (num.getInsCode() != null) {
				//	numIns += num.getInsCode();
				//}
				// NOTE(review): resNum is filled in here but not read again in the
				// visible code; the lookup below goes through getGroup().
				ResidueNumber resNum = new ResidueNumber();
				resNum.setChainName(num.getChainId());
				resNum.setSeqNum(num.getResidueNumber());
				resNum.setInsCode(num.getInsCode());
				//group = chain.getGroupByPDB(numIns);

				group = getGroup(num,chains);
				//group = mapChainIdChain.get(num.getChainId()).getGroupByPDB(resNum);
			} catch (StructureException e) {
				logger.error("Exception: ", e);
				// should not happen
				continue;
			}
			identifiedGroups.add(group);
		}

		int start = 0;

		// Repeat while the previous pass appended new groups: each pass only
		// looks at the groups added since "start".
		int n = identifiedGroups.size();
		while (n > start) {
			for (Group group1 : ligands) {
				// NOTE(review): text is missing from the next line in this copy (the
				// loop condition and the statements up to the declaration of
				// linkedAtoms are gone) - restore from the intact file before
				// relying on this block.
				for (int i=start; i linkedAtoms = StructureUtil.findAtomLinkages(
								group1, group2, false, bondLengthTolerance);
						if (!linkedAtoms.isEmpty()) {
							for (Atom[] atoms : linkedAtoms) {
								mc.addAtomLinkage(StructureUtil.getStructureAtomLinkage(atoms[0],
										false, atoms[1], false));
							}
							identifiedGroups.add(group1);
							break;
						}
					}
				}
			}

			start = n;
			n = identifiedGroups.size();
		}
	}

	/**
	 * Looks up the {@link Group} that a {@link StructureGroup} refers to.
	 * The first chain whose id equals the chain id of {@code num} is asked
	 * for the group with the same sequence number and insertion code.
	 *
	 * @param num the group to look up.
	 * @param chains the chains to search.
	 * @return the result of the chain's lookup by residue number.
	 * @throws StructureException if no chain has a matching id, or if the
	 *  chain's own lookup throws.
	 */
	private Group getGroup(StructureGroup num, List chains) throws StructureException {
		for (Chain candidate : chains){
			boolean sameChain = candidate.getId().equals(num.getChainId());
			if (!sameChain) {
				continue;
			}

			// Rebuild the residue number from sequence number and insertion code.
			ResidueNumber wanted = new ResidueNumber();
			wanted.setSeqNum(num.getResidueNumber());
			wanted.setInsCode(num.getInsCode());

			return candidate.getGroupByPDB(wanted);
		}

		throw new StructureException("Could not find residue " + num);
	}

	/**
	 * Merge identified modified compounds if linked. Entries that were merged
	 * away are removed from {@code modComps} in place.
	 * <p>
	 * NOTE(review): most of this method's body is missing in this copy (the
	 * loop headers and the merge logic between them are gone) - restore from
	 * the intact file before relying on this block.
	 */
	private void mergeModComps(List modComps) {
		// Indices of entries to drop.
		TreeSet remove = new TreeSet();
		int n = modComps.size();
		for (int icurr=1; icurr merging = new ArrayList();
			int ipre = 0;
			for (; ipre it = remove.descendingIterator();
		// Walk the indices from highest to lowest so that removing one entry does
		// not shift the positions of the entries still to be removed.
		while (it.hasNext()) {
			modComps.remove(it.next().intValue());
		}
	}

	/**
	 * Record unidentifiable atom linkages in a chain. Only linkages between two
	 * residues or one residue and one ligand will be recorded. Results go into
	 * the unidentifiableAtomLinkages field.
	 * <p>
	 * NOTE(review): the headers of both nested loops are missing in this copy
	 * (everything between "i" and "linkages" is gone) - restore from the
	 * intact file before relying on this block.
	 *
	 * @param modComps the compounds identified so far.
	 * @param ligands the ligand groups collected for the current run.
	 */
	private void recordUnidentifiableAtomLinkages(List modComps,
			List ligands) {

		// first put identified linkages in a set for fast query
		// NOTE(review): identifiedLinkages is not consulted anywhere in the lines
		// visible here; every linkage found below is added - confirm whether a
		// contains() check is intended.
		Set identifiedLinkages = new HashSet();
		for (ModifiedCompound mc : modComps) {
			identifiedLinkages.addAll(mc.getAtomLinkages());
		}

		// record
		// cross link
		int nRes = residues.size();
		for (int i=0; i linkages = StructureUtil.findAtomLinkages(
						group1, group2, true, bondLengthTolerance);
				for (Atom[] atoms : linkages) {
					StructureAtomLinkage link = StructureUtil.getStructureAtomLinkage(atoms[0],
							true, atoms[1], true);
					unidentifiableAtomLinkages.add(link);
				}
			}
		}

		// attachment
		int nLig = ligands.size();
		for (int i=0; i linkages = StructureUtil.findAtomLinkages(
						group1, group2, false, bondLengthTolerance);
				for (Atom[] atoms : linkages) {
					StructureAtomLinkage link = StructureUtil.getStructureAtomLinkage(atoms[0],
							true, atoms[1], false);
					unidentifiableAtomLinkages.add(link);
				}
			}
		}
	}

	/**
	 * Records the residues of type HETATM that none of the given compounds
	 * contains. Results go into the unidentifiableModifiedResidues field.
	 *
	 * @param modComps the compounds identified so far.
	 */
	private void recordUnidentifiableModifiedResidues(List modComps) {
		// Everything that already belongs to an identified compound.
		Set coveredGroups = new HashSet();
		for (ModifiedCompound compound : modComps) {
			coveredGroups.addAll(compound.getGroups(true));
		}

		// TODO: use the ModifiedAminoAcid after Andreas add that.
		for (Group candidate : residues) {
			if (!candidate.getType().equals(GroupType.HETATM)) {
				continue;
			}

			StructureGroup key = StructureUtil.getStructureGroup(candidate, true);
			key.setChainId(candidate.getChainId());

			if (coveredGroups.contains(key)) {
				continue;
			}
			unidentifiableModifiedResidues.add(key);
		}
	}

	/**
	 * Registers, for every component used by the given modifications, the
	 * residues and ligands that can stand for it. Nothing is returned; the
	 * result is accumulated in {@code saveTo}.
	 * <p>
	 * NOTE(review): several type arguments look truncated in this copy
	 * ("Map>" / "HashMap>"), and the end of the N-terminal loop together with
	 * the start of the C-terminal loop is missing - restore from the intact
	 * file before relying on this block.
	 *
	 * @param modifications a set of {@link ProteinModification}s.
	 * @param residues amino-acid groups of one chain.
	 * @param ligands ligand groups of the same chain.
	 * @param saveTo save result to; maps a component to the groups recorded
	 *  for it, and is extended in place.
	 * @throws IllegalArgumentException if {@code residues}, {@code ligands}
	 *  or {@code modifications} is null.
	 */
	private void addModificationGroups(
			final Set modifications,
			final List residues,
			final List ligands,
			final Map> saveTo) {
		if (residues==null || ligands==null || modifications==null) {
			throw new IllegalArgumentException("Null argument(s).");
		}

		// Index: single-id component -> all (possibly multi-id) components of the
		// modifications that list that id, keeping the terminal flags.
		Map> mapSingleMultiComps = new HashMap>();
		for (ProteinModification mod : modifications) {
			ModificationCondition condition = mod.getCondition();
			for (Component comp : condition.getComponents()) {
				for (String pdbccId : comp.getPdbccIds()) {
					Component single = Component.of(Collections.singleton(pdbccId),
							comp.isNTerminal(), comp.isCTerminal());
					Set mult = mapSingleMultiComps.get(single);
					if (mult == null) {
						mult = new HashSet();
						mapSingleMultiComps.put(single, mult);
					}
					mult.add(comp);
				}
			}
		}

		{
			// ligands
			// Components registered under the "*" id apply to every ligand.
			Set ligandsWildCard = mapSingleMultiComps.get(
					Component.of("*"));
			for (Group group : ligands) {
				String pdbccId = group.getPDBName().trim();
				Set comps = mapSingleMultiComps.get(
						Component.of(pdbccId));

				for (Component comp : unionComponentSet(ligandsWildCard, comps)) {
					Set gs = saveTo.get(comp);
					if (gs==null) {
						gs = new LinkedHashSet();
						saveTo.put(comp, gs);
					}
					gs.add(group);
				}
			}
		}

		{
			// residues
			// Without residues there is nothing more to register; the ligand
			// entries added above are kept.
			if (residues.isEmpty()) {
				return;
			}

			Set residuesWildCard = mapSingleMultiComps.get(
					Component.of("*"));

			// for all residues
			for (Group group : residues) {
				String pdbccId = group.getPDBName().trim();
				Set comps = mapSingleMultiComps.get(
						Component.of(pdbccId));

				for (Component comp : unionComponentSet(residuesWildCard, comps)) {
					Set gs = saveTo.get(comp);
					if (gs==null) {
						gs = new LinkedHashSet();
						saveTo.put(comp, gs);
					}
					gs.add(group);
				}
			}

			// for N-terminal
			int nRes = residues.size();
			int iRes = 0;
			Group res;
			do {
				// for all ligands on N terminal and the first residue
				res = residues.get(iRes++);

				Set nTermWildCard = mapSingleMultiComps.get(
						Component.of("*", true, false));

				Set comps = mapSingleMultiComps.get(
						Component.of(res.getPDBName(), true, false));

				for (Component comp : unionComponentSet(nTermWildCard, comps)) {
					Set gs = saveTo.get(comp);
					if (gs==null) {
						gs = new LinkedHashSet();
						saveTo.put(comp, gs);
					}
					gs.add(res);
				}
			// NOTE(review): text is missing from the next line in this copy. What
			// follows uses Component.of(..., false, true), so it looks like the
			// matching C-terminal loop - confirm against the intact file.
			} while (iRes cTermWildCard = mapSingleMultiComps.get(
						Component.of("*", false, true));

				Set comps = mapSingleMultiComps.get(
						Component.of(res.getPDBName(), false, true));

				for (Component comp : unionComponentSet(cTermWildCard, comps)) {
					Set gs = saveTo.get(comp);
					if (gs==null) {
						gs = new LinkedHashSet();
						saveTo.put(comp, gs);
					}
					gs.add(res);
				}
			} while (iRes>=0 && ligands.contains(res));
		}
	}

	/**
	 * Combines two sets, either of which may be null.
	 *
	 * @param set1 first set, or null.
	 * @param set2 second set, or null.
	 * @return an empty set if both arguments are null; the other argument
	 *  itself (not a copy) if exactly one is null; otherwise a new set
	 *  holding the elements of both.
	 */
	private Set unionComponentSet(Set set1, Set set2) {
		if (set1 == null) {
			if (set2 == null) {
				return Collections.emptySet();
			}
			return set2;
		}

		if (set2 == null) {
			return set1;
		}

		// Both present: copy into a fresh set so neither input is modified.
		Set merged = new HashSet(set1.size()+set2.size());
		merged.addAll(set1);
		merged.addAll(set2);
		return merged;
	}

	/**
	 * Get matched atoms for all linkages of a condition.
	 * <p>
	 * For each linkage, every pair of distinct groups taken from the groups of
	 * its two components is tested, and the atom pair found for the pair of
	 * groups (if any) is collected. The search stops at the first linkage with
	 * no match at all, so the returned list is shorter than the number of
	 * linkages when the condition cannot be satisfied.
	 * <p>
	 * NOTE(review): the type arguments in the signature and the header of the
	 * outer loop are truncated in this copy (the declarations of
	 * {@code linkage}, {@code comp1} and {@code comp2} are gone) - restore
	 * from the intact file before relying on this block.
	 *
	 * @param condition the condition whose linkages are matched.
	 * @param mapCompGroups groups available for each component.
	 * @return one list of matched atom pairs per linkage, in linkage order.
	 */
	private List> getMatchedAtomsOfLinkages(
			ModificationCondition condition, Map> mapCompGroups) {
		List linkages = condition.getLinkages();
		int nLink = linkages.size();

		List> matchedAtomsOfLinkages =
				new ArrayList>(nLink);

		for (int iLink=0; iLink groups1 = mapCompGroups.get(comp1);
			Set groups2 = mapCompGroups.get(comp2);

			// Atom pairs matched for this linkage.
			List list = new ArrayList();

			List potentialNamesOfAtomOnGroup1 = linkage.getPDBNameOfPotentialAtomsOnComponent1();
			for (String name : potentialNamesOfAtomOnGroup1) {
				if (name.equals("*")) {
					// wildcard
					potentialNamesOfAtomOnGroup1 = null; // search all atoms
					break;
				}
			}

			List potentialNamesOfAtomOnGroup2 = linkage.getPDBNameOfPotentialAtomsOnComponent2();
			for (String name : potentialNamesOfAtomOnGroup2) {
				if (name.equals("*")) {
					// wildcard
					potentialNamesOfAtomOnGroup2 = null; // search all atoms
					break;
				}
			}

			for (Group g1 : groups1) {
				for (Group g2 : groups2) {
					// A group is never linked to itself.
					if (g1.equals(g2)) {
						continue;
					}

					// only for wildcard match of two residues
					boolean ignoreNCLinkage =
						potentialNamesOfAtomOnGroup1 == null &&
						potentialNamesOfAtomOnGroup2 == null &&
						residues.contains(g1) &&
						residues.contains(g2);

					Atom[] atoms = StructureUtil.findNearestAtomLinkage(
							g1, g2,
							potentialNamesOfAtomOnGroup1,
							potentialNamesOfAtomOnGroup2,
							ignoreNCLinkage,
							bondLengthTolerance);
					if (atoms!=null) {
						list.add(atoms);
					}
				}
			}

			if (list.isEmpty()) {
				// broken linkage
				// Stop early; the caller sees a result shorter than the number of linkages.
				break;
			}

			matchedAtomsOfLinkages.add(list);
		}

		return matchedAtomsOfLinkages;
	}

	/** Assembly the matched linkages
	 *
	 * @param matchedAtomsOfLinkages
	 * @param mod
	 * @param ret ModifiedCompound will be stored here
     */
	private void assembleLinkages(List> matchedAtomsOfLinkages,
			ProteinModification mod, List ret) {
		ModificationCondition condition = mod.getCondition();
		List modLinks = condition.getLinkages();

		int nLink = matchedAtomsOfLinkages.size();
		int[] indices = new int[nLink];
		Set identifiedCompounds = new HashSet();
		while (indices[0] atomLinkages = new ArrayList(nLink);
			for (int iLink=0; iLink linkages = new ArrayList(n);
				for (int i=0; i[0,0,2]=>[1,2,0])
			int i = nLink-1;
			while (i>=0) {
				if (i==0 || indices[i] linkages,
			List atomLinkages) {
		int nLink = linkages.size();
		if (nLink != atomLinkages.size()) {
			return false;
		}
		for (int i=0; i