org.biojava.nbio.structure.io.EntityFinder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-structure Show documentation
The protein structure modules of BioJava.
There is a newer version: 7.1.3
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
import org.biojava.nbio.alignment.template.GapPenalty;
import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Heuristical finding of Entities (called Compounds in legacy PDB format)
 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
 * in the Structure.
 *
 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
 * possibly no SEQRES is given.
 *
 * @author Jose Duarte
 */
public class EntityFinder {

	private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);

	/**
	 * Above this ratio of mismatching residue types for same residue numbers we
	 * consider the 2 chains to have mismatching residue numbers and warn about it
	 */
	public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;

	/**
	 * Identity value for 2 chains to be considered part of same entity
	 */
	public static final double IDENTITY_THRESHOLD = 0.99999;

	/**
	 * Gap coverage value (num gaps over length of sequence) for each chain of the match:
	 * 2 chains with more gap coverage than this value will not be considered part of the same entity
	 */
	public static final double GAP_COVERAGE_THRESHOLD = 0.3;




	/**
	 * Utility method that employs some heuristics to find the {@link EntityInfo}s
	 * for the polymeric chains given in constructor.
	 * To be used in case the information is missing in PDB/mmCIF file
	 * @return
	 */
	public static List findPolyEntities(List> polyModels) {
		TreeMap chainIds2entities = findEntitiesFromAlignment(polyModels);

		return findUniqueEntities(chainIds2entities);
	}

	/**
	 * Utility method to obtain a list of unique entities from the chainIds2entities map
	 * @return
	 */
	private static List findUniqueEntities(TreeMap chainIds2entities) {

		List list = new ArrayList<>();

		for (EntityInfo cluster:chainIds2entities.values()) {
			boolean present = false;
			for (EntityInfo cl:list) {
				if (cl==cluster) {
					present = true;
					break;
				}
			}
			if (!present) list.add(cluster);
		}
		return list;
	}

	/**
	 * Given all chains of all models find entities for the nonpolymers and water chains within them,
	 * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List.
	 * @param nonPolyModels
	 * @param waterModels
	 * @param entities
	 */
	public static void createPurelyNonPolyEntities(List> nonPolyModels, List> waterModels, List entities) {

		if (nonPolyModels.isEmpty()) return;

		// let's find first the max entity id to assign entity ids to the newly found entities
		int maxMolId = 0;
		if (!entities.isEmpty()) {
			maxMolId = Collections.max(entities, Comparator.comparingInt(EntityInfo::getMolId)).getMolId();
		}
		// we go one over the max
		int molId = maxMolId + 1;

		if (!nonPolyModels.get(0).isEmpty()) {
			List nonPolyEntities = new ArrayList<>();

			for (List model:nonPolyModels) {
				for (Chain c: model) {
					// we assume there's only 1 group per non-poly chain
					String molecPdbName = c.getAtomGroup(0).getPDBName();

					EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities);
					if (nonPolyEntity == null) {
						nonPolyEntity = new EntityInfo();
						nonPolyEntity.setDescription(molecPdbName);
						nonPolyEntity.setType(EntityType.NONPOLYMER);
						nonPolyEntity.setMolId(molId++);
						nonPolyEntities.add(nonPolyEntity);

					}
					nonPolyEntity.addChain(c);
					c.setEntityInfo(nonPolyEntity);
				}
			}
			entities.addAll(nonPolyEntities);
		}

		if (!waterModels.get(0).isEmpty()) {
			EntityInfo waterEntity = new EntityInfo();
			waterEntity.setType(EntityType.WATER);
			waterEntity.setDescription("water");
			waterEntity.setMolId(molId);

			for (List model:waterModels) {
				for (Chain waterChain:model) {
					waterEntity.addChain(waterChain);
					waterChain.setEntityInfo(waterEntity);
				}
			}
			entities.add(waterEntity);

		}

	}

	private static EntityInfo findNonPolyEntityWithDescription(String description, List nonPolyEntities) {
		for (EntityInfo e:nonPolyEntities) {
			if (e.getDescription().equals(description)) return e;
		}
		return null;
	}


	private static boolean areResNumbersAligned(Chain c1, Chain c2) {

		boolean isC1prot = c1.isProtein();
		boolean isC2prot = c2.isProtein();

		// different kind of chain: we won't try to align them
		if (isC1prot != isC2prot ) return false;

		List c1AtomGroups = null;
		if (isC1prot) {
			c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
		}
		else {
			c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
		}

		int countNonExisting = 0;

		for (Group g1:c1AtomGroups) {
			try {
				Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
				if (!g2.getPDBName().equals(g1.getPDBName())) {
					logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
							c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
					return false;
				}
			} catch (StructureException e) {
				// the group doesn't exist (no density) in the chain, go on
				countNonExisting++;
			}
		}

		if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
			logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
					RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId());
			return false;
		}

		return true;
	}



	private static TreeMap findEntitiesFromAlignment(List> polyModels) {

		TreeMap chainIds2entities = new TreeMap<>();

		if (polyModels.isEmpty()) return chainIds2entities;

		Set polyChainIndices = new TreeSet<>();
		for (int i=0;i(), false);

				for (int j:polyChainIndices) {

					if (j<=i) continue;

					Chain c2 = polyModels.get(0).get(j);
					String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), new HashMap<>(), false);

					int seq1Length = 0;
					int seq2Length = 0;

					SequencePair pair;
					if (isProteinSequence(str1) && isProteinSequence(str2)) {
						ProteinSequence s1 = getProteinSequence(str1);
						ProteinSequence s2 = getProteinSequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignProtein(s1,s2);

					} else if (isDNASequence(str1) && isDNASequence(str2)) {
						DNASequence s1 = getDNASequence(str1);
						DNASequence s2 = getDNASequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignDNA(s1,s2);

					} else if (isRNASequence(str1) && isRNASequence(str2)) {
						RNASequence s1 = getRNASequence(str1);
						RNASequence s2 = getRNASequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignRNA(s1,s2);

					} else {
						logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers", c1.getId(), c2.getId());
						continue;
					}

					int numGaps = getNumGaps(pair);
					int numGaps1 = getNumGapsQuery(pair);
					int numGaps2 = getNumGapsTarget(pair);

					int nonGaps = pair.getLength() - numGaps;

					double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
					double gapCov1 = (double) numGaps1 / (double) seq1Length;
					double gapCov2 = (double) numGaps2 / (double) seq2Length;

					logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
							c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
					logger.debug("\n{}", pair.toString(100));

					if (identity > IDENTITY_THRESHOLD && gapCov11) {
						logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
								c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity);
						logger.warn("\n"+pair.toString(100));
					}

					if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities
						break outer;
				}
			}

		// anything not in an Entity will be its own Entity
		for (int i:polyChainIndices) {
			Chain c = polyModels.get(0).get(i);
			if (!chainIds2entities.containsKey(c.getId())) {
				logger.debug("Creating a 1-member entity for chain {}",c.getId());

				EntityInfo ent = new EntityInfo();
				ent.addChain(c);
				ent.setMolId(molId++);
				ent.setType(EntityType.POLYMER);
				c.setEntityInfo(ent);

				chainIds2entities.put(c.getId(),ent);
			}
		}

		// now that we've found the entities for first model, let's do the same for the rest of the models

		for (int i=1;i alignProtein(ProteinSequence s1, ProteinSequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getIdentity();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}

	private static SequencePair alignDNA(DNASequence s1, DNASequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getNuc4_4();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}

	private static SequencePair alignRNA(RNASequence s1, RNASequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getNuc4_4();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}



	private static int getNumGaps(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if (pair.hasGap(alignmentIndex)) numGaps++;
		}
		return numGaps;
	}

	private static int getNumGapsQuery(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if ("-".equals(pair.getCompoundInQueryAt(alignmentIndex).getShortName())) {
				numGaps++;
			}
		}
		return numGaps;
	}

	private static int getNumGapsTarget(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if ("-".equals(pair.getCompoundInTargetAt(alignmentIndex).getShortName())) {
				numGaps++;
			}
		}
		return numGaps;
	}

	private static boolean isProteinSequence(String str) {
		try {
			new ProteinSequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;
	}

	private static boolean isDNASequence(String str) {
		try {
			new DNASequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;
	}

	private static boolean isRNASequence(String str) {
		try {
			new RNASequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;

	}

	/**
	 * Returns the ProteinSequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private static ProteinSequence getProteinSequence(String str) {
		try {
			ProteinSequence s = new ProteinSequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating ProteinSequence",e);
		}
		return null;
	}

	/**
	 * Returns the DNASequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private static DNASequence getDNASequence(String str) {
		try {
			DNASequence s = new DNASequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating DNASequence ",e);
		}
		return null;
	}

	/**
	 * Returns the RNASequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private static RNASequence getRNASequence(String str) {
		try {
			RNASequence s = new RNASequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating RNASequence ",e);
		}
		return null;
	}

}