org.biojava.nbio.structure.io.EntityFinder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-structure Show documentation
The protein structure modules of BioJava.
There is a newer version: 7.2.2
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
import org.biojava.nbio.alignment.template.GapPenalty;
import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Heuristical finding of Entities (called Compounds in legacy PDB format)
 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
 * in the Structure.
 *
 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
 * possibly no SEQRES is given.
 *
 * @author Jose Duarte
 */
public class EntityFinder {

	private Structure s;

	private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);

	/**
	 * Above this ratio of mismatching residue types for same residue numbers we
	 * consider the 2 chains to have mismatching residue numbers and warn about it
	 */
	public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;

	/**
	 * Identity value for 2 chains to be considered part of same entity
	 */
	public static final double IDENTITY_THRESHOLD = 0.99999;

	/**
	 * Gap coverage value (num gaps over length of sequence) for each chain of the match:
	 * 2 chains with more gap coverage than this value will not be considered part of the same entity
	 */
	public static final double GAP_COVERAGE_THRESHOLD = 0.3;


	public EntityFinder(Structure s) {
		this.s = s;
	}

	/**
	 * Utility method that employs some heuristics to find the {@link EntityInfo}s
	 * for this Structure in case the information is missing in PDB/mmCIF file
	 * @return
	 */
	public List findEntities() {

		TreeMap chainIds2entities = findCompoundsFromAlignment();

		List compounds = findUniqueCompounds(chainIds2entities);

		createPurelyNonPolyCompounds(compounds);

		return compounds;
	}

	/**
	 * Utility method to obtain a list of unique entities from the chainIds2entities map
	 * @return
	 */
	private static List findUniqueCompounds(TreeMap chainIds2entities) {

		List list = new ArrayList();

		for (EntityInfo cluster:chainIds2entities.values()) {
			boolean present = false;
			for (EntityInfo cl:list) {
				if (cl==cluster) {
					present = true;
					break;
				}
			}
			if (!present) list.add(cluster);
		}
		return list;
	}

	private void createPurelyNonPolyCompounds(List compounds) {

		List pureNonPolymerChains = new ArrayList();
		for (int i=0;i() {
					@Override
					public int compare(EntityInfo o1, EntityInfo o2) {
						return new Integer(o1.getMolId()).compareTo(o2.getMolId());
					}
				}).getMolId();
			}

			int molId = maxMolId + 1;
			// for the purely non-polymeric chains we assign additional compounds
			for (Chain c: pureNonPolymerChains) {
				EntityInfo comp = new EntityInfo();
				comp.addChain(c);
				comp.setMolId(molId);
				if (StructureTools.isChainWaterOnly(c)) {
					comp.setType(EntityType.WATER);
				} else {
					comp.setType(EntityType.NONPOLYMER);
				}
				logger.warn("Chain {} is purely non-polymeric, will assign a new Compound (entity) to it (entity id {})", c.getChainID(), molId);
				molId++;

				compounds.add(comp);
			}

		}
	}


	private static boolean areResNumbersAligned(Chain c1, Chain c2) {

		boolean isC1prot = StructureTools.isProtein(c1);
		boolean isC2prot = StructureTools.isProtein(c2);

		// different kind of chain: we won't try to align them
		if (isC1prot != isC2prot ) return false;

		List c1AtomGroups = null;
		if (isC1prot) {
			c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
		}
		else {
			c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
		}

		int countNonExisting = 0;

		for (Group g1:c1AtomGroups) {
			try {
				Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
				if (!g2.getPDBName().equals(g1.getPDBName())) {
					logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
							c1.getChainID(),c2.getChainID(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
					return false;
				}
			} catch (StructureException e) {
				// the group doesn't exist (no density) in the chain, go on
				countNonExisting++;
				continue;
			}
		}

		if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
			logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
					RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getChainID(), c1.getChainID());
			return false;
		}

		return true;
	}



	private TreeMap findCompoundsFromAlignment() {

		// first we determine which chains to consider: anything not looking
		// polymeric (protein or nucleotide chain) should be discarded
		Set polyChainIndices = new TreeSet();
		List pureNonPolymerChains = new ArrayList();
		for (int i=0;i chainIds2compounds = new TreeMap();

		int molId = 1;

		outer:
			for (int i:polyChainIndices) {
				for (int j:polyChainIndices) {

					if (j<=i) continue;

					Chain c1 = s.getChain(i);
					Chain c2 = s.getChain(j);

					Map positionIndex1 = new HashMap();
					Map positionIndex2 = new HashMap();
					// here we use false, which means that X will be used for unknown compounds
					String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false);
					String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false);

					int seq1Length = 0;
					int seq2Length = 0;

					SequencePair pair = null;
					if (isProteinSequence(str1) && isProteinSequence(str2)) {
						ProteinSequence s1 = getProteinSequence(str1);
						ProteinSequence s2 = getProteinSequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignProtein(s1,s2);

					} else if (isDNASequence(str1) && isDNASequence(str2)) {
						DNASequence s1 = getDNASequence(str1);
						DNASequence s2 = getDNASequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignDNA(s1,s2);

					} else if (isRNASequence(str1) && isRNASequence(str2)) {
						RNASequence s1 = getRNASequence(str1);
						RNASequence s2 = getRNASequence(str2);
						seq1Length = s1.getLength();
						seq2Length = s2.getLength();

						pair = alignRNA(s1,s2);

					} else {
						logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers");
						continue;
					}


					int numGaps = getNumGaps(pair);
					int numGaps1 = getNumGapsQuery(pair);
					int numGaps2 = getNumGapsTarget(pair);

					int nonGaps = pair.getLength() - numGaps;

					double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
					double gapCov1 = (double) numGaps1 / (double) seq1Length;
					double gapCov2 = (double) numGaps2 / (double) seq2Length;

					logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
							c1.getChainID(), c2.getChainID(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
					logger.debug("\n"+pair.toString(100));

					if (identity > IDENTITY_THRESHOLD && gapCov11) {
						logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
								c1.getChainID(),c2.getChainID(),pair.getNumIdenticals(),nonGaps,identity);
						logger.warn("\n"+pair.toString(100));
					}

					if (chainIds2compounds.size()==polyChainIndices.size()) // we've got all chains in entities
						break outer;
				}
			}

		// anything not in a Compound will be its own Compound
		for (int i:polyChainIndices) {
			Chain c = s.getChain(i);
			if (!chainIds2compounds.containsKey(c.getChainID())) {
				logger.debug("Creating a 1-member Compound for chain {}",c.getChainID());

				EntityInfo ent = new EntityInfo();
				ent.addChain(c);
				ent.setMolId(molId++);

				chainIds2compounds.put(c.getChainID(),ent);
			}
		}


		return chainIds2compounds;
	}

	private SequencePair alignProtein(ProteinSequence s1, ProteinSequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getIdentity();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}

	private SequencePair alignDNA(DNASequence s1, DNASequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getNuc4_4();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}

	private SequencePair alignRNA(RNASequence s1, RNASequence s2) {
		SubstitutionMatrix matrix = SubstitutionMatrixHelper.getNuc4_4();

		GapPenalty penalty = new SimpleGapPenalty(8, 1);

		PairwiseSequenceAligner nw =
				Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);

		return nw.getPair();
	}



	private static int getNumGaps(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if (pair.hasGap(alignmentIndex)) numGaps++;
		}
		return numGaps;
	}

	private static int getNumGapsQuery(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) {
				numGaps++;
			}
		}
		return numGaps;
	}

	private static int getNumGapsTarget(SequencePair pair) {
		int numGaps = 0;
		for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
			if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) {
				numGaps++;
			}
		}
		return numGaps;
	}

	private boolean isProteinSequence(String str) {
		try {
			new ProteinSequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;
	}

	private boolean isDNASequence(String str) {
		try {
			new DNASequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;
	}

	private boolean isRNASequence(String str) {
		try {
			new RNASequence(str);
		} catch (CompoundNotFoundException e) {

			return false;
		}
		return true;

	}

	/**
	 * Returns the ProteinSequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private ProteinSequence getProteinSequence(String str) {
		try {
			ProteinSequence s = new ProteinSequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating ProteinSequence",e);
		}
		return null;
	}

	/**
	 * Returns the DNASequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private DNASequence getDNASequence(String str) {
		try {
			DNASequence s = new DNASequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating DNASequence ",e);
		}
		return null;
	}

	/**
	 * Returns the RNASequence or null if one can't be created
	 * @param str
	 * @return
	 */
	private RNASequence getRNASequence(String str) {
		try {
			RNASequence s = new RNASequence(str);
			return s;
		} catch (CompoundNotFoundException e) {

			logger.error("Unexpected error when creating RNASequence ",e);
		}
		return null;
	}

}