org.biojava.nbio.structure.ChainImpl Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-structure Show documentation
The protein structure modules of BioJava.
There is a newer version: 7.1.3
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 12.03.2004
 * @author Andreas Prlic
 *
 */
package org.biojava.nbio.structure;


import org.biojava.nbio.structure.chem.ChemComp;
import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
import org.biojava.nbio.structure.chem.PolymerType;
import org.biojava.nbio.structure.io.FileConvert;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;


/**
 * A Chain in a PDB file. It contains several groups which can be of
 * one of the types defined in the {@link GroupType} constants.
 *
 * @author Andreas Prlic
 * @author Jules Jacobsen
 * @since 1.4
 */
public class ChainImpl implements Chain {

	private static final Logger logger = LoggerFactory.getLogger(ChainImpl.class);

	private static final long serialVersionUID = 1990171805277911840L;

	/**
	 * Author chain identifier given to a newly constructed chain.
	 * (The default chain identifier used to be an empty space.)
	 */
	private static final String DEFAULT_CHAIN_ID = "A";

	private String authId; // the 'public' chain identifier as assigned by authors in PDB files
	private String asymId; // the 'internal' chain identifier as used in mmCIF files

	/** Atom groups of this chain; {@link #pdbResnumMap} stores positions in this list. */
	private List<Group> groups;
	/** Groups of the SEQRES sequence; the class tolerates null entries in this list. */
	private List<Group> seqResGroups;

	private EntityInfo entity;
	private Structure parent;

	/** Maps the string form of a residue number to the position of its group in {@link #groups}. */
	private final Map<String, Integer> pdbResnumMap;

	// NOTE(review): the element type is not visible in this file (presumably a
	// sequence-mismatch type declared in this package) - left raw until confirmed.
	private List seqMisMatches;

	/**
	 * Creates an empty chain: the author id is the default chain id, the asym id
	 * is unset, and the atom groups, SEQRES groups and residue-number lookup
	 * all start out empty.
	 */
	public ChainImpl() {
		this.asymId = null;
		this.authId = DEFAULT_CHAIN_ID;

		this.groups = new ArrayList<>();
		this.seqResGroups = new ArrayList<>();
		this.pdbResnumMap = new HashMap<>();
	}

	/**
	 * Returns the asym id, i.e. the 'internal' chain identifier as used in mmCIF files.
	 *
	 * @return the asym id; null until one has been set
	 */
	@Override
	public String getId() {
		return this.asymId;
	}

	/**
	 * Sets the asym id, i.e. the 'internal' chain identifier as used in mmCIF files.
	 *
	 * @param asymId the new asym id
	 */
	@Override
	public void setId(String asymId) {
		this.asymId = asymId;
	}

	/**
	 * Returns the author id, i.e. the 'public' chain identifier as assigned by
	 * authors in PDB files.
	 *
	 * @return the author id
	 */
	@Override
	public String getName() {
		return this.authId;
	}

	/**
	 * Sets the author id, i.e. the 'public' chain identifier as assigned by
	 * authors in PDB files.
	 *
	 * @param authId the new author id
	 */
	@Override
	public void setName(String authId) {
		this.authId = authId;
	}

	/**
	 * Stores the structure this chain belongs to.
	 *
	 * @param parent the parent structure
	 */
	@Override
	public void setStructure(Structure parent) {
		this.parent = parent;
	}

	/**
	 * Returns the structure this chain belongs to.
	 *
	 * @return the parent structure; null if none has been set
	 */
	@Override
	public Structure getStructure() {
		return this.parent;
	}

	/**
	 * Creates a copy of this chain with cloned groups.
	 * <p>
	 * The asym id, author id and the {@link EntityInfo} reference are carried
	 * over; the parent structure is not. Every atom group is cloned and added to
	 * the copy. SEQRES groups that are also atom groups are mapped onto the
	 * corresponding cloned atom group, all other SEQRES groups are cloned on
	 * their own, and null entries stay null.
	 *
	 * @return the new chain
	 */
	@Override
	public Object clone() {

		ChainImpl copy = new ChainImpl();
		copy.setId(getId());
		copy.setName(getName());

		// NOTE the EntityInfo will be reset at the parent level (Structure) if cloning is happening from parent level;
		// here it is not deep-copied, the same reference is kept in case the cloning is happening at the Chain level only
		copy.setEntityInfo(this.entity);

		// clone every atom group into the new chain
		for (Group original : groups) {
			Group cloned = (Group) original.clone();
			copy.addGroup(cloned);
			cloned.setChain(copy);
		}

		if (seqResGroups == null) {
			return copy;
		}

		// cloning seqres and atom groups is ugly, due to their nested relationship
		// (some of the atom groups can be in the seqres, but not all)
		List<Group> clonedSeqRes = new ArrayList<>();
		for (Group seqRes : seqResGroups) {
			if (seqRes != null) {
				int atomIndex = groups.indexOf(seqRes);
				// found among the atom groups: reuse the equivalent, already cloned atom group;
				// otherwise clone the seqres group itself
				Group counterpart = (atomIndex == -1)
						? (Group) seqRes.clone()
						: copy.getAtomGroup(atomIndex);
				counterpart.setChain(copy);
				clonedSeqRes.add(counterpart);
			} else {
				clonedSeqRes.add(null);
			}
		}
		copy.setSeqResGroups(clonedSeqRes);

		return copy;
	}

	/**
	 * Stores the entity information associated with this chain.
	 *
	 * @param mol the entity information
	 */
	@Override
	public void setEntityInfo(EntityInfo mol) {
		entity = mol;
	}

	/**
	 * Returns the entity information associated with this chain.
	 *
	 * @return the entity information; null if none has been set
	 */
	@Override
	public EntityInfo getEntityInfo() {
		return entity;
	}

	/**
	 * Appends a group to this chain and registers it in the residue-number lookup.
	 * <p>
	 * The group and all of its alternate-location groups get this chain set as
	 * their chain. If the group has a residue number, its position is stored
	 * under the string form of that number. When the number is already
	 * registered, a warning is logged and the existing entry is replaced only if
	 * the new group is an {@link AminoAcid}.
	 *
	 * @param group the group to add
	 */
	@Override
	public void addGroup(Group group) {

		group.setChain(this);
		// the altlocs belong to this chain as well
		for (Group altLoc : group.getAltLocs()) {
			altLoc.setChain(this);
		}

		groups.add(group);

		// store the position internally for quick access of this group
		ResidueNumber resNum = group.getResidueNumber();
		String pdbResnum = (resNum == null) ? null : resNum.toString();
		if (pdbResnum == null) {
			return;
		}

		Integer pos = groups.size() - 1;

		if (!pdbResnumMap.containsKey(pdbResnum)) {
			pdbResnumMap.put(pdbResnum, pos);
			return;
		}

		// Numbering in PDB files is sometimes confusing, e.g. PDB 1sfe:
		/*
		 * ATOM    620  N   GLY    93     -24.320  -6.591   4.210  1.00 46.82           N
		 * ATOM    621  CA  GLY    93     -24.960  -6.849   5.497  1.00 47.35           C
		 *    and ...
		 * HETATM 1349  O   HOH    93     -26.126   1.226  29.069  1.00 71.69           O
		 */
		// In such a clash the amino acid (or else the entry that came first) keeps the lookup slot.
		// Another example of one residue number for 2 residues is 3th3, chain T, residue 201
		// (a LYS and a sugar BGC covalently attached to it) - JD 2016-03-09
		Group registered = groups.get(pdbResnumMap.get(pdbResnum));
		logger.warn("Adding residue {}({}) to chain {} but a residue with same residue number is already present: {}({}). Will add only the aminoacid residue (if any) to the lookup, lookups for that residue number won't work properly.",
				pdbResnum, group.getPDBName(), getId(), registered.getResidueNumber(), registered.getPDBName());
		if (group instanceof AminoAcid) {
			pdbResnumMap.put(pdbResnum, pos);
		}
	}

	/**
	 * Returns the atom group stored at the given position.
	 *
	 * @param position index into the list of atom groups
	 * @return the group at that index
	 */
	@Override
	public Group getAtomGroup(int position) {
		Group atomGroup = groups.get(position);
		return atomGroup;
	}

	/**
	 * Returns the atom groups of the given type, in chain order.
	 *
	 * @param type the group type to select
	 * @return a new list holding the matching groups; empty if there are none
	 */
	@Override
	public List<Group> getAtomGroups(GroupType type) {

		List<Group> matching = new ArrayList<>();
		for (Group g : groups) {
			if (g.getType().equals(type)) {
				matching.add(g);
			}
		}

		return matching;
	}

	/**
	 * Returns all atom groups of this chain.
	 * <p>
	 * The returned list is the internal list itself, not a copy.
	 *
	 * @return the atom groups
	 */
	@Override
	public List<Group> getAtomGroups() {
		return groups;
	}

	/**
	 * Replaces the atom groups of this chain.
	 * <p>
	 * Every group gets this chain set as its chain, the given list becomes the
	 * internal list (it is not copied), and the residue-number lookup is rebuilt
	 * for the new list so that {@link #getGroupByPDB(ResidueNumber)} does not
	 * answer with positions belonging to the previous list. The rebuild follows
	 * the same rule as {@link #addGroup(Group)}: for a repeated residue number
	 * the first group keeps the slot unless a later one is an {@link AminoAcid}.
	 *
	 * @param groups the new atom groups; must not be null
	 */
	@Override
	public void setAtomGroups(List<Group> groups) {
		for (Group g : groups) {
			g.setChain(this);
		}
		this.groups = groups;

		// the stored positions refer to the old list - index the new one instead
		pdbResnumMap.clear();
		int position = 0;
		for (Group g : groups) {
			ResidueNumber resNum = g.getResidueNumber();
			String key = (resNum == null) ? null : resNum.toString();
			if (key != null && (!pdbResnumMap.containsKey(key) || g instanceof AminoAcid)) {
				pdbResnumMap.put(key, position);
			}
			position++;
		}
	}

	/**
	 * Returns the atom groups lying between two residue numbers, in chain order.
	 * <p>
	 * A null {@code start} means "from the first group", a null {@code end}
	 * means "to the last group"; if both are null all groups are returned.
	 * A group matching {@code end} is included in the result.
	 * <p>
	 * With {@code ignoreMissing} the exact boundary residues need not exist:
	 * collection starts at the first group at or after {@code start} and stops
	 * before the first group after {@code end}, as ordered by
	 * {@code compareToPositional}.
	 *
	 * @param start first residue number of the range, or null
	 * @param end last residue number of the range, or null
	 * @param ignoreMissing whether missing boundary residues are tolerated
	 * @return the selected groups; possibly empty
	 * @throws StructureException if the start is never found, if the end group
	 *         is reached before the start, or if {@code ignoreMissing} is false
	 *         and the end is not found
	 */
	@Override
	public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end, boolean ignoreMissing)
			throws StructureException {
		// Short-circuit for include all groups
		if (start == null && end == null) {
			return groups.toArray(new Group[0]);
		}

		List<Group> retlst = new ArrayList<>();

		boolean adding, foundStart;
		if (start == null) {
			// start with first group
			adding = true;
			foundStart = true;
		} else {
			adding = false;
			foundStart = false;
		}

		for (Group g : groups) {

			// Check for start
			if (!adding && start.equalsPositional(g.getResidueNumber())) {
				adding = true;
				foundStart = true;
			}

			// Check if past start
			if (ignoreMissing && !(foundStart && adding)) {
				ResidueNumber pos = g.getResidueNumber();

				if (start != null && start.compareToPositional(pos) <= 0) {
					foundStart = true;
					adding = true;
				}
			}

			// Check if past end. This has to happen before the group is collected,
			// otherwise the first group lying beyond a missing end residue ends up
			// in the result. The end group itself is left to the exact check below.
			if (ignoreMissing && adding && end != null) {
				ResidueNumber pos = g.getResidueNumber();
				if (!end.equalsPositional(pos) && end.compareToPositional(pos) <= 0) {
					adding = false;
					break;
				}
			}

			if (adding) {
				retlst.add(g);
			}

			// check for end
			if (end != null && end.equalsPositional(g.getResidueNumber())) {
				if (!adding) {
					throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId);
				}
				adding = false;
				break;
			}
		}

		if (!foundStart) {
			throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId);
		}
		if (end != null && adding && !ignoreMissing) {
			throw new StructureException("did not find end PDB residue number " + end + " in chain " + authId);
		}

		// with ignoreMissing it is not checked whether the end has been found

		return retlst.toArray(new Group[0]);
	}

	/**
	 * Looks up an atom group by its residue number, using the lookup that
	 * {@link #addGroup(Group)} maintains.
	 *
	 * @param resNum the residue number to look for; must not be null
	 * @return the group registered under that residue number
	 * @throws StructureException if no group is registered under that number
	 */
	@Override
	public Group getGroupByPDB(ResidueNumber resNum) throws StructureException {
		String key = resNum.toString();
		if (!pdbResnumMap.containsKey(key)) {
			throw new StructureException("unknown PDB residue number " + key + " in chain " + authId);
		}
		Integer index = pdbResnumMap.get(key);
		return groups.get(index);
	}

	/**
	 * Returns the atom groups between two residue numbers; equivalent to
	 * {@link #getGroupsByPDB(ResidueNumber, ResidueNumber, boolean)} with
	 * {@code ignoreMissing} set to false.
	 *
	 * @param start first residue number of the range, or null
	 * @param end last residue number of the range, or null
	 * @return the selected groups
	 * @throws StructureException as thrown by the three-argument variant
	 */
	@Override
	public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end)
			throws StructureException {
		final boolean ignoreMissing = false;
		return getGroupsByPDB(start, end, ignoreMissing);
	}

	/**
	 * Returns the length of the sequence defined in the SEQRES records,
	 * i.e. the number of SEQRES groups.
	 *
	 * @return the number of SEQRES groups
	 */
	@Override
	public int getSeqResLength() {
		int length = seqResGroups.size();
		return length;
	}

	/**
	 * Builds a short multi-line summary: both chain identifiers, the entity
	 * description when there is one, and the SEQRES and ATOM lengths.
	 *
	 * @return the summary text
	 */
	@Override
	public String toString() {
		final String eol = System.getProperty("line.separator");
		StringBuilder sb = new StringBuilder();

		sb.append("Chain asymId:").append(getId());
		sb.append(" authId:").append(getName());
		sb.append(eol);

		if (entity != null && entity.getDescription() != null) {
			sb.append(entity.getDescription()).append(eol);
		}

		sb.append("total SEQRES length: ").append(getSeqResGroups().size());
		sb.append(" total ATOM length:").append(getAtomLength());
		sb.append(" residues ").append(eol);

		return sb.toString();
	}

	/**
	 * Converts the SEQRES sequence of this chain into a sequence object.
	 * <p>
	 * The result is always built as a {@link ProteinSequence}.
	 *
	 * @return the sequence, or null if the SEQRES string contains a compound
	 *         that {@link ProteinSequence} does not accept (the failure is logged)
	 */
	@Override
	public Sequence<?> getBJSequence() {

		String seq = getSeqResSequence();

		try {
			//TODO: return a DNA sequence if the content is DNA...
			return new ProteinSequence(seq);
		} catch (CompoundNotFoundException e) {
			// the exception goes in as last argument so the logger records the stack trace, too
			logger.error("Could not create sequence object from seqres sequence. Some unknown compound: {}", e.getMessage(), e);
			return null;
		}
	}

	/**
	 * Returns the one-letter sequence of the polymer residues among the atom groups.
	 * <p>
	 * Only groups whose chemical component has a protein or polynucleotide
	 * polymer type contribute; all other groups are skipped. A contributing
	 * group without a one-letter code is written as
	 * {@link StructureTools#UNKNOWN_GROUP_LABEL}. A group whose chemical
	 * component is unavailable is skipped with a warning, since it cannot be
	 * told whether it is a polymer residue.
	 *
	 * @return the sequence; empty if no group contributes
	 */
	@Override
	public String getAtomSequence() {

		List<Group> atomGroups = getAtomGroups();
		StringBuilder sequence = new StringBuilder();

		for (Group g : atomGroups) {
			ChemComp cc = g.getChemComp();
			if (cc == null) {
				logger.warn("Could not load ChemComp for group: {}", g);
				continue;
			}

			if (PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
					PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) {
				// an amino acid or nucleotide residue.. use for alignment
				String oneLetter = ChemCompGroupFactory.getOneLetterCode(cc);
				if (oneLetter == null) {
					oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
				}
				sequence.append(oneLetter);
			}
		}
		return sequence.toString();
	}

	/**
	 * Returns the sequence of the SEQRES groups in one-letter codes.
	 * <p>
	 * Each SEQRES group contributes its one-letter code. Null entries, groups
	 * without chemical component, non-polymer groups and groups whose code is
	 * missing, empty or "?" contribute {@link StructureTools#UNKNOWN_GROUP_LABEL}.
	 * A code longer than one character is appended as it is, so the result can
	 * be longer than the number of SEQRES groups; use
	 * {@link #getSeqResOneLetterSeq()} when the lengths have to agree.
	 *
	 * @return the SEQRES sequence
	 */
	@Override
	public String getSeqResSequence() {

		StringBuilder str = new StringBuilder();
		for (Group g : seqResGroups) {
			if (g == null) {
				// null entries are tolerated in the SEQRES list; keep their position in the sequence
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
				continue;
			}
			ChemComp cc = g.getChemComp();
			if (cc == null) {
				logger.warn("Could not load ChemComp for group: {}", g);
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
			} else if (PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
					PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) {
				// an amino acid or nucleotide residue.. use for alignment
				String oneLetter = ChemCompGroupFactory.getOneLetterCode(cc);
				// the length of the code is deliberately not checked here (e.g. in 1EMA it is 3)
				if (oneLetter == null || oneLetter.isEmpty() || "?".equals(oneLetter)) {
					oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
				}
				str.append(oneLetter);
			} else {
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
			}
		}
		return str.toString();
	}

	/**
	 * Get the one letter sequence so that Sequence is guaranteed to
	 * be the same length as seqResGroups.
	 * Method related to https://github.com/biojava/biojava/issues/457
	 * <p>
	 * Every SEQRES entry contributes exactly one character: its one-letter code,
	 * or {@link StructureTools#UNKNOWN_GROUP_LABEL} for null entries, groups
	 * without chemical component, non-polymer groups and groups whose code is
	 * missing, "?" or not exactly one character long.
	 *
	 * @return a string of the sequence guaranteed to be the same length
	 * as seqResGroups.
	 */
	public String getSeqResOneLetterSeq() {

		StringBuilder str = new StringBuilder();
		for (Group g : seqResGroups) {
			if (g == null) {
				// null entries are tolerated in the SEQRES list; they still need their one character
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
				continue;
			}
			ChemComp cc = g.getChemComp();
			if (cc == null) {
				logger.warn("Could not load ChemComp for group: {}", g);
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
			} else if (PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
					PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) {
				// an amino acid or nucleotide residue.. use for alignment
				String oneLetter = ChemCompGroupFactory.getOneLetterCode(cc);
				// AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible.
				// (the length test also covers the empty string)
				if (oneLetter == null || oneLetter.length() != 1 || "?".equals(oneLetter)) {
					oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
				}
				str.append(oneLetter);
			} else {
				str.append(StructureTools.UNKNOWN_GROUP_LABEL);
			}
		}
		return str.toString();
	}

	/**
	 * Returns the SEQRES group stored at the given position.
	 *
	 * @param position index into the list of SEQRES groups
	 * @return the entry at that index
	 */
	@Override
	public Group getSeqResGroup(int position) {
		Group seqResGroup = seqResGroups.get(position);
		return seqResGroup;
	}

	/**
	 * Returns the SEQRES groups of the given type, in sequence order.
	 * Null entries of the SEQRES list are skipped.
	 *
	 * @param type the group type to select
	 * @return a new list holding the matching groups; empty if there are none
	 */
	@Override
	public List<Group> getSeqResGroups(GroupType type) {
		List<Group> matching = new ArrayList<>();
		for (Group g : seqResGroups) {
			// null entries are tolerated in the SEQRES list and have no type
			if (g != null && g.getType().equals(type)) {
				matching.add(g);
			}
		}

		return matching;
	}

	/**
	 * Returns all SEQRES groups of this chain.
	 * <p>
	 * The returned list is the internal list itself, not a copy.
	 *
	 * @return the SEQRES groups
	 */
	@Override
	public List<Group> getSeqResGroups() {
		return seqResGroups;
	}

	/**
	 * Replaces the SEQRES groups of this chain.
	 * <p>
	 * Every non-null group gets this chain set as its chain; the given list
	 * becomes the internal list (it is not copied).
	 *
	 * @param groups the new SEQRES groups; must not be null, but may contain null entries
	 */
	@Override
	public void setSeqResGroups(List<Group> groups) {
		for (Group g : groups) {
			if (g != null) {
				g.setChain(this);
			}
		}
		this.seqResGroups = groups;
	}

	/**
	 * Returns the number of atom groups in this chain.
	 *
	 * @return the number of atom groups
	 */
	@Override
	public int getAtomLength() {
		int length = groups.size();
		return length;
	}

	/**
	 * Converts this chain to PDB format by delegating to
	 * {@code FileConvert.toPDB}.
	 *
	 * @return the text produced by the converter
	 */
	@Override
	public String toPDB() {
		String pdbText = FileConvert.toPDB(this);
		return pdbText;
	}

	/**
	 * Converts this chain to mmCIF format by delegating to
	 * {@code FileConvert.toMMCIF}.
	 *
	 * @return the text produced by the converter
	 */
	@Override
	public String toMMCIF() {
		String mmcifText = FileConvert.toMMCIF(this);
		return mmcifText;
	}

	/**
	 * Stores the list of sequence mismatches for this chain; the list is kept
	 * by reference.
	 *
	 * @param seqMisMatches the mismatches to store
	 */
	// NOTE(review): the list's element type is not visible in this file - confirm against the Chain interface
	@Override
	public void setSeqMisMatches(List seqMisMatches) {
		this.seqMisMatches = seqMisMatches;
	}

	/**
	 * Returns the list of sequence mismatches stored for this chain.
	 *
	 * @return the stored list; null if none has been set
	 */
	// NOTE(review): the list's element type is not visible in this file - confirm against the Chain interface
	@Override
	public List getSeqMisMatches() {
		return this.seqMisMatches;
	}

	/**
	 * Returns the type of the entity this chain is associated with.
	 *
	 * @return the entity type, or null if the chain has no entity information
	 */
	@Override
	public EntityType getEntityType() {
		EntityInfo info = getEntityInfo();
		return (info == null) ? null : info.getType();
	}

	/**
	 * Tells whether every atom group of this chain is a water.
	 *
	 * @return false as soon as one group is not a water; true otherwise,
	 *         which includes a chain without atom groups
	 */
	@Override
	public boolean isWaterOnly() {
		for (Group g : getAtomGroups()) {
			if (g.isWater()) {
				continue;
			}
			return false;
		}
		return true;
	}

	/**
	 * Tells whether this chain contains no polymer residue.
	 * <p>
	 * A group counts as polymer residue if it is polymeric and is not flagged
	 * as HETATM in the file.
	 *
	 * @return false as soon as one such group is found; true otherwise,
	 *         which includes a chain without atom groups
	 */
	@Override
	public boolean isPureNonPolymer() {
		for (Group g : getAtomGroups()) {
			if (!g.isPolymeric() || g.isHetAtomInFile()) {
				continue;
			}
			// important: the aminoacid or nucleotide residue can be in Atom records
			return false;
		}
		return true;
	}

	/**
	 * Determines which kind of group dominates this chain.
	 * <p>
	 * Amino acid, nucleotide and non-water hetatom groups are counted; waters
	 * are left out of the total. The first type (in that order) whose share of
	 * the total exceeds {@link StructureTools#RATIO_RESIDUES_TO_TOTAL} wins.
	 * If none does, the type with the strictly largest count is chosen, with
	 * ties going to {@link GroupType#HETATM}, and the decision is logged at
	 * debug level. When there are no non-water groups at all, the ratios are
	 * NaN and the result is {@link GroupType#HETATM}.
	 *
	 * @return the predominant group type
	 */
	@Override
	public GroupType getPredominantGroupType() {

		double ratioResiduesToTotal = StructureTools.RATIO_RESIDUES_TO_TOTAL;

		int sizeAminos = getAtomGroups(GroupType.AMINOACID).size();
		int sizeNucleotides = getAtomGroups(GroupType.NUCLEOTIDE).size();
		List<Group> hetAtoms = getAtomGroups(GroupType.HETATM);
		int sizeHetatoms = hetAtoms.size();
		int sizeWaters = 0;
		for (Group g : hetAtoms) {
			if (g.isWater()) {
				sizeWaters++;
			}
		}
		int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters;

		int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater;

		double ratioAminos = (double) sizeAminos / (double) fullSize;
		double ratioNucleotides = (double) sizeNucleotides / (double) fullSize;
		double ratioHetatoms = (double) sizeHetatomsWithoutWater / (double) fullSize;

		if (ratioAminos > ratioResiduesToTotal) {
			return GroupType.AMINOACID;
		}

		if (ratioNucleotides > ratioResiduesToTotal) {
			return GroupType.NUCLEOTIDE;
		}

		if (ratioHetatoms > ratioResiduesToTotal) {
			return GroupType.HETATM;
		}

		// finally if neither condition works, we try based on majority, but log
		// it
		GroupType max;
		if (sizeNucleotides > sizeAminos) {
			if (sizeNucleotides > sizeHetatomsWithoutWater) {
				max = GroupType.NUCLEOTIDE;
			} else {
				max = GroupType.HETATM;
			}
		} else {
			if (sizeAminos > sizeHetatomsWithoutWater) {
				max = GroupType.AMINOACID;
			} else {
				max = GroupType.HETATM;
			}
		}
		// one placeholder per argument; the last two are the ratios themselves, not the raw counts
		logger.debug("Ratio of residues to total for chain with asym_id {} is below {}. Assuming it is a {} chain. Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, ratio aa/total: {}, ratio nuc/total: {}",
				getId(), ratioResiduesToTotal, max, sizeAminos, sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters,
				ratioAminos, ratioNucleotides);

		return max;
	}

	/**
	 * Tells whether the predominant group type of this chain is
	 * {@link GroupType#AMINOACID}.
	 *
	 * @return true if {@link #getPredominantGroupType()} yields AMINOACID
	 */
	@Override
	public boolean isProtein() {
		GroupType predominant = getPredominantGroupType();
		return predominant == GroupType.AMINOACID;
	}

	/**
	 * Tells whether the predominant group type of this chain is
	 * {@link GroupType#NUCLEOTIDE}.
	 *
	 * @return true if {@link #getPredominantGroupType()} yields NUCLEOTIDE
	 */
	@Override
	public boolean isNucleicAcid() {
		GroupType predominant = getPredominantGroupType();
		return predominant == GroupType.NUCLEOTIDE;
	}


}