
org.biojava.nbio.structure.SubstructureIdentifier Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created on December 19, 2013
* Author: Douglas Myers-Turnbull
*/
package org.biojava.nbio.structure;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.contact.Grid;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is the canonical way to identify a part of a structure.
*
* The current syntax allows the specification of a set of residues from
* the first model of a structure. Future versions may be extended to represent
* additional properties.
*
*
Identifiers should adhere to the following specification, although some
* additional forms may be tolerated where unambiguous for backwards compatibility.
*
* name := pdbID
* | pdbID '.' chainID
* | pdbID '.' range
* range := range (',' range)?
* | chainID
* | chainID '_' resNum '-' resNum
* pdbID := [1-9][a-zA-Z0-9]{3}
* | PDB_[a-zA-Z0-9]{8}
* chainID := [a-zA-Z0-9]+
* resNum := [-+]?[0-9]+[A-Za-z]?
*
* For example:
*
* 1TIM #whole structure (short format)
* 1tim #same as above
* 4HHB.C #single chain
* 3AA0.A,B #two chains
* 4GCR.A_1-40 #substructure
* 3iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts
* PDB_00001TIM #whole structure (extended format)
* pdb_00001tim #same as above
* PDB_00004HHB.C #single chain
* PDB_00003AA0.A,B #two chains
* PDB_00004GCR.A_1-40 #substructure
* pdb_00003iek.A_17-28,A_56-294,A_320-377 #substructure of 3 disjoint parts
*
* More options may be added to the specification at a future time.
* @author dmyersturnbull
* @author Spencer Bliven
*/
public class SubstructureIdentifier implements StructureIdentifier {
private static final long serialVersionUID = 1L;
private static final Logger logger = LoggerFactory.getLogger(SubstructureIdentifier.class);
private final PdbId pdbId;
private final List ranges;
/**
* Create a new identifier from a string.
* @param id
*/
public SubstructureIdentifier(String id) {
String[] idRange = id.split("\\.");
if(1 > idRange.length || idRange.length > 2 ) {
throw new IllegalArgumentException(String.format("Malformed %s: %s",getClass().getSimpleName(),id));
}
//used tempId to avoid writing 2 assignment statements to a final field,
// although one is in the try block and the other in the catch block.
PdbId tempId = null;
try {
tempId = new PdbId(idRange[0]);
} catch (IllegalArgumentException e) {
// Changed from Exception to a warning to support files and stuff -sbliven 2015/01/22
logger.warn(String.format("Unrecognized PDB code %s", idRange[0]));
}
this.pdbId = tempId;
if( idRange.length == 2) {
String rangeStr = idRange[1].trim();
this.ranges = ResidueRange.parseMultiple(rangeStr);
} else {
this.ranges = new LinkedList();
}
}
/**
* Create a new identifier based on a set of ranges.
*
* If ranges is empty, includes all residues.
* @param pdbId
* @param ranges
*/
public SubstructureIdentifier(String pdbId, List ranges) {
this(new PdbId(pdbId), ranges);
}
/**
* Create a new identifier based on a set of ranges.
*
* If ranges is empty, includes all residues.
* @param pdbId
* @param ranges
*/
public SubstructureIdentifier(PdbId pdbId, List ranges) {
if(ranges == null) {
throw new NullPointerException("Null ranges list");
}
this.pdbId = pdbId;
this.ranges = ranges;
}
@Override
public String toString() {
return getIdentifier();
}
/**
* Get the String form of this identifier.
*
* This provides the canonical form for a StructureIdentifier and has
* all the information needed to recreate a particular substructure.
*
* Example: 3iek.A_17-28,A_56-294
* @return The String form of this identifier
*/
@Override
public String getIdentifier() {
String pdbId = this.pdbId == null? "": this.pdbId.getId();
if (ranges.isEmpty()) return pdbId;
return pdbId + "." + ResidueRange.toString(ranges);
}
/**
* Get the PDB identifier part of the SubstructureIdentifier
* @return the PDB ID
*/
public PdbId getPdbId() {
return pdbId;
}
public List getResidueRanges() {
return ranges;
}
/**
* Return itself. SubstructureIdentifiers are canonical!
*/
@Override
public SubstructureIdentifier toCanonical() {
return this;
}
/**
* Takes a complete structure as input and reduces it to residues present in
* the specified ranges
*
* The returned structure will be a shallow copy of the input, with shared
* Chains, Residues, etc.
*
*
Ligands are handled in a special way. If a full chain is selected
* (e.g. '1ABC.A') then any waters and ligands with matching chain name are
* included. If a residue range is present ('1ABC.A:1-100') then any
* ligands (technically non-water non-polymer atoms) within
* {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF} of the selected
* range are included, regardless of chain.
* @param input A full structure, e.g. as loaded from the PDB. The structure
* ID should match that returned by getPdbId().
* @return
* @throws StructureException
* @see StructureTools#getReducedStructure(Structure, String)
*/
@Override
public Structure reduce(Structure s) throws StructureException {
// Follows StructureImpl.clone()
if(s == null)
throw new StructureException("NullPointerException Possibly due to malformed PIBId format.");
// Create new structure & copy basic properties
Structure newS = new StructureImpl();
newS.setPdbId(s.getPdbId());
newS.setPDBHeader(s.getPDBHeader());
newS.setName(this.toString());
newS.setDBRefs(s.getDBRefs());
newS.setBiologicalAssembly(s.isBiologicalAssembly());
newS.getPDBHeader().setDescription(
"sub-range " + ranges + " of " + newS.getPdbId() + " "
+ s.getPDBHeader().getDescription());
newS.setEntityInfos(new ArrayList<>());
// TODO The following should be only copied for atoms which are present in the range.
newS.setSSBonds(s.getSSBonds());
newS.setSites(s.getSites());
newS.setStructureIdentifier(this);
for( int modelNr=0;modelNr());
if(getResidueRanges().isEmpty()) {
// Include all residues
newS.setEntityInfos(s.getEntityInfos());
newS.setSSBonds(s.getSSBonds());
newS.setSites(s.getSites());
newS.setModel(modelNr, s.getModel(modelNr));
} else {
// Restrict residues
for( ResidueRange range: getResidueRanges()) {
String chainName = range.getChainName();
ResidueNumber pdbresnum1 = range.getStart();
ResidueNumber pdbresnum2 = range.getEnd();
// StructureTools.addGroupsToStructure(newS, groups, modelNr, false);
Chain polyChain; //polymer
if(chainName.equals("_") ) {
// Handle special case of "_" chain for single-chain proteins
polyChain = s.getPolyChains(modelNr).get(0);
chainName = polyChain.getName();
if(pdbresnum1 != null)
pdbresnum1.setChainName(chainName);
if(pdbresnum2 != null)
pdbresnum2.setChainName(chainName);
if(s.getPolyChains().size() != 1) {
// SCOP 1.71 uses this for some proteins with multiple chains
// Print a warning in this ambiguous case
logger.warn("Multiple possible chains match '_'. Using chain {}",chainName);
}
} else {
// Explicit chain
polyChain = s.getPolyChainByPDB(chainName,modelNr);
if( polyChain == null ) {
// Chain not found
// Maybe it was a chain index, masquerading as a chainName?
try {
int chainNum = Integer.parseInt(chainName);
polyChain = s.getChainByIndex(modelNr, chainNum);
chainName = polyChain.getName();
if(pdbresnum1 != null)
pdbresnum1.setChainName(chainName);
if(pdbresnum2 != null)
pdbresnum2.setChainName(chainName);
logger.warn("No chain found for {}. Interpretting it as an index, using chain {} instead",chainName,polyChain.getId());
} catch(NumberFormatException e3) {
// Not an index. Throw the original exception
throw new StructureException(String.format("Unrecognized chain %s in %s",chainName,getIdentifier()));
}
}
}
if(pdbresnum1 == null && pdbresnum2 == null) {
// Include all atoms with matching chainName
StructureTools.addGroupsToStructure(newS, polyChain.getAtomGroups(), modelNr, false);
for(Chain chain : s.getNonPolyChainsByPDB(chainName, modelNr) ) {
StructureTools.addGroupsToStructure(newS, chain.getAtomGroups(), modelNr, false);
}
Chain waters = s.getWaterChainByPDB(chainName, modelNr);
if( waters != null) {
StructureTools.addGroupsToStructure(newS, waters.getAtomGroups(), modelNr, false);
}
// TODO do we need to prune SeqRes down to the atoms present? -SB 2016-10-7
} else {
// Include polymer range and any proximal ligands
List polygroups = Arrays.asList(polyChain.getGroupsByPDB(pdbresnum1, pdbresnum2));
StructureTools.addGroupsToStructure(newS, polygroups, modelNr, false);
copyLigandsByProximity(s,newS, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, modelNr, modelNr);
}
} // end range
}
} // end modelNr
return newS;
}
/**
* Loads the complete structure based on {@link #getPdbId()}.
*
* @param AtomCache A source of structures
* @return A Structure containing at least the atoms identified by this,
* or null if no PDB ID is set
* @throws StructureException For errors loading and parsing the structure
* @throws IOException Errors reading the structure from disk
*/
@Override
public Structure loadStructure(AtomCache cache) throws IOException, StructureException {
PdbId pdb = getPdbId();
if(pdb == null)
return null;
return cache.getStructureForPdbId(pdb);
}
/**
* Supplements the reduced structure with ligands from the full structure based on
* a distance cutoff. Ligand groups are moved (destructively) from full to reduced
* if they fall within the cutoff of any atom in the reduced structure.
* The {@link StructureTools#DEFAULT_LIGAND_PROXIMITY_CUTOFF default cutoff}
* is used.
* @param full Structure containing all ligands
* @param reduced Structure with a subset of the polymer groups from full
* @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double)
*/
protected static void copyLigandsByProximity(Structure full, Structure reduced) {
// Normal case where all models should be copied from full to reduced
assert full.nrModels() >= reduced.nrModels();
for(int model = 0; model< reduced.nrModels(); model++) {
copyLigandsByProximity(full, reduced, StructureTools.DEFAULT_LIGAND_PROXIMITY_CUTOFF, model, model);
}
}
/**
* Supplements the reduced structure with ligands from the full structure based on
* a distance cutoff. Ligand groups are moved (destructively) from full to reduced
* if they fall within the cutoff of any atom in the reduced structure.
* @param full Structure containing all ligands
* @param reduced Structure with a subset of the polymer groups from full
* @param cutoff Distance cutoff (Å)
* @param fromModel source model in full
* @param toModel destination model in reduced
* @see StructureTools#getLigandsByProximity(java.util.Collection, Atom[], double)
*/
protected static void copyLigandsByProximity(Structure full, Structure reduced, double cutoff, int fromModel, int toModel) {
// Geometric hashing of the reduced structure
Grid grid = new Grid(cutoff);
Atom[] nonwaters = StructureTools.getAllNonHAtomArray(reduced,true,toModel);
if( nonwaters.length < 1 )
return;
grid.addAtoms(nonwaters);
full.getNonPolyChains(fromModel).stream() //potential ligand chains
.flatMap((chain) -> chain.getAtomGroups().stream() ) // potential ligand groups
.filter( (g) -> !g.isWater() ) // ignore waters
.filter( (g) -> !g.isPolymeric() ) // already shouldn't be polymeric, but filter anyways
.filter( (g) -> grid.hasAnyContact(Calc.atomsToPoints(g.getAtoms())) ) // must contact reduced
.sequential() // Keeps ligands from the same chain together if possible
.reduce((Chain)null, // reduction updates the chain guess
(guess, g ) -> {
boolean wasAdded;
try {
// Check that it's not in reduced already
wasAdded = reduced.findGroup(g.getChainId(),
g.getResidueNumber().toString(), toModel) != null;
} catch (StructureException e) {
// not found
wasAdded = false;
}
if( !wasAdded ) {
// Add the ligand to reduced
// note this is not idempotent, but it is synchronized on reduced
logger.info("Adding ligand group {} {} by proximity",g.getPDBName(), g.getResidueNumber().toPDB());
return StructureTools.addGroupToStructure(reduced, g, toModel, guess, false);
}
return guess;
},
// update to the new guess
(oldGuess, newGuess) -> newGuess );
}
}