
// org.biojava.nbio.structure.align.client.StructureName Maven / Gradle / Ivy
// Show all versions of biojava-structure Show documentation
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.align.client;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.biojava.nbio.structure.BioAssemblyIdentifier;
import org.biojava.nbio.structure.ResidueRange;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIdentifier;
import org.biojava.nbio.structure.SubstructureIdentifier;
import org.biojava.nbio.structure.URLIdentifier;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.cath.CathDomain;
import org.biojava.nbio.structure.cath.CathFactory;
import org.biojava.nbio.structure.domain.PDPDomain;
import org.biojava.nbio.structure.domain.PDPProvider;
import org.biojava.nbio.structure.domain.RemotePDPProvider;
import org.biojava.nbio.structure.ecod.EcodFactory;
import org.biojava.nbio.structure.io.util.FileDownloadUtils;
import org.biojava.nbio.structure.scop.ScopDatabase;
import org.biojava.nbio.structure.scop.ScopDomain;
import org.biojava.nbio.structure.scop.ScopFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A utility class that makes working with names of structures, domains and ranges easier.
 *
 * <p>Accepts a wide range of identifier formats, including {@link ScopDomain},
 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
 * ranges.
 *
 * <p>Where possible, data is extracted from the input string. Otherwise, range
 * information may be loaded from one of the factory classes:
 * {@link CathFactory}, {@link ScopFactory}, etc.
 *
 * @see #getName the name. e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc.
 */
public class StructureName implements Comparable<StructureName>, Serializable, StructureIdentifier {

    private static final long serialVersionUID = 4021229518711762957L;
    private static final Logger logger = LoggerFactory.getLogger(StructureName.class);

    /** The identifier exactly as passed to the constructor. */
    protected String name;
    /** PDB ID extracted from the identifier; may be null until resolved by {@link #getPdbId()}. */
    protected String pdbId;
    /** Chain ID extracted from the identifier, or null if unavailable. */
    protected String chainId;

    private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
    // ds046__ is a special case with no PDB entry
    private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
    // ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
    private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);

    /**
     * The kinds of identifier this class understands.
     * Names are automatically used as prefixes (e.g. {@code PDB:4HHB}).
     */
    public enum Source {
        PDB,
        SCOP,
        PDP,
        CATH,
        URL,
        FILE,
        ECOD,
        BIO,
    }

    private Source mySource = null;

    // cache for getBaseIdentifier() method
    private StructureIdentifier base = null;

    /**
     * Create a new StructureName from the given identifier, which may be a
     * domain name, a substructure identifier, etc.
     *
     * <p>The source and PDB-Id are extracted at construction time, but fully
     * interpreting the ID, which may require additional parsing or remote
     * calls, is done lazily.
     *
     * <p>The following sources are supported. Any may be prefixed by the source
     * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used
     * unequivocally. If no source is specified, StructureName will make a
     * (usually reliable) guess as to which source was intended.
     *
     * <ul>
     * <li><b>PDB</b> PDB identifier, optionally followed by chain and/or residue
     *     ranges. Internally represented by a {@link SubstructureIdentifier};
     *     see that class for the full format specification.
     *     Examples: 4hhb, 4hhb.A, 4hhb.A:1-50.
     * <li><b>SCOP</b> SCOP domain (or SCOPe, depending on the
     *     {@link ScopFactory#getSCOP()} version). Example: d1h6w.2
     * <li><b>PDP</b> Protein Domain Parser domain. PDP domains are not guessed,
     *     making the PDP: prefix obligatory. Example: PDP:4HHBAa
     * <li><b>CATH</b> Cath domains. Example: 1qvrC03
     * <li><b>URL</b> Arbitrary URLs. Most common protocols are handled,
     *     including http://, ftp://, and file://. Some parsing information can
     *     be passed as custom query parameters. Example:
     *     http://www.rcsb.org/pdb/files/1B8G.pdb.gz
     * <li><b>FILE</b> A file path. Supports relative paths and expands ~ to
     *     the user's home directory. Only existing files will be automatically
     *     detected; to refer to a potentially not-yet existing file, prepend
     *     the prefix. Internally represented as a {@link URLIdentifier}
     *     after path expansion. Example: ~/custom_protein.pdb
     * <li><b>ECOD</b> ECOD domain. Example: e1lyw.1
     * <li><b>BIO</b> Biological assembly. These are not guessed, making
     *     the BIO: prefix obligatory. Example: BIO:2ehz:1
     * </ul>
     *
     * @param name An identifier string
     * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid
     */
    public StructureName(String name) {
        this.name = name;
        init(); // sets pdbId and mySource
    }

    /**
     * Tries to determine the source and pdbId without fully realizing the identifier,
     * which could require I/O depending on the source.
     *
     * @throws IllegalArgumentException if the source is recognizable but invalid
     */
    private void init() {
        // First try identifying a prefix
        String[] prefix = name.split(":", 2);
        mySource = null;
        if (prefix.length > 1) {
            // Match Source prefixes
            String suffix = prefix[1];
            try {
                // Locale.ROOT keeps the mapping locale-independent (e.g. "file" -> "FILE" under a Turkish locale)
                mySource = Source.valueOf(prefix[0].toUpperCase(Locale.ROOT));
            } catch (IllegalArgumentException e) {
                // unrecognized prefix; fall back on guessing
                mySource = null;
            }
            if (mySource != null) {
                switch (mySource) {
                case SCOP:
                    if (!initFromScop(suffix))
                        throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix);
                    return;
                case PDP:
                    // The PDP pattern is matched against the full, prefixed name
                    if (!initFromPDP(name))
                        throw new IllegalArgumentException("Malformed PDP domain name:"+suffix);
                    return;
                case CATH:
                    if (!initFromCATH(suffix))
                        throw new IllegalArgumentException("Malformed CATH domain name:"+suffix);
                    return;
                case BIO:
                    // The BIO pattern is matched against the full, prefixed name
                    if (!initFromBIO(name))
                        throw new IllegalArgumentException("Malformed BIO name:"+suffix);
                    return;
                case ECOD:
                    if (!initFromECOD(suffix))
                        throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix);
                    return;
                case PDB:
                    if (!initFromPDB(suffix))
                        throw new IllegalArgumentException("Malformed PDB specification:"+suffix);
                    return;
                case FILE:
                    // Treat file:/ prefixes as URLs
                    if (!suffix.startsWith("/")) {
                        // Otherwise, treat as file
                        initFromFile();
                        return;
                    }
                    // fall through to URL case
                case URL:
                    if (!initFromURL(name))
                        throw new IllegalArgumentException("Malformed URL specification:"+suffix);
                    return;
                default:
                    throw new IllegalStateException("Unimplemented Source "+mySource);
                }
            }
        }

        // No known prefix, so revert to guessing

        // First guess regex-based identifiers
        // SCOP domain
        if (initFromScop(name))
            return;
        // CATH
        if (initFromCATH(name))
            return;
        // ECOD
        if (initFromECOD(name))
            return;
        // Never guess BIO or PDP

        // URL
        if (initFromURL(name))
            return;

        // Guess FILE based on file existence
        File file = new File(FileDownloadUtils.expandUserHome(name));
        if (file.canRead() && !file.isDirectory()) {
            // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it
            if (name.matches("\\d\\w\\w\\w")) {
                // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it
                // The PDB: prefix is what forces the identifier to be read as a PDB ID (see the switch above)
                logger.warn("Provided 4-letter structure name '{}' matches "
                        + "file name in directory {}. Will read structure "
                        + "data from file {} and not consider the name as a "
                        + "structure identifier. If this is not what you "
                        + "want, use 'PDB:{}'",
                        name, file.getAbsoluteFile().getParent(),
                        file.getAbsolutePath(), name);
            } else {
                logger.info("Provided structure name '{}' matches "
                        + "file name in directory {}. Will read structure "
                        + "data from file {}.",
                        name, file.getAbsoluteFile().getParent(),
                        file.getAbsolutePath());
            }
            initFromFile();
            return;
        }

        // Default to PDB
        initFromPDB(name);
    }

    /** Matches a SCOP domain name; on success sets source, pdbId and chainId. */
    private boolean initFromScop(String name) {
        Matcher matcher = scopPattern.matcher(name);
        if (matcher.matches()) {
            mySource = Source.SCOP;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches a PDP domain name; on success sets pdbId and chainId (source is set by the caller). */
    private boolean initFromPDP(String name) {
        Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name);
        if (matcher.matches()) {
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches a CATH domain name; on success sets source, pdbId and chainId. */
    private boolean initFromCATH(String name) {
        Matcher matcher = cathPattern.matcher(name);
        if (matcher.matches()) {
            mySource = Source.CATH;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = matcher.group(2);
            return true;
        }
        return false;
    }

    /** Matches an ECOD domain name; on success sets source and pdbId. The chain cannot be derived from the name. */
    private boolean initFromECOD(String name) {
        Matcher matcher = ecodPattern.matcher(name);
        if (matcher.matches()) {
            mySource = Source.ECOD;
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            chainId = null;
            return true;
        }
        return false;
    }

    /** Matches a biological assembly name; on success sets pdbId (source is set by the caller). */
    private boolean initFromBIO(String name) {
        Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name);
        if (matcher.matches()) {
            pdbId = matcher.group(1).toUpperCase(Locale.ROOT);
            return true;
        }
        return false;
    }

    /**
     * Interprets the string as a {@link SubstructureIdentifier}. Always returns true;
     * any failure surfaces as an exception from the SubstructureIdentifier constructor.
     */
    private boolean initFromPDB(String suffix) {
        mySource = Source.PDB;
        SubstructureIdentifier si = new SubstructureIdentifier(suffix);
        base = si; // Safe to realize immediately

        pdbId = si.getPdbId();
        // Set chainId if unique
        Set<String> chains = getChainIds(si);
        if (chains.size() == 1) {
            this.chainId = chains.iterator().next();
        } else if (chains.size() > 1) {
            this.chainId = ".";
        } else {
            this.chainId = null;
        }
        return true;
    }

    /** Tries to parse the string as a URL; returns false if it is malformed. */
    private boolean initFromURL(String suffix) {
        try {
            URL url = new URL(suffix);
            String path = url.getPath();
            mySource = Source.URL;
            // Guess the PDB ID from the last path component
            pdbId = URLIdentifier.guessPDBID(path.substring(path.lastIndexOf('/') + 1));
            chainId = null; // Don't bother checking query params here
            return true;
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /** Marks this name as a file; pdbId and chainId are left unset. Always returns true. */
    private boolean initFromFile() {
        mySource = Source.FILE;
        pdbId = null;
        chainId = null;
        return true;
    }

    /** Collects the distinct, non-null chain IDs of the identifier's residue ranges in sorted order. */
    private static Set<String> getChainIds(SubstructureIdentifier si) {
        Set<String> chains = new TreeSet<String>();
        List<ResidueRange> ranges = si.getResidueRanges();
        for (ResidueRange range : ranges) {
            String chain = range.getChainId();
            if (chain != null) {
                chains.add(chain);
            }
        }
        return chains;
    }

    /**
     * Get the PDB ID for this name, if any.
     *
     * Equivalent to {@link SubstructureIdentifier#getPdbId()
     * toCanonical().getPdbId()}
     * @return The upper-case PDB Name, or null if not applicable
     * @throws StructureException Wraps errors which occur when converting to canonical form
     */
    public String getPdbId() throws StructureException {
        if (pdbId == null) {
            // Not determined from the name alone; resolve via the canonical form and cache
            pdbId = toCanonical().getPdbId();
        }
        return pdbId;
    }

    /**
     * Gets the chain ID, for structures where it is unique and well-defined.
     * May return '.' for multi-chain ranges, '_' for wildcard chains, or
     * null if the information is unavailable.
     *
     * <p>This method should only be used casually. For precise chainIds, it
     * is better to use {@link #toCanonical()} and iterate through the
     * residue ranges.
     * @return the chain ID extracted at construction time, or null
     */
    public String getChainId() {
        return chainId;
    }

    /**
     *
     * @return the identifier string
     * @deprecated use {@link #getIdentifier()}
     */
    @Deprecated
    public String getName() {
        return getIdentifier();
    }

    /**
     * Get the original form of the identifier
     */
    @Override
    public String getIdentifier() {
        return name;
    }

    @Override
    public String toString() {
        return name;
    }

    public boolean isScopName() {
        return mySource == Source.SCOP;
    }

    public boolean isPDPDomain() {
        return mySource == Source.PDP;
    }

    public boolean isCathID() {
        return mySource == Source.CATH;
    }

    public boolean isPdbId() {
        return mySource == Source.PDB;
    }

    public boolean isURL() {
        return mySource == Source.URL;
    }

    /**
     * Indicates that the identifier was determined to correspond to a file.
     * Note that some file identifiers may also be valid URLs; in that case,
     * the URL source is preferred.
     * @return true if the source of this name is {@link Source#FILE}
     */
    public boolean isFile() {
        return mySource == Source.FILE;
    }

    public boolean isEcodDomain() {
        return mySource == Source.ECOD;
    }

    public boolean isBioAssembly() {
        return mySource == Source.BIO;
    }

    public Source getSource() {
        return mySource;
    }

    /**
     * StructureName wraps another StructureIdentifier. The type of the base
     * identifier depends on the {@link #getSource() source}. Most StructureName
     * methods delegate to the base identifier.
     *
     * <p>It is possible that future versions of StructureName might change the
     * return type. Except for some specialized uses, it is probably better
     * to create the correct type of identifier directly, rather than creating
     * a StructureName and casting the result of this method.
     * @return the wrapped identifier; it is created on first use and cached
     * @throws StructureException Wraps exceptions that may be thrown by
     * individual implementations. For example, a SCOP identifier may require
     * that the domain definitions be available for download.
     */
    public StructureIdentifier getBaseIdentifier() throws StructureException {
        if (base == null) {
            switch (mySource) {
            case CATH:
                base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
                break;
            case ECOD:
                try {
                    base = EcodFactory.getEcodDatabase().getDomainsById(name);
                } catch (IOException e) {
                    throw new StructureException("Unable to get ECOD domain "+name,e);
                }
                break;
            case SCOP:
                // Fuzzy matching of the domain name to the current default factory
                base = guessScopDomain(getIdentifier(), ScopFactory.getSCOP());
                if (base == null) {
                    // Guessing didn't work, so just use the PDBID and Chain from name
                    // Guess that '_' means 'whole structure'
                    if (chainId.equals("_")) {
                        base = new SubstructureIdentifier(pdbId);
                    } else {
                        base = new SubstructureIdentifier(pdbId, ResidueRange.parseMultiple(chainId));
                    }
                    logger.error("Unable to find {}, so using {}",name,base);
                }
                break;
            case FILE:
                try {
                    // Strip an optional "FILE:" style prefix before expanding the path
                    String[] prefix = name.split(":", 2);
                    String filename;
                    if (prefix.length > 1) {
                        filename = prefix[1];
                    } else {
                        filename = name;
                    }
                    filename = FileDownloadUtils.expandUserHome(filename);
                    base = new URLIdentifier(new File(filename).toURI().toURL());
                } catch (MalformedURLException e) {
                    // Should never happen
                    throw new StructureException("Unable to get URL for file: "+name,e);
                }
                break;
            case URL:
                try {
                    base = new URLIdentifier(name);
                } catch (MalformedURLException e) {
                    throw new StructureException("Invalid URL: "+name,e);
                }
                break;
            case PDP:
                try {
                    PDPProvider provider = new RemotePDPProvider(false);
                    base = provider.getPDPDomain(name);
                } catch (IOException e) {
                    throw new StructureException("Unable to fetch PDP domain "+name, e);
                }
                break;
            case BIO:
                base = new BioAssemblyIdentifier(name);
                break;
            case PDB:
                base = new SubstructureIdentifier(getIdentifier());
                break;
            default:
                throw new IllegalStateException("Unimplemented source: "+mySource);
            }
        }
        return base;
    }

    @Override
    public SubstructureIdentifier toCanonical() throws StructureException {
        return getBaseIdentifier().toCanonical();
    }

    @Override
    public Structure reduce(Structure input) throws StructureException {
        return getBaseIdentifier().reduce(input);
    }

    @Override
    public Structure loadStructure(AtomCache cache) throws StructureException,
            IOException {
        return getBaseIdentifier().loadStructure(cache);
    }

    /** Hash based solely on the original identifier string, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }

    /** Two StructureNames are equal when they are of the same class and have equal identifier strings. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        StructureName other = (StructureName) obj;
        if (name == null) {
            return other.name == null;
        }
        return name.equals(other.name);
    }

    /**
     * Orders identifiers lexicographically by PDB ID and then full Identifier.
     * Names without a resolvable PDB ID sort after those with one.
     *
     * @param o the name to compare against
     * @return negative, zero or positive as this name sorts before, equal to or after {@code o}
     * @throws NullPointerException if {@code o} is null, or if a tie must be broken and this identifier is null
     */
    @Override
    public int compareTo(StructureName o) {
        if (this.equals(o))
            return 0;

        String pdb1 = null;
        String pdb2 = null;
        try {
            pdb1 = this.getPdbId();
        } catch (StructureException ignored) {
            // Best effort: an identifier that cannot be resolved is treated as having no PDB ID
        }
        try {
            // Compare against the OTHER name's PDB ID
            pdb2 = o.getPdbId();
        } catch (StructureException ignored) {
            // Best effort: an identifier that cannot be resolved is treated as having no PDB ID
        }

        int comp = 0;

        // Sort those with PDBIDs before those without
        if (pdb1 == null) {
            if (pdb2 != null) {
                return 1; // this > o
            }
            // both null
        } else if (pdb2 == null) {
            return -1; // this < o
        } else {
            // neither null
            comp = pdb1.compareTo(pdb2);
        }
        if (comp != 0) {
            return comp;
        }

        // break tie with full identifiers
        pdb1 = this.getIdentifier();
        pdb2 = o.getIdentifier();

        // Throws NPE for nulls
        return pdb1.compareTo(pdb2);
    }

    /**
     * Guess a scop domain. If an exact match is found, return that.
     *
     * <p>Otherwise, return the first scop domain found for the specified protein such that
     * <ul>
     * <li>The chains match, or one of the chains is '_' or '.'.
     * <li>The domains match, or one of the domains is '_'.
     * </ul>
     *
     * <p>In some cases there may be several valid matches. In this case a warning
     * will be logged.
     *
     * @param name SCOP domain name, or a guess thereof
     * @param scopDB SCOP domain provider
     * @return The best match for name among the domains of scopDB, or null if none match.
     */
    public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) {
        List<ScopDomain> matches = new LinkedList<ScopDomain>();

        // Try exact match first
        ScopDomain domain = scopDB.getDomainByScopID(name);
        if (domain != null) {
            return domain;
        }

        // Didn't work. Guess it!
        logger.warn("Warning, could not find SCOP domain: {}", name);

        Matcher scopMatch = scopPattern.matcher(name);
        if (scopMatch.matches()) {
            String pdbID = scopMatch.group(1);
            String chainID = scopMatch.group(2);
            String domainID = scopMatch.group(3);

            for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) {
                Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId());
                if (potMatch.matches()) {
                    if (chainID.equals(potMatch.group(2)) || chainID.equals("_") || chainID.equals(".")
                            || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) {
                        if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) {
                            // Match, or near match
                            matches.add(potentialSCOP);
                        }
                    }
                }
            }
        }

        Iterator<ScopDomain> match = matches.iterator();
        if (match.hasNext()) {
            ScopDomain bestMatch = match.next();
            if (logger.isWarnEnabled()) {
                // Report the chosen domain and any alternatives that also matched
                StringBuilder warnMsg = new StringBuilder();
                warnMsg.append("Trying domain " + bestMatch.getScopId() + ".");
                if (match.hasNext()) {
                    warnMsg.append(" Other possibilities: ");
                    while (match.hasNext()) {
                        warnMsg.append(match.next().getScopId() + " ");
                    }
                }
                warnMsg.append(System.getProperty("line.separator"));
                logger.warn(warnMsg.toString());
            }
            return bestMatch;
        } else {
            return null;
        }
    }
}