All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.structure.align.client.StructureName Maven / Gradle / Ivy

There is a newer version: 7.2.2
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.align.client;


import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.nbio.structure.BioAssemblyIdentifier;
import org.biojava.nbio.structure.ResidueRange;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIdentifier;
import org.biojava.nbio.structure.SubstructureIdentifier;
import org.biojava.nbio.structure.URLIdentifier;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.cath.CathDomain;
import org.biojava.nbio.structure.cath.CathFactory;
import org.biojava.nbio.structure.domain.PDPDomain;
import org.biojava.nbio.structure.domain.PDPProvider;
import org.biojava.nbio.structure.domain.RemotePDPProvider;
import org.biojava.nbio.structure.ecod.EcodFactory;
import org.biojava.nbio.structure.io.util.FileDownloadUtils;
import org.biojava.nbio.structure.scop.ScopDatabase;
import org.biojava.nbio.structure.scop.ScopDomain;
import org.biojava.nbio.structure.scop.ScopFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A utility class that makes working with names of structures, domains and ranges easier.
 *
 * Accepts a wide range of identifier formats, including {@link ScopDomain},
 * {@link CathDomain}, PDP domains, and {@link SubstructureIdentifier} residue
 * ranges.
 *
 * Where possible, data is extracted from the input string. Otherwise, range
 * information may be loaded from one of the factory classes:
 * {@link CathFactory}, {@link ScopFactory}, etc.
 *
 * @see #getIdentifier() the name, e.g. 4hhb, 4hhb.A, d4hhba_, PDP:4HHBAa etc.
 */

public class StructureName implements Comparable, Serializable, StructureIdentifier {
	private static final long serialVersionUID = 4021229518711762957L;
	private static final Logger logger = LoggerFactory.getLogger(StructureName.class);

	protected String name;
	protected String pdbId;
	protected String chainId;

	private static final Pattern cathPattern = Pattern.compile("^(?:CATH:)?([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
	// ds046__ is a special case with no PDB entry
	private static final Pattern scopPattern = Pattern.compile("^(?:SCOP:)?d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
	// ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
	private static final Pattern ecodPattern = Pattern.compile("^(?:ECOD:)?e([0-9][a-z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);

	// Names are automatically used as prefixes
	public enum Source {
		PDB,
		SCOP,
		PDP,
		CATH,
		URL,
		FILE,
		ECOD,
		BIO,
	};

	private Source mySource = null;

	// cache for getBaseIdentifier() method
	private StructureIdentifier base = null;

	/**
	 * Create a new StructureName from the given identifier, which may be a
	 * domain name, a substructure identifier, etc.
	 * 

* The source and PDB-Id are extracted at compile time, but fully * interpreting the ID, which may require additional parsing or remote * calls, is done lazily. *

* The following sources are supported. Any may be prefixed by the source * name followed by a colon (e.g. PDB:4HHB). In this case, that source will be used * unequivocally. If no source is specified, StructureName will make a * (usually reliable) guess as to which source was intended. *

    *
  • PDBPDB identifier, optionally followed by chain and/or residue * ranges. Internally represented by a {@link SubstructureIdentifier}; * see that class for the full format specification. * Examples: 4hhb, 4hhb.A, 4hhb.A:1-50. *
  • SCOP SCOP domain (or SCOPe, depending on the * {@link ScopFactory#getSCOP()} version). Example: d1h6w.2 *
  • PDP Protein Domain Parser domain. PDP domains are not guessed, * making the PDP: prefix obligatory. Example: PDP:4HHBAa *
  • CATH Cath domains. Example: 1qvrC03 *
  • URL Arbitrary URLs. Most common protocols are handled, * including http://, ftp://, and file://. Some parsing information can * be passed as custom query parameters. Example: * http://www.rcsb.org/pdb/files/1B8G.pdb.gz *
  • FILE A file path. Supports relative paths and expands ~ to * the user's home directory. Only existing files will be automatically * detected; to refer to a potentially not-yet existing file, prepend * the prefix. Internally represented as a {@link URLIdentifier} * after path expansion. Example: ~/custom_protein.pdb *
  • ECOD ECOD domain. Example: e1lyw.1 *
  • BIO Biological assembly. These are not guessed, making * the BIO: prefix obligatory. Example: BIO:2ehz:1 *
* @param name An identifier string * @throws IllegalArgumentException if the name has a recognizable source but is semantically invalid */ public StructureName(String name){ this.name = name; init();//sets pdbId and mySource } /** * Tries to determine the source and pdbId without fully realizing the identifier, * which could require I/O depending on the source * @throws IllegalArgumentException if the source is recognizable but invalid */ private void init(){ // First try identifying a prefix String[] prefix = name.split(":", 2); mySource = null; if(prefix.length > 1) { // Match Source prefixes String suffix = prefix[1]; try { mySource = Source.valueOf(prefix[0].toUpperCase()); } catch( IllegalArgumentException e ) { // unrecognized prefix; fall back on guessing mySource = null; } if(mySource != null) { switch( mySource) { case SCOP: if( ! initFromScop(suffix) ) throw new IllegalArgumentException("Malformed SCOP domain name:"+suffix); return; case PDP: if( ! initFromPDP(name) ) throw new IllegalArgumentException("Malformed PDP domain name:"+suffix); return; case CATH: if( ! initFromCATH(suffix) ) throw new IllegalArgumentException("Malformed CATH domain name:"+suffix); return; case BIO: if( ! initFromBIO(name) ) throw new IllegalArgumentException("Malformed BIO name:"+suffix); return; case ECOD: if( ! initFromECOD(suffix) ) throw new IllegalArgumentException("Malformed ECOD domain name:"+suffix); return; case PDB: if( ! initFromPDB(suffix) ) throw new IllegalArgumentException("Malformed PDB specification:"+suffix); return; case FILE: // Treat file:/ prefixes as URLs if( ! suffix.startsWith("/")) { // Otherwise, treat as file initFromFile(); return; } // fall through to URL case case URL: if( ! 
initFromURL(name)) throw new IllegalArgumentException("Malformed URL specification:"+suffix); return; default: throw new IllegalStateException("Unimplemented Source "+mySource); } } } // No known prefix, so revert to guessing // First guess regex-based identifiers // SCOP domain if( initFromScop(name) ) return; // CATH if( initFromCATH(name) ) return; // ECOD if( initFromECOD(name) ) return; // Never guess BIO or PDP // URL if( initFromURL(name) ) return; // Guess FILE based on file existence File file = new File(FileDownloadUtils.expandUserHome(name)); if( file.canRead() && !file.isDirectory() ) { // an attempt to mitigate issue #398. It doesn't fix it but it catches the most common case of passing a pdb id and finding a file in working dir matching it if (name.matches("\\d\\w\\w\\w")) { // the plain pdb id case, this is unlikely to be what the user wants: let's let it through but warn about it logger.warn("Provided 4-letter structure name '{}' matches " + "file name in directory {}. Will read structure " + "data from file {} and not consider the name as a " + "structure identifier. If this is not what you " + "want, use 'FILE:{}'", name, file.getAbsoluteFile().getParent(), file.getAbsolutePath(), name); } else { logger.info("Provided structure name '{}' matches " + "file name in directory {}. 
Will read structure " + "data from file {}.", name, file.getAbsoluteFile().getParent(), file.getAbsolutePath()); } initFromFile(); return; } // Default to PDB initFromPDB( name ); } private boolean initFromScop(String name) { Matcher matcher = scopPattern.matcher(name); if ( matcher.matches() ) { mySource = Source.SCOP; pdbId = matcher.group(1).toUpperCase(); chainId = matcher.group(2); return true; } return false; } private boolean initFromPDP(String name) { Matcher matcher = PDPDomain.PDP_NAME_PATTERN.matcher(name); if( matcher.matches() ) { pdbId = matcher.group(1).toUpperCase(); chainId = matcher.group(2); return true; } return false; } private boolean initFromCATH(String name) { Matcher matcher = cathPattern.matcher(name); if ( matcher.matches() ){ mySource = Source.CATH; pdbId = matcher.group(1).toUpperCase(); chainId = matcher.group(2); return true; } return false; } private boolean initFromECOD(String name) { Matcher matcher = ecodPattern.matcher(name); if ( matcher.matches() ){ mySource = Source.ECOD; pdbId = matcher.group(1).toUpperCase(); chainId = null; return true; } return false; } private boolean initFromBIO(String name) { Matcher matcher = BioAssemblyIdentifier.BIO_NAME_PATTERN.matcher(name); if( matcher.matches() ) { pdbId = matcher.group(1).toUpperCase(); return true; } return false; } private boolean initFromPDB(String suffix) { mySource = Source.PDB; SubstructureIdentifier si = new SubstructureIdentifier(suffix); base = si; // Safe to realize immediately pdbId = si.getPdbId(); // Set chainId if unique Set chains = getChainIds(si); if(chains.size() == 1) { this.chainId = chains.iterator().next(); } else if(chains.size() > 1) { this.chainId = "."; } else { this.chainId = null; } return true; } private boolean initFromURL(String suffix) { try { URL url = new URL(suffix); String path = url.getPath(); mySource = Source.URL; pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) ); chainId = null; // Don't bother checking query 
params here return true; } catch(MalformedURLException e) { return false; } } private boolean initFromFile() { mySource = Source.FILE; pdbId = null; chainId = null; return true; } private static Set getChainIds(SubstructureIdentifier si) { Set chains = new TreeSet(); List ranges = si.getResidueRanges(); for(ResidueRange range : ranges) { String chain = range.getChainId(); if(chain != null) { chains.add(chain); } } return chains; } /** * Get the PDB ID for this name, if any. * * Equivalent to {@link SubstructureIdentifier#getPdbId() * toCanonical().getPdbId()} * @return The upper-case PDB Name, or null if not applicable * @throws StructureException Wraps errors which occur when converting to canonical form */ public String getPdbId() throws StructureException { if( pdbId == null) { pdbId = toCanonical().getPdbId(); } return pdbId; } /** * Gets the chain ID, for structures where it is unique and well-defined. * May return '.' for multi-chain ranges, '_' for wildcard chains, or * null if the information is unavailable. * *

This method should only be used casually. For precise chainIds, it * is better to use {@link #toCanonical()} and iterate through the * residue ranges. * @return */ public String getChainId() { return chainId; } /** * * @return the identifier string * @deprecated use {@link #getIdentifier()} */ @Deprecated public String getName(){ return getIdentifier(); } /** * Get the original form of the identifier */ @Override public String getIdentifier() { return name; } @Override public String toString(){ return name; } public boolean isScopName() { return mySource == Source.SCOP; } public boolean isPDPDomain(){ return mySource == Source.PDP; } public boolean isCathID(){ return mySource == Source.CATH; } public boolean isPdbId(){ return mySource == Source.PDB; } public boolean isURL() { return mySource == Source.URL; } /** * Indicates that the identifier was determined to correspond to a file. * Note that some file identifiers may also be valid URLs; in that case, * the URL source is preferred. * @return */ public boolean isFile() { return mySource == Source.FILE; } public boolean isEcodDomain() { return mySource == Source.ECOD; } public boolean isBioAssembly() { return mySource == Source.BIO; } public Source getSource() { return mySource; } /** * StructureName wraps another StructureIdentifier. The type of the base * identifier depends on the {@link #getSource() source}. Most StructureName * methods deligate to the base identifier. * *

It is possible that future versions of StructureName might change the * return type. Except for some specialized uses, it is probably better * to create the correct type of identifier directly, rather than creating * a StructureName and casting the result of this method. * @return A Str * @throws StructureException Wraps exceptions that may be thrown by * individual implementations. For example, a SCOP identifier may require * that the domain definitions be available for download. */ public StructureIdentifier getBaseIdentifier() throws StructureException { if( base == null ) { switch(mySource) { case CATH: base = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier()); break; case ECOD: try { base = EcodFactory.getEcodDatabase().getDomainsById(name); } catch (IOException e) { throw new StructureException("Unable to get ECOD domain "+name,e); } break; case SCOP: // Fuzzy matching of the domain name to the current default factory base = guessScopDomain(getIdentifier(),ScopFactory.getSCOP()); if(base == null) { // Guessing didn't work, so just use the PDBID and Chain from name // Guess that '_' means 'whole structure' if (chainId.equals("_")) { base = new SubstructureIdentifier(pdbId); } else { base = new SubstructureIdentifier(pdbId,ResidueRange.parseMultiple(chainId)); } logger.error("Unable to find {}, so using {}",name,base); } break; case FILE: try { String[] prefix = name.split(":", 2); String filename; if(prefix.length > 1) { filename = prefix[1]; } else { filename = name; } filename = FileDownloadUtils.expandUserHome(filename); base = new URLIdentifier(new File(filename).toURI().toURL()); } catch (MalformedURLException e) { // Should never happen throw new StructureException("Unable to get URL for file: "+name,e); } break; case URL: try { base = new URLIdentifier(name); } catch (MalformedURLException e) { throw new StructureException("Invalid URL: "+name,e); } break; case PDP: try { PDPProvider provider = new RemotePDPProvider(false); base = 
provider.getPDPDomain(name); } catch (IOException e) { throw new StructureException("Unable to fetch PDP domain "+name, e); } break; case BIO: base = new BioAssemblyIdentifier(name); break; case PDB: base = new SubstructureIdentifier(getIdentifier()); break; default: throw new IllegalStateException("Unimplemented source: "+mySource); } } return base; } @Override public SubstructureIdentifier toCanonical() throws StructureException { return getBaseIdentifier().toCanonical(); } @Override public Structure reduce(Structure input) throws StructureException { return getBaseIdentifier().reduce(input); } @Override public Structure loadStructure(AtomCache cache) throws StructureException, IOException { return getBaseIdentifier().loadStructure(cache); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((name == null) ? 0 : name.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; StructureName other = (StructureName) obj; if (name == null) { if (other.name != null) return false; } else if (!name.equals(other.name)) return false; return true; } /** * Orders identifiers lexicographically by PDB ID and then full Identifier */ @Override public int compareTo(StructureName o) { if ( this.equals(o)) return 0; String pdb1 = null; String pdb2 = null; try { pdb1 = this.getPdbId(); } catch (StructureException e) {} try { pdb2 = this.getPdbId(); } catch (StructureException e) {} int comp = 0; // Sort those with PDBIDs before those without if( pdb1 == null ) { if( pdb2 != null) { return 1; // this > o } // both null } else if( pdb2 == null){ return -1; // this < o } else { // neither null comp = pdb1.compareTo(pdb2); } if( comp != 0 ) { return comp; } // break tie with full identifiers pdb1 = this.getIdentifier(); pdb2 = o.getIdentifier(); // Throws NPE for nulls return pdb1.compareTo(pdb2); } /** *

* Guess a scop domain. If an exact match is found, return that. * *

* Otherwise, return the first scop domain found for the specified protein such that *

    *
  • The chains match, or one of the chains is '_' or '.'. *
  • The domains match, or one of the domains is '_'. *
* * In some cases there may be several valid matches. In this case a warning * will be logged. * * @param name SCOP domain name, or a guess thereof * @param scopDB SCOP domain provider * @return The best match for name among the domains of scopDB, or null if none match. */ public static ScopDomain guessScopDomain(String name, ScopDatabase scopDB) { List matches = new LinkedList(); // Try exact match first ScopDomain domain = scopDB.getDomainByScopID(name); if (domain != null) { return domain; } // Didn't work. Guess it! logger.warn("Warning, could not find SCOP domain: " + name); Matcher scopMatch = scopPattern.matcher(name); if (scopMatch.matches()) { String pdbID = scopMatch.group(1); String chainID = scopMatch.group(2); String domainID = scopMatch.group(3); for (ScopDomain potentialSCOP : scopDB.getDomainsForPDB(pdbID)) { Matcher potMatch = scopPattern.matcher(potentialSCOP.getScopId()); if (potMatch.matches()) { if (chainID.equals(potMatch.group(2)) || chainID.equals("_") || chainID.equals(".") || potMatch.group(2).equals("_") || potMatch.group(2).equals(".")) { if (domainID.equals(potMatch.group(3)) || domainID.equals("_") || potMatch.group(3).equals("_")) { // Match, or near match matches.add(potentialSCOP); } } } } } Iterator match = matches.iterator(); if (match.hasNext()) { ScopDomain bestMatch = match.next(); if(logger.isWarnEnabled()) { StringBuilder warnMsg = new StringBuilder(); warnMsg.append("Trying domain " + bestMatch.getScopId() + "."); if (match.hasNext()) { warnMsg.append(" Other possibilities: "); while (match.hasNext()) { warnMsg.append(match.next().getScopId() + " "); } } warnMsg.append(System.getProperty("line.separator")); logger.warn(warnMsg.toString()); } return bestMatch; } else { return null; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy