org.biojava.nbio.structure.URLIdentifier Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-structure Show documentation
Show all versions of biojava-structure Show documentation
The protein structure modules of BioJava.
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.io.cif.CifStructureConverter;
import org.biojava.nbio.structure.io.StructureFiletype;
import org.biojava.nbio.structure.io.mmtf.MmtfActions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Represents a structure loaded from a URL (including a file URL).
 * <p>
 * A few custom query parameters are supported. Parameter names are matched
 * case-insensitively, because the query keys are lower-cased when parsed:
 *
 * <ul>
 * <li>{@code format=[pdb|cif]} Specify the file format (will otherwise be
 *     guessed from the extension)</li>
 * <li>{@code pdbId=[String]} Specify the PDB ID (also guessed from the filename)</li>
 * <li>{@code chainID=[String]} A single chain from the structure</li>
 * <li>{@code residues=[String]} Residue ranges, in a form understood by
 *     {@link SubstructureIdentifier}</li>
 * </ul>
 *
 * @author Spencer Bliven
 *
 */
public class URLIdentifier implements StructureIdentifier {

    private static final long serialVersionUID = -5161230822868926035L;

    private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);

    // Used for guessing the PDB ID from the filename. The optional "pdb" prefix
    // is accepted because file names are most of the time in the form
    // pdbxxxx.EXT rather than xxxx.EXT.
    private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?([0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);
    // Alternative pattern, currently disabled:
    // private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?((PDB_[0-9]{4})?[0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);

    /** URL parameter specifying the file format (PDB or CIF) */
    public static final String FORMAT_PARAM = "format";

    /** URL parameter specifying the PDB ID */
    public static final String PDBID_PARAM = "pdbid";

    /** URL parameter specifying a single chain to include; overridden by residues */
    //TODO: should this get renamed to chainname or asymid?
    public static final String CHAINID_PARAM = "chainid";

    /**
     * URL parameter specifying residue ranges to include, e.g. residues=A:1-70
     * @see SubstructureIdentifier
     */
    public static final String RESIDUES_PARAM = "residues";

    private final URL url;

    /**
     * Creates an identifier for the structure located at the given URL.
     *
     * @param url location of the structure file, optionally carrying the
     *            query parameters described in the class documentation
     */
    public URLIdentifier(URL url) {
        this.url = url;
    }

    /**
     * Creates an identifier from the string form of a URL.
     *
     * @param url string to be parsed as a URL
     * @throws MalformedURLException if the string cannot be parsed as a URL
     */
    public URLIdentifier(String url) throws MalformedURLException {
        this(new URL(url));
    }

    /**
     * @return the URL this identifier was created with
     */
    public URL getURL() {
        return url;
    }

    /**
     * @return the string form of the underlying URL
     */
    @Override
    public String getIdentifier() {
        return url.toString();
    }

    /**
     * Builds a {@link SubstructureIdentifier} from the URL.
     * <p>
     * The PDB ID is taken from the {@value #PDBID_PARAM} query parameter, or
     * otherwise guessed from the last path segment via
     * {@link #guessPDBID(String)}; it may end up being {@code null}. The
     * residue ranges come from the {@value #RESIDUES_PARAM} parameter, or
     * otherwise from the {@value #CHAINID_PARAM} parameter (whole chain); if
     * neither is given the range list is empty.
     *
     * @return the SubstructureIdentifier described by this URL
     */
    @Override
    public SubstructureIdentifier toCanonical() throws StructureException {
        Map<String, String> params = queryParams();

        // parseQuery maps a key with an empty value to null, so test the
        // values rather than containsKey to avoid passing null downstream.
        String pdbId = params.get(PDBID_PARAM);
        String residues = params.get(RESIDUES_PARAM);
        String chainId = params.get(CHAINID_PARAM);

        List<ResidueRange> ranges = Collections.emptyList();
        if (residues != null) {
            ranges = ResidueRange.parseMultiple(residues);
        } else if (chainId != null) {
            ranges = Collections.singletonList(new ResidueRange(chainId, (ResidueNumber) null, (ResidueNumber) null));
        }

        if (pdbId == null) {
            // Fall back to the file name, i.e. everything after the last '/'
            String path = url.getPath();
            pdbId = guessPDBID(path.substring(path.lastIndexOf("/") + 1));
        }
        return new SubstructureIdentifier((pdbId == null ? (PdbId) null : new PdbId(pdbId)), ranges);
    }

    /**
     * Reduces the input structure by delegating to the canonical
     * {@link SubstructureIdentifier} returned by {@link #toCanonical()}.
     */
    @Override
    public Structure reduce(Structure input) throws StructureException {
        return toCanonical().reduce(input);
    }

    /**
     * Load the structure from the URL.
     * <p>
     * The file format is taken from the {@value #FORMAT_PARAM} query
     * parameter if it is recognized, and otherwise guessed from the URL path.
     * Anything that is neither CIF/BCIF nor MMTF is parsed as a PDB file.
     *
     * @param cache supplies the file parsing parameters and, for PDB files,
     *              the path, fetch behavior and obsolete behavior
     * @return the structure parsed from the URL
     */
    @Override
    public Structure loadStructure(AtomCache cache) throws StructureException, IOException {
        StructureFiletype format = StructureFiletype.UNKNOWN;

        // Use user-specified format
        String formatStr = queryParams().get(FORMAT_PARAM);
        if (formatStr != null) {
            format = StructureIO.guessFiletype("." + formatStr);
        }

        // Guess format from extension
        if (format == StructureFiletype.UNKNOWN) {
            format = StructureIO.guessFiletype(url.getPath());
        }

        switch (format) {
        case CIF:
        case BCIF:
            return CifStructureConverter.fromURL(url, cache.getFileParsingParams());
        case MMTF:
            // This method opens the stream, so it is responsible for closing it
            try (InputStream in = url.openStream()) {
                return MmtfActions.readFromInputStream(in);
            }
        case PDB:
        default:
            // pdb file based parsing
            PDBFileReader reader = new PDBFileReader(cache.getPath());
            reader.setFetchBehavior(cache.getFetchBehavior());
            reader.setObsoleteBehavior(cache.getObsoleteBehavior());
            reader.setFileParsingParameters(cache.getFileParsingParams());
            return reader.getStructure(url);
        }
    }

    /**
     * Recognizes PDB IDs that occur at the beginning of name (optionally
     * preceded by "pdb") followed by some delimiter.
     *
     * @param name Input filename
     * @return A 4-character id-like string in upper case, or null if none is found
     */
    public static String guessPDBID(String name) {
        if (name == null) {
            return null;
        }
        Matcher match = PDBID_REGEX.matcher(name);
        if (match.matches()) {
            // Locale.ROOT keeps the result independent of the default locale
            // (e.g. Turkish would map 'i' to a dotted capital I)
            return match.group(1).toUpperCase(Locale.ROOT);
        }
        // Give up if doesn't match
        return null;
    }

    /**
     * Parses the query of this identifier's URL, logging instead of
     * propagating a decoding failure.
     *
     * @return the parsed parameters, or an empty map if decoding failed
     */
    private Map<String, String> queryParams() {
        try {
            return parseQuery(url);
        } catch (UnsupportedEncodingException e) {
            logger.error("Unable to decode URL {}", url, e);
            return Collections.emptyMap();
        }
    }

    /**
     * Parses URL parameters into a map. Keys are stored lower-case.
     * <p>
     * A parameter without a value ({@code key} or {@code key=}) is stored
     * with a {@code null} value. Empty segments (as in {@code a=1&&b=2}) are
     * skipped.
     *
     * @param url URL whose query string should be parsed
     * @return map from lower-cased parameter name to decoded value, in order
     *         of first appearance; empty if the URL has no query
     * @throws UnsupportedEncodingException if UTF-8 decoding is unavailable
     */
    private static Map<String, String> parseQuery(URL url) throws UnsupportedEncodingException {
        Map<String, String> params = new LinkedHashMap<>();
        String query = url.getQuery();
        if (query == null || query.isEmpty()) {
            // empty query
            return params;
        }
        for (String pair : query.split("&")) {
            if (pair.isEmpty()) {
                continue;
            }
            int i = pair.indexOf('=');
            String key = pair;
            String value = null;
            if (i > 0) {
                key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
                if (pair.length() > i + 1) {
                    value = URLDecoder.decode(pair.substring(i + 1), "UTF-8");
                }
            }
            // note that this uses the last instance if a parameter is specified multiple times
            params.put(key.toLowerCase(Locale.ROOT), value);
        }
        return params;
    }
}