// org.biojava.nbio.structure.URLIdentifier Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of biojava-structure Show documentation
// The protein structure modules of BioJava.
// There is a newer version: 7.1.3
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure;

import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.biojava.nbio.structure.io.cif.CifStructureConverter;
import org.biojava.nbio.structure.io.StructureFiletype;
import org.biojava.nbio.structure.io.mmtf.MmtfActions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Represents a structure loaded from a URL (including a file URL).
 * <p>
 * A few custom query parameters are supported. Parameter names are matched
 * case-insensitively; a parameter given without a value is treated as absent.
 *
 * <ul>
 * <li>{@code format=[pdb|cif]} Specify the file format (will otherwise be
 *     guessed from the extension)
 * <li>{@code pdbId=[String]} Specify the PDB ID (also guessed from the filename)
 * <li>{@code chainID=[String]} A single chain from the structure
 * <li>{@code residues=[String]} Residue ranges, in a form understood by
 *     {@link SubstructureIdentifier}
 * </ul>
 *
 * @author Spencer Bliven
 *
 */
public class URLIdentifier implements StructureIdentifier {
	private static final long serialVersionUID = -5161230822868926035L;
	private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);

	// Used for guessing the PDB ID from the filename: an optional "pdb" prefix,
	// then a digit followed by three alphanumerics, then a delimiter
	// ('.', '_', '-' or whitespace).
	// UPDATE: Without the optional prefix this RegEx rarely succeeded, because
	// the file name is most of the time in the format pdbxxxx.EXT not xxxx.EXT.
	private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?([0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);
	// Alternative pattern that would also accept an optional "PDB_" + 4 digit prefix (currently disabled):
//	private static final Pattern PDBID_REGEX = Pattern.compile("^(?:pdb)?((PDB_[0-9]{4})?[0-9][a-z0-9]{3})([._-]|\\s).*", Pattern.CASE_INSENSITIVE);

	/** URL parameter specifying the file format (PDB or CIF) */
	public static final String FORMAT_PARAM = "format";
	/** URL parameter specifying the PDB ID */
	public static final String PDBID_PARAM = "pdbid";

	//TODO: should this get renamed to chainname or asymid?
	/** URL parameter specifying a single chain to include; overridden by residues */
	public static final String CHAINID_PARAM = "chainid";
	/**
	 * URL parameter specifying residue ranges to include, e.g. residues=A:1-70
	 * @see SubstructureIdentifier
	 */
	public static final String RESIDUES_PARAM = "residues";

	private final URL url;

	/**
	 * Creates an identifier for the structure located at the given URL.
	 * @param url Location of the structure file, optionally carrying the
	 *  query parameters described on this class
	 */
	public URLIdentifier(URL url) {
		this.url = url;
	}

	/**
	 * Creates an identifier from the string form of a URL.
	 * @param url String to be parsed as a URL
	 * @throws MalformedURLException if the string cannot be parsed as a URL
	 */
	public URLIdentifier(String url) throws MalformedURLException {
		this(new URL(url));
	}

	/**
	 * @return The URL this identifier was created with
	 */
	public URL getURL() {
		return url;
	}

	/**
	 * @return The string form of the URL
	 */
	@Override
	public String getIdentifier() {
		return url.toString();
	}

	/**
	 * Builds a {@link SubstructureIdentifier} from the URL.
	 * <p>
	 * The PDB ID is taken from the {@value #PDBID_PARAM} parameter, or else
	 * guessed from the last path segment of the URL (and may end up null).
	 * The ranges come from the {@value #RESIDUES_PARAM} parameter, or else
	 * from the {@value #CHAINID_PARAM} parameter (the whole chain); if neither
	 * is given the range list is empty.
	 *
	 * @return A SubstructureIdentifier restricted to the requested ranges, or
	 *  without ranges if none were specified in the URL
	 */
	@Override
	public SubstructureIdentifier toCanonical() throws StructureException{
		String pdbId = null;
		List<ResidueRange> ranges = Collections.emptyList();
		try {
			Map<String, String> params = parseQuery(url);
			pdbId = params.get(PDBID_PARAM);

			// A parameter present without a value maps to null; treat it as
			// absent rather than handing null to the range parsing code.
			String residues = params.get(RESIDUES_PARAM);
			String chainId = params.get(CHAINID_PARAM);
			if (residues != null) {
				ranges = ResidueRange.parseMultiple(residues);
			} else if (chainId != null) {
				ranges = Collections.singletonList(new ResidueRange(chainId, (ResidueNumber) null, (ResidueNumber) null));
			}
		} catch (UnsupportedEncodingException e) {
			// Best effort: fall through and guess from the path instead
			logger.error("Unable to decode URL {}", url, e);
		}
		if (pdbId == null) {
			String path = url.getPath();
			pdbId = guessPDBID(path.substring(path.lastIndexOf("/") + 1));
		}
		PdbId id = (pdbId == null) ? null : new PdbId(pdbId);
		return new SubstructureIdentifier(id, ranges);
	}

	/**
	 * Reduces the input to the substructure described by {@link #toCanonical()}.
	 */
	@Override
	public Structure reduce(Structure input) throws StructureException {
		return toCanonical().reduce(input);
	}

	/**
	 * Load the structure from the URL.
	 * <p>
	 * The file format is taken from the {@value #FORMAT_PARAM} query parameter
	 * if it is recognized, otherwise guessed from the URL path. Anything that
	 * is not recognized as CIF, BCIF or MMTF is parsed as a PDB file.
	 *
	 * @param cache Supplies the file parsing parameters and, for PDB files,
	 *  the cache path, fetch behavior and obsolete behavior
	 * @return The structure parsed from the URL
	 * @throws IOException if the URL cannot be read
	 */
	@Override
	public Structure loadStructure(AtomCache cache) throws StructureException, IOException {
		StructureFiletype format = StructureFiletype.UNKNOWN;

		// Use user-specified format
		try {
			String formatStr = parseQuery(url).get(FORMAT_PARAM);
			if (formatStr != null) {
				format = StructureIO.guessFiletype("." + formatStr);
			}
		} catch (UnsupportedEncodingException e) {
			logger.error("Unable to decode URL {}", url, e);
		}

		// Guess format from extension
		if (format == StructureFiletype.UNKNOWN) {
			format = StructureIO.guessFiletype(url.getPath());
		}

		switch (format) {
			case CIF:
			case BCIF:
				return CifStructureConverter.fromURL(url, cache.getFileParsingParams());
			case MMTF:
				// We open the stream here, so we are responsible for closing it
				try (InputStream in = url.openStream()) {
					return MmtfActions.readFromInputStream(in);
				}
			case PDB:
			default: {
				// pdb file based parsing
				PDBFileReader reader = new PDBFileReader(cache.getPath());
				reader.setFetchBehavior(cache.getFetchBehavior());
				reader.setObsoleteBehavior(cache.getObsoleteBehavior());
				reader.setFileParsingParameters(cache.getFileParsingParams());
				return reader.getStructure(url);
			}
		}
	}

	/**
	 * Recognizes PDB IDs that occur at the beginning of name (optionally
	 * preceded by "pdb") followed by some delimiter.
	 * @param name Input filename
	 * @return A 4-character id-like string in upper case, or null if none is found
	 */
	public static String guessPDBID(String name) {
		Matcher match = PDBID_REGEX.matcher(name);
		if (match.matches()) {
			// Locale.ROOT keeps the result independent of the default locale
			// (e.g. 'i' must become 'I', not a dotted capital I).
			return match.group(1).toUpperCase(Locale.ROOT);
		}
		// Give up if doesn't match
		return null;
	}

	/**
	 * Parses URL parameters into a map. Keys are stored lower-case.
	 * <p>
	 * Each {@code key=value} pair is URL-decoded as UTF-8. A pair without a
	 * value (no {@code =}, or nothing after it) is stored with a null value.
	 *
	 * @param url URL whose query string should be parsed
	 * @return Map from lower-cased parameter name to its (possibly null)
	 *  value, in order of first appearance; empty if there is no query
	 * @throws UnsupportedEncodingException if UTF-8 decoding is unavailable
	 */
	private static Map<String, String> parseQuery(URL url) throws UnsupportedEncodingException {
		Map<String, String> params = new LinkedHashMap<>();
		String query = url.getQuery();
		if (query == null || query.isEmpty()) {
			// empty query
			return params;
		}
		for (String pair : query.split("&")) {
			int i = pair.indexOf('=');
			String key = pair;
			String value = null;
			if (i > 0) {
				key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
				if (pair.length() > i + 1) {
					value = URLDecoder.decode(pair.substring(i + 1), "UTF-8");
				}
			}
			// note that this uses the last instance if a parameter is specified multiple times.
			// Locale.ROOT so that e.g. "chainID" always lower-cases to "chainid".
			params.put(key.toLowerCase(Locale.ROOT), value);
		}
		return params;
	}
}