All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.structure.io.LocalPDBDirectory Maven / Gradle / Ivy

There is a newer version: 7.1.3
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at Oct 18, 2008
 */
package org.biojava.nbio.structure.io;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import org.biojava.nbio.core.util.FileDownloadUtils;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.structure.PdbId;
import org.biojava.nbio.structure.PDBStatus;
import org.biojava.nbio.structure.PDBStatus.Status;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.UserConfiguration;
import org.rcsb.mmtf.utils.CodecUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Superclass for classes which download and interact with the PDB's FTP server,
 * specifically {@link PDBFileReader} and {@link CifFileReader}. The basic
 * functionality of downloading structure files from the FTP site is gathered
 * here, making the child classes responsible for only the specific paths and
 * file formats needed.
 *
 * @author Spencer Bliven
 *
 */
public abstract class LocalPDBDirectory implements StructureIOFile {

	private static final Logger logger = LoggerFactory.getLogger(LocalPDBDirectory.class);

	/**
	 * The default server name, prefixed by the protocol string (http://, https:// or ftp://).
	 * Note that we don't support file stamp retrieving for ftp protocol, thus some of the
	 * fetch modes will not work properly with ftp protocol
	 */
	public static final String DEFAULT_PDB_FILE_SERVER = "https://files.wwpdb.org";
	public static final String PDB_FILE_SERVER_PROPERTY = "PDB.FILE.SERVER";

	/**
	 * The default server to retrieve BinaryCIF files.
	 */
	public static final String DEFAULT_BCIF_FILE_SERVER = "https://models.rcsb.org/";

	/**
	 * Behaviors for when an obsolete structure is requested.
	 * @author Spencer Bliven
	 * @see LocalPDBDirectory#setObsoleteBehavior(ObsoleteBehavior)
	 */
	public static enum ObsoleteBehavior {
		/** Fetch the most recent version of the PDB entry. */
		FETCH_CURRENT,
		/** Fetch the obsolete entry from the PDB archives. */
		FETCH_OBSOLETE,
		/** Throw a StructureException for obsolete entries.*/
		THROW_EXCEPTION;

		public static final ObsoleteBehavior DEFAULT=THROW_EXCEPTION;
	}

	/**
	 * Controls when the class should fetch files from the ftp server
	 * @author Spencer Bliven
	 *
	 */
	public static enum FetchBehavior {
		/** Never fetch from the server; Throw errors for missing files */
		LOCAL_ONLY,
		/** Fetch missing files from the server. Don't check for outdated files */
		FETCH_FILES,
		/**
		 * Fetch missing files from the server, also fetch if file present but older than the
		 * server file.
		 * This requires always querying the server for the last modified time of the file, thus
		 * it adds an overhead to getting files from cache.
		 */
		FETCH_IF_OUTDATED,
		/**
		 * Fetch missing files from the server.
		 * Also force the download of files older than {@value #LAST_REMEDIATION_DATE_STRING}.
		 */
		FETCH_REMEDIATED,
		/** For every file, force downloading from the server */
		FORCE_DOWNLOAD;

		public static final FetchBehavior DEFAULT = FETCH_REMEDIATED;
	}

	/**
	 * Date of the latest PDB file remediation
	 */
	public static final long LAST_REMEDIATION_DATE ;
	private static final String LAST_REMEDIATION_DATE_STRING = "2011/07/12";

	static {

		SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);

		long t = 0;
		try {
			Date d = formatter.parse(LAST_REMEDIATION_DATE_STRING);
			t = d.getTime();
		} catch (ParseException e){
			logger.error("Unexpected error! could not parse LAST_REMEDIATION_DATE: {}", e.getMessage());
		}
		LAST_REMEDIATION_DATE = t;
	}

	/** Minimum size for a valid structure file (CIF or PDB), in bytes */
	public static final long MIN_PDB_FILE_SIZE = 40;  // Empty gzip files are 20bytes. Add a few more for buffer.

	private File path;
	private List extensions;

	/**
	 * The server name, prefixed by the protocol string (http:// or ftp://).
	 * Note that we don't support file stamp retrieving for ftp protocol, thus some of the
	 * fetch modes will not work properly with ftp protocol
	 */
	private String serverName;

	private FileParsingParameters params;

	private ObsoleteBehavior obsoleteBehavior;
	private FetchBehavior fetchBehavior;


	// Cache results of get*DirPath()
	private String splitDirURL; // path on the server, starting with a slash and ending before the 2-char split directories
	private String obsoleteDirURL;
	private File splitDirPath; // path to the directory before the 2-char split
	private File obsoleteDirPath;

	/**
	 * Subclasses should provide default and single-string constructors.
	 * They should use {@link #addExtension(String)} to add one or more extensions.
	 *
	 * 

If path is null, initialize using the system property/environment variable * {@link UserConfiguration#PDB_DIR}. * @param path Path to the PDB file directory */ public LocalPDBDirectory(String path) { extensions = new ArrayList<>(); params = new FileParsingParameters(); if( path == null) { UserConfiguration config = new UserConfiguration(); path = config.getPdbFilePath(); logger.debug("Initialising from system property/environment variable to path: {}", path); } else { path = FileDownloadUtils.expandUserHome(path); logger.debug("Initialising with path {}", path); } this.path = new File(path); this.serverName = getServerName(); // Initialize splitDirURL,obsoleteDirURL,splitDirPath,obsoleteDirPath initPaths(); fetchBehavior = FetchBehavior.DEFAULT; obsoleteBehavior = ObsoleteBehavior.DEFAULT; } public LocalPDBDirectory() { this(null); } /** * Sets the path for the directory where PDB files are read/written */ public void setPath(String p){ path = new File(FileDownloadUtils.expandUserHome(p)) ; initPaths(); } /** * Returns the path value. * @return a String representing the path value * @see #setPath * */ public String getPath() { return path.toString() ; } /** define supported file extensions * compressed extensions .Z,.gz do not need to be specified * they are dealt with automatically. */ @Override public void addExtension(String s){ //System.out.println("add Extension "+s); extensions.add(s); } @Override public List getExtensions() { return Collections.unmodifiableList(extensions); } /** clear the supported file extensions * */ public void clearExtensions(){ extensions.clear(); } @Override public void setFileParsingParameters(FileParsingParameters params){ this.params= params; } @Override public FileParsingParameters getFileParsingParameters(){ return params; } /** * [Optional] This method changes the behavior when obsolete entries * are requested. Current behaviors are: *

    *
  • {@link ObsoleteBehavior#THROW_EXCEPTION THROW_EXCEPTION} * Throw a {@link StructureException} (the default) *
  • {@link ObsoleteBehavior#FETCH_OBSOLETE FETCH_OBSOLETE} * Load the requested ID from the PDB's obsolete repository *
  • {@link ObsoleteBehavior#FETCH_CURRENT FETCH_CURRENT} * Load the most recent version of the requested structure * *

    This setting may be silently ignored by implementations which do not have * access to the server to determine whether an entry is obsolete, such as * if {@link #isAutoFetch()} is false. Note that an obsolete entry may still be * returned even this is FETCH_CURRENT if the entry is found locally. * * @param behavior Whether to fetch obsolete records * @see #setFetchBehavior(FetchBehavior) * @since 4.0.0 */ public void setObsoleteBehavior(ObsoleteBehavior behavior) { obsoleteBehavior = behavior; } /** * Returns how this instance deals with obsolete entries. Note that this * setting may be ignored by some implementations or in some situations, * such as when {@link #isAutoFetch()} is false. * *

    For most implementations, the default value is * {@link ObsoleteBehavior#THROW_EXCEPTION THROW_EXCEPTION}. * * @return The ObsoleteBehavior * @since 4.0.0 */ public ObsoleteBehavior getObsoleteBehavior() { return obsoleteBehavior; } /** * Get the behavior for fetching files from the server * @return */ public FetchBehavior getFetchBehavior() { return fetchBehavior; } /** * Set the behavior for fetching files from the server. * This replaces the {@link #setAutoFetch(boolean)} method with a more * extensive set of options. * @param fetchBehavior */ public void setFetchBehavior(FetchBehavior fetchBehavior) { this.fetchBehavior = fetchBehavior; } @Override public Structure getStructure(String filename) throws IOException { filename = FileDownloadUtils.expandUserHome(filename); File f = new File(filename); return getStructure(f); } public Structure getStructure(URL u) throws IOException{ InputStreamProvider isp = new InputStreamProvider(); InputStream inStream = isp.getInputStream(u); return getStructure(inStream); } @Override public Structure getStructure(File filename) throws IOException { InputStreamProvider isp = new InputStreamProvider(); InputStream inStream = isp.getInputStream(filename); return getStructure(inStream); } /** *{@inheritDoc} */ public Structure getStructureById(String pdbId) throws IOException { return getStructureById(new PdbId(pdbId)); } /** *{@inheritDoc} */ @Override public Structure getStructureById(PdbId pdbId) throws IOException { InputStream inStream = getInputStream(pdbId); return getStructure(inStream); } /** * Handles the actual parsing of the file into a Structure object. * @param inStream * @return * @throws IOException */ public abstract Structure getStructure(InputStream inStream) throws IOException; /** * Load or download the specified structure and return it as an InputStream * for direct parsing. * @param pdbId * @return * @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file. */ protected InputStream getInputStream(PdbId pdbId) throws IOException{ // Check existing File file = downloadStructure(pdbId); if(!file.exists()) { throw new IOException("Structure "+pdbId+" not found and unable to download."); } if(! FileDownloadUtils.validateFile(file)) throw new IOException("Downloaded file invalid: "+file); InputStreamProvider isp = new InputStreamProvider(); InputStream inputStream = isp.getInputStream(file); return inputStream; } /** * Download a structure, but don't parse it yet or store it in memory. * * Used to pre-fetch large numbers of structures. * @param pdbId * @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file. */ public void prefetchStructure(String pdbId) throws IOException { // Check existing File file = downloadStructure(new PdbId(pdbId)); if(!file.exists()) { throw new IOException("Structure "+pdbId+" not found and unable to download."); } if(! FileDownloadUtils.validateFile(file)) throw new IOException("Downloaded file invalid: "+file); } /** * Attempts to delete all versions of a structure from the local directory. * @param pdbId a String representing the PDB ID. * @return True if one or more files were deleted * @throws IOException if the file cannot be deleted */ public boolean deleteStructure(String pdbId) throws IOException { return deleteStructure(new PdbId(pdbId)); } /** * Attempts to delete all versions of a structure from the local directory. * @param pdbId The PDB ID * @return True if one or more files were deleted * @throws IOException if the file cannot be deleted */ public boolean deleteStructure(PdbId pdbId) throws IOException{ boolean deleted = false; // Force getLocalFile to check in obsolete locations ObsoleteBehavior obsolete = getObsoleteBehavior(); setObsoleteBehavior(ObsoleteBehavior.FETCH_OBSOLETE); try { File existing = getLocalFile(pdbId); while(existing != null) { assert(existing.exists()); // should exist unless concurrency problems if( getFetchBehavior() == FetchBehavior.LOCAL_ONLY) { throw new RuntimeException("Refusing to delete from LOCAL_ONLY directory"); } // delete file boolean success = existing.delete(); if(success) { logger.debug("Deleting {}", existing.getAbsolutePath()); } deleted = deleted || success; // delete parent if empty File parent = existing.getParentFile(); if(parent != null) { success = parent.delete(); if(success) { logger.debug("Deleting {}", parent.getAbsolutePath()); } } existing = getLocalFile(pdbId); } return deleted; } finally { setObsoleteBehavior(obsolete); } } /** * Downloads an MMCIF file from the PDB to the local path * @param pdbId * @return The file, or null if it was unavailable for download * @throws IOException for errors downloading or writing, or if the * fetchBehavior is {@link FetchBehavior#LOCAL_ONLY} */ protected File downloadStructure(PdbId pdbId) throws IOException { // decide whether download is required File existing = getLocalFile(pdbId); switch(fetchBehavior) { case LOCAL_ONLY: if( existing == null ) { throw new IOException(String.format("Structure %s not found in %s " + "and configured not to download.",pdbId,getPath())); } else { return existing; } case FETCH_FILES: // Use existing if present if( existing != null) { return existing; } // existing is null, downloadStructure(String,String,boolean,File) will download it break; case FETCH_IF_OUTDATED: // here existing can be null or not: // existing == null : downloadStructure(String,String,boolean,File) will download it // existing != null : downloadStructure(String,String,boolean,File) will check its date and download if older break; case FETCH_REMEDIATED: // Use existing if present and recent enough if( existing != null) { long lastModified = existing.lastModified(); if (lastModified < LAST_REMEDIATION_DATE) { // the file is too old, replace with newer version logger.warn("Replacing file {} with latest remediated (remediation of {}) file from PDB.", existing, LAST_REMEDIATION_DATE_STRING); existing = null; break; } else { return existing; } } case FORCE_DOWNLOAD: // discard the existing file to force redownload existing = null; // downloadStructure(String,String,boolean,File) will download it break; } // Force the download now if(obsoleteBehavior == ObsoleteBehavior.FETCH_CURRENT) { String current = PDBStatus.getCurrent(pdbId.getId()); PdbId pdbIdToDownload = null; if(current == null) { // either an error or there is not current entry pdbIdToDownload = pdbId; }else { pdbIdToDownload = new PdbId(current); } return downloadStructure(pdbIdToDownload, splitDirURL,false, existing); } else if(obsoleteBehavior == ObsoleteBehavior.FETCH_OBSOLETE && PDBStatus.getStatus(pdbId.getId()) == Status.REMOVED) { return downloadStructure(pdbId, obsoleteDirURL, true, existing); } else { return downloadStructure(pdbId, splitDirURL, false, existing); } } /** * Download a file from the http server +/- its validation metadata, replacing any existing files if needed * @param pdbId PDB ID * @param pathOnServer Path on the http server, e.g. data/structures/divided/pdb * @param obsolete Whether or not file should be saved to the obsolete location locally * @param existingFile if not null and checkServerFileDate is true, the last modified date of the * server file and this file will be compared to decide whether to download or not * @return * @throws IOException in cases of file I/O, including failure to download a healthy (non-corrupted) file. */ private File downloadStructure(PdbId pdbId, String pathOnServer, boolean obsolete, File existingFile) throws IOException{ String id = pdbId.getId().toLowerCase(); File dir = getDir(id, obsolete); File realFile = new File(dir,getFilename(id)); String ftp; String filename = getFilename(id); if (filename.endsWith(".bcif") || filename.endsWith(".bcif.gz")) { // TODO this should be configurable ftp = DEFAULT_BCIF_FILE_SERVER + filename; } else { ftp = String.format("%s%s/%s/%s", serverName, pathOnServer, id.substring(id.length()-3, id.length()-1), getFilename(id)); } URL url = new URL(ftp); Date serverFileDate = null; if (existingFile!=null) { serverFileDate = getLastModifiedTime(url); if (serverFileDate!=null) { if (existingFile.lastModified()>=serverFileDate.getTime()) { return existingFile; } else { // otherwise we go ahead and download, warning about it first logger.warn("File {} is outdated, will download new one from PDB (updated on {})", existingFile, serverFileDate.toString()); } } else { logger.warn("Could not determine if file {} is outdated (could not get timestamp from server). Will force redownload", existingFile); } } logger.info("Fetching {}", ftp); logger.info("Writing to {}", realFile); FileDownloadUtils.createValidationFiles(url, realFile, null, FileDownloadUtils.Hash.UNKNOWN); FileDownloadUtils.downloadFile(url, realFile); if(! FileDownloadUtils.validateFile(realFile)) throw new IOException("Downloaded file invalid: "+realFile); return realFile; } /** * Get the last modified time of the file in given url by retrieveing the "Last-Modified" header. * Note that this only works for http URLs * @param url * @return the last modified date or null if it couldn't be retrieved (in that case a warning will be logged) */ private Date getLastModifiedTime(URL url) { // see http://stackoverflow.com/questions/2416872/how-do-you-obtain-modified-date-from-a-remote-file-java Date date = null; try { String lastModified = url.openConnection().getHeaderField("Last-Modified"); logger.debug("Last modified date of server file ({}) is {}",url.toString(),lastModified); if (lastModified!=null) { try { date = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ENGLISH).parse(lastModified); } catch (ParseException e) { logger.warn("Could not parse last modified time from string '{}', no last modified time available for file {}", lastModified, url.toString()); // this will return null } } } catch (IOException e) { logger.warn("Problems while retrieving last modified time for file {}", url.toString()); } return date; } /** * Gets the directory in which the file for a given MMCIF file would live, * creating it if necessary. * * The obsolete parameter is necessary to avoid additional server queries. * @param pdbId * @param obsolete Whether the pdbId is obsolete or not * @return File pointing to the directory, */ protected File getDir(String pdbId, boolean obsolete) { File dir = null; int offset = pdbId.length() - 3; if (obsolete) { // obsolete is always split String middle = pdbId.substring(offset, offset + 2).toLowerCase(); dir = new File(obsoleteDirPath, middle); } else { String middle = pdbId.substring(offset, offset + 2).toLowerCase(); dir = new File(splitDirPath, middle); } if (!dir.exists()) { boolean success = dir.mkdirs(); if (!success) logger.error("Could not create mmCIF dir {}",dir.toString()); } return dir; } /** * Searches for previously downloaded files * @param pdbId * @return A file pointing to the existing file, or null if not found * @throws IOException If the file exists but is empty and can't be deleted */ public File getLocalFile(String pdbId) throws IOException { return getLocalFile(new PdbId(pdbId)); } /** * Searches for previously downloaded files * @param pdbId * @return A file pointing to the existing file, or null if not found * @throws IOException If the file exists but is empty and can't be deleted */ public File getLocalFile(PdbId pdbId) throws IOException { String id = pdbId.getId(); int offset = id.length() - 3; // Search for existing files // Search directories: // 1) LOCAL_MMCIF_SPLIT_DIR//(pdb)?. // 2) LOCAL_MMCIF_ALL_DIR//(pdb)?. LinkedList searchdirs = new LinkedList<>(); String middle = id.substring(offset, offset+2).toLowerCase(); File splitdir = new File(splitDirPath, middle); searchdirs.add(splitdir); // Search obsolete files if requested if(getObsoleteBehavior() == ObsoleteBehavior.FETCH_OBSOLETE) { File obsdir = new File(obsoleteDirPath,middle); searchdirs.add(obsdir); } // valid prefixes before the in the filename String[] prefixes = new String[] {"", "pdb"}; for( File searchdir :searchdirs){ for( String prefix : prefixes) { for(String ex : getExtensions() ){ File f = new File(searchdir,prefix + id.toLowerCase() + ex) ; if ( f.exists()) { // delete files that are too short to have contents if( f.length() < MIN_PDB_FILE_SIZE ) { Files.delete(f.toPath()); return null; } return f; } } } } //Not found return null; } protected boolean checkFileExists(String pdbId) { return checkFileExists(new PdbId(pdbId)); } protected boolean checkFileExists(PdbId pdbId){ try { File path = getLocalFile(pdbId); if ( path != null) return true; } catch(IOException e) {} return false; } /** * Return the String with the PDB server name, including the leading protocol * String (http:// or ftp://). * The server name will be by default the value {@value #DEFAULT_PDB_FILE_SERVER} or the one * read from system property {@value #PDB_FILE_SERVER_PROPERTY} * * @return the server name including the leading protocol string */ public static String getServerName() { String name = System.getProperty(PDB_FILE_SERVER_PROPERTY); if ( name == null || name.trim().isEmpty()) { name = DEFAULT_PDB_FILE_SERVER; logger.debug("Using default PDB file server {}",name); } else { if (!name.startsWith("http://") && !name.startsWith("ftp://") && !name.startsWith("https://")) { logger.warn("Server name {} read from system property {} does not have a leading protocol string. Adding http:// to it", name, PDB_FILE_SERVER_PROPERTY); name = "http://"+name; } logger.info("Using PDB file server {} read from system property {}", name, PDB_FILE_SERVER_PROPERTY); } return name; } /** * Should be called whenever any of the path variables change. * Thus, if {@link getSplitDirPath()} or {@link getObsoleteDirPath()} * depend on anything, they should call this function when that thing * changes (possibly including at the end of the constructor). */ protected void initPaths() { // Hand-rolled String.join(), for java 6 String[] split = getSplitDirPath(); String[] obsolete = getObsoleteDirPath(); //URLs are joined with '/' StringBuilder splitURL = new StringBuilder("/pub/pdb"); for(int i=0;i





© 2015 - 2024 Weber Informatics LLC | Privacy Policy