All downloads are free. Search and download functionality uses the official Maven repository.

org.biojava.nbio.structure.io.mmcif.ZipChemCompProvider Maven / Gradle / Ivy

There is a newer version: 7.2.2
Show newest version
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io.mmcif;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.biojava.nbio.structure.io.mmcif.chem.PolymerType;
import org.biojava.nbio.structure.io.mmcif.chem.ResidueType;
import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This chemical component provider retrieves and caches chemical component definition files from a
 * zip archive specified in its construction.  If the archive does not contain the record, an attempt is
 * made to download it using DownloadChemCompProvider. The downloaded file is then added to the archive.
 *
 * <p>Reads from and writes to the zip archive go through synchronized methods of this instance, so the
 * same ZipChemCompProvider should be shared by all threads to prevent simultaneous read or write to
 * the zip archive.  A zip archive will be created if missing.
 *
 * @author edlunde
 * @author larsonm
 * @since 12/05/12
 * updated 3/5/2016 for Java 7 ZipFileSystem
 */
public class ZipChemCompProvider implements ChemCompProvider{
	private static final Logger s_logger = LoggerFactory.getLogger(ZipChemCompProvider.class);

	/** Base path where $m_zipRootDir/ will be downloaded to. */
	private final Path m_tempDir;
	/** Directory inside the zip archive that holds the .cif.gz entries. */
	private final Path m_zipRootDir;
	/** Location of the zip archive on the default file system. */
	private final Path m_zipFile;
	/** Fallback provider used when a record is not present in the archive. */
	private final DownloadChemCompProvider m_dlProvider;

	/** Whether the downloaded .cif.gz is deleted after being added to the archive. */
	private boolean m_removeCif;

	// Missing IDs from library that cannot be downloaded are added here to prevent delays.
	// getChemComp() is not synchronized, so the set itself must tolerate concurrent access.
	private final Set<String> unavailable = Collections.synchronizedSet(new HashSet<String>());

	/**
	 * ZipChemCompProvider is a Chemical Component provider that stores chemical components
	 * in a zip archive.  Missing chemical components are downloaded and appended to the
	 * archive.  If non-existent a new zip archive will be created.
	 *
	 * @param chemicalComponentDictionaryFile : path to zip archive for chemical components.
	 * @param tempDir : path for temporary directory, (null or empty) defaults to path in property "java.io.tmpdir".
	 * @throws IOException if a missing zip archive cannot be created.
	 */
	public ZipChemCompProvider(String chemicalComponentDictionaryFile, String tempDir) throws IOException {
		this.m_zipFile = Paths.get(chemicalComponentDictionaryFile);

		// Use a default temporary directory if not passed a value.
		if (tempDir == null || tempDir.isEmpty()) {
			this.m_tempDir = Paths.get(System.getProperty("java.io.tmpdir"));
		} else {
			this.m_tempDir = Paths.get(tempDir);
		}

		this.m_zipRootDir = Paths.get("chemcomp");

		// Setup an instance of the download chemcomp provider.
		this.m_dlProvider = new DownloadChemCompProvider(m_tempDir.toString());
		this.m_removeCif = true;
		initializeZip();
	}

	/**
	 * Create the zip archive, containing only an empty "chemcomp/" directory entry, when it
	 * does not exist yet.  See comments in addToZipFileSystem for why initialization is
	 * required with ZipFileSystems - due to URI issues in Java7.
	 *
	 * @throws IOException if the archive cannot be written.
	 */
	private void initializeZip() throws IOException {
		s_logger.info("Using chemical component dictionary: " + m_zipFile.toString());
		final File f = m_zipFile.toFile();
		if (f.exists()) {
			return;
		}

		s_logger.info("Creating missing zip archive: " + m_zipFile.toString());
		// Both streams are managed resources so the file handle is released on any failure.
		try (FileOutputStream fo = new FileOutputStream(f);
				ZipOutputStream zip = new ZipOutputStream(new BufferedOutputStream(fo))) {
			zip.putNextEntry(new ZipEntry("chemcomp/"));
			zip.closeEntry();
		}
	}

	/**
	 * Remove downloaded .cif.gz after adding to zip archive?
	 * Default is true.
	 * @param doRemove true to delete the downloaded file once it has been added to the archive.
	 */
	public void setRemoveCif(boolean doRemove) {
		m_removeCif = doRemove;
	}

	/* (non-Javadoc)
	 * @see org.biojava.nbio.structure.io.mmcif.ChemCompProvider#getChemComp(java.lang.String)
	 *
	 * @param recordName : three letter PDB name for a residue
	 * @return ChemComp from .zip or ChemComp from repository.  Will return empty ChemComp when unable to find a residue and will return null if not provided a valid recordName.
	 */
	@Override
	public ChemComp getChemComp(String recordName) {
		if (null == recordName) return null;

		// handle non-existent ChemComp codes and do not repeatedly attempt to add these.
		if (unavailable.contains(recordName)) {
			return getEmptyChemComp(recordName);
		}

		// Try to pull from zip, if fail then download.
		ChemComp cc = getFromZip(recordName);
		if (cc == null) {
			s_logger.info("File "+recordName+" not found in archive. Attempting download from PDB.");
			cc = downloadAndAdd(recordName);
		}

		// If a null record or an empty chemcomp, return a default ChemComp and blacklist.
		if (cc == null || (null == cc.getName() && cc.getAtoms().size() == 0)) {
			s_logger.info("Unable to find or download " + recordName + " - excluding from future searches.");
			unavailable.add(recordName);
			return getEmptyChemComp(recordName);
		}
		return cc;
	}

	/** Use DownloadChemCompProvider to grab a gzipped cif record from the PDB.
	 *  Zip the downloaded cif.gz file into the dictionary.
	 *
	 * @param recordName is the three-letter chemical component code (i.e. residue name).
	 * @return ChemComp returned by the download provider for recordName
	 */
	private ChemComp downloadAndAdd(String recordName){
		final ChemComp cc = m_dlProvider.getChemComp(recordName);

		// The download provider is expected to have left the record at
		// <tempDir>/chemcomp/<recordName>.cif.gz; a missing file is skipped by addToZipFileSystem.
		final File cif = m_tempDir.resolve("chemcomp").resolve(recordName + ".cif.gz").toFile();
		addToZipFileSystem(m_zipFile, new File[] { cif }, m_zipRootDir);
		if (m_removeCif) {
			cif.delete();
		}
		return cc;
	}

	/**
	 * Cleanup chemical component (.cif.gz) files downloaded to tmpdir.  The "chemcomp"
	 * sub-directory itself is deleted afterwards if that leaves it empty.
	 * @param tempdir : path to temporary directory for chemical components, null is ignored.
	 */
	public static void purgeTempFiles(String tempdir) {
		if (tempdir == null) return;

		s_logger.info("Removing: "+tempdir);
		Path dlPath = Paths.get(tempdir).resolve("chemcomp");
		File[] chemCompOutFiles = finder(dlPath.toString(), "cif.gz");
		if (null != chemCompOutFiles) {
			for (File f : chemCompOutFiles) {
				f.delete();
			}
		}
		dlPath.toFile().delete();
	}

	/**
	 * Return an empty ChemComp group for a three-letter resName.
	 * @param resName residue name; only its first three characters are used.
	 * @return a ChemComp with one-letter code "?", unknown polymer type, residue type atomn and
	 * the three-letter code set to the first three characters of resName (empty string when
	 * resName is null or shorter than three characters).
	 */
	private ChemComp getEmptyChemComp(String resName){
		String pdbName = ""; // Empty string is default
		if (null != resName && resName.length() >= 3) {
			pdbName = resName.substring(0,3);
		}
		final ChemComp comp = new ChemComp();
		comp.setOne_letter_code("?");
		comp.setThree_letter_code(pdbName);
		comp.setPolymerType(PolymerType.unknown);
		comp.setResidueType(ResidueType.atomn);
		return comp;
	}

	/**
	 * Return File(s) in dirName that match suffix.
	 * @param dirName directory to list
	 * @param suffix file name ending to match
	 * @return the matching files, or null when an argument is null or the directory cannot be listed.
	 */
	static private File[] finder( String dirName, final String suffix){
		if (null == dirName || null == suffix) {
			return null;
		}

		final File dir = new File(dirName);
		return dir.listFiles(new FilenameFilter() {
			@Override
			public boolean accept(File dir, String filename)
			{ return filename.endsWith(suffix); }
		} );
	}

	/**
	 * This is synchronized, along with addToZipFileSystem to prevent simultaneous reading/writing.
	 * @param recordName to find in zipfile.
	 * @return ChemComp if found or null if missing or unreadable.
	 */
	private synchronized ChemComp getFromZip(String recordName) {
		if (!m_zipFile.toFile().exists()) return null;
		final String filename = "chemcomp/" + recordName+".cif.gz";

		// The (ClassLoader) cast selects the Java 7 overload; a bare null is ambiguous on
		// JDKs that also offer newFileSystem(Path, Map).
		try (FileSystem fs = FileSystems.newFileSystem(m_zipFile, (ClassLoader) null)) {
			final Path cif = fs.getPath(filename);
			if (!Files.exists(cif)) {
				return null;
			}

			final MMcifParser parser = new SimpleMMcifParser();
			final ChemCompConsumer consumer = new ChemCompConsumer();
			parser.addMMcifConsumer(consumer);

			// Streams are closed even when decompression or parsing fails.
			try (InputStream zipStream = Files.newInputStream(cif);
					InputStream inputStream = new GZIPInputStream(zipStream)) {
				s_logger.debug("reading " + recordName + " from " + m_zipFile);
				parser.parse(inputStream);
			}

			final ChemicalComponentDictionary dict = consumer.getDictionary();
			return dict.getChemComp(recordName);
		} catch (IOException e) {
			s_logger.error("Unable to read from zip file : " + e.getMessage(), e);
			return null;
		}
	}

	/**
	 * Add an array of files to a zip archive.
	 * Synchronized to prevent simultaneous reading/writing.
	 *
	 * @param zipFile is a destination zip archive, which must already exist
	 * @param files is an array of files to be added; directories and missing files are skipped
	 * @param pathWithinArchive is the path within the archive to add files to
	 * @return true if successfully appended these files.
	 */
	private synchronized boolean addToZipFileSystem(Path zipFile, File[] files, Path pathWithinArchive) {
		/* URIs in Java 7 cannot have spaces, so the archive is opened through its Path rather
		 * than a "jar:file:" URI.  That overload takes no properties map, so it cannot be asked
		 * to create a new zip archive; ZipChemCompProvider.initializeZip creates the missing
		 * zip file instead. */
		try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, (ClassLoader) null)) {
			// Zip entries always use '/' as separator, whatever the platform separator is.
			final String dirName = pathWithinArchive.toString().replace(File.separatorChar, '/');
			// The directory must be created inside the archive, not on the default file system.
			final Path dirInZip = zipfs.getPath(dirName);
			Files.createDirectories(dirInZip);

			// Copy in each file.
			for (File f : files) {
				if (!f.isDirectory() && f.exists()) {
					Files.copy(f.toPath(), dirInZip.resolve(f.getName()),
							StandardCopyOption.REPLACE_EXISTING);
				}
			}
			return true;
		} catch (IOException ex) {
			s_logger.error("Unable to add entries to Chemical Component zip archive : " + ex.getMessage(), ex);
			return false;
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy