All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openmolecules.chem.conf.gen.RigidFragmentCache Maven / Gradle / Ivy

There is a newer version: 2024.11.2
Show newest version
/*
 * Copyright 2013-2020 Thomas Sander, openmolecules.org
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Thomas Sander
 */

package org.openmolecules.chem.conf.gen;

import com.actelion.research.chem.Canonizer;
import com.actelion.research.chem.Coordinates;
import com.actelion.research.chem.IDCodeParserWithoutCoordinateInvention;
import com.actelion.research.chem.StereoMolecule;
import com.actelion.research.chem.io.CompoundFileParser;
import com.actelion.research.gui.FileHelper;
import com.actelion.research.util.DoubleFormat;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.ZipInputStream;

// NOTE(review): this copy of the source appears to be damaged: most line breaks are gone and
// text between a '<' and a following '>' seems to be missing (generic type arguments, several
// loop bodies and method headers). All code tokens below are left exactly as found; only
// comments were touched. Restore the file from the original repository before compiling.
/**
 * This class implements a thread-safe, concurrent cache of rigid fragments' 3D-atom-coordinates.
 * It is accessed by the RigidFragmentProvider instances, which serve RigidFragments to
 * ConformerGenerator instances when constructing 3D-coordinates for molecules by assembling
 * them from 3-dimensional rigid fragments and torsion tables.
 * Typically, ConformerGenerators start with an empty cache that fills over time or with
 * a default cache, which is prefilled with many common fragments from organic and medicinal
 * chemistry as well as with common building block fragments.
 * The default cache is balanced in memory footprint and number of fragments it contains.
 * NOTE(review): the hit, request and non-cachable counters are plain int fields that are
 * incremented without synchronization, so they are presumably only approximate when several
 * threads use the same cache - confirm.
 * For special purposes you may consider creating your own custom cache file using the createCache() method. **/ public class RigidFragmentCache extends ConcurrentHashMap implements Serializable { private static final String DEFAULT_CACHE_FILE = "/resources/defaultRigidFragments.zip"; private static RigidFragmentCache sInstance; private int mHitCount,mGetCount, mNonCachableCount; private boolean mDefaultCacheLoaded; private TreeSet mSetOfLoadedCacheFiles; /** Returns the shared static instance, creating an empty cache on the first call. NOTE(review): sInstance is read outside the synchronized block and is not declared volatile. */ public static RigidFragmentCache getDefaultInstance() { if (sInstance != null) return sInstance; synchronized (RigidFragmentCache.class) { if (sInstance == null) sInstance = new RigidFragmentCache(); return sInstance; } } /** Creates a new cache and, if cacheFileName is not null, loads that file into it. */ public static RigidFragmentCache createInstance(String cacheFileName) { RigidFragmentCache cache = new RigidFragmentCache(); if (cacheFileName != null) cache.loadCache(cacheFileName); return cache; } private RigidFragmentCache() {} /** Removes all entries and marks the default cache as not loaded. NOTE(review): mSetOfLoadedCacheFiles is not reset here, so loadCache() keeps skipping files that were loaded before the clear() - confirm this is intended. */ @Override public void clear() { super.clear(); mDefaultCacheLoaded = false; } /** Looks up key and counts the request; on a hit the entry's and the cache's hit counters are incremented as well. */ @Override public RigidFragmentCache.CacheEntry get(Object key) { RigidFragmentCache.CacheEntry entry = super.get(key); mGetCount++; if (entry != null) { entry.incrementHitCount(); mHitCount++; } return entry; } /** Returns hit count divided by request count; there is no guard for a request count of 0, so if both counters are 0 the double division yields NaN. */ public double getHitQuote() { return (double)mHitCount/(double)mGetCount; } public int getHitCount() { return mHitCount; } public int getRequestCount() { return mGetCount; } public int getNonCachableCount() { return mNonCachableCount; } public void increaseNonCachableCount() { mNonCachableCount++; } /** Sets the hit, request and non-cachable counters back to 0. */ public void resetAllCounters() { mNonCachableCount = 0; mHitCount = 0; mGetCount = 0; } /** * Writes for every distinct fragment: one idcode, multiple encoded coordinate sets, multiple conformer likelihoods * @param cacheFileName path of the text file to write (UTF-8 encoded) * @param minHits number of hits for a cache entry to be included in the cache file * @return true if the file was written; false if an IOException occurred (its stack trace is printed) */ public boolean serializeCache(String cacheFileName, int minHits) { try{ BufferedWriter 
bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cacheFileName),"UTF-8")); for (String key : keySet()) { RigidFragmentCache.CacheEntry cacheEntry = super.get(key); // we need super to not increment hit counter if (cacheEntry.hitCount >= minHits) { bw.write(key); bw.newLine(); bw.write(Integer.toString(cacheEntry.coordinates.length)); bw.newLine(); StereoMolecule mol = new IDCodeParserWithoutCoordinateInvention().getCompactMolecule(key); Canonizer canonizer = new Canonizer(mol, Canonizer.COORDS_ARE_3D); for (Coordinates[] coords:cacheEntry.coordinates) { bw.write(canonizer.getEncodedCoordinates(true, coords)); bw.newLine(); canonizer.invalidateCoordinates(); } for (double likelihood:cacheEntry.likelihood) { bw.write(DoubleFormat.toString(likelihood)); bw.newLine(); } } } bw.close(); return true; } catch (IOException ex) { ex.printStackTrace(); } return false; } /** * Loads the default cache from the class path resource DEFAULT_CACHE_FILE (first entry of that zip archive), * unless it was loaded before. A missing resource is ignored; exceptions are printed and not propagated. */ public synchronized void loadDefaultCache() { if (!mDefaultCacheLoaded) { try { InputStream is = RigidFragmentCache.class.getResourceAsStream(DEFAULT_CACHE_FILE); if (is != null) { ZipInputStream zipStream = new ZipInputStream(is); zipStream.getNextEntry(); BufferedReader reader = new BufferedReader(new InputStreamReader(zipStream, StandardCharsets.UTF_8)); loadCache(reader); reader.close(); mDefaultCacheLoaded = true; } } catch (Exception e) { e.printStackTrace(); } } } /** * Loads pre-calculated rigid fragment coordinates from a cache file, which is either a text file * created by the createCache() method, or a zip archive of the text file. * This method can be called multiple times to add conformer data from multiple sources. * If the method is called with a cacheFileName, which was loaded before, then it is not loaded a second time. * NOTE(review): the file name is recorded as loaded even if loading threw an exception, and plain text files * are read with a FileReader without explicit charset while serializeCache() writes UTF-8 - confirm both. 
* @param cacheFileName text file or zipped text file with extension .zip */ public void loadCache(String cacheFileName) { if (mSetOfLoadedCacheFiles == null) mSetOfLoadedCacheFiles = new TreeSet(); if (!mSetOfLoadedCacheFiles.contains(cacheFileName)) { try { BufferedReader reader; if (cacheFileName.endsWith(".zip")) { ZipInputStream zipStream = new ZipInputStream(new FileInputStream(cacheFileName)); zipStream.getNextEntry(); reader = new BufferedReader(new InputStreamReader(zipStream, StandardCharsets.UTF_8)); } else { reader = new BufferedReader(new FileReader(cacheFileName)); } loadCache(reader); reader.close(); } catch (Exception e) { e.printStackTrace(); } mSetOfLoadedCacheFiles.add(cacheFileName); } } private void loadCache(BufferedReader br) throws Exception { String idcode; while ((idcode = br.readLine()) != null) { IDCodeParserWithoutCoordinateInvention parser = new IDCodeParserWithoutCoordinateInvention(); StereoMolecule mol = parser.getCompactMolecule(idcode); int count = Integer.parseInt(br.readLine()); Coordinates[][] coords = new Coordinates[count][mol.getAllAtoms()]; for (int i=0; i * - To achieve a maximum of speed on the expense of memory, e.g. for a cloud based service that * generates conformers on request.
* - If you process molecules with limited diversity, e.g. combinatorial libraries as the Enamine REAL space. * Then you may use a complete cache covering every existing fragment for maximum speed.
* - If you store conformer sets as fragment references and torsion tables. Then your fragment cache * needs a complete cache covering every existing fragment.
* This method processes all input files, locates and all rigid fragments, produces one or more * distinct conformers from the fragments and creates a new cache from them. Optionally, the * fragment conformers can be energy minimized using the MMFF94s+ forcefield. Then multiple cache * cache export files are written: with all cache entries, with entries used at least 2,3,5, and 10 times. * The numbers 1,2,3,5,10 and .txt extention will be appended to the given cache file name. * @param inputFileNames array of one or more input file paths (may be mixture of sdf and dwar) * @param outputDirectory path to output directory ('_cache_n_.txt' will be added) * @param threadCount if 1 then a single threaded approach is used; if 0 then all existing cores are used * @param optimizeFragments whether to energy minimize fragments using MMFF94s+ * @param maxCompoundsPerFile if an input file contains more compounds than this, then the rest are skipped * @param rfp null or custom RigidFragmentProvider if fragments shall be minimized with a different method * @return created cache or null, if an input file could not be found */ public static RigidFragmentCache createCache(String[] inputFileNames, String outputDirectory, int threadCount, boolean optimizeFragments, int maxCompoundsPerFile, RigidFragmentProvider rfp) { boolean notFound = false; for (String ifn:inputFileNames) if (!FileHelper.fileExists(new File(ifn), 1000)) { System.out.println("File not found: '"+ifn+"'"); notFound = true; } if (notFound) return null; RigidFragmentCache cache = createInstance(null); if (rfp != null) rfp.setCache(cache); for (String ifn:inputFileNames) { long millis = (threadCount != 1) ? 
addFragmentsToCacheSMP(cache, optimizeFragments, ifn, maxCompoundsPerFile, rfp, threadCount) : addFragmentsToCache(cache, optimizeFragments, ifn, maxCompoundsPerFile, rfp); System.out.println("File '"+ifn+"' processed in "+millis+" milliseconds."); } if (inputFileNames != null) { System.out.print("Writing cache files... "); String cacheFileName = outputDirectory.concat("/cache_"); boolean success = cache.serializeCache(cacheFileName + "1.txt", 0) // we have one hit less than usages && cache.serializeCache(cacheFileName + "2.txt", 1) && cache.serializeCache(cacheFileName + "3.txt", 2) && cache.serializeCache(cacheFileName + "5.txt", 4) && cache.serializeCache(cacheFileName + "10.txt", 9); System.out.println(success ? "done" : "failure !!!"); } return cache; } private static long addFragmentsToCache(RigidFragmentCache cache, boolean optimizeFragments, String inputFile, int maxCompounds, RigidFragmentProvider rfp) { long start_millis = System.currentTimeMillis(); int compoundNo = 0; System.out.println("Processing '"+inputFile+"'... ('.' = 50 molecules)"); CompoundFileParser parser = CompoundFileParser.createParser(inputFile); ConformerGenerator cg = (rfp == null) ? 
new ConformerGenerator(123L, cache, optimizeFragments) : new ConformerGenerator(123L, rfp); while (parser.next() && compoundNo < maxCompounds) { if (compoundNo % 50 == 49) System.out.print("."); if (compoundNo % 5000 == 4999) { System.out.println(" hit-rate:" + DoubleFormat.toString(cache.getHitQuote(), 5, false) + " millis:" + (System.currentTimeMillis() - start_millis) + " cacheSize:" + cache.size()); cache.resetAllCounters(); } cg.initialize(parser.getMolecule(), false); compoundNo++; } System.out.println(); return System.currentTimeMillis() - start_millis; } private static long addFragmentsToCacheSMP(RigidFragmentCache cache, boolean optimizeFragments, String inputFile, int maxCompounds, RigidFragmentProvider rfp, int threadCount) { long start_millis = System.currentTimeMillis(); int compoundNo = 0; System.out.println("Processing '" + inputFile + "'... ('.' = 50 molecules)"); CompoundFileParser parser = CompoundFileParser.createParser(inputFile); if (threadCount == 0) threadCount = Runtime.getRuntime().availableProcessors(); ArrayBlockingQueue queue = new ArrayBlockingQueue<>(2*threadCount); Thread[] t = new Thread[threadCount]; for (int i = 0; i consumeMoleculesToCacheFragments(queue, cache, optimizeFragments, rfp)); t[i].setPriority(Thread.MIN_PRIORITY); t[i].start(); } while (parser.next() && compoundNo queue, RigidFragmentCache cache, boolean optimizeFragments, RigidFragmentProvider rfp) { ConformerGenerator cg = (rfp == null) ? 
new ConformerGenerator(123L, cache, optimizeFragments) : new ConformerGenerator(123L, rfp); try { while (true) cg.initialize(queue.take(), false); } catch (InterruptedException ie) {} // spawning thread interrupts this after last molecule } /** * Writes a TAB delimited text file that can be opened for debug or other purposes by DataWarrior containing * idcode, idcoords, multiple conformer likelihoods * @param cacheFileName path of the output file, which is written with UTF-8 encoding */ public boolean writeTabDelimitedTable(String cacheFileName) { try { BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(cacheFileName),"UTF-8")); bw.write("Fragment No\tConformer No\tConformer Count\tidcode\tidcoords\tLikelihood"); bw.newLine(); int fragmentCount = 0; for (String key : keySet()) { fragmentCount++; RigidFragmentCache.CacheEntry cacheEntry = super.get(key); // we need super to not increment hit counter StereoMolecule mol = new IDCodeParserWithoutCoordinateInvention().getCompactMolecule(key); Canonizer canonizer = new Canonizer(mol, Canonizer.COORDS_ARE_3D); for (int i=0; i { Coordinates[][] coordinates; double[] likelihood; int hitCount; public CacheEntry(Coordinates[][] coordinates, double[] likelihoods) { this.coordinates = coordinates; this.likelihood = likelihoods; } public void incrementHitCount() { hitCount++; } @Override public int compareTo(CacheEntry o) { if (hitCount != o.hitCount) return hitCount < o.hitCount ? -1 : 1; return 0; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy