org.openscience.cdk.fingerprint.Fingerprinter Maven / Gradle / Ivy
/* Copyright (C) 2002-2007 Christoph Steinbeck
*
* Contact: [email protected]
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.fingerprint;
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.aromaticity.Aromaticity;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.graph.PathTools;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.ringsearch.AllRingsFinder;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import org.openscience.cdk.tools.periodictable.PeriodicTable;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Generates a fingerprint for a given AtomContainer. Fingerprints are
* one-dimensional bit arrays, where bits are set according to a the
* occurrence of a particular structural feature (See for example the
* Daylight inc. theory manual for more information). Fingerprints allow for
* a fast screening step to exclude candidates for a substructure search in a
* database. They are also a means for determining the similarity of chemical
* structures.
*
* A fingerprint is generated for an AtomContainer with this code:
* Molecule molecule = new Molecule();
* IFingerprinter fingerprinter = new Fingerprinter();
* IBitFingerprint fingerprint = fingerprinter.getBitFingerprint(molecule);
* fingerprint.size(); // returns 1024 by default
* fingerprint.length(); // returns the highest set bit
*
*
* The FingerPrinter assumes that hydrogens are explicitly given! Furthermore,
* if pseudo atoms or atoms with malformed symbols are present, their atomic
* number is taken as one more than the last element currently supported in
* {@link org.openscience.cdk.tools.periodictable.PeriodicTable}.
*
* Warning: The aromaticity detection for this
* FingerPrinter relies on AllRingsFinder, which is known to take very long
* for some molecules with many cycles or special cyclic topologies. Thus,
* the AllRingsFinder has a built-in timeout of 5 seconds after which it
* aborts and throws an Exception. If you want your SMILES generated at any
* expense, you need to create your own AllRingsFinder, set the timeout to a
* higher value, and assign it to this FingerPrinter. In the vast majority of
* cases, however, the defaults will be fine.
*
* Another Warning : The daylight manual says:
* "Fingerprints are not so definite: if a fingerprint indicates a pattern is
* missing then it certainly is, but it can only indicate a pattern's presence
* with some probability." In the case of very small molecules, the
* probability that you get the same fingerprint for different molecules is
* high.
*
*
* @author steinbeck
* @cdk.created 2002-02-24
* @cdk.keyword fingerprint
* @cdk.keyword similarity
* @cdk.module standard
* @cdk.githash
*/
public class Fingerprinter implements IFingerprinter {
/** Throw an exception if too many paths (per atom) are generated. */
private final static int DEFAULT_PATH_LIMIT = 1500;
/** The default length of created fingerprints. */
public final static int DEFAULT_SIZE = 1024;
/** The default search depth used to create the fingerprints. */
public final static int DEFAULT_SEARCH_DEPTH = 8;
private int size;
private int searchDepth;
private int pathLimit = DEFAULT_PATH_LIMIT;
static int debugCounter = 0;
private static ILoggingTool logger = LoggingToolFactory
.createLoggingTool(Fingerprinter.class);
private static final Map QUERY_REPLACE = new HashMap() {
private static final long serialVersionUID = 1L;
{
put("Cl", "X");
put("Br", "Z");
put("Si", "Y");
put("As", "D");
put("Li", "L");
put("Se", "E");
put("Na", "G");
put("Ca", "J");
put("Al", "A");
}
};
/**
* Creates a fingerprint generator of length DEFAULT_SIZE
* and with a search depth of DEFAULT_SEARCH_DEPTH
.
*/
public Fingerprinter() {
this(DEFAULT_SIZE, DEFAULT_SEARCH_DEPTH);
}
public Fingerprinter(int size) {
this(size, DEFAULT_SEARCH_DEPTH);
}
/**
* Constructs a fingerprint generator that creates fingerprints of
* the given size, using a generation algorithm with the given search
* depth.
*
* @param size The desired size of the fingerprint
* @param searchDepth The desired depth of search
*/
public Fingerprinter(int size, int searchDepth) {
this.size = size;
this.searchDepth = searchDepth;
}
/**
* Generates a fingerprint of the default size for the given AtomContainer.
*
* @param container The AtomContainer for which a Fingerprint is generated
* @param ringFinder An instance of
* {@link org.openscience.cdk.ringsearch.AllRingsFinder}
* @exception CDKException if there is a timeout in ring or aromaticity
* perception
* @return A {@link BitSet} representing the fingerprint
*/
public IBitFingerprint getBitFingerprint(IAtomContainer container, AllRingsFinder ringFinder) throws CDKException {
int position = -1;
logger.debug("Entering Fingerprinter");
logger.debug("Starting Aromaticity Detection");
long before = System.currentTimeMillis();
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(container);
Aromaticity.cdkLegacy().apply(container);
long after = System.currentTimeMillis();
logger.debug("time for aromaticity calculation: " + (after - before) + " milliseconds");
logger.debug("Finished Aromaticity Detection");
BitSet bitSet = new BitSet(size);
int[] hashes = findPathes(container, searchDepth);
for (int hash : hashes) {
position = new java.util.Random(hash).nextInt(size);
bitSet.set(position);
}
return new BitSetFingerprint(bitSet);
}
/**
* Generates a fingerprint of the default size for the given AtomContainer.
*
*@param container The AtomContainer for which a Fingerprint is generated
*/
@Override
public IBitFingerprint getBitFingerprint(IAtomContainer container) throws CDKException {
return getBitFingerprint(container, null);
}
/** {@inheritDoc} */
@Override
public Map getRawFingerprint(IAtomContainer iAtomContainer) throws CDKException {
throw new UnsupportedOperationException();
}
/**
* Get all paths of lengths 0 to the specified length.
*
* This method will find all paths up to length N starting from each
* atom in the molecule and return the unique set of such paths.
*
* @param container The molecule to search
* @param searchDepth The maximum path length desired
* @return A Map of path strings, keyed on themselves
*/
protected int[] findPathes(IAtomContainer container, int searchDepth) throws CDKException {
List allPaths = new ArrayList();
Map> cache = new HashMap>();
for (IAtom startAtom : container.atoms()) {
List> p = PathTools.getLimitedPathsOfLengthUpto(container, startAtom, searchDepth, pathLimit);
for (List path : p) {
StringBuffer sb = new StringBuffer();
IAtom x = path.get(0);
// TODO if we ever get more than 255 elements, this will
// fail maybe we should use 0 for pseudo atoms and
// malformed symbols? - nope a char 16 bit, up to 65,535
// is okay :)
if (x instanceof IPseudoAtom)
sb.append((char) PeriodicTable.getElementCount() + 1);
else {
Integer atnum = PeriodicTable.getAtomicNumber(x.getSymbol());
if (atnum != null)
sb.append(convertSymbol(x.getSymbol()));
else
sb.append((char) PeriodicTable.getElementCount() + 1);
}
for (int i = 1; i < path.size(); i++) {
final IAtom[] y = {path.get(i)};
Map m = cache.get(x);
final IBond[] b = {m != null ? m.get(y[0]) : null};
if (b[0] == null) {
b[0] = container.getBond(x, y[0]);
cache.put(x, new HashMap() {
{
put(y[0], b[0]);
}
});
}
sb.append(getBondSymbol(b[0]));
sb.append(convertSymbol(y[0].getSymbol()));
x = y[0];
}
// we store the lexicographically lower one of the
// string and its reverse
StringBuffer revForm = new StringBuffer(sb);
revForm.reverse();
if (sb.toString().compareTo(revForm.toString()) <= 0)
allPaths.add(sb);
else
allPaths.add(revForm);
}
}
// now lets clean stuff up
Set cleanPath = new HashSet();
for (StringBuffer s : allPaths) {
String s1 = s.toString().trim();
if (s1.equals("")) continue;
if (cleanPath.contains(s1)) continue;
String s2 = s.reverse().toString().trim();
if (cleanPath.contains(s2)) continue;
cleanPath.add(s2);
}
// convert paths to hashes
int[] hashes = new int[cleanPath.size()];
int i = 0;
for (String s : cleanPath)
hashes[i++] = s.hashCode();
return hashes;
}
private String convertSymbol(String symbol) {
String returnSymbol = QUERY_REPLACE.get(symbol);
return returnSymbol == null ? symbol : returnSymbol;
}
/**
* Gets the bondSymbol attribute of the Fingerprinter class
*
*@param bond Description of the Parameter
*@return The bondSymbol value
*/
protected String getBondSymbol(IBond bond) {
String bondSymbol = "";
if (bond.getFlag(CDKConstants.ISAROMATIC)) {
bondSymbol = ":";
} else if (bond.getOrder() == IBond.Order.SINGLE) {
bondSymbol = "-";
} else if (bond.getOrder() == IBond.Order.DOUBLE) {
bondSymbol = "=";
} else if (bond.getOrder() == IBond.Order.TRIPLE) {
bondSymbol = "#";
}
return bondSymbol;
}
public void setPathLimit(int limit) {
this.pathLimit = limit;
}
public int getSearchDepth() {
return searchDepth;
}
@Override
public int getSize() {
return size;
}
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
throw new UnsupportedOperationException();
}
}