// org.openscience.cdk.fingerprint.Fingerprinter Maven / Gradle / Ivy
/* Copyright (C) 2002-2007 Christoph Steinbeck
*
* Contact: [email protected]
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.fingerprint;
import org.openscience.cdk.aromaticity.Aromaticity;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.graph.PathTools;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.ringsearch.AllRingsFinder;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* Generates a fingerprint for a given AtomContainer. Fingerprints are
* one-dimensional bit arrays, where bits are set according to the
* occurrence of a particular structural feature (See for example the
* Daylight inc. theory manual for more information). Fingerprints allow for
* a fast screening step to exclude candidates for a substructure search in a
* database. They are also a means for determining the similarity of chemical
* structures.
*
* A fingerprint is generated for an AtomContainer with this code:
* Molecule molecule = new Molecule();
* IFingerprinter fingerprinter = new Fingerprinter();
* IBitFingerprint fingerprint = fingerprinter.getBitFingerprint(molecule);
* fingerprint.size(); // returns 1024 by default
* fingerprint.length(); // returns the highest set bit
*
*
* The FingerPrinter assumes that hydrogens are explicitly given! Furthermore,
* if pseudo atoms or atoms with malformed symbols are present, their atomic
* number is taken as one more than the last element currently supported in
* {@link org.openscience.cdk.tools.periodictable.PeriodicTable}.
*
* Warning: The aromaticity detection for this
* FingerPrinter relies on AllRingsFinder, which is known to take very long
* for some molecules with many cycles or special cyclic topologies. Thus,
* the AllRingsFinder has a built-in timeout of 5 seconds after which it
* aborts and throws an Exception. If you want your fingerprint generated at any
* expense, you need to create your own AllRingsFinder, set the timeout to a
* higher value, and assign it to this FingerPrinter. In the vast majority of
* cases, however, the defaults will be fine.
*
* Another Warning : The daylight manual says:
* "Fingerprints are not so definite: if a fingerprint indicates a pattern is
* missing then it certainly is, but it can only indicate a pattern's presence
* with some probability." In the case of very small molecules, the
* probability that you get the same fingerprint for different molecules is
* high.
*
*
* @author steinbeck
* @cdk.created 2002-02-24
* @cdk.keyword fingerprint
* @cdk.keyword similarity
* @cdk.module standard
* @cdk.githash
*/
public class Fingerprinter extends AbstractFingerprinter implements IFingerprinter {
/** Throw an exception if too many paths (per atom) are generated. */
private static final int DEFAULT_PATH_LIMIT = 42000;
/** The default length of created fingerprints. */
public static final int DEFAULT_SIZE = 1024;
/** The default search depth used to create the fingerprints. */
public static final int DEFAULT_SEARCH_DEPTH = 7;
/** Number of bits in generated fingerprints; fixed at construction. */
private final int size;
/** Maximum number of bonds in an encoded path; fixed at construction. */
private final int searchDepth;
/** Upper bound on the number of paths generated from one start atom. */
private int pathLimit = DEFAULT_PATH_LIMIT;
/** When {@code false}, paths running through pseudo atoms are not hashed. */
private boolean hashPseudoAtoms = false;
/** Not read or written anywhere in this class; kept because it is package-visible. */
static int debugCounter = 0;
/** Logger shared by all instances; never reassigned, hence final. */
private static final ILoggingTool logger = LoggingToolFactory
        .createLoggingTool(Fingerprinter.class);
/**
 * Creates a fingerprint generator that produces fingerprints of
 * {@link #DEFAULT_SIZE} bits using a search depth of
 * {@link #DEFAULT_SEARCH_DEPTH}.
 */
public Fingerprinter() {
    // The single-argument constructor supplies DEFAULT_SEARCH_DEPTH.
    this(DEFAULT_SIZE);
}
/**
* Creates a fingerprint generator that produces fingerprints of the
* given size using a search depth of DEFAULT_SEARCH_DEPTH.
*
* @param size The desired size of the fingerprint
*/
public Fingerprinter(int size) {
this(size, DEFAULT_SEARCH_DEPTH);
}
/**
 * Creates a fingerprint generator with an explicit fingerprint size and
 * search depth. Both values are stored as given; no validation is
 * performed here.
 *
 * @param size        number of bits in the generated fingerprints
 * @param searchDepth maximum path length, counted in bonds
 */
public Fingerprinter(int size, int searchDepth) {
    this.searchDepth = searchDepth;
    this.size = size;
}
@Override
protected List> getParameters() {
return Arrays.>asList(
new SimpleImmutableEntry<>("searchDepth", Integer.toString(searchDepth)),
new SimpleImmutableEntry<>("pathLimit", Integer.toString(pathLimit)),
new SimpleImmutableEntry<>("hashPseudoAtoms", Boolean.toString(hashPseudoAtoms))
);
}
/**
 * Generates a fingerprint of this instance's configured size for the
 * given AtomContainer.
 *
 * Unless the container holds a pseudo atom, atom types are perceived and
 * legacy CDK aromaticity is applied to the container first; the container
 * is therefore modified. Paths are then encoded into a fresh bit set.
 *
 * @param container  The AtomContainer for which a fingerprint is generated
 * @param ringFinder An instance of
 *                   {@link org.openscience.cdk.ringsearch.AllRingsFinder};
 *                   not used by this method body, may be {@code null}
 * @return the fingerprint wrapped in a {@link BitSetFingerprint}
 * @exception CDKException if atom typing, aromaticity perception or path
 *                         encoding fails
 */
public IBitFingerprint getBitFingerprint(IAtomContainer container, AllRingsFinder ringFinder) throws CDKException {
    logger.debug("Entering Fingerprinter");
    logger.debug("Starting Aromaticity Detection");
    final long start = System.currentTimeMillis();
    final boolean containsPseudoAtom = hasPseudoAtom(container.atoms());
    if (!containsPseudoAtom) {
        AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(container);
        Aromaticity.cdkLegacy().apply(container);
    }
    final long elapsed = System.currentTimeMillis() - start;
    logger.debug("time for aromaticity calculation: " + elapsed + " milliseconds");
    logger.debug("Finished Aromaticity Detection");

    final BitSet bits = new BitSet(size);
    encodePaths(container, searchDepth, bits, size);
    return new BitSetFingerprint(bits);
}
/**
 * Generates a fingerprint of this instance's configured size for the
 * given AtomContainer by delegating to the two-argument overload without
 * a ring finder.
 *
 * @param container The AtomContainer for which a fingerprint is generated
 * @return the generated fingerprint
 * @throws CDKException if the two-argument overload fails
 */
@Override
public IBitFingerprint getBitFingerprint(IAtomContainer container) throws CDKException {
    final AllRingsFinder noRingFinder = null;
    return getBitFingerprint(container, noRingFinder);
}
/**
 * {@inheritDoc}
 *
 * Not supported by this fingerprinter; always throws.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public Map<String, Integer> getRawFingerprint(IAtomContainer iAtomContainer) throws CDKException {
    throw new UnsupportedOperationException();
}
/**
 * Finds the first bond in {@code bonds} that contains both atoms.
 *
 * @param bonds candidate bonds to search
 * @param beg   one end atom
 * @param end   the other end atom
 * @return the first matching bond, or {@code null} if none contains both atoms
 */
private IBond findBond(List<IBond> bonds, IAtom beg, IAtom end) {
    for (IBond bond : bonds) {
        if (bond.contains(beg) && bond.contains(end)) {
            return bond;
        }
    }
    return null;
}
private String encodePath(IAtomContainer mol, Map> cache, List path, StringBuilder buffer) {
buffer.setLength(0);
IAtom prev = path.get(0);
buffer.append(getAtomSymbol(prev));
for (int i = 1; i < path.size(); i++) {
final IAtom next = path.get(i);
List bonds = cache.get(prev);
if (bonds == null) {
bonds = mol.getConnectedBondsList(prev);
cache.put(prev, bonds);
}
IBond bond = findBond(bonds, next, prev);
if (bond == null)
throw new IllegalStateException("FATAL - Atoms in patch were connected?");
buffer.append(getBondSymbol(bond));
buffer.append(getAtomSymbol(next));
prev = next;
}
return buffer.toString();
}
/**
 * Encodes a path given as parallel atom and bond lists into a string of
 * alternating atom and bond symbols.
 *
 * @param apath  atoms in traversal order; must contain at least one atom
 * @param bpath  bonds in traversal order; {@code bpath.get(i - 1)} is the
 *               bond leading to {@code apath.get(i)}
 * @param buffer scratch buffer; cleared on entry and reused for the result
 * @return the encoded path
 */
private String encodePath(List<IAtom> apath, List<IBond> bpath, StringBuilder buffer) {
    buffer.setLength(0);
    buffer.append(getAtomSymbol(apath.get(0)));
    for (int i = 1; i < apath.size(); i++) {
        buffer.append(getBondSymbol(bpath.get(i - 1)));
        buffer.append(getAtomSymbol(apath.get(i)));
    }
    return buffer.toString();
}
/**
 * Folds every character of {@code str} into a running hash using the
 * recurrence {@code hash = 31 * hash + c}.
 *
 * Each character is read at its own index. Reading only the first
 * character on every iteration would make same-length symbols that share
 * a first letter (for example "Pt" and "Pd") hash identically. Note that
 * this determines which bits are set for such symbols.
 *
 * @param hash the hash accumulated so far
 * @param str  the symbol to append; an empty string leaves the hash unchanged
 * @return the updated hash
 */
private int appendHash(int hash, String str) {
    final int len = str.length();
    for (int i = 0; i < len; i++) {
        hash = 31 * hash + str.charAt(i);
    }
    return hash;
}
/**
 * Hashes a path in forward direction by folding the atom and bond symbols
 * into a running hash, without building the path string.
 *
 * @param apath atoms in traversal order; must contain at least one atom
 * @param bpath bonds in traversal order; {@code bpath.get(i - 1)} is the
 *              bond leading to {@code apath.get(i)}
 * @return hash of the forward traversal
 */
private int hashPath(List<IAtom> apath, List<IBond> bpath) {
    int hash = 0;
    hash = appendHash(hash, getAtomSymbol(apath.get(0)));
    for (int i = 1; i < apath.size(); i++) {
        final IAtom next = apath.get(i);
        final IBond bond = bpath.get(i - 1);
        hash = appendHash(hash, getBondSymbol(bond));
        hash = appendHash(hash, getAtomSymbol(next));
    }
    return hash;
}
/**
 * Hashes a path in reverse direction (last atom first) by folding the
 * atom and bond symbols into a running hash.
 *
 * @param apath atoms in forward traversal order; must contain at least one atom
 * @param bpath bonds in forward traversal order; {@code bpath.get(i)} joins
 *              {@code apath.get(i)} and {@code apath.get(i + 1)}
 * @return hash of the reverse traversal
 */
private int hashRevPath(List<IAtom> apath, List<IBond> bpath) {
    int hash = 0;
    final int last = apath.size() - 1;
    hash = appendHash(hash, getAtomSymbol(apath.get(last)));
    for (int i = last - 1; i >= 0; i--) {
        final IAtom next = apath.get(i);
        // Walking backwards, the bond into atom i is the one at index i.
        final IBond bond = bpath.get(i);
        hash = appendHash(hash, getBondSymbol(bond));
        hash = appendHash(hash, getAtomSymbol(next));
    }
    return hash;
}
private static final class State {
private int numPaths = 0;
private Random rand = new Random();
private BitSet fp;
private IAtomContainer mol;
private Set visited = new HashSet<>();
private List apath = new ArrayList<>();
private List bpath = new ArrayList<>();
private final int maxDepth;
private final int fpsize;
private Map> cache = new IdentityHashMap<>();
public StringBuilder buffer = new StringBuilder();
public State(IAtomContainer mol, BitSet fp, int fpsize, int maxDepth) {
this.mol = mol;
this.fp = fp;
this.fpsize = fpsize;
this.maxDepth = maxDepth;
}
List getBonds(IAtom atom) {
List bonds = cache.get(atom);
if (bonds == null) {
bonds = mol.getConnectedBondsList(atom);
cache.put(atom, bonds);
}
return bonds;
}
boolean visit(IAtom a) {
return visited.add(a);
}
boolean unvisit(IAtom a) {
return visited.remove(a);
}
void push(IAtom atom, IBond bond) {
apath.add(atom);
if (bond != null)
bpath.add(bond);
}
void pop() {
if (!apath.isEmpty())
apath.remove(apath.size()-1);
if (!bpath.isEmpty())
bpath.remove(bpath.size()-1);
}
void addHash(int x) {
numPaths++;
rand.setSeed(x);
// XXX: fp.set(x % size); would work just as well but would encode a
// different bit
fp.set(rand.nextInt(fpsize));
}
}
/**
 * Recursively extends the current path by {@code beg}, records the hash
 * of the extended path, and continues through every bond of {@code beg}
 * other than the one just used, to atoms not already marked visited.
 *
 * @param state traversal bookkeeping (path, visited set, fingerprint bits)
 * @param beg   atom to append to the path
 * @param prev  bond used to reach {@code beg}, or {@code null} for the start atom
 * @throws CDKException if more than {@code pathLimit} paths have been counted
 */
private void traversePaths(State state, IAtom beg, IBond prev) throws CDKException {
    final boolean skipAtom = !hashPseudoAtoms && isPseudo(beg);
    if (skipAtom) {
        return;
    }

    state.push(beg, prev);
    final int pathHash = encodeUniquePath(state.apath, state.bpath, state.buffer);
    state.addHash(pathHash);
    if (state.numPaths > pathLimit) {
        throw new CDKException("Too many paths! Structure is likely a cage, reduce path length or increase path limit");
    }

    final boolean canExtend = state.apath.size() < state.maxDepth;
    if (canExtend) {
        for (IBond bond : state.getBonds(beg)) {
            if (!bond.equals(prev)) {
                final IAtom neighbour = bond.getOther(beg);
                if (state.visit(neighbour)) {
                    traversePaths(state, neighbour, bond);
                    // Release the atom again so other paths may pass through it.
                    state.unvisit(neighbour);
                }
            }
        }
    }
    state.pop();
}
/**
 * Get the hashes of all paths of lengths 0 to the specified length.
 *
 * Paths are enumerated from each atom of the molecule with
 * {@link PathTools#getLimitedPathsOfLengthUpto}; every path is encoded to
 * a hash and duplicates are dropped. Paths containing a pseudo atom are
 * skipped unless pseudo-atom hashing is enabled.
 *
 * @param container   The molecule to search
 * @param searchDepth The maximum path length desired
 * @return the distinct path hashes, in no particular order
 * @throws CDKException if path enumeration fails
 * @deprecated Use {@link #encodePaths(IAtomContainer, int, BitSet, int)}
 */
@Deprecated
protected int[] findPathes(IAtomContainer container, int searchDepth) throws CDKException {
    Set<Integer> hashes = new HashSet<>();
    Map<IAtom, List<IBond>> cache = new HashMap<>();
    StringBuilder buffer = new StringBuilder();
    for (IAtom startAtom : container.atoms()) {
        List<List<IAtom>> paths = PathTools.getLimitedPathsOfLengthUpto(container, startAtom, searchDepth, pathLimit);
        for (List<IAtom> path : paths) {
            if (hashPseudoAtoms || !hasPseudoAtom(path)) {
                hashes.add(encodeUniquePath(container, cache, path, buffer));
            }
        }
    }
    // Unbox the distinct hashes into a primitive array.
    int pos = 0;
    int[] result = new int[hashes.size()];
    for (int hash : hashes) {
        result[pos++] = hash;
    }
    return result;
}
/**
 * Enumerates paths starting from every atom of {@code mol} and sets the
 * corresponding bits in {@code fp}.
 *
 * The traversal limit is passed as {@code depth + 1} because the limit is
 * compared against the number of atoms on the path, while {@code depth}
 * is given here as one less than that. The per-atom path counter is reset
 * before each start atom.
 *
 * @param mol   molecule to encode
 * @param depth traversal depth; the traversal is limited to {@code depth + 1} atoms per path
 * @param fp    bit set receiving the fingerprint bits
 * @param size  exclusive upper bound for the bit indices that are set
 * @throws CDKException if the path limit is exceeded for a start atom
 */
protected void encodePaths(IAtomContainer mol, int depth, BitSet fp, int size) throws CDKException {
    final int maxAtomsPerPath = depth + 1;
    final State walker = new State(mol, fp, size, maxAtomsPerPath);
    for (IAtom start : mol.atoms()) {
        walker.numPaths = 0;
        walker.visit(start);
        traversePaths(walker, start, null);
        walker.unvisit(start);
    }
}
/**
 * Tells whether an atom is treated as a pseudo atom, i.e. its element
 * number resolves to 0.
 *
 * @param a atom to test
 * @return {@code true} if the atom's element number is 0
 */
private static boolean isPseudo(IAtom a) {
    final int atomicNumber = getElem(a);
    return atomicNumber == 0;
}
/**
 * Tells whether any atom in the given sequence is a pseudo atom.
 *
 * @param path atoms to inspect
 * @return {@code true} as soon as one pseudo atom is found, otherwise {@code false}
 */
private static boolean hasPseudoAtom(Iterable<IAtom> path) {
    for (IAtom atom : path) {
        if (isPseudo(atom)) {
            return true;
        }
    }
    return false;
}
private int encodeUniquePath(IAtomContainer container, Map> cache, List path, StringBuilder buffer) {
if (path.size() == 1)
return getAtomSymbol(path.get(0)).hashCode();
String forward = encodePath(container, cache, path, buffer);
Collections.reverse(path);
String reverse = encodePath(container, cache, path, buffer);
Collections.reverse(path);
final int x;
if (reverse.compareTo(forward) < 0)
x = forward.hashCode();
else
x = reverse.hashCode();
return x;
}
/**
 * Orders two atoms by their fingerprint symbols. Atoms with the same
 * element number are considered equal without comparing symbols.
 *
 * @param a atom a
 * @param b atom b
 * @return 0 if both have the same element number, otherwise the
 *         lexicographic comparison of their symbols (negative if a sorts
 *         first, positive if b sorts first)
 */
private int compare(IAtom a, IAtom b) {
    if (getElem(a) == getElem(b)) {
        return 0;
    }
    final String symbolA = getAtomSymbol(a);
    final String symbolB = getAtomSymbol(b);
    return symbolA.compareTo(symbolB);
}
/**
 * Orders two bonds lexicographically by their bond symbols.
 *
 * @param a bond a
 * @param b bond b
 * @return negative if a sorts first, positive if b sorts first, 0 if the
 *         symbols are equal
 */
private int compare(IBond a, IBond b) {
    final String symbolA = getBondSymbol(a);
    final String symbolB = getBondSymbol(b);
    return symbolA.compareTo(symbolB);
}
/**
 * Compares a path with its own reversal, symbol by symbol, to decide
 * which traversal direction (forwards or backwards) sorts lower.
 *
 * Atoms and bonds are compared pairwise from both ends of the path; the
 * first difference decides the result.
 *
 * @param apath path of atoms; must contain at least one atom
 * @param bpath path of bonds; {@code bpath.get(i)} joins
 *              {@code apath.get(i)} and {@code apath.get(i + 1)}
 * @return negative if forward is lower, positive if reverse is lower,
 *         0 if both directions read the same
 */
private int compare(List<IAtom> apath, List<IBond> bpath) {
    int i = 0;
    int j = apath.size() - 1;
    int cmp = compare(apath.get(i), apath.get(j));
    if (cmp != 0) {
        return cmp;
    }
    i++;
    j--;
    // j > 0 rather than j != 0: for a single-atom path j is already
    // negative here and the loop must not run.
    while (j > 0) {
        cmp = compare(bpath.get(i - 1), bpath.get(j));
        if (cmp != 0) {
            return cmp;
        }
        cmp = compare(apath.get(i), apath.get(j));
        if (cmp != 0) {
            return cmp;
        }
        i++;
        j--;
    }
    return 0;
}
/**
 * Computes a direction-independent hash for the current path.
 *
 * A path without bonds hashes its single atom symbol. Otherwise the path
 * is compared with its reversal: the forward hash is used when the
 * comparison is zero or positive, the reverse hash when it is negative.
 *
 * @param apath  atoms in traversal order; must contain at least one atom
 * @param bpath  bonds in traversal order
 * @param buffer not used by this method body
 * @return hash of the chosen traversal direction
 */
private int encodeUniquePath(List<IAtom> apath, List<IBond> bpath, StringBuilder buffer) {
    if (bpath.size() == 0) {
        return getAtomSymbol(apath.get(0)).hashCode();
    }
    final int x;
    if (compare(apath, bpath) >= 0) {
        x = hashPath(apath, bpath);
    } else {
        x = hashRevPath(apath, bpath);
    }
    return x;
}
/**
 * Returns the atomic number of an atom, substituting 0 when the atom has
 * none set.
 *
 * @param atom atom to query
 * @return the atomic number, or 0 if it is {@code null}
 */
private static int getElem(IAtom atom) {
    final Integer atomicNumber = atom.getAtomicNumber();
    return atomicNumber == null ? 0 : atomicNumber;
}
/**
 * Returns the symbol used for an atom when encoding paths.
 *
 * XXX: backwards compatibility. A fixed set of elements is mapped to
 * single-letter stand-ins; every other element uses the atom's own
 * symbol. The mapping is believed to exist because paths used to be
 * de-duplicated by reversing the encoded string, which breaks for
 * multi-letter symbols (e.g. Fe-C => C-eF vs C-Fe => eF-C). Replacing
 * "common" symbols with one letter makes that reversal less wrong.
 *
 * @param atom atom to encode
 * @return the single-letter stand-in for mapped elements, otherwise
 *         {@code atom.getSymbol()}
 */
private String getAtomSymbol(IAtom atom) {
    final int atomicNumber = getElem(atom);
    switch (atomicNumber) {
        case 0: // *
            return "*";
        case 3: // Li
            return "L";
        case 6: // C
            return "C";
        case 7: // N
            return "N";
        case 8: // O
            return "O";
        case 11: // Na
            return "G";
        case 13: // Al
            return "A";
        case 14: // Si
            return "Y";
        case 17: // Cl
            return "X";
        case 20: // Ca
            return "J";
        case 33: // As
            return "D";
        case 34: // Se
            return "E";
        case 35: // Br
            return "Z";
        default:
            return atom.getSymbol();
    }
}
/**
 * Returns the symbol used for a bond when encoding paths.
 *
 * Aromatic bonds yield ":" regardless of order. Otherwise single, double
 * and triple bonds yield "-", "=" and "#"; any other order, including an
 * order that is not set ({@code null}), yields the empty string.
 *
 * @param bond the bond to encode
 * @return the bond symbol, possibly empty
 */
protected String getBondSymbol(IBond bond) {
    if (bond.isAromatic()) {
        return ":";
    }
    // A switch on a null enum value throws NullPointerException; treat a
    // missing order like any other unrecognised order.
    if (bond.getOrder() == null) {
        return "";
    }
    switch (bond.getOrder()) {
        case SINGLE:
            return "-";
        case DOUBLE:
            return "=";
        case TRIPLE:
            return "#";
        default:
            return "";
    }
}
/**
 * Sets the maximum number of paths that may be generated per start atom
 * before path encoding fails.
 *
 * @param limit the new path limit
 */
public void setPathLimit(int limit) {
    pathLimit = limit;
}
/**
 * Chooses whether paths that run through pseudo atoms are hashed.
 *
 * @param value {@code true} to hash such paths, {@code false} to skip them
 */
public void setHashPseudoAtoms(boolean value) {
    hashPseudoAtoms = value;
}
/**
 * Returns the search depth this fingerprinter was constructed with.
 *
 * @return the search depth
 */
public int getSearchDepth() {
    return this.searchDepth;
}
/**
 * Returns the fingerprint size this fingerprinter was constructed with.
 *
 * @return the number of bits in generated fingerprints
 */
@Override
public int getSize() {
    return this.size;
}
/**
* Count fingerprints are not supported by this fingerprinter; this method
* always throws.
*
* @param container ignored
* @return never returns normally
* @throws UnsupportedOperationException always
*/
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
throw new UnsupportedOperationException();
}
}