All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openscience.cdk.hash.BasicAtomHashGenerator Maven / Gradle / Ivy

/*
 * Copyright (c) 2013 John May 
 *
 * Contact: [email protected]
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may distribute
 * with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

package org.openscience.cdk.hash;

import org.openscience.cdk.hash.stereo.StereoEncoder;
import org.openscience.cdk.hash.stereo.StereoEncoderFactory;
import org.openscience.cdk.interfaces.IAtomContainer;

/**
 * A generator for basic atom hash codes. This implementation is based on the
 * description by {@cdk.cite Ihlenfeldt93}. The hash codes use an initial
 * generator to seed the values of each atom. The initial values are then
 * combined over a series of cycles up to a specified depth. At each cycle the
 * hash values of adjacent invariants are incorporated.
 *
 * 

Which depth should I use?

The depth determines the number of * cycles and thus how deep the hashing is, larger values discriminate * more molecules but can take longer to compute. The original publication * recommends a depth of 32 however values as low as 6 can yield good results. * The actual depth required is related to the diameter of the chemical * graph. The diameter is the longest shortest path, that is, the * furthest distance one must travel between any two vertex. Unfortunately the * time complexity of finding the longest shortest path in an undirected graph * is O(n2) which is larger then the time required for this hash * function. Depending on the types of molecules in your data set the depth * should be adjusted accordingly. For example, a library of large-lipids would * require deeper hashing to discriminate differences in chain length. * *

Usage

*
 * SeedGenerator     seeding   = ...
 * AtomHashGenerator generator = new BasicAtomHashGenerator(seeding,
 *                                                          new Xorshift(),
 *                                                          32);
 *
 * IAtomContainer benzene = MoleculeFactory.benzene();
 * long[]         hashes  = generator.generate(benzene);
 * 
* * @author John May * @cdk.module hash * @see SeedGenerator * @see Graph * Diameter * @see Original * Publication * @cdk.githash */ final class BasicAtomHashGenerator extends AbstractAtomHashGenerator implements AtomHashGenerator { /* a generator for the initial atom seeds */ private final AtomHashGenerator seedGenerator; /* creates stereo encoders for IAtomContainers */ private final StereoEncoderFactory factory; /* number of cycles to include adjacent invariants */ private final int depth; /** * Create a basic hash generator using the provided seed generator to * initialise atom invariants and using the provided stereo factory. * * @param seedGenerator generator to seed the initial values of atoms * @param pseudorandom pseudorandom number generator used to randomise hash * distribution * @param factory a stereo encoder factory * @param depth depth of the hashing function, larger values take * longer * @throws IllegalArgumentException depth was less then 0 * @throws NullPointerException seed generator or pseudo random was * null * @see SeedGenerator */ public BasicAtomHashGenerator(AtomHashGenerator seedGenerator, Pseudorandom pseudorandom, StereoEncoderFactory factory, int depth) { super(pseudorandom); if (seedGenerator == null) throw new NullPointerException("seed generator cannot be null"); if (depth < 0) throw new IllegalArgumentException("depth cannot be less then 0"); this.seedGenerator = seedGenerator; this.factory = factory; this.depth = depth; } /** * Create a basic hash generator using the provided seed generator to * initialise atom invariants and no stereo configuration. 
* * @param seedGenerator generator to seed the initial values of atoms * @param pseudorandom pseudorandom number generator used to randomise hash * distribution * @param depth depth of the hashing function, larger values take * longer * @throws IllegalArgumentException depth was less then 0 * @throws NullPointerException seed generator or pseudo random was * null * @see SeedGenerator */ public BasicAtomHashGenerator(AtomHashGenerator seedGenerator, Pseudorandom pseudorandom, int depth) { this(seedGenerator, pseudorandom, StereoEncoderFactory.EMPTY, depth); } /** *{@inheritDoc} */ @Override public long[] generate(IAtomContainer container) { int[][] graph = toAdjList(container); return generate(seedGenerator.generate(container), factory.create(container, graph), graph, Suppressed.none()); } /** * Package-private method for generating the hash for the given molecule. * The initial invariants are passed as to the method along with an * adjacency list representation of the graph. * * @param current initial invariants * @param graph adjacency list representation * @return hash codes for atoms */ @Override long[] generate(long[] current, StereoEncoder encoder, int[][] graph, Suppressed suppressed) { int n = graph.length; long[] next = copy(current); // buffers for including adjacent invariants long[] unique = new long[n]; long[] included = new long[n]; while (encoder.encode(current, next)) { copy(next, current); } for (int d = 0; d < depth; d++) { for (int v = 0; v < n; v++) { next[v] = next(graph, v, current, unique, included); } copy(next, current); while (encoder.encode(current, next)) { copy(next, current); } } return current; } /** * Determine the next value of the atom at index v. The value is * calculated by combining the current values of adjacent atoms. When a * duplicate value is found it can not be directly included and is * rotated the number of times it has previously been seen. 
* * @param graph adjacency list representation of connected atoms * @param v the atom to calculate the next value for * @param current the current values * @param unique buffer for working out which adjacent values are unique * @param included buffer for storing the rotated unique value, this * value is rotated each time the same value is * found. * @return the next value for v */ long next(int[][] graph, int v, long[] current, long[] unique, long[] included) { long invariant = distribute(current[v]); int nUnique = 0; for (int w : graph[v]) { long adjInv = current[w]; // find index of already included neighbor int i = 0; while (i < nUnique && unique[i] != adjInv) { ++i; } // no match, then the value is unique, use adjInv // match, then rotate the previously included value included[i] = (i == nUnique) ? unique[nUnique++] = adjInv : rotate(included[i]); invariant ^= included[i]; } return invariant; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy