org.openscience.cdk.graph.invariant.Canon Maven / Gradle / Ivy
/*
* Copyright (c) 2013 European Bioinformatics Institute (EMBL-EBI)
* John May
*
* Contact: [email protected]
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version. All we ask is that proper credit is given
* for our work, which includes - but is not limited to - adding the above
* copyright notice to the beginning of your source code files, and to any
* copyright notice that you may distribute with programs based on this work.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
*/
package org.openscience.cdk.graph.invariant;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
/**
* An implementation based on the canon algorithm {@cdk.cite WEI89}. The
* algorithm uses an initial set of of invariants which are assigned a rank.
* Equivalent ranks are then shattered using an unambiguous function (in this
* case, the product of primes of adjacent ranks). Once no more equivalent ranks
* can be shattered ties are artificially broken and rank shattering continues.
* Unlike the original description rank stability is not maintained reducing
* the number of values to rank at each stage to only those which are equivalent.
*
*
* The initial set of invariants is basic and are -
* "sufficient for the purpose of obtaining unique notation for simple SMILES,
* but it is not necessarily a “complete” set. No “perfect” set of invariants
* is known that will distinguish all possible graph asymmetries. However,
* for any given set of structures, a set of invariants can be devised to
* provide the necessary discrimination" {@cdk.cite WEI89}. As such this
* producer should not be considered a complete canonical labelled but in
* practice performs well. For a more accurate and computationally expensive
* labelling, please using the {@link InChINumbersTools}.
*
*
* IAtomContainer m = ...;
* int[][] g = GraphUtil.toAdjList(m);
*
* // obtain canon labelling
* long[] labels = Canon.label(m, g);
*
* // obtain symmetry classes
* long[] labels = Canon.symmetry(m, g);
*
*
* @author John May
* @cdk.module standard
* @cdk.githash
*/
public final class Canon {
private static final int N_PRIMES = 10000;
/**
* Graph, adjacency list representation.
*/
private final int[][] g;
/**
* Storage of canon labelling and symmetry classes.
*/
private final long[] labelling, symmetry;
/** Only compute the symmetry classes. */
private boolean symOnly = false;
/**
* Create a canon labelling for the graph (g) with the specified
* invariants.
*
* @param g a graph (adjacency list representation)
* @param hydrogens binary vector of terminal hydrogens
* @param partition an initial partition of the vertices
*/
private Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly) {
this.g = g;
this.symOnly = symOnly;
labelling = partition.clone();
symmetry = refine(labelling, hydrogens);
}
/**
* Compute the canonical labels for the provided structure. The labelling
* does not consider isomer information or stereochemistry. The current
* implementation does not fully distinguish all structure topologies
* but in practise performs well in the majority of cases. A complete
* canonical labelling can be obtained using the {@link InChINumbersTools}
* but is computationally much more expensive.
*
* @param container structure
* @param g adjacency list graph representation
* @return the canonical labelling
* @see EquivalentClassPartitioner
* @see InChINumbersTools
*/
public static long[] label(IAtomContainer container, int[][] g) {
return label(container, g, basicInvariants(container, g));
}
/**
* Compute the canonical labels for the provided structure. The labelling
* does not consider isomer information or stereochemistry. This method
* allows provision of a custom array of initial invariants.
*
*
* The current
* implementation does not fully distinguish all structure topologies
* but in practise performs well in the majority of cases. A complete
* canonical labelling can be obtained using the {@link InChINumbersTools}
* but is computationally much more expensive.
*
* @param container structure
* @param g adjacency list graph representation
* @param initial initial seed invariants
* @return the canonical labelling
* @see EquivalentClassPartitioner
* @see InChINumbersTools
*/
public static long[] label(IAtomContainer container, int[][] g, long[] initial) {
if (initial.length != g.length)
throw new IllegalArgumentException("number of initial != number of atoms");
return new Canon(g, initial, terminalHydrogens(container, g), false).labelling;
}
/**
* Compute the canonical labels for the provided structure. The initial
* labelling is seed-ed with the provided atom comparator cmp
* allowing arbitary properties to be distinguished or ignored.
*
* @param container structure
* @param g adjacency list graph representation
* @param cmp comparator to compare atoms
* @return the canonical labelling
*/
public static long[] label(IAtomContainer container,
int[][] g,
Comparator cmp) {
if (g.length == 0)
return new long[0];
IAtom[] atoms = AtomContainerManipulator.getAtomArray(container);
Arrays.sort(atoms, cmp);
long[] initial = new long[atoms.length];
long part = 1;
initial[container.indexOf(atoms[0])] = part;
for (int i=1; i m && n < ord) {
nnu = 0;
for (int i = 0; i < ord && nextVs[i] >= 0; i++) {
int v = nextVs[i];
currVs[nnu++] = v;
curr[v] = hydrogens[v] ? prev[v] : primeProduct(g[v], prev, hydrogens);
}
m = n;
}
if (symmetry == null) {
// After symmetry classes have been found without hydrogens we add
// back in the hydrogens and assign ranks. We don't refine the
// partition until the next time round the while loop to avoid
// artificially splitting due to hydrogen representation, for example
// the two hydrogens are equivalent in this SMILES for ethane '[H]CC'
for (int i = 0; i < g.length; i++) {
if (hydrogens[i]) {
curr[i] = prev[g[i][0]];
hydrogens[i] = false;
}
}
n = ranker.rank(currVs, nextVs, nnu, curr, prev);
symmetry = Arrays.copyOf(prev, ord);
// Update the buffer of non-unique vertices as hydrogens next
// to discrete heavy atoms are also discrete (and removed from
// 'nextVs' during ranking.
nnu = 0;
for (int i = 0; i < ord && nextVs[i] >= 0; i++) {
currVs[nnu++] = nextVs[i];
}
}
// partition is discrete or only symmetry classes are needed
if (symOnly || n == ord) return symmetry;
// artificially split the lowest cell, we perturb the value
// of all vertices with equivalent rank to the lowest non-unique
// vertex
int lo = nextVs[0];
for (int i = 1; i < ord && nextVs[i] >= 0 && prev[nextVs[i]] == prev[lo]; i++)
prev[nextVs[i]]++;
// could also swap but this is cleaner
System.arraycopy(nextVs, 0, currVs, 0, nnu);
}
return symmetry;
}
/**
* Compute the prime product of the values (ranks) for the given
* adjacent neighbors (ws).
*
* @param ws indices (adjacent neighbors)
* @param ranks invariant ranks
* @return the prime product
*/
private long primeProduct(int[] ws, long[] ranks, boolean[] hydrogens) {
long prod = 1;
for (int w : ws) {
if (!hydrogens[w]) {
prod *= PRIMES[(int) ranks[w]];
}
}
return prod;
}
/**
* Generate the initial invariants for each atom in the {@code container}.
* The labels use the invariants described in {@cdk.cite WEI89}.
*
* The bits in the low 32-bits are: {@code 0000000000xxxxXXXXeeeeeeescchhhh}
* where:
*
* - 0: padding
* - x: number of connections
* - X: number of non-hydrogens bonds
* - e: atomic number
* - s: sign of charge
* - c: absolute charge
* - h: number of attached hydrogens
*
*
* Important: These invariants are basic and there are known
* examples they don't distinguish. One trivial example to consider is
* {@code [O]C=O} where both oxygens have no hydrogens and a single
* connection but the atoms are not equivalent. Including a better
* initial partition is more expensive
*
* @param container an atom container to generate labels for
* @param graph graph representation (adjacency list)
* @return initial invariants
* @throws NullPointerException an atom had unset atomic number, hydrogen
* count or formal charge
*/
public static long[] basicInvariants(IAtomContainer container, int[][] graph) {
long[] labels = new long[graph.length];
for (int v = 0; v < graph.length; v++) {
IAtom atom = container.getAtom(v);
int deg = graph[v].length;
int impH = implH(atom);
int expH = 0;
int elem = atomicNumber(atom);
int chg = charge(atom);
// count non-suppressed (explicit) hydrogens
for (int w : graph[v])
if (atomicNumber(container.getAtom(w)) == 1) expH++;
long label = 0; // connectivity (first in)
label |= deg + impH & 0xf;
label <<= 4; // connectivity (heavy) <= 15 (4 bits)
label |= deg - expH & 0xf;
label <<= 7; // atomic number <= 127 (7 bits)
label |= elem & 0x7f;
label <<= 1; // charge sign == 1 (1 bit)
label |= chg >> 31 & 0x1;
label <<= 2; // charge <= 3 (2 bits)
label |= Math.abs(chg) & 0x3;
label <<= 4; // hydrogen count <= 15 (4 bits)
label |= impH + expH & 0xf;
labels[v] = label;
}
return labels;
}
/**
* Access atomic number of atom defaulting to 0 for pseudo atoms.
*
* @param atom an atom
* @return the atomic number
* @throws NullPointerException the atom was non-pseudo at did not have an
* atomic number
*/
private static int atomicNumber(IAtom atom) {
Integer elem = atom.getAtomicNumber();
if (elem != null) return elem;
if (atom instanceof IPseudoAtom) return 0;
throw new NullPointerException("a non-pseudoatom had unset atomic number");
}
/**
* Access implicit hydrogen count of the atom defaulting to 0 for pseudo
* atoms.
*
* @param atom an atom
* @return the implicit hydrogen count
* @throws NullPointerException the atom was non-pseudo at did not have an
* implicit hydrogen count
*/
private static int implH(IAtom atom) {
Integer h = atom.getImplicitHydrogenCount();
if (h != null) return h;
if (atom instanceof IPseudoAtom) return 0;
throw new NullPointerException("a non-pseudoatom had unset hydrogen count");
}
/**
* Access formal charge of an atom defaulting to 0 if undefined.
*
* @param atom an atom
* @return the formal charge
*/
private static int charge(IAtom atom) {
Integer charge = atom.getFormalCharge();
if (charge != null) return charge;
return 0;
}
/**
* Locate explicit hydrogens that are attached to exactly one other atom.
*
* @param ac a structure
* @return binary set of terminal hydrogens
*/
static boolean[] terminalHydrogens(final IAtomContainer ac, final int[][] g) {
final boolean[] hydrogens = new boolean[ac.getAtomCount()];
// we specifically don't check for null atomic number, this must be set.
// if not, something major is wrong
for (int i = 0; i < ac.getAtomCount(); i++) {
IAtom atom = ac.getAtom(i);
hydrogens[i] = atom.getAtomicNumber() == 1 &&
atom.getMassNumber() == null &&
g[i].length == 1;
}
return hydrogens;
}
/**
* The first 10,000 primes.
*/
private static final int[] PRIMES = loadPrimes();
private static int[] loadPrimes() {
try (BufferedReader br = new BufferedReader(new InputStreamReader(Canon.class.getResourceAsStream("primes.dat")))) {
int[] primes = new int[N_PRIMES];
int i = 0;
String line = null;
while ((line = br.readLine()) != null) {
primes[i++] = Integer.parseInt(line);
}
assert i == N_PRIMES;
return primes;
} catch (NumberFormatException | IOException e) {
System.err.println("Critical - could not load primes table for canonical labelling!");
return new int[0];
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy