// Source listing of com.actelion.research.chem.mmp.MMP from the openchemlib artifact
// ("Open Source Chemistry Library"), as published on a Maven / Gradle / Ivy repository page.
// Note from the hosting page: there is a newer version of this artifact: 2024.11.2.
/*
 * Copyright (c) 2017
 * Actelion Pharmaceuticals Ltd.
 * Gewerbestrasse 16
 * CH-4123 Allschwil, Switzerland
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the copyright holder nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Gregori Gerebtzoff
 */

package com.actelion.research.chem.mmp;

import com.actelion.research.chem.IDCodeParser;
import com.actelion.research.chem.StereoMolecule;
import com.actelion.research.chem.io.CompoundFileParser;
import com.actelion.research.chem.mmp.MMPFragmenter.MoleculeIndexID;

import java.io.IOException;
import java.io.PrintWriter;
import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.*;

public class MMP {
	private static String VERSION = "1.1";									  // Version 1.1 has a modified Enumerator method (slightly faster reading)
	private static final String r1H = MMPFragmenter.createR1HMoleculeID();    // R1-H molecule IDCode (for Hydrogen replacements)
	private static final boolean TRANSFORM_UM_TO_PIC50 = true;                // transforms uM to pIC50 if a field name finishes with "_uM"
	private int maxValueAtoms = 0;                                            // size of the biggest fragment (value), for enumeration
	
	private MMPairs matchedMolecularPairs;                                    // container for all Matched Molecular Pairs (result of enumeration)
	private HashMap>> mMPIndex;     // valueAtoms -- keys - list of values ({valueIndex, moleculeIndex})
	private HashMap> wholeMoleculesIndex;    // IDCode - molIndex, molName, molData
	private List> moleculesFragmentsID;                        // container for "clean" fragments (without R-groups) and MolName
	private MMPUniqueFragments mmpUniqueFragments;                            // Set of unique fragmentsID
	private MMPFragments mmpFragments;                                        // Container for molecule fragments (keys, value, molecule index)
	private int r1HIndex;                                                     // idCode of R-[H] molecule
	private String[] fieldNames;                                              // list of field names
	private boolean[] fieldNumerics;                                          // list of booleans (true if the field contains only numerical data)
	private float[] fieldPercentiles5;                                        // list of 5th percentiles
	private float[] fieldPercentiles95;                                       // list of 95th percentile
	private int moleculesRowCount;
	private String datasetName;
	
	/**
	 * Plain data holder describing one input row: the index of the (unique) molecule it
	 * belongs to, its name, its data fields and, optionally, its IDCode and coordinates.
	 */
	public static class MoleculeIndex {
		int moleculeIndex;
		String moleculeName;
		String[] moleculeData;
		String moleculeIDCode;
		String moleculeIDCoord;

		/**
		 * Creates a fully populated entry.
		 * @param moleculeIndex Index of the molecule
		 * @param moleculeIDCoord Encoded coordinates of the molecule (may be null)
		 * @param moleculeIDCode IDCode of the molecule (may be null)
		 * @param moleculeName Name of the molecule
		 * @param moleculeData Data fields of the molecule
		 */
		public MoleculeIndex(int moleculeIndex, String moleculeIDCoord, String moleculeIDCode, String moleculeName, String[] moleculeData) {
			this.moleculeIndex = moleculeIndex;
			this.moleculeName = moleculeName;
			this.moleculeData = moleculeData;
			this.moleculeIDCode = moleculeIDCode;
			this.moleculeIDCoord = moleculeIDCoord;
		}

		/**
		 * Creates an entry without IDCode and coordinates (both are left null).
		 * @param moleculeIndex Index of the molecule
		 * @param moleculeName Name of the molecule
		 * @param moleculeData Data fields of the molecule
		 */
		public MoleculeIndex(int moleculeIndex, String moleculeName, String[] moleculeData) {
			this(moleculeIndex, null, null, moleculeName, moleculeData);
		}

		/**
		 * @param moleculeIDCode New IDCode for this entry
		 */
		public void setIDCode(String moleculeIDCode) {
			this.moleculeIDCode = moleculeIDCode;
		}

		/**
		 * @param moleculeIDCoord New encoded coordinates for this entry
		 */
		public void setIDCoord(String moleculeIDCoord) {
			this.moleculeIDCoord = moleculeIDCoord;
		}
	}

	/**
	 * Formats the current date and time for progress messages.
	 * @return the current time formatted with the pattern "d-MMM-yyyy HH:mm:ss"
	 */
	private static String getDateAndTime() {
		SimpleDateFormat timestampFormat = new SimpleDateFormat("d-MMM-yyyy HH:mm:ss");
		Date now = new Date();
		return timestampFormat.format(now);
	}

	/**
	 * Tells whether a string can be parsed as a floating point number.
	 * A null input is reported as numeric, so that missing values do not
	 * disqualify a field.
	 * @param str String to be analyzed (may be null)
	 * @return true if str is null or accepted by Double.parseDouble, false otherwise
	 */
	private static boolean isNumeric(String str) {
		if (str != null) {
			try {
				Double.parseDouble(str);
			}
			catch (NumberFormatException ignored) {
				// not parseable as a double
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Gets the mapped values corresponding to a defined size of the 'value' (variable part of the molecule)
	 * @param valueAtoms Number of heavy atoms of the 'value' (variable part of the molecule)
	 * @return Hashmap of keys - list of values ({valueIndex, moleculeIndex})
	 */
	private HashMap> getIndex(int valueAtoms) {
		if (mMPIndex.containsKey(valueAtoms)) {
			return mMPIndex.get(valueAtoms);
		}
		return null;
	}
	
	/**
	 * Add new value to mMPIndex
	 * @param valueAtoms Number of heavy atoms of the 'value' (variable part of the molecule)
	 * @param keys Array of one (single cut) or two (double cut) strings for the 'key' (constant part(s) of the molecule) 
	 * @param values Array of two integers (valueIndex, moleculeIndex)
	 * @param isH true if the 'value' is a Hydrogen atom
	 * @return true/false if the new values were added
	 */
	private boolean addValues(int valueAtoms, String keys, int[] values, boolean isH) {
		boolean added = true;
		HashMap> keysHash = null;
		ArrayList valuesList = null;
		if (mMPIndex.containsKey(valueAtoms)) {
			keysHash = mMPIndex.get(valueAtoms);
		}
		else {
			keysHash = new HashMap>();
		}
		if (keysHash.containsKey(keys)) {
			valuesList = keysHash.get(keys);
			if (valuesList == null)
				valuesList = new ArrayList();
			if (isH == true && !valuesList.isEmpty()) {
				int[] lastItem = valuesList.get(valuesList.size() - 1);
				if (lastItem[0] != r1HIndex)
					valuesList.add(values);
				else
					added = false;
			}
			else {
				valuesList.add(values);
			}
		}
		else {
			valuesList = new ArrayList();
			valuesList.add(values);
		}
		keysHash.put(keys, valuesList);
		mMPIndex.put(valueAtoms, keysHash);
		return added;
	}
	
	/**
	 * Reads all molecules from a compound file parser and builds the Matched Molecular Pairs data.
	 * <p>
	 * Generates a hash table of keys - list of values;
	 * for double cuts, one key consists of a '\t'-separated string
	 * of the two 'left' and 'right' fragmentsID.
	 * <p>
	 * The visible steps are: reading and normalising the data fields of every row, fragmenting
	 * every molecule whose IDCode has not been seen before, computing the 5th/95th percentiles
	 * of the numeric fields, registering hydrogen replacements, and enumerating the MMPs.
	 * The parser is closed at the end of the constructor.
	 * <p>
	 * NOTE(review): several lines of this constructor (marked below) appear to have been truncated
	 * when this listing was extracted; the comments only describe what is still visible.
	 * @param datasetName Name of the data set
	 * @param compoundFileParser Compound File Parser (SD Reader, database link, ...)
	 * @param verbose Verbose (progress messages are printed to System.out)
	 * @throws IOException
	 */
	public MMP(String datasetName, CompoundFileParser compoundFileParser, boolean verbose) throws IOException {
		// NOTE(review): the generic type arguments of the three collections below look stripped
		mMPIndex = new HashMap>>();
		wholeMoleculesIndex = new LinkedHashMap>();
		moleculesFragmentsID = new ArrayList>();
		matchedMolecularPairs = new MMPairs();
		mmpUniqueFragments = new MMPUniqueFragments();
		r1HIndex = mmpUniqueFragments.addFragment(r1H, 0, null); // we force the number of atoms to 0 because [H] counts for 1
		mmpFragments = new MMPFragments();
		moleculesRowCount = 0;
		fieldNames = compoundFileParser.getFieldNames();
		fieldNumerics = new boolean[fieldNames.length];
		fieldPercentiles5 = new float[fieldNames.length];
		fieldPercentiles95 = new float[fieldNames.length];
		IDCodeParser idCodeParser = new IDCodeParser();
		// The parser type is detected through its class name
		boolean isSDFileParser = compoundFileParser.getClass().getName().contains("SDFileParser");
		// One list of parsed numeric values per field, used later for the percentiles
		@SuppressWarnings("unchecked")
		  ArrayList[] fieldDatas = (ArrayList[])new ArrayList[fieldNames.length];
		for (int i = 0; i < fieldNames.length; i++) {
			fieldDatas[i] = new ArrayList();
		}
		// Every field is assumed numeric until a non-numeric value is encountered
		Arrays.fill(fieldNumerics, true);
		this.datasetName = datasetName;
		NumberFormat formatter = new DecimalFormat("#.##");
		// molCounter counts unique molecules (by IDCode); moleculesRowCount counts all rows
		int molCounter = 0;
		if (verbose) {
			if (compoundFileParser.getRowCount() != -1) {
				System.out.println(getDateAndTime() + ": fragmenting " + compoundFileParser.getRowCount() + " molecules...");
			}
			else {
				System.out.println(getDateAndTime() + ": fragmenting molecules...");
			}
		}
		while (compoundFileParser.next()) {
			StereoMolecule mol = compoundFileParser.getMolecule();
			mol.stripSmallFragments();
			// Fall back to the molecule's own name when the parser does not provide one
			String moleculeName = (compoundFileParser.getMoleculeName() == null) ? mol.getName() : compoundFileParser.getMoleculeName();
			String molID = compoundFileParser.getIDCode();
			String molIDCoord = compoundFileParser.getCoordinates();
			if (isSDFileParser) {
				// TODO: avoid re-parsing the IDCode to ensure that the saved molecule is the same as this one - problem occurs with SDF files...
				idCodeParser.parse(mol, molID);
			}
			String moleculeData[] = new String[fieldNames.length];
			// NOTE(review): the next line appears truncated by extraction (loop header, the read of
			// fieldData and the first condition are missing) - restore from the upstream source.
			for (int i=0; i, >= and <= symbols
						fieldNumerics[i] = false;
					}
					else if (fieldData != null && isNumeric(fieldData)) {
						// Plain number: optionally converted with -log10(value * 1e-6) for "_uM" fields
						if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) {
							float data = round((float)-Math.log10(Float.parseFloat(fieldData)*1.0E-6), 3);
							fieldData = Float.toString(data);
							moleculeData[i] = fieldData;
							fieldDatas[i].add(data);
						}
						else {
							fieldDatas[i].add(Float.parseFloat(fieldData));
							moleculeData[i] = formatter.format(Float.parseFloat(fieldData));
						}
					}
					else if (fieldData != null && !fieldData.startsWith("<") && !fieldData.startsWith(">")) {
						// Neither a number nor a qualified number: the field is not numeric
						fieldNumerics[i] = false;
					}
					else if (fieldData != null && (fieldData.startsWith("<=") || fieldData.startsWith(">="))) {
						// Two-character qualifier; it is swapped after the transformation because
						// the negative logarithm reverses the ordering of the values
						if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) {
							float data = round((float)-Math.log10(Float.parseFloat(fieldData.substring(2))*1.0E-6), 3);
							if (fieldData.startsWith(">=")) {
								moleculeData[i] = "<=" + Float.toString(data);
							}
							else {
								moleculeData[i] = ">=" + Float.toString(data);
							}
						}
						else {
							moleculeData[i] = fieldData.substring(0, 2) + formatter.format(Float.parseFloat(fieldData.substring(2)));
						}
					}
					else if (fieldData != null && (fieldData.startsWith("<") || fieldData.startsWith(">"))) {
						// One-character qualifier, handled like the two-character case above.
						// Qualified values are not added to fieldDatas, so they do not enter the percentiles.
						if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) {
							float data = round((float)-Math.log10(Float.parseFloat(fieldData.substring(1))*1.0E-6), 3);
							if (fieldData.startsWith(">")) {
								moleculeData[i] = "<" + Float.toString(data);
							}
							else {
								moleculeData[i] = ">" + Float.toString(data);
							}
						}
						else {
							moleculeData[i] = fieldData.substring(0, 1) + formatter.format(Float.parseFloat(fieldData.substring(1)));
						}
					}
				}				
			}
			ArrayList moleculesIndex = new ArrayList();
			int molIndex = molCounter; 
			if (!wholeMoleculesIndex.containsKey(molID)) {
				// First occurrence of this IDCode: fragment the molecule and index all its cuts
				MMPFragmenter mmp = new MMPFragmenter(mol);
				moleculesFragmentsID.add(mmp.getMoleculeFragmentsID());
				List moleculeIndexesID = mmp.getMoleculeIndexesID(false);
				for (MoleculeIndexID moleculeIndexID: moleculeIndexesID) {
					String[] keysID = moleculeIndexID.getKeysID();
					String valueID = moleculeIndexID.getValueID();
					int valueAtoms = moleculeIndexID.getValueIDAtoms();
					int key1Index = mmpUniqueFragments.addFragment(keysID[0]);
					int valueIndex = mmpUniqueFragments.addFragment(valueID);
					moleculeIndexID.setValueIndex(valueIndex);
					if (keysID.length == 1) { // single cut
						addValues(valueAtoms, Integer.toString(key1Index) + "\t", new int[]{valueIndex, molCounter}, false);
						moleculeIndexID.setKeysIndex(new int[]{key1Index});
					}
					else { // double cut
						int key2Index = mmpUniqueFragments.addFragment(keysID[1]);
						addValues(valueAtoms, Integer.toString(key1Index) + "\t" + Integer.toString(key2Index), new int[]{valueIndex, molCounter}, false);
						moleculeIndexID.setKeysIndex(new int[]{key1Index, key2Index});
					}
					mmpFragments.addFragments(molCounter, moleculeIndexID);
					// Keep track of the biggest 'value' fragment for the enumeration step
					if (moleculeIndexID.getValueIDAtoms() > maxValueAtoms) {
						maxValueAtoms = moleculeIndexID.getValueIDAtoms();
					}
				}
				molCounter++;
			}
			else {
				// IDCode already seen: reuse the molecule index of its first occurrence
				moleculesIndex = wholeMoleculesIndex.get(molID);
				molIndex = moleculesIndex.get(0).moleculeIndex;
			}
			// Only the first entry of a given IDCode stores the IDCode and the coordinates
			if (moleculesIndex.size() > 0) {
				moleculesIndex.add(new MoleculeIndex(molIndex, moleculeName, moleculeData));
			}
			else {
				moleculesIndex.add(new MoleculeIndex(molIndex, molIDCoord, molID, moleculeName, moleculeData));
			}
			wholeMoleculesIndex.put(molID, moleculesIndex);
			moleculesRowCount++;
			// Progress display: "." every 10 rows, "#" every 100 rows, row count every 1000 rows
			if (verbose) {
				if (moleculesRowCount % 1000 == 0) {
					System.out.println("# " + moleculesRowCount);
				}
				else if (moleculesRowCount % 100 == 0) {
					System.out.print("#");
				}
				else if (moleculesRowCount % 10 == 0) {
					System.out.print(".");
				}
			}
		}
		// Getting percentiles
		if (verbose) {
			System.out.println(" " + moleculesRowCount);
			System.out.println(getDateAndTime() + ": getting percentiles...");
		}
		// NOTE(review): the next line appears truncated by extraction (loop header and the start of
		// the enclosing condition are missing).
		// NOTE(review): in the averaging branches below, index+1 looks like it can equal size()
		// (e.g. size 20 gives index 19 for the 95th percentile) - confirm the bounds.
		for (int i=0; i 0) {
				Collections.sort(fieldDatas[i]);
				int index = (int)Math.floor(0.05 * fieldDatas[i].size()); // I use floor so that I don't have to correct for the array indexes starting at 0
				if (Math.round(0.05f * fieldDatas[i].size()) != 0.05f * fieldDatas[i].size()) {
					fieldPercentiles5[i] = fieldDatas[i].get(index);
				}
				else {
					fieldPercentiles5[i] = (fieldDatas[i].get(index) + fieldDatas[i].get(index+1)) / 2.0f;
				}
				index = (int)Math.floor(0.95 * fieldDatas[i].size());
				if (Math.round(0.95f * fieldDatas[i].size()) != 0.95f * fieldDatas[i].size()) {
					fieldPercentiles95[i] = fieldDatas[i].get(index);
				}
				else {
					fieldPercentiles95[i] = (fieldDatas[i].get(index) + fieldDatas[i].get(index+1)) / 2.0f;
				}
			}
//			else if (fieldNumerics[i] != false && fieldDatas[i].size() == 0) {
//				fieldNumerics[i] = false;
//			}
		}
		// Processing Hydrogens replacements
		// A "clean" fragment that is itself one of the input molecules is registered as the
		// R-[H] value (value size 0) for the corresponding R-group fragment
		if (verbose)
			System.out.println(getDateAndTime() + ": processing hydrogen replacements...");
		for (List fragmentsID:moleculesFragmentsID) {
			for (String[] fragmentID:fragmentsID) { // {R-group, "clean"}
				if (wholeMoleculesIndex.containsKey(fragmentID[1])) {
					int moleculeIndex = wholeMoleculesIndex.get(fragmentID[1]).get(0).moleculeIndex;
					int fragmentIndex = mmpUniqueFragments.addFragment(fragmentID[0]);
					boolean added = addValues(0, Integer.toString(fragmentIndex) + "\t", new int[]{r1HIndex, moleculeIndex}, true);
					if (added) {
						// to do: replace -1 by the correct value (is it needed?)
						MoleculeIndexID moleculeIndexID = new MoleculeIndexID(new String[]{fragmentID[0]}, new int[]{fragmentIndex}, r1H, r1HIndex, null, 0, new int[]{-1}, new int[]{-1});
						mmpFragments.addFragments(moleculeIndex, moleculeIndexID); // String[] keysID, String valueID, int[] keysIDAtoms, int valueIDAtoms
					}
				}
			}
		}
		// Generating MMPs
		if (verbose)
			System.out.print(getDateAndTime() + ": generating MMPs");
		maxValueAtoms += 1;
		int counter = 0;
		int[][] combinations;
		// NOTE(review): VERSION is compared by reference (==) rather than with equals().
		// NOTE(review): the 'for' line below appears truncated by extraction (the rest of this
		// branch, the other branch and the enumeration loop header are missing).
		if (VERSION == "1.0") {
			combinations = new int[maxValueAtoms*(maxValueAtoms+1)/2-1][2];
			for (int i=1; i> mMPs = mMPEnumerator.getMMPEnumeration();
//			matchedMolecularPairs.addMMPs(mMPs);
			if (mMPs != null && mMPs.size() > 0)
				matchedMolecularPairs.writeMMPEnumeration(mMPs);
			counter++;
			// Progress display: "." every 10 steps, "#" every 100 steps, counter every 1000 steps
			if (verbose) {
				if (counter % 1000 == 0) {
					System.out.println("# " + counter);
				}
				else if (counter % 100 == 0) {
					System.out.print("#");
				}
				else if (counter % 10 == 0) {
					System.out.print(".");
				}
			}
		}
		// NOTE(review): the parser is only closed here, i.e. not when an exception is thrown
		// earlier in the constructor - confirm whether that matters to callers.
		compoundFileParser.close();
		if (verbose)
			System.out.println(" " + counter + "\n" + getDateAndTime() + ": done.");
  	}
	
	/**
	 * Writes the Molecules block. A moleculeIndex column has been added
	 * since molecules might be not unique, so the index won't be unique either.
	 * <p>
	 * NOTE(review): this method appears damaged by extraction: the println("") literals
	 * presumably carried markup that was stripped, and two 'for' lines are truncated.
	 * The comments only describe what is still visible.
	 * @param printWriter Writer receiving the block
	 */
	private void writeMolecules(PrintWriter printWriter) {
		// Tab-separated header line; one column per data field is appended below
		String line = "moleculeIndex\tidcoordinates2D\tmolecule\tmoleculeName";
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		// NOTE(review): the next line appears truncated by extraction (loop header and the
		// declarations of items, category, columnName and longName are missing).
		for (int i=0; i 1) {
					category = items[0];
					columnName = items[1];
					longName = items[1];
					if (items.length == 3) {
						longName = items[2];
					}
				}
				printWriter.println("");
				line += "\t" + columnName;
			}
		}
		printWriter.println("");
		printWriter.println(line);
		Iterator it = wholeMoleculesIndex.keySet().iterator();
		while (it.hasNext()) {
			String molID = it.next();
			String molIDCoord = "";
			List moleculesIndex = wholeMoleculesIndex.get(molID);
			for (MoleculeIndex moleculeIndex: moleculesIndex) {
				// The coordinates of the first entry that has some are reused for the following entries.
				// NOTE(review): molIDCoord is compared by reference (==) rather than with isEmpty()/equals().
				if (molIDCoord == "" && moleculeIndex.moleculeIDCoord != null) {
					molIDCoord = moleculeIndex.moleculeIDCoord;
				}
				line = Integer.toString(moleculeIndex.moleculeIndex) + "\t" + molIDCoord + "\t" + molID + "\t" + moleculeIndex.moleculeName;
				// NOTE(review): the next line appears truncated by extraction (the data-field loop,
				// the output of 'line' and the closing of the enclosing loops are missing).
				for (int i=0; i");
	}
	
	/**
	 * Writes the header (general information) block and calls the writing of the different blocks
	 * (molecules, unique fragments, fragments, matched molecular pairs), in that order.
	 * The writer is closed before returning.
	 * <p>
	 * NOTE(review): the println("") literals presumably carried header markup that was stripped
	 * when this listing was extracted; as visible here, dateFormat and date are not used.
	 * @param printWriter Writer receiving the whole file; closed by this method
	 * @throws IOException
	 */
	public void writeMMPFile(PrintWriter printWriter) throws IOException {
		printWriter.println("");
		printWriter.println("");
		DateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
		Date date = new Date();
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		printWriter.println("");
		// Process fragments (adding FP) and save them
		writeMolecules(printWriter);
		mmpUniqueFragments.writeUniqueFragments(printWriter);
		mmpFragments.writeFragments(printWriter);
		matchedMolecularPairs.writeMMPs(printWriter);
		printWriter.close();
	}
	
	/**
	 * Helper function for rounding a float to a fixed number of decimal places (half-up).
	 * <p>
	 * NaN and infinite inputs are returned unchanged: they have no decimal representation,
	 * so BigDecimal would otherwise throw a NumberFormatException (this can happen when the
	 * caller takes the logarithm of a zero or negative value).
	 * @param f Input value
	 * @param decimalPlace Number of digits to keep after the decimal point
	 * @return f rounded half-up to decimalPlace decimals, or f itself if it is NaN or infinite
	 */
	private static float round(float f, int decimalPlace) {
		if (Float.isNaN(f) || Float.isInfinite(f)) {
			return f;
		}
		// Go through the decimal string form so that the rounding acts on the printed digits
		BigDecimal bd = new BigDecimal(Float.toString(f));
		// RoundingMode replaces the deprecated BigDecimal.ROUND_HALF_UP int constant
		return bd.setScale(decimalPlace, java.math.RoundingMode.HALF_UP).floatValue();
	}
}