All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.actelion.research.chem.mmp.MMP Maven / Gradle / Ivy

There is a newer version: 2024.11.2
Show newest version
/*
 * Copyright (c) 2017
 * Actelion Pharmaceuticals Ltd.
 * Gewerbestrasse 16
 * CH-4123 Allschwil, Switzerland
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the copyright holder nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Gregori Gerebtzoff
 */

package com.actelion.research.chem.mmp;

import com.actelion.research.chem.IDCodeParser;
import com.actelion.research.chem.StereoMolecule;
import com.actelion.research.chem.io.CompoundFileParser;
import com.actelion.research.chem.mmp.MMPFragmenter.MoleculeIndexID;

import java.io.IOException;
import java.io.PrintWriter;
import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.*;

public class MMP {
	private static String VERSION = "1.1";									  // Version 1.1 has a modified Enumerator method (slightly faster reading)
	private static final String r1H = MMPFragmenter.createR1HMoleculeID();    // R1-H molecule IDCode (for Hydrogen replacements)
	private static final boolean TRANSFORM_UM_TO_PIC50 = true;                // transforms uM to pIC50 if a field name finishes with "_uM"
	private int maxValueAtoms = 0;                                            // size of the biggest fragment (value), for enumeration
	
	private MMPairs matchedMolecularPairs;                                    // container for all Matched Molecular Pairs (result of enumeration)
	private HashMap>> mMPIndex;     // valueAtoms -- keys - list of values ({valueIndex, moleculeIndex})
	private HashMap> wholeMoleculesIndex;    // IDCode - molIndex, molName, molData
	private List> moleculesFragmentsID;                        // container for "clean" fragments (without R-groups) and MolName
	private MMPUniqueFragments mmpUniqueFragments;                            // Set of unique fragmentsID
	private MMPFragments mmpFragments;                                        // Container for molecule fragments (keys, value, molecule index)
	private int r1HIndex;                                                     // idCode of R-[H] molecule
	private String[] fieldNames;                                              // list of field names
	private boolean[] fieldNumerics;                                          // list of booleans (true if the field contains only numerical data)
	private float[] fieldPercentiles5;                                        // list of 5th percentiles
	private float[] fieldPercentiles95;                                       // list of 95th percentile
	private int moleculesRowCount;
	private String datasetName;
	
	/**
	 * Simple mutable record describing one input molecule entry: its index,
	 * its name, its data fields and, optionally, its IDCode and coordinates.
	 */
	public static class MoleculeIndex {
		int moleculeIndex;
		String moleculeIDCoord;
		String moleculeIDCode;
		String moleculeName;
		String[] moleculeData;

		/**
		 * Creates a fully populated entry.
		 * @param moleculeIndex Index of the molecule
		 * @param moleculeIDCoord Encoded coordinates of the molecule (may be null)
		 * @param moleculeIDCode IDCode of the molecule (may be null)
		 * @param moleculeName Name of the molecule
		 * @param moleculeData Data fields of the molecule
		 */
		public MoleculeIndex(int moleculeIndex, String moleculeIDCoord, String moleculeIDCode, String moleculeName, String[] moleculeData) {
			this.moleculeData = moleculeData;
			this.moleculeName = moleculeName;
			this.moleculeIDCode = moleculeIDCode;
			this.moleculeIDCoord = moleculeIDCoord;
			this.moleculeIndex = moleculeIndex;
		}

		/**
		 * Creates an entry whose IDCode and coordinates are left null.
		 * @param moleculeIndex Index of the molecule
		 * @param moleculeName Name of the molecule
		 * @param moleculeData Data fields of the molecule
		 */
		public MoleculeIndex(int moleculeIndex, String moleculeName, String[] moleculeData) {
			this(moleculeIndex, null, null, moleculeName, moleculeData);
		}

		/**
		 * @param moleculeIDCode New IDCode of this entry
		 */
		public void setIDCode(String moleculeIDCode) {
			this.moleculeIDCode = moleculeIDCode;
		}

		/**
		 * @param moleculeIDCoord New encoded coordinates of this entry
		 */
		public void setIDCoord(String moleculeIDCoord) {
			this.moleculeIDCoord = moleculeIDCoord;
		}
	}

	/**
	 * Formats the current date and time, used as a prefix for the progress messages.
	 * @return Current time formatted with the pattern "d-MMM-yyyy HH:mm:ss"
	 */
	private static String getDateAndTime() {
		DateFormat timestampFormat = new SimpleDateFormat("d-MMM-yyyy HH:mm:ss");
		Date now = new Date();
		return timestampFormat.format(now);
	}

	/**
	 * Tells whether a String can be parsed as a floating-point number
	 * (using Double.parseDouble). A null input is reported as numeric.
	 * @param str String to be analyzed (may be null)
	 * @return true if the input is null or parses as a double, false otherwise
	 */
	private static boolean isNumeric(String str) {
		if (str != null) {
			try {
				Double.parseDouble(str);
			}
			catch (NumberFormatException notANumber) {
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Gets the mapped values corresponding to a defined size of the 'value' (variable part of the molecule)
	 * @param valueAtoms Number of heavy atoms of the 'value' (variable part of the molecule)
	 * @return Hashmap of keys - list of values ({valueIndex, moleculeIndex})
	 */
	private HashMap> getIndex(int valueAtoms) {
		if (mMPIndex.containsKey(valueAtoms)) {
			return mMPIndex.get(valueAtoms);
		}
		return null;
	}
	
	/**
	 * Add new value to mMPIndex
	 * @param valueAtoms Number of heavy atoms of the 'value' (variable part of the molecule)
	 * @param keys Array of one (single cut) or two (double cut) strings for the 'key' (constant part(s) of the molecule) 
	 * @param values Array of two integers (valueIndex, moleculeIndex)
	 * @param isH true if the 'value' is a Hydrogen atom
	 * @return true/false if the new values were added
	 */
	private boolean addValues(int valueAtoms, String keys, int[] values, boolean isH) {
		boolean added = true;
		HashMap> keysHash = null;
		ArrayList valuesList = null;
		if (mMPIndex.containsKey(valueAtoms)) {
			keysHash = mMPIndex.get(valueAtoms);
		}
		else {
			keysHash = new HashMap>();
		}
		if (keysHash.containsKey(keys)) {
			valuesList = keysHash.get(keys);
			if (valuesList == null)
				valuesList = new ArrayList();
			if (isH == true && !valuesList.isEmpty()) {
				int[] lastItem = valuesList.get(valuesList.size() - 1);
				if (lastItem[0] != r1HIndex)
					valuesList.add(values);
				else
					added = false;
			}
			else {
				valuesList.add(values);
			}
		}
		else {
			valuesList = new ArrayList();
			valuesList.add(values);
		}
		keysHash.put(keys, valuesList);
		mMPIndex.put(valueAtoms, keysHash);
		return added;
	}
	
	/**
	 * Generates a hash table of keys - list of values;
* for double cuts, one key consists of a '\t'-separated string
* of the two 'left' and 'right' fragmentsID. * @param datasetName Name of the data set * @param compoundFileParser Compound File Parser (SD Reader, database link, ...) * @param verbose Verbose * @throws IOException */ public MMP(String datasetName, CompoundFileParser compoundFileParser, boolean verbose) throws IOException { mMPIndex = new HashMap>>(); wholeMoleculesIndex = new LinkedHashMap>(); moleculesFragmentsID = new ArrayList>(); matchedMolecularPairs = new MMPairs(); mmpUniqueFragments = new MMPUniqueFragments(); r1HIndex = mmpUniqueFragments.addFragment(r1H, 0, null); // we force the number of atoms to 0 because [H] counts for 1 mmpFragments = new MMPFragments(); moleculesRowCount = 0; fieldNames = compoundFileParser.getFieldNames(); fieldNumerics = new boolean[fieldNames.length]; fieldPercentiles5 = new float[fieldNames.length]; fieldPercentiles95 = new float[fieldNames.length]; IDCodeParser idCodeParser = new IDCodeParser(); boolean isSDFileParser = compoundFileParser.getClass().getName().contains("SDFileParser"); @SuppressWarnings("unchecked") ArrayList[] fieldDatas = (ArrayList[])new ArrayList[fieldNames.length]; for (int i = 0; i < fieldNames.length; i++) { fieldDatas[i] = new ArrayList(); } Arrays.fill(fieldNumerics, true); this.datasetName = datasetName; NumberFormat formatter = new DecimalFormat("#.##"); int molCounter = 0; if (verbose) { if (compoundFileParser.getRowCount() != -1) { System.out.println(getDateAndTime() + ": fragmenting " + compoundFileParser.getRowCount() + " molecules..."); } else { System.out.println(getDateAndTime() + ": fragmenting molecules..."); } } while (compoundFileParser.next()) { StereoMolecule mol = compoundFileParser.getMolecule(); mol.stripSmallFragments(); String moleculeName = (compoundFileParser.getMoleculeName() == null) ? 
mol.getName() : compoundFileParser.getMoleculeName(); String molID = compoundFileParser.getIDCode(); String molIDCoord = compoundFileParser.getCoordinates(); if (isSDFileParser) { // TODO: avoid re-parsing the IDCode to ensure that the saved molecule is the same as this one - problem occurs with SDF files... idCodeParser.parse(mol, molID); } String moleculeData[] = new String[fieldNames.length]; for (int i=0; i, >= and <= symbols fieldNumerics[i] = false; } else if (fieldData != null && isNumeric(fieldData)) { if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) { float data = round((float)-Math.log10(Float.parseFloat(fieldData)*1.0E-6), 3); fieldData = Float.toString(data); moleculeData[i] = fieldData; fieldDatas[i].add(data); } else { fieldDatas[i].add(Float.parseFloat(fieldData)); moleculeData[i] = formatter.format(Float.parseFloat(fieldData)); } } else if (fieldData != null && !fieldData.startsWith("<") && !fieldData.startsWith(">")) { fieldNumerics[i] = false; } else if (fieldData != null && (fieldData.startsWith("<=") || fieldData.startsWith(">="))) { if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) { float data = round((float)-Math.log10(Float.parseFloat(fieldData.substring(2))*1.0E-6), 3); if (fieldData.startsWith(">=")) { moleculeData[i] = "<=" + Float.toString(data); } else { moleculeData[i] = ">=" + Float.toString(data); } } else { moleculeData[i] = fieldData.substring(0, 2) + formatter.format(Float.parseFloat(fieldData.substring(2))); } } else if (fieldData != null && (fieldData.startsWith("<") || fieldData.startsWith(">"))) { if (TRANSFORM_UM_TO_PIC50 && fieldNames[i].endsWith("_uM")) { float data = round((float)-Math.log10(Float.parseFloat(fieldData.substring(1))*1.0E-6), 3); if (fieldData.startsWith(">")) { moleculeData[i] = "<" + Float.toString(data); } else { moleculeData[i] = ">" + Float.toString(data); } } else { moleculeData[i] = fieldData.substring(0, 1) + formatter.format(Float.parseFloat(fieldData.substring(1))); } } } } 
ArrayList moleculesIndex = new ArrayList(); int molIndex = molCounter; if (!wholeMoleculesIndex.containsKey(molID)) { MMPFragmenter mmp = new MMPFragmenter(mol); moleculesFragmentsID.add(mmp.getMoleculeFragmentsID()); List moleculeIndexesID = mmp.getMoleculeIndexesID(false); for (MoleculeIndexID moleculeIndexID: moleculeIndexesID) { String[] keysID = moleculeIndexID.getKeysID(); String valueID = moleculeIndexID.getValueID(); int valueAtoms = moleculeIndexID.getValueIDAtoms(); int key1Index = mmpUniqueFragments.addFragment(keysID[0]); int valueIndex = mmpUniqueFragments.addFragment(valueID); moleculeIndexID.setValueIndex(valueIndex); if (keysID.length == 1) { // single cut addValues(valueAtoms, Integer.toString(key1Index) + "\t", new int[]{valueIndex, molCounter}, false); moleculeIndexID.setKeysIndex(new int[]{key1Index}); } else { // double cut int key2Index = mmpUniqueFragments.addFragment(keysID[1]); addValues(valueAtoms, Integer.toString(key1Index) + "\t" + Integer.toString(key2Index), new int[]{valueIndex, molCounter}, false); moleculeIndexID.setKeysIndex(new int[]{key1Index, key2Index}); } mmpFragments.addFragments(molCounter, moleculeIndexID); if (moleculeIndexID.getValueIDAtoms() > maxValueAtoms) { maxValueAtoms = moleculeIndexID.getValueIDAtoms(); } } molCounter++; } else { moleculesIndex = wholeMoleculesIndex.get(molID); molIndex = moleculesIndex.get(0).moleculeIndex; } if (moleculesIndex.size() > 0) { moleculesIndex.add(new MoleculeIndex(molIndex, moleculeName, moleculeData)); } else { moleculesIndex.add(new MoleculeIndex(molIndex, molIDCoord, molID, moleculeName, moleculeData)); } wholeMoleculesIndex.put(molID, moleculesIndex); moleculesRowCount++; if (verbose) { if (moleculesRowCount % 1000 == 0) { System.out.println("# " + moleculesRowCount); } else if (moleculesRowCount % 100 == 0) { System.out.print("#"); } else if (moleculesRowCount % 10 == 0) { System.out.print("."); } } } // Getting percentiles if (verbose) { System.out.println(" " + 
moleculesRowCount); System.out.println(getDateAndTime() + ": getting percentiles..."); } for (int i=0; i 0) { Collections.sort(fieldDatas[i]); int index = (int)Math.floor(0.05 * fieldDatas[i].size()); // I use floor so that I don't have to correct for the array indexes starting at 0 if (Math.round(0.05f * fieldDatas[i].size()) != 0.05f * fieldDatas[i].size()) { fieldPercentiles5[i] = fieldDatas[i].get(index); } else { fieldPercentiles5[i] = (fieldDatas[i].get(index) + fieldDatas[i].get(index+1)) / 2.0f; } index = (int)Math.floor(0.95 * fieldDatas[i].size()); if (Math.round(0.95f * fieldDatas[i].size()) != 0.95f * fieldDatas[i].size()) { fieldPercentiles95[i] = fieldDatas[i].get(index); } else { fieldPercentiles95[i] = (fieldDatas[i].get(index) + fieldDatas[i].get(index+1)) / 2.0f; } } // else if (fieldNumerics[i] != false && fieldDatas[i].size() == 0) { // fieldNumerics[i] = false; // } } // Processing Hydrogens replacements if (verbose) System.out.println(getDateAndTime() + ": processing hydrogen replacements..."); for (List fragmentsID:moleculesFragmentsID) { for (String[] fragmentID:fragmentsID) { // {R-group, "clean"} if (wholeMoleculesIndex.containsKey(fragmentID[1])) { int moleculeIndex = wholeMoleculesIndex.get(fragmentID[1]).get(0).moleculeIndex; int fragmentIndex = mmpUniqueFragments.addFragment(fragmentID[0]); boolean added = addValues(0, Integer.toString(fragmentIndex) + "\t", new int[]{r1HIndex, moleculeIndex}, true); if (added) { // to do: replace -1 by the correct value (is it needed?) 
MoleculeIndexID moleculeIndexID = new MoleculeIndexID(new String[]{fragmentID[0]}, new int[]{fragmentIndex}, r1H, r1HIndex, null, 0, new int[]{-1}, new int[]{-1}); mmpFragments.addFragments(moleculeIndex, moleculeIndexID); // String[] keysID, String valueID, int[] keysIDAtoms, int valueIDAtoms } } } } // Generating MMPs if (verbose) System.out.print(getDateAndTime() + ": generating MMPs"); maxValueAtoms += 1; int counter = 0; int[][] combinations; if (VERSION == "1.0") { combinations = new int[maxValueAtoms*(maxValueAtoms+1)/2-1][2]; for (int i=1; i> mMPs = mMPEnumerator.getMMPEnumeration(); // matchedMolecularPairs.addMMPs(mMPs); if (mMPs != null && mMPs.size() > 0) matchedMolecularPairs.writeMMPEnumeration(mMPs); counter++; if (verbose) { if (counter % 1000 == 0) { System.out.println("# " + counter); } else if (counter % 100 == 0) { System.out.print("#"); } else if (counter % 10 == 0) { System.out.print("."); } } } compoundFileParser.close(); if (verbose) System.out.println(" " + counter + "\n" + getDateAndTime() + ": done."); } /** * Writes the Molecules block. A moleculeIndex column has been added
* since molecules might be not unique → The index won't be unique. * @param printWriter */ private void writeMolecules(PrintWriter printWriter) { String line = "moleculeIndex\tidcoordinates2D\tmolecule\tmoleculeName"; printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); for (int i=0; i 1) { category = items[0]; columnName = items[1]; longName = items[1]; if (items.length == 3) { longName = items[2]; } } printWriter.println(""); line += "\t" + columnName; } } printWriter.println(""); printWriter.println(line); Iterator it = wholeMoleculesIndex.keySet().iterator(); while (it.hasNext()) { String molID = it.next(); String molIDCoord = ""; List moleculesIndex = wholeMoleculesIndex.get(molID); for (MoleculeIndex moleculeIndex: moleculesIndex) { if (molIDCoord == "" && moleculeIndex.moleculeIDCoord != null) { molIDCoord = moleculeIndex.moleculeIDCoord; } line = Integer.toString(moleculeIndex.moleculeIndex) + "\t" + molIDCoord + "\t" + molID + "\t" + moleculeIndex.moleculeName; for (int i=0; i"); } /** * Writes the header (general information) block and calls the writing of the different blocks * @param printWriter * @throws IOException */ public void writeMMPFile(PrintWriter printWriter) throws IOException { printWriter.println(""); printWriter.println(""); DateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy"); Date date = new Date(); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); printWriter.println(""); // Process fragments (adding FP) and save them writeMolecules(printWriter); mmpUniqueFragments.writeUniqueFragments(printWriter); mmpFragments.writeFragments(printWriter); matchedMolecularPairs.writeMMPs(printWriter); printWriter.close(); } /** * Helper function for 
rounding * @param f Input value * @param decimalPlace * @return */ private static float round(float f, int decimalPlace) { BigDecimal bd = new BigDecimal(Float.toString(f)); bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP); return bd.floatValue(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy