com.actelion.research.chem.io.DWARFileParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
There is a newer version: 2024.11.2
/*
 * Copyright (c) 1997 - 2016
 * Actelion Pharmaceuticals Ltd.
 * Gewerbestrasse 16
 * CH-4123 Allschwil, Switzerland
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the copyright holder nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author Thomas Sander
 */

package com.actelion.research.chem.io;

import com.actelion.research.chem.*;
import com.actelion.research.chem.descriptor.DescriptorConstants;
import com.actelion.research.chem.descriptor.DescriptorHandlerLongFFP512;
import com.actelion.research.chem.descriptor.DescriptorHandlerStandard2DFactory;
import com.actelion.research.chem.descriptor.DescriptorHelper;
import com.actelion.research.chem.reaction.Reaction;
import com.actelion.research.chem.reaction.ReactionEncoder;
import com.actelion.research.io.BOMSkipper;
import com.actelion.research.util.BinaryDecoder;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

public class DWARFileParser extends CompoundFileParser implements DescriptorConstants,CompoundTableConstants {

    public static final int MODE_COORDINATES_PREFER_2D = 1;
    public static final int MODE_COORDINATES_PREFER_3D = 2;
    public static final int MODE_COORDINATES_REQUIRE_2D = 3;
    public static final int MODE_COORDINATES_REQUIRE_3D = 4;
    private static final int MODE_COORDINATE_MASK = 7;
    public static final int MODE_BUFFER_HEAD_AND_TAIL = 8;
    public static final int MODE_EXTRACT_DETAILS = 16;

	private String[]		mFieldName;
	private String[]		mFieldData;
	private String			mLine,mCoordinate3DColumnName;
	private int[]			mFieldIndex;
    private int             mRecordCount,mMode;
	private int				mIDCodeColumn,mCoordinateColumn,mCoordinate2DColumn,mCoordinate3DColumn,
							mMoleculeNameColumn,mFragFpColumn;
    private TreeMap mColumnPropertyMap;
	private TreeMap mSpecialFieldMap;
	private TreeMap mDescriptorColumnMap;
	private ArrayList mHeadOrTailLineList;
	private HashMap mDetails;

    /**
     * Constructs a DWARFileParser from a file name with coordinate mode MODE_COORDINATES_PREFER_2D.
     * The file is read as UTF-8 and the header is parsed immediately by init().
     * An IOException raised while opening the file or parsing the header is not propagated;
     * instead the parser is left with a null reader. In every failure case the file that
     * was opened here is closed again, so that no file handle is leaked.
     * @param fileName path of the file to be parsed
     */
    public DWARFileParser(String fileName) {
        boolean initialized = false;
        try {
            mReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8));
            BOMSkipper.skip(mReader);
            mMode = MODE_COORDINATES_PREFER_2D;
            init();
            initialized = true;
            }
        catch (IOException e) {
            // as before, the exception is not passed on; cleanup happens in the finally block
            }
        finally {
            if (!initialized) {
                // the stream was opened by this constructor, so it must also be released here
                if (mReader != null) {
                    try {
                        mReader.close();
                        }
                    catch (IOException ignored) {
                        // best effort: the parser is unusable anyway
                        }
                    }
                mReader = null;
                }
            }
        }

    /**
     * Constructs a DWARFileParser from a File with coordinate mode MODE_COORDINATES_PREFER_2D.
     * The file is read as UTF-8 and the header is parsed immediately by init().
     * An IOException raised while opening the file or parsing the header is not propagated;
     * instead the parser is left with a null reader. In every failure case the file that
     * was opened here is closed again, so that no file handle is leaked.
     * @param file the file to be parsed
     */
    public DWARFileParser(File file) {
        boolean initialized = false;
        try {
            mReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
            BOMSkipper.skip(mReader);
            mMode = MODE_COORDINATES_PREFER_2D;
            init();
            initialized = true;
            }
        catch (IOException e) {
            // as before, the exception is not passed on; cleanup happens in the finally block
            }
        finally {
            if (!initialized) {
                // the stream was opened by this constructor, so it must also be released here
                if (mReader != null) {
                    try {
                        mReader.close();
                        }
                    catch (IOException ignored) {
                        // best effort: the parser is unusable anyway
                        }
                    }
                mReader = null;
                }
            }
        }

    /**
     * Constructs a DWARFileParser from a Reader with coordinate mode MODE_COORDINATES_PREFER_2D.
     * This is a shorthand for the two-argument Reader constructor: the reader is used directly
     * if it already is a BufferedReader and wrapped otherwise, the header is parsed by init(),
     * and an IOException during parsing leaves the parser with a null reader.
     * @param reader source of the file content
     */
    public DWARFileParser(Reader reader) {
        this(reader, MODE_COORDINATES_PREFER_2D);
        }

    /**
     * Constructs a DWARFileParser from a file name with the specified mode.
     * The file is read as UTF-8 and the header is parsed immediately by init().
     * An IOException raised while opening the file or parsing the header is not propagated;
     * instead the parser is left with a null reader. In every failure case the file that
     * was opened here is closed again, so that no file handle is leaked.
     * @param fileName path of the file to be parsed
     * @param mode one of 4 MODE_COORDINATE... modes, which may be combined with the
     *             MODE_BUFFER_HEAD_AND_TAIL and MODE_EXTRACT_DETAILS flags
     */
    public DWARFileParser(String fileName, int mode) {
        boolean initialized = false;
        try {
            mReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8));
            BOMSkipper.skip(mReader);
            mMode = mode;
            init();
            initialized = true;
            }
        catch (IOException e) {
            // as before, the exception is not passed on; cleanup happens in the finally block
            }
        finally {
            if (!initialized) {
                // the stream was opened by this constructor, so it must also be released here
                if (mReader != null) {
                    try {
                        mReader.close();
                        }
                    catch (IOException ignored) {
                        // best effort: the parser is unusable anyway
                        }
                    }
                mReader = null;
                }
            }
        }

    /**
     * Constructs a DWARFileParser from a File with the specified mode.
     * The file is read as UTF-8 and the header is parsed immediately by init().
     * An IOException raised while opening the file or parsing the header is not propagated;
     * instead the parser is left with a null reader. In every failure case the file that
     * was opened here is closed again, so that no file handle is leaked.
     * @param file the file to be parsed
     * @param mode one of 4 MODE_COORDINATE... modes, which may be combined with the
     *             MODE_BUFFER_HEAD_AND_TAIL and MODE_EXTRACT_DETAILS flags
     */
    public DWARFileParser(File file, int mode) {
        boolean initialized = false;
        try {
            mReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
            BOMSkipper.skip(mReader);
            mMode = mode;
            init();
            initialized = true;
            }
        catch (IOException e) {
            // as before, the exception is not passed on; cleanup happens in the finally block
            }
        finally {
            if (!initialized) {
                // the stream was opened by this constructor, so it must also be released here
                if (mReader != null) {
                    try {
                        mReader.close();
                        }
                    catch (IOException ignored) {
                        // best effort: the parser is unusable anyway
                        }
                    }
                mReader = null;
                }
            }
        }

    /**
     * Constructs a DWARFileParser from a Reader with the specified mode.
     * A reader that already is a BufferedReader is used as is; any other reader gets wrapped.
     * The header is parsed immediately by init(); if that raises an IOException, the
     * exception is not propagated and the parser is left with a null reader.
     * The passed reader is not closed in that case.
     * @param reader source of the file content
     * @param mode one of 4 MODE_COORDINATE... modes, which may be combined with the
     *             MODE_BUFFER_HEAD_AND_TAIL and MODE_EXTRACT_DETAILS flags
     */
    public DWARFileParser(Reader reader, int mode) {
        try {
            if (reader instanceof BufferedReader)
                mReader = (BufferedReader)reader;
            else
                mReader = new BufferedReader(reader);

            mMode = mode;
            init();
            }
        catch (IOException e) {
            mReader = null;
            }
        }

    /**
     * Reads the next line from the underlying reader. If the mode contains
     * MODE_BUFFER_HEAD_AND_TAIL, the line is also appended to the head/tail line list.
     * @return the line read, or null when the reader has no more lines
     * @throws IOException if reading from the underlying reader fails
     */
    private String readHeadOrTailLine() throws IOException {
        String nextLine = mReader.readLine();
        if (nextLine == null)
            return null;    // end of input: nothing to buffer

        boolean buffering = (mMode & MODE_BUFFER_HEAD_AND_TAIL) != 0;
        if (buffering)
            mHeadOrTailLineList.add(nextLine);

        return nextLine;
        }

    private void init() throws IOException {
	    setDescriptorHandlerFactory(DescriptorHandlerStandard2DFactory.getFactory());

    	if ((mMode & MODE_BUFFER_HEAD_AND_TAIL) != 0)
    		mHeadOrTailLineList = new ArrayList<>();

    	int coordinateMode = mMode & MODE_COORDINATE_MASK;
	
		String line = readHeadOrTailLine();
        if (line == null
         || !line.equals(cNativeFileHeaderStart))
            throw new IOException("no header found");

        mRecordCount = -1;
        line = readHeadOrTailLine();
        while (line != null
            && !line.equals(cNativeFileHeaderEnd)) {

            if (line.startsWith("<"+cNativeFileVersion)) {
                String version = extractValue(line);
                if (!version.startsWith("3.")
                 && !version.isEmpty())
                    throw new IOException("unsupported .dwar file version");
                }

            else if (line.startsWith("<"+cNativeFileRowCount)) {
                try {
                    mRecordCount = Integer.parseInt(extractValue(line));
                    }
                catch (NumberFormatException e) {}
                }
  
            line = readHeadOrTailLine();
            }

        line = readHeadOrTailLine();

        while (line != null
         && (line.equals(cFileExplanationStart)
          || line.equals(cMacroListStart))) {
            line = readHeadOrTailLine();
            while (line != null
                && !line.equals(cFileExplanationEnd)
                && !line.equals(cMacroListEnd))
                line = readHeadOrTailLine();
            line = readHeadOrTailLine();
        	}

        mColumnPropertyMap = new TreeMap<>();

        if (line != null
         && line.equals(cColumnPropertyStart)) {
            line = readHeadOrTailLine();
            String columnName = null;
            while (line != null
                && !line.equals(cColumnPropertyEnd)) {

                if (line.startsWith("<"+cColumnName)) {
                	columnName = extractValue(line);
                	mColumnPropertyMap.put(columnName, new Properties());
                    }

                else if (line.startsWith("<"+cColumnProperty)) {
                    String[] property = extractValue(line).split("\\t");
                    if(property.length==1) {
                    	mColumnPropertyMap.get(columnName).setProperty(property[0],"");
                    }
                    else {
                    	mColumnPropertyMap.get(columnName).setProperty(property[0], property[1]);
                    }
                }

                line = readHeadOrTailLine();
                }

            line = readHeadOrTailLine();
            }

        mSpecialFieldMap = new TreeMap<>();	// only take those columns that have a special type
        for (String columnName:mColumnPropertyMap.keySet()) {
        	Properties properties = mColumnPropertyMap.get(columnName);
        	String specialType = properties.getProperty(cColumnPropertySpecialType);
        	if (specialType != null)
        		mSpecialFieldMap.put(columnName, new SpecialField(
        				columnName,
        				specialType,
        				properties.getProperty(cColumnPropertyParentColumn),
        				properties.getProperty(cColumnPropertyRelatedIdentifierColumn),
        				properties.getProperty(cColumnPropertyDescriptorVersion)
        				));
        	}
        
        ArrayList columnNameList = new ArrayList();
        ArrayList columnIndexList = new ArrayList();

		if (line == null)
            throw new IOException("unexpected end of file");

		int fromIndex = 0;
		int toIndex = 0;
		int sourceColumn = 0;
		do {
			String columnName;
			toIndex = line.indexOf('\t', fromIndex);
			if (toIndex == -1) {
				columnName = line.substring(fromIndex);
				}
			else {
				columnName = line.substring(fromIndex, toIndex);
				fromIndex = toIndex+1;
				}

            if (mSpecialFieldMap.containsKey(columnName)) {
                mSpecialFieldMap.get(columnName).fieldIndex = sourceColumn;
                }
            else {
    			columnNameList.add(columnName);
    			columnIndexList.add(sourceColumn);
                }

			sourceColumn++;
			} while (toIndex != -1);

		mFieldName = new String[columnNameList.size()];
		mFieldIndex = new int[columnNameList.size()];
		for (int i=0; i specialColumn.fieldIndex)
                        idcodeColumn = specialColumn;
                    }
                }
            }
        if (idcodeColumn != null) {
            if (idcodeColumn.idColumn != null) {
                for (int i=0; i();
                        mDescriptorColumnMap.put(specialColumn.type, specialColumn.fieldIndex);
                        }
                    }
                }

			if (mCoordinate2DColumn != -1
			 && (coordinateMode == MODE_COORDINATES_REQUIRE_2D
			  || coordinateMode == MODE_COORDINATES_PREFER_2D
			  || (coordinateMode == MODE_COORDINATES_PREFER_3D && mCoordinate3DColumn == -1)))
				mCoordinateColumn = mCoordinate2DColumn;

			if (mCoordinate3DColumn != -1
			 && (coordinateMode == MODE_COORDINATES_REQUIRE_3D
			  || coordinateMode == MODE_COORDINATES_PREFER_3D
			  || (coordinateMode == MODE_COORDINATES_PREFER_2D && mCoordinate2DColumn == -1)))
				mCoordinateColumn = mCoordinate3DColumn;
            }
	    }

    /**
     * Tells whether an idcode column was recorded for this file, i.e. whether the
     * idcode column index differs from -1.
     * If you don't read any records after calling this method,
     * don't forget to call close() to close the underlying file.
     * @return whether the file contains chemical structures
     */
    public boolean hasStructures() {
    	return (mIDCodeColumn != -1);
    	}

    /**
     * Tells whether a coordinate column is in use. Which of the 2D- or 3D-coordinate
     * columns becomes the coordinate column is decided in init() from the coordinate
     * mode and from the columns that are actually present.
     * @return whether the file contains chemical structures with explicit atom coordinates
     *         that match the requested coordinate mode
     */
    public boolean hasStructureCoordinates() {
    	return (mCoordinateColumn != -1);
    	}

	/**
	 * Tells whether a 2D-coordinate column was recorded, independent of the coordinate mode.
	 * @return whether the 2D-coordinate column index differs from -1
	 */
	public boolean hasStructureCoordinates2D() {
		return (mCoordinate2DColumn != -1);
		}

	/**
	 * Tells whether a 3D-coordinate column was recorded, independent of the coordinate mode.
	 * @return whether the 3D-coordinate column index differs from -1
	 */
	public boolean hasStructureCoordinates3D() {
		return (mCoordinate3DColumn != -1);
		}

	/**
	 * NOTE(review): presumably the name of the column that holds the 3D-coordinates;
	 * the code assigning this field is not part of the intact source, so confirm there.
	 * @return the stored 3D-coordinate column name
	 */
	public String getStructureCoordinates3DColumnName() {
		return mCoordinate3DColumnName;
		}

	/**
	 * Returns the field name array created by init(). That array has one entry for every
	 * column of the title line that is not registered as a special field.
	 * The internal array itself is returned, not a copy.
	 * @return names of the non-special fields
	 */
	public String[] getFieldNames() {
		return mFieldName;
		}

	/**
	 * Looks up a special field by its column name.
	 * @param columnName name of the special column; must not be null
	 * @return field index of the first special field with that name, e.g. to be used for
	 *         getSpecialFieldData(), or -1 if there is no special field with that name
	 */
	public int getSpecialFieldIndex(String columnName) {
		int index = -1;
		for (SpecialField field:mSpecialFieldMap.values()) {
			if (columnName.equals(field.name)) {
				index = field.fieldIndex;
				break;	// the first match wins
				}
			}

		return index;
		}

	/**
	 * Looks up a special field by the name of its parent column and by its type.
	 * @param parentColumnName name of the parent column; must not be null
	 * @param childType special type of the wanted child column; must not be null
	 * @return field index of the first matching special field, e.g. to be used for
	 *         getSpecialFieldData(), or -1 if no special field matches
	 */
	public int getChildFieldIndex(String parentColumnName, String childType) {
		for (SpecialField field:mSpecialFieldMap.values()) {
			if (!parentColumnName.equals(field.parent))
				continue;	// belongs to a different parent (or has none)

			if (childType.equals(field.type))
				return field.fieldIndex;
			}

		return -1;
		}

    /**
     * Returns the row count as declared in the file header. The value is not
     * obtained by counting rows.
     * @return declared row count, or -1 if the header contained no parsable row count
     */
    public int getRowCount() {
        return mRecordCount;
        }

    /**
     * Provided that the mode contains MODE_BUFFER_HEAD_AND_TAIL, then this method
     * returns a list of all header/footer rows of the DWAR file. If this method is
     * called before all rows have been read, then the header lines including column
     * properties and the column title line are returned. If this method is
     * called after all rows have been read, then all lines after the data table, i.e. the
     * runtime properties, are returned.
     * The list is only allocated by init() when MODE_BUFFER_HEAD_AND_TAIL is set;
     * the internal list itself is returned, not a copy.
     * @return buffered header or footer lines
     */
    public ArrayList getHeadOrTail() {
        return mHeadOrTailLineList;
        }

    /**
     * Provided that the mode contains MODE_EXTRACT_DETAILS, then this method
     * returns a map of all embedded detail objects of the DWAR file.
     * This method must not be called before all rows have been read.
     * The internal map itself is returned, not a copy.
     * @return map of embedded detail objects
     */
    public HashMap getDetails() {
    	return mDetails;
    	}

    /**
     * Returns the entire line containing all row data
     * NOTE(review): presumably the line of the most recently read row; the code assigning
     * it is not part of the intact source, so confirm there.
     * @return the stored row line
     */
    public String getRow() {
        return mLine;
        }

	/**
	 * Returns the raw data in the following format:
	 * Columns are sorted according to the order of how they appear in DataWarrior:
	 *
	 * @param includeHeaderRow
	 * @return
	 */
	public String[][] getRawData(boolean includeHeaderRow, boolean structureAsSmiles) {

		String[] fn  = getFieldNames();
		TreeMap sfn = getSpecialFieldMap();

		List allFieldNames = new ArrayList<>();

		allFieldNames.addAll(Arrays.stream(fn).collect(Collectors.toList()));
		List allFieldNamesForOutput = new ArrayList<>();
		allFieldNamesForOutput.addAll(allFieldNames);
		for(String sfi : sfn.keySet().stream().sorted(  (x,y) -> Integer.compare( sfn.get(x).fieldIndex , sfn.get(y).fieldIndex ) ).collect(Collectors.toList())) {
			if(sfn.get(sfi).type.equals(cColumnTypeIDCode)) {
				allFieldNames.add(sfn.get(sfi).fieldIndex, sfi);
				allFieldNamesForOutput.add(sfi);
			}
		}

		int nDataRows = getRowCount();
		int nOutputRows = nDataRows + (includeHeaderRow?1:0);

		String[][] rawData = new String[ nOutputRows ][ allFieldNames.size() ];

		if(includeHeaderRow) {
			for(int zi=0;zi();
		try {
		    while (true) {
				String theLine = readHeadOrTailLine();
				if (theLine == null
				 || theLine.equals(cDetailDataEnd)) {
					break;
					}

				if (theLine.startsWith("<"+cDetailID)) {
					String detailID = extractValue(theLine);
					BinaryDecoder decoder = new BinaryDecoder(mReader);
					int size = decoder.initialize(8);
					byte[] detailData = new byte[size];
					for (int i=0; iSpecialField map of all non-alphanumerical columns.
	 * SpecialField.type is one of the types defined in CompoundTableConstants:
	 * cColumnTypeIDCode,cColumnTypeRXNCode,cColumnType2DCoordinates,cColumnType3DCoordinates,
	 * cColumnTypeAtomColorInfo, and descriptor shortNames;
	 * @return special fields
	 */
	public TreeMap getSpecialFieldMap() {
	    // The map is built by init(): it is keyed by column name and only contains columns
	    // that define a special type. The internal map itself is handed out, not a copy.
	    return mSpecialFieldMap;
	    }

	/**
	 * Returns the entry of the field data array at the given index.
	 * No range check is done here; an index outside of the array fails with an
	 * ArrayIndexOutOfBoundsException.
	 * NOTE(review): the field data array is presumably filled per row when a row is read;
	 * that code is not part of the intact source, so confirm there.
	 * @param fieldIndex is available from special-field-TreeMap by getSpecialFieldMap().get(columnName).fieldIndex
	 * @return String encoded data content of special field, e.g. idcode
	 */
    public String getSpecialFieldData(int fieldIndex) {
        return mFieldData[fieldIndex];
        }

    /**
     * Returns the original column properties of any source column by column name.
     * The properties are those collected by init() from the column property section
     * of the file header.
     * @param columnName name of the source column
     * @return the properties stored for that column, or null if the header
     *         defined no properties for a column of that name
     */
    public Properties getColumnProperties(String columnName) {
    	return mColumnPropertyMap.get(columnName);
    	}

    /**
     * Extracts the text between the first occurrence of {@code ="} and the next
     * double quote of the given line.
     * A line without {@code ="}, or without a closing double quote after it, is not
     * handled specially: substring() then fails with a StringIndexOutOfBoundsException.
     * @param theLine header line of the form {@code <key="value">}
     * @return the quoted value without its quotes
     */
    private String extractValue(String theLine) {
        final String opening = "=\"";
        int start = theLine.indexOf(opening) + opening.length();
        int end = theLine.indexOf('"', start);
        return theLine.substring(start, end);
        }

	/**
	 * Describes one column of the file that has a special type.
	 * init() creates one instance per column whose properties define a special type
	 * and later stores the column's position within the title line in fieldIndex.
	 */
	public class SpecialField {
		/** column name */
		public String name;
		/** value of the column's special type property */
		public String type;
		/** value of the column's parent column property, may be null */
		public String parent;
		/** value of the column's related identifier column property, may be null */
		public String idColumn;
		/** value of the column's descriptor version property, may be null */
		public String version;
		/** index of the column within the title line; -1 until it has been assigned */
		public int fieldIndex = -1;

		/**
		 * Creates a special field that is not yet assigned to a source column.
		 * All arguments are stored as passed.
		 */
		public SpecialField(String name, String type, String parent, String idColumn, String version) {
			this.version = version;
			this.idColumn = idColumn;
			this.parent = parent;
			this.type = type;
			this.name = name;
			}
		}
    }