com.actelion.research.chem.io.SDFileParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of openchemlib Show documentation
Open Source Chemistry Library
There is a newer version: 2024.11.2
/*
* Copyright (c) 1997 - 2016
* Actelion Pharmaceuticals Ltd.
* Gewerbestrasse 16
* CH-4123 Allschwil, Switzerland
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
* 3. Neither the name of the the copyright holder nor the
*    names of its contributors may be used to endorse or promote products
*    derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/

package com.actelion.research.chem.io;

import com.actelion.research.chem.MolfileParser;
import com.actelion.research.chem.StereoMolecule;
import com.actelion.research.chem.UniqueStringList;
import com.actelion.research.io.BOMSkipper;

import java.io.*;
import java.nio.charset.StandardCharsets;

public class SDFileParser extends CompoundFileParser {
    // Default limit for the number of records to inspect. Its use is not visible in this
    // part of the file; presumably the no-argument getFieldNames() passes it on (TODO confirm).
    private static final int DEFAULT_RECORDS_TO_INSPECT = 10240;
    // Well-known names of identifier fields. NOTE(review): the code using this list is not
    // visible here; presumably it determines mIDFieldIndex - verify against advanceToNext().
    private static final String[] cIDFieldNames = { "Idorsia No", "Actelion No", "ID", "IDNUMBER", "COMPOUND_ID", "NAME", "COMPND" };
	// Line separator constant exposed to other classes.
	public static final String cNewLineString = "\n";

	// Per-record text buffers: advanceToNext() appends the molfile lines of the current
	// record to mMolfileBuffer and the lines of the data section to mDataBuffer.
	private StringBuilder		mMolfileBuffer,mDataBuffer;
	// Reset to null whenever advanceToNext() starts a new record;
	// presumably caches the parsed molecule of the current record (TODO confirm).
	private StereoMolecule		mMol;
	// Field names, either passed to a constructor or collected by extractAllFieldNames().
	private String[]			mFieldName;
	// Field values of the current record; allocated with the length of mFieldName
	// in advanceToNext(), or null if no field names are known.
	private String[]			mFieldData;
	// mNoOfRecords: -1 until extractAllFieldNames() reached the end of the input.
	// mIDFieldIndex: reset to -1 at the start of every record.
	private int					mNoOfRecords,mIDFieldIndex;

	/**
	 * Creates a parser for the file with the given name without predefined
	 * field names; delegates to SDFileParser(String, String[]) with a null
	 * field name array.
	 * @param fileName name of the SD-file to read
	 */
	public SDFileParser(String fileName) {
		this(fileName, null);
		}

	public SDFileParser(String fileName, String[] fieldName) {
	    mNoOfRecords = -1;
	    
		mFieldName = fieldName;
		
		try {
			mReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8));
			BOMSkipper.skip(mReader);
		} catch (IOException e) {
			mReader = null;
		}
		
		
		init();
		}


	/**
	 * Creates a parser for the given file without predefined field names;
	 * delegates to SDFileParser(File, String[]) with a null field name array.
	 * @param file SD-file to read
	 */
	public SDFileParser(File file) {
		this(file, null);
		}


	public SDFileParser(File file, String[] fieldName) {
        mNoOfRecords = -1;
		mFieldName = fieldName;
		try {
    		mReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
			BOMSkipper.skip(mReader);
		} catch (IOException e) {
			mReader = null;
		}
		
		init();
	}


	/**
	 * Creates a parser on top of the given reader without predefined field
	 * names; delegates to SDFileParser(Reader, String[]) with a null field
	 * name array.
	 * @param reader source of the SD-file content
	 */
	public SDFileParser(Reader reader) {
		this(reader, null);
	}


	/**
	 * Creates a parser on top of the given reader. A reader that already is a
	 * BufferedReader is used directly; any other reader is wrapped into one.
	 * @param reader source of the SD-file content
	 * @param fieldName names of the data fields to extract from every record; may be null
	 */
	public SDFileParser(Reader reader, String[] fieldName) {
		mFieldName = fieldName;
		mNoOfRecords = -1;

		if (reader instanceof BufferedReader)
			mReader = (BufferedReader)reader;
		else
			mReader = new BufferedReader(reader);

		init();
		}


	/**
	 * Allocates the two text buffers that hold the molfile part and the
	 * data part of the current record.
	 */
	private void init() {
		final int initialCapacity = 10240;
		mDataBuffer = new StringBuilder(initialCapacity);
		mMolfileBuffer = new StringBuilder(initialCapacity);
		}
	
	/**
	 * Reads the input line by line and collects the names of all data fields
	 * it encounters, until recordsToInspect record terminators ('$$$$') were
	 * seen or the input ends. If the input ends (or a read error occurs)
	 * before that limit is reached, the number of records seen so far is
	 * stored as the record count. Afterwards the reader is closed and the
	 * collected names are stored in mFieldName.
	 * If no reader is available (the file could not be opened), mFieldName
	 * becomes whatever an empty UniqueStringList returns from toArray() and
	 * the record count is left untouched.
	 * @param recordsToInspect maximum number of records to scan for field names
	 */
	private void extractAllFieldNames(int recordsToInspect) {
//		TreeSet fieldNameList = new TreeSet(); Changed to keep the original order of field names. TLS 6Jan16
		UniqueStringList fieldNameList = new UniqueStringList();

		// The file based constructors leave mReader null when the file cannot be
		// opened; advanceToNext() tolerates that, so this method must as well.
		if (mReader != null) {
			int records = 0;
			while (records < recordsToInspect) {
				String line;
				try {
					line = mReader.readLine();
					}
				catch (IOException e) {
					line = null;	// a read error is handled like the end of the input
					}

				if (line == null) {
					// input ended before the limit: all records were seen
					mNoOfRecords = records;
					break;
					}

				if (line.startsWith("$$$$"))
					records++;

				if (line.startsWith(">")) {
					String fieldName = extractFieldName(line);
					if (fieldName != null)
						fieldNameList.addString(fieldName);
					}
				}

			try {
				mReader.close();
				}
			catch (IOException ignored) {
				// the reader is used up at this point; a failing close() changes nothing
				}
			}

		mFieldName = fieldNameList.toArray();
		}


	/**
	 * Returns the record count that was determined while scanning for field names.
	 * Only accurate if getFieldNames() or getFieldNames(int) was called earlier
	 * and if the number of records of the SD-file is smaller than the number
	 * of records that were examined within the getFieldNames() method.
	 * If not all records of the file were seen, then -1 is returned.
	 * For getRowCount() to reliably return the record count call getFieldNames(Integer.MAX_VALUE) first.
	 * @return number of rows or -1
	 */
	public int getRowCount() {
		return mNoOfRecords;
		}


	protected boolean advanceToNext() {
		if (mReader == null)
			return false;

// removed 13.8.2012 TLS; no need to read molfile in order to advance to the next record
//		if (mMolfileBuffer.length() != 0)
//			return true;

		mMolfileBuffer.setLength(0);
		mDataBuffer.setLength(0);
		
    	mMol = null;

		boolean molfileComplete = false;
		int fieldIndex = -1;
		String fieldName = null;
		String line;
		mFieldData = (mFieldName == null) ? null : new String[mFieldName.length];
		mIDFieldIndex = -1;

		do {
			try {
				line = mReader.readLine();
				if (line == null) {
	    			mMolfileBuffer.setLength(0);
		    		mReader.close();
			    	return false;
				    }
				}
			catch (IOException e) {
				mMolfileBuffer.setLength(0);
				return false;
				}

			if (!molfileComplete) {
				if (line.startsWith(">")) {	// to handle sd-record with molfiles without 'M  END'
					molfileComplete = true;
					mMolfileBuffer.append("M  END");
		    		mMolfileBuffer.append('\n');
		    		mDataBuffer.append(line);
		    		mDataBuffer.append('\n');
					}
				else {
					mMolfileBuffer.append(line);
		    		mMolfileBuffer.append('\n');
			    	if (line.startsWith("M  END"))
						molfileComplete = true;
					continue;
					}
				}
			else {
	    		mDataBuffer.append(line);
	    		mDataBuffer.append('\n');
				}

			if (mFieldName != null) {
				if (line.length() == 0) {
					fieldIndex = -1;
					}
				else if (fieldIndex == -1) {
					fieldName = extractFieldName(line);
					if (fieldName != null) {
					    // find fieldIndex to given fieldName
						fieldIndex = -1;
						for (int field=0; field records to extract all field names and uses up this
     * SDFileParser. In this case one needs to instantiate a new SDFileParser
     * to sequentially iterate through the file/reader's records and supply
     * the field name array to the constructor.
     * @return array of field names
     */
	public String[] getFieldNames(int recordsToInspect) {
		// Field names that are already known (passed to the constructor or
		// collected by an earlier call) are returned without touching the reader.
		if (mFieldName != null)
			return mFieldName;

		extractAllFieldNames(recordsToInspect);
		return mFieldName;
		}

	/*	public boolean moreRecordsAvailable() {
		if (mFieldDataList == null || mFieldDataList.size() == 0)
			return false;

		mCurrentFieldData = mFieldDataList.get(0);
		mFieldDataList.remove(0);
		return true;
		}
*/

	/**
	 * Returns the value stored for the current record at the given position
	 * of the field data array.
	 * @param index position within the field data array
	 * @return the stored value, or null if no field data array exists
	 */
	public String getFieldData(int index) {
		return (mFieldData != null) ? mFieldData[index] : null;
		}


	/**
	 * Extracts the field name from a data header line of an SD-file record.
	 * The line must start with '>'. If the remainder contains exactly one
	 * '<' followed later by exactly one '>', the text between them is the
	 * field name. More than one '<' or more than one '>' after the first
	 * character makes the line invalid. Otherwise a MACCS-II style field
	 * number of the form DTn (n being one or more digits) is looked for.
	 * @param line one line of the SD-file
	 * @return the field name, or null if the line carries none
	 */
	protected String extractFieldName(String line) {
		if (line.isEmpty() || line.charAt(0) != '>')
			return null;

		// 0 means 'not found yet': position 0 holds the leading '>' and is never scanned
		int ltPos = 0;
		int gtPos = 0;
		for (int pos=1; pos<line.length(); pos++) {
			switch (line.charAt(pos)) {
			case '<':
				if (ltPos != 0)
					return null;
				ltPos = pos;
				break;
			case '>':
				if (gtPos != 0)
					return null;
				gtPos = pos;
				break;
			default:
				break;
				}
			}

		boolean hasBracketedName = (ltPos != 0 && ltPos < gtPos);
		if (hasBracketedName)
			return line.substring(ltPos+1, gtPos);

		// allow for MACCS-II field numbers, which have format DTn
		int dtStart = line.indexOf("DT", 1);
		if (dtStart == -1)
			return null;

		int digitsStart = dtStart+2;
		int end = digitsStart;
		while (end < line.length() && Character.isDigit(line.charAt(end)))
			end++;

		if (end == digitsStart)
			return null;

		return line.substring(dtStart, end);
		}
	}