All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openscience.cdk.io.iterator.IteratingSDFReader Maven / Gradle / Ivy

There is a newer version: 2.10
Show newest version
/* Copyright (C) 2003-2007  The Chemistry Development Kit (CDK) project
 *                    2014  Mark B Vine (orcid:0000-0002-7794-0426)
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may distribute
 * with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.openscience.cdk.io.iterator;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.io.ISimpleChemObjectReader;
import org.openscience.cdk.io.ReaderFactory;
import org.openscience.cdk.io.formats.IChemFormat;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MDLFormat;
import org.openscience.cdk.io.formats.MDLV2000Format;
import org.openscience.cdk.io.formats.MDLV3000Format;
import org.openscience.cdk.io.setting.BooleanIOSetting;
import org.openscience.cdk.io.setting.IOSetting;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

/**
 * Iterating MDL SDF reader. It allows to iterate over all molecules
 * in the SD file, without reading them into memory first. Suitable
 * for (very) large SDF files. For parsing the molecules in the
 * SD file, it uses the MDLV2000Reader or
 * MDLV3000Reader reader; it does not work
 * for SDF files with MDL formats prior to the V2000 format.
 *
 * 

Example use: *

 * File sdfFile = new File("../zinc-structures/ZINC_subset3_3D_charged_wH_maxmin1000.sdf");
 * IteratingMDLReader reader = new IteratingMDLReader(
 *   new FileInputStream(sdfFile), DefaultChemObjectBuilder.getInstance()
 * );
 * while (reader.hasNext()) {
 *   IAtomContainer molecule = (IAtomContainer)reader.next();
 * }
 * 
* * @cdk.module io * @cdk.githash * * @see org.openscience.cdk.io.MDLV2000Reader * @see org.openscience.cdk.io.MDLV3000Reader * * @author Egon Willighagen * @cdk.created 2003-10-19 * * @cdk.keyword file format, MDL molfile * @cdk.keyword file format, SDF * @cdk.iooptions */ public class IteratingSDFReader extends DefaultIteratingChemObjectReader { private BufferedReader input; private static ILoggingTool logger = LoggingToolFactory .createLoggingTool(IteratingSDFReader.class); private String currentLine; private IChemFormat currentFormat; private final ReaderFactory factory = new ReaderFactory(); private boolean nextAvailableIsKnown; private boolean hasNext; private IChemObjectBuilder builder; private IAtomContainer nextMolecule; private BooleanIOSetting forceReadAs3DCoords; // if an error is encountered the reader will skip over the error private boolean skip = false; // buffer to store pre-read Mol records in private StringBuffer buffer = new StringBuffer(10000); private static final String LINE_SEPARATOR = System.getProperty("line.separator"); // patterns to match private static Pattern MDL_VERSION = Pattern.compile("[vV](2000|3000)"); private static Pattern M_END = Pattern.compile("M\\s\\sEND"); private static Pattern SDF_RECORD_SEPARATOR = Pattern.compile("\\$\\$\\$\\$"); private static Pattern SDF_FIELD_START = Pattern.compile("\\A>\\s"); // map of MDL formats to their readers private final Map readerMap = new HashMap( 5); /** * Constructs a new IteratingMDLReader that can read Molecule from a given Reader. * * @param in The Reader to read from * @param builder The builder */ public IteratingSDFReader(Reader in, IChemObjectBuilder builder) { this(in, builder, false); } /** * Constructs a new IteratingMDLReader that can read Molecule from a given InputStream. 
* * @param in The InputStream to read from * @param builder The builder */ public IteratingSDFReader(InputStream in, IChemObjectBuilder builder) { this(new InputStreamReader(in), builder); } /** * Constructs a new IteratingMDLReader that can read Molecule from a given a * InputStream. This constructor allows specification of whether the reader will * skip 'null' molecules. If skip is set to false and a broken/corrupted molecule * is read the iterating reader will stop at the broken molecule. However if * skip is set to true then the reader will keep trying to read more molecules * until the end of the file is reached. * * @param in the {@link InputStream} to read from * @param builder builder to use * @param skip whether to skip null molecules */ public IteratingSDFReader(InputStream in, IChemObjectBuilder builder, boolean skip) { this(new InputStreamReader(in), builder, skip); } /** * Constructs a new IteratingMDLReader that can read Molecule from a given a * Reader. This constructor allows specification of whether the reader will * skip 'null' molecules. If skip is set to false and a broken/corrupted molecule * is read the iterating reader will stop at the broken molecule. However if * skip is set to true then the reader will keep trying to read more molecules * until the end of the file is reached. * * @param in the {@link Reader} to read from * @param builder builder to use * @param skip whether to skip null molecules */ public IteratingSDFReader(Reader in, IChemObjectBuilder builder, boolean skip) { this.builder = builder; setReader(in); initIOSettings(); setSkip(skip); } @Override public IResourceFormat getFormat() { return currentFormat; } /** * Method will return an appropriate reader for the provided format. Each reader is stored * in a map, if no reader is available for the specified format a new reader is created. 
The * {@see ISimpleChemObjectReadr#setErrorHandler(IChemObjectReaderErrorHandler)} and * {@see ISimpleChemObjectReadr#setReaderMode(DefaultIteratingChemObjectReader)} * methods are set. * * @param format The format to obtain a reader for * @return instance of a reader appropriate for the provided format */ private ISimpleChemObjectReader getReader(IChemFormat format) { // create a new reader if not mapped if (!readerMap.containsKey(format)) { ISimpleChemObjectReader reader = factory.createReader(format); reader.setErrorHandler(this.errorHandler); reader.setReaderMode(this.mode); if (currentFormat instanceof MDLV2000Format) { reader.addSettings(getSettings()); } readerMap.put(format, reader); } return readerMap.get(format); } /** * Returns true if another {@link IAtomContainer} can be read. */ @Override public boolean hasNext() { if (nextAvailableIsKnown) { return hasNext; } hasNext = false; nextMolecule = null; buffer.delete(0, buffer.length()); // now try to parse the next Molecule try { currentFormat = (IChemFormat) MDLFormat.getInstance(); while ((currentLine = input.readLine()) != null) { // still in a molecule buffer.append(currentLine).append(LINE_SEPARATOR); // do MDL molfile version checking Matcher versionMatcher = MDL_VERSION.matcher(currentLine); if (versionMatcher.find()) { currentFormat = "2000".equals(versionMatcher.group(1)) ? 
(IChemFormat) MDLV2000Format.getInstance() : (IChemFormat) MDLV3000Format.getInstance(); } // un-trimmed line has already been stored in buffer currentLine = currentLine.trim(); if (M_END.matcher(currentLine).matches()) { logger.debug("MDL file part read: ", buffer); IAtomContainer molecule = null; try { ISimpleChemObjectReader reader = getReader(currentFormat); InputStream byteStream = new ByteArrayInputStream(buffer.toString().getBytes("UTF-8")); reader.setReader(byteStream); molecule = (IAtomContainer) reader.read(builder.newInstance(IAtomContainer.class)); byteStream.close(); } catch (CDKException | IllegalArgumentException | IOException exception) { logger.error("Error while reading next molecule: " + exception.getMessage()); logger.debug(exception); } if (molecule != null) { readDataBlockInto(molecule); hasNext = true; nextAvailableIsKnown = true; nextMolecule = molecule; return true; } else if (skip) { // null molecule and skip = true, eat up the rest of the entry until '$$$$' String line; while ((line = input.readLine()) != null && !SDF_RECORD_SEPARATOR.matcher(line).matches()) { buffer.delete(0, buffer.length()); } } else { return false; } // empty the buffer buffer.delete(0, buffer.length()); } // found SDF record separator ($$$$) without parsing a molecule (separator is detected // in readDataBlockInto()) the buffer is cleared and the iterator continues reading if (SDF_RECORD_SEPARATOR.matcher(currentLine).matches()) { buffer.delete(0, buffer.length()); } } } catch (IOException exception) { logger.error("Error while reading next molecule: " + exception.getMessage()); logger.debug(exception); } // reached end of file return false; } private void readDataBlockInto(IAtomContainer m) throws IOException { String fieldName = null; while ((currentLine = input.readLine()) != null && !SDF_RECORD_SEPARATOR.matcher(currentLine).matches()) { logger.debug("looking for data header: ", currentLine); String str = currentLine; if (SDF_FIELD_START.matcher(str).find()) { 
fieldName = extractFieldName(fieldName, str); str = skipOtherFieldHeaderLines(str); String data = extractFieldData(str); if (fieldName != null) { logger.info("fieldName, data: ", fieldName, ", ", data); m.setProperty(fieldName, data); } } } } /** * Indicate whether the reader should skip over SDF records * that cause problems. If true the reader will fetch the next * molecule * @param skip ignore error molecules continue reading */ public void setSkip(boolean skip) { this.skip = skip; } private String extractFieldData(String str) throws IOException { StringBuilder data = new StringBuilder(); while (str.trim().length() > 0) { logger.debug("data line: ", currentLine); if (data.length() > 0) { str = System.getProperty("line.separator") + str; } data.append(str); currentLine = input.readLine(); str = currentLine.trim(); } return data.toString(); } private String skipOtherFieldHeaderLines(String str) throws IOException { while (str.startsWith("> ")) { logger.debug("data header line: ", currentLine); currentLine = input.readLine(); str = currentLine; } return str; } private String extractFieldName(String fieldName, String str) { int index = str.indexOf('<'); if (index != -1) { int index2 = str.substring(index).indexOf('>'); if (index2 != -1) { fieldName = str.substring(index + 1, index + index2); } } return fieldName; } /** * Returns the next {@link IAtomContainer}. 
*/ @Override public IAtomContainer next() { if (!nextAvailableIsKnown) { hasNext(); } nextAvailableIsKnown = false; if (!hasNext) { throw new NoSuchElementException(); } return nextMolecule; } @Override public void close() throws IOException { input.close(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void setReader(Reader reader) { if (reader instanceof BufferedReader) { input = (BufferedReader) reader; } else { input = new BufferedReader(reader); } nextMolecule = null; nextAvailableIsKnown = false; hasNext = false; } @Override public void setReader(InputStream reader) { setReader(new InputStreamReader(reader)); } private void initIOSettings() { forceReadAs3DCoords = new BooleanIOSetting("ForceReadAs3DCoordinates", IOSetting.Importance.LOW, "Should coordinates always be read as 3D?", "false"); addSetting(forceReadAs3DCoords); } public void customizeJob() { fireIOSettingQuestion(forceReadAs3DCoords); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy