All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.bioclipse.managers.OscarManager Maven / Gradle / Ivy

There is a newer version: 1.0.3
Show newest version
/* Copyright (c) 2011,2021  Egon Willighagen 
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contact: http://www.bioclipse.net/
 */
package net.bioclipse.managers;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.openscience.cdk.io.formats.CMLFormat;
import org.openscience.cdk.io.formats.IChemFormat;

import io.github.egonw.bacting.IBactingManager;
import net.bioclipse.core.business.BioclipseException;
import net.bioclipse.core.domain.IMolecule;
import net.htmlparser.jericho.Source;
import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ChemicalStructure;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ResolvedNamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;

/**
 * Bioclipse manager that adds text mining functionality.
 *
 * <p>Named entity recognition is delegated to an {@link Oscar} instance, and
 * recognized structures are turned into {@link IMolecule}s through a
 * {@link CDKManager}.
 */
public class OscarManager implements IBactingManager {

    private final Oscar oscar = new Oscar();
    private final CDKManager cdk;

    /**
     * Creates a new {@link OscarManager}.
     *
     * @param workspaceRoot location of the workspace, e.g. "."; it is passed
     *                      on to the internally created {@link CDKManager}
     */
    public OscarManager(String workspaceRoot) {
        this.cdk = new CDKManager(workspaceRoot);
    }

    /**
     * Extracts named entities from a text.
     *
     * @param text Text with named entities (chemical names).
     * @return List with the surface string of every named entity found,
     *         in the order reported by OSCAR.
     */
    public List<String> findNamedEntities(String text) {
        List<String> surfaces = new ArrayList<String>();
        List<NamedEntity> entities = oscar.findNamedEntities(text);
        for (NamedEntity entity : entities) {
            surfaces.add(entity.getSurface());
        }
        return surfaces;
    }

    /**
     * Extracts molecules from a text for which the corresponding named
     * entities are recognized as chemicals.
     *
     * <p>Entities for which OSCAR provides no CML structure are skipped.
     *
     * @param text Text with chemical names.
     * @return List of {@link IMolecule}s, one per entity with a CML structure.
     * @throws BioclipseException when a CML structure cannot be converted
     *                            into an {@link IMolecule}
     */
    public List<IMolecule> findResolvedNamedEntities(String text)
            throws BioclipseException {
        List<IMolecule> mols = new ArrayList<IMolecule>();
        List<ResolvedNamedEntity> entities =
            oscar.findAndResolveNamedEntities(text);
        for (ResolvedNamedEntity entity : entities) {
            ChemicalStructure structure =
                entity.getFirstChemicalStructure(FormatType.CML);
            if (structure != null) {
                mols.add(loadCmlStructure(structure));
            }
        }
        return mols;
    }

    /**
     * Parses the CML content of a resolved structure into an {@link IMolecule}.
     */
    private IMolecule loadCmlStructure(ChemicalStructure structure)
            throws BioclipseException {
        // CML is XML, which is read as UTF-8 unless it declares otherwise.
        // Encode explicitly instead of relying on the platform default
        // charset, which would corrupt non-ASCII content on some systems.
        byte[] cml = structure.getValue().getBytes(StandardCharsets.UTF_8);
        try {
            return cdk.loadMolecule(
                new ByteArrayInputStream(cml),
                (IChemFormat) CMLFormat.getInstance()
            );
        } catch (BioclipseException e) {
            throw asLoadFailure(e);
        } catch (IOException e) {
            throw asLoadFailure(e);
        }
    }

    /**
     * Wraps a molecule loading failure, keeping the original exception as cause.
     */
    private static BioclipseException asLoadFailure(Exception cause) {
        return new BioclipseException(
            "Error while creating an IMolecule for an " +
            "extracted compound: " + cause.getMessage(), cause);
    }

    /**
     * Extracts plain text from a HTML document.
     *
     * @param html HTML string from which the text should be extracted.
     * @return The resulting text
     */
    public String extractText(String html) {
        Source source = new Source(html);
        return source.getTextExtractor().toString();
    }

    /**
     * Returns the name of this manager.
     *
     * @return the string "oscar"
     */
    @Override
    public String getManagerName() {
        return "oscar";
    }

    /**
     * Returns the DOIs associated with this manager.
     *
     * @return a new list holding a single DOI string
     */
    @Override
    public List<String> doi() {
        List<String> dois = new ArrayList<String>();
        dois.add("10.1186/1758-2946-3-41");
        return dois;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy