// net.bioclipse.managers.OscarManager - Maven / Gradle / Ivy
/* Copyright (c) 2011,2021 Egon Willighagen
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contact: http://www.bioclipse.net/
*/
package net.bioclipse.managers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.openscience.cdk.io.formats.CMLFormat;
import org.openscience.cdk.io.formats.IChemFormat;
import io.github.egonw.bacting.IBactingManager;
import net.bioclipse.core.business.BioclipseException;
import net.bioclipse.core.domain.IMolecule;
import net.htmlparser.jericho.Source;
import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ChemicalStructure;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ResolvedNamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
/**
 * Bioclipse manager that adds text mining functionality.
 *
 * <p>Named-entity recognition is delegated to an {@link Oscar} instance;
 * resolved structures are turned into molecules through a {@link CDKManager}.
 */
public class OscarManager implements IBactingManager {

    private final Oscar oscar = new Oscar();
    private final CDKManager cdk;

    /**
     * Creates a new {@link OscarManager}.
     *
     * @param workspaceRoot location of the workspace, e.g. "."
     */
    public OscarManager(String workspaceRoot) {
        this.cdk = new CDKManager(workspaceRoot);
    }

    /**
     * Extracts named entities from a text.
     *
     * @param text Text with named entities (chemical names).
     * @return List with the surface string of each named entity found,
     *         in the order OSCAR reported them.
     */
    public List<String> findNamedEntities(String text) {
        List<String> surfaces = new ArrayList<>();
        List<NamedEntity> entities = oscar.findNamedEntities(text);
        for (NamedEntity entity : entities) {
            surfaces.add(entity.getSurface());
        }
        return surfaces;
    }

    /**
     * Extracts molecules from a text for which the corresponding named
     * entities are recognized as chemicals.
     *
     * <p>Entities for which no CML structure is available are skipped.
     *
     * @param text Text with chemical names.
     * @return List of {@link IMolecule}s, one per entity with a CML structure.
     * @throws BioclipseException if a CML structure cannot be loaded as a
     *         molecule; the underlying exception is kept as the cause.
     */
    public List<IMolecule> findResolvedNamedEntities(String text)
            throws BioclipseException {
        List<IMolecule> mols = new ArrayList<>();
        List<ResolvedNamedEntity> entities =
            oscar.findAndResolveNamedEntities(text);
        for (ResolvedNamedEntity entity : entities) {
            ChemicalStructure structure =
                entity.getFirstChemicalStructure(FormatType.CML);
            if (structure == null) {
                continue; // no CML representation for this entity
            }
            try {
                // Encode explicitly as UTF-8 so the bytes handed to the CML
                // reader do not depend on the platform default charset.
                byte[] cml =
                    structure.getValue().getBytes(StandardCharsets.UTF_8);
                IMolecule mol = cdk.loadMolecule(
                    new ByteArrayInputStream(cml),
                    (IChemFormat) CMLFormat.getInstance()
                );
                mols.add(mol);
            } catch (BioclipseException | IOException e) {
                throw new BioclipseException(
                    "Error while creating an IMolecule for an " +
                    "extracted compound: " + e.getMessage(), e);
            }
        }
        return mols;
    }

    /**
     * Extracts plain text from a HTML document.
     *
     * @param html HTML string from which the text should be extracted.
     * @return The resulting text
     */
    public String extractText(String html) {
        Source source = new Source(html);
        return source.getTextExtractor().toString();
    }

    /**
     * Returns the name of this manager.
     *
     * @return the string {@code "oscar"}
     */
    @Override
    public String getManagerName() {
        return "oscar";
    }

    /**
     * Returns the DOIs associated with this manager.
     *
     * @return a new mutable list holding the single DOI
     *         {@code 10.1186/1758-2946-3-41}
     */
    @Override
    public List<String> doi() {
        List<String> dois = new ArrayList<>();
        dois.add("10.1186/1758-2946-3-41");
        return dois;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy