org.ow2.weblab.service.gate.GateService Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-extraction Show documentation
Show all versions of gate-extraction Show documentation
GATE-based component that processes Text units to extract information using GATE's tools (such as grammars, gazetteers, tokenizers or POS taggers).
This project contains two versions: a simple component and a web service one.
/**
* WEBLAB: Service oriented integration platform for media mining and intelligence applications
*
* Copyright (C) 2004 - 2009 EADS DEFENCE AND SECURITY SYSTEMS
*
* This library is free software; you can redistribute it and/or modify it under the terms of
* the GNU Lesser General Public License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
package org.ow2.weblab.service.gate;
import gate.Corpus;
import gate.CorpusController;
import gate.Factory;
import gate.Gate;
import gate.creole.ConditionalSerialAnalyserController;
import gate.creole.ConditionalSerialController;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.SerialAnalyserController;
import gate.creole.SerialController;
import gate.persist.PersistenceException;
import gate.util.GateException;
import gate.util.persistence.PersistenceManager;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import javax.jws.WebService;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.core.io.ClassPathResource;
import org.weblab_project.core.exception.WebLabCheckedException;
import org.weblab_project.core.exception.WebLabUncheckedException;
import org.weblab_project.core.helper.RDFHelperFactory;
import org.weblab_project.core.helper.ResourceHelper;
import org.weblab_project.core.model.MediaUnit;
import org.weblab_project.core.model.Resource;
import org.weblab_project.core.model.text.Text;
import org.weblab_project.core.ontologies.DublinCore;
import org.weblab_project.core.util.ResourceUtil;
import org.weblab_project.services.analyser.Analyser;
import org.weblab_project.services.analyser.ProcessException;
import org.weblab_project.services.analyser.types.ProcessArgs;
import org.weblab_project.services.analyser.types.ProcessReturn;
import org.weblab_project.services.exception.WebLabException;
/**
 * Web service that runs a GATE pipeline over the {@link Text} units of a WebLab resource.
 *
 * GATE is initialised once per JVM. For each call to {@link #process(ProcessArgs)} a temporary
 * GATE corpus is built from the text units, a corpus controller (loaded from a gapp file chosen
 * according to the usage context) is executed on it, and the resulting annotations are linked
 * back to the WebLab texts.
 *
 * @author khelif, ymombrun
 */
@WebService(endpointInterface = "org.weblab_project.services.analyser.Analyser")
public class GateService implements Analyser {

    private static final String GATE_HOME_DIR = "gate";

    private static final String GATE_USER_FILE = "gate.xml";

    private static final String GATE_PLUGINS_DIR = "plugins";

    /*
     * TODO Check how we can handle the trouble with language and snowball porter stemmer.
     *
     * The trouble is that in snowball language codes are defined in the plugin, and they cannot be by passed.
     */
    private static final String GATE_LANGUAGE_FEATURE = "language";

    private static final String DEFAULT_GAPP_FILE = "default.gapp";

    public static final String DEFAULT_LANGUAGE = "en";

    /** Number of Gate Document creation failures after which the whole process call is aborted. */
    private static final int MAX_DOCUMENT_FAILURES = 3;

    /** Whether {@link Gate#init()} has already been called; guarded by the {@code GateService.class} lock. */
    private static boolean initialised = false;

    /** Logger named after the runtime class, so that subclasses keep logging under their own name. */
    private final Log log = LogFactory.getLog(this.getClass());

    private final File defaultGappFile;

    private final String defaultLanguage;

    /**
     * Default constructor using the {@code default.gapp} classpath resource as default gapp file, the
     * {@code gate/plugins} classpath directory as plugins directory and {@link #DEFAULT_LANGUAGE} as default language.
     *
     * @throws IOException
     *             If the default gapp file or the Gate home cannot be found in classpath resources
     */
    public GateService() throws IOException {
        this(new ClassPathResource(DEFAULT_GAPP_FILE).getFile());
    }

    /**
     * @param defaultGappFile
     *            The gapp file used when no specific one is configured for a usage context
     * @throws IOException
     *             If Gate home cannot be found in classpath resource
     */
    public GateService(final File defaultGappFile) throws IOException {
        this(defaultGappFile, new File(new ClassPathResource(GATE_HOME_DIR).getFile(), GATE_PLUGINS_DIR));
    }

    /**
     * @param defaultGappFile
     *            The gapp file used when no specific one is configured for a usage context
     * @param pluginsPath
     *            The Gate plugins directory
     * @throws IOException
     *             If Gate home cannot be found in classpath resource
     */
    public GateService(final File defaultGappFile, final File pluginsPath) throws IOException {
        this(defaultGappFile, pluginsPath, DEFAULT_LANGUAGE);
    }

    /**
     * @param defaultGappFile
     *            The gapp file used when no specific one is configured for a usage context
     * @param pluginsPath
     *            The Gate plugins directory
     * @param defaultLanguage
     *            The language set on Gate documents when neither the text nor the document declares one
     * @throws IOException
     *             If Gate home cannot be found in classpath resource
     */
    public GateService(final File defaultGappFile, final File pluginsPath, final String defaultLanguage) throws IOException {
        super();
        this.defaultGappFile = defaultGappFile;
        this.defaultLanguage = defaultLanguage;
        initGate(pluginsPath);
    }

    /**
     * Annotates every Text unit of the resource in args with the Gate pipeline of the usage context.
     *
     * @param args
     *            The wrapper of the resource to process and of the optional usage context
     * @return A wrapper containing the (annotated) input resource
     * @throws ProcessException
     *             If args are invalid, if Gate resources cannot be created or if the pipeline execution fails
     */
    @Override
    public ProcessReturn process(ProcessArgs args) throws ProcessException {
        this.log.debug("Early starting of Gate Analyser");

        // Tests the input parameters and get every Text section contained by the resource in args.
        final List<Text> texts = this.checkParameters(args);
        final Resource resource = args.getResource();

        this.log.info("Starting of resource '" + resource.getUri() + "' with Gate Analyser");

        // Instantiates an empty Gate Corpus
        final Corpus corpusGate;
        try {
            corpusGate = Factory.newCorpus(resource.getUri() + " " + System.nanoTime());
        } catch (final ResourceInstantiationException rie) {
            throw new ProcessException("Unable to instanciate new Corpus.", createWLE("E0", "Unexpected error"), rie);
        }

        final Map<gate.Document, Text> gateDocsAndText = new HashMap<gate.Document, Text>();
        try {
            this.loadDocuments(resource, texts, corpusGate, gateDocsAndText);

            final String usageContext;
            if (args.getUsageContext() != null) {
                usageContext = args.getUsageContext().getUri();
            } else {
                usageContext = null;
            }
            this.runPipeline(this.getController(usageContext), corpusGate);

            this.log.info("Starting annotation extraction");
            // Extract annotations of each Gate Document and add them to the WebLab Text.
            for (final Entry<gate.Document, Text> entry : gateDocsAndText.entrySet()) {
                this.extractAnnotations(entry.getKey(), entry.getValue());
            }
        } finally {
            // Whatever happened, Gate keeps a reference on every created resource until it is deleted.
            releaseGateResources(corpusGate, gateDocsAndText);
        }

        // Creates the return wrapper and add the resource in it.
        final ProcessReturn theRet = new ProcessReturn();
        theRet.setResource(resource);
        this.log.info("Resource '" + resource.getUri() + "' successfully processed with Gate Analyser");
        return theRet;
    }

    /**
     * Creates one Gate Document per Text, sets its language feature, adds it to the corpus and registers it in the map.
     *
     * The language is the dc:language of the text section; if none, the one of the whole resource; if none,
     * the default language of this service.
     *
     * @param resource
     *            The resource being processed
     * @param texts
     *            The text units to convert
     * @param corpusGate
     *            The corpus to fill
     * @param gateDocsAndText
     *            The map to fill with created documents and their originating text
     * @throws ProcessException
     *             If the creation of Gate documents failed {@value #MAX_DOCUMENT_FAILURES} times
     */
    private void loadDocuments(final Resource resource, final List<Text> texts, final Corpus corpusGate,
            final Map<gate.Document, Text> gateDocsAndText) throws ProcessException {
        // A RDF helper on the whole resource enables to read the dc:language of each text section.
        final ResourceHelper helper = RDFHelperFactory.getResourceHelper(resource);
        String docDefaultLanguage = GateService.getLanguage(helper, resource.getUri());
        if (docDefaultLanguage == null) {
            docDefaultLanguage = this.defaultLanguage;
        }

        int errorCount = 0;
        for (final Text text : texts) {
            this.log.debug("Text section to process by GateAnalyserComponent: " + text.getUri());
            this.log.debug("Number of segments before GateAnalyserComponent: " + text.getSegment().size());

            final gate.Document docGate;
            try {
                docGate = Factory.newDocument(text.getContent());
            } catch (final ResourceInstantiationException rie) {
                this.log.error("Unable to create a new Gate Document.", rie);
                errorCount++;
                if (errorCount < MAX_DOCUMENT_FAILURES) {
                    // Tolerates a few failing texts: the other ones are still processed.
                    continue;
                }
                throw new ProcessException("Fails three times to instanciate new Gate Document.", createWLE("E0", "Unexpected error"), rie);
            }

            final String language = GateService.getLanguage(helper, text.getUri());
            docGate.getFeatures().put(GATE_LANGUAGE_FEATURE, language != null ? language : docDefaultLanguage);

            // Add the Gate doc in the corpus and maps it to its WebLab Text.
            addGateDocumentToCorpus(corpusGate, docGate);
            gateDocsAndText.put(docGate, text);
        }
    }

    /**
     * Executes the controller on the corpus.
     *
     * CorpusController are not thread safe (they can process only one corpus at a time).
     * The execution is synchronized on the controller instance: calls sharing a controller are sequential,
     * calls using distinct controllers may run in parallel.
     *
     * @param controller
     *            The controller to execute
     * @param corpusGate
     *            The corpus to process
     * @throws ProcessException
     *             If the pipeline execution fails
     */
    private void runPipeline(final CorpusController controller, final Corpus corpusGate) throws ProcessException {
        synchronized (controller) {
            controller.setCorpus(corpusGate);
            try {
                controller.execute();
            } catch (final ExecutionException ee) {
                throw new ProcessException("Unable to process pipeline of corpus.", createWLE("E0", "Unexpected exception."), ee);
            } finally {
                // The controller is cached and reused: do not let it retain the corpus of this call.
                controller.setCorpus(null);
            }
        }
    }

    /**
     * Links the annotations of the Gate document to the WebLab text and, in debug, logs the resulting text.
     *
     * @param docGate
     *            The processed Gate document
     * @param text
     *            The WebLab text that was used to create docGate
     */
    private void extractAnnotations(final gate.Document docGate, final Text text) {
        GateHelper.linkGateAnnotsToText(text, docGate.getAnnotations());
        if (this.log.isDebugEnabled()) {
            this.log.debug("Number of segment after GateExtractionComponent: " + text.getSegment().size());
            try {
                this.log.debug(ResourceUtil.saveToXMLString(text));
            } catch (final WebLabCheckedException wlce) {
                this.log.warn("Unable to serialise to XML the resource: '" + text.getUri() + "'.", wlce);
            }
        }
    }

    /**
     * Removes from Gate every document registered in the map, then the corpus itself.
     *
     * @param corpusGate
     *            The corpus to delete
     * @param gateDocsAndText
     *            The map whose keys are the documents to delete
     */
    private static void releaseGateResources(final Corpus corpusGate, final Map<gate.Document, Text> gateDocsAndText) {
        for (final gate.Document docGate : gateDocsAndText.keySet()) {
            corpusGate.unloadDocument(docGate);
            Factory.deleteResource(docGate);
        }
        Factory.deleteResource(corpusGate);
    }

    /**
     * This method first check if a controller exists in configuration singleton for the given usageContext.
     * If not it creates a controller from the gapp file path in config (if exists) or from the default path.
     *
     * @param usageContext
     *            The URI of usageContext or null if not define
     * @return An existing corpus controller or a newly created one.
     * @throws ProcessException
     *             If the instantiation of the controller fails or if the gapp file contains an unsupported controller.
     */
    private synchronized CorpusController getController(final String usageContext) throws ProcessException {
        // Returns the controller from the configuration if it has already been loaded.
        final CorpusController cached = Configuration.getInstance().getController(usageContext);
        if (cached != null) {
            return cached;
        }

        // Else: loads the required controller from a gapp file
        final File gappFile = this.getGappFile(usageContext);
        final Object loaded;
        try {
            loaded = PersistenceManager.loadObjectFromFile(gappFile);
        } catch (final IOException ioe) {
            throw new ProcessException("Unable to load gapp file.", createWLE("E2", "Insufficient resources."), ioe);
        } catch (final PersistenceException pe) {
            throw new ProcessException("Unable to load gapp file.", createWLE("E0", "Unexpected exception."), pe);
        } catch (final ResourceInstantiationException rie) {
            throw new ProcessException("Unable to load gapp file.", createWLE("E0", "Unexpected exception."), rie);
        }

        /*
         * Only corpus controllers can be used by this service.
         * A serial analyser controller is used as is; a conditional controller is converted into a conditional
         * corpus controller. Anything else is rejected with an explicit error instead of a ClassCastException.
         */
        final CorpusController controller;
        if (loaded instanceof SerialAnalyserController) {
            controller = (SerialAnalyserController) loaded;
        } else if (loaded instanceof ConditionalSerialController) {
            controller = this.toConditionalCorpusController((ConditionalSerialController) loaded);
        } else {
            final String found = loaded == null ? "null" : loaded.getClass().getName();
            throw new ProcessException("Unable to load gapp file. '" + gappFile + "' contains an unsupported controller: " + found + ".",
                    createWLE("E0", "Unexpected exception."));
        }

        Configuration.getInstance().setController(usageContext, controller);
        return controller;
    }

    /**
     * Creates a conditional corpus controller holding the processing resources and running strategies of
     * csController, then deletes csController from Gate.
     *
     * @param csController
     *            The conditional controller loaded from the gapp file
     * @return The newly created conditional corpus controller
     * @throws ProcessException
     *             If the new controller cannot be instantiated
     */
    private CorpusController toConditionalCorpusController(final ConditionalSerialController csController) throws ProcessException {
        final String className = ConditionalSerialAnalyserController.class.getCanonicalName();
        final ConditionalSerialAnalyserController conditionalPipeline;
        try {
            conditionalPipeline = (ConditionalSerialAnalyserController) Factory.createResource(className);
        } catch (final ResourceInstantiationException rie) {
            throw new ProcessException("Unable to create a '" + className + "' when converting conditional pipeline.",
                    createWLE("E0", "Unexpected exception."), rie);
        }
        conditionalPipeline.setPRs(csController.getPRs());
        conditionalPipeline.setRunningStrategies(csController.getRunningStrategies());
        // Empties Gate from the useless Pipeline created.
        Factory.deleteResource(csController);
        return conditionalPipeline;
    }

    /**
     * @param helper
     *            A resource Helper at the document level
     * @param uri
     *            URI of the resource to extract language
     * @return The lower-cased first dc:language value of the resource, or null if there is none or if it is "unknown"
     */
    private static String getLanguage(final ResourceHelper helper, final String uri) {
        final List<String> languages = helper.getLitsOnPredSubj(uri, DublinCore.LANGUAGE_PROPERTY_NAME);
        if (languages == null || languages.isEmpty() || languages.get(0) == null) {
            return null;
        }
        // Locale.ROOT: language codes must not be lower-cased with the rules of the JVM default locale.
        final String language = languages.get(0).toLowerCase(Locale.ROOT);
        if ("unknown".equals(language)) {
            return null;
        }
        return language;
    }

    /**
     * It looks in configuration singleton if a gapp file path exists and returns it if any.
     * Else, returns the default gapp file.
     *
     * @param usageContext
     *            URI of the usageContext or null if not defined
     * @return The gapp file to be used to instantiate a controller for this usageContext.
     */
    private File getGappFile(final String usageContext) {
        final String gappFilePath = Configuration.getInstance().getGateApplicationStateFilePath(usageContext);
        if (gappFilePath != null) {
            return new File(gappFilePath);
        }
        return this.defaultGappFile;
    }

    /**
     * Isolates the unchecked call needed to add a document into a Gate corpus.
     *
     * @param corpusGate
     *            The Corpus
     * @param docGate
     *            The document to be added into corpusGate
     */
    @SuppressWarnings("unchecked")
    private void addGateDocumentToCorpus(Corpus corpusGate, final gate.Document docGate) {
        corpusGate.add(docGate);
    }

    /**
     * Configures and initialises Gate, the first time it is called only.
     *
     * The flag is static, so the lock is taken on the class: an instance level lock would let two service
     * instances initialise Gate concurrently.
     *
     * @param pluginsdir
     *            The Gate plugins directory
     * @throws IOException
     *             If Gate home cannot be found in classpath resource
     */
    private void initGate(final File pluginsdir) throws IOException {
        synchronized (GateService.class) {
            // Gate must be initialized only one time !
            if (initialised) {
                return;
            }
            final File gateHome = new ClassPathResource(GATE_HOME_DIR).getFile();
            this.log.debug("Here is the path of your application => " + gateHome.getAbsolutePath());
            Gate.setGateHome(gateHome);
            Gate.setUserConfigFile(new File(gateHome, GATE_USER_FILE));
            Gate.setPluginsHome(pluginsdir);
            Gate.setSiteConfigFile(new File(gateHome, GATE_USER_FILE));
            try {
                Gate.init();
            } catch (final GateException ge) {
                throw new WebLabUncheckedException("Unable to find initialise Gate.", ge);
            }
            GateHelper.init();
            initialised = true;
        }
    }

    /**
     * Validates args and collects the text units to process. Texts without content are skipped.
     *
     * @param args
     *            The ProcessArgs
     * @return The list of Text (having a content) contained by the Resource in args.
     * @throws ProcessException
     *             If args or its resource is null, or if the resource is not a MediaUnit.
     */
    protected List<Text> checkParameters(final ProcessArgs args) throws ProcessException {
        if (args == null) {
            throw new ProcessException("ProcessArgs was null.", this.createInvalidParameterWLE());
        }
        final Resource res = args.getResource();
        if (res == null) {
            throw new ProcessException("Resource in ProcessArg was null.", this.createInvalidParameterWLE());
        }
        if (!(res instanceof MediaUnit)) {
            throw new ProcessException("This service only process MediaUnit; Resource was a: " + res.getClass().getSimpleName() + ".",
                    this.createInvalidParameterWLE());
        }

        final List<Text> texts = new ArrayList<Text>();
        if (res instanceof Text) {
            texts.add((Text) res);
        } else {
            texts.addAll(ResourceUtil.getSelectedSubResources(res, Text.class));
        }

        // Removes through the iterator to avoid concurrent modification of the list.
        for (final ListIterator<Text> textIt = texts.listIterator(); textIt.hasNext();) {
            if (textIt.next().getContent() == null) {
                textIt.remove();
            }
        }
        return texts;
    }

    /**
     * @return A "E1" WebLabException
     */
    private WebLabException createInvalidParameterWLE() {
        return createWLE("E1", "Invalid parameter");
    }

    /**
     * @param errorId
     *            The error id to set
     * @param errorMessage
     *            The error message to set
     * @return A WebLabException holding the given id and message
     */
    private static WebLabException createWLE(final String errorId, final String errorMessage) {
        final WebLabException wle = new WebLabException();
        wle.setErrorId(errorId);
        wle.setErrorMessage(errorMessage);
        return wle;
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy