gate.creole.splitter.SentenceSplitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
The newest version!
/*
* Copyright (c) 1995-2011, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Valentin Tablan, 01 Feb 2000
*
* $Id: SentenceSplitter.java 19742 2016-11-16 17:58:23Z markagreenwood $
*/
package gate.creole.splitter;
import java.net.URISyntaxException;
import java.net.URL;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.Transducer;
import gate.creole.gazetteer.DefaultGazetteer;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.event.ProgressListener;
import gate.event.StatusListener;
import gate.util.Benchmark;
import gate.util.Benchmarkable;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
/**
 * A sentence splitter. This processing resource wraps a gazetteer and a JAPE
 * transducer, runs them in sequence over the document, and then does some
 * minor post-processing so that the resulting sentence annotations end up in
 * the requested output annotation set. The input annotation set is expected
 * to already contain 'Token' annotations.
 * <p>
 * This class exists so that there can be a separate CREOLE entry describing
 * the default resources (gazetteer lists and grammar) used for splitting.
 */
@CreoleResource(name="ANNIE Sentence Splitter", comment="ANNIE sentence splitter.", helpURL="http://gate.ac.uk/userguide/sec:annie:splitter", icon="sentence-splitter")
public class SentenceSplitter extends AbstractLanguageAnalyser implements Benchmarkable {

  private static final long serialVersionUID = -5335682060379173111L;

  /** Name of the "document" parameter. */
  public static final String
    SPLIT_DOCUMENT_PARAMETER_NAME = "document";

  /** Name of the input annotation set parameter. */
  public static final String
    SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";

  /** Name of the output annotation set parameter. */
  public static final String
    SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";

  /** Name of the encoding parameter. */
  public static final String
    SPLIT_ENCODING_PARAMETER_NAME = "encoding";

  /** Name of the gazetteer lists URL parameter. */
  public static final String
    SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";

  /** Name of the transducer grammar URL parameter. */
  public static final String
    SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";

  /** Explicitly set benchmark id; when null the resource name is used. */
  private String benchmarkId;

  /** The wrapped gazetteer; created (or re-initialised) by {@link #init()}. */
  DefaultGazetteer gazetteer;

  /** The wrapped JAPE transducer; created (or re-initialised) by {@link #init()}. */
  AbstractLanguageAnalyser transducer;

  private ResourceReference transducerURL;
  private String encoding;
  private ResourceReference gazetteerListsURL;
  private String inputASName;
  private String outputASName;

  /**
   * Creates the internal gazetteer and transducer, or re-initialises them
   * with the current parameter values if they already exist.
   *
   * @return this resource
   * @throws ResourceInstantiationException if a sub-component cannot be
   *         created or re-initialised
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    // --- gazetteer ---
    FeatureMap params = Factory.newFeatureMap();
    if(gazetteerListsURL != null) {
      params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
              gazetteerListsURL);
    }
    params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);

    if(gazetteer != null) {
      // already created by an earlier init(): just push the new parameters
      gazetteer.setParameterValues(params);
      gazetteer.reInit();
    } else {
      fireStatusChanged("Creating the gazetteer");
      FeatureMap features = Factory.newFeatureMap();
      Gate.setHiddenAttribute(features, true);
      gazetteer = (DefaultGazetteer)Factory.createResource(
              "gate.creole.gazetteer.DefaultGazetteer",
              params, features);
      gazetteer.setName("Gazetteer " + System.currentTimeMillis());
    }
    fireProgressChanged(10);

    // --- transducer ---
    params = Factory.newFeatureMap();
    if(transducerURL != null) {
      params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
    }
    params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);

    if(transducer != null) {
      // already created by an earlier init(): just push the new parameters
      transducer.setParameterValues(params);
      transducer.reInit();
    } else {
      fireStatusChanged("Creating the JAPE transducer");
      FeatureMap features = Factory.newFeatureMap();
      Gate.setHiddenAttribute(features, true);
      transducer = (AbstractLanguageAnalyser)Factory.createResource(
              "gate.creole.Transducer",
              params, features);
      transducer.setName("Transducer " + System.currentTimeMillis());
    }

    fireProgressChanged(100);
    fireProcessFinished();
    return this;
  }

  /**
   * Deletes the internal gazetteer and transducer. Components that were never
   * created (e.g. because {@link #init()} failed part-way or was never
   * called) are skipped rather than being passed on as null.
   */
  @Override
  public void cleanup() {
    if(gazetteer != null) {
      Factory.deleteResource(gazetteer);
    }
    if(transducer != null) {
      Factory.deleteResource(transducer);
    }
  }

  /**
   * Runs the gazetteer and then the transducer over the current document and
   * makes sure the output annotation set ends up with sentence annotations.
   * <p>
   * Empty annotation set names are normalised to null (the default set)
   * before running.
   *
   * @throws ExecutionException if a sub-component fails, if its parameters
   *         cannot be set, or if execution was interrupted after the
   *         gazetteer stage
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;

    // an empty name means "the default annotation set"
    if(inputASName != null && inputASName.equals("")) inputASName = null;
    if(outputASName != null && outputASName.equals("")) outputASName = null;

    fireProgressChanged(5);

    // forwards status messages of the sub-components to our own listeners
    StatusListener sListener = new StatusListener() {
      @Override
      public void statusChanged(String text) {
        fireStatusChanged(text);
      }
    };

    runGazetteer(sListener);

    if(isInterrupted()) {
      throw new ExecutionInterruptedException("The execution of the \""
              + getName()
              + "\" sentence splitter has been abruptly interrupted!");
    }

    runTransducer(sListener);

    collectSentences();

    fireProcessFinished();
  }//execute()

  /**
   * Runs the gazetteer over the current document, using the input annotation
   * set, and detaches the document and listeners from it afterwards.
   */
  private void runGazetteer(StatusListener sListener) throws ExecutionException {
    // presumably rescales the gazetteer's progress into the 5..10 range of
    // this resource's own progress -- IntervalProgressListener is declared
    // outside this file
    ProgressListener pListener = new IntervalProgressListener(5, 10);
    try {
      FeatureMap params = Factory.newFeatureMap();
      params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
      params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
      gazetteer.setParameterValues(params);
      gazetteer.addProgressListener(pListener);
      gazetteer.addStatusListener(sListener);
      gazetteer.execute();
    } catch(ResourceInstantiationException e) {
      throw new ExecutionException(e);
    } finally {
      // always release the document and listeners, even on failure
      gazetteer.setDocument(null);
      gazetteer.removeProgressListener(pListener);
      gazetteer.removeStatusListener(sListener);
    }
  }

  /**
   * Runs the transducer (with benchmarking) over the current document and
   * detaches the document and listeners from it afterwards.
   */
  private void runTransducer(StatusListener sListener) throws ExecutionException {
    ProgressListener pListener = new IntervalProgressListener(11, 90);
    try {
      FeatureMap params = Factory.newFeatureMap();
      params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
      params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
      // the transducer writes into the *input* set; the sentence annotations
      // are copied over to the output set afterwards (see collectSentences)
      params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
      transducer.setParameterValues(params);
      transducer.addProgressListener(pListener);
      transducer.addStatusListener(sListener);
      Benchmark.executeWithBenchmarking(transducer,
              Benchmark.createBenchmarkId("SentenceSplitterTransducer",
                      getBenchmarkId()), this, null);
    } catch(ResourceInstantiationException e) {
      throw new ExecutionException(e);
    } finally {
      // always release the document and listeners, even on failure
      transducer.setDocument(null);
      transducer.removeProgressListener(pListener);
      transducer.removeStatusListener(sListener);
    }
  }

  /**
   * Copies the sentence annotations to the output set (when it differs from
   * the input set) and guarantees coverage: one sentence spanning the whole
   * document if none were found, otherwise an extra sentence over any tokens
   * located after the last sentence.
   */
  private void collectSentences() throws ExecutionException {
    AnnotationSet inputAS = (inputASName == null)
            ? document.getAnnotations()
            : document.getAnnotations(inputASName);
    AnnotationSet outputAS = (outputASName == null)
            ? document.getAnnotations()
            : document.getAnnotations(outputASName);

    // copy the results to the output set if they are different
    if(inputAS != outputAS) {
      outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
    }

    AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
    if(sentences == null || sentences.isEmpty()) {
      // no sentences at all: create one covering the entire content
      try {
        outputAS.add(0L, document.getContent().size(),
                SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
      } catch(InvalidOffsetException ioe) {
        throw new GateRuntimeException(ioe);
      }
      return;
    }

    // An empty input set has no last node and no tokens; this can happen when
    // the output set already held sentences. Nothing is left to cover then.
    if(inputAS.isEmpty()) {
      return;
    }

    // add a sentence covering all the tokens after the last sentence
    Long endSentences = sentences.lastNode().getOffset();
    AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE,
            endSentences, inputAS.lastNode().getOffset());
    if(remainingTokens != null && !remainingTokens.isEmpty()) {
      try {
        outputAS.add(remainingTokens.firstNode().getOffset(),
                remainingTokens.lastNode().getOffset(),
                SENTENCE_ANNOTATION_TYPE,
                Factory.newFeatureMap());
      } catch(InvalidOffsetException ioe) {
        throw new ExecutionException(ioe);
      }
    }
  }

  /**
   * Flags this resource as interrupted and forwards the interrupt request to
   * the internal gazetteer and transducer, if they have been created.
   */
  @Override
  public synchronized void interrupt() {
    interrupted = true;
    if(gazetteer != null) {
      gazetteer.interrupt();
    }
    if(transducer != null) {
      transducer.interrupt();
    }
  }

  /**
   * Sets the location of the JAPE grammar used by the transducer.
   *
   * @param newTransducerURL reference to the grammar file
   */
  @Optional
  @CreoleParameter(defaultValue="resources/sentenceSplitter/grammar/main-single-nl.jape", comment="The URL to the custom Jape grammar file", suffixes="jape")
  public void setTransducerURL(ResourceReference newTransducerURL) {
    transducerURL = newTransducerURL;
  }

  /**
   * Sets the location of the JAPE grammar from a plain URL.
   *
   * @param newTransducerURL URL of the grammar file
   * @throws RuntimeException if the URL cannot be converted to a
   *         {@link ResourceReference}
   * @deprecated use {@link #setTransducerURL(ResourceReference)} instead
   */
  @Deprecated
  public void setTransducerURL(URL newTransducerURL) {
    try {
      this.setTransducerURL(new ResourceReference(newTransducerURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /** @return the reference to the JAPE grammar used by the transducer */
  public ResourceReference getTransducerURL() {
    return transducerURL;
  }

  /**
   * Sets the encoding passed to both the gazetteer and the transducer when
   * they are created or re-initialised.
   *
   * @param newEncoding the encoding name
   */
  @CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8")
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }

  /** @return the encoding used for reading the definition files */
  public String getEncoding() {
    return encoding;
  }

  /**
   * Sets the location of the gazetteer list definition file.
   *
   * @param newGazetteerListsURL reference to the lists definition file
   */
  @Optional
  @CreoleParameter(defaultValue="resources/sentenceSplitter/gazetteer/lists.def", comment="The URL to the custom list lookup definition file", suffixes="def")
  public void setGazetteerListsURL(ResourceReference newGazetteerListsURL) {
    gazetteerListsURL = newGazetteerListsURL;
  }

  /**
   * Sets the location of the gazetteer list definition file from a plain URL.
   *
   * @param newGazetteerListsURL URL of the lists definition file
   * @throws RuntimeException if the URL cannot be converted to a
   *         {@link ResourceReference}
   * @deprecated use {@link #setGazetteerListsURL(ResourceReference)} instead
   */
  @Deprecated
  public void setGazetteerListsURL(URL newGazetteerListsURL) {
    try {
      this.setGazetteerListsURL(new ResourceReference(newGazetteerListsURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /** @return the reference to the gazetteer list definition file */
  public ResourceReference getGazetteerListsURL() {
    return gazetteerListsURL;
  }

  /**
   * Sets the name of the annotation set used as input.
   *
   * @param newInputASName the set name; null (or empty, which is normalised
   *        to null on execution) selects the default set
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used as input that must contain 'Token' annotations")
  public void setInputASName(String newInputASName) {
    inputASName = newInputASName;
  }

  /** @return the name of the input annotation set, or null for the default */
  public String getInputASName() {
    return inputASName;
  }

  /**
   * Sets the name of the annotation set used as output.
   *
   * @param newOutputASName the set name; null (or empty, which is normalised
   *        to null on execution) selects the default set
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used as output for 'Sentence' and 'Split' annotations")
  public void setOutputASName(String newOutputASName) {
    outputASName = newOutputASName;
  }

  /** @return the name of the output annotation set, or null for the default */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Returns the benchmark id, falling back to the resource name when none
   * has been set explicitly.
   *
   * @see gate.util.Benchmarkable#getBenchmarkId()
   */
  @Override
  public String getBenchmarkId() {
    return (benchmarkId == null) ? getName() : benchmarkId;
  }

  /**
   * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
   */
  @Override
  public void setBenchmarkId(String benchmarkId) {
    this.benchmarkId = benchmarkId;
  }
}//public class SentenceSplitter extends AbstractLanguageAnalyser
© 2015 - 2024 Weber Informatics LLC | Privacy Policy