gate.creole.splitter.RegexSentenceSplitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
The newest version!
/*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Valentin Tablan, 04 Sep 2007
*
* $Id: RegexSentenceSplitter.java 19742 2016-11-16 17:58:23Z markagreenwood $
*/
package gate.creole.splitter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
/**
 * A fast sentence splitter replacement based on regular expressions.
 * <p>
 * {@link #init()} compiles three pattern lists, each read from a text file
 * holding one regular expression per line:
 * <ul>
 * <li><b>internal splits</b> - the matched text is part of the sentence it
 * terminates (the sentence annotation extends to the end of the match);</li>
 * <li><b>external splits</b> - the matched text lies outside the sentence
 * (the sentence annotation ends before the match starts);</li>
 * <li><b>non-splits</b> - regions of text in which any overlapping candidate
 * split is vetoed.</li>
 * </ul>
 * {@link #execute()} adds <code>Split</code> annotations (with a
 * <code>kind</code> feature of <code>internal</code> or <code>external</code>)
 * and {@link ANNIEConstants#SENTENCE_ANNOTATION_TYPE} annotations to the
 * output annotation set.
 */
@CreoleResource(name="RegEx Sentence Splitter", icon="sentence-splitter", comment="A sentence splitter based on regular expressions.", helpURL="http://gate.ac.uk/userguide/sec:annie:regex-splitter")
public class RegexSentenceSplitter extends AbstractLanguageAnalyser {

  /**
   * Parameter name for the document to process.
   */
  public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";

  /**
   * Parameter name for the input annotation set.
   * NOTE(review): this class declares no matching setter; the constant is
   * presumably kept for compatibility with other splitters - confirm.
   */
  public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";

  /**
   * Parameter name for the output annotation set
   * (see {@link #setOutputASName(String)}).
   */
  public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";

  /**
   * Parameter name for the encoding of the pattern files
   * (see {@link #setEncoding(String)}).
   */
  public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";

  /**
   * Parameter name for a split list URL.
   * NOTE(review): the setters visible in this class are named
   * <code>internalSplitListURL</code> and <code>externalSplitListURL</code>;
   * this value looks stale but is left untouched because it is public API.
   */
  public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";

  /**
   * Parameter name for the non-split list URL
   * (see {@link #setNonSplitListURL(ResourceReference)}).
   */
  public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";

  /**
   * serialisation ID
   */
  private static final long serialVersionUID = 1L;

  /**
   * A regular expression that can never match (an empty negative lookahead).
   * Used when a pattern file contains no patterns at all, because an empty
   * regular expression would instead match at every offset of the text.
   */
  private static final String NEVER_MATCHING_REGEX = "(?!)";

  /**
   * Output annotation set name.
   */
  protected String outputASName;

  /**
   * Encoding used when reading config files
   */
  protected String encoding;

  /**
   * URL pointing to a file with regex patterns for internal sentence splits.
   */
  protected ResourceReference internalSplitListURL;

  /**
   * URL pointing to a file with regex patterns for external sentence splits.
   */
  protected ResourceReference externalSplitListURL;

  /**
   * URL pointing to a file with regex patterns for non sentence splits.
   */
  protected ResourceReference nonSplitListURL;

  /** Compiled alternation of all internal split patterns; set by init(). */
  protected Pattern internalSplitsPattern;

  /** Compiled alternation of all external split patterns; set by init(). */
  protected Pattern externalSplitsPattern;

  /** Compiled alternation of all non-split patterns; set by init(). */
  protected Pattern nonSplitsPattern;

  /**
   * Reads a list of regular expressions (one per line) and compiles them into
   * a single pattern that matches any one of them.
   * <p>
   * Lines are trimmed; empty lines and lines starting with <code>//</code>
   * are ignored. Every remaining line is wrapped in a non-capturing group and
   * the groups are joined with <code>|</code>. If the file contributes no
   * patterns, the returned pattern never matches anything.
   *
   * @param paternsListUrl the location of the pattern list
   * @param encoding the character encoding used to decode the list
   * @return the compiled pattern
   * @throws UnsupportedEncodingException if the encoding is not supported
   * @throws IOException if the list cannot be opened or read
   */
  protected Pattern compilePattern(URL paternsListUrl, String encoding)
          throws UnsupportedEncodingException, IOException {
    StringBuilder patternString = new StringBuilder();
    // the raw stream is a resource in its own right so that it still gets
    // closed when the reader's constructor rejects the encoding
    try (InputStream stream = paternsListUrl.openStream();
         BufferedReader reader =
                 new BomStrippingInputStreamReader(stream, encoding)) {
      String line;
      while((line = reader.readLine()) != null) {
        line = line.trim();
        if(line.isEmpty() || line.startsWith("//")) {
          // ignore empty lines and comments
          continue;
        }
        if(patternString.length() > 0) patternString.append('|');
        patternString.append("(?:").append(line).append(')');
      }
    }
    if(patternString.length() == 0) {
      // an empty regex would match (with zero length) at every offset
      return Pattern.compile(NEVER_MATCHING_REGEX);
    }
    return Pattern.compile(patternString.toString());
  }

  /**
   * A comparator for MatchResult objects. This is used to find the next match
   * result in a text. A null value is used to signify that no more matches are
   * available, hence nulls are the largest value, according to this comparator.
   * @author Valentin Tablan (valyt)
   */
  private static class MatchResultComparator
          implements Comparator<MatchResult> {
    /* (non-Javadoc)
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    @Override
    public int compare(MatchResult o1, MatchResult o2) {
      if(o1 == null && o2 == null) return 0;
      if(o1 == null) return 1;
      if(o2 == null) return -1;
      //at this point both match results are not null
      return Integer.compare(o1.start(), o2.start());
    }
  }

  /**
   * Splits the current document into sentences.
   * <p>
   * Internal and external split matches are consumed in ascending order of
   * their start offset. Each match that is not vetoed by a non-split region
   * produces a <code>Split</code> annotation and, if there is text between
   * the end of the previous sentence and the split, a sentence annotation.
   * Documents that are empty or contain only whitespace are left untouched.
   *
   * @throws ExecutionException if no document has been set, if an annotation
   * cannot be created, or if the matching process reaches an invalid state
   */
  @Override
  public void execute() throws ExecutionException {
    if(document == null) {
      throw new ExecutionException("No document to process!");
    }
    interrupted = false;
    int lastProgress = 0;
    fireProgressChanged(lastProgress);
    //get pointers to the annotation sets
    AnnotationSet outputAS = (outputASName == null ||
            outputASName.trim().length() == 0) ?
            document.getAnnotations() :
            document.getAnnotations(outputASName);

    String docText = document.getContent().toString();

    /* If the document's content is empty or contains only whitespace,
     * we drop out right here, since there's nothing to sentence-split. */
    if(docText.trim().length() < 1) {
      return;
    }

    Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText);
    Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText);
    Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText);

    //store all non split locations in a list of {start, end} pairs; a linked
    //list is used because veto() removes exhausted regions from the front
    List<int[]> nonSplits = new LinkedList<>();
    while(nonSplitMatcher.find()) {
      nonSplits.add(new int[]{nonSplitMatcher.start(), nonSplitMatcher.end()});
    }

    //this list holds the next pending match of each split matcher
    List<MatchResult> nextSplitMatches = new ArrayList<>(2);
    //initialise matching process
    MatchResult internalMatchResult = null;
    if(internalSplitMatcher.find()) {
      internalMatchResult = internalSplitMatcher.toMatchResult();
      nextSplitMatches.add(internalMatchResult);
    }
    MatchResult externalMatchResult = null;
    if(externalSplitMatcher.find()) {
      externalMatchResult = externalSplitMatcher.toMatchResult();
      nextSplitMatches.add(externalMatchResult);
    }

    MatchResultComparator comparator = new MatchResultComparator();
    int lastSentenceEnd = 0;

    while(!nextSplitMatches.isEmpty()) {
      //see which one matches first
      Collections.sort(nextSplitMatches, comparator);
      MatchResult nextMatch = nextSplitMatches.remove(0);
      try {
        if(nextMatch == internalMatchResult) {
          //we have a new internal split; see if it's vetoed or not
          if(!veto(nextMatch, nonSplits)) {
            lastSentenceEnd = addInternalSplit(outputAS, docText, nextMatch,
                    lastSentenceEnd);
          }
          //prepare for next step
          if(internalSplitMatcher.find()) {
            internalMatchResult = internalSplitMatcher.toMatchResult();
            nextSplitMatches.add(internalMatchResult);
          } else {
            internalMatchResult = null;
          }
        } else if(nextMatch == externalMatchResult) {
          //we have a new external split; see if it's vetoed or not
          if(!veto(nextMatch, nonSplits)) {
            lastSentenceEnd = addExternalSplit(outputAS, docText, nextMatch,
                    lastSentenceEnd);
          }
          //prepare for next step
          if(externalSplitMatcher.find()) {
            externalMatchResult = externalSplitMatcher.toMatchResult();
            nextSplitMatches.add(externalMatchResult);
          } else {
            externalMatchResult = null;
          }
        } else {
          //malfunction
          throw new ExecutionException("Invalid state - cannot identify match!");
        }
      } catch(InvalidOffsetException e) {
        // this should never happen
        throw new ExecutionException(e);
      }
      //report progress; computed as long so that 100 * offset cannot overflow
      int newProgress = (int)(100L * lastSentenceEnd / docText.length());
      if(newProgress - lastProgress > 20) {
        lastProgress = newProgress;
        fireProgressChanged(lastProgress);
      }
    }//while(!nextSplitMatches.isEmpty())
    fireProcessFinished();
  }

  /**
   * Records an accepted internal split: adds the <code>Split</code>
   * annotation and, if there is text before the split, a sentence that
   * includes the split text.
   *
   * @param outputAS the annotation set to add to
   * @param docText the document text
   * @param split the accepted split match
   * @param lastSentenceEnd the offset where the previous sentence ended
   * @return the new end-of-last-sentence offset (the end of the split)
   * @throws InvalidOffsetException if an annotation cannot be created
   */
  private int addInternalSplit(AnnotationSet outputAS, String docText,
          MatchResult split, int lastSentenceEnd)
          throws InvalidOffsetException {
    addSplitAnnotation(outputAS, split, "internal");
    int endOffset = split.end();
    //find the first non whitespace character starting from where the
    //last sentence ended
    int sentenceStart = lastSentenceEnd;
    while(sentenceStart < endOffset &&
          Character.isWhitespace(
                  Character.codePointAt(docText, sentenceStart))) {
      sentenceStart++;
    }
    //if there is any useful text before the split, generate a new sentence
    if(sentenceStart < split.start()) {
      addSentenceAnnotation(outputAS, sentenceStart, endOffset);
    }
    return endOffset;
  }

  /**
   * Records an accepted external split: adds the <code>Split</code>
   * annotation and, if there is text before the split, a sentence that ends
   * before the split starts.
   * <p>
   * NOTE(review): trimming here uses {@link Character#isSpaceChar(int)} while
   * the internal-split case uses {@link Character#isWhitespace(int)}; the
   * difference is preserved from the original code - confirm it is intended.
   *
   * @param outputAS the annotation set to add to
   * @param docText the document text
   * @param split the accepted split match
   * @param lastSentenceEnd the offset where the previous sentence ended
   * @return the new end-of-last-sentence offset (the end of the split)
   * @throws InvalidOffsetException if an annotation cannot be created
   */
  private int addExternalSplit(AnnotationSet outputAS, String docText,
          MatchResult split, int lastSentenceEnd)
          throws InvalidOffsetException {
    addSplitAnnotation(outputAS, split, "external");
    //find the last non space character, going backward from where the
    //external split starts
    int sentenceEnd = split.start();
    while(sentenceEnd > lastSentenceEnd &&
          Character.isSpaceChar(
                  Character.codePointAt(docText, sentenceEnd - 1))) {
      sentenceEnd--;
    }
    //find the first non space character starting from where the last
    //sentence ended
    int sentenceStart = lastSentenceEnd;
    while(sentenceStart < sentenceEnd &&
          Character.isSpaceChar(
                  Character.codePointAt(docText, sentenceStart))) {
      sentenceStart++;
    }
    //if there is any useful text between the two offsets, generate
    //a new sentence
    if(sentenceStart < sentenceEnd) {
      addSentenceAnnotation(outputAS, sentenceStart, sentenceEnd);
    }
    return split.end();
  }

  /**
   * Adds a <code>Split</code> annotation covering the given match.
   *
   * @param outputAS the annotation set to add to
   * @param split the match to annotate
   * @param kind the value of the <code>kind</code> feature
   * @throws InvalidOffsetException if the annotation cannot be created
   */
  private void addSplitAnnotation(AnnotationSet outputAS, MatchResult split,
          String kind) throws InvalidOffsetException {
    FeatureMap features = Factory.newFeatureMap();
    features.put("kind", kind);
    outputAS.add(Long.valueOf(split.start()), Long.valueOf(split.end()),
            "Split", features);
  }

  /**
   * Adds a sentence annotation with an empty feature map.
   *
   * @param outputAS the annotation set to add to
   * @param start the start offset of the sentence
   * @param end the end offset of the sentence
   * @throws InvalidOffsetException if the annotation cannot be created
   */
  private void addSentenceAnnotation(AnnotationSet outputAS, int start,
          int end) throws InvalidOffsetException {
    outputAS.add(Long.valueOf(start), Long.valueOf(end),
            ANNIEConstants.SENTENCE_ANNOTATION_TYPE,
            Factory.newFeatureMap());
  }

  /**
   * Checks whether a possible match is being vetoed by a non split match. A
   * possible match is vetoed if it has any overlap with a veto region.
   *
   * @param split the match result representing the split to be tested
   * @param vetoRegions regions where matches are not allowed, as
   * <code>{start, end}</code> pairs. For efficiency reasons, this method
   * assumes these regions to be non overlapping and sorted in ascending order.
   * All veto regions that end before the proposed match are removed from the
   * list (again for efficiency reasons). This requires the proposed matches
   * to be sent to this method in ascending order, so as to avoid malfunctions.
   * @return true iff the proposed split should be ignored
   */
  private boolean veto(MatchResult split, List<int[]> vetoRegions) {
    for(Iterator<int[]> vetoRegIter = vetoRegions.iterator();
        vetoRegIter.hasNext();) {
      int[] aVetoRegion = vetoRegIter.next();
      if(aVetoRegion[1] - 1 < split.start()) {
        //current veto region ends before the proposed split starts
        //--> discard the veto region
        vetoRegIter.remove();
      } else if(split.end() - 1 < aVetoRegion[0]) {
        //veto region starts after the split ends
        //-> we can return false
        return false;
      } else {
        //we have overlap
        return true;
      }
    }
    //if we got this far, all veto regions are before the split
    //(or there were none left), so the split is accepted
    return false;
  }

  /**
   * Validates the init-time parameters and compiles the three pattern lists.
   *
   * @return this resource
   * @throws ResourceInstantiationException if a required parameter is missing
   * or one of the pattern lists cannot be read
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    super.init();
    //sanity checks
    if(internalSplitListURL == null)
      throw new ResourceInstantiationException("No list of internal splits provided!");
    if(externalSplitListURL == null)
      throw new ResourceInstantiationException("No list of external splits provided!");
    if(nonSplitListURL == null)
      throw new ResourceInstantiationException("No list of non splits provided!");
    if(encoding == null)
      throw new ResourceInstantiationException("No encoding provided!");
    try {
      //load and compile the three pattern lists
      internalSplitsPattern = compilePattern(internalSplitListURL.toURL(), encoding);
      externalSplitsPattern = compilePattern(externalSplitListURL.toURL(), encoding);
      nonSplitsPattern = compilePattern(nonSplitListURL.toURL(), encoding);
    } catch(IOException e) {
      //also covers UnsupportedEncodingException, which is an IOException
      throw new ResourceInstantiationException(e);
    }
    return this;
  }

  /**
   * @return the outputASName
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * @param outputASName the outputASName to set; null or blank selects the
   * document's default annotation set
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used as output for 'Sentence' and 'Split' annotations")
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

  /**
   * @return the encoding
   */
  public String getEncoding() {
    return encoding;
  }

  /**
   * @param encoding the encoding to set
   */
  @CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8")
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }

  /**
   * @return the internalSplitListURL
   */
  public ResourceReference getInternalSplitListURL() {
    return internalSplitListURL;
  }

  /**
   * @param internalSplitListURL the internalSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/internal-split-patterns.txt", suffixes="txt", comment="The URL to the internal splits pattern list")
  public void setInternalSplitListURL(ResourceReference internalSplitListURL) {
    this.internalSplitListURL = internalSplitListURL;
  }

  /**
   * @param internalSplitListURL the internalSplitListURL to set
   * @deprecated use {@link #setInternalSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setInternalSplitListURL(URL internalSplitListURL) {
    try {
      this.setInternalSplitListURL(new ResourceReference(internalSplitListURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /**
   * @return the externalSplitListURL
   */
  public ResourceReference getExternalSplitListURL() {
    return externalSplitListURL;
  }

  /**
   * @param externalSplitListURL the externalSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/external-split-patterns.txt", comment="The URL to the external splits pattern list", suffixes="txt")
  public void setExternalSplitListURL(ResourceReference externalSplitListURL) {
    this.externalSplitListURL = externalSplitListURL;
  }

  /**
   * @param externalSplitListURL the externalSplitListURL to set
   * @deprecated use {@link #setExternalSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setExternalSplitListURL(URL externalSplitListURL) {
    try {
      this.setExternalSplitListURL(new ResourceReference(externalSplitListURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /**
   * @return the nonSplitListURL
   */
  public ResourceReference getNonSplitListURL() {
    return nonSplitListURL;
  }

  /**
   * @param nonSplitListURL the nonSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/non-split-patterns.txt", comment="The URL to the non splits pattern list", suffixes="txt")
  public void setNonSplitListURL(ResourceReference nonSplitListURL) {
    this.nonSplitListURL = nonSplitListURL;
  }

  /**
   * @param nonSplitListURL the nonSplitListURL to set
   * @deprecated use {@link #setNonSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setNonSplitListURL(URL nonSplitListURL) {
    try {
      this.setNonSplitListURL(new ResourceReference(nonSplitListURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /**
   * @return the internalSplitsPattern
   */
  public Pattern getInternalSplitsPattern() {
    return internalSplitsPattern;
  }

  /**
   * @param internalSplitsPattern the internalSplitsPattern to set
   */
  public void setInternalSplitsPattern(Pattern internalSplitsPattern) {
    this.internalSplitsPattern = internalSplitsPattern;
  }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy