gate.creole.splitter.RegexSentenceSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.
The newest version!
/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 04 Sep 2007
 *
 *  $Id: RegexSentenceSplitter.java 19742 2016-11-16 17:58:23Z markagreenwood $
 */
package gate.creole.splitter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;

/**
 * A fast sentence splitter replacement based on regular expressions.
 */
@CreoleResource(name="RegEx Sentence Splitter", icon="sentence-splitter", comment="A sentence splitter based on regular expressions.", helpURL="http://gate.ac.uk/userguide/sec:annie:regex-splitter")
public class RegexSentenceSplitter extends AbstractLanguageAnalyser {

  /**
   * Name of the "document" parameter.
   * NOTE(review): no setter for this parameter is declared in this class;
   * presumably it is inherited from the superclass - confirm.
   */
  public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";

  /**
   * Name of the "inputASName" parameter.
   * NOTE(review): this class declares no input annotation set parameter, so
   * this constant may be stale - confirm before relying on it.
   */
  public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";

  /**
   * Name of the parameter set through {@link #setOutputASName(String)}.
   */
  public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";

  /**
   * Name of the parameter set through {@link #setEncoding(String)}.
   */
  public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";

  /**
   * Name of the "splitListURL" parameter.
   * NOTE(review): the setters in this class are named
   * setInternalSplitListURL and setExternalSplitListURL; nothing here is
   * called "splitListURL", so this constant may be stale - confirm.
   */
  public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";


  /**
   * Name of the parameter set through
   * {@link #setNonSplitListURL(ResourceReference)}.
   */
  public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";

  /**
   * serialisation ID
   */
  private static final long serialVersionUID = 1L;

  /**
   * Name of the annotation set that receives the "Split" and sentence
   * annotations. When null or blank, {@link #execute()} uses the document's
   * default annotation set.
   */
  protected String outputASName;

  /**
   * Character encoding used when reading the pattern list files.
   */
  protected String encoding;

  /**
   * Location of the file listing the regex patterns for internal sentence
   * splits. In {@link #execute()} the text matched by an internal split is
   * included in the sentence it terminates.
   */
  protected ResourceReference internalSplitListURL;

  /**
   * Location of the file listing the regex patterns for external sentence
   * splits. In {@link #execute()} the text matched by an external split is
   * left outside the sentence it terminates.
   */
  protected ResourceReference externalSplitListURL;

  /**
   * Location of the file listing the regex patterns for non splits, i.e.
   * regions in which candidate splits are vetoed.
   */
  protected ResourceReference nonSplitListURL;


  /** Pattern compiled by {@link #init()} from the internal split list. */
  protected Pattern internalSplitsPattern;

  /** Pattern compiled by {@link #init()} from the external split list. */
  protected Pattern externalSplitsPattern;

  /** Pattern compiled by {@link #init()} from the non split list. */
  protected Pattern nonSplitsPattern;

  /**
   * Reads a pattern list and combines its entries into a single pattern.
   * Each line is trimmed; blank lines and lines starting with "//" are
   * skipped. Every remaining line is wrapped in a non-capturing group and the
   * groups are joined as alternatives, in file order.
   *
   * If the list contains no usable lines, a pattern that never matches is
   * returned. Compiling the empty string instead would yield a pattern that
   * matches (with zero length) at every offset of the input.
   *
   * @param paternsListUrl the location of the pattern list
   * @param encoding the character encoding used to read the list
   * @return the compiled alternation of all listed patterns
   * @throws UnsupportedEncodingException if the encoding is not supported
   * @throws IOException if the list cannot be read
   */
  protected Pattern compilePattern(URL paternsListUrl, String encoding)
          throws UnsupportedEncodingException, IOException {

    StringBuilder patternString = new StringBuilder();

    try (BufferedReader reader =
            new BomStrippingInputStreamReader(paternsListUrl.openStream(),
                    encoding)){

      for(String line = reader.readLine(); line != null;
          line = reader.readLine()) {
        line = line.trim();
        if(line.length() == 0 || line.startsWith("//")) {
          // ignore empty lines and comments
          continue;
        }
        if(patternString.length() > 0) patternString.append('|');
        patternString.append("(?:").append(line).append(')');
      }
    }

    if(patternString.length() == 0) {
      // no patterns listed: an empty negative lookahead can never match
      return Pattern.compile("(?!)");
    }
    return Pattern.compile(patternString.toString());
  }


//  protected enum StartEnd {START, END};

  /**
   * A comparator for MatchResult objects. This is used to find the next match
   * result in a text. A null value is used to signify that no more matches are
   * available, hence nulls are the largest value, according to this comparator.
   * @author Valentin Tablan (valyt)
   */
  private class MatchResultComparator implements Comparator{

    /* (non-Javadoc)
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    @Override
    public int compare(MatchResult o1, MatchResult o2) {
      if(o1 == null && o2 == null) return 0;
      if(o1 == null) return 1;
      if(o2 == null) return -1;
      //at this point both match results are not null
      return o1.start() - o2.start();
    }
  }

  /**
   * Splits the document text into sentences.
   *
   * All regions matched by the non split pattern are collected first. The
   * internal and external split patterns are then walked in parallel, always
   * handling the match that starts first. Every match that is not vetoed by a
   * non split region produces a "Split" annotation (feature "kind" set to
   * "internal" or "external") and, if there is text between the end of the
   * previous sentence and the split, a sentence annotation. The text of an
   * internal split is part of the sentence; the text of an external split is
   * not.
   *
   * Annotations are written to the set named by outputASName, or to the
   * default annotation set when that name is null or blank.
   *
   * @throws ExecutionException if an annotation cannot be added, or if the
   * matching process reaches an inconsistent state
   */
  @Override
  public void execute() throws ExecutionException {
    // NOTE(review): the flag is reset here but never checked in the loop
    // below, so an interrupt request does not stop this method.
    interrupted = false;
    int lastProgress = 0;
    fireProgressChanged(lastProgress);
    //get pointers to the annotation sets
    AnnotationSet outputAS = (outputASName == null ||
            outputASName.trim().length() == 0) ?
                             document.getAnnotations() :
                             document.getAnnotations(outputASName);

    String docText = document.getContent().toString();

    /* If the document's content is empty or contains only whitespace,
     * we drop out right here, since there's nothing to sentence-split.     */
    if (docText.trim().length() < 1)  {
      // tell listeners we are done, as on the normal exit path
      fireProcessFinished();
      return;
    }

    Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText);
    Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText);

    Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText);
    //store all non split locations in a list of [start, end) pairs;
    //a linked list is used because veto() removes entries from the front
    List<int[]> nonSplits = new LinkedList<int[]>();
    while(nonSplitMatcher.find()){
      nonSplits.add(new int[]{nonSplitMatcher.start(), nonSplitMatcher.end()});
    }
    //this list holds the next pending match of each split matcher
    List<MatchResult> nextSplitMatches = new ArrayList<MatchResult>(2);
    //initialise matching process
    MatchResult internalMatchResult = null;
    if(internalSplitMatcher.find()){
      internalMatchResult = internalSplitMatcher.toMatchResult();
      nextSplitMatches.add(internalMatchResult);
    }
    MatchResult externalMatchResult = null;
    if(externalSplitMatcher.find()){
      externalMatchResult = externalSplitMatcher.toMatchResult();
      nextSplitMatches.add(externalMatchResult);
    }
    MatchResultComparator comparator = new MatchResultComparator();
    int lastSentenceEnd = 0;

    while(!nextSplitMatches.isEmpty()){
      //see which one matches first
      Collections.sort(nextSplitMatches, comparator);
      MatchResult nextMatch = nextSplitMatches.remove(0);
      if(nextMatch == internalMatchResult){
        //we have a new internal split; see if it's vetoed or not
        if(!veto(nextMatch, nonSplits)){
          //split is not vetoed
          try {
            addSplitAnnotation(outputAS, nextMatch, "internal");
            //the sentence includes the text of the split itself
            int endOffset = nextMatch.end();
            //find the first non whitespace character starting from where the
            //last sentence ended
            // NOTE(review): this branch tests Character.isWhitespace while
            // the external branch tests Character.isSpaceChar; the two
            // differ (e.g. for '\n' and '\t'). Left unchanged on purpose.
            while(lastSentenceEnd < endOffset &&
                  Character.isWhitespace(
                          Character.codePointAt(docText, lastSentenceEnd))){
              lastSentenceEnd++;
            }
            //if there is any useful text before the split, generate
            //a new sentence
            if(lastSentenceEnd < nextMatch.start()){
              addSentenceAnnotation(outputAS, lastSentenceEnd, endOffset);
            }
            //store the new sentence end
            lastSentenceEnd = endOffset;
          } catch(InvalidOffsetException e) {
            // this should never happen
            throw new ExecutionException(e);
          }
        }
        //prepare for next step
        if(internalSplitMatcher.find()){
          internalMatchResult = internalSplitMatcher.toMatchResult();
          nextSplitMatches.add(internalMatchResult);
        }else{
          internalMatchResult = null;
        }
      }else if(nextMatch == externalMatchResult){
        //we have a new external split; see if it's vetoed or not
        if(!veto(nextMatch, nonSplits)){
          //split is not vetoed
          try {
            addSplitAnnotation(outputAS, nextMatch, "external");
            //the sentence ends before the split: find the last non space
            //character, going backward from where the external split starts
            int endOffset = nextMatch.start();
            while(endOffset > lastSentenceEnd &&
                    Character.isSpaceChar(
                            Character.codePointAt(docText, endOffset -1))){
              endOffset--;
            }
            //find the first non space character starting from where the
            //last sentence ended
            while(lastSentenceEnd < endOffset &&
                    Character.isSpaceChar(
                            Character.codePointAt(docText, lastSentenceEnd))){
              lastSentenceEnd++;
            }
            //if there is any useful text between the two offsets, generate
            //a new sentence
            if(lastSentenceEnd < endOffset){
              addSentenceAnnotation(outputAS, lastSentenceEnd, endOffset);
            }
            //store the new sentence end
            lastSentenceEnd = nextMatch.end();
          } catch(InvalidOffsetException e) {
            // this should never happen
            throw new ExecutionException(e);
          }
        }
        //prepare for next step
        if(externalSplitMatcher.find()){
          externalMatchResult = externalSplitMatcher.toMatchResult();
          nextSplitMatches.add(externalMatchResult);
        }else{
          externalMatchResult = null;
        }
      }else{
        //malfunction
        throw new ExecutionException("Invalid state - cannot identify match!");
      }
      //report progress; computed in long arithmetic so that
      //100 * offset cannot overflow on very large documents
      int newProgress = (int)(100L * lastSentenceEnd / docText.length());
      if(newProgress - lastProgress > 20){
        lastProgress = newProgress;
        fireProgressChanged(lastProgress);
      }
    }//while(!nextSplitMatches.isEmpty()){
    fireProcessFinished();
  }

  /**
   * Adds a "Split" annotation covering the given match.
   *
   * @param outputAS the annotation set to add to
   * @param match the split match; its start and end are used as offsets
   * @param kind the value stored under the "kind" feature
   * @throws InvalidOffsetException if the annotation set rejects the offsets
   */
  private void addSplitAnnotation(AnnotationSet outputAS, MatchResult match,
          String kind) throws InvalidOffsetException {
    FeatureMap features = Factory.newFeatureMap();
    features.put("kind", kind);
    outputAS.add(Long.valueOf(match.start()), Long.valueOf(match.end()),
            "Split", features);
  }

  /**
   * Adds a sentence annotation with an empty feature map.
   *
   * @param outputAS the annotation set to add to
   * @param startOffset the offset where the sentence starts
   * @param endOffset the offset where the sentence ends
   * @throws InvalidOffsetException if the annotation set rejects the offsets
   */
  private void addSentenceAnnotation(AnnotationSet outputAS, int startOffset,
          int endOffset) throws InvalidOffsetException {
    outputAS.add(Long.valueOf(startOffset), Long.valueOf(endOffset),
            ANNIEConstants.SENTENCE_ANNOTATION_TYPE,
            Factory.newFeatureMap());
  }


  /**
   * Checks whether a possible match is being vetoed by a non split match. A
   * possible match is vetoed if it has any overlap with a veto region.
   *
   * @param split the match result representing the split to be tested
   * @param vetoRegions regions where matches are not allowed, each given as a
   * two element array holding the start offset and the (exclusive) end
   * offset. For efficiency reasons, this method assumes these regions to be
   * non overlapping and sorted in ascending order.
   * All veto regions that end before the proposed match are removed from the
   * list (again for efficiency reasons). This requires the proposed matches
   * to be sent to this method in ascending order, so as to avoid
   * malfunctions.
   * @return true iff the proposed split should be ignored
   */
  private boolean veto(MatchResult split, List<int[]> vetoRegions){
    for(Iterator<int[]> vetoRegIter = vetoRegions.iterator();
        vetoRegIter.hasNext();){
      int[] aVetoRegion = vetoRegIter.next();
      if(aVetoRegion[1] -1 < split.start()){
        //current veto region ends before the proposed split starts
        //--> discard the veto region
        vetoRegIter.remove();
      }else if(split.end() -1 < aVetoRegion[0]){
        //veto region starts after the split ends; as regions are sorted,
        //no later region can overlap either
        //-> we can return false
        return false;
      }else{
        //we have overlap
        return true;
      }
    }
    //if we got this far, there are no veto regions left: all of them ended
    //before the split, so the split is accepted
    return false;
  }

  /**
   * Initialises this resource: checks that the three pattern list locations
   * and the encoding have been provided, then reads each list and compiles
   * it into the corresponding pattern.
   *
   * @return this resource
   * @throws ResourceInstantiationException if a required parameter is
   * missing, if a pattern list cannot be read, or if a pattern list contains
   * an invalid regular expression
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    super.init();

    //sanity checks
    if(internalSplitListURL == null)
      throw new ResourceInstantiationException("No list of internal splits provided!");
    if(externalSplitListURL == null)
      throw new ResourceInstantiationException("No list of external splits provided!");
    if(nonSplitListURL == null)
      throw new ResourceInstantiationException("No list of non splits provided!");
    if(encoding == null)
      throw new ResourceInstantiationException("No encoding provided!");

    try {
      //load and compile the three pattern lists
      internalSplitsPattern = compilePattern(internalSplitListURL.toURL(), encoding);
      externalSplitsPattern = compilePattern(externalSplitListURL.toURL(), encoding);
      nonSplitsPattern = compilePattern(nonSplitListURL.toURL(), encoding);
    } catch(IOException | java.util.regex.PatternSyntaxException e) {
      //IOException also covers UnsupportedEncodingException; an invalid
      //regex in a list file is reported the same way instead of escaping
      //as an unchecked exception
      throw new ResourceInstantiationException(e);
    }

    return this;
  }

  /**
   * Gets the name of the annotation set that receives the output
   * annotations.
   *
   * @return the outputASName; may be null
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Sets the name of the annotation set that receives the output
   * annotations. A null or blank name makes {@link #execute()} use the
   * document's default annotation set.
   *
   * @param outputASName the outputASName to set
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used as output for 'Sentence' and 'Split' annotations")
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

  /**
   * Gets the character encoding used for reading the pattern list files.
   *
   * @return the encoding
   */
  public String getEncoding() {
    return encoding;
  }

  /**
   * Sets the character encoding used for reading the pattern list files.
   * {@link #init()} fails if the encoding is null.
   *
   * @param encoding the encoding to set
   */
  @CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8")
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }

  /**
   * Gets the location of the internal split pattern list.
   *
   * @return the internalSplitListURL
   */
  public ResourceReference getInternalSplitListURL() {
    return internalSplitListURL;
  }

  /**
   * Sets the location of the internal split pattern list. The list is read
   * by {@link #init()}, which fails if the location is null.
   *
   * @param internalSplitListURL the internalSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/internal-split-patterns.txt", suffixes="txt", comment="The URL to the internal splits pattern list")
  public void setInternalSplitListURL(ResourceReference internalSplitListURL) {
    this.internalSplitListURL = internalSplitListURL;
  }
  
  /**
   * Sets the location of the internal split pattern list from a plain URL.
   *
   * @param internalSplitListURL the URL to wrap in a ResourceReference
   * @throws RuntimeException if the URL cannot be converted
   * @deprecated use {@link #setInternalSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setInternalSplitListURL(URL internalSplitListURL) {
    final ResourceReference reference;
    try {
      reference = new ResourceReference(internalSplitListURL);
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
    // delegate to the ResourceReference based setter
    this.setInternalSplitListURL(reference);
  }

  /**
   * Gets the location of the external split pattern list.
   *
   * @return the externalSplitListURL
   */
  public ResourceReference getExternalSplitListURL() {
    return externalSplitListURL;
  }

  /**
   * Sets the location of the external split pattern list. The list is read
   * by {@link #init()}, which fails if the location is null.
   *
   * @param externalSplitListURL the externalSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/external-split-patterns.txt", comment="The URL to the external splits pattern list", suffixes="txt")
  public void setExternalSplitListURL(ResourceReference externalSplitListURL) {
    this.externalSplitListURL = externalSplitListURL;
  }
  
  /**
   * Sets the location of the external split pattern list from a plain URL.
   *
   * @param externalSplitListURL the URL to wrap in a ResourceReference
   * @throws RuntimeException if the URL cannot be converted
   * @deprecated use {@link #setExternalSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setExternalSplitListURL(URL externalSplitListURL) {
    final ResourceReference reference;
    try {
      reference = new ResourceReference(externalSplitListURL);
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
    // delegate to the ResourceReference based setter
    this.setExternalSplitListURL(reference);
  }

  /**
   * Gets the location of the non split pattern list.
   *
   * @return the nonSplitListURL
   */
  public ResourceReference getNonSplitListURL() {
    return nonSplitListURL;
  }

  /**
   * Sets the location of the non split pattern list. The list is read by
   * {@link #init()}, which fails if the location is null.
   *
   * @param nonSplitListURL the nonSplitListURL to set
   */
  @CreoleParameter(defaultValue="resources/regex-splitter/non-split-patterns.txt", comment="The URL to the non splits pattern list", suffixes="txt")
  public void setNonSplitListURL(ResourceReference nonSplitListURL) {
    this.nonSplitListURL = nonSplitListURL;
  }
  
  /**
   * Sets the location of the non split pattern list from a plain URL.
   *
   * @param nonSplitListURL the URL to wrap in a ResourceReference
   * @throws RuntimeException if the URL cannot be converted
   * @deprecated use {@link #setNonSplitListURL(ResourceReference)}
   */
  @Deprecated
  public void setNonSplitListURL(URL nonSplitListURL) {
    final ResourceReference reference;
    try {
      reference = new ResourceReference(nonSplitListURL);
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
    // delegate to the ResourceReference based setter
    this.setNonSplitListURL(reference);
  }

  /**
   * Gets the pattern used by {@link #execute()} to find internal splits.
   *
   * @return the internalSplitsPattern; null until it has been set, either by
   * {@link #init()} or by {@link #setInternalSplitsPattern(Pattern)}
   */
  public Pattern getInternalSplitsPattern() {
    return internalSplitsPattern;
  }

  /**
   * Replaces the pattern used by {@link #execute()} to find internal splits.
   * Note that {@link #init()} overwrites this value with the pattern
   * compiled from the internal split list.
   *
   * @param internalSplitsPattern the internalSplitsPattern to set
   */
  public void setInternalSplitsPattern(Pattern internalSplitsPattern) {
    this.internalSplitsPattern = internalSplitsPattern;
  }
}