gate.plugins.ANNIE.resources.sentenceSplitter.grammar.split.jape Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of gate-extraction Show documentation

GATE-based component that can process the Text units to extract information using GATE's tools (such as grammars, gazetteers, tokenizers or POS taggers). This project contains two versions: a simple component and a web-service one.

There is a newer version: 2.0

Show newest version

/*
*  split.jape
*
* Copyright (c) 1998-2004, The University of Sheffield.
*
*  This file is part of GATE (see http://gate.ac.uk/), and is free
*  software, licenced under the GNU Library General Public License,
*  Version 2, June 1991 (in the distribution as file licence.html,
*  and also available at http://gate.ac.uk/gate/licence.html).
*
*  Valentin Tablan, March 7th, 2007
*
*  $Id: split.jape 8234 2007-03-07 17:46:33Z valyt $
*/

// Phase header for the sentence-splitting grammar.
//  - Phase:   names this phase "split".
//  - Input:   the rule patterns below are matched against Split and
//             TempNoSplitText annotations.
//  - Options: uses the "first" control style (per the JAPE documentation a
//             rule fires on the first match found).
Phase:split
Input: Split TempNoSplitText
Options: control = first


// Rule internalSplits: creates a Sentence that consumes the split, i.e.
// the sentence ends where the matched internal Split ends.
//
// The sentence starts at the first Token with kind == "word" found between
// the end of the previously created sentence and the end of the split.
// If no such token exists, no Sentence is created and the remembered
// sentence end is left unchanged.
Rule: internalSplits
({Split.kind == "internal"}):isplit
-->
{
  // the sentence ends at the last node of the matched split
  Long endOffset = ((AnnotationSet)bindings.get("isplit")).
      lastNode().getOffset();
  // the end offset of the previous sentence is kept in a document feature;
  // when the feature is absent, start looking from offset 0
  Long lastOffset = (Long)doc.getFeatures().get("temp-last-sentence-end");
  if(lastOffset == null) lastOffset = Long.valueOf(0L);

  // collect the tokens lying between the previous sentence end and the
  // end of this split
  AnnotationSet tokens = inputAS.getContained(lastOffset, endOffset);
  if(tokens != null) tokens = tokens.get("Token");
  if(tokens != null && tokens.size() > 0){
    // sort by offset so that the first word token met is the earliest one
    List<Annotation> tokList = new ArrayList<Annotation>(tokens);
    Collections.sort(tokList, new OffsetComparator());
    for(Annotation token : tokList){
      String tokenKind = (String)token.getFeatures().get("kind");
      if("word".equals(tokenKind)){
        Long startOffset = token.getStartNode().getOffset();
        // only create non-empty sentences
        if(startOffset.compareTo(endOffset) < 0){
          try{
            outputAS.add(startOffset, endOffset, "Sentence",
                    Factory.newFeatureMap());
            // remember where this sentence ended for the next rule firing
            doc.getFeatures().put("temp-last-sentence-end", endOffset);
          }catch(InvalidOffsetException ioe){
            throw new GateRuntimeException(ioe);
          }
        }
        // only the first word token is considered
        return;
      }
    }
  }
}

// Rule externalSplits: creates a Sentence that does not consume the split,
// i.e. the sentence ends before the matched external Split.
//
// The sentence starts at the first Token with kind == "word" found between
// the end of the previously created sentence and the start of the split,
// and ends at the last node of the tokens in that range.
// If no such token exists, no Sentence is created and the remembered
// sentence end is left unchanged.
Rule: externalSplits
({Split.kind == "external"}):esplit
-->
{
  // initial upper bound: the first node of the matched split, so the
  // split itself stays outside the sentence
  Long endOffset = ((AnnotationSet)bindings.get("esplit")).
      firstNode().getOffset();
  // the end offset of the previous sentence is kept in a document feature;
  // when the feature is absent, start looking from offset 0
  Long lastOffset = (Long)doc.getFeatures().get("temp-last-sentence-end");
  if(lastOffset == null) lastOffset = Long.valueOf(0L);

  // collect the tokens lying between the previous sentence end and the
  // start of this split
  AnnotationSet tokens = inputAS.getContained(lastOffset, endOffset);
  if(tokens != null) tokens = tokens.get("Token");
  if(tokens != null && tokens.size() > 0){
    // we have a more precise end offset: the last node of those tokens
    endOffset = tokens.lastNode().getOffset();
    // sort by offset so that the first word token met is the earliest one
    List<Annotation> tokList = new ArrayList<Annotation>(tokens);
    Collections.sort(tokList, new OffsetComparator());
    for(Annotation token : tokList){
      String tokenKind = (String)token.getFeatures().get("kind");
      if("word".equals(tokenKind)){
        Long startOffset = token.getStartNode().getOffset();
        // only create non-empty sentences
        if(startOffset.compareTo(endOffset) < 0){
          try{
            outputAS.add(startOffset, endOffset, "Sentence",
                    Factory.newFeatureMap());
            // remember where this sentence ended for the next rule firing
            doc.getFeatures().put("temp-last-sentence-end", endOffset);
          }catch(InvalidOffsetException ioe){
            throw new GateRuntimeException(ioe);
          }
        }
        // only the first word token is considered
        return;
      }
    }
  }
}