All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.plugins.ANNIE.resources.tokeniser.postprocess.jape Maven / Gradle / Ivy

Go to download

Gate based component, that can process the Text units to extract informations using Gate's tools (such as grammars, gazetteers, tokenizer or POS Taggers). This project contains two versions, a simple component and webservice one.

There is a newer version: 2.0
Show newest version
// Valentin Tablan, 29/06/2001
// $id$


Phase:postprocess
Input: Token SpaceToken
Options: control = appelt

//adjusts the tokeniser output

Rule: simpleJoin
 (
  //'30s, ..., 'Cause, 'em, 'N, 'S, 's, 'T, 'd, , 'll, 'm, 're, 's, 'til, 've
  (
   {Token.string=="'"}
   ({Token.string=="30s"}|{Token.string=="40s"}|{Token.string=="50s"}|{Token.string=="60s"}
    |{Token.string=="70s"}|{Token.string=="80s"}|{Token.string=="90s"}|{Token.string=="Cause"}
    |{Token.string=="cause"}|{Token.string=="Em"}|{Token.string=="em"}|{Token.string=="N"}
    |{Token.string=="S"}|{Token.string=="s"}|{Token.string=="T"}|{Token.string=="d"}
    |{Token.string=="ll"}|{Token.string=="m"}|{Token.string=="re"}|{Token.string=="s"}
    |{Token.string=="til"}|{Token.string=="ve"})
  )
  |
  //'n'
  ({Token.string=="'"} {Token.string=="n"} {Token.string=="'"})
  |
  //C'mon
  (({Token.string=="C"}|{Token.string=="c"}){Token.string=="'"} {Token.string=="mon"})
  |
  //o'clock
  (({Token.string=="O"}|{Token.string=="o"}){Token.string=="'"} {Token.string=="clock"})
  |
  //ma'am
  (({Token.string=="ma"}|{Token.string=="Ma"}){Token.string=="'"} {Token.string=="am"})
 ):left
-->
{
  gate.AnnotationSet toRemove = (gate.AnnotationSet)bindings.get("left");
  outputAS.removeAll(toRemove);
  //get the tokens
  java.util.ArrayList tokens = new java.util.ArrayList(toRemove);
  //define a comparator for annotations by start offset
  Collections.sort(tokens, new gate.util.OffsetComparator());
  String text = "";
  Iterator tokIter = tokens.iterator();
  while(tokIter.hasNext())
    text += (String)((Annotation)tokIter.next()).getFeatures().get("string");

  gate.FeatureMap features = Factory.newFeatureMap();
  features.put("kind", "word");
  features.put("string", text);
  features.put("length", Integer.toString(text.length()));
  features.put("orth", "apostrophe");
  outputAS.add(toRemove.firstNode(), toRemove.lastNode(), "Token", features);
}


Rule: ordinals
  //3rd, 1st, 22nd
  (
   ({Token.kind=="number"}):number
   ({Token.string=="st"}|{Token.string=="nd"}|{Token.string=="rd"}|{Token.string=="th"}):ending
  ):left
-->
{
  Annotation numberAnn = (Annotation)((AnnotationSet)bindings.get("number")).
	iterator().next();	
  Annotation endingAnn = (Annotation)((AnnotationSet)bindings.get("ending")).
	iterator().next();
  
  String numberStr = (String)numberAnn.getFeatures().get("string");
  String endingStr = (String)endingAnn.getFeatures().get("string");
  if((numberStr.endsWith("1") && endingStr.equals("st"))
     |
     (numberStr.endsWith("2") && endingStr.equals("nd"))
     |
     (numberStr.endsWith("3") && endingStr.equals("rd"))
     |
     (endingStr.equals("th"))
){
    //remove old tokens
    gate.AnnotationSet toRemove = (gate.AnnotationSet)bindings.get("left");
    inputAS.removeAll(toRemove);
    //create the new token
    FeatureMap features = Factory.newFeatureMap();
    features.put("kind", "word");
    features.put("string", numberStr + endingStr);
    features.put("length", (numberStr + endingStr).length());
    outputAS.add(toRemove.firstNode(), toRemove.lastNode(), "Token", features);
  }
  
}


//?n't
Rule: VBneg
   ({Token}):one
   ({Token.string=="'"}{Token.string=="t"}):two
-->
{
  gate.Annotation firstToken = (gate.Annotation)
                               ((gate.AnnotationSet)bindings.get("one")).iterator().next();
  String firstTokenText = (String)firstToken.getFeatures().get("string");
  if(firstTokenText.endsWith("n")){
    //remove the old tokens
    outputAS.removeAll((gate.AnnotationSet)bindings.get("one"));
    outputAS.removeAll((gate.AnnotationSet)bindings.get("two"));
    //create the new tokens
    Long ofs0 = firstToken.getStartNode().getOffset();
    Long ofs1 = new Long(firstToken.getEndNode().getOffset().longValue() - 1);
    Long ofs2 = ((gate.AnnotationSet)bindings.get("two")).lastNode().getOffset();
    try{
      gate.FeatureMap features;
      if(!ofs0.equals(ofs1)){
        features = Factory.newFeatureMap();
        features.put("kind", "word");
        String text = firstTokenText.substring(0, firstTokenText.length() - 1);
        features.put("string", text);
        features.put("length", Integer.toString(text.length()));
        features.put("orth", firstToken.getFeatures().get("orth"));
        outputAS.add(ofs0, ofs1, "Token", features);
      }

      features = Factory.newFeatureMap();
      features.put("kind", "word");
      features.put("string", "n't");
      features.put("length", "3");
      features.put("orth", "lowercase");
      outputAS.add(ofs1, ofs2, "Token", features);
    }catch(Exception e){
      e.printStackTrace();
    }
  }//if first token ends with "n"
}

// CR+LF | CR |LF+CR -> One single SpaceToken
Rule: NewLine
 (
  ({SpaceToken.string=="\n"}) |
  ({SpaceToken.string=="\r"}) |
  ({SpaceToken.string=="\n"}{SpaceToken.string=="\r"}) |
  ({SpaceToken.string=="\r"}{SpaceToken.string=="\n"})
  ):left
-->
{
  gate.AnnotationSet toRemove = (gate.AnnotationSet)bindings.get("left");
  outputAS.removeAll(toRemove);
  //get the tokens
  java.util.ArrayList tokens = new java.util.ArrayList(toRemove);
  //define a comparator for annotations by start offset
  Collections.sort(tokens, new gate.util.OffsetComparator());
  String text = "";
  Iterator tokIter = tokens.iterator();
  while(tokIter.hasNext())
    text += (String)((Annotation)tokIter.next()).getFeatures().get("string");

  gate.FeatureMap features = Factory.newFeatureMap();
  features.put("kind", "control");
  features.put("string", text);
  features.put("length", Integer.toString(text.length()));
  outputAS.add(toRemove.firstNode(), toRemove.lastNode(), "SpaceToken", features);
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy