All Downloads are FREE. Search and download functionalities are using the official Maven repository.

resources.tokeniser.postprocess.jape Maven / Gradle / Ivy

The newest version!
// Valentin Tablan, 29/06/2001
// $id$


Phase:postprocess
Input: Token SpaceToken
Options: control = appelt

//adjusts the tokeniser output

Rule: simpleJoin
 (
  //'30s, ..., 'Cause, 'em, 'N, 'S, 's, 'T, 'd, , 'll, 'm, 're, 's, 'til, 've
  (
   {Token.string=="'"}
   ({Token.string=="30s"}|{Token.string=="40s"}|{Token.string=="50s"}|{Token.string=="60s"}
    |{Token.string=="70s"}|{Token.string=="80s"}|{Token.string=="90s"}|{Token.string=="Cause"}
    |{Token.string=="cause"}|{Token.string=="Em"}|{Token.string=="em"}|{Token.string=="N"}
    |{Token.string=="S"}|{Token.string=="s"}|{Token.string=="T"}|{Token.string=="d"}
    |{Token.string=="ll"}|{Token.string=="m"}|{Token.string=="re"}|{Token.string=="s"}
    |{Token.string=="til"}|{Token.string=="ve"})
  )
  |
  //'n'
  ({Token.string=="'"} {Token.string=="n"} {Token.string=="'"})
  |
  //C'mon
  (({Token.string=="C"}|{Token.string=="c"}){Token.string=="'"} {Token.string=="mon"})
  |
  //o'clock
  (({Token.string=="O"}|{Token.string=="o"}){Token.string=="'"} {Token.string=="clock"})
  |
  //ma'am
  (({Token.string=="ma"}|{Token.string=="Ma"}){Token.string=="'"} {Token.string=="am"})

 ):left
-->
{
  gate.AnnotationSet toRemove = (gate.AnnotationSet)bindings.get("left");
  outputAS.removeAll(toRemove);
  //get the tokens
  java.util.ArrayList tokens = new java.util.ArrayList(toRemove);
  //define a comparator for annotations by start offset
  Collections.sort(tokens, new gate.util.OffsetComparator());
  String text = "";
  Iterator tokIter = tokens.iterator();
  while(tokIter.hasNext())
    text += (String)((Annotation)tokIter.next()).getFeatures().get("string");

  gate.FeatureMap features = Factory.newFeatureMap();
  features.put("kind", "word");
  features.put("string", text);
  features.put("length", Integer.toString(text.length()));
  features.put("orth", "apostrophe");
  outputAS.add(toRemove.firstNode(), toRemove.lastNode(), "Token", features);
}

//?n't
Rule: VBneg
   ({Token}):one
   ({Token.string=="'"}{Token.string=="t"}):two
-->
{
  gate.Annotation firstToken = (gate.Annotation)
                               ((gate.AnnotationSet)bindings.get("one")).iterator().next();
  String firstTokenText = (String)firstToken.getFeatures().get("string");
  if(firstTokenText.endsWith("n")){
    //remove the old tokens
    outputAS.removeAll((gate.AnnotationSet)bindings.get("one"));
    outputAS.removeAll((gate.AnnotationSet)bindings.get("two"));
    //create the new tokens
    Long ofs0 = firstToken.getStartNode().getOffset();
    Long ofs1 = new Long(firstToken.getEndNode().getOffset().longValue() - 1);
    Long ofs2 = ((gate.AnnotationSet)bindings.get("two")).lastNode().getOffset();
    try{
      gate.FeatureMap features;
      if(!ofs0.equals(ofs1)){
        features = Factory.newFeatureMap();
        features.put("kind", "word");
        String text = firstTokenText.substring(0, firstTokenText.length() - 1);
        features.put("string", text);
        features.put("length", Integer.toString(text.length()));
        features.put("orth", firstToken.getFeatures().get("orth"));
        outputAS.add(ofs0, ofs1, "Token", features);
      }

      features = Factory.newFeatureMap();
      features.put("kind", "word");
      features.put("string", "n't");
      features.put("length", "3");
      features.put("orth", "lowercase");
      outputAS.add(ofs1, ofs2, "Token", features);
    }catch(Exception e){
      e.printStackTrace();
    }
  }//if first token ends with "n"
}

// CR+LF | CR |LF+CR -> One single SpaceToken
Rule: NewLine
 (
  ({SpaceToken.string=="\n"}) |
  ({SpaceToken.string=="\r"}) |
  ({SpaceToken.string=="\n"}{SpaceToken.string=="\r"}) |
  ({SpaceToken.string=="\r"}{SpaceToken.string=="\n"})
  ):left
-->
{
  gate.AnnotationSet toRemove = (gate.AnnotationSet)bindings.get("left");
  outputAS.removeAll(toRemove);
  //get the tokens
  java.util.ArrayList tokens = new java.util.ArrayList(toRemove);
  //define a comparator for annotations by start offset
  Collections.sort(tokens, new gate.util.OffsetComparator());
  String text = "";
  Iterator tokIter = tokens.iterator();
  while(tokIter.hasNext())
    text += (String)((Annotation)tokIter.next()).getFeatures().get("string");

  gate.FeatureMap features = Factory.newFeatureMap();
  features.put("kind", "control");
  features.put("string", text);
  features.put("length", Integer.toString(text.length()));
  outputAS.add(toRemove.firstNode(), toRemove.lastNode(), "SpaceToken", features);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy