com.gengoai.hermes.GettingStarted Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hermes-examples Show documentation
There is a newer version: 2.1
/*
 * (c) 2005 David B. Bracewell
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.gengoai.hermes;

import com.gengoai.collection.counter.Counter;
import com.gengoai.hermes.extraction.TermExtractor;
import com.gengoai.hermes.tools.HermesCLI;

import java.util.Collections;
import java.util.regex.Matcher;

/**
 * @author David B. Bracewell
 */
public class GettingStarted extends HermesCLI {
   public static void main(String[] args) throws Exception {
      new GettingStarted().run(args);
   }

   public void programLogic() throws Exception {
      //Documents are created using the DocumentFactory class which takes care of preprocessing text (e.g
      //normalizing white space and unicode) and constructing a document.
      Document document = DocumentFactory.getInstance().create("The quick brown fox jumps over the lazy dog.");
      //Note: for convenience a document can also be created using static methods on the document class.
      //eg. Document document = Document.create("The quick brown fox jumps over the lazy dog.");

      //Annotators that provide the given annotation types and their dependencies are defined via configuration files
      //common annotations are defined in com/gengoai/com.gengoai.hermes/annotations.conf
      //Note: the annotators that provide a given type can be language dependent.
      document.annotate(Types.TOKEN, Types.SENTENCE);

      //For each sentence (Types.SENTENCE) print to standard out
      //Sentences and tokens have convenience accessor methods (sentences() and tokens()), but can be retrieved in a
      //similar manner as other annotations using get(AnnotationType).
      document.sentences().forEach(System.out::println);

      //Counts the token lemmas in the document (also lower cases)
      //We have not provided lemma annotations to the document, so instead it will simply lowercase the tokens
      Counter unigrams = TermExtractor.builder().toLemma().build().extract(document).count();

      //Prints: Count(the) = 2
      System.out.println("Count(the) = " + unigrams.get("the"));

      //Add a custom annotation, by performing a regex for fox or dog
      //Since HStrings act like super charged strings, we can do simple annotations using the builtin Java regex engine.

      //First define the type ANIMAL_MENTION (Note that Types.type(String) is the same as AnnotationType.create(String))
      AnnotationType animalMention = Types.annotation("ANIMAL_MENTION");

      //Second create annotations based on a regular expression match
      //We will match fox or dog with word breaks on both sides
      Matcher matcher = document.matcher("\\b(fox|dog)\\b");
      while(matcher.find()) {
         //Creating the annotation is done using the document and only requires the type and the character offsets
         document.createAnnotation(animalMention, matcher.start(), matcher.end(), Collections.emptyMap());
         //More complicated annotations would also provide attributes, for example Entity Type word Word Sense.
      }

      //Print out the animal mention annotations
      //We should print out fox[16, 19] and dog[40, 43]
      document.annotations(animalMention).forEach(a -> System.out.println(a + "[" + a.start() + ", " + a.end() + "]"));

   }

}//END OF GettingStarted