All downloads are free. Search and download functionality uses the official Maven repository.

de.datexis.ner.GenericMentionAnnotator Maven / Gradle / Ivy

package de.datexis.ner;

import de.datexis.annotator.AnnotatorFactory;
import de.datexis.common.Resource;
import de.datexis.model.Document;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A generic Named Entity Mention Annotator with language detection.
 * Uses pre-trained models for German and English.
 * @author Sebastian Arnold 
 */
public class GenericMentionAnnotator extends MentionAnnotator {

  protected final static Logger log = LoggerFactory.getLogger(GenericMentionAnnotator.class);
  
  // Configure model path to use internal JAR resources
  private final static Resource path = Resource.fromJAR("models");
  
  private final MentionAnnotator annotatorEN;
  private final MentionAnnotator annotatorDE;
  
  /**
   * Returns a generic Named Entity Mention Annotator with language detection.
   * It uses pre-trained models for German and English.
   */
  public static MentionAnnotator create() {
    try {
      return new GenericMentionAnnotator();
    } catch (IOException ex) {
      log.error("Could not load packaged models for MentionAnnotator.");
      ex.printStackTrace();
      return null;
    }
  }
  
  /**
   * Please use GenericMentionAnnotator().create()
   */
  protected GenericMentionAnnotator() throws IOException {
    annotatorEN = (MentionAnnotator) AnnotatorFactory.fromXML(path.resolve("MentionAnnotator_en_NER-GENERIC_WikiNER+tri_20170309")); // English Wiki Entities
    annotatorDE = (MentionAnnotator) AnnotatorFactory.fromXML(path.resolve("MentionAnnotator_de_NER-GENERIC_WikiNER+tri_20170309")); // German Wiki Entities
  }
  
  /**
   * Annotates given documents for English and German texts.
   */
  @Override
  public void annotate(Collection docs) {
    Map> groups = docs.stream().collect(Collectors.groupingBy(doc -> Optional.ofNullable(doc.getLanguage()).orElse("unk")));
    for(Map.Entry> group : groups.entrySet()) {
      if(group.getKey().equals("de")) {
        annotatorDE.annotate(group.getValue());
      } else if(group.getKey().equals("en")) {
        annotatorEN.annotate(group.getValue());
      } else {
        log.warn("Detected language " + group.getKey() + ", using English annotator.");
        annotatorEN.annotate(group.getValue());
      }
    }
  }
  

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy