All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.uima.namefind.NameFinderTrainer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */ 

package opennlp.uima.namefind;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import opennlp.maxent.GIS;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
import opennlp.uima.util.OpennlpUtil;
import opennlp.uima.util.UimaUtil;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

/**
 * OpenNLP NameFinder trainer.
 * 

* Mandatory parameters *

* * * * * * *
Type Name Description
String opennlp.uima.ModelName The name of the model file
String opennlp.uima.Language The language code
String opennlp.uima.SentenceType The full name of the sentence type
String opennlp.uima.TokenType The full name of the token type
String opennlp.uima.NameType The full name of the name type
* * Optional parameters * * * * * * *
Type Name Description
String opennlp.uima.AdditionalTrainingDataFile Training file which contains additional data in the OpenNLP format
String opennlp.uima.AdditionalTrainingDataEncoding Encoding of the additional training data
Integer opennlp.uima.Cutoff (default=5)
Integer opennlp.uima.Iterations (default=100)
*

*/ public final class NameFinderTrainer extends CasConsumer_ImplBase { private Logger logger; private String modelPath; private String additionalTrainingDataFile; private String additionalTrainingDataEncoding; private Type sentenceType; private Type tokenType; private Type nameType; private String language; private int cutoff; private int iterations; // TODO: Keeping all events in memory limits the size of the training corpus // Possible solutions: // - Write all events to disk // - Directly start indexing with a blocking sample stream, the indexer will then write everything // to disk or could store the events much more space efficient in memory private List nameFinderSamples = new ArrayList(); /** * Initializes the current instance. */ public void initialize() throws ResourceInitializationException { super.initialize(); logger = getUimaContext().getLogger(); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Initializing the OpenNLP Name Trainer."); } modelPath = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.MODEL_PARAMETER); language = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.LANGUAGE_PARAMETER); cutoff = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.CUTOFF_PARAMETER, 5); iterations = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.ITERATIONS_PARAMETER, 100); additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE); // If the additional training data is specified, the encoding must be provided! if (additionalTrainingDataFile != null) { additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter( getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING); } } /** * Initialize the current instance with the given type system. 
*/ public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException { String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.SENTENCE_TYPE_PARAMETER); sentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName); String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), UimaUtil.TOKEN_TYPE_PARAMETER); tokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName); String nameTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(), NameFinder.NAME_TYPE_PARAMETER); nameType = CasConsumerUtil.getType(typeSystem, nameTypeName); } /** * Creates a {@link List} from an {@link Iterator}. * * @param * @param it * @return */ private static List iteratorToList(Iterator it) { List list = new LinkedList(); while (it.hasNext()) { list.add(it.next()); } return list; } private static boolean isContaining(AnnotationFS annotation, AnnotationFS containtedAnnotation) { boolean isStartContaining = annotation.getBegin() <= containtedAnnotation .getBegin(); if (!isStartContaining) { return false; } boolean isEndContaining = annotation.getEnd() >= containtedAnnotation .getEnd(); if (!isEndContaining) { return false; } return true; } /** * Creates the name spans out of a list of token annotations and a list of entity annotations. *

* The name spans for the name finder use a token index and not on a character index which * is used by the entity annotations. * * @param tokenList * @param entityAnnotations * @return */ private static Span[] createNames(List tokenList, List entityAnnotations) { List nameList = new LinkedList(); AnnotationFS currentEntity = null; int startIndex = -1; int index = 0; for (Iterator tokenIterator = tokenList.iterator(); tokenIterator.hasNext();) { AnnotationFS token = (AnnotationFS) tokenIterator.next(); for (Iterator it = entityAnnotations.iterator(); it.hasNext();) { AnnotationFS entity = (AnnotationFS) it.next(); if (!isContaining(entity, token)) { // ... end of an entity if (currentEntity == entity) { nameList.add(new Span(startIndex, index)); startIndex = -1; currentEntity = null; // break; } else { continue; } } // is this token start of new entity if (currentEntity == null && isContaining(entity, token)) { startIndex = index; currentEntity = entity; } } index++; } if (currentEntity != null) { Span name = new Span(startIndex, index); nameList.add(name); } return nameList.toArray(new Span[nameList.size()]); } /** * Process the given CAS object. 
*/ public void processCas(CAS cas) { FSIndex sentenceIndex = cas.getAnnotationIndex(sentenceType); Iterator sentenceIterator = sentenceIndex.iterator(); while (sentenceIterator.hasNext()) { AnnotationFS sentenceAnnotation = sentenceIterator.next(); ContainingConstraint sentenceContainingConstraint = new ContainingConstraint( sentenceAnnotation); FSIndex tokenAnnotations = cas.getAnnotationIndex(tokenType); Iterator containingTokens = cas.createFilteredIterator(tokenAnnotations .iterator(), sentenceContainingConstraint); FSIndex allNames = cas.getAnnotationIndex(nameType); Iterator containingNames = cas.createFilteredIterator(allNames.iterator(), sentenceContainingConstraint); List tokenList = iteratorToList(containingTokens); Span names[] = createNames(tokenList, iteratorToList(containingNames)); // create token array String tokenArray[] = new String[tokenList.size()]; for (int i = 0; i < tokenArray.length; i++) { tokenArray[i] = ((AnnotationFS) tokenList.get(i)) .getCoveredText(); } NameSample traingSentence = new NameSample(tokenArray, names, null, false); if (traingSentence.getSentence().length != 0) { nameFinderSamples.add(traingSentence); } else { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Sentence without tokens: " + sentenceAnnotation.getCoveredText()); } } } } /** * Called if the processing is finished, this method * does the training. */ public void collectionProcessComplete(ProcessTrace trace) throws ResourceProcessException, IOException { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Collected " + nameFinderSamples.size() + " name samples."); } GIS.PRINT_MESSAGES = false; // create training stream ... 
ObjectStream samples = ObjectStreamUtils.createObjectStream(nameFinderSamples); InputStream additionalTrainingDataIn = null; TokenNameFinderModel nameModel; try { if (additionalTrainingDataFile != null) { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile); } additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile); // TODO: Make encoding configurable, otherwise use UTF-8 as default! ObjectStream additionalSamples = new NameSampleDataStream( new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding))); samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); } // TODO: Make sure its possible to pass custom feature generator // User could subclass this trainer to provide a custom feature generator nameModel = NameFinderME.train(language, null, samples, Collections.EMPTY_MAP, iterations, cutoff); } finally { if (additionalTrainingDataIn != null) additionalTrainingDataIn.close(); } // dereference to allow garbage collection nameFinderSamples = null; File modelFile = new File(getUimaContextAdmin().getResourceManager() .getDataPath() + File.separatorChar + modelPath); OpennlpUtil.serialize(nameModel, modelFile); if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Model was written to: " + modelFile.getAbsolutePath()); } } /** * The trainer is not stateless. */ public boolean isStateless() { return false; } /** * Destroys the current instance. */ public void destroy() { // dereference to allow garbage collection nameFinderSamples = null; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy