chalk.uima.namefind.NameFinderTrainer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of chalk Show documentation
chalk
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */ 

package chalk.uima.namefind;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;


import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

import nak.maxent.GIS;
import chalk.tools.cmdline.namefind.TokenNameFinderTrainerTool;
import chalk.tools.namefind.NameFinderME;
import chalk.tools.namefind.NameSample;
import chalk.tools.namefind.NameSampleDataStream;
import chalk.tools.namefind.TokenNameFinderModel;
import chalk.tools.util.ObjectStream;
import chalk.tools.util.ObjectStreamUtils;
import chalk.tools.util.PlainTextByLineStream;
import chalk.tools.util.Span;
import chalk.tools.util.TrainingParameters;
import chalk.uima.util.CasConsumerUtil;
import chalk.uima.util.ContainingConstraint;
import chalk.uima.util.OpennlpUtil;
import chalk.uima.util.SampleTraceStream;
import chalk.uima.util.UimaUtil;

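/**
 * OpenNLP NameFinder trainer.
 * <p>
 * Mandatory parameters
 * <table border=1>
 *   <tr><th>Type</th> <th>Name</th> <th>Description</th></tr>
 *   <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.Language</td> <td>The language code</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.TokenType</td> <td>The full name of the token type</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.NameType</td> <td>The full name of the name type</td></tr>
 * </table>
 * <p>
 * Optional parameters
 * <table border=1>
 *   <tr><th>Type</th> <th>Name</th> <th>Description</th></tr>
 *   <tr><td>String</td> <td>opennlp.uima.TrainingParamsFile</td> <td>Training Parameters Properties file</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorFile</td> <td>Feature Generator definition file which contains the feature generator configuration</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorResources</td> <td>Feature Generator resources directory</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataFile</td> <td>Training file which contains additional data in the OpenNLP format</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataEncoding</td> <td>Encoding of the additional training data; required when the additional training data file is set</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.SampleTraceFile</td> <td>All training samples are traced to this file</td></tr>
 *   <tr><td>String</td> <td>opennlp.uima.SampleTraceFileEncoding</td> <td>Encoding of the sample trace file; required when the sample trace file is set</td></tr>
 * </table>
 */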
public final class NameFinderTrainer extends CasConsumer_ImplBase {

  private static final String FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER = "opennlp.uima.FeatureGeneratorFile";
  private static final String FEATURE_GENERATOR_RESOURCES_PARAMETER = "opennlp.uima.FeatureGeneratorResources";
  private static final String SAMPLE_TRACE_FILE_PARAMETER = "opennlp.uima.SampleTraceFile";
  private static final String SAMPLE_TRACE_FILE_ENCODING_PARAMETER = "opennlp.uima.SampleTraceFileEncoding";

  private Logger logger;

  /** Model file name, resolved against the resource manager data path when the model is written. */
  private String modelPath;

  /** Raw bytes of the feature generator definition file, or null if none was configured. */
  private byte[] featureGeneratorDefinition;

  /** Directory with feature generator resources, or null if none was configured. */
  private File featureGeneratorResourceDir;

  /** Optional file with additional training data, or null. */
  private String additionalTrainingDataFile;

  /** Encoding of the additional training data; only read when the file is configured. */
  private String additionalTrainingDataEncoding;

  /** Optional file all training samples are traced to, or null. */
  private File sampleTraceFile = null;

  /** Encoding of the sample trace file; only read when the trace file is configured. */
  private String sampleTraceFileEncoding = null;

  private Type sentenceType;

  private Type tokenType;

  private Type nameType;

  private String language;

  // TODO: Keeping all events in memory limits the size of the training corpus
  // Possible solutions:
  // - Write all events to disk
  // - Directly start indexing with a blocking sample stream, the indexer will then write everything
  //   to disk or could store the events much more space efficient in memory

  /** Samples collected by processCas; set to null once training ran or the instance is destroyed. */
  private List<NameSample> nameFinderSamples = new ArrayList<NameSample>();

  private TrainingParameters trainingParams;

  /**
   * Initializes the current instance by reading the configuration parameters
   * from the UIMA context.
   *
   * @throws ResourceInitializationException if the feature generator definition
   *     file cannot be read; parameter lookups may presumably raise it as well
   */
  public void initialize() throws ResourceInitializationException {

    super.initialize();

    logger = getUimaContext().getLogger();

    if (logger.isLoggable(Level.INFO)) {
      logger.log(Level.INFO, "Initializing the OpenNLP Name Trainer.");
    }

    modelPath = CasConsumerUtil.getRequiredStringParameter(getUimaContext(),
        UimaUtil.MODEL_PARAMETER);

    language = CasConsumerUtil.getRequiredStringParameter(getUimaContext(),
        UimaUtil.LANGUAGE_PARAMETER);

    trainingParams = OpennlpUtil.loadTrainingParams(CasConsumerUtil.getOptionalStringParameter(
        getUimaContext(), UimaUtil.TRAINING_PARAMS_FILE_PARAMETER), true);

    String featureGeneratorDefinitionFile = CasConsumerUtil.getOptionalStringParameter(
        getUimaContext(), FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER);

    // The resources directory is only looked up when a definition file is configured.
    if (featureGeneratorDefinitionFile != null) {
      try {
        featureGeneratorDefinition = OpennlpUtil.loadBytes(new File(featureGeneratorDefinitionFile));
      } catch (IOException e) {
        throw new ResourceInitializationException(e);
      }

      String featureGeneratorResourcesDirName = CasConsumerUtil.getOptionalStringParameter(
          getUimaContext(), FEATURE_GENERATOR_RESOURCES_PARAMETER);

      if (featureGeneratorResourcesDirName != null) {
        featureGeneratorResourceDir = new File(featureGeneratorResourcesDirName);
      }
    }

    additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(
        getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);

    // If the additional training data is specified, the encoding must be provided!
    if (additionalTrainingDataFile != null) {
      additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(
          getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING);
    }

    String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(
        getUimaContext(), SAMPLE_TRACE_FILE_PARAMETER);

    // Likewise, a sample trace file requires its encoding to be configured.
    if (sampleTraceFileName != null) {
      sampleTraceFile = new File(getUimaContextAdmin().getResourceManager()
          .getDataPath() + File.separatorChar + sampleTraceFileName);
      sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(
          getUimaContext(), SAMPLE_TRACE_FILE_ENCODING_PARAMETER);
    }
  }

  /**
   * Initialize the current instance with the given type system by resolving
   * the configured sentence, token and name types.
   *
   * @param typeSystem the type system the configured type names are looked up in
   */
  public void typeSystemInit(TypeSystem typeSystem)
      throws ResourceInitializationException {

    String sentenceTypeName =
        CasConsumerUtil.getRequiredStringParameter(getUimaContext(),
        UimaUtil.SENTENCE_TYPE_PARAMETER);

    sentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName);

    String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(),
        UimaUtil.TOKEN_TYPE_PARAMETER);

    tokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName);

    String nameTypeName = CasConsumerUtil.getRequiredStringParameter(getUimaContext(),
        NameFinder.NAME_TYPE_PARAMETER);

    nameType = CasConsumerUtil.getType(typeSystem, nameTypeName);
  }

  /**
   * Creates a {@link List} from an {@link Iterator}.
   *
   * @param <T> the element type
   * @param it the iterator to drain; it is exhausted afterwards
   * @return a new list with the remaining elements of the iterator in iteration order
   */
  private static <T> List<T> iteratorToList(Iterator<T> it) {
    // ArrayList, because callers access the result by index.
    List<T> list = new ArrayList<T>();

    while (it.hasNext()) {
      list.add(it.next());
    }

    return list;
  }

  /**
   * Checks whether the character range of one annotation encloses the range of another.
   *
   * @param annotation the potentially enclosing annotation
   * @param containedAnnotation the potentially enclosed annotation
   * @return true if the begin and end of {@code containedAnnotation} lie within
   *     (or on the bounds of) {@code annotation}
   */
  private static boolean isContaining(AnnotationFS annotation,
      AnnotationFS containedAnnotation) {
    return annotation.getBegin() <= containedAnnotation.getBegin()
        && annotation.getEnd() >= containedAnnotation.getEnd();
  }

  /**
   * Creates the name spans out of a list of token annotations and a list of entity annotations.
   *
   * The name spans for the name finder use a token index and not a character index, which
   * is used by the entity annotations.
   *
   * @param tokenList the tokens of one sentence, in order
   * @param entityAnnotations the entity annotations of the same sentence
   * @return the spans (token begin index inclusive, token end index exclusive) of the entities
   */
  private static Span[] createNames(List<AnnotationFS> tokenList, List<AnnotationFS> entityAnnotations) {

    List<Span> nameList = new ArrayList<Span>();

    // At most one entity is tracked as "open" at any time.
    AnnotationFS currentEntity = null;

    int startIndex = -1;
    int index = 0;
    for (AnnotationFS token : tokenList) {
      for (AnnotationFS entity : entityAnnotations) {

        if (!isContaining(entity, token)) {
          // ... end of an entity: the open entity no longer covers the token
          if (currentEntity == entity) {
            nameList.add(new Span(startIndex, index));

            startIndex = -1;
            currentEntity = null;
          }

          // An entity that does not contain the token can never start here.
          continue;
        }

        // is this token start of new entity
        if (currentEntity == null) {
          startIndex = index;

          currentEntity = entity;
        }
      }

      index++;
    }

    // An entity which is still open reaches to the end of the token list.
    if (currentEntity != null) {
      nameList.add(new Span(startIndex, index));
    }

    return nameList.toArray(new Span[nameList.size()]);
  }

  /**
   * Process the given CAS object: every sentence with at least one token
   * is turned into a {@link NameSample} and collected for training.
   *
   * @param cas the CAS holding the sentence, token and name annotations
   */
  public void processCas(CAS cas) {
    FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(sentenceType);

    // Only the first sample collected from each CAS carries the clear-adaptive-data flag.
    boolean isClearAdaptiveData = true;

    for (AnnotationFS sentenceAnnotation : sentenceIndex) {
      ContainingConstraint sentenceContainingConstraint = new ContainingConstraint(
          sentenceAnnotation);

      FSIndex<AnnotationFS> tokenAnnotations = cas.getAnnotationIndex(tokenType);

      Iterator<AnnotationFS> containingTokens = cas.createFilteredIterator(tokenAnnotations
          .iterator(), sentenceContainingConstraint);

      FSIndex<AnnotationFS> allNames = cas.getAnnotationIndex(nameType);

      Iterator<AnnotationFS> containingNames = cas.createFilteredIterator(allNames.iterator(),
          sentenceContainingConstraint);

      List<AnnotationFS> tokenList = iteratorToList(containingTokens);

      Span[] names = createNames(tokenList, iteratorToList(containingNames));

      // create token array
      String[] tokenArray = new String[tokenList.size()];

      for (int i = 0; i < tokenArray.length; i++) {
        tokenArray[i] = tokenList.get(i).getCoveredText();
      }

      NameSample trainingSentence = new NameSample(tokenArray, names, null, isClearAdaptiveData);

      if (trainingSentence.getSentence().length != 0) {
        nameFinderSamples.add(trainingSentence);

        isClearAdaptiveData = false;
      } else {
        if (logger.isLoggable(Level.INFO)) {
          logger.log(Level.INFO, "Sentence without tokens: " +
              sentenceAnnotation.getCoveredText());
        }
      }
    }
  }

  /**
   * Called if the processing is finished, this method
   * does the training and writes the resulting model to the model file.
   *
   * @param trace the process trace; not used by this method
   * @throws IOException if reading the additional training data, writing the
   *     sample trace or closing one of these streams fails
   */
  public void collectionProcessComplete(ProcessTrace trace)
      throws ResourceProcessException, IOException {

    if (logger.isLoggable(Level.INFO)) {
      logger.log(Level.INFO, "Collected " + nameFinderSamples.size() +
          " name samples.");
    }

    GIS.PRINT_MESSAGES = false;

    // create training stream ...
    ObjectStream<NameSample> samples = ObjectStreamUtils.createObjectStream(nameFinderSamples);

    InputStream additionalTrainingDataIn = null;
    Writer samplesOut = null;
    TokenNameFinderModel nameModel;
    try {
      if (additionalTrainingDataFile != null) {

        if (logger.isLoggable(Level.INFO)) {
          logger.log(Level.INFO, "Using additional training data file: " + additionalTrainingDataFile);
        }

        additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile);

        ObjectStream<NameSample> additionalSamples = new NameSampleDataStream(
            new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding)));

        // The collected samples come first, followed by the additional ones.
        samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
      }

      if (sampleTraceFile != null) {
        FileOutputStream traceFileOut = new FileOutputStream(sampleTraceFile);
        try {
          samplesOut = new OutputStreamWriter(traceFileOut, sampleTraceFileEncoding);
        } finally {
          // The writer could not be created (e.g. unsupported encoding):
          // nothing else owns the file stream, so it must be closed here.
          if (samplesOut == null) {
            traceFileOut.close();
          }
        }

        samples = new SampleTraceStream<NameSample>(samples, samplesOut);
      }

      Map<String, Object> resourceMap;

      if (featureGeneratorResourceDir != null) {
        resourceMap = TokenNameFinderTrainerTool.loadResources(featureGeneratorResourceDir);
      }
      else {
        resourceMap = Collections.emptyMap();
      }

      nameModel = NameFinderME.train(language, null,
          samples, trainingParams, featureGeneratorDefinition, resourceMap);
    }
    finally {
      // Nested so that a failure to close the input cannot skip closing the trace writer.
      try {
        if (additionalTrainingDataIn != null) {
          additionalTrainingDataIn.close();
        }
      } finally {
        if (samplesOut != null) {
          samplesOut.close();
        }
      }
    }

    // dereference to allow garbage collection
    nameFinderSamples = null;

    File modelFile = new File(getUimaContextAdmin().getResourceManager()
        .getDataPath() + File.separatorChar + modelPath);

    OpennlpUtil.serialize(nameModel, modelFile);

    if (logger.isLoggable(Level.INFO)) {
      logger.log(Level.INFO, "Model was written to: " + modelFile.getAbsolutePath());
    }
  }

  /**
   * The trainer is not stateless.
   *
   * @return always false
   */
  public boolean isStateless() {
    return false;
  }

  /**
   * Destroys the current instance.
   */
  public void destroy() {
    // dereference to allow garbage collection
    nameFinderSamples = null;
  }
}
Type	Name	Description
String	opennlp.uima.ModelName	The name of the model file
String	opennlp.uima.Language	The language code
String	opennlp.uima.SentenceType	The full name of the sentence type
String	opennlp.uima.TokenType	The full name of the token type
String	opennlp.uima.NameType	The full name of the name type
Type	Name	Description
String	opennlp.uima.TrainingParamsFile	Training Parameters Properties file
String	opennlp.uima.FeatureGeneratorFile	Feature Generator definition file which contains the feature generator configuration
String	opennlp.uima.FeatureGeneratorResources	Feature Generator resources directory
String	opennlp.uima.AdditionalTrainingDataFile	Training file which contains additional data in the OpenNLP format
String	opennlp.uima.AdditionalTrainingDataEncoding	Encoding of the additional training data
String	opennlp.uima.SampleTraceFile	All training samples are traced to this file
String	opennlp.uima.SampleTraceFileEncoding	Encoding of the sample trace file