/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.namefind;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectStreamException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
import opennlp.tools.util.featuregen.CachedFeatureGenerator;
import opennlp.tools.util.featuregen.OutcomePriorFeatureGenerator;
import opennlp.tools.util.featuregen.PreviousMapFeatureGenerator;
import opennlp.tools.util.featuregen.SentenceFeatureGenerator;
import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;
import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;

/**
 * Class for creating a maximum-entropy-based name finder.
 */
public class NameFinderME implements TokenNameFinder {

  private static String[][] EMPTY = new String[0][0];
  public static final int DEFAULT_BEAM_SIZE = 3;
  private static final Pattern typedOutcomePattern = Pattern.compile("(.+)-\\w+");

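  /**
   * Sequence validator used during beam search decoding: a "cont" outcome is
   * invalid at the beginning of a sequence, after an "other" outcome, or after
   * a "cont" outcome of a different name type (e.g. "person-cont" may not
   * follow "organization-cont").
   */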
  private static class NameFinderSequenceValidator implements
      SequenceValidator {
    
    public boolean validSequence(int i, String[] inputSequence,
        String[] outcomesSequence, String outcome) {
      
      // outcome is formatted like "cont" or "sometype-cont", so we
      // can check if it ends with "cont".
      if (outcome.endsWith(CONTINUE)) {
        
        int li = outcomesSequence.length - 1;
        
        if (li == -1) {
          return false;
        } else if (outcomesSequence[li].endsWith(OTHER)) {
          return false;
        } else if (outcomesSequence[li].endsWith(CONTINUE)) {
          // if it is continue, we have to check if previous match was of the same type 
          String previousNameType = extractNameType(outcomesSequence[li]);
          String nameType = extractNameType(outcome);
          if( previousNameType != null || nameType != null ) {
            if( nameType != null ) {
              if( nameType.equals(previousNameType) ){
                return true;
              }
            }
            return false; // outcomes types are not equal
          }
        }
      }
      return true;
    }
  }

  public static final String START = "start";
  public static final String CONTINUE = "cont";
  public static final String OTHER = "other";

  protected MaxentModel model;
  protected NameContextGenerator contextGenerator;
  private Sequence bestSequence;
  private BeamSearch beam;

  private AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
      new AdditionalContextFeatureGenerator();

  public NameFinderME(TokenNameFinderModel model) {
    this(model, DEFAULT_BEAM_SIZE);
  }

  /**
   * Initializes the name finder with the specified model.
   *
   * @param model the model to be used to find names
   * @param generator the feature generator to be used, or null to use the default feature generation
   * @param beamSize the size of the beam to be used in decoding
   */
  public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator generator, int beamSize) {
    this.model = model.getNameFinderModel();

    if (generator != null) 
      contextGenerator = new DefaultNameContextGenerator(generator);
    else
      contextGenerator = new DefaultNameContextGenerator(createFeatureGenerator());
    
    contextGenerator.addFeatureGenerator(
          new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
    
    beam = new BeamSearch(beamSize, contextGenerator, this.model,
        new NameFinderSequenceValidator(), beamSize);
  }

  public NameFinderME(TokenNameFinderModel model, int beamSize) {
    this(model, null, beamSize);
  }
  
  
  /**
   * Creates a new name finder with the specified model.
   * 
   * @param mod The model to be used to find names.
   * 
   * @deprecated Use the new model API! 
   */
  @Deprecated
  public NameFinderME(MaxentModel mod) {
    this(mod, new DefaultNameContextGenerator(), DEFAULT_BEAM_SIZE);
  }

  /**
   * Creates a new name finder with the specified model and context generator.
   * 
   * @param mod The model to be used to find names.
   * @param cg The context generator to be used with this name finder.
   */
  @Deprecated
  public NameFinderME(MaxentModel mod, NameContextGenerator cg) {
    this(mod, cg, DEFAULT_BEAM_SIZE);
  }

  /**
   * Creates a new name finder with the specified model and context generator.
   * 
   * @param mod The model to be used to find names.
   * @param cg The context generator to be used with this name finder.
   * @param beamSize The size of the beam to be used in decoding this model.
   */
  @Deprecated
  public NameFinderME(MaxentModel mod, NameContextGenerator cg, int beamSize) {
    model = mod;
    contextGenerator = cg;

    contextGenerator.addFeatureGenerator(new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
    beam = new BeamSearch(beamSize, cg, mod,
        new NameFinderSequenceValidator(), beamSize);
  }

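  /**
   * Creates the default feature generators: a two-token window of tokens and
   * of token classes, the outcome prior, outcomes previously assigned to the
   * same tokens, token bigrams, and a sentence-begin marker, all wrapped in a
   * cache.
   */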
  private static AdaptiveFeatureGenerator createFeatureGenerator() {
   return new CachedFeatureGenerator(
         new AdaptiveFeatureGenerator[]{
           new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
           new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 2),
           new OutcomePriorFeatureGenerator(),
           new PreviousMapFeatureGenerator(),
           new BigramNameFeatureGenerator(),
           new SentenceFeatureGenerator(true, false)
           });
  }
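  // A custom feature generator can be passed to the three-argument constructor
  // instead of the defaults created above; the same feature generation must be
  // used for training and for tagging. A minimal sketch, assuming a trained
  // model is already available in a variable named nerModel (hypothetical):
  //
  //   AdaptiveFeatureGenerator customGenerator = new CachedFeatureGenerator(
  //       new AdaptiveFeatureGenerator[] {
  //           new WindowFeatureGenerator(new TokenFeatureGenerator(), 3, 3),
  //           new OutcomePriorFeatureGenerator(),
  //           new SentenceFeatureGenerator(true, false)
  //       });
  //   NameFinderME customFinder = new NameFinderME(nerModel, customGenerator, DEFAULT_BEAM_SIZE);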
  
  public Span[] find(String[] tokens) {
    return find(tokens, EMPTY);
  }
  
  /** 
   * Generates name tags for the given sequence, typically a sentence, 
   * returning token spans for any identified names.
   * 
   * @param tokens an array of the tokens or words of the sequence,
   *     typically a sentence.
   * @param additionalContext features which are based on context outside
   *     of the sentence but which should also be used.
   * 
   * @return an array of spans for each of the names identified.
   */
  public Span[] find(String[] tokens, String[][] additionalContext) {
    additionalContextFeatureGenerator.setCurrentContext(additionalContext);
    bestSequence = beam.bestSequence(tokens, additionalContext);
    List<String> c = bestSequence.getOutcomes();

    contextGenerator.updateAdaptiveData(tokens, (String[]) c.toArray(new String[c.size()]));

    int start = -1;
    int end = -1;
    List<Span> spans = new ArrayList<Span>(tokens.length);
    for (int li = 0; li < c.size(); li++) {
      String chunkTag = (String) c.get(li);
      if (chunkTag.endsWith(NameFinderME.START)) {
        if (start != -1) {
          spans.add(new Span(start, end, extractNameType(chunkTag)));
        }

        start = li;
        end = li + 1;

      }
      else if (chunkTag.endsWith(NameFinderME.CONTINUE)) {
        end = li + 1;
      }
      else if (chunkTag.endsWith(NameFinderME.OTHER)) {
        if (start != -1) {
          spans.add(new Span(start, end, extractNameType(c.get(li - 1))));
          start = -1;
          end = -1;
        }
      }
    }

    if (start != -1) {
      spans.add(new Span(start, end, extractNameType(c.get(c.size() - 1))));
    }

    return spans.toArray(new Span[spans.size()]);
  }
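  // A minimal usage sketch; the model file name "en-ner-person.bin" below is a
  // hypothetical example. Each call to find tags one tokenized sentence, and the
  // adaptive data is cleared once per document:
  //
  //   TokenNameFinderModel nerModel =
  //       new TokenNameFinderModel(new FileInputStream("en-ner-person.bin"));
  //   NameFinderME finder = new NameFinderME(nerModel);
  //
  //   String[] sentence = {"Pierre", "Vinken", "is", "61", "years", "old", "."};
  //   Span[] names = finder.find(sentence);
  //   double[] nameProbs = finder.probs(names);
  //
  //   finder.clearAdaptiveData(); // after the last sentence of a document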

  /**
   * Forgets all adaptive data which was collected during previous
   * calls to one of the find methods.
   *
   * This method is typically called at the end of a document.
   */
  public void clearAdaptiveData() {
   contextGenerator.clearAdaptiveData();
  }

  /**
   * Populates the specified array with the probabilities of the last decoded
   * sequence. The sequence was determined based on the previous call to
   * find. The specified array should be at least as large as
   * the number of tokens in the previous call to find.
   *
   * @param probs
   *          An array used to hold the probabilities of the last decoded
   *          sequence.
   */
   public void probs(double[] probs) {
     bestSequence.getProbs(probs);
   }

  /**
    * Returns an array with the probabilities of the last decoded sequence. The
    * sequence was determined based on the previous call to find.
    * 
    * @return An array with the same number of probabilities as tokens were sent to find
    * when it was last called.
    */
   public double[] probs() {
     return bestSequence.getProbs();
   }

   /**
    * Returns an array of probabilities for each of the specified spans; each value is
    * the average of the probabilities of the outcomes which make up the span.
    * 
    * @param spans The spans of the names for which probabilities are desired.
    * 
    * @return an array of probabilities for each of the specified spans.
    */
   public double[] probs(Span[] spans) {
     
     double[] sprobs = new double[spans.length];
     double[] probs = bestSequence.getProbs();
     
     for (int si = 0; si < spans.length; si++) {
       
       double p = 0;
       
       // sum the outcome probabilities over the span and normalize by its length
       for (int oi = spans[si].getStart(); oi < spans[si].getEnd(); oi++) {
         p += probs[oi];
       }
       
       p /= spans[si].length();
       
       sprobs[si] = p;
     }
     
     return sprobs;
   }
   
   public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples, 
       AdaptiveFeatureGenerator generator, final Map<String, Object> resources, 
       int iterations, int cutoff) throws IOException {
     
     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
     ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
     
     AdaptiveFeatureGenerator featureGenerator;
     
     if (generator != null)
       featureGenerator = generator;
     else 
       featureGenerator = createFeatureGenerator();
     
     EventStream eventStream = new NameFinderEventStream(samples, type,
         new DefaultNameContextGenerator(featureGenerator));
     HashSumEventStream hses = new HashSumEventStream(eventStream);
     AbstractModel nameFinderModel = GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
     
     manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
         hses.calculateHashSum().toString(16));
     
     return new TokenNameFinderModel(languageCode, nameFinderModel,
         resources, manifestInfoEntries);
   }

   public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples, 
       final Map<String, Object> resources, int iterations, int cutoff) throws IOException {
     return train(languageCode, type, samples, null, resources, iterations, cutoff);
   }
   
   public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
       final Map<String, Object> resources) throws IOException {
     return NameFinderME.train(languageCode, type, samples, resources, 100, 5);
   }
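   // A training sketch; the file names "en-ner-person.train" and "en-ner-person.bin"
   // below are hypothetical examples. The training data is expected in the OpenNLP
   // name finder format, one sentence per line:
   //
   //   ObjectStream<NameSample> samples = new NameSampleDataStream(
   //       new PlainTextByLineStream(new InputStreamReader(
   //           new FileInputStream("en-ner-person.train"), "UTF-8")));
   //   TokenNameFinderModel trainedModel = NameFinderME.train("en", "person", samples,
   //       Collections.<String, Object>emptyMap(), 100, 5);
   //   trainedModel.serialize(new FileOutputStream("en-ner-person.bin"));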
   
  @Deprecated
  public static GISModel train(EventStream es, int iterations, int cut) throws IOException {
    return GIS.trainModel(iterations, new TwoPassDataIndexer(es, cut));
  }
  
  /**
   * Gets the name type from the outcome, e.g. "person" for the outcome "person-start".
   *
   * @param outcome the outcome
   * @return the name type, or null if not set
   */
  private static final String extractNameType(String outcome) {
    Matcher matcher = typedOutcomePattern.matcher(outcome);
    if(matcher.matches()) {
      String nameType = matcher.group(1);
      return nameType;
    }
    
    return null;
  }

  /**
   * Removes spans which are intersecting or crossing in any way.
   * 
   * The following rules are used to remove the spans:
   * Identical spans: The first span in the array after sorting it remains
   * Intersecting spans: The first span after sorting remains
   * Contained spans: All spans which are contained by another are removed
   * 
   * @param spans the spans to check for overlaps
   * 
   * @return the non-overlapping spans
   */
  public static Span[] dropOverlappingSpans(Span spans[]) {

    List<Span> sortedSpans = new ArrayList<Span>(spans.length);
    Collections.addAll(sortedSpans, spans);
    Collections.sort(sortedSpans);

    Iterator<Span> it = sortedSpans.iterator();

    Span lastSpan = null;

    while (it.hasNext()) {
      Span span = it.next();

      if (lastSpan != null) {
        // keep the span that sorts first; drop the later one of an overlapping pair
        if (lastSpan.intersects(span)) {
          it.remove();
          span = lastSpan;
        }
      }

      lastSpan = span;
    }

    return sortedSpans.toArray(new Span[sortedSpans.size()]);
  }

  /**
   * Trains a new named entity model on the specified training file using the specified encoding to read it in.
   *
   * @param args [-encoding encoding] training_file model_file
   *
   * @throws java.io.IOException
   */
  @Deprecated
  public static void main(String[] args) throws IOException {

    // Encoding must be specified !!!
    // -encoding code train.file model.file
    if (args.length == 4) {

      NameSampleDataStream sampleStream = new NameSampleDataStream(
          new PlainTextByLineStream(new InputStreamReader(new FileInputStream(args[2]), args[1])));

      TokenNameFinderModel model = NameFinderME.train("x-unspecified", "default",
          sampleStream, new HashMap<String, Object>());

      model.serialize(new FileOutputStream(args[3]));
    }
    else {
      // TODO: Usage
    }
  }
}




