opennlp.tools.namefind.NameFinderEventStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.namefind;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import opennlp.tools.ml.model.Event;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;

/**
 * Class for creating an event stream out of data files for training a name
 * finder.
 */
public class NameFinderEventStream extends opennlp.tools.util.AbstractEventStream<NameSample> {

  /** Generates the context (features) of every token; also receives the adaptive-data updates. */
  private final NameContextGenerator contextGenerator;

  /** Feature generator that is handed the additional context of the sample being processed. */
  private final AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
      new AdditionalContextFeatureGenerator();

  /** Encodes the name spans of a sample into one outcome per token. */
  private final SequenceCodec<String> codec;

  /** Type that replaces the type of every name span, or {@code null} to keep the sample's types. */
  private final String defaultType;

  /**
   * Creates a new name finder event stream using the specified data stream and context generator.
   * <p>
   * Note that this constructor registers an additional feature generator on the given
   * {@code contextGenerator}, i.e. the passed instance is modified.
   *
   * @param dataStream The data stream of events.
   * @param type null or overrides the type parameter in the provided samples
   * @param contextGenerator The context generator used to generate features for the event stream.
   *                         Must not be {@code null}.
   * @param codec The codec used to encode the name spans into outcomes; if {@code null} a
   *              {@link BioCodec} is used.
   * @throws NullPointerException if {@code contextGenerator} is {@code null}.
   */
  public NameFinderEventStream(ObjectStream<NameSample> dataStream, String type,
                               NameContextGenerator contextGenerator, SequenceCodec<String> codec) {
    super(dataStream);

    this.codec = codec != null ? codec : new BioCodec();

    this.contextGenerator =
        Objects.requireNonNull(contextGenerator, "contextGenerator must not be null");
    // NOTE(review): 8, 8 are presumably the previous/next window sizes - confirm against
    // the WindowFeatureGenerator constructor.
    this.contextGenerator.addFeatureGenerator(
        new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));

    this.defaultType = type;
  }

  /**
   * Creates a new name finder event stream that keeps the types of the samples, uses a
   * {@link DefaultNameContextGenerator} and the default codec ({@link BioCodec}).
   *
   * @param dataStream The data stream of events.
   */
  public NameFinderEventStream(ObjectStream<NameSample> dataStream) {
    this(dataStream, null, new DefaultNameContextGenerator(), null);
  }

  /**
   * Generates the name tag outcomes (start, continue, other) for each token in a sentence
   * with the specified length using the specified name spans.
   * @param names Token spans for each of the names.
   * @param type The type used for every span whose own type is {@code null}.
   * @param length The length of the sentence.
   * @return An array of start, continue, other outcomes based on the specified names and sentence length.
   *
   * @deprecated use the {@link BioCodec} implementation of the {@link SequenceCodec} instead!
   */
  @Deprecated
  public static String[] generateOutcomes(Span[] names, String type, int length) {
    String[] outcomes = new String[length];
    for (int i = 0; i < outcomes.length; i++) {
      outcomes[i] = NameFinderME.OTHER;
    }
    for (Span name : names) {
      // A span without its own type falls back to the type argument.
      String prefix = (name.getType() == null ? type : name.getType()) + "-";
      outcomes[name.getStart()] = prefix + NameFinderME.START;
      // now iterate from begin + 1 till end
      for (int i = name.getStart() + 1; i < name.getEnd(); i++) {
        outcomes[i] = prefix + NameFinderME.CONTINUE;
      }
    }
    return outcomes;
  }

  /**
   * Creates one event per outcome: the event at index {@code i} pairs {@code outcomes[i]} with
   * the context returned by {@code cg.getContext(i, sentence, outcomes, null)}. Once all events
   * are created, the adaptive data of {@code cg} is updated with the sentence and its outcomes.
   *
   * @param sentence The tokens of the sentence.
   * @param outcomes The outcome of each token; its length determines the number of events.
   * @param cg The context generator used to create the context of each event.
   * @return The list of created events, in token order.
   */
  public static List<Event> generateEvents(String[] sentence, String[] outcomes,
                                           NameContextGenerator cg) {
    List<Event> events = new ArrayList<>(outcomes.length);
    for (int i = 0; i < outcomes.length; i++) {
      events.add(new Event(outcomes[i], cg.getContext(i, sentence, outcomes, null)));
    }

    cg.updateAdaptiveData(sentence, outcomes);

    return events;
  }

  /**
   * Creates the events of a single sample: clears the adaptive data if the sample requests it,
   * optionally overrides the type of the name spans, encodes the spans into outcomes with the
   * codec and generates one event per outcome.
   *
   * @param sample The sample to create the events for.
   * @return An iterator over the events of the sample.
   */
  @Override
  protected Iterator<Event> createEvents(NameSample sample) {

    if (sample.isClearAdaptiveDataSet()) {
      contextGenerator.clearAdaptiveData();
    }

    Span[] names = sample.getNames();
    if (this.defaultType != null) {
      names = overrideType(names);
    }

    // Fetch the sentence once instead of once per token.
    String[] sentence = sample.getSentence();

    String[] outcomes = codec.encode(names, sentence.length);
    additionalContextFeatureGenerator.setCurrentContext(sample.getAdditionalContext());

    // Work on a copy so the array handed to the context generator is not the sample's own array.
    String[] tokens = sentence.clone();

    return generateEvents(tokens, outcomes, contextGenerator).iterator();
  }

  /**
   * Returns copies of the given spans that carry the default type of this stream. The passed
   * array is left untouched, in case it is shared with the sample it came from.
   *
   * @param names The spans to re-type.
   * @return A new array with spans of identical start, end and probability but the default type.
   */
  private Span[] overrideType(Span[] names) {
    Span[] retyped = new Span[names.length];
    for (int i = 0; i < names.length; i++) {
      Span n = names[i];
      retyped[i] = new Span(n.getStart(), n.getEnd(), this.defaultType, n.getProb());
    }
    return retyped;
  }

  /**
   * Generates previous decision features for each token based on contents of the specified map.
   * Each token yields exactly one feature of the form {@code "pd=" + decision}; a token without
   * an entry in the map yields {@code "pd=null"}.
   *
   * @param tokens The tokens for which the context is generated.
   * @param prevMap A mapping of tokens to their previous decisions.
   * @return An additional context array with features for each token.
   */
  public static String[][] additionalContext(String[] tokens, Map<String, String> prevMap) {
    String[][] ac = new String[tokens.length][1];
    for (int ti = 0; ti < tokens.length; ti++) {
      String pt = prevMap.get(tokens[ti]);
      ac[ti][0] = "pd=" + pt;
    }
    return ac;
  }
}