opennlp.tools.formats.muc.MucNameContentHandler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.muc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;

public class MucNameContentHandler extends SgmlParser.ContentHandler {

  private static final String ENTITY_ELEMENT_NAME = "ENAMEX";
  private static final String TIME_ELEMENT_NAME = "TIMEX";
  private static final String NUM_ELEMENT_NAME = "NUMEX";

  private static final Set NAME_ELEMENT_NAMES;

  private static final Set EXPECTED_TYPES;

  static {
    EXPECTED_TYPES = Set.of("PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PERCENT");
    NAME_ELEMENT_NAMES = Set.of(ENTITY_ELEMENT_NAME, TIME_ELEMENT_NAME, NUM_ELEMENT_NAME);
  }

  private final Tokenizer tokenizer;
  private final List storedSamples;

  private boolean isInsideContentElement = false;
  private final List text = new ArrayList<>();
  private boolean isClearAdaptiveData = false;
  private final Stack incompleteNames = new Stack<>();

  private final List names = new ArrayList<>();

  /**
   * Initializes a {@link MucNameContentHandler}.
   *
   * @param tokenizer The {@link Tokenizer} to use. Must not be {@code null}.
   * @param storedSamples The {@link List samples} as input.
   *                      Must not be {@code null}.
   */
  public MucNameContentHandler(Tokenizer tokenizer, List storedSamples) {
    this.tokenizer = tokenizer;
    this.storedSamples = storedSamples;
  }

  @Override
  public void startElement(String name, Map attributes)
      throws InvalidFormatException {

    if (MucElementNames.DOC_ELEMENT.equals(name)) {
      isClearAdaptiveData = true;
    }

    if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
      isInsideContentElement = true;
    }

    if (NAME_ELEMENT_NAMES.contains(name)) {

      String nameType = attributes.get("TYPE");

      if (!EXPECTED_TYPES.contains(nameType)) {
        throw new InvalidFormatException("Unknown timex, numex or namex type: "
            + nameType + ", expected one of " + EXPECTED_TYPES);
      }

      incompleteNames.add(new Span(text.size(), text.size(), nameType.toLowerCase(Locale.ENGLISH)));
    }
  }

  @Override
  public void characters(CharSequence chars) {
    if (isInsideContentElement) {
      String[] tokens = tokenizer.tokenize(chars.toString());
      text.addAll(Arrays.asList(tokens));
    }
  }

  @Override
  public void endElement(String name) {

    if (NAME_ELEMENT_NAMES.contains(name)) {
      Span nameSpan = incompleteNames.pop();
      nameSpan = new Span(nameSpan.getStart(), text.size(), nameSpan.getType());
      names.add(nameSpan);
    }

    if (MucElementNames.CONTENT_ELEMENTS.contains(name)) {
      storedSamples.add(new NameSample(text.toArray(new String[0]),
          names.toArray(new Span[0]), isClearAdaptiveData));

      if (isClearAdaptiveData) {
        isClearAdaptiveData = false;
      }

      text.clear();
      names.clear();
      isInsideContentElement = false;
    }
  }
}