opennlp.tools.formats.ontonotes.OntoNotesNameSampleStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.ontonotes;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;

/**
 * Name Sample Stream parser for the OntoNotes 4.0 corpus.
 */
public class OntoNotesNameSampleStream extends
    FilterObjectStream {

  private final Map tokenConversionMap;

  private List nameSamples = new LinkedList<>();

  public OntoNotesNameSampleStream(ObjectStream samples) {
    super(samples);

    Map tokenConversionMap = new HashMap<>();
    tokenConversionMap.put("-LRB-", "(");
    tokenConversionMap.put("-RRB-", ")");
    tokenConversionMap.put("-LSB-", "[");
    tokenConversionMap.put("-RSB-", "]");
    tokenConversionMap.put("-LCB-", "{");
    tokenConversionMap.put("-RCB-", "}");
    tokenConversionMap.put("-AMP-", "&");
    this.tokenConversionMap = Collections.unmodifiableMap(tokenConversionMap);
  }

  private String convertToken(String token) {

    StringBuilder convertedToken = new StringBuilder(token);

    int startTagEndIndex = convertedToken.indexOf(">");

    if (token.contains("=\"") && startTagEndIndex != -1) {
      convertedToken.delete(0, startTagEndIndex + 1);
    }

    int endTagBeginIndex = convertedToken.indexOf("<");
    int endTagEndIndex = convertedToken.indexOf(">");

    if (endTagBeginIndex != -1 && endTagEndIndex != -1) {
      convertedToken.delete(endTagBeginIndex, endTagEndIndex + 1);
    }

    String cleanedToken = convertedToken.toString();

    if (tokenConversionMap.get(cleanedToken) != null) {
      cleanedToken = tokenConversionMap.get(cleanedToken);
    }

    return cleanedToken;
  }

  public NameSample read() throws IOException {

    if (nameSamples.isEmpty()) {
      String doc = samples.read();

      if (doc != null) {
        BufferedReader docIn = new BufferedReader(new StringReader(doc));

        boolean clearAdaptiveData = true;

        String line;
        while ((line = docIn.readLine()) != null) {

          if (line.startsWith("")) {
            break;
          }

          String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);

          List entities = new LinkedList<>();
          List cleanedTokens = new ArrayList<>(tokens.length);

          int tokenIndex = 0;
          int entityBeginIndex = -1;
          String entityType = null;
          boolean insideStartEnmaxTag = false;
          for (String token : tokens) {

            // Split here, next part of tag is in new token
            if (token.startsWith("")) {
                entityBeginIndex = tokenIndex;
                insideStartEnmaxTag = false;
              } else {
                continue;
              }
            }

            if (token.endsWith("")) {
              entities.add(new Span(entityBeginIndex, tokenIndex + 1,
                  entityType));
              entityBeginIndex = -1;
            }

            cleanedTokens.add(convertToken(token));
            tokenIndex++;
          }

          nameSamples.add(new NameSample(cleanedTokens
              .toArray(new String[cleanedTokens.size()]), entities
              .toArray(new Span[entities.size()]), clearAdaptiveData));

          clearAdaptiveData = false;
        }
      }
    }

    if (!nameSamples.isEmpty()) {
      return nameSamples.remove(0);
    } else {
      return null;
    }
  }
}