All downloads are free. Search and download functionality uses the official Maven repository.

opennlp.tools.formats.brat.BratNameSampleStreamFactory Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.brat;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;

import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.sentdetect.NewlineSentenceDetector;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;

/**
 * Stream factory that builds a {@link BratNameSampleStream} from command line
 * arguments and registers itself for {@link NameSample} under the format name
 * {@code "brat"}.
 */
public class BratNameSampleStreamFactory
        extends AbstractSampleStreamFactory {

  /**
   * Command line parameters understood by this factory. All parameters except
   * the brat data directory and the annotation configuration are optional.
   */
  interface Parameters {
    @ParameterDescription(valueName = "bratDataDir", description = "location of brat data dir")
    File getBratDataDir();

    @ParameterDescription(valueName = "annConfFile")
    File getAnnotationConfig();

    @ParameterDescription(valueName = "modelFile")
    @OptionalParameter
    File getSentenceDetectorModel();

    @ParameterDescription(valueName = "modelFile")
    @OptionalParameter
    File getTokenizerModel();

    @ParameterDescription(valueName = "name")
    @OptionalParameter
    String getRuleBasedTokenizer();

    @ParameterDescription(valueName = "value")
    @OptionalParameter(defaultValue = "false")
    Boolean getRecursive();

    @ParameterDescription(valueName = "names")
    @OptionalParameter
    String getNameTypes();
  }

  protected BratNameSampleStreamFactory() {
    super(Parameters.class);
  }

  /**
   * Checks that none of the passed values are {@code null}.
   *
   * @param objects the values to inspect
   * @return {@code true} if every value is non-null, {@code false} as soon as
   *         one {@code null} value is found
   */
  private static boolean notNull(Object... objects) {
    for (Object obj : objects) {
      if (obj == null) {
        return false;
      }
    }
    return true;
  }

  /**
   * Parses the arguments and assembles the name sample stream.
   *
   * @param args the command line arguments, parsed against {@link Parameters}
   * @return a {@link BratNameSampleStream} over the configured brat data
   * @throws TerminateToolException if both a rule based tokenizer and a
   *         tokenizer model are given, if the annotation configuration, the
   *         document stream or a model cannot be read, or if the rule based
   *         tokenizer name is unknown
   */
  @Override
  public ObjectStream create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);

    // The two tokenizer options are mutually exclusive.
    if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) {
      throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!");
    }

    AnnotationConfiguration annConfig;
    try {
      annConfig = AnnotationConfiguration.parse(params.getAnnotationConfig());
    }
    catch (IOException e) {
      // Keep the cause so the underlying I/O problem is not lost.
      throw new TerminateToolException(1, "Failed to parse annotation.conf file!", e);
    }

    // TODO: How to handle the error here ? terminate the tool? not nice if used by API!
    // NOTE(review): the trailing null is passed through unchanged; presumably an
    // optional file filter - verify against BratDocumentStream.
    ObjectStream samples;
    try {
      samples = new BratDocumentStream(annConfig,
          params.getBratDataDir(), params.getRecursive(), null);
    } catch (IOException e) {
      throw new TerminateToolException(-1, e.getMessage(), e);
    }

    SentenceDetector sentDetector = createSentenceDetector(params);
    Tokenizer tokenizer = createTokenizer(params);
    Set<String> nameTypes = parseNameTypes(params.getNameTypes());

    return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
  }

  /**
   * Creates the sentence detector: a model based one if a sentence detector
   * model file was given, otherwise a {@link NewlineSentenceDetector}.
   */
  private static SentenceDetector createSentenceDetector(Parameters params) {
    File modelFile = params.getSentenceDetectorModel();
    if (modelFile == null) {
      return new NewlineSentenceDetector();
    }

    try {
      return new SentenceDetectorME(new SentenceModel(modelFile));
    } catch (IOException e) {
      throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
    }
  }

  /**
   * Creates the tokenizer: a model based one if a tokenizer model file was
   * given, else the rule based tokenizer selected by name ("simple" or
   * "whitespace"), else the whitespace tokenizer as the default.
   */
  private static Tokenizer createTokenizer(Parameters params) {
    File modelFile = params.getTokenizerModel();
    if (modelFile != null) {
      try {
        return new TokenizerME(new TokenizerModel(modelFile));
      } catch (IOException e) {
        throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
      }
    }

    String tokenizerName = params.getRuleBasedTokenizer();
    if (tokenizerName == null || "whitespace".equals(tokenizerName)) {
      return WhitespaceTokenizer.INSTANCE;
    }
    if ("simple".equals(tokenizerName)) {
      return SimpleTokenizer.INSTANCE;
    }
    throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
  }

  /**
   * Splits a comma separated list of name types into a set of trimmed names.
   * Empty entries are dropped because {@code "".split(",")} yields a single
   * empty string, which would otherwise end up in the set as a bogus type.
   *
   * @param nameTypesParam the raw parameter value, may be {@code null}
   * @return the set of name types, or {@code null} if the parameter is absent
   *         or contains no non-empty entry
   */
  private static Set<String> parseNameTypes(String nameTypesParam) {
    if (nameTypesParam == null) {
      return null;
    }

    Set<String> nameTypes = Arrays.stream(nameTypesParam.split(","))
        .map(String::trim)
        .filter(name -> !name.isEmpty())
        .collect(Collectors.toSet());

    // As before, "no usable entries" is reported as null rather than an empty set.
    return nameTypes.isEmpty() ? null : nameTypes;
  }

  /**
   * Registers a new instance of this factory with the
   * {@link StreamFactoryRegistry} for {@link NameSample} under the format
   * name {@code "brat"}.
   */
  public static void registerFactory() {
    StreamFactoryRegistry.registerFactory(NameSample.class, "brat",
        new BratNameSampleStreamFactory());
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy