All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.sentdetect.SentenceDetectorFactory Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.sentdetect;

import java.util.Collections;
import java.util.Map;
import java.util.Set;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;

/**
 * The factory that provides sentence detector default implementations and
 * resources.
 * <p>
 * A factory is either configured programmatically (see
 * {@link #SentenceDetectorFactory(String, boolean, Dictionary, char[])} and
 * {@link #create(String, String, boolean, Dictionary, char[])}) or created
 * with the default constructor, in which case the getters lazily fall back to
 * the inherited {@code artifactProvider} when one is present.
 */
public class SentenceDetectorFactory extends BaseToolFactory {

  private String languageCode;
  private char[] eosCharacters;
  private Dictionary abbreviationDictionary;
  // Tri-state on purpose: null means "not yet resolved".
  private Boolean useTokenEnd = null;

  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
  private static final String EOS_CHARACTERS_PROPERTY = "eosCharacters";
  private static final String TOKEN_END_PROPERTY = "useTokenEnd";

  /**
   * Creates a {@link SentenceDetectorFactory} that provides the default
   * implementation of the resources.
   */
  public SentenceDetectorFactory() {
  }

  /**
   * Creates a {@link SentenceDetectorFactory}. Use this constructor to
   * programmatically create a factory.
   *
   * @param languageCode the language code; used to select language dependent
   *          defaults when no EOS characters are available
   * @param useTokenEnd the value written to the {@code useTokenEnd} manifest
   *          property
   * @param abbreviationDictionary the abbreviation dictionary, may be
   *          {@code null} because abbreviations are optional
   * @param eosCharacters the end-of-sentence characters, may be {@code null}
   */
  public SentenceDetectorFactory(String languageCode, boolean useTokenEnd,
      Dictionary abbreviationDictionary, char[] eosCharacters) {
    this.init(languageCode, useTokenEnd, abbreviationDictionary, eosCharacters);
  }

  /**
   * Stores the given configuration in this factory. Called by the
   * programmatic constructor and by
   * {@link #create(String, String, boolean, Dictionary, char[])} after a
   * subclass has been instantiated.
   *
   * @param languageCode the language code
   * @param useTokenEnd the value written to the {@code useTokenEnd} manifest
   *          property
   * @param abbreviationDictionary the abbreviation dictionary, may be
   *          {@code null}
   * @param eosCharacters the end-of-sentence characters, may be {@code null}
   */
  protected void init(String languageCode, boolean useTokenEnd,
      Dictionary abbreviationDictionary, char[] eosCharacters) {
    this.languageCode = languageCode;
    this.useTokenEnd = useTokenEnd;
    this.eosCharacters = eosCharacters;
    this.abbreviationDictionary = abbreviationDictionary;
  }

  /**
   * Checks that the {@code useTokenEnd} manifest property is present and that
   * the abbreviations artifact, if present, is a {@link Dictionary}.
   *
   * @throws InvalidFormatException if the mandatory property is missing or the
   *           abbreviations artifact has the wrong type
   */
  @Override
  public void validateArtifactMap() throws InvalidFormatException {

    if (this.artifactProvider.getManifestProperty(TOKEN_END_PROPERTY) == null) {
      throw new InvalidFormatException(TOKEN_END_PROPERTY
          + " is a mandatory property!");
    }

    Object abbreviationsEntry = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);

    if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
      throw new InvalidFormatException(
          "Abbreviations dictionary '" + abbreviationsEntry +
              "' has wrong type, needs to be of type Dictionary!");
    }
  }

  /**
   * Adds the abbreviation dictionary, when one was configured, to the
   * artifact map created by the super class.
   *
   * @return the artifact map
   */
  @Override
  public Map<String, Object> createArtifactMap() {
    Map<String, Object> artifactMap = super.createArtifactMap();

    // Abbreviations are optional
    if (abbreviationDictionary != null) {
      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
    }

    return artifactMap;
  }

  /**
   * Adds the {@code useTokenEnd} flag and, when available, the EOS characters
   * to the manifest entries created by the super class.
   *
   * @return the manifest entries
   * @throws IllegalStateException if the {@code useTokenEnd} flag cannot be
   *           resolved, see {@link #isUseTokenEnd()}
   */
  @Override
  public Map<String, String> createManifestEntries() {
    Map<String, String> manifestEntries = super.createManifestEntries();

    manifestEntries.put(TOKEN_END_PROPERTY, Boolean.toString(isUseTokenEnd()));

    // EOS characters are optional
    char[] eosChars = getEOSCharacters();
    if (eosChars != null) {
      manifestEntries.put(EOS_CHARACTERS_PROPERTY, eosCharArrayToString(eosChars));
    }

    return manifestEntries;
  }

  /**
   * Creates a factory and initializes it with the given configuration.
   *
   * @param subclassName the name of the {@link SentenceDetectorFactory}
   *          subclass to instantiate, or {@code null} to create the default
   *          factory
   * @param languageCode the language code
   * @param useTokenEnd the value written to the {@code useTokenEnd} manifest
   *          property
   * @param abbreviationDictionary the abbreviation dictionary, may be
   *          {@code null}
   * @param eosCharacters the end-of-sentence characters, may be {@code null}
   * @return the initialized factory
   * @throws InvalidFormatException if the subclass could not be instantiated
   *           or initialized; the original exception is attached as the cause
   */
  public static SentenceDetectorFactory create(String subclassName,
      String languageCode, boolean useTokenEnd,
      Dictionary abbreviationDictionary, char[] eosCharacters)
      throws InvalidFormatException {
    if (subclassName == null) {
      // will create the default factory
      return new SentenceDetectorFactory(languageCode, useTokenEnd,
          abbreviationDictionary, eosCharacters);
    }
    try {
      SentenceDetectorFactory theFactory = ExtensionLoader
          .instantiateExtension(SentenceDetectorFactory.class, subclassName);
      theFactory.init(languageCode, useTokenEnd, abbreviationDictionary,
          eosCharacters);
      return theFactory;
    } catch (Exception e) {
      // The failure is reported solely through the thrown exception, which
      // carries both the message and the cause; nothing is printed here.
      String msg = "Could not instantiate the " + subclassName
          + ". The initialization throw an exception.";
      throw new InvalidFormatException(msg, e);
    }
  }

  /**
   * Returns the end-of-sentence characters. When none were set explicitly they
   * are read from the {@code eosCharacters} manifest property if an artifact
   * provider is present; otherwise they are taken from the language dependent
   * {@link Factory}.
   *
   * @return the EOS characters, or {@code null} if an artifact provider is
   *         present but does not define the manifest property
   */
  public char[] getEOSCharacters() {
    if (this.eosCharacters == null) {
      if (artifactProvider != null) {
        String prop = this.artifactProvider
            .getManifestProperty(EOS_CHARACTERS_PROPERTY);
        if (prop != null) {
          this.eosCharacters = eosStringToCharArray(prop);
        }
      } else {
        // get from language dependent factory
        Factory f = new Factory();
        this.eosCharacters = f.getEOSCharacters(languageCode);
      }
    }
    return this.eosCharacters;
  }

  /**
   * Returns the {@code useTokenEnd} flag. When it was not set explicitly it is
   * read from the manifest of the artifact provider, if one is present.
   *
   * @return the {@code useTokenEnd} flag
   * @throws IllegalStateException if the flag was never set and no artifact
   *           provider is available to read it from
   */
  public boolean isUseTokenEnd() {
    if (this.useTokenEnd == null && artifactProvider != null) {
      this.useTokenEnd = Boolean.valueOf(artifactProvider
          .getManifestProperty(TOKEN_END_PROPERTY));
    }
    if (this.useTokenEnd == null) {
      // Fail with a clear message instead of an NPE from auto-unboxing null.
      throw new IllegalStateException(TOKEN_END_PROPERTY
          + " is not available: the factory was neither initialized"
          + " programmatically nor provided with an artifact provider.");
    }
    return this.useTokenEnd;
  }

  /**
   * Returns the abbreviation dictionary. When none was set explicitly it is
   * looked up in the artifact provider, if one is present.
   *
   * @return the abbreviation dictionary, or {@code null} if there is none
   */
  public Dictionary getAbbreviationDictionary() {
    if (this.abbreviationDictionary == null && artifactProvider != null) {
      this.abbreviationDictionary = artifactProvider
          .getArtifact(ABBREVIATIONS_ENTRY_NAME);
    }
    return this.abbreviationDictionary;
  }

  /**
   * Returns the language code. When none was set explicitly it is obtained
   * from the artifact provider, if one is present.
   *
   * @return the language code, or {@code null} if it is unknown
   */
  public String getLanguageCode() {
    if (this.languageCode == null && artifactProvider != null) {
      this.languageCode = this.artifactProvider.getLanguage();
    }
    return this.languageCode;
  }

  /**
   * Creates the end-of-sentence scanner, based on the EOS characters when
   * there are any and on the language code otherwise.
   *
   * @return the end-of-sentence scanner
   */
  public EndOfSentenceScanner getEndOfSentenceScanner() {
    Factory f = new Factory();
    char[] eosChars = getEOSCharacters();
    if (eosChars != null && eosChars.length > 0) {
      return f.createEndOfSentenceScanner(eosChars);
    } else {
      // Resolve through the getter so that a language provided by the
      // artifact provider is honoured even if the field is still unset.
      return f.createEndOfSentenceScanner(getLanguageCode());
    }
  }

  /**
   * Creates the sentence detector context generator from the abbreviations
   * (an empty set when there is no dictionary) and either the EOS characters,
   * when there are any, or the language code.
   *
   * @return the context generator
   */
  public SDContextGenerator getSDContextGenerator() {
    Factory f = new Factory();
    char[] eosChars = getEOSCharacters();
    Set<String> abbs;
    Dictionary abbDict = getAbbreviationDictionary();
    if (abbDict != null) {
      abbs = abbDict.asStringSet();
    } else {
      abbs = Collections.emptySet();
    }
    if (eosChars != null && eosChars.length > 0) {
      return f.createSentenceContextGenerator(abbs, eosChars);
    } else {
      // Resolve through the getter so that a language provided by the
      // artifact provider is honoured even if the field is still unset.
      return f.createSentenceContextGenerator(getLanguageCode(), abbs);
    }
  }

  /** Joins the EOS characters into the string stored in the manifest. */
  private String eosCharArrayToString(char[] eosCharacters) {
    return String.valueOf(eosCharacters);
  }

  /** Splits the manifest string back into the individual EOS characters. */
  private char[] eosStringToCharArray(String eosCharacters) {
    return eosCharacters.toCharArray();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy