All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.tokenize.TokenizerFactory Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;

/**
 * The factory that provides {@link Tokenizer} default implementations and
 * resources. Users can extend this class if their application requires
 * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
 *
 * <p>Values handed to the constructor or to
 * {@link #init(String, Dictionary, boolean, Pattern)} are kept in fields. When an
 * artifact provider is available, the getters additionally consult it: the
 * language code, abbreviation dictionary and alpha numeric pattern are looked up
 * lazily if the corresponding field is still {@code null}, whereas the alpha
 * numeric optimization flag is re-read from the manifest on every call.
 */
public class TokenizerFactory extends BaseToolFactory {

  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
  private static final String USE_ALPHA_NUMERIC_OPTIMIZATION = "useAlphaNumericOptimization";
  private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";

  private String languageCode;
  private Dictionary abbreviationDictionary;
  // Primitive on purpose: the flag is never null, so boxing it only adds overhead.
  private boolean useAlphaNumericOptimization = false;
  private Pattern alphaNumericPattern;

  /**
   * Creates a {@link TokenizerFactory} that provides the default implementation
   * of the resources.
   */
  public TokenizerFactory() {
  }

  /**
   * Creates a {@link TokenizerFactory}. Use this constructor to
   * programmatically create a factory.
   *
   * @param languageCode
   *          the language of the natural text
   * @param abbreviationDictionary
   *          an abbreviations dictionary
   * @param useAlphaNumericOptimization
   *          if true alpha numerics are skipped
   * @param alphaNumericPattern
   *          null or a custom alphanumeric pattern (default is:
   *          "^[A-Za-z0-9]+$", provided by {@link Factory#DEFAULT_ALPHANUMERIC})
   */
  public TokenizerFactory(String languageCode,
      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
      Pattern alphaNumericPattern) {
    this.init(languageCode, abbreviationDictionary,
        useAlphaNumericOptimization, alphaNumericPattern);
  }

  /**
   * Stores the given settings in this factory. Called by the four-argument
   * constructor and by
   * {@link #create(String, String, Dictionary, boolean, Pattern)} after a
   * subclass has been instantiated.
   *
   * @param languageCode
   *          the language of the natural text
   * @param abbreviationDictionary
   *          an abbreviations dictionary, may be null
   * @param useAlphaNumericOptimization
   *          whether the alpha numeric optimization is enabled
   * @param alphaNumericPattern
   *          null or a custom alphanumeric pattern
   */
  protected void init(String languageCode, Dictionary abbreviationDictionary,
      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern) {
    this.languageCode = languageCode;
    this.useAlphaNumericOptimization = useAlphaNumericOptimization;
    this.alphaNumericPattern = alphaNumericPattern;
    this.abbreviationDictionary = abbreviationDictionary;
  }

  /**
   * Checks that the manifest carries the mandatory alpha numeric optimization
   * property and that the abbreviations artifact, if present, is a
   * {@link Dictionary}.
   *
   * @throws InvalidFormatException if the mandatory property is missing or the
   *     abbreviations artifact has the wrong type
   */
  @Override
  public void validateArtifactMap() throws InvalidFormatException {
    if (this.artifactProvider.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
      throw new InvalidFormatException(USE_ALPHA_NUMERIC_OPTIMIZATION
          + " is a mandatory property!");
    }

    Object abbreviationsEntry = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);

    // The abbreviations dictionary is optional, so only a present entry is type-checked.
    if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
      throw new InvalidFormatException("Abbreviations dictionary '" + abbreviationsEntry +
              "' has wrong type, needs to be of type Dictionary!");
    }
  }

  /**
   * Extends the artifact map of the super class with the abbreviation
   * dictionary, if one was set on this factory.
   *
   * @return the artifact map
   */
  @Override
  public Map<String, Object> createArtifactMap() {
    Map<String, Object> artifactMap = super.createArtifactMap();

    // Abbreviations are optional
    if (abbreviationDictionary != null) {
      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
    }

    return artifactMap;
  }

  /**
   * Extends the manifest entries of the super class with the alpha numeric
   * optimization flag and, if available, the alpha numeric pattern.
   *
   * @return the manifest entries
   */
  @Override
  public Map<String, String> createManifestEntries() {
    Map<String, String> manifestEntries = super.createManifestEntries();

    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
        Boolean.toString(isUseAlphaNumericOptmization()));

    // alphanumeric pattern is optional
    Pattern pattern = getAlphaNumericPattern();
    if (pattern != null) {
      manifestEntries.put(ALPHA_NUMERIC_PATTERN, pattern.pattern());
    }

    return manifestEntries;
  }

  /**
   * Factory method the framework uses to create a new {@link TokenizerFactory}.
   *
   * @param subclassName the name of the class implementing the {@link TokenizerFactory},
   *     or null to create the default factory
   * @param languageCode the language code the tokenizer should use
   * @param abbreviationDictionary an optional dictionary containing abbreviations, or null if not present
   * @param useAlphaNumericOptimization indicate if the alpha numeric optimization
   *     should be enabled or disabled
   * @param alphaNumericPattern the pattern the alpha numeric optimization should use
   *
   * @return the instance of the Tokenizer Factory
   *
   * @throws InvalidFormatException if the subclass could not be instantiated or
   *     initialized; the original exception is attached as the cause
   */
  public static TokenizerFactory create(String subclassName,
      String languageCode, Dictionary abbreviationDictionary,
      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
      throws InvalidFormatException {
    if (subclassName == null) {
      // will create the default factory
      return new TokenizerFactory(languageCode, abbreviationDictionary,
          useAlphaNumericOptimization, alphaNumericPattern);
    }
    try {
      TokenizerFactory theFactory = ExtensionLoader.instantiateExtension(
          TokenizerFactory.class, subclassName);
      theFactory.init(languageCode, abbreviationDictionary,
          useAlphaNumericOptimization, alphaNumericPattern);
      return theFactory;
    } catch (Exception e) {
      // No printing to System.err here: the cause travels with the exception, so
      // the caller decides whether and where the failure is reported.
      String msg = "Could not instantiate the " + subclassName
          + ". The initialization throw an exception.";
      throw new InvalidFormatException(msg, e);
    }
  }

  /**
   * Gets the alpha numeric pattern.
   *
   * <p>Resolution order: the pattern set on this factory, then the pattern
   * stored in the manifest of the artifact provider, then the pattern the
   * language dependent {@link Factory} supplies for the language code field.
   * The resolved pattern is cached in this factory.
   *
   * @return the user specified alpha numeric pattern or a default.
   */
  public Pattern getAlphaNumericPattern() {
    if (this.alphaNumericPattern == null) {
      if (this.artifactProvider != null) {
        String prop = this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN);
        if (prop != null) {
          this.alphaNumericPattern = Pattern.compile(prop);
        }
      }
      // could not load from manifest, will get from language dependent factory
      if (this.alphaNumericPattern == null) {
        Factory f = new Factory();
        this.alphaNumericPattern = f.getAlphanumeric(languageCode);
      }
    }
    return this.alphaNumericPattern;
  }

  /**
   * Gets whether to use alphanumeric optimization.
   *
   * <p>If an artifact provider is present, the value is re-read from its
   * manifest on every call and replaces the value held by this factory.
   *
   * @return true if the alpha numeric optimization is enabled, otherwise false
   */
  public boolean isUseAlphaNumericOptmization() {
    if (artifactProvider != null) {
      // parseBoolean yields false for a null or non-"true" property without boxing.
      this.useAlphaNumericOptimization = Boolean.parseBoolean(
          this.artifactProvider.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
    }
    return this.useAlphaNumericOptimization;
  }

  /**
   * Gets the abbreviation dictionary, loading it from the artifact provider
   * if none has been set on this factory yet.
   *
   * @return null or the abbreviation dictionary
   */
  public Dictionary getAbbreviationDictionary() {
    if (this.abbreviationDictionary == null && artifactProvider != null) {
      this.abbreviationDictionary = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
    }
    return this.abbreviationDictionary;
  }

  /**
   * Retrieves the language code, taking it from the artifact provider if none
   * has been set on this factory yet.
   *
   * @return the language code
   */
  public String getLanguageCode() {
    if (this.languageCode == null && this.artifactProvider != null) {
      this.languageCode = this.artifactProvider.getLanguage();
    }
    return this.languageCode;
  }

  /**
   * Gets the context generator, built by the language dependent
   * {@link Factory} from the language code and the abbreviations of the
   * abbreviation dictionary (an empty set if there is no dictionary).
   *
   * @return a new instance of the context generator
   */
  public TokenContextGenerator getContextGenerator() {
    Factory f = new Factory();
    Set<String> abbs;
    Dictionary abbDict = getAbbreviationDictionary();
    if (abbDict != null) {
      abbs = abbDict.asStringSet();
    } else {
      abbs = Collections.emptySet();
    }
    return f.createTokenContextGenerator(getLanguageCode(), abbs);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy