// opennlp.tools.tokenize.TokenizerFactory Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.tokenize;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;
/**
 * The factory that provides {@link Tokenizer} default implementations and
 * resources. Users can extend this class if their application requires
 * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
 */
public class TokenizerFactory extends BaseToolFactory {

  /** Artifact map key under which the optional abbreviation dictionary is stored. */
  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";

  /** Manifest property holding the alpha numeric optimization flag. */
  private static final String USE_ALPHA_NUMERIC_OPTIMIZATION = "useAlphaNumericOptimization";

  /** Manifest property holding the alpha numeric pattern. */
  private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";

  private String languageCode;
  private Dictionary abbreviationDictionary;
  private boolean useAlphaNumericOptimization;
  private Pattern alphaNumericPattern;

  /**
   * Creates a {@link TokenizerFactory} that provides the default implementation
   * of the resources.
   */
  public TokenizerFactory() {
  }

  /**
   * Creates a {@link TokenizerFactory}. Use this constructor to
   * programmatically create a factory.
   *
   * @param languageCode
   *          the language of the natural text
   * @param abbreviationDictionary
   *          an abbreviations dictionary
   * @param useAlphaNumericOptimization
   *          if true alpha numerics are skipped
   * @param alphaNumericPattern
   *          null or a custom alphanumeric pattern (default is:
   *          "^[A-Za-z0-9]+$", provided by {@link Factory#DEFAULT_ALPHANUMERIC})
   */
  public TokenizerFactory(String languageCode,
      Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
      Pattern alphaNumericPattern) {
    this.init(languageCode, abbreviationDictionary,
        useAlphaNumericOptimization, alphaNumericPattern);
  }

  /**
   * Stores the given settings in this factory.
   * <p>
   * This method is invoked from the four-argument constructor and from
   * {@link #create(String, String, Dictionary, boolean, Pattern)}, so an
   * overriding subclass must tolerate being called during construction.
   *
   * @param languageCode the language of the natural text
   * @param abbreviationDictionary an abbreviations dictionary, may be null
   * @param useAlphaNumericOptimization if true alpha numerics are skipped
   * @param alphaNumericPattern null or a custom alphanumeric pattern
   */
  protected void init(String languageCode, Dictionary abbreviationDictionary,
      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern) {
    this.languageCode = languageCode;
    this.useAlphaNumericOptimization = useAlphaNumericOptimization;
    this.alphaNumericPattern = alphaNumericPattern;
    this.abbreviationDictionary = abbreviationDictionary;
  }

  /**
   * Checks the entries supplied by the artifact provider: the
   * {@code useAlphaNumericOptimization} manifest property must be present and
   * the abbreviations entry, if present, must be a {@link Dictionary}.
   *
   * @throws InvalidFormatException if the mandatory property is missing or the
   *         abbreviations entry has the wrong type
   */
  @Override
  public void validateArtifactMap() throws InvalidFormatException {
    if (this.artifactProvider.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
      throw new InvalidFormatException(USE_ALPHA_NUMERIC_OPTIMIZATION
          + " is a mandatory property!");
    }

    Object abbreviationsEntry = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
    if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
      throw new InvalidFormatException("Abbreviations dictionary '" + abbreviationsEntry +
          "' has wrong type, needs to be of type Dictionary!");
    }
  }

  /**
   * Extends the artifact map of the super class with the abbreviation
   * dictionary, if one was set.
   *
   * @return the artifact map
   */
  @Override
  public Map<String, Object> createArtifactMap() {
    Map<String, Object> artifactMap = super.createArtifactMap();

    // Abbreviations are optional
    if (abbreviationDictionary != null) {
      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
    }
    return artifactMap;
  }

  /**
   * Extends the manifest entries of the super class with the alpha numeric
   * optimization flag and, if available, the alpha numeric pattern.
   *
   * @return the manifest entries
   */
  @Override
  public Map<String, String> createManifestEntries() {
    Map<String, String> manifestEntries = super.createManifestEntries();

    manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
        Boolean.toString(isUseAlphaNumericOptmization()));

    // alphanumeric pattern is optional
    Pattern pattern = getAlphaNumericPattern();
    if (pattern != null) {
      manifestEntries.put(ALPHA_NUMERIC_PATTERN, pattern.pattern());
    }
    return manifestEntries;
  }

  /**
   * Factory method the framework uses to create a new {@link TokenizerFactory}.
   *
   * @param subclassName the name of the class implementing the {@link TokenizerFactory},
   *          or null to create the default factory
   * @param languageCode the language code the tokenizer should use
   * @param abbreviationDictionary an optional dictionary containing abbreviations, or null if not present
   * @param useAlphaNumericOptimization indicate if the alpha numeric optimization
   *          should be enabled or disabled
   * @param alphaNumericPattern the pattern the alpha numeric optimization should use
   *
   * @return the instance of the Tokenizer Factory
   *
   * @throws InvalidFormatException if the factory named by {@code subclassName}
   *         could not be instantiated or initialized
   */
  public static TokenizerFactory create(String subclassName,
      String languageCode, Dictionary abbreviationDictionary,
      boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
      throws InvalidFormatException {
    if (subclassName == null) {
      // will create the default factory
      return new TokenizerFactory(languageCode, abbreviationDictionary,
          useAlphaNumericOptimization, alphaNumericPattern);
    }

    try {
      TokenizerFactory theFactory = ExtensionLoader.instantiateExtension(
          TokenizerFactory.class, subclassName);
      theFactory.init(languageCode, abbreviationDictionary,
          useAlphaNumericOptimization, alphaNumericPattern);
      return theFactory;
    } catch (Exception e) {
      // The cause travels with the exception; the caller decides how to report
      // it, so nothing is printed to System.err here.
      String msg = "Could not instantiate the " + subclassName
          + ". The initialization throw an exception.";
      throw new InvalidFormatException(msg, e);
    }
  }

  /**
   * Gets the alpha numeric pattern.
   * <p>
   * If no pattern was set, it is looked up in the manifest of the artifact
   * provider (when one is present) and otherwise requested from the language
   * dependent {@link Factory}. The result is remembered for later calls.
   *
   * @return the user specified alpha numeric pattern or a default.
   */
  public Pattern getAlphaNumericPattern() {
    if (this.alphaNumericPattern != null) {
      return this.alphaNumericPattern;
    }

    if (this.artifactProvider != null) {
      String prop = this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN);
      if (prop != null) {
        this.alphaNumericPattern = Pattern.compile(prop);
      }
    }

    // could not load from manifest, will get from language dependent factory
    if (this.alphaNumericPattern == null) {
      Factory f = new Factory();
      this.alphaNumericPattern = f.getAlphanumeric(languageCode);
    }
    return this.alphaNumericPattern;
  }

  /**
   * Gets whether to use alphanumeric optimization.
   * <p>
   * When an artifact provider is present its manifest property is read on every
   * call and replaces the value that was set programmatically.
   *
   * @return true if the alpha numeric optimization is enabled, otherwise false
   */
  public boolean isUseAlphaNumericOptmization() {
    if (artifactProvider != null) {
      this.useAlphaNumericOptimization = Boolean.parseBoolean(this.artifactProvider
          .getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
    }
    return this.useAlphaNumericOptimization;
  }

  /**
   * Gets the abbreviation dictionary. If none was set and an artifact provider
   * is present, the dictionary is fetched from the provider.
   *
   * @return null or the abbreviation dictionary
   */
  public Dictionary getAbbreviationDictionary() {
    if (this.abbreviationDictionary == null && artifactProvider != null) {
      this.abbreviationDictionary = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
    }
    return this.abbreviationDictionary;
  }

  /**
   * Retrieves the language code. If none was set and an artifact provider is
   * present, the language of the provider is used.
   *
   * @return the language code
   */
  public String getLanguageCode() {
    if (this.languageCode == null && this.artifactProvider != null) {
      this.languageCode = this.artifactProvider.getLanguage();
    }
    return this.languageCode;
  }

  /**
   * Gets the context generator. The abbreviations of the abbreviation
   * dictionary are passed to it, or an empty set if there is no dictionary.
   *
   * @return a new instance of the context generator
   */
  public TokenContextGenerator getContextGenerator() {
    Factory f = new Factory();

    Set<String> abbs;
    Dictionary abbDict = getAbbreviationDictionary();
    if (abbDict != null) {
      abbs = abbDict.asStringSet();
    } else {
      abbs = Collections.<String>emptySet();
    }
    return f.createTokenContextGenerator(getLanguageCode(), abbs);
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy