opennlp.uima.normalizer.Normalizer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.uima.normalizer;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import opennlp.tools.util.StringList;
import opennlp.uima.namefind.NameFinder;
import opennlp.uima.util.AnnotatorUtil;
import opennlp.uima.util.ExceptionMessages;
import opennlp.uima.util.UimaUtil;
/**
* The Normalizer tries to structure annotations. The structured value
* is then assigned to a field of the annotation.
*
* The process depends on the
*
* String tokens must be (fuzzy) mapped to categories, e.g. a month, a day or a
* year (use dictionary); integer and float tokens must be parsed, e.g. for a
* percentage or period; boolean tokens must be parsed, e.g. is there any ???
*
*
* restricted set of outcomes throw error if not matched or silently fail
* unrestricted set of outcomes
*/
public class Normalizer extends CasAnnotator_ImplBase {
/**
* This set contains all supported range types.
*/
private static final Set SUPPORTED_TYPES;
static {
Set supportedTypes = new HashSet<>();
supportedTypes.add(CAS.TYPE_NAME_STRING);
supportedTypes.add(CAS.TYPE_NAME_BYTE);
supportedTypes.add(CAS.TYPE_NAME_SHORT);
supportedTypes.add(CAS.TYPE_NAME_INTEGER);
supportedTypes.add(CAS.TYPE_NAME_LONG);
supportedTypes.add(CAS.TYPE_NAME_FLOAT);
supportedTypes.add(CAS.TYPE_NAME_DOUBLE);
SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
}
private UimaContext context;
private Logger mLogger;
/**
* The annotation marks the text to structure.
*/
private Type mNameType;
/**
* The target type which the text should have. This type must be primitive.
*
* It should not be possible to assign something to this feature with is not
* structured. The feature should define allowed values.
*/
private Feature mStructureFeature;
// private Type mSentenceType;
private StringDictionary mLookupDictionary;
/**
* Initializes a new instance.
*
* Note: Use {@link #initialize(UimaContext) } to initialize this instance. Not
* use the constructor.
*/
public Normalizer() {
// must not be implemented !
}
/**
* Initializes the current instance with the given context.
*
* Note: Do all initialization in this method, do not use the constructor.
*/
public void initialize(UimaContext context)
throws ResourceInitializationException {
super.initialize(context);
this.context = context;
mLogger = context.getLogger();
if (mLogger.isLoggable(Level.INFO)) {
mLogger.log(Level.INFO, "Initializing the OpenNLP Normalizer annotator.");
}
try {
String modelName = AnnotatorUtil.getOptionalStringParameter(context,
UimaUtil.DICTIONARY_PARAMETER);
if (modelName != null) {
InputStream inModel = AnnotatorUtil.getResourceAsStream(context,
modelName);
mLookupDictionary = new StringDictionary(inModel);
}
} catch (IOException e) {
throw new ResourceInitializationException(
ExceptionMessages.MESSAGE_CATALOG, "io_error_model_reading",
new Object[] {e.getMessage()}, e);
}
}
/**
* Initializes the type system.
*/
public void typeSystemInit(TypeSystem typeSystem)
throws AnalysisEngineProcessException {
// sentence type
// String sentenceTypeName =
// AnnotatorUtil.getRequiredStringParameter(mContext,
// UimaUtil.SENTENCE_TYPE_PARAMETER);
// mSentenceType = AnnotatorUtil.getType(typeSystem, sentenceTypeName);
// name type
mNameType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
NameFinder.NAME_TYPE_PARAMETER);
mStructureFeature = AnnotatorUtil.getRequiredFeatureParameter(context,
mNameType, "opennlp.uima.normalizer.StructureFeature");
if (!SUPPORTED_TYPES.contains(mStructureFeature.getRange().getName())) {
throw new AnalysisEngineProcessException(
ExceptionMessages.MESSAGE_CATALOG, "range_type_unsupported",
new Object[] {mStructureFeature.getRange().getName()});
}
}
public void process(CAS tcas) {
FSIndex sentenceIndex = tcas.getAnnotationIndex(mNameType);
for (AnnotationFS nameAnnotation : sentenceIndex) {
// check if the document language is supported
String language = tcas.getDocumentLanguage();
if (!NumberUtil.isLanguageSupported(language)) {
if (mLogger.isLoggable(Level.INFO)) {
mLogger.log(Level.INFO, "Unsupported language: " + language);
}
continue;
}
String text = nameAnnotation.getCoveredText();
// if possible replace text with normalization from dictionary
if (mLookupDictionary != null) {
StringList tokens = new StringList(text);
String normalizedText = mLookupDictionary.get(tokens);
if (normalizedText != null) {
text = normalizedText;
}
}
if (CAS.TYPE_NAME_STRING.equals(mStructureFeature.getRange().getName())) {
nameAnnotation.setStringValue(mStructureFeature, text);
} else {
Number number;
try {
number = NumberUtil.parse(text, language);
} catch (ParseException e) {
if (mLogger.isLoggable(Level.INFO)) {
mLogger.log(Level.INFO, "Invalid number format: " + text);
}
continue;
}
if (CAS.TYPE_NAME_BYTE.equals(mStructureFeature.getRange().getName())) {
nameAnnotation.setByteValue(mStructureFeature, number.byteValue());
} else if (CAS.TYPE_NAME_SHORT.equals(mStructureFeature.getRange()
.getName())) {
nameAnnotation.setShortValue(mStructureFeature, number.shortValue());
} else if (CAS.TYPE_NAME_INTEGER.equals(mStructureFeature.getRange()
.getName())) {
nameAnnotation.setIntValue(mStructureFeature, number.intValue());
} else if (CAS.TYPE_NAME_LONG.equals(mStructureFeature.getRange()
.getName())) {
nameAnnotation.setLongValue(mStructureFeature, number.longValue());
} else if (CAS.TYPE_NAME_FLOAT.equals(mStructureFeature.getRange()
.getName())) {
nameAnnotation.setFloatValue(mStructureFeature, number.floatValue());
} else if (CAS.TYPE_NAME_DOUBLE.equals(mStructureFeature.getRange()
.getName())) {
nameAnnotation
.setDoubleValue(mStructureFeature, number.doubleValue());
}
}
}
}
}