All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.uima.normalizer.Normalizer Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.uima.normalizer;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;

import opennlp.tools.util.StringList;
import opennlp.uima.namefind.NameFinder;
import opennlp.uima.util.AnnotatorUtil;
import opennlp.uima.util.ExceptionMessages;
import opennlp.uima.util.UimaUtil;

/**
 * The Normalizer tries the structure annotations. The structured value
 * is than assigned to a field of the annotation.
 * 

* The process depends on the *

* string Tokens must be (fuzzy) mapped to categories eg. a month, a day or a * year (use dictionary) integer, float tokens must be parsed eg. for percentage * or period boolean tokens must be parsed eg is there any ??? *

*

* restricted set of outcomes throw error if not matched or silently fail * unrestricted set of outcomes */ public class Normalizer extends CasAnnotator_ImplBase { /** * This set contains all supported range types. */ private static final Set SUPPORTED_TYPES; static { Set supportedTypes = new HashSet<>(); supportedTypes.add(CAS.TYPE_NAME_STRING); supportedTypes.add(CAS.TYPE_NAME_BYTE); supportedTypes.add(CAS.TYPE_NAME_SHORT); supportedTypes.add(CAS.TYPE_NAME_INTEGER); supportedTypes.add(CAS.TYPE_NAME_LONG); supportedTypes.add(CAS.TYPE_NAME_FLOAT); supportedTypes.add(CAS.TYPE_NAME_DOUBLE); SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes); } private UimaContext context; private Logger mLogger; /** * The annotation marks the text to structure. */ private Type mNameType; /** * The target type which the text should have. This type must be primitive. *

* It should not be possible to assign something to this feature with is not * structured. The feature should define allowed values. */ private Feature mStructureFeature; // private Type mSentenceType; private StringDictionary mLookupDictionary; /** * Initializes a new instance. *

* Note: Use {@link #initialize(UimaContext) } to initialize this instance. Not * use the constructor. */ public Normalizer() { // must not be implemented ! } /** * Initializes the current instance with the given context. *

* Note: Do all initialization in this method, do not use the constructor. */ public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); this.context = context; mLogger = context.getLogger(); if (mLogger.isLoggable(Level.INFO)) { mLogger.log(Level.INFO, "Initializing the OpenNLP Normalizer annotator."); } try { String modelName = AnnotatorUtil.getOptionalStringParameter(context, UimaUtil.DICTIONARY_PARAMETER); if (modelName != null) { InputStream inModel = AnnotatorUtil.getResourceAsStream(context, modelName); mLookupDictionary = new StringDictionary(inModel); } } catch (IOException e) { throw new ResourceInitializationException( ExceptionMessages.MESSAGE_CATALOG, "io_error_model_reading", new Object[] {e.getMessage()}, e); } } /** * Initializes the type system. */ public void typeSystemInit(TypeSystem typeSystem) throws AnalysisEngineProcessException { // sentence type // String sentenceTypeName = // AnnotatorUtil.getRequiredStringParameter(mContext, // UimaUtil.SENTENCE_TYPE_PARAMETER); // mSentenceType = AnnotatorUtil.getType(typeSystem, sentenceTypeName); // name type mNameType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem, NameFinder.NAME_TYPE_PARAMETER); mStructureFeature = AnnotatorUtil.getRequiredFeatureParameter(context, mNameType, "opennlp.uima.normalizer.StructureFeature"); if (!SUPPORTED_TYPES.contains(mStructureFeature.getRange().getName())) { throw new AnalysisEngineProcessException( ExceptionMessages.MESSAGE_CATALOG, "range_type_unsupported", new Object[] {mStructureFeature.getRange().getName()}); } } public void process(CAS tcas) { FSIndex sentenceIndex = tcas.getAnnotationIndex(mNameType); for (AnnotationFS nameAnnotation : sentenceIndex) { // check if the document language is supported String language = tcas.getDocumentLanguage(); if (!NumberUtil.isLanguageSupported(language)) { if (mLogger.isLoggable(Level.INFO)) { mLogger.log(Level.INFO, "Unsupported language: " + language); } continue; } String text = nameAnnotation.getCoveredText(); // if possible replace text with normalization from dictionary if (mLookupDictionary != null) { StringList tokens = new StringList(text); String normalizedText = mLookupDictionary.get(tokens); if (normalizedText != null) { text = normalizedText; } } if (CAS.TYPE_NAME_STRING.equals(mStructureFeature.getRange().getName())) { nameAnnotation.setStringValue(mStructureFeature, text); } else { Number number; try { number = NumberUtil.parse(text, language); } catch (ParseException e) { if (mLogger.isLoggable(Level.INFO)) { mLogger.log(Level.INFO, "Invalid number format: " + text); } continue; } if (CAS.TYPE_NAME_BYTE.equals(mStructureFeature.getRange().getName())) { nameAnnotation.setByteValue(mStructureFeature, number.byteValue()); } else if (CAS.TYPE_NAME_SHORT.equals(mStructureFeature.getRange() .getName())) { nameAnnotation.setShortValue(mStructureFeature, number.shortValue()); } else if (CAS.TYPE_NAME_INTEGER.equals(mStructureFeature.getRange() .getName())) { nameAnnotation.setIntValue(mStructureFeature, number.intValue()); } else if (CAS.TYPE_NAME_LONG.equals(mStructureFeature.getRange() .getName())) { nameAnnotation.setLongValue(mStructureFeature, number.longValue()); } else if (CAS.TYPE_NAME_FLOAT.equals(mStructureFeature.getRange() .getName())) { nameAnnotation.setFloatValue(mStructureFeature, number.floatValue()); } else if (CAS.TYPE_NAME_DOUBLE.equals(mStructureFeature.getRange() .getName())) { nameAnnotation .setDoubleValue(mStructureFeature, number.doubleValue()); } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy