opennlp.uima.normalizer.Normalizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-uima Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.uima.normalizer;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;

import opennlp.tools.util.StringList;
import opennlp.uima.namefind.NameFinder;
import opennlp.uima.util.AnnotatorUtil;
import opennlp.uima.util.ExceptionMessages;
import opennlp.uima.util.UimaUtil;

/**
 * The Normalizer tries to structure annotations. The structured value
 * is then assigned to a feature (field) of the annotation.
 * <p>
 * The process depends on the range type of the target feature:
 * <ul>
 * <li>string tokens must be (fuzzy) mapped to categories, e.g. a month, a day
 * or a year (using a dictionary);</li>
 * <li>integer and float tokens must be parsed, e.g. for a percentage or a
 * period;</li>
 * <li>boolean tokens must be parsed (open question: is there any such
 * case?).</li>
 * </ul>
 * <p>
 * Open design points: for a restricted set of outcomes, either throw an error
 * if the value is not matched or fail silently; alternatively, allow an
 * unrestricted set of outcomes.
 */
public class Normalizer extends CasAnnotator_ImplBase {

  /**
   * This set contains all supported range types.
   */
  private static final Set SUPPORTED_TYPES;

  static {
    Set supportedTypes = new HashSet<>();

    supportedTypes.add(CAS.TYPE_NAME_STRING);
    supportedTypes.add(CAS.TYPE_NAME_BYTE);
    supportedTypes.add(CAS.TYPE_NAME_SHORT);
    supportedTypes.add(CAS.TYPE_NAME_INTEGER);
    supportedTypes.add(CAS.TYPE_NAME_LONG);
    supportedTypes.add(CAS.TYPE_NAME_FLOAT);
    supportedTypes.add(CAS.TYPE_NAME_DOUBLE);

    SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
  }

  private UimaContext context;

  private Logger mLogger;

  /**
   * The annotation marks the text to structure.
   */
  private Type mNameType;

  /**
   * The target type which the text should have. This type must be primitive.
   * 

   * It should not be possible to assign something to this feature with is not
   * structured. The feature should define allowed values.
   */
  private Feature mStructureFeature;

  // private Type mSentenceType;

  private StringDictionary mLookupDictionary;

  /**
   * Initializes a new instance.
   * 

   * Note: Use {@link #initialize(UimaContext) } to initialize this instance. Not
   * use the constructor.
   */
  public Normalizer() {
    // must not be implemented !
  }

  /**
   * Initializes the current instance with the given context.
   * 
   * Note: Do all initialization in this method, do not use the constructor.
   */
  public void initialize(UimaContext context)
      throws ResourceInitializationException {

    super.initialize(context);

    this.context = context;

    mLogger = context.getLogger();

    if (mLogger.isLoggable(Level.INFO)) {
      mLogger.log(Level.INFO, "Initializing the OpenNLP Normalizer annotator.");
    }

    try {
      String modelName = AnnotatorUtil.getOptionalStringParameter(context,
          UimaUtil.DICTIONARY_PARAMETER);

      if (modelName != null) {
        InputStream inModel = AnnotatorUtil.getResourceAsStream(context,
            modelName);

        mLookupDictionary = new StringDictionary(inModel);
      }
    } catch (IOException e) {
      throw new ResourceInitializationException(
          ExceptionMessages.MESSAGE_CATALOG, "io_error_model_reading",
          new Object[] {e.getMessage()}, e);
    }
  }

  /**
   * Initializes the type system.
   */
  public void typeSystemInit(TypeSystem typeSystem)
      throws AnalysisEngineProcessException {

    // sentence type
    // String sentenceTypeName =
    // AnnotatorUtil.getRequiredStringParameter(mContext,
    // UimaUtil.SENTENCE_TYPE_PARAMETER);

    // mSentenceType = AnnotatorUtil.getType(typeSystem, sentenceTypeName);

    // name type
    mNameType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
        NameFinder.NAME_TYPE_PARAMETER);

    mStructureFeature = AnnotatorUtil.getRequiredFeatureParameter(context,
        mNameType, "opennlp.uima.normalizer.StructureFeature");

    if (!SUPPORTED_TYPES.contains(mStructureFeature.getRange().getName())) {
      throw new AnalysisEngineProcessException(
          ExceptionMessages.MESSAGE_CATALOG, "range_type_unsupported",
          new Object[] {mStructureFeature.getRange().getName()});
    }
  }

  public void process(CAS tcas) {

    FSIndex sentenceIndex = tcas.getAnnotationIndex(mNameType);

    for (AnnotationFS nameAnnotation : sentenceIndex) {
      // check if the document language is supported
      String language = tcas.getDocumentLanguage();

      if (!NumberUtil.isLanguageSupported(language)) {
        if (mLogger.isLoggable(Level.INFO)) {
          mLogger.log(Level.INFO, "Unsupported language: " + language);
        }
        continue;
      }

      String text = nameAnnotation.getCoveredText();

      // if possible replace text with normalization from dictionary
      if (mLookupDictionary != null) {

        StringList tokens = new StringList(text);

        String normalizedText = mLookupDictionary.get(tokens);

        if (normalizedText != null) {
          text = normalizedText;
        }
      }

      if (CAS.TYPE_NAME_STRING.equals(mStructureFeature.getRange().getName())) {
        nameAnnotation.setStringValue(mStructureFeature, text);
      } else {

        Number number;
        try {
          number = NumberUtil.parse(text, language);
        } catch (ParseException e) {
          if (mLogger.isLoggable(Level.INFO)) {
            mLogger.log(Level.INFO, "Invalid number format: " + text);
          }
          continue;
        }

        if (CAS.TYPE_NAME_BYTE.equals(mStructureFeature.getRange().getName())) {
          nameAnnotation.setByteValue(mStructureFeature, number.byteValue());
        } else if (CAS.TYPE_NAME_SHORT.equals(mStructureFeature.getRange()
            .getName())) {
          nameAnnotation.setShortValue(mStructureFeature, number.shortValue());
        } else if (CAS.TYPE_NAME_INTEGER.equals(mStructureFeature.getRange()
            .getName())) {
          nameAnnotation.setIntValue(mStructureFeature, number.intValue());
        } else if (CAS.TYPE_NAME_LONG.equals(mStructureFeature.getRange()
            .getName())) {
          nameAnnotation.setLongValue(mStructureFeature, number.longValue());
        } else if (CAS.TYPE_NAME_FLOAT.equals(mStructureFeature.getRange()
            .getName())) {
          nameAnnotation.setFloatValue(mStructureFeature, number.floatValue());
        } else if (CAS.TYPE_NAME_DOUBLE.equals(mStructureFeature.getRange()
            .getName())) {
          nameAnnotation
              .setDoubleValue(mStructureFeature, number.doubleValue());
        }
      }
    }
  }
}