All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.cogroo.analyzer.ContractionFinder Maven / Gradle / Ivy

There is a newer version: 4.3.1
Show newest version
/**
 * Copyright (C) 2012 cogroo 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.analyzer;

import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;

import org.apache.log4j.Logger;
import org.cogroo.ContractionUtility;
import org.cogroo.config.Analyzers;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.text.impl.TokenImpl;
import org.cogroo.util.TextUtils;


/**
 * Searches for contractions in the sentences of a document and replaces each
 * one with the tokens of its expanded form.
 * <p>
 * Every token produced by an expansion is tagged, under
 * {@link Analyzers#CONTRACTION_FINDER}, with its position inside the
 * expansion: "B" for the first token, "E" for the last token of a
 * multi-token expansion and "I" for any token in between.
 */
public class ContractionFinder implements Analyzer {

  protected static final Logger LOGGER = Logger.getLogger(ContractionFinder.class);

  /** Model used to locate contractions; also used as the lock around find(). */
  private final NameFinderME contractionFinder;

  /**
   * Creates a finder backed by the given model.
   *
   * @param contractionFinder
   *          the name finder used to locate contractions in a token sequence
   */
  public ContractionFinder(NameFinderME contractionFinder) {
    this.contractionFinder = contractionFinder;
  }

  /**
   * Expands the contractions found in every sentence of the document.
   * <p>
   * For each span reported by the finder, the token at the span start is
   * replaced by one token per expanded form. The new tokens reuse the start
   * and end offsets of the token they replace. A token whose lexeme has no
   * known expansion is kept unchanged and reported at debug level.
   *
   * @param document
   *          the document whose sentences are updated
   */
  public void analyze(Document document) {
    List<Sentence> sentences = document.getSentences();

    for (Sentence sentence : sentences) {
      List<Token> tokens = sentence.getTokens();
      Span[] contractionsSpan;

      // Calls to find() are serialized on the shared finder instance.
      synchronized (this.contractionFinder) {
        contractionsSpan = contractionFinder.find(TextUtils.tokensToString(tokens));
      }

      // Walk the spans backwards so that replacing one token by several does
      // not shift the indexes of the spans still to be processed.
      // NOTE(review): this relies on the spans being ordered by start index.
      for (int i = contractionsSpan.length - 1; i >= 0; i--) {
        int start = contractionsSpan[i].getStart();

        Token original = tokens.get(start);
        String lexeme = original.getLexeme();
        String[] contractions = ContractionUtility.expand(lexeme);

        if (contractions == null || contractions.length == 0) {
          // No expansion available: keep the original token in place instead
          // of dropping it from the sentence.
          LOGGER.debug("Missing contraction: " + lexeme);
          continue;
        }

        tokens.remove(start);

        // Insert in reverse at the same index so the expansion ends up in its
        // natural order.
        int last = contractions.length - 1;
        for (int j = last; j >= 0; j--) {
          Token token = new TokenImpl(original.getStart(), original.getEnd(), contractions[j]);
          tokens.add(start, token);

          String caze;
          if (j == 0) {
            caze = "B";
          } else if (j == last) {
            caze = "E";
          } else {
            caze = "I";
          }

          token.addContext(Analyzers.CONTRACTION_FINDER, caze);
        }
      }
      sentence.setTokens(tokens);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy