// org.cogroo.analyzer.NameFinder (Maven / Gradle / Ivy)
//
// Go to download
/**
 * Copyright (C) 2012 cogroo 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.analyzer;

import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;

import org.cogroo.config.Analyzers;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.text.impl.TokenImpl;
import org.cogroo.util.TextUtils;


/**
 * The NameFinder class searches for runs of consecutive proper nouns in the
 * document sentences and merges each run into a single token.
 * <p>
 * Every name span reported by the underlying {@link NameFinderME} is replaced
 * by one token whose lexeme is the covered sentence text with spaces turned
 * into underscores.
 */
public class NameFinder implements Analyzer {

  /** Finder used to locate name spans; calls to it are made while holding its monitor. */
  private final NameFinderME nameFinder;

  /**
   * Creates an analyzer backed by the given name finder.
   *
   * @param nameFinder
   *          the finder used to locate name spans in each sentence
   */
  public NameFinder(NameFinderME nameFinder) {
    this.nameFinder = nameFinder;
  }

  /**
   * Merges the tokens of every detected name into a single token, for each
   * sentence of the document. The sentences' token lists are modified in
   * place and then set back on the sentence.
   *
   * @param document
   *          the document whose sentences are analyzed; its sentences must
   *          already be tokenized
   */
  public void analyze(Document document) {
    List<Sentence> sentences = document.getSentences();

    for (Sentence sentence : sentences) {
      List<Token> tokens = sentence.getTokens();

      Span[] nameSpans;
      // The finder instance may be shared; serialize access to it.
      synchronized (this.nameFinder) {
        nameSpans = nameFinder.find(TextUtils.tokensToString(tokens));
      }

      // Process spans from last to first so that collapsing one span does not
      // shift the token indexes of the spans still to be processed (assumes
      // the finder returns spans in ascending, non-overlapping order).
      for (int i = nameSpans.length - 1; i >= 0; i--) {
        int start = nameSpans[i].getStart();
        int end = nameSpans[i].getEnd(); // exclusive token index

        // Character offsets of the whole name within the sentence text.
        int chStart = tokens.get(start).getStart();
        int chEnd = tokens.get(end - 1).getEnd();

        String name = sentence.getText().substring(chStart, chEnd).replace(" ", "_");

        // Drop the tokens covered by the span and put the merged one in their place.
        tokens.subList(start, end).clear();
        Token merged = new TokenImpl(chStart, chEnd, name);
        tokens.add(start, merged);

        // NOTE(review): "P" is presumably the proper-noun marker - confirm.
        merged.addContext(Analyzers.NAME_FINDER, "P");
      }
      sentence.setTokens(tokens);
    }
  }
}