All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.coref.resolver.ProperNounResolver Maven / Gradle / Ivy

There is a newer version: 2.5.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.coref.resolver;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.mention.MentionContext;

/**
 * Resolves coreference between proper nouns.
 */
public class ProperNounResolver extends MaxentResolver {

  private static Map> acroMap;
  private static boolean acroMapLoaded = false;

  public ProperNounResolver(String projectName, ResolverMode m) throws IOException {
    super(projectName,"pnmodel", m, 500);
    if (!acroMapLoaded) {
      initAcronyms(projectName + "/acronyms");
      acroMapLoaded = true;
    }
    showExclusions = false;
  }

  public ProperNounResolver(String projectName, ResolverMode m,NonReferentialResolver nonRefResolver) throws IOException {
    super(projectName,"pnmodel", m, 500,nonRefResolver);
    if (!acroMapLoaded) {
      initAcronyms(projectName + "/acronyms");
      acroMapLoaded = true;
    }
    showExclusions = false;
  }

  public boolean canResolve(MentionContext mention) {
    return (mention.getHeadTokenTag().startsWith("NNP") || mention.getHeadTokenTag().startsWith("CD"));
  }

  private void initAcronyms(String name) {
    acroMap = new HashMap>(15000);
    try {
      BufferedReader str;
      str = new BufferedReader(new FileReader(name));
      //System.err.println("Reading acronyms database: " + file + " ");
      String line;
      while (null != (line = str.readLine())) {
        StringTokenizer st = new StringTokenizer(line, "\t");
        String acro = st.nextToken();
        String full = st.nextToken();
        Set exSet = acroMap.get(acro);
        if (exSet == null) {
          exSet = new HashSet();
          acroMap.put(acro, exSet);
        }
        exSet.add(full);
        exSet = acroMap.get(full);
        if (exSet == null) {
          exSet = new HashSet();
          acroMap.put(full, exSet);
        }
        exSet.add(acro);
      }
    }
    catch (IOException e) {
      System.err.println("ProperNounResolver.initAcronyms: Acronym Database not found: " + e);
    }
  }

  private boolean isAcronym(String ecStrip, String xecStrip) {
    Set exSet = acroMap.get(ecStrip);
    if (exSet != null && exSet.contains(xecStrip)) {
      return true;
    }
    return false;
  }

  protected List getAcronymFeatures(MentionContext mention, DiscourseEntity entity) {
    MentionContext xec = ResolverUtils.getProperNounExtent(entity);
    String ecStrip = ResolverUtils.stripNp(mention);
    String xecStrip = ResolverUtils.stripNp(xec);
    if (ecStrip != null && xecStrip != null) {
      if (isAcronym(ecStrip, xecStrip)) {
        List features = new ArrayList(1);
        features.add("knownAcronym");
        return features;
      }
    }
    return Collections.emptyList();
  }

  protected List getFeatures(MentionContext mention, DiscourseEntity entity) {
    //System.err.println("ProperNounResolver.getFeatures: "+mention.toText()+" -> "+entity);
    List features = new ArrayList();
    features.addAll(super.getFeatures(mention, entity));
    if (entity != null) {
      features.addAll(ResolverUtils.getStringMatchFeatures(mention, entity));
      features.addAll(getAcronymFeatures(mention, entity));
    }
    return features;
  }

  public boolean excluded(MentionContext mention, DiscourseEntity entity) {
    if (super.excluded(mention, entity)) {
      return true;
    }

    for (Iterator ei = entity.getMentions(); ei.hasNext();) {
      MentionContext xec = ei.next();
      if (xec.getHeadTokenTag().startsWith("NNP")) { // || initialCaps.matcher(xec.headToken.toString()).find()) {
        //System.err.println("MaxentProperNounResolver.exclude: kept "+xec.toText()+" with "+xec.headTag);
        return false;
      }
    }

    return true;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy