/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.coref.mention;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import opennlp.tools.coref.Linker;
import opennlp.tools.coref.resolver.ResolverUtils;
import opennlp.tools.util.Span;

/**
 * Provides a default implementation of many of the methods in the {@link MentionFinder} interface.
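 *
 * <p>A minimal usage sketch (assuming the {@code PTBHeadFinder} and {@code PTBMentionFinder}
 * implementations from this package; substitute the head finder and mention finder actually
 * in use):</p>
 * <pre>
 *   HeadFinder headFinder = PTBHeadFinder.getInstance();
 *   MentionFinder mentionFinder = PTBMentionFinder.getInstance(headFinder);
 *   // corefParse is an opennlp.tools.coref.mention.Parse, e.g. a DefaultParse
 *   // wrapping one sentence's treebank parse
 *   Mention[] mentions = mentionFinder.getMentions(corefParse);
 * </pre>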
 */
public abstract class AbstractMentionFinder implements MentionFinder {

  protected HeadFinder headFinder;

  protected boolean collectPrenominalNamedEntities;
  protected boolean collectCoordinatedNounPhrases;

  private void gatherHeads(Parse p, Map<Parse, Parse> heads) {
    Parse head = headFinder.getHead(p);
    //System.err.println("AbstractMention.gatherHeads: "+head+" -> ("+p.hashCode()+") "+p);
    //if (head != null) { System.err.println("head.hashCode()="+head.hashCode());}
    if (head != null) {
      heads.put(head, p);
    }
  }

  /** Assigns head relations between noun phrases and the child np
   *  which is their head.
   *  @param nps List of valid nps for this mention finder.
   *  @return mapping from the head child of each noun phrase to the noun phrase it heads.
   **/
  protected Map<Parse, Parse> constructHeadMap(List<Parse> nps) {
    Map<Parse, Parse> headMap = new HashMap<Parse, Parse>();
    for (int ni = 0; ni < nps.size(); ni++) {
      Parse np = nps.get(ni);
      gatherHeads(np, headMap);
    }
    return headMap;
  }

  public boolean isPrenominalNamedEntityCollection() {
    return collectPrenominalNamedEntities;
  }

  public void setPrenominalNamedEntityCollection(boolean b) {
    collectPrenominalNamedEntities = b;
  }

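  /**
   * Returns whether the specified noun phrase contains no other noun phrases (a basal np).
   */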
  protected boolean isBasalNounPhrase(Parse np) {
    return np.getNounPhrases().size() == 0;
  }

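  /**
   * Returns whether the specified noun phrase looks possessive: either its first child np
   * ends in a "POS" token, or its first three children form an np "POS" np pattern.
   */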
  protected boolean isPossessive(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    if (parts.size() > 1) {
      Parse child0 = parts.get(0);
      if (child0.isNounPhrase()) {
        List<Parse> ctoks = child0.getTokens();
        Parse tok = ctoks.get(ctoks.size() - 1);
        if (tok.getSyntacticType().equals("POS")) {
          return true;
        }
      }
    }
    if (parts.size() > 2) {
      Parse child0 = parts.get(0);
      Parse child1 = parts.get(1);
      Parse child2 = parts.get(2);
      if (child1.isToken() && child1.getSyntacticType().equals("POS") && child0.isNounPhrase() && child2.isNounPhrase()) {
        return true;
      }
    }
    return false;
  }

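  /**
   * Returns whether the specified noun phrase consists of an np followed by a constituent
   * whose first token is "of" (an of-prepositional phrase).
   */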
  protected boolean isOfPrepPhrase(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    if (parts.size() == 2) {
      Parse child0 = parts.get(0);
      if (child0.isNounPhrase()) {
        Parse child1 = parts.get(1);
        List<Parse> cparts = child1.getSyntacticChildren();
        if (cparts.size() == 2) {
          Parse child2 = cparts.get(0);
          if (child2.isToken() && child2.toString().equals("of")) {
            return true;
          }
        }
      }
    }
    return false;
  }

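  /**
   * Returns whether the specified noun phrase consists only of tokens and contains a
   * coordinating conjunction ("CC") token.
   */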
  protected boolean isConjoinedBasal(Parse np) {
    List<Parse> parts = np.getSyntacticChildren();
    boolean allToken = true;
    boolean hasConjunction = false;
    for (int ti = 0; ti < parts.size(); ti++) {
      Parse c = parts.get(ti);
      if (c.isToken()) {
        if (c.getSyntacticType().equals("CC")) {
          hasConjunction = true;
        }
      }
      else {
        allToken = false;
        break;
      }
    }
    return allToken && hasConjunction;
  }

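  /**
   * Adds a "CNP" mention for each conjunct of a coordinated basal noun phrase, scanning
   * leftward from the head token for "and", "or", and comma delimiters.  Noun phrases
   * containing "UCP" or "NX" constituents are skipped.
   */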
  private void collectCoordinatedNounPhraseMentions(Parse np, List<Mention> entities) {
    //System.err.println("collectCoordNp: "+np);
    //exclude nps with UCPs inside.
    List<Parse> sc = np.getSyntacticChildren();
    for (Iterator<Parse> sci = sc.iterator(); sci.hasNext();) {
      Parse scp = sci.next();
      if (scp.getSyntacticType().equals("UCP") || scp.getSyntacticType().equals("NX")) {
        return;
      }
    }
    List<Parse> npTokens = np.getTokens();
    boolean inCoordinatedNounPhrase = false;
    int lastNpTokenIndex = headFinder.getHeadIndex(np);
    for (int ti = lastNpTokenIndex - 1; ti >= 0; ti--) {
      Parse tok = npTokens.get(ti);
      String tokStr = tok.toString();
      if ((tokStr.equals("and") || tokStr.equals("or")) && !isPartOfName(tok)) {
        if (lastNpTokenIndex != ti) {
          if (ti - 1 >= 0 && (npTokens.get(ti - 1)).getSyntacticType().startsWith("NN")) {
            Span npSpan = new Span((npTokens.get(ti + 1)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
            Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
            entities.add(snpExtent);
            //System.err.println("adding extent for conjunction in: "+np+" preeceeded by "+((Parse) npTokens.get(ti-1)).getSyntacticType());
            inCoordinatedNounPhrase = true;
          }
          else {
            break;
          }
        }
        lastNpTokenIndex = ti - 1;
      }
      else if (inCoordinatedNounPhrase && tokStr.equals(",")) {
        if (lastNpTokenIndex != ti) {
          Span npSpan = new Span((npTokens.get(ti + 1)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
          Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
          entities.add(snpExtent);
          //System.err.println("adding extent for comma in: "+np);
        }
        lastNpTokenIndex = ti - 1;
      }
      else if (inCoordinatedNounPhrase && ti == 0 && lastNpTokenIndex >= 0) {
        Span npSpan = new Span((npTokens.get(ti)).getSpan().getStart(), (npTokens.get(lastNpTokenIndex)).getSpan().getEnd());
        Mention snpExtent = new Mention(npSpan, npSpan, tok.getEntityId(), null,"CNP");
        entities.add(snpExtent);
        //System.err.println("adding extent for start coord in: "+np);
      }
    }
  }

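  /**
   * Returns whether the specified token string matches one of the pronoun patterns handled
   * by the resolvers (singular third-person, plural third-person, or speech pronouns).
   */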
  private boolean handledPronoun(String tok) {
    return ResolverUtils.singularThirdPersonPronounPattern.matcher(tok).find() ||
                 ResolverUtils.pluralThirdPersonPronounPattern.matcher(tok).find() ||
                 ResolverUtils.speechPronounPattern.matcher(tok).find();
  }

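  /**
   * Adds a {@link Linker#PRONOUN_MODIFIER} mention for the rightmost non-head "PRP"-tagged
   * token inside the specified noun phrase that matches a handled pronoun pattern.
   */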
  private void collectPossesivePronouns(Parse np, List<Mention> entities) {
    //TODO: Look at how training is done and examine whether this is needed or can be accommodated in a different way.
    /*
    List snps = np.getSubNounPhrases();
    if (snps.size() != 0) {
      //System.err.println("AbstractMentionFinder: Found existing snps");
      for (int si = 0, sl = snps.size(); si < sl; si++) {
        Parse snp = (Parse) snps.get(si);
        Extent ppExtent = new Extent(snp.getSpan(), snp.getSpan(), snp.getEntityId(), null,Linker.PRONOUN_MODIFIER);
        entities.add(ppExtent);
      }
    }
    else {
    */
      //System.err.println("AbstractEntityFinder.collectPossesivePronouns: "+np);
      List<Parse> npTokens = np.getTokens();
      Parse headToken = headFinder.getHeadToken(np);
      for (int ti = npTokens.size() - 2; ti >= 0; ti--) {
        Parse tok = npTokens.get(ti);
        if (tok == headToken) {
          continue;
        }
        if (tok.getSyntacticType().startsWith("PRP") && handledPronoun(tok.toString())) {
          Mention ppExtent = new Mention(tok.getSpan(), tok.getSpan(), tok.getEntityId(), null,Linker.PRONOUN_MODIFIER);
          //System.err.println("AbstractEntityFinder.collectPossesivePronouns: adding possesive pronoun: "+tok+" "+tok.getEntityId());
          entities.add(ppExtent);
          //System.err.println("AbstractMentionFinder: adding pos-pro: "+ppExtent);
          break;
        }
      }
    //}
  }

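  /**
   * Removes from the span-sorted extent list any mention whose span equals that of the
   * preceding mention.
   */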
  private void removeDuplicates(List<Mention> extents) {
    Mention lastExtent = null;
    for (Iterator<Mention> ei = extents.iterator(); ei.hasNext();) {
      Mention e = ei.next();
      if (lastExtent != null && e.getSpan().equals(lastExtent.getSpan())) {
        ei.remove();
      }
      else {
        lastExtent = e;
      }
    }
  }

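  /**
   * Returns whether the specified noun phrase is, directly or transitively via the head map,
   * the head of a noun phrase that has already been collected as a mention.
   */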
  private boolean isHeadOfExistingMention(Parse np, Map<Parse, Parse> headMap,
      Set<Parse> mentions) {
    Parse head = headMap.get(np);
    while (head != null) {
      if (mentions.contains(head)) {
        return true;
      }
      head = headMap.get(head);
    }
    return false;
  }


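  /**
   * Removes from the set of recent mentions any noun phrase whose span does not contain the
   * span of the specified noun phrase.
   */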
  private void clearMentions(Set<Parse> mentions, Parse np) {
    Span npSpan = np.getSpan();
    for (Iterator<Parse> mi = mentions.iterator(); mi.hasNext();) {
      Parse mention = mi.next();
      if (!mention.getSpan().contains(npSpan)) {
        //System.err.println("clearing "+mention+" for "+np);
        mi.remove();
      }
    }
  }

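  /**
   * Creates a mention for each noun phrase which is not part of a name and not the head of
   * an already collected mention; for basal noun phrases it additionally collects prenominal
   * named entities and coordinated noun phrases (where enabled) and possessive pronouns.
   * The resulting mentions are sorted and duplicate spans removed.
   */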
  private Mention[] collectMentions(List<Parse> nps, Map<Parse, Parse> headMap) {
    List<Mention> mentions = new ArrayList<Mention>(nps.size());
    Set<Parse> recentMentions = new HashSet<Parse>();
    //System.err.println("AbtractMentionFinder.collectMentions: "+headMap);
    for (int npi = 0, npl = nps.size(); npi < npl; npi++) {
      Parse np = nps.get(npi);
      //System.err.println("AbstractMentionFinder: collectMentions: np[" + npi + "]=" + np + " head=" + headMap.get(np));
      if (!isHeadOfExistingMention(np,headMap, recentMentions)) {
        clearMentions(recentMentions, np);
        if (!isPartOfName(np)) {
          Parse head = headFinder.getLastHead(np);
          Mention extent = new Mention(np.getSpan(), head.getSpan(), head.getEntityId(), np, null);
          //System.err.println("adding "+np+" with head "+head);
          mentions.add(extent);
          recentMentions.add(np);
          // determine name-entity type
          String entityType = getEntityType(headFinder.getHeadToken(head));
          if (entityType != null) {
            extent.setNameType(entityType);
          }
        }
        else {
          //System.err.println("AbstractMentionFinder.collectMentions excluding np as part of name. np=" + np);
        }
      }
      else {
        //System.err.println("AbstractMentionFinder.collectMentions excluding np as head of previous mention. np=" + np);
      }
      if (isBasalNounPhrase(np)) {
        if (collectPrenominalNamedEntities) {
          collectPrenominalNamedEntities(np, mentions);
        }
        if (collectCoordinatedNounPhrases) {
          collectCoordinatedNounPhraseMentions(np, mentions);
        }
        collectPossesivePronouns(np, mentions);
      }
      else {
        // Could use to get NP -> tokens CON structures for basal nps including NP -> NAC tokens
        //collectComplexNounPhrases(np,mentions);
      }
    }
    Collections.sort(mentions);
    removeDuplicates(mentions);
    return mentions.toArray(new Mention[mentions.size()]);
  }

  /**
   * Adds a mention for the non-treebank-labeled possessive noun phrases.
   * @param possesiveNounPhrase The possessive noun phrase which may require an additional mention.
   * @param mentions The list of mentions into which a new mention can be added.
   */
//  private void addPossesiveMentions(Parse possesiveNounPhrase, List mentions) {
//    List kids = possesiveNounPhrase.getSyntacticChildren();
//    if (kids.size() >1) {
//      Parse firstToken = kids.get(1);
//      if (firstToken.isToken() && !firstToken.getSyntacticType().equals("POS")) {
//        Parse lastToken = kids.get(kids.size()-1);
//        if (lastToken.isToken()) {
//          Span extentSpan = new Span(firstToken.getSpan().getStart(),lastToken.getSpan().getEnd());
//          Mention extent = new Mention(extentSpan, extentSpan, -1, null, null);
//          mentions.add(extent);
//        }
//        else {
//          System.err.println("AbstractMentionFinder.addPossesiveMentions: odd parse structure: "+possesiveNounPhrase);
//        }
//      }
//    }
//  }

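  /**
   * Adds a "NAME" mention for each named entity inside the specified noun phrase whose span
   * does not contain the head token.
   */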
  private void collectPrenominalNamedEntities(Parse np, List<Mention> extents) {
    Parse htoken = headFinder.getHeadToken(np);
    List<Parse> nes = np.getNamedEntities();
    Span headTokenSpan = htoken.getSpan();
    for (int nei = 0, nel = nes.size(); nei < nel; nei++) {
      Parse ne = nes.get(nei);
      if (!ne.getSpan().contains(headTokenSpan)) {
        //System.err.println("adding extent for prenominal ne: "+ne);
        Mention extent = new Mention(ne.getSpan(), ne.getSpan(), ne.getEntityId(),null,"NAME");
        extent.setNameType(ne.getEntityType());
        extents.add(extent);
      }
    }
  }

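  /**
   * Returns the named-entity type of the nearest ancestor of the specified head token (up to
   * the sentence level), falling back to the token's last child, or null if none is found.
   */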
  private String getEntityType(Parse headToken) {
    String entityType;
    for (Parse parent = headToken.getParent(); parent != null; parent = parent.getParent()) {
      entityType = parent.getEntityType();
      if (entityType != null) {
        return entityType;
      }
      if (parent.isSentence()) {
        break;
      }
    }
    List<Parse> tc = headToken.getChildren();
    int tcs = tc.size();
    if (tcs > 0) {
      Parse tchild = tc.get(tcs - 1);
      entityType = tchild.getEntityType();
      if (entityType != null) {
        return entityType;
      }
    }
    return null;
  }

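  /**
   * Returns whether the specified noun phrase lies inside a larger named entity, i.e. some
   * ancestor has an entity type and the noun phrase's span does not cover that ancestor's span.
   */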
  private boolean isPartOfName(Parse np) {
    String entityType;
    for (Parse parent = np.getParent(); parent != null; parent = parent.getParent()) {
      entityType = parent.getEntityType();
      //System.err.println("AbstractMentionFinder.isPartOfName: entityType="+entityType);
      if (entityType != null) {
        //System.err.println("npSpan = "+np.getSpan()+" parentSpan="+parent.getSpan());
        if (!np.getSpan().contains(parent.getSpan())) {
          return true;
        }
      }
      if (parent.isSentence()) {
        break;
      }
    }
    return false;
  }

  /** Return all noun phrases which are contained by p.
   * @param p The parse in which to find the noun phrases.
   * @return A list of Parse objects which are noun phrases contained by p.
   */
  //protected abstract List getNounPhrases(Parse p);

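  /**
   * Returns the named entities contained in the specified parse.
   */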
  public List<Parse> getNamedEntities(Parse p) {
    return p.getNamedEntities();
  }

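  /**
   * Collects the mentions for all noun phrases contained in the specified parse, sorted by span.
   */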
  public Mention[] getMentions(Parse p) {
    List<Parse> nps = p.getNounPhrases();
    Collections.sort(nps);
    Map<Parse, Parse> headMap = constructHeadMap(nps);
    //System.err.println("AbstractMentionFinder.getMentions: got " + nps.size()); // + " nps, and " + nes.size() + " named entities");
    Mention[] mentions = collectMentions(nps, headMap);
    return mentions;
  }

  public boolean isCoordinatedNounPhraseCollection() {
    return collectCoordinatedNounPhrases;
  }

  public void setCoordinatedNounPhraseCollection(boolean b) {
    collectCoordinatedNounPhrases = b;
  }
}