edu.stanford.nlp.trees.international.french.FrenchHeadFinder Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English-language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.trees.international.french;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.Generics;


/**
 * Head finder for French treebank trees.
 * <p>
 * The constructor fills {@code nonTerminalInfo} with one rule table per phrasal
 * category (SENT, AP, NP, PP, the MW* multi-word categories, ...). Each table is a
 * {@code String[][]}: every inner array starts with a direction keyword
 * ({@code "left"}, {@code "right"}, {@code "leftdis"} or {@code "rightdis"}),
 * optionally followed by the child categories to look for. How these rules are
 * interpreted is defined by the superclass, {@link AbstractCollinsHeadFinder}.
 * <p>
 * TODO wsg2010: Compare these head finding rules to those found in Arun Abishek's
 * master's thesis.
 *
 * @author mcdm
 */
public class FrenchHeadFinder extends AbstractCollinsHeadFinder {

  /**
   * Creates a head finder backed by a freshly constructed
   * {@link FrenchTreebankLanguagePack}.
   */
  public FrenchHeadFinder() {
    this(new FrenchTreebankLanguagePack());
  }


  /**
   * Creates a head finder that uses the given language pack and installs the
   * French head rules into {@code nonTerminalInfo}.
   *
   * @param tlp The language pack passed to the superclass; its start symbol
   *            receives the same rule table as {@code "SENT"}
   */
  public FrenchHeadFinder(FrenchTreebankLanguagePack tlp) {
    super(tlp);


    //French POS:
    // A (adjective), ADV (adverb), C (conjunction and subordinating conjunction), CL (clitics),
    // CS (subordinating conjunction) but occurs only once!,
    // D (determiner), ET (foreign word), I (interjection), N (noun),
    // P (preposition), PREF (prefix), PRO (strong pronoun -- very confusing), V (verb), PUNC (punctuation)

    nonTerminalInfo = Generics.newHashMap();

    // "sentence" -- the language pack's start symbol and SENT share the same rules
    nonTerminalInfo.put(tlp.startSymbol(), new String[][]{{"left", "VN", "NP"}, {"left"}});
    nonTerminalInfo.put("SENT", new String[][]{{"left", "VN", "NP"}, {"left"}});

    // adjectival phrases
    nonTerminalInfo.put("AP", new String[][]{{"left", "A", "V"}, {"rightdis", "N", "ET"}, {"left"}});

    // adverbial phrases
    nonTerminalInfo.put("AdP", new String[][]{{"right", "ADV"}, {"left", "N"}, {"right"}});

    // coordinated phrases
    nonTerminalInfo.put("COORD", new String[][]{{"leftdis", "C", "CC", "ADV", "PP", "P"}, {"left"}});

    // noun phrases
    nonTerminalInfo.put("NP", new String[][]{{"rightdis", "N", "PRO", "NP", "A"}, {"right", "ET"}, {"right"}});

    // prepositional phrases
    nonTerminalInfo.put("PP", new String[][]{{"left", "P", "PRO", "A", "NP", "V", "PP", "ADV"}, {"left"}});

    // verbal nucleus
    nonTerminalInfo.put("VN", new String[][]{{"right", "V", "VN"}, {"right"}});

    // infinitive clauses
    nonTerminalInfo.put("VPinf", new String[][]{{"left", "VN", "V"}, {"left"}});

    // nonfinite clauses
    nonTerminalInfo.put("VPpart", new String[][]{{"left", "VN", "V", "AP", "A", "AdP", "VPpart"}, {"left"}});

    // relative clauses (no explicit catch-all rule is listed for this category)
    nonTerminalInfo.put("Srel", new String[][]{{"left", "NP", "PRO", "PP", "C", "ADV"}});

    // subordinate clauses
    // NOTE(review): "PC" does not appear in the POS list above -- confirm it is intended.
    nonTerminalInfo.put("Ssub", new String[][]{{"left", "C", "PC", "ADV", "P", "PP"}, {"left"}});

    // parenthetical clauses
    nonTerminalInfo.put("Sint", new String[][]{{"left", "VN", "V", "NP", "Sint", "Ssub", "PP"}, {"left"}});

    // adverbs (no explicit catch-all rule is listed for this category)
    nonTerminalInfo.put("ADV", new String[][] {{"left", "ADV", "PP", "P"}});

    // compound categories: start with MW: D, A, C, N, ADV, V, P, PRO, CL
    nonTerminalInfo.put("MWD", new String[][] {{"left", "D"}, {"left"}});
    nonTerminalInfo.put("MWA", new String[][] {{"left", "P"}, {"left", "N"}, {"right", "A"}, {"right"}});
    nonTerminalInfo.put("MWC", new String[][] {{"left", "C", "CS"}, {"left"}});
    nonTerminalInfo.put("MWN", new String[][] {{"right", "N", "ET"}, {"right"}});
    nonTerminalInfo.put("MWV", new String[][] {{"left", "V"}, {"left"}});
    nonTerminalInfo.put("MWP", new String[][] {{"left", "P", "ADV", "PRO"}, {"left"}});
    nonTerminalInfo.put("MWPRO", new String[][] {{"left", "PRO", "CL", "N", "A"}, {"left"}});
    nonTerminalInfo.put("MWCL", new String[][] {{"left", "CL"}, {"right"}});
    nonTerminalInfo.put("MWADV", new String[][] {{"left", "P", "ADV"}, {"left"}});

    nonTerminalInfo.put("MWI", new String[][] {{"left", "N", "ADV", "P"}, {"left"}});
    nonTerminalInfo.put("MWET", new String[][] {{"left", "ET", "N"}, {"left"}});

    //TODO: wsg2011: For phrasal nodes that lacked a label.
    nonTerminalInfo.put(FrenchXMLTreeReader.MISSING_PHRASAL, new String[][]{{"left"}});

  }


  /**
   * Go through trees and determine their heads and print them.
   * Just for debugging.
   * <p>
   * Usage:
   * {@code java edu.stanford.nlp.trees.international.french.FrenchHeadFinder treebankFilePath}
   *
   * @param args The treebankFilePath
   * @throws IllegalArgumentException if no treebank path is supplied
   */
  public static void main(String[] args) {
    // Fail with a usage message instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
      throw new IllegalArgumentException(
          "Usage: java " + FrenchHeadFinder.class.getName() + " treebankFilePath");
    }

    Treebank treebank = new DiskTreebank();
    CategoryWordTag.suppressTerminalDetails = true;
    treebank.loadPath(args[0]);
    final HeadFinder chf = new FrenchHeadFinder();
    // For every tree: percolate heads using the French rules, then print the tree.
    treebank.apply(pt -> {
      pt.percolateHeads(chf);
      pt.pennPrint();
      System.out.println();
    });
  }

  private static final long serialVersionUID = 8747319554557223422L;


}