// NOTE(review): the two lines below are boilerplate from the Maven artifact-browser
// page this copy was scraped from, not Java source; kept as comments so they cannot
// break compilation.
// edu.emory.mathcs.nlp.conversion.EnglishC2DConverter Maven / Gradle / Ivy
// The newest version!
/**
* Copyright 2014, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.conversion;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UnknownFormatConversionException;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import edu.emory.mathcs.nlp.common.collection.arc.AbstractArc;
import edu.emory.mathcs.nlp.common.constituent.CTLib;
import edu.emory.mathcs.nlp.common.constituent.CTLibEn;
import edu.emory.mathcs.nlp.common.constituent.CTNode;
import edu.emory.mathcs.nlp.common.constituent.CTTagEn;
import edu.emory.mathcs.nlp.common.constituent.CTTree;
import edu.emory.mathcs.nlp.common.propbank.PBLib;
import edu.emory.mathcs.nlp.common.treebank.DEPLibEn;
import edu.emory.mathcs.nlp.common.treebank.DEPTagEn;
import edu.emory.mathcs.nlp.common.treebank.PBArc;
import edu.emory.mathcs.nlp.common.treebank.POSLibEn;
import edu.emory.mathcs.nlp.common.treebank.POSTagEn;
import edu.emory.mathcs.nlp.common.util.DSUtils;
import edu.emory.mathcs.nlp.common.util.ENUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.common.util.NLPUtils;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.Splitter;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.dep.DEPArc;
import edu.emory.mathcs.nlp.component.template.node.FeatMap;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Emoticon;
import edu.emory.mathcs.nlp.conversion.util.C2DInfo;
import edu.emory.mathcs.nlp.conversion.util.HeadRule;
import edu.emory.mathcs.nlp.conversion.util.HeadRuleMap;
/**
* Constituent to dependency converter for English.
* @since 3.0.0
* @author Jinho D. Choi ({@code [email protected]})
*/
public class EnglishC2DConverter extends C2DConverter
{
// --- Tag sets and state consulted by the conversion rules ---------------------
// NOTE(review): every generic type argument in this class (e.g. Set<String>,
// Map<CTNode, Deque<CTNode>>, Predicate<CTNode>) appears to have been stripped
// by the HTML scrape this copy came from; the raw/mangled declarations below
// must be re-parameterized from the original source before this compiles.
private final Set S_NPADVMOD = DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_QP);
private final Set S_ADVCL = DSUtils.toHashSet(CTTagEn.C_S, CTTagEn.C_SBAR, CTTagEn.C_SINV);
private final Set S_NFMOD = DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_WHNP);
private final Set S_CCOMP = DSUtils.toHashSet(CTTagEn.C_S, CTTagEn.C_SQ, CTTagEn.C_SINV, CTTagEn.C_SBARQ);
private final Set S_META = DSUtils.toHashSet(CTTagEn.C_EDITED, CTTagEn.C_EMBED, CTTagEn.C_LST, CTTagEn.C_META, CTLibEn.POS_CODE, CTTagEn.C_CAPTION, CTTagEn.C_CIT, CTTagEn.C_HEADING, CTTagEn.C_TITLE);
private final Set S_MARK = DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_TO, CTLibEn.POS_DT);
private final Set S_POSS = DSUtils.toHashSet(CTLibEn.POS_PRPS, CTLibEn.POS_WPS);
private final Set S_INTJ = DSUtils.toHashSet(CTTagEn.C_INTJ, CTLibEn.POS_UH);
private final Set S_PRT = DSUtils.toHashSet(CTTagEn.C_PRT, CTLibEn.POS_RP);
// private final Set S_NUM = DSUtils.toHashSet(CTLibEn.POS_CD, CTTagEn.C_QP);
private final Set S_DET = DSUtils.toHashSet(CTLibEn.POS_DT, CTLibEn.POS_WDT, CTLibEn.POS_WP);
private final Set S_AUX = DSUtils.toHashSet(CTLibEn.POS_MD, CTLibEn.POS_TO);
// private final Set S_NN = DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP);
// private final Set S_ADJT_PHRASE = DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_WHADJP);
private final Set S_NOUN_PHRASE = DSUtils.toHashSet(CTTagEn.C_NP, CTTagEn.C_NML);
private final Set S_PREP_PHRASE = DSUtils.toHashSet(CTTagEn.C_PP, CTTagEn.C_WHPP);
private final Set S_ADVB_PHRASE = DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_ADVP, CTTagEn.C_PP);
private final Set S_PREPOSITION = DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_TO);
// private final Set S_PARTICIPIAL = DSUtils.toHashSet(CTLibEn.POS_VBG, CTLibEn.POS_VBN);
private final Set S_PREP_DET = DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_DT);
private final Set S_COMP_PARENT_S = DSUtils.toHashSet(CTTagEn.C_VP, CTTagEn.C_SINV, CTTagEn.C_SQ);
private final Set S_COMP_PARENT_A = DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_ADVP);
private final Set S_NMOD_PARENT = DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_NX, CTTagEn.C_WHNP);
private final Set S_POSS_PARENT = DSUtils.toHashSet(CTTagEn.C_NP, CTTagEn.C_NML, CTTagEn.C_WHNP, CTTagEn.C_QP, CTTagEn.C_ADJP);
private final Set S_COMPLM = DSUtils.toHashSet("that", "if", "whether");
// Number of head-selection flag levels passed to getHead (see setHeadsAux).
private final int SIZE_HEAD_FLAGS = 4;
// Function tags emitted as semantic (FEAT_SEM) vs. syntactic (FEAT_SYN) feats; see initBasic/addFeats.
private Set s_semTags;
private Set s_synTags;
// Per-tree state: antecedent -> clauses for right-node-raising (RNR) and
// external-subject (XSUBJ) secondary dependencies; cleared in clearMaps.
// NOTE(review): "Map>" is scrape damage — presumably Map<CTNode, Deque<CTNode>>; confirm upstream.
private Map> m_rnr;
private Map> m_xsubj;
// Constituent tag -> pattern of tags that may be conjoined under it (see initCoord).
private Map m_coord;
// Node matchers built once in initMatchers (constituent tag / function tag predicates).
private Predicate mt_s;
private Predicate mt_to;
private Predicate mt_pos;
private Predicate mt_sbj;
private Predicate mt_prd;
private Predicate mt_none;
private Predicate mt_in_dt;
private Predicate mt_np_prd;
// Emoticon dictionary used by isEmoticon during label finalization.
private Emoticon emoticon;
/**
 * Creates an English constituent-to-dependency converter.
 * @param headrules head-percolation rules; the default fallback rule scans right-to-left.
 */
public EnglishC2DConverter(HeadRuleMap headrules)
{
super(headrules, new HeadRule(HeadRule.DIR_RIGHT_TO_LEFT));
initBasic();
initCoord();
initMatchers();
emoticon = new Emoticon();
}
/**
 * Converts the constituent tree to a dependency graph.
 * Pipeline: preprocess, clear per-tree state, relocate empty categories,
 * assign heads, build the dependency tree, then finalize labels/compounds.
 * @return the dependency graph, or {@code null} if the tree is empty after
 *         empty-category removal or if conversion throws.
 */
@Override
public NLPNode[] toDependencyGraph(CTTree cTree)
{
NLPNode[] tree = null;
try
{
CTLibEn.preprocess(cTree);
clearMaps();
if (!mapEmtpyCategories(cTree)) return null;
setHeads(cTree.getRoot());
tree = getDEPTree(cTree);
}
// NOTE(review): broad catch that only prints the stack trace and falls through
// to return null; consider narrowing the exception type and using a logger.
catch (Exception e) {e.printStackTrace();}
if (tree != null) finalize(tree);
return tree;
}
// ============================= Initialization =============================
/** Initializes the semantic/syntactic function-tag sets and the RNR/XSUBJ maps. */
private void initBasic()
{
s_semTags = DSUtils.toHashSet(CTTagEn.F_BNF, CTTagEn.F_DIR, CTTagEn.F_EXT, CTTagEn.F_LOC, CTTagEn.F_MNR, CTTagEn.F_PRP, CTTagEn.F_TMP, CTTagEn.F_VOC);
s_synTags = DSUtils.toHashSet(CTTagEn.F_ADV, CTTagEn.F_CLF, CTTagEn.F_CLR, CTTagEn.F_DTV, CTTagEn.F_NOM, CTTagEn.F_PUT, CTTagEn.F_PRD, CTTagEn.F_TPC);
m_rnr = new HashMap<>();
m_xsubj = new HashMap<>();
}
/**
 * Builds the coordination table: for each constituent tag, the OR-pattern of
 * child tags that count as conjuncts under it (consulted by getConjunctPattern).
 */
private void initCoord()
{
m_coord = new HashMap<>();
m_coord.put(CTTagEn.C_ADJP , PatternUtils.createClosedORPattern("ADJP","JJ.*","VBN","VBG"));
m_coord.put(CTTagEn.C_ADVP , PatternUtils.createClosedORPattern("ADVP","RB.*"));
m_coord.put(CTTagEn.C_INTJ , PatternUtils.createClosedORPattern("INTJ","UH"));
m_coord.put(CTTagEn.C_PP , PatternUtils.createClosedORPattern("PP","IN","VBG"));
m_coord.put(CTTagEn.C_PRT , PatternUtils.createClosedORPattern("PRT","RP"));
m_coord.put(CTTagEn.C_NAC , PatternUtils.createClosedORPattern("NP"));
m_coord.put(CTTagEn.C_NML , PatternUtils.createClosedORPattern("NP","NML","NN.*","PRP"));
m_coord.put(CTTagEn.C_NP , PatternUtils.createClosedORPattern("NP","NML","NN.*","PRP"));
m_coord.put(CTTagEn.C_NX , PatternUtils.createClosedORPattern("NX"));
m_coord.put(CTTagEn.C_VP , PatternUtils.createClosedORPattern("VP","VB.*"));
m_coord.put(CTTagEn.C_S , PatternUtils.createClosedORPattern("S","SINV","SQ","SBARQ"));
m_coord.put(CTTagEn.C_SBAR , PatternUtils.createClosedORPattern("SBAR.*"));
m_coord.put(CTTagEn.C_SBARQ , PatternUtils.createClosedORPattern("SBAR.*"));
m_coord.put(CTTagEn.C_SINV , PatternUtils.createClosedORPattern("S","SINV"));
m_coord.put(CTTagEn.C_SQ , PatternUtils.createClosedORPattern("S","SQ","SBARQ"));
m_coord.put(CTTagEn.C_WHNP , PatternUtils.createClosedORPattern("NN.*","WP"));
m_coord.put(CTTagEn.C_WHADJP, PatternUtils.createClosedORPattern("JJ.*","VBN","VBG"));
m_coord.put(CTTagEn.C_WHADVP, PatternUtils.createClosedORPattern("RB.*","WRB","IN"));
}
/** Builds the reusable node matchers (constituent-tag, function-tag, and combined). */
private void initMatchers()
{
mt_s = CTLib.matchC(CTTagEn.C_S);
mt_to = CTLib.matchC(POSTagEn.POS_TO);
mt_pos = CTLib.matchC(POSTagEn.POS_POS);
mt_none = CTLib.matchC(CTLibEn.NONE);
mt_sbj = CTLib.matchF(CTTagEn.F_SBJ);
mt_prd = CTLib.matchF(CTTagEn.F_PRD);
mt_np_prd = CTLib.matchCF(CTTagEn.C_NP, CTTagEn.F_PRD);
mt_in_dt = CTLib.matchCo(DSUtils.toHashSet(POSTagEn.POS_IN, POSTagEn.POS_DT));
}
/** Resets per-tree state (RNR/XSUBJ maps) before converting the next tree. */
private void clearMaps()
{
m_rnr.clear();
m_xsubj.clear();
}
// ============================= Empty categories =============================
/**
 * Removes, relocates empty categories in the specific tree.
 * (NOTE(review): the method name keeps the historical "Emtpy" typo because other
 * methods and javadoc in this class reference it by this spelling.)
 * @param cTree the constituent tree to be processed.
 * @return {@code true} if the constituent tree contains nodes after relocating empty categories.
 */
private boolean mapEmtpyCategories(CTTree cTree)
{
for (CTNode node : cTree.getTerminalList())
{
if (!node.isEmptyCategory()) continue;
if (node.getParent() == null) continue;
// dispatch on the empty-category word form (*PRO*, *T*, passive null, *?*, ICH/RNR/...)
if (node.wordFormStartsWith(CTTagEn.E_PRO))
mapPRO(cTree, node);
else if (node.wordFormStartsWith(CTTagEn.E_TRACE))
mapTrace(cTree, node);
else if (node.matchesWordForm(CTLibEn.P_PASSIVE_NULL))
mapPassiveNull(cTree, node);
else if (node.isWordForm(CTTagEn.E_ZERO))
continue;
else if (CTLibEn.isDiscontinuousConstituent(node))
mapDiscontinuousConstituent(cTree, node);
// else if (node.wordFormStartsWith(CTTagEn.E_EXP))
// reloateEXP(cTree, node);
else
removeCTNode(node);
}
return cTree.getRoot().getChildrenSize() > 0;
}
/** Called by {@link #mapEmtpyCategories(CTTree)}. */
private void mapPRO(CTTree cTree, CTNode ec)
{
CTNode np = ec.getParent();
CTNode vp = np.getParent().getFirstLowestChainedDescendant(CTLibEn.M_VP);
if (vp == null) // small clauses
relocatePRD(np, ec);
else
{
CTNode ante;
if ((ante = ec.getAntecedent()) != null && CTLibEn.isWhPhrase(ante)) // relative clauses
{
// only map as a trace when this is the sole empty category co-indexed with the antecedent
if (cTree.getEmptyCategoryList(ante.getEmptyCategoryIndex()).size() == 1)
mapTrace(cTree, ec);
}
// record the enclosing clause for an XSUBJ secondary dependency
addXSubject(ec, m_xsubj);
}
}
/** Called by {@link #mapEmtpyCategories(CTTree)}. */
private void mapTrace(CTTree cTree, CTNode ec)
{
CTNode ante = ec.getAntecedent();
// no antecedent, or the trace sits inside its own antecedent: just drop it
if (ante == null || ec.isDescendantOf(ante))
removeCTNode(ec);
else if (ante.hasFunctionTag(CTTagEn.F_TPC))
{
// topicalization: move the topic back into the trace position,
// unless the topic is also the subject (then keep it where it is)
if (!ante.hasFunctionTag(CTTagEn.F_SBJ))
{
CTNode parent = ec.getParent();
parent.removeChild(ec);
replaceEC(parent, ante);
}
else
removeCTNode(ec);
}
else // relative clauses
{
CTNode parent = ante.getHighestChainedAncestor(CTLibEn.M_SBAR);
if (parent != null) parent.addFunctionTag(DEPTagEn.DEP_RELCL);
replaceEC(ec, ante);
}
}
/** Called by {@link #mapEmtpyCategories(CTTree)}. */
private void mapPassiveNull(CTTree cTree, CTNode ec)
{
CTNode np = ec.getParent();
if (np.hasFunctionTag(CTTagEn.F_SBJ))
{
// small clauses
if (np.getRightNearestSibling(CTLibEn.M_VP) == null)
relocatePRD(np, ec);
else
// passive empty subject: record for an XSUBJ secondary dependency
addXSubject(ec, m_xsubj);
}
}
/** Called by {@link #mapEmtpyCategories(CTTree)}. */
// Handles *ICH*/*RNR*-style discontinuous constituents: either drops the empty
// category, or replaces it with its antecedent; for right-node raising, the
// displaced clause parents are queued in m_rnr for secondary RNR dependencies.
private void mapDiscontinuousConstituent(CTTree cTree, CTNode ec)
{
CTNode parent = ec.getParent();
CTNode ante = ec.getAntecedent();
if (ec.wordFormStartsWith(CTTagEn.E_ICH) && parent.getLeftNearestSibling(CTLibEn.M_WHx) != null)
removeCTNode(ec);
else if (ante == null || ec.isDescendantOf(ante))
removeCTNode(ec);
else
{
List list = cTree.getEmptyCategoryList(ante.getEmptyCategoryIndex());
boolean isRNR = CTLibEn.isRNR(ec);
int i, size = list.size();
CTNode node;
Deque dq = isRNR ? new ArrayDeque() : null;
if (ec.getTerminalID() < ante.getFirstTerminal().getTerminalID())
{
// NOTE(review): the loop header below was garbled by the scrape ("i0; i--");
// the original presumably iterated the co-indexed empty-category list in both
// directions — restore from the upstream source before compiling.
for (i=0; i0; i--)
{
node = list.get(i);
if (isRNR) dq.addFirst(node.getParent().getParent());
removeCTNode(node);
}
ec = list.get(0);
}
if (isRNR && !dq.isEmpty())
m_rnr.put(ante, dq);
parent = ec.getParent();
parent.removeChild(ec);
replaceEC(parent, ante);
}
}
/** Called by {@link #mapPRO(CTTree, CTNode)} and {@link #mapPassiveNull(CTTree, CTNode)}. */
// Small clauses: retags the clause as an object predicate (OPRD) when its
// function tags are empty or contain only CLR, then drops the empty category.
private void relocatePRD(CTNode np, CTNode ec)
{
CTNode s = np.getParent();
CTNode prd = s.getFirstChild(mt_prd);
Set fTags = s.getFunctionTagSet();
if (prd != null && (fTags.isEmpty() || fTags.contains(CTTagEn.F_CLR)))
{
fTags.clear();
fTags.add(DEPTagEn.DEP_OPRD);
}
removeCTNode(ec);
}
/* private void reloateEXP(CTTree cTree, CTNode ec)
{
int idx = ec.form.lastIndexOf("-");
if (idx != -1)
{
int coIndex = Integer.parseInt(ec.form.substring(idx+1));
CTNode ante = cTree.getCoIndexedAntecedent(coIndex);
if (ante != null) ante.addFTag(DEPTagEn.CONLL_EXTR);
}
removeCTNode(ec);
}*/
/**
 * @param ec empty subject.
 * @param map key: antecedent, value: list of clauses containing empty subjects.
 * NOTE(review): the parameter type lost its arguments to the scrape ("Map>");
 * presumably Map<CTNode, Deque<CTNode>> — confirm against upstream.
 */
private void addXSubject(CTNode ec, Map> map)
{
CTNode ante = ec.getAntecedent();
// follow the antecedent chain through empty-category terminals;
// wh-phrases are handled elsewhere (relative clauses), so bail out on them
while (ante != null && ante.isEmptyCategoryTerminal())
{
if (CTLibEn.isWhPhrase(ante)) return;
ante = ante.getFirstTerminal().getAntecedent();
}
if (ante != null)
{
CTNode s = ec.getNearestAncestor(mt_s);
if (s != null)
{
Deque dq = map.get(ante);
if (dq == null) dq = new ArrayDeque();
dq.add(s);
map.put(ante, dq);
}
}
}
/**
 * Detaches {@code node} from its parent and prunes any ancestors that become
 * childless as a result, walking upward iteratively instead of recursing.
 */
private void removeCTNode(CTNode node)
{
    CTNode curr = node;
    while (curr != null)
    {
        CTNode parent = curr.getParent();
        if (parent == null) return;
        parent.removeChild(curr);
        // keep pruning only while the removal emptied the parent
        curr = (parent.getChildrenSize() == 0) ? parent : null;
    }
}
/**
 * Substitutes the empty category {@code ec} with its antecedent {@code ante}:
 * the antecedent is first detached from its current position (pruning any
 * ancestors it empties), then spliced into the empty category's slot.
 */
private void replaceEC(CTNode ec, CTNode ante)
{
    removeCTNode(ante);
    CTNode slotParent = ec.getParent();
    slotParent.replaceChild(ec, ante);
}
// ============================= Find heads =============================
/** Assigns the head (C2DInfo) of {@code curr} from its children via the head rule. */
@Override
protected void setHeadsAux(HeadRule rule, CTNode curr)
{
// coordination gets special head treatment; stop here if it applied
if (findHeadsCoordination(rule, curr)) return;
// findHyphens(curr);
// NOTE(review): findHeadsApposition/findHeadsSmallClause are not present in this
// copy — the scrape appears to have dropped them along with other methods.
findHeadsApposition(curr);
findHeadsSmallClause(curr);
CTNode head = getHead(rule, curr.getChildrenList(), SIZE_HEAD_FLAGS);
if (head.getC2DInfo().getLabel() != null) head.getC2DInfo().setLabel(null);
curr.setC2DInfo(new C2DInfo(head));
}
/**
 * If the specific node contains a coordination structure, find the head of each coordination.
 * @param curr the specific node to be compared.
 * @return {@code true} if this node contains a coordination structure.
 */
private boolean findHeadsCoordination(HeadRule rule, CTNode curr)
{
// skip pre-conjunctions and punctuation
int i, sId, size = curr.getChildrenSize();
CTNode node;
// NOTE(review): the body below is corrupted — the scrape collapsed several
// statements (including the conjunct-scanning loops and the declarations of
// bId/eId/prevHead/mainHead) into this malformed loop header; restore from
// the original source before compiling.
for (sId=0; sId 0)
findHeadsCoordinationAux(rule, curr, bId, eId, prevHead);
// findHeadsCoordinationAux(rule, curr, bId, eId, mainHead);
curr.setC2DInfo(new C2DInfo(mainHead));
return true;
}
/** Called by {@link #findHeadsCoordination(HeadRule, CTNode)}. */
// Looks up the conjunct-tag pattern for the current constituent tag (see initCoord).
private Pattern getConjunctPattern(CTNode curr, int sId, int size)
{
Pattern rTags = m_coord.get(curr.getConstituentTag());
if (rTags != null)
{
boolean b = false;
int i;
// NOTE(review): truncated by the scrape — the loop below is malformed and the
// remainder of this method (plus several following methods, e.g. the apposition
// and small-clause head finders and getDEPTree) is missing from this copy; the
// stray "multiple roots" warning belongs to one of the dropped methods.
for (i=sId; i 1) System.err.println("Warning: multiple roots exist");
}
/** Called by {@link #getDEPTree(CTTree)}. */
// Adds the XSUBJ and RNR secondary dependencies collected during empty-category mapping.
private void addSecondaryHeads(NLPNode[] dTree)
{
for (CTNode curr : m_xsubj.keySet())
{
if (curr.hasC2DInfo())
addSecondaryHeadsAux(dTree, curr, m_xsubj.get(curr), DEPTagEn.DEP2_XSUBJ);
}
for (CTNode curr : m_rnr.keySet())
{
if (curr.getParent() == null)
continue;
// when the antecedent is not its parent's head it receives the RNR heads;
// otherwise the queued nodes become its secondary children
if (curr.getParent().getC2DInfo().getNonTerminalHead() != curr)
addSecondaryHeadsAux(dTree, curr, m_rnr.get(curr), DEPTagEn.DEP2_RNR);
else
addSecondaryChildren(dTree, curr, m_rnr.get(curr), DEPTagEn.DEP2_RNR);
}
}
/** Called by {@link #addSecondaryHeads(DEPTree)}. */
// Adds each queued constituent head as a secondary head of cNode's dependency node.
// NOTE(review): "Deque dq" lost its type argument (presumably Deque<CTNode>) to the scrape.
private void addSecondaryHeadsAux(NLPNode[] dTree, CTNode cNode, Deque dq, String label)
{
if (cNode.isEmptyCategoryTerminal()) return;
NLPNode node = getNLPNode(dTree, cNode);
NLPNode head;
for (CTNode cHead : dq)
{
head = getNLPNode(dTree, cHead);
if (head == null)
{
System.err.println("HEAD NOT EXIST: AUX");
continue;
}
// avoid duplicating the primary dependency as a secondary one
if (!node.isDependentOf(head)) node.addSecondaryHead(head, label);
// an XSUBJ link into a CCOMP clause promotes it to XCOMP (open clausal complement)
if (label.equals(DEPTagEn.DEP2_XSUBJ) && head.isDependencyLabel(DEPTagEn.DEP_CCOMP))
head.setDependencyLabel(DEPTagEn.DEP_XCOMP);
}
}
/** Called by {@link #addSecondaryHeads(DEPTree)}. */
// Adds cHead's dependency node as a secondary head of each queued node.
private void addSecondaryChildren(NLPNode[] dTree, CTNode cHead, Deque dq, String label)
{
NLPNode head = getNLPNode(dTree, cHead);
NLPNode node;
for (CTNode cNode : dq)
{
node = getNLPNode(dTree, cNode);
// skip unmapped nodes and the artificial root (ID 0)
if (node == null || node.getID() == 0)
{
System.err.println("HEAD NOT EXIST: CHILDREN");
continue;
}
node.addSecondaryHead(head, label);
}
}
/** Called by {@link #getDEPTree(CTTree)}. */
// Recursively attaches gapping links and semantic/syntactic function-tag feats.
private void addFeats(NLPNode[] dTree, CTTree cTree, CTNode cNode)
{
CTNode ante;
String feat;
// gapping: link only the topmost gapped constituent (parent not gapped) to its antecedent
if (!cNode.isEmptyCategoryTerminal() && cNode.getGappingRelationIndex() != -1 && cNode.getParent().getGappingRelationIndex() == -1 && (ante = cTree.getAntecedent(cNode.getGappingRelationIndex())) != null)
{
NLPNode dNode = getNLPNode(dTree, cNode);
dNode.addSecondaryHead(getNLPNode(dTree, ante), DEPTagEn.DEP2_GAP);
}
if ((feat = getFunctionTags(cNode, s_semTags)) != null)
cNode.getC2DInfo().putFeat(NLPUtils.FEAT_SEM, feat);
if ((feat = getFunctionTags(cNode, s_synTags)) != null)
cNode.getC2DInfo().putFeat(NLPUtils.FEAT_SYN, feat);
for (CTNode child : cNode.getChildrenList())
addFeats(dTree, cTree, child);
}
/**
 * Collects the function tags of {@code node} that occur in {@code sTags},
 * sorted alphabetically and joined with the feature-value delimiter.
 * Called by {@link #addFeats(DEPTree, CTTree, CTNode)}.
 * @return the joined tag string, or {@code null} if no tag matches.
 */
private String getFunctionTags(CTNode node, Set sTags)
{
    List matched = new ArrayList<>();
    for (String tag : node.getFunctionTagSet())
    {
        if (sTags.contains(tag))
            matched.add(tag);
    }
    if (matched.isEmpty())
        return null;
    Collections.sort(matched);
    return Joiner.join(matched, FeatMap.DELIM_VALUES);
}
/**
 * Maps a constituent node to its dependency node: terminals map via their
 * token ID, non-terminals via their terminal head. Returns {@code null} for
 * the TOP node and for empty categories (which carry no token).
 */
private NLPNode getNLPNode(NLPNode[] dTree, CTNode cNode)
{
if (cNode.isConstituentTag(CTTagEn.TOP)) return null;
CTNode cHead = cNode.isTerminal() ? cNode : cNode.getC2DInfo().getTerminalHead();
return cHead.isEmptyCategory() ? null : dTree[cHead.getTokenID()+1];
// return cNode.isTerminal() ? dTree.get(cNode.getTokenID()+1) : dTree.get(cNode.getC2DInfo().getTerminalHead().getTokenID()+1);
}
// ============================= Edited phrases =============================
/**
 * Returns a copy of the dependency tree with tokens under EDITED (disfluency)
 * phrases removed and the remaining nodes re-numbered from 1.
 * @return the pruned tree, or {@code null} if no token survives.
 */
public NLPNode[] getDEPTreeWithoutEdited(CTTree cTree, NLPNode[] dTree)
{
List nodes = new ArrayList<>();
Set set = new HashSet<>();
int id = 1;
addEditedTokensAux(cTree.getRoot(), set);
for (NLPNode node : dTree)
{
if (!set.contains(node.getID()))
{
// drop secondary/semantic arcs that point into removed (edited) tokens
removeEditedHeads(node.getSecondaryHeadList(), set);
removeEditedHeads(node.getSemanticHeadList() , set);
node.setID(id++);
nodes.add(node);
}
}
return (nodes.size() > 0) ? NLPUtils.toDependencyTree(nodes, NLPNode::new) : null;
}
/** Called by {@link #getDEPTreeWithoutEdited(CTTree, DEPTree)}. */
// Collects the 1-based token IDs covered by EDITED phrases into {@code set}.
private void addEditedTokensAux(CTNode curr, Set set)
{
for (CTNode child : curr.getChildrenList())
{
if (CTLibEn.isEditedPhrase(child))
{
for (CTNode sub : child.getTokenList())
set.add(sub.getTokenID()+1);
}
else if (!child.isTerminal())
addEditedTokensAux(child, set);
}
}
/** Called by {@link #getDEPTreeWithoutEdited(CTTree, DEPTree)}. */
// Removes arcs whose head is null or points at a removed (edited) token.
// NOTE(review): the generic method signature was mangled by the scrape
// ("private >void ..."); presumably
// private <T extends AbstractArc<NLPNode>> void removeEditedHeads(List<T> heads, Set<Integer> set)
// — confirm against the upstream source.
private >void removeEditedHeads(List heads, Set set)
{
if (heads == null) return;
// collect first, then bulk-remove, to avoid mutating the list while iterating
List remove = new ArrayList<>();
for (T arc : heads)
{
if (arc.getNode() == null || set.contains(arc.getNode().getID()))
remove.add(arc);
}
heads.removeAll(remove);
}
// ============================= Add PropBank arguments =============================
/**
 * Projects PropBank predicate/argument annotation from the constituent tree
 * onto the dependency tree, then prunes and relabels the resulting arcs.
 */
private void addSemanticHeads(NLPNode[] dTree, CTTree cTree)
{
initPropBank(dTree, cTree.getRoot());
arrangePropBank(dTree);
relabelNumberedArguments(dTree);
}
/** Called by {@link #addSemanticHeads(DEPTree, CTTree)}. */
// Walks the constituent tree, marking predicates (roleset IDs) and adding a
// semantic-head arc for every PropBank argument; arguments realized through a
// relativizer also link the antecedent and get the referent (R-) prefix.
private void initPropBank(NLPNode[] dTree, CTNode cNode)
{
NLPNode dNode = getNLPNode(dTree, cNode);
if (dNode != null)
{
if (cNode.isPBHead())
dNode.putFeat(NLPUtils.FEAT_PREDICATE, cNode.getPBRolesetID());
NLPNode sHead, d;
String label;
CTNode c;
for (PBArc p : cNode.getPBHeads())
{
sHead = getNLPNode(dTree, p.getNode());
label = PBLib.getShortLabel(p.getLabel());
if ((c = getReferentArgument(cNode)) != null)
{
// attach the plain label to the relativizer's antecedent (if any),
// then mark this node's own arc as a referent argument
if ((c = CTLibEn.getRelativizer(c)) != null && (c = c.getAntecedent()) != null)
{
d = getNLPNode(dTree, c);
if (d != null && d.getSemanticHeadArc(sHead) == null)
d.addSemanticHead(new DEPArc<>(sHead, label));
}
label = PBLib.PREFIX_REFERENT + label;
}
// no duplicate arcs, and a predicate is never its own argument
if (!dNode.isArgumentOf(sHead) && dNode != sHead)
dNode.addSemanticHead(sHead, label);
}
}
for (CTNode child : cNode.getChildrenList())
initPropBank(dTree, child);
}
/**
 * Returns the wh-phrase referent of {@code node}: the node's own wh-phrase if
 * present, otherwise (for PP nodes only) the first wh-phrase among its direct
 * children. Called by {@link #initPropBank(DEPTree, CTNode)}.
 * @return the wh-phrase node, or {@code null} when none exists.
 */
private CTNode getReferentArgument(CTNode node)
{
    CTNode wh = CTLibEn.getWhPhrase(node);
    if (wh != null)
        return wh;
    if (!node.isConstituentTag(CTTagEn.C_PP))
        return null;
    for (CTNode child : node.getChildrenList())
    {
        wh = CTLibEn.getWhPhrase(child);
        if (wh != null)
            return wh;
    }
    return null;
}
/** Called by {@link #addSemanticHeads(DEPTree, CTTree)}. */
// Removes semantic arcs that are already carried by a syntactic ancestor.
// NOTE(review): "List> remove" lost its type arguments to the scrape
// (presumably List<DEPArc<NLPNode>>); restore from the upstream source.
private void arrangePropBank(NLPNode[] tree)
{
List> remove;
NLPNode head;
String label;
for (NLPNode node : tree)
{
remove = new ArrayList<>();
for (DEPArc arc : node.getSemanticHeadList())
{
head = arc.getNode();
label = arc.getLabel();
if (ancestorHasSemanticHead(node, head, label))
remove.add(arc);
// else if (rnrHasSHead(node, head, label))
// remove.add(arc);
}
node.removeSemanticHeads(remove);
}
}
/**
 * Checks whether any dependency ancestor of {@code dNode} (walking head links
 * up to, but excluding, the artificial root with ID 0) is already an argument
 * of {@code sHead} with the given label. Called by {@link #arrangePropBank(DEPTree)}.
 */
private boolean ancestorHasSemanticHead(NLPNode dNode, NLPNode sHead, String label)
{
    for (NLPNode anc = dNode.getDependencyHead(); anc.getID() != 0; anc = anc.getDependencyHead())
    {
        if (anc.isArgumentOf(sHead, label))
            return true;
    }
    return false;
}
// private boolean rnrHasSHead(NLPNode dNode, NLPNode sHead, String label)
// {
// for (DEPArc rnr : dNode.getSecondaryHeadList(DEPTagEn.DEP2_RNR))
// {
// if (rnr.getNode().isArgumentOf(sHead, label))
// return true;
// }
//
// return false;
// }
/** Called by {@link #addSemanticHeads(DEPTree, CTTree)}. */
// Prefixes repeated numbered arguments (the same arc key seen a second time)
// with the concatenation marker; referent (R-) and modifier labels are skipped.
private void relabelNumberedArguments(NLPNode[] tree)
{
Map map = new HashMap<>();
String key;
for (NLPNode node : tree)
{
for (DEPArc arc : node.getSemanticHeadList())
{
if (PBLib.isReferentArgument(arc.getLabel()))
continue;
if (PBLib.isModifier(arc.getLabel()))
continue;
key = arc.toString();
if (map.containsKey(key))
arc.setLabel(PBLib.PREFIX_CONCATENATION + arc.getLabel());
else
map.put(key, node);
}
}
}
/**
 * Post-processing over the finished dependency tree: relabels datives,
 * emoticons and vocatives, then merges noun (NN*) and number (CD) compounds.
 * NOTE(review): this overloads — does not override — Object.finalize(); consider
 * renaming in the original project to avoid confusion with the finalizer protocol.
 */
private void finalize(NLPNode[] tree)
{
finalizeLabels(tree);
finalizeCompound(tree, POSTagEn.POS_NN, DEPTagEn.DEP_NMOD , n -> n.getPartOfSpeechTag().startsWith(POSTagEn.POS_NNP) || n.isDependencyLabel(DEPTagEn.DEP_NMOD) || n.isDependencyLabel(DEPTagEn.DEP_DEP));
finalizeCompound(tree, POSTagEn.POS_CD, DEPTagEn.DEP_QMOD, n -> n.isDependencyLabel(DEPTagEn.DEP_QMOD) || n.isDependencyLabel(DEPTagEn.DEP_DEP));
}
/** Relabels nodes recognized as datives, emoticons, or vocatives (first match wins). */
private void finalizeLabels(NLPNode[] tree)
{
    for (NLPNode node : tree)
    {
        String label = null;
        if      (isDative  (node)) label = DEPTagEn.DEP_DATIVE;
        else if (isEmoticon(node)) label = DEPTagEn.DEP_DISCOURSE;
        else if (isVocative(node)) label = DEPTagEn.DEP_VOCATIVE;
        if (label != null)
            node.setDependencyLabel(label);
    }
}
/**
 * True if the node's dependency head is a verb and the node carries either a
 * dative (DTV) syntactic feat or a benefactive (BNF) semantic feat.
 */
private boolean isDative(NLPNode node)
{
if (!POSLibEn.isVerb(node.getDependencyHead().getPartOfSpeechTag())) return false;
// if (node.isDependencyLabel(DEPTagEn.DEP_IOBJ)) return true;
String feat;
// FEAT_SYN may hold several comma-joined tags (see getFunctionTags), hence the split
if ((feat = node.getFeat(NLPUtils.FEAT_SYN)) != null && DSUtils.toHashSet(Splitter.splitCommas(feat)).contains(CTTagEn.F_DTV)) return true;
if (CTTagEn.F_BNF.equals(node.getFeat(NLPUtils.FEAT_SEM))) return true;
return false;
}
/**
 * True if the node's entire word form is a single emoticon, i.e. the range
 * detected by the emoticon dictionary spans the whole string.
 */
private boolean isEmoticon(NLPNode node)
{
    String form = node.getWordForm();
    int[] span = emoticon.getEmoticonRange(form);
    if (span == null)
        return false;
    return span[0] == 0 && span[1] == form.length();
}
/** True if the node's semantic feat equals the vocative (VOC) function tag. */
private boolean isVocative(NLPNode node)
{
    String sem = node.getFeat(NLPUtils.FEAT_SEM);
    return sem != null && sem.equals(CTLibEn.F_VOC);
}
/**
 * Scans right-to-left for head tokens whose POS starts with {@code pos} and
 * that are not already labeled {@code label}; contiguous preceding tokens with
 * the same POS prefix that descend from the head, attach rightward, and satisfy
 * {@code p} are relabeled as compounds. Hyphens are skipped; any other token
 * ends the run.
 */
private void finalizeCompound(NLPNode[] tree, String pos, String label, Predicate p)
{
NLPNode node, head;
int i, j;
for (i=tree.length-1; i>0; i--)
{
head = tree[i];
if (head.getPartOfSpeechTag().startsWith(pos) && !head.isDependencyLabel(label))
{
for (j=i-1; j>0; j--)
{
node = tree[j];
if (node.getPartOfSpeechTag().startsWith(pos) && node.isDescendantOf(head) && node.getDependencyHead().getID() > node.getID() && p.test(node))
{
node.setDependencyLabel(DEPTagEn.DEP_COMPOUND);
// resume the outer right-to-left scan to the left of this compound run
i = j;
}
else if (node.isPartOfSpeechTag(POSTagEn.POS_HYPH))
continue;
else
break;
}
}
}
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy (artifact-page footer, not source)