edu.emory.mathcs.nlp.conversion.EnglishC2DConverter

/**
 * Copyright 2014, Emory University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.mathcs.nlp.conversion;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UnknownFormatConversionException;
import java.util.function.Predicate;
import java.util.regex.Pattern;

import edu.emory.mathcs.nlp.common.collection.arc.AbstractArc;
import edu.emory.mathcs.nlp.common.constituent.CTLib;
import edu.emory.mathcs.nlp.common.constituent.CTLibEn;
import edu.emory.mathcs.nlp.common.constituent.CTNode;
import edu.emory.mathcs.nlp.common.constituent.CTTagEn;
import edu.emory.mathcs.nlp.common.constituent.CTTree;
import edu.emory.mathcs.nlp.common.propbank.PBLib;
import edu.emory.mathcs.nlp.common.treebank.DEPLibEn;
import edu.emory.mathcs.nlp.common.treebank.DEPTagEn;
import edu.emory.mathcs.nlp.common.treebank.PBArc;
import edu.emory.mathcs.nlp.common.treebank.POSLibEn;
import edu.emory.mathcs.nlp.common.treebank.POSTagEn;
import edu.emory.mathcs.nlp.common.util.DSUtils;
import edu.emory.mathcs.nlp.common.util.ENUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.common.util.NLPUtils;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.Splitter;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.dep.DEPArc;
import edu.emory.mathcs.nlp.component.template.node.FeatMap;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Emoticon;
import edu.emory.mathcs.nlp.conversion.util.C2DInfo;
import edu.emory.mathcs.nlp.conversion.util.HeadRule;
import edu.emory.mathcs.nlp.conversion.util.HeadRuleMap;


/**
 * Constituent to dependency converter for English.
 * @since 3.0.0
 * @author Jinho D. Choi ({@code [email protected]})
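 * <p>Typical use: construct the converter with a {@link HeadRuleMap} and call
 * {@link #toDependencyGraph(CTTree)} on each constituent tree. A minimal sketch, assuming
 * {@code HeadRuleMap} can be built from an input stream of a head-rule file (the constructor
 * and file name shown here are illustrative only):</p>
 * <pre>{@code
 * HeadRuleMap rules = new HeadRuleMap(new FileInputStream("headrule_en_stanford.txt"));
 * EnglishC2DConverter converter = new EnglishC2DConverter(rules);
 * NLPNode[] dependencyTree = converter.toDependencyGraph(constituentTree);
 * }</pre>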
 */
public class EnglishC2DConverter extends C2DConverter
{
	
	private final Set<String> S_NPADVMOD	= DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_QP);
	private final Set<String> S_ADVCL		= DSUtils.toHashSet(CTTagEn.C_S, CTTagEn.C_SBAR, CTTagEn.C_SINV);
	private final Set<String> S_NFMOD		= DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_WHNP);
	private final Set<String> S_CCOMP		= DSUtils.toHashSet(CTTagEn.C_S, CTTagEn.C_SQ, CTTagEn.C_SINV, CTTagEn.C_SBARQ);
	private final Set<String> S_META		= DSUtils.toHashSet(CTTagEn.C_EDITED, CTTagEn.C_EMBED, CTTagEn.C_LST, CTTagEn.C_META, CTLibEn.POS_CODE, CTTagEn.C_CAPTION, CTTagEn.C_CIT, CTTagEn.C_HEADING, CTTagEn.C_TITLE);
	private final Set<String> S_MARK		= DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_TO, CTLibEn.POS_DT);
	private final Set<String> S_POSS		= DSUtils.toHashSet(CTLibEn.POS_PRPS, CTLibEn.POS_WPS);
	private final Set<String> S_INTJ		= DSUtils.toHashSet(CTTagEn.C_INTJ, CTLibEn.POS_UH);
	private final Set<String> S_PRT 		= DSUtils.toHashSet(CTTagEn.C_PRT, CTLibEn.POS_RP);
//	private final Set<String> S_NUM			= DSUtils.toHashSet(CTLibEn.POS_CD, CTTagEn.C_QP);
	private final Set<String> S_DET			= DSUtils.toHashSet(CTLibEn.POS_DT, CTLibEn.POS_WDT, CTLibEn.POS_WP);
	private final Set<String> S_AUX			= DSUtils.toHashSet(CTLibEn.POS_MD, CTLibEn.POS_TO);
//	private final Set<String> S_NN			= DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP);

//	private final Set<String> S_ADJT_PHRASE	= DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_WHADJP);
	private final Set<String> S_NOUN_PHRASE	= DSUtils.toHashSet(CTTagEn.C_NP, CTTagEn.C_NML);
	private final Set<String> S_PREP_PHRASE	= DSUtils.toHashSet(CTTagEn.C_PP, CTTagEn.C_WHPP);
	private final Set<String> S_ADVB_PHRASE	= DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_ADVP, CTTagEn.C_PP);
	private final Set<String> S_PREPOSITION	= DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_TO);
//	private final Set<String> S_PARTICIPIAL	= DSUtils.toHashSet(CTLibEn.POS_VBG, CTLibEn.POS_VBN);
	private final Set<String> S_PREP_DET	= DSUtils.toHashSet(CTLibEn.POS_IN, CTLibEn.POS_DT);
	
	private final Set<String> S_COMP_PARENT_S = DSUtils.toHashSet(CTTagEn.C_VP, CTTagEn.C_SINV, CTTagEn.C_SQ);
	private final Set<String> S_COMP_PARENT_A = DSUtils.toHashSet(CTTagEn.C_ADJP, CTTagEn.C_ADVP);
	private final Set<String> S_NMOD_PARENT	  = DSUtils.toHashSet(CTTagEn.C_NML, CTTagEn.C_NP, CTTagEn.C_NX, CTTagEn.C_WHNP);
	private final Set<String> S_POSS_PARENT	  = DSUtils.toHashSet(CTTagEn.C_NP, CTTagEn.C_NML, CTTagEn.C_WHNP, CTTagEn.C_QP, CTTagEn.C_ADJP);
	
	private final Set<String> S_COMPLM = DSUtils.toHashSet("that", "if", "whether");
	private final int SIZE_HEAD_FLAGS = 4;
	
	private Set<String> s_semTags;
	private Set<String> s_synTags;
	
	private Map<CTNode,Deque<CTNode>> m_rnr;
	private Map<CTNode,Deque<CTNode>> m_xsubj;
	private Map<String,Pattern>       m_coord;
	
	private Predicate<CTNode> mt_s;
	private Predicate<CTNode> mt_to;
	private Predicate<CTNode> mt_pos;
	private Predicate<CTNode> mt_sbj;
	private Predicate<CTNode> mt_prd;
	private Predicate<CTNode> mt_none;
	private Predicate<CTNode> mt_in_dt;
	private Predicate<CTNode> mt_np_prd;
	
	private Emoticon emoticon;
	
	public EnglishC2DConverter(HeadRuleMap headrules)
	{
		super(headrules, new HeadRule(HeadRule.DIR_RIGHT_TO_LEFT));
		
		initBasic();
		initCoord();
		initMatchers();
		emoticon = new Emoticon();
	}
	
	@Override
	public NLPNode[] toDependencyGraph(CTTree cTree)
	{
		NLPNode[] tree = null;
		
		try
		{
			CTLibEn.preprocess(cTree);
			clearMaps();
			if (!mapEmtpyCategories(cTree))	return null;
			setHeads(cTree.getRoot());
			tree = getDEPTree(cTree);	
		}
		catch (Exception e) {e.printStackTrace();}
		
		if (tree != null) finalize(tree);
		return tree;
	}
	
// ============================= Initialization ============================= 
	
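	/** Initializes the semantic/syntactic function tag sets and the maps for right-node-raising and external-subject links. */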
	private void initBasic()
	{
		s_semTags = DSUtils.toHashSet(CTTagEn.F_BNF, CTTagEn.F_DIR, CTTagEn.F_EXT, CTTagEn.F_LOC, CTTagEn.F_MNR, CTTagEn.F_PRP, CTTagEn.F_TMP, CTTagEn.F_VOC);
		s_synTags = DSUtils.toHashSet(CTTagEn.F_ADV, CTTagEn.F_CLF, CTTagEn.F_CLR, CTTagEn.F_DTV, CTTagEn.F_NOM, CTTagEn.F_PUT, CTTagEn.F_PRD, CTTagEn.F_TPC);
		m_rnr     = new HashMap<>();
		m_xsubj   = new HashMap<>();
	}
	
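	/** Maps each constituent tag to a pattern of tags that its conjuncts may carry (see {@link #getConjunctPattern(CTNode, int, int)}). */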
	private void initCoord()
	{
		m_coord = new HashMap<>();
		
		m_coord.put(CTTagEn.C_ADJP	, PatternUtils.createClosedORPattern("ADJP","JJ.*","VBN","VBG"));
		m_coord.put(CTTagEn.C_ADVP	, PatternUtils.createClosedORPattern("ADVP","RB.*"));
		m_coord.put(CTTagEn.C_INTJ	, PatternUtils.createClosedORPattern("INTJ","UH"));
		m_coord.put(CTTagEn.C_PP  	, PatternUtils.createClosedORPattern("PP","IN","VBG"));
		m_coord.put(CTTagEn.C_PRT 	, PatternUtils.createClosedORPattern("PRT","RP"));
		m_coord.put(CTTagEn.C_NAC 	, PatternUtils.createClosedORPattern("NP"));
		m_coord.put(CTTagEn.C_NML 	, PatternUtils.createClosedORPattern("NP","NML","NN.*","PRP"));
		m_coord.put(CTTagEn.C_NP  	, PatternUtils.createClosedORPattern("NP","NML","NN.*","PRP"));
		m_coord.put(CTTagEn.C_NX  	, PatternUtils.createClosedORPattern("NX"));
		m_coord.put(CTTagEn.C_VP  	, PatternUtils.createClosedORPattern("VP","VB.*"));
		m_coord.put(CTTagEn.C_S   	, PatternUtils.createClosedORPattern("S","SINV","SQ","SBARQ"));
		m_coord.put(CTTagEn.C_SBAR	, PatternUtils.createClosedORPattern("SBAR.*"));
		m_coord.put(CTTagEn.C_SBARQ	, PatternUtils.createClosedORPattern("SBAR.*"));
		m_coord.put(CTTagEn.C_SINV	, PatternUtils.createClosedORPattern("S","SINV"));
		m_coord.put(CTTagEn.C_SQ	, PatternUtils.createClosedORPattern("S","SQ","SBARQ"));
		m_coord.put(CTTagEn.C_WHNP	, PatternUtils.createClosedORPattern("NN.*","WP"));
		m_coord.put(CTTagEn.C_WHADJP, PatternUtils.createClosedORPattern("JJ.*","VBN","VBG"));
		m_coord.put(CTTagEn.C_WHADVP, PatternUtils.createClosedORPattern("RB.*","WRB","IN"));
	}
	
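	/** Initializes node matchers for frequently tested constituent tags, function tags, and their combinations. */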
	private void initMatchers()
	{
		mt_s		= CTLib.matchC(CTTagEn.C_S);
		mt_to		= CTLib.matchC(POSTagEn.POS_TO);
		mt_pos		= CTLib.matchC(POSTagEn.POS_POS);
		mt_none		= CTLib.matchC(CTLibEn.NONE);
		
		mt_sbj  	= CTLib.matchF(CTTagEn.F_SBJ);
		mt_prd  	= CTLib.matchF(CTTagEn.F_PRD);
		mt_np_prd	= CTLib.matchCF(CTTagEn.C_NP, CTTagEn.F_PRD);
		mt_in_dt	= CTLib.matchCo(DSUtils.toHashSet(POSTagEn.POS_IN, POSTagEn.POS_DT));
	}

	private void clearMaps()
	{
		m_rnr.clear();
		m_xsubj.clear();
	}
	
// ============================= Empty categories ============================= 
	
	/**
	 * Removes or relocates empty categories in the specified tree.
	 * @param cTree the constituent tree to be processed.
	 * @return {@code true} if the constituent tree contains nodes after relocating empty categories.
	 */
	private boolean mapEmtpyCategories(CTTree cTree)
	{
		for (CTNode node : cTree.getTerminalList())
		{
			if (!node.isEmptyCategory())	continue;
			if (node.getParent() == null)	continue;
			
			if      (node.wordFormStartsWith(CTTagEn.E_PRO))
				mapPRO(cTree, node);
			else if (node.wordFormStartsWith(CTTagEn.E_TRACE))
				mapTrace(cTree, node);
			else if (node.matchesWordForm(CTLibEn.P_PASSIVE_NULL))
				mapPassiveNull(cTree, node);
			else if (node.isWordForm(CTTagEn.E_ZERO))
				continue;
			else if (CTLibEn.isDiscontinuousConstituent(node))
				mapDiscontinuousConstituent(cTree, node);
//			else if (node.wordFormStartsWith(CTTagEn.E_EXP))
//				reloateEXP(cTree, node);
			else
				removeCTNode(node);
		}
		
		return cTree.getRoot().getChildrenSize() > 0;
	}
	
	/** Called by {@link #mapEmtpyCategories(CTTree)}. */
	private void mapPRO(CTTree cTree, CTNode ec)
	{
		CTNode np = ec.getParent();
		CTNode vp = np.getParent().getFirstLowestChainedDescendant(CTLibEn.M_VP);
		
		if (vp == null)		// small clauses
			relocatePRD(np, ec);
		else
		{
			CTNode ante;
			
			if ((ante = ec.getAntecedent()) != null && CTLibEn.isWhPhrase(ante))	// relative clauses
			{
				if (cTree.getEmptyCategoryList(ante.getEmptyCategoryIndex()).size() == 1)
					mapTrace(cTree, ec);
			}
			
			addXSubject(ec, m_xsubj);
		}
	}
	
	/** Called by {@link #mapEmtpyCategories(CTTree)}. */
	private void mapTrace(CTTree cTree, CTNode ec)
	{
		CTNode ante = ec.getAntecedent();
		
		if (ante == null || ec.isDescendantOf(ante))
			removeCTNode(ec);
		else if (ante.hasFunctionTag(CTTagEn.F_TPC))
		{
			if (!ante.hasFunctionTag(CTTagEn.F_SBJ))
			{
				CTNode parent = ec.getParent();
				parent.removeChild(ec);
				replaceEC(parent, ante);
			}
			else
				removeCTNode(ec);
		}
		else	// relative clauses
		{
			CTNode parent = ante.getHighestChainedAncestor(CTLibEn.M_SBAR);
			if (parent != null) parent.addFunctionTag(DEPTagEn.DEP_RELCL);
			replaceEC(ec, ante);
		}
	}
	
	/** Called by {@link #mapEmtpyCategories(CTTree)}. */
	private void mapPassiveNull(CTTree cTree, CTNode ec)
	{
		CTNode np = ec.getParent();
		
		if (np.hasFunctionTag(CTTagEn.F_SBJ))
		{
			// small clauses
			if (np.getRightNearestSibling(CTLibEn.M_VP) == null)
				relocatePRD(np, ec);
			else
				addXSubject(ec, m_xsubj);
		}
	}
	
	/** Called by {@link #mapEmtpyCategories(CTTree)}. */
	private void mapDiscontinuousConstituent(CTTree cTree, CTNode ec)
	{
		CTNode parent = ec.getParent();
		CTNode ante   = ec.getAntecedent();
		
		if (ec.wordFormStartsWith(CTTagEn.E_ICH) && parent.getLeftNearestSibling(CTLibEn.M_WHx) != null)
			removeCTNode(ec);
		else if (ante == null || ec.isDescendantOf(ante))
			removeCTNode(ec);
		else
		{
			List<CTNode> list = cTree.getEmptyCategoryList(ante.getEmptyCategoryIndex());
			boolean isRNR = CTLibEn.isRNR(ec);
			int i, size = list.size();
			CTNode node;
			
			Deque<CTNode> dq = isRNR ? new ArrayDeque<>() : null;
			
			if (ec.getTerminalID() < ante.getFirstTerminal().getTerminalID())
			{
				for (i=0; i<size-1; i++)
				{
					node = list.get(i);
					if (isRNR)	dq.addLast(node.getParent().getParent());
					removeCTNode(node);
				}
				
				ec = list.get(size-1);
			}
			else
			{
				for (i=size-1; i>0; i--)
				{
					node = list.get(i);
					if (isRNR)	dq.addFirst(node.getParent().getParent());
					removeCTNode(node);
				}
				
				ec = list.get(0);
			}
			
			if (isRNR && !dq.isEmpty())
				m_rnr.put(ante, dq);
			
			parent = ec.getParent();
			parent.removeChild(ec);
			replaceEC(parent, ante);
		}
	}
	
	/** Called by {@link #mapPRO(CTTree, CTNode)} and {@link #mapPassiveNull(CTTree, CTNode)}. */
	private void relocatePRD(CTNode np, CTNode ec)
	{
		CTNode s   = np.getParent();
		CTNode prd = s.getFirstChild(mt_prd);
		Set<String> fTags = s.getFunctionTagSet();
		
		if (prd != null && (fTags.isEmpty() || fTags.contains(CTTagEn.F_CLR)))
		{
			fTags.clear();
			fTags.add(DEPTagEn.DEP_OPRD);
		}

		removeCTNode(ec);
	}
	
/*	private void reloateEXP(CTTree cTree, CTNode ec)
	{
		int idx = ec.form.lastIndexOf("-");
		
		if (idx != -1)
		{
			int coIndex = Integer.parseInt(ec.form.substring(idx+1));
			CTNode ante = cTree.getCoIndexedAntecedent(coIndex);
			if (ante != null)	ante.addFTag(DEPTagEn.CONLL_EXTR);
		}
		
		removeCTNode(ec);
	}*/
	
	/**
	 * @param ec empty subject.
	 * @param map key: antecedent, value: list of clauses containing empty subjects.
	 */
	private void addXSubject(CTNode ec, Map<CTNode,Deque<CTNode>> map)
	{
		CTNode ante = ec.getAntecedent();
		
		while (ante != null && ante.isEmptyCategoryTerminal())
		{
			if (CTLibEn.isWhPhrase(ante)) return;
			ante = ante.getFirstTerminal().getAntecedent();
		}
		
		if (ante != null)
		{
			CTNode s = ec.getNearestAncestor(mt_s);
			
			if (s != null)
			{
				Deque<CTNode> dq = map.get(ante);
				if (dq == null)	dq = new ArrayDeque<>();
				
				dq.add(s);
				map.put(ante, dq);
			}
		}
	}
	
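	/** Removes the specified node from its parent; ancestors that become childless are removed recursively. */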
	private void removeCTNode(CTNode node)
	{
		CTNode parent = node.getParent();
	
		if (parent != null)
		{
			parent.removeChild(node);
			
			if (parent.getChildrenSize() == 0)
				removeCTNode(parent);			
		}
	}
	
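	/** Replaces the empty category {@code ec} with its antecedent {@code ante}, detaching the antecedent from its original position first. */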
	private void replaceEC(CTNode ec, CTNode ante)
	{
		removeCTNode(ante);
		ec.getParent().replaceChild(ec, ante);
	}
	
// ============================= Find heads =============================
	
	@Override
	protected void setHeadsAux(HeadRule rule, CTNode curr)
	{
		if (findHeadsCoordination(rule, curr))	return;
		
//		findHyphens(curr);
		findHeadsApposition(curr);
		findHeadsSmallClause(curr);

		CTNode head = getHead(rule, curr.getChildrenList(), SIZE_HEAD_FLAGS);
		if (head.getC2DInfo().getLabel() != null) head.getC2DInfo().setLabel(null); 
		curr.setC2DInfo(new C2DInfo(head));
	}
	
	/**
	 * If the specified node contains a coordination structure, finds the head of each coordination.
	 * @param rule the head rule to be applied.
	 * @param curr the node to be examined. 
	 * @return {@code true} if this node contains a coordination structure.
	 */
	private boolean findHeadsCoordination(HeadRule rule, CTNode curr)
	{
		// skip pre-conjunctions and punctuation
		int i, sId, size = curr.getChildrenSize();
		CTNode node;
		
		for (sId=0; sId 0)
			findHeadsCoordinationAux(rule, curr, bId, eId, prevHead);
//			findHeadsCoordinationAux(rule, curr, bId, eId, mainHead);
		
		curr.setC2DInfo(new C2DInfo(mainHead));
		return true;
	}
	
	/** Called by {@link #findHeadsCoordination(HeadRule, CTNode)}. */
	private Pattern getConjunctPattern(CTNode curr, int sId, int size)
	{
		Pattern rTags = m_coord.get(curr.getConstituentTag());
		
		if (rTags != null)
		{
			boolean b = false;
			int i;
			
			for (i=sId; i 1)	System.err.println("Warning: multiple roots exist");
	}
	
	/** Called by {@link #getDEPTree(CTTree)}. */
	private void addSecondaryHeads(NLPNode[] dTree)
	{
		for (CTNode curr : m_xsubj.keySet())
		{
			if (curr.hasC2DInfo())
				addSecondaryHeadsAux(dTree, curr, m_xsubj.get(curr), DEPTagEn.DEP2_XSUBJ);
		}
		
		for (CTNode curr : m_rnr.keySet())
		{
			if (curr.getParent() == null)
				continue;
			
			if (curr.getParent().getC2DInfo().getNonTerminalHead() != curr)
				addSecondaryHeadsAux(dTree, curr, m_rnr.get(curr), DEPTagEn.DEP2_RNR);
			else
				addSecondaryChildren(dTree, curr, m_rnr.get(curr), DEPTagEn.DEP2_RNR);
		}
	}
	
	/** Called by {@link #addSecondaryHeads(NLPNode[])}. */
	private void addSecondaryHeadsAux(NLPNode[] dTree, CTNode cNode, Deque<CTNode> dq, String label)
	{
		if (cNode.isEmptyCategoryTerminal()) return;
		NLPNode node = getNLPNode(dTree, cNode);
		NLPNode head;
		
		for (CTNode cHead : dq)
		{
			head = getNLPNode(dTree, cHead);
			
			if (head == null)
			{
				System.err.println("HEAD NOT EXIST: AUX");
				continue;
			}
			
			if (!node.isDependentOf(head)) node.addSecondaryHead(head, label);
			
			if (label.equals(DEPTagEn.DEP2_XSUBJ) && head.isDependencyLabel(DEPTagEn.DEP_CCOMP))
				head.setDependencyLabel(DEPTagEn.DEP_XCOMP);
		}
	}
	
	/** Called by {@link #addSecondaryHeads(NLPNode[])}. */
	private void addSecondaryChildren(NLPNode[] dTree, CTNode cHead, Deque<CTNode> dq, String label)
	{
		NLPNode head = getNLPNode(dTree, cHead);
		NLPNode node;
		
		for (CTNode cNode : dq)
		{
			node = getNLPNode(dTree, cNode);
			
			if (node == null || node.getID() == 0)
			{
				System.err.println("HEAD NOT EXIST: CHILDREN");
				continue;
			}
			
			node.addSecondaryHead(head, label);			
		}
	}
	
	/** Called by {@link #getDEPTree(CTTree)}. */
	private void addFeats(NLPNode[] dTree, CTTree cTree, CTNode cNode)
	{
		CTNode ante;
		String feat;
		
		if (!cNode.isEmptyCategoryTerminal() && cNode.getGappingRelationIndex() != -1 && cNode.getParent().getGappingRelationIndex() == -1 && (ante = cTree.getAntecedent(cNode.getGappingRelationIndex())) != null)
		{
			NLPNode dNode = getNLPNode(dTree, cNode);
			dNode.addSecondaryHead(getNLPNode(dTree, ante), DEPTagEn.DEP2_GAP);
		}
		
		if ((feat = getFunctionTags(cNode, s_semTags)) != null)
			cNode.getC2DInfo().putFeat(NLPUtils.FEAT_SEM, feat);
		
		if ((feat = getFunctionTags(cNode, s_synTags)) != null)
			cNode.getC2DInfo().putFeat(NLPUtils.FEAT_SYN, feat);

		for (CTNode child : cNode.getChildrenList())
			addFeats(dTree, cTree, child);
	}
	
	/** Called by {@link #addFeats(NLPNode[], CTTree, CTNode)}. */
	private String getFunctionTags(CTNode node, Set<String> sTags)
	{
		List<String> tags = new ArrayList<>();
		
		for (String tag : node.getFunctionTagSet())
		{
			if (sTags.contains(tag))
				tags.add(tag);
		}
		
		if (tags.isEmpty())	return null;
		Collections.sort(tags);
		return Joiner.join(tags, FeatMap.DELIM_VALUES);
	}
	
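	/** Returns the dependency node corresponding to the terminal head of the specified constituent node; {@code null} for the root or an empty-category head. */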
	private NLPNode getNLPNode(NLPNode[] dTree, CTNode cNode)
	{
		if (cNode.isConstituentTag(CTTagEn.TOP)) return null;
		CTNode cHead = cNode.isTerminal() ? cNode : cNode.getC2DInfo().getTerminalHead();
		return cHead.isEmptyCategory() ? null : dTree[cHead.getTokenID()+1];
//		return cNode.isTerminal() ? dTree.get(cNode.getTokenID()+1) : dTree.get(cNode.getC2DInfo().getTerminalHead().getTokenID()+1);
	}
	
// ============================= Edited phrases =============================
	
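	/**
	 * Returns a dependency tree in which all tokens under edited (disfluent) phrases are removed
	 * and the remaining node IDs are reassigned; returns {@code null} if no token remains.
	 */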
	public NLPNode[] getDEPTreeWithoutEdited(CTTree cTree, NLPNode[] dTree)
	{
		List<NLPNode> nodes = new ArrayList<>();
		Set<Integer> set = new HashSet<>();
		int id = 1;
			
		addEditedTokensAux(cTree.getRoot(), set);
			
		for (NLPNode node : dTree)
		{
			if (!set.contains(node.getID()))
			{
				removeEditedHeads(node.getSecondaryHeadList(), set);
				removeEditedHeads(node.getSemanticHeadList() , set);
				node.setID(id++);
				nodes.add(node);
			}
		}
		
		return (nodes.size() > 0) ? NLPUtils.toDependencyTree(nodes, NLPNode::new) : null;
	}
		
	/** Called by {@link #getDEPTreeWithoutEdited(CTTree, NLPNode[])}. */
	private void addEditedTokensAux(CTNode curr, Set<Integer> set)
	{
		for (CTNode child : curr.getChildrenList())
		{
			if (CTLibEn.isEditedPhrase(child))
			{
				for (CTNode sub : child.getTokenList())
					set.add(sub.getTokenID()+1);
			}
			else if (!child.isTerminal())
				addEditedTokensAux(child, set);
		}
	}
		
	/** Called by {@link #getDEPTreeWithoutEdited(CTTree, NLPNode[])}. */
	private <T extends AbstractArc<NLPNode>> void removeEditedHeads(List<T> heads, Set<Integer> set)
	{
		if (heads == null) return;
		List remove = new ArrayList<>();
		
		for (T arc : heads)
		{
			if (arc.getNode() == null || set.contains(arc.getNode().getID()))
				remove.add(arc);
		}
		
		heads.removeAll(remove);
	}	
	
	// ============================= Add PropBank arguments =============================
	
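	/** Adds PropBank predicates and semantic arguments from the constituent tree to the dependency tree. */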
	private void addSemanticHeads(NLPNode[] dTree, CTTree cTree)
	{
		initPropBank(dTree, cTree.getRoot());
		arrangePropBank(dTree);
		relabelNumberedArguments(dTree);
	}
	
	/** Called by {@link #addSemanticHeads(NLPNode[], CTTree)}. */
	private void initPropBank(NLPNode[] dTree, CTNode cNode)
	{
		NLPNode dNode = getNLPNode(dTree, cNode);
		
		if (dNode != null)
		{
			if (cNode.isPBHead())
				dNode.putFeat(NLPUtils.FEAT_PREDICATE, cNode.getPBRolesetID());
			
			NLPNode sHead, d;
			String  label;
			CTNode  c;
			
			for (PBArc p : cNode.getPBHeads())
			{
				sHead = getNLPNode(dTree, p.getNode());
				label = PBLib.getShortLabel(p.getLabel());
				
				if ((c = getReferentArgument(cNode)) != null)
				{
					if ((c = CTLibEn.getRelativizer(c)) != null && (c = c.getAntecedent()) != null)
					{
						d = getNLPNode(dTree, c);
						
						if (d != null && d.getSemanticHeadArc(sHead) == null)
							d.addSemanticHead(new DEPArc<>(sHead, label));
					}
					
					label = PBLib.PREFIX_REFERENT + label;
				}
				
				if (!dNode.isArgumentOf(sHead) && dNode != sHead)
					dNode.addSemanticHead(sHead, label);
			}	
		}
		
		for (CTNode child : cNode.getChildrenList())
			initPropBank(dTree, child);
	}
	
	/** Called by {@link #initPropBank(NLPNode[], CTNode)}. */
	private CTNode getReferentArgument(CTNode node)
	{
		CTNode ref;
		
		if ((ref = CTLibEn.getWhPhrase(node)) != null)
			return ref;
		
		if (node.isConstituentTag(CTTagEn.C_PP))
		{
			for (CTNode child : node.getChildrenList()) 
			{
				if ((ref = CTLibEn.getWhPhrase(child)) != null)
					return ref;
			}
		}

		return null;
	}
	
	/** Called by {@link #addSemanticHeads(NLPNode[], CTTree)}. */
	private void arrangePropBank(NLPNode[] tree)
	{
		List<DEPArc<NLPNode>> remove;
		NLPNode head;
		String label;
		
		for (NLPNode node : tree)
		{
			remove = new ArrayList<>();
			
			for (DEPArc<NLPNode> arc : node.getSemanticHeadList())
			{
				head  = arc.getNode();
				label = arc.getLabel();
				
				if (ancestorHasSemanticHead(node, head, label))
					remove.add(arc);
			//	else if (rnrHasSHead(node, head, label))
			//		remove.add(arc);
			}
			
			node.removeSemanticHeads(remove);
		}
	}
	
	/** Called by {@link #arrangePropBank(NLPNode[])}. */
	private boolean ancestorHasSemanticHead(NLPNode dNode, NLPNode sHead, String label)
	{
		NLPNode dHead = dNode.getDependencyHead();
		
		while (dHead.getID() != 0)
		{
			if (dHead.isArgumentOf(sHead, label))
				return true;
			
			dHead = dHead.getDependencyHead();
		}
		
		return false;
	}
	
//	private boolean rnrHasSHead(NLPNode dNode, NLPNode sHead, String label)
//	{
//		for (DEPArc rnr : dNode.getSecondaryHeadList(DEPTagEn.DEP2_RNR))
//		{
//			if (rnr.getNode().isArgumentOf(sHead, label))
//				return true;
//		}
//		
//		return false;
//	}
	
	/** Called by {@link #addSemanticHeads(NLPNode[], CTTree)}. */
	private void relabelNumberedArguments(NLPNode[] tree)
	{
		Map<String,NLPNode> map = new HashMap<>();
		String key;
		
		for (NLPNode node : tree)
		{
			for (DEPArc<NLPNode> arc : node.getSemanticHeadList())
			{
				if (PBLib.isReferentArgument(arc.getLabel()))
					continue;
								
				if (PBLib.isModifier(arc.getLabel()))
					continue;
				
				key = arc.toString();
				
				if (map.containsKey(key))
					arc.setLabel(PBLib.PREFIX_CONCATENATION + arc.getLabel());
				else
					map.put(key, node);
			}
		}
	}
	
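	/** Applies final relabeling: datives, emoticons as discourse elements, vocatives, and noun/number compounds. */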
	private void finalize(NLPNode[] tree)
	{
		finalizeLabels(tree);
		finalizeCompound(tree, POSTagEn.POS_NN, DEPTagEn.DEP_NMOD , n -> n.getPartOfSpeechTag().startsWith(POSTagEn.POS_NNP) || n.isDependencyLabel(DEPTagEn.DEP_NMOD) || n.isDependencyLabel(DEPTagEn.DEP_DEP));
		finalizeCompound(tree, POSTagEn.POS_CD, DEPTagEn.DEP_QMOD, n -> n.isDependencyLabel(DEPTagEn.DEP_QMOD) || n.isDependencyLabel(DEPTagEn.DEP_DEP));
	}
	
	private void finalizeLabels(NLPNode[] tree)
	{
		for (NLPNode node : tree)
		{
			if (isDative(node))
				node.setDependencyLabel(DEPTagEn.DEP_DATIVE);
			else if (isEmoticon(node))
				node.setDependencyLabel(DEPTagEn.DEP_DISCOURSE);
			else if (isVocative(node))
				node.setDependencyLabel(DEPTagEn.DEP_VOCATIVE);
		}
	}
	
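	/** Returns {@code true} if the node's head is a verb and the node carries the {@code DTV} syntactic or {@code BNF} semantic function tag. */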
	private boolean isDative(NLPNode node)
	{
		if (!POSLibEn.isVerb(node.getDependencyHead().getPartOfSpeechTag())) return false;
//		if (node.isDependencyLabel(DEPTagEn.DEP_IOBJ)) return true;
		String feat;
		
		if ((feat = node.getFeat(NLPUtils.FEAT_SYN)) != null && DSUtils.toHashSet(Splitter.splitCommas(feat)).contains(CTTagEn.F_DTV)) return true;
		if (CTTagEn.F_BNF.equals(node.getFeat(NLPUtils.FEAT_SEM))) return true;
		
		return false;
	}
	
	private boolean isEmoticon(NLPNode node)
	{
		String s = node.getWordForm();
		int[] idx = emoticon.getEmoticonRange(s);
		return idx != null && idx[0] == 0 && idx[1] == s.length();
	}
	
	private boolean isVocative(NLPNode node)
	{
		String feat;
		return (feat = node.getFeat(NLPUtils.FEAT_SEM)) != null && feat.equals(CTLibEn.F_VOC);
	}
	
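	/**
	 * Scans the tree from right to left; for each head whose POS tag starts with {@code pos} and whose label is not {@code label},
	 * relabels preceding tokens that share the POS prefix, are descendants of the head, attach to a following node, and satisfy {@code p}
	 * as {@link DEPTagEn#DEP_COMPOUND}, skipping hyphens.
	 */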
	private void finalizeCompound(NLPNode[] tree, String pos, String label, Predicate<NLPNode> p)
	{
		NLPNode node, head;
		int i, j;
		
		for (i=tree.length-1; i>0; i--)
		{
			head = tree[i];
			
			if (head.getPartOfSpeechTag().startsWith(pos) && !head.isDependencyLabel(label))
			{
				for (j=i-1; j>0; j--)
				{
					node = tree[j];
					
					if (node.getPartOfSpeechTag().startsWith(pos) && node.isDescendantOf(head) && node.getDependencyHead().getID() > node.getID() && p.test(node))
					{
						node.setDependencyLabel(DEPTagEn.DEP_COMPOUND);
						i = j;
					}
					else if (node.isPartOfSpeechTag(POSTagEn.POS_HYPH))
						continue;
					else
						break;
				}
			}
		}
	}
}



