All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.googlecode.clearnlp.conversion.EnglishC2DConverter Maven / Gradle / Ivy

/**
* Copyright (c) 2009-2012, Regents of the University of Colorado
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package com.googlecode.clearnlp.conversion;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import com.googlecode.clearnlp.constituent.CTLib;
import com.googlecode.clearnlp.constituent.CTLibEn;
import com.googlecode.clearnlp.constituent.CTNode;
import com.googlecode.clearnlp.constituent.CTTree;
import com.googlecode.clearnlp.dependency.DEPArc;
import com.googlecode.clearnlp.dependency.DEPFeat;
import com.googlecode.clearnlp.dependency.DEPLib;
import com.googlecode.clearnlp.dependency.DEPLibEn;
import com.googlecode.clearnlp.dependency.DEPNode;
import com.googlecode.clearnlp.dependency.DEPTree;
import com.googlecode.clearnlp.dependency.srl.SRLLib;
import com.googlecode.clearnlp.headrule.HeadRule;
import com.googlecode.clearnlp.headrule.HeadRuleMap;
import com.googlecode.clearnlp.morphology.MPLibEn;
import com.googlecode.clearnlp.util.UTArray;
import com.googlecode.clearnlp.util.pair.Pair;
import com.googlecode.clearnlp.util.pair.StringIntPair;


/**
 * Constituent to dependency converter for English.
 * @since 1.0.0
 * @author Jinho D. Choi ({@code [email protected]})
 */
public class EnglishC2DConverter extends AbstractC2DConverter
{
	static final public byte TYPE_STANFORD = 0;
	private final int SIZE_HEAD_FLAGS = 4;
	
	private final String[] a_semTags = {CTLibEn.FTAG_BNF, CTLibEn.FTAG_DIR, CTLibEn.FTAG_EXT, CTLibEn.FTAG_LOC, CTLibEn.FTAG_MNR, CTLibEn.FTAG_PRP, CTLibEn.FTAG_TMP, CTLibEn.FTAG_VOC};
	private final String[] a_synTags = {CTLibEn.FTAG_ADV, CTLibEn.FTAG_CLF, CTLibEn.FTAG_CLR, CTLibEn.FTAG_DTV, CTLibEn.FTAG_NOM, CTLibEn.FTAG_PUT, CTLibEn.FTAG_PRD, CTLibEn.FTAG_TPC};
	private Set    s_semTags;
	private Set    s_synTags;
	
	private Map> m_rnr;
	private Map> m_xsbj;
	private Map       m_coord;
	
	private List>> l_mergeLabels;
	
	/**
	 * Creates a converter and initializes its tag sets, coordination patterns and merge labels.
	 * @param headrules the head rules handed to the super class.
	 * @param mergeLabels the merge-label specification; {@code null} results in no merge labels.
	 */
	public EnglishC2DConverter(HeadRuleMap headrules, String mergeLabels)
	{
		super(headrules);

		// each initializer fills its own fields only
		initBasic();
		initCoord();
		initMerge(mergeLabels);
	}
	
	private void initBasic()
	{
		s_semTags = UTArray.toSet(a_semTags);
		s_synTags = UTArray.toSet(a_synTags);

		m_rnr   = new HashMap>();
		m_xsbj  = new HashMap>();
	}
	
	/**
	 * Fills {@link #m_coord}: for each phrase tag, the pattern that is later retrieved
	 * with the phrase tag of the node being examined.
	 */
	private void initCoord()
	{
		Map<String,Pattern> map = new HashMap<String,Pattern>();
		
		map.put(CTLibEn.PTAG_ADJP	, Pattern.compile("^(ADJP|JJ.*|VBN|VBG)$"));
		map.put(CTLibEn.PTAG_ADVP	, Pattern.compile("^(ADVP|RB.*)$"));
		map.put(CTLibEn.PTAG_INTJ	, Pattern.compile("^(INTJ|UH)$"));
		map.put(CTLibEn.PTAG_PP  	, Pattern.compile("^(PP|IN|VBG)$"));
		map.put(CTLibEn.PTAG_PRT 	, Pattern.compile("^(PRT|RP)$"));
		map.put(CTLibEn.PTAG_NAC 	, Pattern.compile("^(NP)$"));
		map.put(CTLibEn.PTAG_NML 	, Pattern.compile("^(NP|NML|NN.*|PRP)$"));
		map.put(CTLibEn.PTAG_NP  	, Pattern.compile("^(NP|NML|NN.*|PRP)$"));
		map.put(CTLibEn.PTAG_NX  	, Pattern.compile("^(NX)$"));
		map.put(CTLibEn.PTAG_VP  	, Pattern.compile("^(VP|VB.*)$"));
		map.put(CTLibEn.PTAG_S   	, Pattern.compile("^(S|SINV|SQ|SBARQ)$"));
		map.put(CTLibEn.PTAG_SBAR	, Pattern.compile("^(SBAR.*)$"));
		map.put(CTLibEn.PTAG_SBARQ	, Pattern.compile("^(SBAR.*)$"));
		map.put(CTLibEn.PTAG_SINV	, Pattern.compile("^(S|SINV)$"));
		map.put(CTLibEn.PTAG_SQ		, Pattern.compile("^(S|SQ|SBARQ)$"));
		map.put(CTLibEn.PTAG_WHNP	, Pattern.compile("^(NN.*|WP)$"));
		map.put(CTLibEn.PTAG_WHADJP	, Pattern.compile("^(JJ.*|VBN|VBG)$"));
		map.put(CTLibEn.PTAG_WHADVP	, Pattern.compile("^(RB.*|WRB|IN)$"));
		
		m_coord = map;
	}
	
	private void initMerge(String mergeLabels)
	{
		l_mergeLabels = new ArrayList>>();
		
		if (mergeLabels != null)
		{
			String[]    tmp;
			String      nLabel;
			Set oLabels;
			
			for (String ms : mergeLabels.split("\\"+DEPFeat.DELIM_FEATS))
			{
				tmp     = ms.split(DEPFeat.DELIM_KEY_VALUE);
				nLabel  = tmp[0];
				oLabels = new HashSet();
				
				for (String oLabel : tmp[1].split(DEPFeat.DELIM_VALUES))
					oLabels.add(oLabel);
						
				l_mergeLabels.add(new Pair>(nLabel, oLabels));
			}
		}
	}

	/**
	 * Converts the specific constituent tree into a dependency tree.
	 * @param cTree the constituent tree to be converted; its empty categories are removed or relocated in place.
	 * @return the dependency tree, or {@code null} if no node is left after the empty categories are processed.
	 */
	@Override
	public DEPTree toDEPTree(CTTree cTree)
	{
		clearMaps();
		
		if (mapEmtpyCategories(cTree))
		{
			setHeads(cTree.getRoot());
			return getDEPTree(cTree);
		}
		
		return null;
	}
	
	/** Empties the per-tree maps so that entries of a previous tree are not carried over. */
	private void clearMaps()
	{
		m_xsbj.clear();
		m_rnr.clear();
	}
	
	// ============================= Map empty categories ============================= 
	
	/**
	 * Removes or relocates the empty categories of the specific tree.
	 * @param cTree the constituent tree to be processed.
	 * @return {@code true} if the root of the tree still has children after the empty categories are processed.
	 */
	public boolean mapEmtpyCategories(CTTree cTree)
	{
		for (CTNode terminal : cTree.getTerminals())
		{
			// only empty categories that are still attached to a parent are handled
			if (!terminal.isEmptyCategory() || terminal.getParent() == null)
				continue;
			
			String form = terminal.form;
			
			if (form.startsWith(CTLibEn.EC_PRO))
			{
				mapPRO(cTree, terminal);
			}
			else if (form.startsWith(CTLibEn.EC_TRACE))
			{
				mapTrace(cTree, terminal);
			}
			else if (CTLibEn.RE_NULL.matcher(form).find())
			{
				mapNull(cTree, terminal);
			}
			else if (terminal.isForm("0"))
			{
				// the form "0" is kept in the tree as it is
				continue;
			}
			else if (CTLibEn.RE_ICH_PPA_RNR.matcher(form).find())
			{
				mapICH(cTree, terminal);
			}
		//	else if (node.form.startsWith(CTLibEn.EC_EXP))
		//		reloateEXP(cTree, node);
			else
			{
				removeCTNode(terminal);
			}
		}
		
		int remaining = cTree.getRoot().getChildrenSize();
		return remaining > 0;
	}
	
	/**
	 * Handles an empty category whose form starts with {@code CTLibEn.EC_PRO}.
	 * Called by {@link EnglishC2DConverter#mapEmtpyCategories(CTTree)}.
	 */
	private void mapPRO(CTTree cTree, CTNode ec)
	{
		CTNode np = ec.getParent();
		
		// small clauses: no chained VP under the parent of the empty subject
		if (np.getParent().getFirstChainedDescendant(CTLibEn.PTAG_VP) == null)
		{
			relocatePRD(np, ec);
			return;
		}
		
		CTNode ante = ec.getAntecedent();
		boolean isRelative = ante != null && ante.pTag.startsWith("WH");	// relative clauses
		
		// the trace mapping applies only when this is the single empty category co-indexed with the antecedent
		if (isRelative && cTree.getCoIndexedEmptyCategories(ante.coIndex).size() == 1)
			mapTrace(cTree, ec);
		
		addXSubject(ec, m_xsbj);
	}
	
	/**
	 * Handles a trace: removes it, or puts its antecedent in its place.
	 * Called by {@link EnglishC2DConverter#mapEmtpyCategories(CTTree)}.
	 */
	private void mapTrace(CTTree cTree, CTNode ec)
	{
		CTNode ante = ec.getAntecedent();
		
		// no antecedent, or the trace sits inside its own antecedent
		if (ante == null || ec.isDescendantOf(ante))
		{
			removeCTNode(ec);
			return;
		}
		
		if (!ante.hasFTag(CTLibEn.FTAG_TPC))	// relative clauses
		{
			CTNode sbar = ante.getHighestChainedAncestor(CTLibEn.PTAG_SBAR);
			if (sbar != null)	sbar.addFTag(DEPLibEn.DEP_RCMOD);
			replaceEC(ec, ante);
		}
		else if (ante.hasFTag(CTLibEn.FTAG_SBJ))
		{
			removeCTNode(ec);
		}
		else
		{
			// topicalized non-subject: the antecedent takes the place of the parent of the trace
			CTNode parent = ec.getParent();
			parent.removeChild(ec);
			replaceEC(parent, ante);
		}
	}
	
	/**
	 * Handles an empty category whose form matches {@code CTLibEn.RE_NULL}; only empty subjects are processed.
	 * Called by {@link EnglishC2DConverter#mapEmtpyCategories(CTTree)}.
	 */
	private void mapNull(CTTree cTree, CTNode ec)
	{
		CTNode np = ec.getParent();
		if (!np.hasFTag(CTLibEn.FTAG_SBJ))	return;
		
		if (np.getNextSibling(CTLibEn.PTAG_VP) != null)
			addXSubject(ec, m_xsbj);
		else					// small clauses
			relocatePRD(np, ec);
	}
	
	/**
	 * Handles an empty category whose form matches {@code CTLibEn.RE_ICH_PPA_RNR}:
	 * all but one of the empty categories co-indexed with the antecedent are removed, and the antecedent
	 * takes the place of the parent of the remaining one.
	 * For right node raising, the grandparents of the removed empty categories are recorded in {@link #m_rnr}.
	 * Called by {@link EnglishC2DConverter#mapEmtpyCategories(CTTree)}.
	 */
	private void mapICH(CTTree cTree, CTNode ec)
	{
		CTNode parent = ec.getParent();
		CTNode ante   = ec.getAntecedent();
		
		if (ec.form.startsWith(CTLibEn.EC_ICH) && parent.getPrevSibling("+WH.*") != null)
			removeCTNode(ec);
		else if (ante == null || ec.isDescendantOf(ante))
			removeCTNode(ec);
		else
		{
			List<CTNode> list = cTree.getCoIndexedEmptyCategories(ante.coIndex);
			boolean isRNR = ec.form.startsWith(CTLibEn.EC_RNR);
			int i, size = list.size();
			CTNode node;
			
			Deque<CTNode> dq = isRNR ? new ArrayDeque<CTNode>() : null; 
			
			// NOTE(review): the forward loop and the 'else' header below were restored by symmetry with the
			// surviving backward loop after the text between '<' and '>' was lost - confirm against upstream.
			if (ec.getTerminalId() < ante.getFirstTerminal().getTerminalId())
			{		
				// empty category precedes the antecedent: keep the last co-indexed one
				for (i=0; i<size-1; i++)
				{
					node = list.get(i);
					if (isRNR)	dq.addLast(node.getParent().getParent());
					removeCTNode(node);
				}
				
				ec = list.get(size-1);
			}
			else
			{
				// empty category follows the antecedent: keep the first co-indexed one
				for (i=size-1; i>0; i--)
				{
					node = list.get(i);
					if (isRNR)	dq.addFirst(node.getParent().getParent());
					removeCTNode(node);
				}
				
				ec = list.get(0);
			}
			
			if (isRNR && !dq.isEmpty())
				m_rnr.put(ante, dq);
			
			parent = ec.getParent();
			parent.removeChild(ec);
			replaceEC(parent, ante);
		}
	}
	
	/**
	 * Marks the clause of a small-clause empty subject as an object predicate when it has a predicate child,
	 * then removes the empty category.
	 * Called by {@link EnglishC2DConverter#mapPRO(CTTree, CTNode)} and {@link EnglishC2DConverter#mapNull(CTTree, CTNode)}.
	 * @param np the parent of the empty category.
	 * @param ec the empty category to be removed.
	 */
	private void relocatePRD(CTNode np, CTNode ec)
	{
		CTNode s   = np.getParent();
		CTNode prd = s.getFirstChild("-"+CTLibEn.FTAG_PRD);
		Set<String> fTags = s.getFTags();
		
		// only clauses without function tags, or with the CLR tag, are relabeled
		if (prd != null && (fTags.isEmpty() || fTags.contains(CTLibEn.FTAG_CLR)))
		{
			fTags.clear();
			fTags.add(DEPLibEn.DEP_OPRD);
		}

		removeCTNode(ec);
	}
	
/*	private void reloateEXP(CTTree cTree, CTNode ec)
	{
		int idx = ec.form.lastIndexOf("-");
		
		if (idx != -1)
		{
			int coIndex = Integer.parseInt(ec.form.substring(idx+1));
			CTNode ante = cTree.getCoIndexedAntecedent(coIndex);
			if (ante != null)	ante.addFTag(DEPLibEn.CONLL_EXTR);
		}
		
		removeCTNode(ec);
	}*/
	
	/**
	 * @param ec empty subject.
	 * @param map key: antecedent, value: list of clauses containing empty subjects.
	 */
	private void addXSubject(CTNode ec, Map> map)
	{
		CTNode ante = ec.getAntecedent();
		
		while (ante != null && ante.isEmptyCategoryRec() && !ante.pTag.startsWith("WH"))
			ante = ante.getFirstTerminal().getAntecedent();
		
		if (ante != null)
		{
			CTNode s = ec.getNearestAncestor(CTLibEn.PTAG_S);
			
			if (s != null)
			{
				Deque dq = map.get(ante);
				if (dq == null)	dq = new ArrayDeque();
				
				dq.add(s);
				map.put(ante, dq);
			}
		}
	}
	
	/**
	 * Detaches the specific node from its parent, then keeps detaching every ancestor
	 * that is left without children.
	 * @param node the node to be removed; nothing happens if it has no parent.
	 */
	private void removeCTNode(CTNode node)
	{
		CTNode child  = node;
		CTNode parent = child.getParent();
		
		while (parent != null)
		{
			parent.removeChild(child);
			
			// stop climbing as soon as an ancestor still has other children
			if (parent.getChildrenSize() != 0)
				break;
			
			child  = parent;
			parent = child.getParent();
		}
	}
	
	/**
	 * Detaches the antecedent from its current position and stores it in the parent of {@code ec},
	 * at the sibling index of {@code ec}.
	 */
	private void replaceEC(CTNode ec, CTNode ante)
	{
		// the antecedent is detached first; the parent of ec is looked up only afterwards
		removeCTNode(ante);
		
		final CTNode parent = ec.getParent();
		parent.setChild(ec.getSiblingId(), ante);
	}
	
	// ============================= Find heads =============================
	
	/**
	 * Finds the head of the specific node.
	 * Coordination structures are handled separately; otherwise the head is picked by the head rule.
	 */
	@Override
	protected void setHeadsAux(HeadRule rule, CTNode curr)
	{
		if (!findHeadsCoordination(rule, curr))
		{
			findHyphens(curr);
			findHeadsApposition(curr);
			findHeadsSmallClause(curr);
			
			CTNode head = getHead(rule, curr.getChildren(), SIZE_HEAD_FLAGS);
			
			// clear any label already assigned to the chosen head
			if (head.c2d.getLabel() != null)
				head.c2d.setLabel(null);
			
			curr.c2d = new C2DInfo(head);
		}
	}
	
	
	/**
	 * If the specific node contains a coordination structure, find the head of each coordination.
	 * @param rule the head rule passed on to {@code findHeadsCoordinationAux}.
	 * @param curr the specific node to be compared. 
	 * @return {@code true} if this node contains a coordination structure.
	 */
	private boolean findHeadsCoordination(HeadRule rule, CTNode curr)
	{
		// skip pre-conjunctions and punctuation
		int i, sId, size = curr.getChildrenSize();
		CTNode node;
		
		// NOTE(review): the statement below is incomplete in this copy - the text between the loop
		// condition and the trailing "0)" is missing, so bId, eId, prevHead and mainHead are never
		// declared here. Restore the body from the upstream source before compiling.
		for (sId=0; sId 0)
			findHeadsCoordinationAux(rule, curr, bId, eId, prevHead);
		
		curr.c2d = new C2DInfo(mainHead);
		return true;
	}
	
	/**
	 * Looks up the pattern registered in {@code m_coord} for the phrase tag of the specific node.
	 * Called by {@link EnglishC2DConverter#findHeadsCoordination(HeadRule, CTNode)}.
	 */
	private Pattern getConjunctPattern(CTNode curr, int sId, int size)
	{
		Pattern rTags = m_coord.get(curr.pTag);
		
		if (rTags != null)
		{
			boolean b = false;
			int i;
			
			// NOTE(review): from here on this copy is corrupted - the rest of this method and the
			// beginning of at least one following method (a loop over l_mergeLabels and a check that
			// warns about multiple roots) are fused together. Restore from the upstream source.
			for (i=sId; i> p : l_mergeLabels)
		{
			for (i=1; i 1)	System.err.println("Warning: multiple roots exist");
	}
	
	/** Splits certain Stanford dependency labels into finer-grained labels. */
	private void splitLabels(DEPTree tree)
	{
		int i, size = tree.size();
		List list;
		DEPNode node;

		tree.setDependents();
		
		// NOTE(review): the loop below is incomplete in this copy - its header, the assignments of
		// 'node' and 'list', and the opening brace are missing. Only the final step survives: the
		// first element of 'list' is relabeled as DEP_IOBJ. Restore from the upstream source.
		for (i=1; i 1)
				list.get(0).setLabel(DEPLibEn.DEP_IOBJ);
		}
	}
	
	/**
	 * Adds secondary dependency heads: external subjects collected in {@code m_xsbj}
	 * and right-node-raising relations collected in {@code m_rnr}.
	 */
	private void addXHeads(DEPTree dTree)
	{
		for (Map.Entry<CTNode,Deque<CTNode>> entry : m_xsbj.entrySet())
		{
			CTNode ante = entry.getKey();
			
			// antecedents without conversion info are skipped
			if (ante.c2d != null)
				addXHeadsAux(dTree, ante, entry.getValue(), DEPLibEn.DEP_XSUBJ);
		}
		
		for (Map.Entry<CTNode,Deque<CTNode>> entry : m_rnr.entrySet())
		{
			CTNode ante   = entry.getKey();
			CTNode parent = ante.getParent();
			
			if (parent == null)
				continue;
			
			// when the antecedent is the phrase head of its parent, the recorded nodes become its children instead
			if (parent.c2d.getPhraseHead() == ante)
				addXChildren(dTree, ante, entry.getValue(), DEPLibEn.DEP_RNR);
			else
				addXHeadsAux(dTree, ante, entry.getValue(), DEPLibEn.DEP_RNR);
		}
	}
	
	/**
	 * Adds every node in the queue as a secondary head of the specific node.
	 * Called by {@link EnglishC2DConverter#addXHeads(DEPTree)}.
	 * @param dTree the dependency tree being built.
	 * @param cNode the constituent node whose dependency node receives the secondary heads.
	 * @param dq the constituent nodes whose dependency nodes become the secondary heads.
	 * @param label the label of the secondary dependencies.
	 */
	private void addXHeadsAux(DEPTree dTree, CTNode cNode, Deque<CTNode> dq, String label)
	{
		DEPNode node = getDEPNode(dTree, cNode);
		DEPNode head;
		
		for (CTNode cHead : dq)
		{
			head = getDEPNode(dTree, cHead);
			node.addXHead(head, label);
			
			// a clausal complement that receives an external subject is relabeled as an open complement
			if (label.equals(DEPLibEn.DEP_XSUBJ) && head.isLabel(DEPLibEn.DEP_CCOMP))
				head.setLabel(DEPLibEn.DEP_XCOMP);
		}
	}
	
	/**
	 * Adds the specific head as a secondary head of every node in the queue.
	 * Called by {@link EnglishC2DConverter#addXHeads(DEPTree)}.
	 * @param dTree the dependency tree being built.
	 * @param cHead the constituent node whose dependency node becomes the secondary head.
	 * @param dq the constituent nodes whose dependency nodes receive the secondary head.
	 * @param label the label of the secondary dependencies.
	 */
	private void addXChildren(DEPTree dTree, CTNode cHead, Deque<CTNode> dq, String label)
	{
		DEPNode head = getDEPNode(dTree, cHead);
		DEPNode node;
		
		for (CTNode cNode : dq)
		{
			node = getDEPNode(dTree, cNode);
			node.addXHead(head, label);
		}
	}
	
	/**
	 * Adds extra features to the specific node and, recursively, to all of its descendants:
	 * a gapping relation when applicable, and the semantic and syntactic function tags.
	 */
	private void addFeats(DEPTree dTree, CTTree cTree, CTNode cNode)
	{
		// the gap is added only at the highest node carrying the gap index
		if (cNode.gapIndex != -1 && cNode.getParent().gapIndex == -1)
		{
			CTNode ante = cTree.getCoIndexedAntecedent(cNode.gapIndex);
			
			if (ante != null)
				getDEPNode(dTree, cNode).addXHead(getDEPNode(dTree, ante), DEPLibEn.DEP_GAP);
		}
		
		String sem = getFunctionTags(cNode, s_semTags);
		if (sem != null)	cNode.c2d.putFeat(DEPLibEn.FEAT_SEM, sem);
		
		String syn = getFunctionTags(cNode, s_synTags);
		if (syn != null)	cNode.c2d.putFeat(DEPLibEn.FEAT_SYN, syn);

		for (CTNode child : cNode.getChildren())
			addFeats(dTree, cTree, child);
	}
	
	/**
	 * Collects the function tags of the specific node that are contained in the specific set.
	 * Called by {@link EnglishC2DConverter#addFeats(DEPTree, CTTree, CTNode)}.
	 * @param node the node whose function tags are examined.
	 * @param sTags the function tags to be retained.
	 * @return the retained tags in sorted order, joined by {@code DEPFeat.DELIM_VALUES};
	 *         {@code null} if no tag is retained.
	 */
	private String getFunctionTags(CTNode node, Set<String> sTags)
	{
		List<String> tags = new ArrayList<String>();
		
		for (String tag : node.getFTags())
		{
			if (sTags.contains(tag))
				tags.add(tag);
		}
		
		if (tags.isEmpty())	return null;
		Collections.sort(tags);

		StringBuilder build = new StringBuilder();
		
		for (String tag : tags)
		{
			build.append(DEPFeat.DELIM_VALUES);
			build.append(tag);
		}
		
		// drop the delimiter that was written in front of the first tag
		return build.substring(DEPFeat.DELIM_VALUES.length());
	}
	
	// ============================= Add PropBank arguments =============================
	
	/**
	 * Initializes the semantic heads of the dependency tree and, if the root of the constituent tree
	 * carries PropBank arguments, transfers, arranges and relabels them.
	 */
	private void addPBArgs(DEPTree dTree, CTTree cTree)
	{
		CTNode root = cTree.getRoot();
		dTree.initSHeads();
		
		if (root.pbArgs == null)
			return;
		
		initPBArgs(dTree, cTree, root);
		arrangePBArgs(dTree);
		relabelArgNs(dTree);
	}
	
	/**
	 * Copies the PropBank arguments of the specific node, and recursively of its descendants,
	 * onto the dependency tree as semantic heads. The TOP node itself is skipped.
	 */
	private void initPBArgs(DEPTree dTree, CTTree cTree, CTNode cNode)
	{
		if (!cNode.isPTag(CTLib.PTAG_TOP))
		{
			// phrases map to their dependency head, other nodes to their own token
			DEPNode dNode = cNode.isPhrase() ? getDEPNode(dTree, cNode) : dTree.get(cNode.getTokenId()+1);
			
			for (StringIntPair arg : cNode.pbArgs)
			{
				DEPNode sHead = dTree.get(arg.i);
				
				// note: the label stored in the constituent node is rewritten in place
				if (isRefArgument(cNode))
					arg.s = "R-"+arg.s;
				
				boolean isNew = !dNode.containsSHead(sHead) && dNode != sHead;
				if (isNew)	dNode.addSHead(sHead, arg.s);
			}
		}
		
		for (CTNode child : cNode.getChildren())
			initPBArgs(dTree, cTree, child);
	}
	
	/**
	 * @return {@code true} if the specific node is a relative phrase,
	 *         or a PP for which {@code containsRefArgument} holds.
	 */
	private boolean isRefArgument(CTNode cNode)
	{
		return CTLibEn.isRelPhrase(cNode)
			|| (cNode.isPTag(CTLibEn.PTAG_PP) && containsRefArgument(cNode));
	}
	
	/**
	 * @return {@code true} if any ADJP, ADVP, NP or PP child of the specific node
	 *         has a non-empty child that is a relative phrase.
	 */
	private boolean containsRefArgument(CTNode cNode)
	{
		for (CTNode child : cNode.getChildren()) 
		{
			if (!child.isPTagAny(CTLibEn.PTAG_ADJP, CTLibEn.PTAG_ADVP, CTLibEn.PTAG_NP, CTLibEn.PTAG_PP))
				continue;
			
			for (CTNode grandChild : child.getChildren())
			{
				if (grandChild.isEmptyCategoryRec())
					continue;
				
				if (CTLibEn.isRelPhrase(grandChild))
					return true;
			}
		}
		
		return false;
	}
	
	private void arrangePBArgs(DEPTree dTree)
	{
		int i, size = dTree.size();
		List remove;
		DEPNode node, head;
		String label;
		
		for (i=1; i();
			
			for (DEPArc arc : node.getSHeads())
			{
				head  = arc.getNode();
				label = arc.getLabel();
				
				if (ancestorHasSHead(node, head, label))
					remove.add(arc);
			//	else if (rnrHasSHead(node, head, label))
			//		remove.add(arc);
			}
			
			node.removeSHeads(remove);
		}
	}
	
	/**
	 * @return {@code true} if any ancestor of the specific node in the dependency tree
	 *         is an argument of the specific semantic head with the specific label.
	 */
	private boolean ancestorHasSHead(DEPNode dNode, DEPNode sHead, String label)
	{
		for (DEPNode ancestor = dNode.getHead(); ancestor != null; ancestor = ancestor.getHead())
		{
			if (ancestor.isArgumentOf(sHead, label))
				return true;
		}
		
		return false;
	}
	
	/**
	 * @return {@code true} if any secondary head of the specific node labeled {@code DEPLibEn.DEP_RNR}
	 *         is an argument of the specific semantic head with the specific label.
	 */
	protected boolean rnrHasSHead(DEPNode dNode, DEPNode sHead, String label)
	{
		for (DEPArc arc : dNode.getXHeads(DEPLibEn.DEP_RNR))
		{
			DEPNode rnrHead = arc.getNode();
			
			if (rnrHead.isArgumentOf(sHead, label))
				return true;
		}
		
		return false;
	}
	
	private void relabelArgNs(DEPTree dTree)
	{
		Map map = new HashMap();
		int i, size = dTree.size();
		List remove;
		DEPNode node;
		String key;
		
		for (i=1; i();
			
			for (DEPArc arc : node.getSHeads())
			{
				if (arc.getLabel().startsWith(SRLLib.PREFIX_REFERENT))
					continue;
				
				if (arc.getLabel().startsWith("AM"))
					continue;
				
				key = arc.toString();
				
				if (map.containsKey(key))
					arc.setLabel(SRLLib.PREFIX_CONCATENATION + arc.getLabel());
				else
					map.put(key, node);
			}
			
			node.removeSHeads(remove);
		}
	}
	
	/**
	 * @return the dependency node corresponding to the dependency head of the specific constituent node.
	 */
	private DEPNode getDEPNode(DEPTree dTree, CTNode cNode)
	{
		// the dependency tree is indexed by token ID + 1
		int tokenId = cNode.c2d.getDependencyHead().getTokenId();
		return dTree.get(tokenId + 1);
	}
	
	
	// ============================= Get CoNLL labels =============================
	
/*	private void convertToCoNLLLabels(DEPTree tree)
	{
		int i, size = tree.size();
		DEPNode node;
		
		for (i=1; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy