All downloads are free. The search and download functionality uses the official Maven repository.

edu.emory.mathcs.nlp.common.constituent.CTTree Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2015, Emory University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.mathcs.nlp.common.constituent;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;

import edu.emory.mathcs.nlp.common.constant.StringConst;
import edu.emory.mathcs.nlp.common.propbank.PBArgument;
import edu.emory.mathcs.nlp.common.propbank.PBInstance;
import edu.emory.mathcs.nlp.common.propbank.PBLib;
import edu.emory.mathcs.nlp.common.propbank.PBLocation;
import edu.emory.mathcs.nlp.common.treebank.PBArc;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;


/**
 * Constituent tree.
 * @see CTReader
 * @see CTNode 
 * @author Jinho D. Choi ({@code [email protected]})
 */
public class CTTree
{
	private CTNode                      n_root;
	private List                n_termainals;
	private List                n_tokens;
	private Int2ObjectMap> m_nulls;
	
	/**
	 * Constructs a constituent tree using the specific root node.
	 * Construction indexes the terminal nodes under {@code root} and then links
	 * co-indexed empty categories to their antecedents.
	 * @param root the root node of this tree.
	 */
	public CTTree(CTNode root)
	{
		n_root = root;
		// Order matters: the linking step iterates the terminal list built by initTerminals().
		initTerminals();
		linkEmtpyCategories();
	}
	
	/**
	 * Collects the terminal nodes under the root, in the order the recursive
	 * traversal reaches them, and stores the resulting lists in
	 * {@link #n_termainals} and {@link #n_tokens}.
	 * Called by {@link #CTTree(CTNode)}.
	 */
	private void initTerminals()
	{
		// Element types are spelled out so that every insertion is type-checked.
		List<CTNode> terminals = new ArrayList<>();
		List<CTNode> tokens    = new ArrayList<>();

		initTerminalsAux(n_root, terminals, tokens);
		n_termainals = terminals;
		n_tokens     = tokens;
	}

	/**
	 * Recursively visits the subtree rooted at {@code curr} and appends its terminals to the lists.
	 * Every terminal receives its position in {@code terminals} as terminal ID;
	 * terminals that are not empty categories additionally receive their position
	 * in {@code tokens} as token ID.
	 * Called by {@link #initTerminals()}.
	 * @param curr the node to visit.
	 * @param terminals accumulates all terminal nodes.
	 * @param tokens accumulates the terminal nodes that are not empty categories.
	 */
	private void initTerminalsAux(CTNode curr, List<CTNode> terminals, List<CTNode> tokens)
	{
		if (curr.isTerminal())
		{
			// IDs are assigned before insertion, so they equal the list positions.
			curr.setTerminalID(terminals.size());
			terminals.add(curr);
			
			if (!curr.isEmptyCategory())
			{
				curr.setTokenID(tokens.size());
				tokens.add(curr);
			}
		}
		else
		{
			for (CTNode child : curr.getChildrenList())
				initTerminalsAux(child, terminals, tokens);
		}
	}
	
	/** Called by {@link #CTTree(CTNode)}. */
	private void linkEmtpyCategories()
	{
		m_nulls = new Int2ObjectOpenHashMap>();
		List list;
		int idx, coIndex;
		String form;
		
		for (CTNode node : n_termainals)
		{
			form = node.getWordForm();
			
			if (node.isEmptyCategory() && (idx = form.lastIndexOf("-")) >= 0)
			{
				coIndex = Integer.parseInt(form.substring(idx+1));
				node.setAntecedent(getAntecedent(coIndex));
				
				if (node.hasAntecedent())
				{
					if (m_nulls.containsKey(coIndex))
						list = m_nulls.get(coIndex);
					else
					{
						list = new ArrayList<>();
						m_nulls.put(coIndex, list);
					}
		
					list.add(node);
				}
			}
		}
	}
	
//	======================== Getters ========================

	/** @return the root node that was passed to the constructor of this tree. */
	public CTNode getRoot()
	{
		return n_root;
	}
	
	/**
	 * Finds a node by starting at a terminal and climbing a number of parent links.
	 * No bounds are checked here; {@link #isRange(int, int)} can be used beforehand.
	 * @param terminalID the ID of the terminal node to start from.
	 * @param height the number of parent links to follow from that terminal (0 yields the terminal itself).
	 * @return the node reached after climbing {@code height} levels.
	 */
	public CTNode getNode(int terminalID, int height)
	{
		CTNode current = getTerminal(terminalID);
		int remaining = height;
		
		while (remaining > 0)
		{
			current = current.getParent();
			remaining--;
		}
		
		return current;
	}
	
	/**
	 * @return the node addressed by the terminal ID and height of the specific location;
	 * see {@link #getNode(int, int)}.
	 * @param location the PropBank location to resolve.
	 */
	public CTNode getNode(PBLocation location)
	{
		return getNode(location.getTerminalID(), location.getHeight());
	}
	
	/**
	 * @return the terminal node stored at position {@code terminalID} of the terminal list.
	 * @param terminalID the ID (list position) of the terminal node.
	 */
	public CTNode getTerminal(int terminalID)
	{
		return n_termainals.get(terminalID);
	}
	
	/**
	 * @return the list of all terminal nodes, including empty categories.
	 * The returned list is the one held by this tree, not a copy.
	 */
	public List<CTNode> getTerminalList()
	{
		return n_termainals;
	}
	
	/**
	 * @return the terminal node stored at position {@code tokenID} of the token list
	 * (the terminals that are not empty categories).
	 * @param tokenID the ID (list position) of the token.
	 */
	public CTNode getToken(int tokenID)
	{
		return n_tokens.get(tokenID);
	}
	
	/**
	 * @return the list of all terminal nodes discarding empty categories.
	 * The returned list is the one held by this tree, not a copy.
	 */
	public List<CTNode> getTokenList()
	{
		return n_tokens;
	}
	
	/**
	 * Searches this tree from the root for the node carrying the specific index.
	 * Note that the search is not read-only: when a node matches through its
	 * gapping-relation index, {@link #getAntecedentAux(int, CTNode)} swaps that
	 * node's empty-category index and gapping-relation index.
	 * @param index the co-index to look for.
	 * @return the antecedent corresponding to the specific index if exists; otherwise, {@code null}.
	 */
	public CTNode getAntecedent(int index)
	{
		return getAntecedentAux(index, n_root);
	}
	
	/**
	 * Depth-first search for the first node whose empty-category index or
	 * gapping-relation index equals {@code index}.
	 * A node that matches through its gapping-relation index has its two indices
	 * exchanged before it is returned.
	 * Called by {@link CTTree#getAntecedent(int)}.
	 * @param index the co-index to look for.
	 * @param curr the root of the subtree to search.
	 * @return the matching node, or {@code null} if the subtree contains none.
	 */
	private CTNode getAntecedentAux(int index, CTNode curr)
	{
		final int ecIndex = curr.getEmptyCategoryIndex();
		
		if (ecIndex == index)
			return curr;
		
		final int gapIndex = curr.getGappingRelationIndex();
		
		if (gapIndex == index)
		{
			// Exchange the two indices of the matching node.
			curr.setEmptyCategoryIndex(gapIndex);
			curr.setGappingRelationIndex(ecIndex);
			return curr;
		}
		
		for (CTNode child : curr.getChildrenList())
		{
			CTNode found = getAntecedentAux(index, child);
			
			if (found != null)
				return found;
		}
		
		return null;
	}
	
	/**
	 * @return the list of empty categories with the specific co-index if exists; otherwise, {@code null}.
	 * The returned list is the one held by this tree, not a copy.
	 * @param index the co-index shared by the empty categories.
	 */
	public List<CTNode> getEmptyCategoryList(int index)
	{
		return m_nulls.get(index);
	}
	
//	======================== Boolean ========================
	
	/**
	 * Checks whether a terminal ID / height pair addresses a node of this tree.
	 * @param terminalId the ID of the terminal node to start from.
	 * @param height the number of parent links to follow from that terminal.
	 * @return {@code true} if the terminal exists and {@code height} parent links
	 * can be followed from it; otherwise, {@code false}.
	 */
	public boolean isRange(int terminalId, int height)
	{
		final boolean validTerminal = 0 <= terminalId && terminalId < n_termainals.size();
		
		if (!validTerminal)
			return false;
		
		CTNode current = n_termainals.get(terminalId);
		int remaining = height;
		
		while (remaining > 0)
		{
			// Running out of ancestors before the height is consumed means out of range.
			if (!current.hasParent())
				return false;
			
			current = current.getParent();
			remaining--;
		}
		
		return true;
	}
	
	/**
	 * @return {@link #isRange(int, int)} applied to the terminal ID and height of the specific location.
	 * @param loc the PropBank location to check.
	 */
	public boolean isRange(PBLocation loc)
	{
		return isRange(loc.getTerminalID(), loc.getHeight());
	}
	
	public boolean compareBrackets(CTTree tree)
	{
		int i, size = n_termainals.size();
		
		if (size != tree.getTerminalList().size())
			return false;
		
		CTNode node1, node2;
		
		for (i=0; i> mOrg = new Int2ObjectOpenHashMap>();
		getCoIndexMap(n_root, mOrg);
		if (mOrg.isEmpty())	return;
		
		List>> ps = new ArrayList<>(mOrg.entrySet());
		Collections.sort(ps, Entry.comparingByKey());
		
		Int2IntMap mNew = new Int2IntOpenHashMap();
		int coIndex = 1, last, i;
		boolean isAnteFound;
		List list;
		CTNode curr, ec;
		
		for (Entry> p : ps)
		{
			list = p.getValue();
			last = list.size() - 1;
			isAnteFound = false;
			
			for (i=last; i>=0; i--)
			{
				curr = list.get(i);
				
				if (curr.isEmptyCategoryTerminal())
				{
					ec = curr.getTerminalList().get(0);
					
					if (i == last || isAnteFound || CTLibEn.isDiscontinuousConstituent(ec) || CTLibEn.containsCoordination(curr.getLowestCommonAncestor(list.get(i+1))))
						curr.setEmptyCategoryIndex(-1);
					else
						curr.setEmptyCategoryIndex(coIndex++);

					if (isAnteFound || i > 0)
						ec.appendWordForm("-"+coIndex);
				}
				else if (isAnteFound)
				{
					curr.setEmptyCategoryIndex(-1);
				}
				else
				{
					curr.setEmptyCategoryIndex(coIndex);
					mNew.put(p.getKey().intValue(), coIndex);
					isAnteFound  = true;
				}
			}
			
			coIndex++;
		}
		
		int[] lastIndex = {coIndex};
		remapGapIndices(mNew, lastIndex, n_root);
	}
	
	/** Called by {@link #normalizeIndices()}. */
	private void getCoIndexMap(CTNode curr, Int2ObjectMap> map)
	{
		if (!curr.isTerminal())
		{
			if (curr.getEmptyCategoryIndex() != -1)
			{
				int key = curr.getEmptyCategoryIndex();
				List list;
				
				if (map.containsKey(key))
					list = map.get(key);
				else
				{
					list = new ArrayList();
					map.put(key, list);
				}
				
				list.add(curr);
			}
			
			for (CTNode child : curr.getChildrenList())
				getCoIndexMap(child, map);
		}
		else if (curr.isEmptyCategory())
		{
			if (curr.isWordForm("*0*"))
				curr.setWordForm("0");
		}
	}
	
	/**
	 * Recursively rewrites the gapping-relation indices in the subtree rooted at {@code curr}.
	 * An index already present in {@code map} is replaced by its mapped value; any
	 * other index different from {@code -1} is given the next unused number, and
	 * that assignment is added to {@code map}.
	 * Called by {@link #normalizeIndices()}.
	 * @param map the mapping from old to new indices; extended during the traversal.
	 * @param lastIndex a one-element array holding the next unused index; advanced during the traversal.
	 * @param curr the root of the subtree to visit.
	 */
	private void remapGapIndices(Int2IntMap map, int[] lastIndex, CTNode curr)
	{
		final int oldIndex = curr.getGappingRelationIndex();
		
		if (map.containsKey(oldIndex))
		{
			curr.setGappingRelationIndex(map.get(oldIndex));
		}
		else if (oldIndex != -1)
		{
			// Take the next unused index and remember it for later occurrences.
			final int freshIndex = lastIndex[0]++;
			curr.setGappingRelationIndex(freshIndex);
			map.put(oldIndex, freshIndex);
		}
		
		for (CTNode child : curr.getChildrenList())
			remapGapIndices(map, lastIndex, child);
	}

//	======================== Strings ========================
	
	/**
	 * @return {@link #toForms(boolean, String)}, where {@code includeEmptyCategories=false}
	 * and {@code delim=StringConst.SPACE}.
	 */
	public String toForms()
	{
		return toForms(false, StringConst.SPACE);
	}
	
	/**
	 * Joins the word-forms of the terminal nodes in order.
	 * @param includeEmptyCategories if {@code true}, include forms of empty categories.
	 * @param delim the delimiter placed between consecutive word-forms.
	 * @return the string containing ordered word-forms of this tree, or
	 * {@code StringConst.EMPTY} if nothing was appended.
	 */
	public String toForms(boolean includeEmptyCategories, String delim)
	{
		List<CTNode> nodes = includeEmptyCategories ? n_termainals : n_tokens;
		StringBuilder build = new StringBuilder();
		
		// Every form is prefixed with the delimiter; the leading one is cut off below.
		for (CTNode node : nodes)
			build.append(delim).append(node.getWordForm());
		
		if (build.length() == 0)
			return StringConst.EMPTY;
		
		return build.substring(delim.length());
	}
	
	/**
	 * @return the string produced by the root node's {@code toString()}.
	 * @see CTNode#toString()
	 */
	@Override
	public String toString()
	{
		return n_root.toString();
	}
	
	/**
	 * @return the string produced by the root node's {@code toStringLine()}.
	 * @see CTNode#toStringLine()
	 */
	public String toStringLine()
	{
		return n_root.toStringLine();
	}
	
	/**
	 * @return the string produced by the root node's three-argument {@code toString};
	 * all arguments are passed through unchanged.
	 * @see CTNode#toString(boolean, boolean, String)
	 */
	public String toString(boolean includeLineNumbers, boolean includeAntecedentPointers, String delim)
	{
		return n_root.toString(includeLineNumbers, includeAntecedentPointers, delim);
	}
	
	/**
	 * Renders one terminal per line as its word-form, the delimiter, and its constituent tag.
	 * @param includeEmptyCategories if {@code true}, include empty categories;
	 * otherwise, only the terminals that are not empty categories are listed.
	 * @param delim the delimiter placed between the word-form and the constituent tag.
	 * @return the column representation, each line terminated by {@code "\n"}.
	 */
	public String toColumnPOS(boolean includeEmptyCategories, String delim)
	{
		// The flag used to be ignored; honour it the same way toForms(boolean, String) does.
		List<CTNode> nodes = includeEmptyCategories ? n_termainals : n_tokens;
		StringBuilder build = new StringBuilder();
		
		for (CTNode node : nodes)
		{
			build.append(node.getWordForm());
			build.append(delim);
			build.append(node.getConstituentTag());
			build.append("\n");
		}
		
		return build.toString();
	}
	
//	======================== PropBank ========================
	
	/**
	 * Assigns PropBank locations (terminal ID and height) to the nodes of this tree.
	 * Each terminal gets height 0; from there the ancestors are climbed and given
	 * increasing heights for as long as the parent has no location yet.
	 */
	public void initPBLocations()
	{
		for (CTNode terminal : n_termainals)
		{
			final int terminalID = terminal.getTerminalID();
			int height = 0;
			CTNode curr = terminal;
			
			curr.setPBLocation(terminalID, height);
			
			// Stop at the first ancestor that already received a location.
			while (curr.hasParent() && curr.getParent().getPBLocation() == null)
			{
				curr = curr.getParent();
				height++;
				curr.setPBLocation(terminalID, height);
			}
		}
	}
	
	/** Calls {@code initPropBank()} on every node of this tree, starting from the root. */
	public void initPropBank()
	{
		initPropBankAux(n_root);
	}
	
	/**
	 * Calls {@code initPropBank()} on the specific node and then, recursively, on its descendants.
	 * Called by {@link #initPropBank()}.
	 * @param node the root of the subtree to initialize.
	 */
	private void initPropBankAux(CTNode node)
	{
		node.initPropBank();
		
		for (CTNode child : node.getChildrenList())
			initPropBankAux(child);
	}
	
	/**
	 * Projects a PropBank instance onto this tree.
	 * The predicate terminal receives the roleset ID of the instance, and every
	 * node addressed by an argument location receives an arc to the predicate
	 * labeled with the argument label. Link arguments, undefined labels, and
	 * locations resolving to the predicate itself are skipped.
	 * The original documentation lists {@link #initPBLocations()} and
	 * {@link #initPropBank()} as preconditions; {@link #initPropBank()} is invoked
	 * here on demand when {@link #hasPropBank()} is {@code false}.
	 * @param instance the PropBank instance to project.
	 */
	public void initPBInstance(PBInstance instance)
	{
		final CTNode predicate = getTerminal(instance.getPredicateID());
		predicate.setPBRolesetID(instance.getRolesetID());
		
		if (!hasPropBank())
			initPropBank();
		
		for (PBArgument arg : instance.getArgumentList())
		{
			final String label = arg.getLabel();
			
			if (PBLib.isLinkArgument(label) || PBLib.isUndefinedLabel(label))
				continue;
			
			for (PBLocation loc : arg.getLocationList())
			{
				final CTNode argument = getNode(loc);
				
				// The predicate never becomes an argument of itself.
				if (argument != predicate)
					argument.addPBHead(new PBArc(predicate, label));
			}
		}
	}
	
	/**
	 * @return a new list of the tokens (terminals that are not empty categories)
	 * for which {@code isPBHead()} is {@code true}, in token order.
	 */
	public List<CTNode> getPBHeadList()
	{
		List<CTNode> predicates = new ArrayList<>();
		
		for (CTNode node : n_tokens)
		{
			if (node.isPBHead())
				predicates.add(node);
		}
		
		return predicates;
	}
	
	/**
	 * @return {@code true} if {@code getPBHeads()} of the root node is not {@code null}.
	 * NOTE(review): presumably this means {@link #initPropBank()} has been called -- confirm against {@code CTNode}.
	 */
	public boolean hasPropBank()
	{
		return n_root.getPBHeads() != null;
	}
	
	/**
	 * @return {@code true} if the first terminal node carries a named entity tag;
	 * {@code false} if it does not or if this tree has no terminal node.
	 */
	public boolean hasNamedEntity()
	{
		// A tree without terminals has no first terminal to inspect.
		return !n_termainals.isEmpty() && getTerminal(0).getNamedEntityTag() != null;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy