All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.as.text_understanding.pasta.Pasta Maven / Gradle / Ivy

The newest version!
package com.as.text_understanding.pasta;

import static com.as.text_understanding.common.TextUnderstandingUtilities.each;
import static com.as.text_understanding.tree_util.TreeUtilities.treeToYield;
import static com.as.text_understanding.tree_util.TreeUtilities.yieldToString;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import com.as.text_understanding.common.TextUnderstandingException;
import com.as.text_understanding.representation.pasta.Argument;
import com.as.text_understanding.representation.pasta.ArgumentType;
import com.as.text_understanding.representation.pasta.Predicate;
import com.as.text_understanding.representation.pasta.PredicateAndArguments;
import com.as.text_understanding.representation.tree.TreeItem;
import com.as.text_understanding.representation.tree.TreeNode;
import com.as.text_understanding.tree_travel.TreeTravelNode;
import com.as.text_understanding.tree_util.TreeUtilities;
import com.as.text_understanding.tree_util.head.HeadFinder;
import com.as.text_understanding.tree_util.item.ItemFinder;
import com.as.text_understanding.uima_annotators.pasta.DemoPastaAnnotator;
import com.as.text_understanding.uima_annotators.pasta.FromDkproPastaAnnotator;
import com.as.text_understanding.uima_annotators.pasta.PastaAnnotator;

/**
 * This class gets a tree as an input (as a {@link TreeTravelNode}), and produces a list of {@link PredicateAndArguments} that
 * resembles the predicate-argument structures expressed by that tree.
 * 
* Note that this class has nothing to do with UIMA, and in particular does not operate over UIMA objects. *

* Usage: Use the constructor with the tree, then call {@link #annotate()}, then call {@link #getResult()} to get the PASTA result. *
* How to create a tree? use a parser. One option is to use a parser from DKPro UIMA annotators and convert the input into a tree * in the form of {@link TreeTravelNode}. An example of that can be found at {@link FromDkproPastaAnnotator} (See {@link DemoPastaAnnotator}, * {@link FromDkproPastaAnnotator}, {@link PastaAnnotator}). * *

* Date: Feb 24, 2016 * @author Asher Stern * */ public class Pasta { //////////////////// PUBLIC //////////////////// public static Set ARGUMENT_PHRASE_SYMBOLS = new LinkedHashSet<>(Arrays.asList(new String[]{"NP", "PP", "ADJP", "ADVP"})); public Pasta(TreeTravelNode root) { super(); this.root = root; } public void annotate() { alreadyHandled = new LinkedHashSet<>(); mapDetectedArguments = new LinkedHashMap<>(); result = new LinkedList<>(); bfsQueue = new LinkedList<>(); bfsQueue.add(root); while (!bfsQueue.isEmpty()) { TreeTravelNode next = bfsQueue.remove(); process(next); } } public List getResult() { return result; } public static String pasResultToString(final List pasResult) { return pasResultToString(pasResult, false, false); } public static String pasResultToString(final List pasResult, boolean printHead, boolean printItem) { StringBuilder sb = new StringBuilder(); for (final PredicateAndArguments predicateAndArguments : pasResult) { if (!isTerminal(predicateAndArguments.getPredicate().getVerbNode())) {throw new TextUnderstandingException("verb node is a non-terminal");} final String verbWord = predicateAndArguments.getPredicate().getVerbNode().getItself().getItem().getTerminal().getToken(); sb.append(verbWord).append("\n"); for (final Argument argument : predicateAndArguments.getArguments()) { sb.append("\t").append(argument.isClause()?" (c)":"").append(" (").append(argument.getType().name()).append(") "); if (argument.getPreposition()!=null) { sb.append("<").append(printPreposition(argument.getPreposition())).append("> "); } if (printHead) { TreeNode headNode = HeadFinder.findTerminalHead(argument.getSubtree().getItself()); final String head = headNode.getItem().getTerminal().getToken(); sb.append(head).append(": "); } if (printItem && (!argument.isClause())) { ItemFinder itemFinder = new ItemFinder(); sb.append(ItemFinder.itemsToString(itemFinder.findItems(argument.getSubtree()))).append(": "); } sb.append(yieldToString(treeToYield(argument.getSubtree().getItself()))).append("\n"); } } return sb.toString(); } //////////////////// PRIVATE //////////////////// private void process(TreeTravelNode node) { if (!isTerminal(node)) { final String symbol = node.getItself().getItem().getSymbol(); if (symbol.equals("VP")) { if (!alreadyHandled.contains(node)) { findPas(node); } } else if (symbol.equals("SBAR")) { handleSBAR(node); } else if (symbol.equals("RRC")) { handleRRC(node); } if (node.getChildren()!=null) { for (TreeTravelNode child : node.getChildren()) { bfsQueue.add(child); } } } } // Assumes the input is a VP node. private Set findActualVerbsVP(TreeTravelNode node) { ArrayList nonTerminalChildren = node.getNonTerminalChildren(); boolean actualVPsInChildren = true; boolean vpDetected = false; for (TreeTravelNode child : each(nonTerminalChildren)) { String symbol = child.getItself().getItem().getSymbol(); if (symbol.equals("VP")) { vpDetected=true; } if ( (!symbol.equals("VP")) && (!symbol.equals("CC")) ) { actualVPsInChildren = false; } } if (!vpDetected) { actualVPsInChildren = false; } if (actualVPsInChildren) { Set ret = new LinkedHashSet<>(); for (TreeTravelNode child : each(nonTerminalChildren)) { String symbol = child.getItself().getItem().getSymbol(); if (symbol.equals("VP")) { ret.addAll(findActualVerbsVP(child)); } } return ret; } else { Set ret = new LinkedHashSet<>(); ret.add(node); return ret; } } // TODO what about "S", "SINV", etc. ? TreeTravelNode findSubject(TreeTravelNode vpNode) { TreeTravelNode ret = vpNode.getLastPredecessorSibling("NP"); if (null==ret) { ret = vpNode.getFirstSibling("NP"); } if (null==ret) { ret = vpNode.getFirstSibling("PP"); } return ret; } // This includes subject! private List findSiblingArguments(TreeTravelNode vpNode) { List ret = new LinkedList<>(); ArrayList siblings = vpNode.getAllSiblings(); for (TreeTravelNode sibling : each(siblings)) { TreeItem siblingItem = sibling.getItself().getItem(); if (!siblingItem.isTerminal()) { String symbol = siblingItem.getSymbol(); if ( ARGUMENT_PHRASE_SYMBOLS.contains(symbol) || symbol.startsWith("S") ) { ret.add(sibling); } } } return ret; } private List findInVPArguments(TreeTravelNode vpNode) { List ret = new LinkedList<>(); ArrayList children = vpNode.getChildren(); for (TreeTravelNode child : each(children)) { if (!isTerminal(child)) { String symbol = child.getItself().getItem().getSymbol(); if (symbol.length()>0) { if ( ARGUMENT_PHRASE_SYMBOLS.contains(symbol) || symbol.startsWith("S") ) { ret.add(child); } } } } return ret; } // The given node is some node that might be an argument: // if it is NP - it is an argument, // if it is PP - it's first NP child is an argument, // otherwise, it is not an argument. private TreeTravelNode findArgumentInNodeItself(TreeTravelNode node) { if (node!=null) { TreeItem nodeItem = node.getItself().getItem(); String symbol = nodeItem.getSymbol(); if (symbol.equals("PP")) { ArrayList ppChildren = node.getNonTerminalChildren(); for (TreeTravelNode ppChild : each(ppChildren)) { String childSymbol = ppChild.getItself().getItem().getSymbol(); if ( childSymbol.equals("NP" ) || childSymbol.startsWith("S") ) { return ppChild; } } } } return node; } private void addArgumentToDetectedArguments(TreeTravelNode vpNode, final Argument argument) { // const unsigned int uniqueID = argument.getRoot()->getUniqueId(); if (mapDetectedArguments.containsKey(vpNode)) { List ofNode = mapDetectedArguments.get(vpNode); boolean alreadyExists = false; for (Argument existingArgument : each(ofNode)) { if (existingArgument.getSubtree()==argument.getSubtree()) { alreadyExists=true; break; } } if (!alreadyExists) { mapDetectedArguments.get(vpNode).add(argument); } } else { List _list = new LinkedList<>(); _list.add(argument); mapDetectedArguments.put(vpNode, _list); } } private void handleRRC(TreeTravelNode node) { TreeTravelNode childVP = null; ArrayList children = node.getChildren(); for (TreeTravelNode child : children) { TreeItem childItem = child.getItself().getItem(); if (!isTerminal(child)) { if (childItem.getSymbol().equals("VP")) { childVP = child; break; } } } if (childVP != null) { TreeTravelNode sibling = node.getImmediateNonTerminalPredecessorSibling(); TreeTravelNode argumentNode = findArgumentInNodeItself(sibling); if (argumentNode!=null) { // addArgumentToDetectedArguments(childVP, new Argument(ArgumentType.SUBJECT, false, argumentNode)); addArgumentToDetectedArguments(childVP, buildArgument(sibling, ArgumentType.SUBJECT) ); // buildArgument on the sibling will find again argumentNode, which is not null. } } } private void handleSBAR(TreeTravelNode node) { boolean containsWH_P = false; for (TreeTravelNode child : node.getChildren()) { if (!isTerminal(child)) { final String childSymbol = child.getItself().getItem().getSymbol(); if ( childSymbol.startsWith("WH") && childSymbol.endsWith("P") ) { containsWH_P=true; break; } } } if (containsWH_P) { TreeTravelNode vpDescendant = node.getFirstDescendant("VP", false, true, false); if (vpDescendant!=null) { TreeTravelNode subjectNode = null; subjectNode = node.getLastPredecessorSibling("NP"); if (subjectNode==null) { subjectNode = node.getLastPredecessorSibling("PP"); } if (subjectNode!=null) { // subjectNode = findArgumentInNodeItself(subjectNode); // if (subjectNode==null) { throw new TextUnderstandingException("findArgumentInNodeItself returned nullptr unexpectedly."); } // addArgumentToDetectedArguments(vpDescendant, new Argument(ArgumentType.SUBJECT, false, subjectNode) ); addArgumentToDetectedArguments(vpDescendant, buildArgument(subjectNode, ArgumentType.SUBJECT) ); } } } } private LinkedList mergeArgumentLists(final List highPriorityList, final List lowPriorityList) { LinkedList ret = new LinkedList<>(); if (highPriorityList!=null) { ret.addAll(highPriorityList); } LinkedHashSet highPriorityNodes = new LinkedHashSet<>(); if (highPriorityList!=null) { for (Argument argument : highPriorityList) { highPriorityNodes.add(argument.getSubtree()); } } boolean subjectAlreadyExists=false; if (highPriorityList!=null) { for (Argument argument : highPriorityList) { if (argument.getType()==ArgumentType.SUBJECT) { subjectAlreadyExists=true; break; } } } if (lowPriorityList!=null) { for (final Argument argument : lowPriorityList) { if (!highPriorityNodes.contains(argument.getSubtree())) // it does not exist in the other list. { if (subjectAlreadyExists && (argument.getType()==ArgumentType.SUBJECT) ) { ret.add(new Argument(ArgumentType.OBJECT, argument.isClause(), argument.getSubtree())); } else { ret.add(argument); } } } } return ret; } private Argument buildArgument(final TreeTravelNode argumentRoot, ArgumentType typeIfNP_PP_S) { TreeTravelNode argumentNode = argumentRoot; List preposition = null; if (argumentRoot.getItself().getItem().getSymbol().equals("PP")) { argumentNode = findArgumentInNodeItself(argumentRoot); if (argumentNode==null) {throw new TextUnderstandingException("findArgumentInNodeItself returned nullptr unexpectedly.");} preposition = findPreposition(argumentRoot); } ArgumentType type = typeIfNP_PP_S; final String argumentSymbol = argumentRoot.getItself().getItem().getSymbol(); // argumentSymbol.size() should not be 0, but, to be on the safe side I add this check if ( (argumentSymbol.length()==0) || ( (!argumentSymbol.equals("NP")) && (!argumentSymbol.equals("PP")) && (!argumentSymbol.startsWith("S")) ) ) { type = ArgumentType.MODIFIER; } boolean clause = (argumentSymbol.length()>0) && (argumentSymbol.startsWith("S")); return new Argument(type, clause, preposition, argumentNode); } private List findPreposition(final TreeTravelNode prepositionPhrase) { ArrayList children = prepositionPhrase.getChildren(); List ret = new LinkedList<>(); int index=0; for (TreeTravelNode child : children) { if ( (index+1)==children.size() ) { break; } if (!child.getItself().getItem().isTerminal()) { if (child.getItself().getItem().getSymbol().equals("NP")) { break; } } ret.add(child); ++index; } return ret; } private void findPas(TreeTravelNode node) { LinkedList arguments = new LinkedList<>(); Set actualVPs = findActualVerbsVP(node); TreeTravelNode subject = findSubject(node); if (subject!=null) { // TreeTravelNode argumentNode = subject; // if (subject.getItself().getItem().getSymbol().equals("PP")) // { // argumentNode = findArgumentInNodeItself(subject); // if (argumentNode==null) {throw new TextUnderstandingException("findArgumentInNodeItself returned nullptr unexpectedly.");} // } // arguments.add(new Argument(ArgumentType.SUBJECT, false, argumentNode)); arguments.add(buildArgument(subject, ArgumentType.SUBJECT)); } List siblingArguments = findSiblingArguments(node); for (TreeTravelNode siblingArgument : siblingArguments) { if ( (subject==null) || (subject!=siblingArgument) ) // = if this is not the subject (that was added earlier) { arguments.add(buildArgument(siblingArgument, ArgumentType.OBJECT)); } } for (TreeTravelNode actualVP : actualVPs) { if (!alreadyHandled.contains(actualVP)) { TreeTravelNode verbNode = actualVP.getFirstDescendant("V", true, false, true); if (verbNode!=null) { LinkedList actualArguments = new LinkedList<>(); actualArguments.addAll(arguments); List inVpArguments = findInVPArguments(actualVP); for (TreeTravelNode inVpArgumentNode : inVpArguments) { actualArguments.add(buildArgument(inVpArgumentNode, ArgumentType.OBJECT)); } if (mapDetectedArguments.containsKey(actualVP)) { actualArguments = mergeArgumentLists(mapDetectedArguments.get(node), actualArguments); } if (mapDetectedArguments.containsKey(node)) { actualArguments = mergeArgumentLists(mapDetectedArguments.get(node), actualArguments); } Predicate predicate = new Predicate(actualVP, verbNode); ArrayList argumentsVector = new ArrayList<>(actualArguments.size()); argumentsVector.addAll(actualArguments); PredicateAndArguments predicateAndArguments = new PredicateAndArguments(predicate, argumentsVector); result.add(predicateAndArguments); } alreadyHandled.add(actualVP); } } alreadyHandled.add(node); } private static boolean isTerminal(TreeTravelNode node) { return node.getItself().getItem().isTerminal(); } private static final String printPreposition(TreeTravelNode prepositionNode) { return TreeUtilities.yieldToString(TreeUtilities.treeToYield(prepositionNode.getItself())); } private static final String printPreposition(List prepositionNodes) { StringBuilder sb = new StringBuilder(); boolean firstIteration = true; for (TreeTravelNode node : prepositionNodes) { if (firstIteration){firstIteration=false;} else {sb.append(" ");} sb.append(printPreposition(node)); } return sb.toString(); } // private static final String printPreposition(TreeTravelNode prepositionNode) // { // System.out.println(prepositionNode.getIndex()); // System.out.println(prepositionNode.getItself().getItem().getSymbol()); // ArrayList children = prepositionNode.getChildren(); // StringBuilder sb = new StringBuilder(); // boolean firstIteration = true; // int index = 0; // for (TreeTravelNode child : children) // { // if ( (index+1) >= children.size() ) // { // break; // } // TreeItem childItem = child.getItself().getItem(); // if (!childItem.isTerminal()) // { // if (childItem.getSymbol().equals("NP")) // break; // } // if (firstIteration) {firstIteration=false;} // else {sb.append(" ");} // sb.append(TreeUtilities.yieldToString(TreeUtilities.treeToYield(child.getItself()))); // // ++index; // } // // return sb.toString(); // } private final TreeTravelNode root; private Set alreadyHandled; private Map> mapDetectedArguments; // map from a VP to its arguments that were detected earlier. private List result; private Queue bfsQueue; @SuppressWarnings("unused") private static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(Pasta.class); }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy