// com.as.text_understanding.tree_util.item.ItemFinder (Maven / Gradle / Ivy)
// Artifact: text-understanding
package com.as.text_understanding.tree_util.item;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import java.util.function.Predicate;
import com.as.text_understanding.common.TextUnderstandingException;
import com.as.text_understanding.representation.tree.Terminal;
import com.as.text_understanding.representation.tree.TreeItem;
import com.as.text_understanding.tree_travel.TreeTravelNode;
/**
*
*
* Date: Mar 12, 2016
* @author asher
*
*/
public class ItemFinder
{
public static final Set CLAUSE_SYMBOLS = buildSet(new String[]{"S","SBAR","SBARQ","SINV","SQ","RRC","WHAVP","WHNP","WHPP"});
public static final String COORDINATION_TAG = "CC";
public static final Set NON_CONTENT_POS_TAGS = buildSet(new String[]{"CC","DT","IN","LS","MD","SYM","TO"});
/**
* Returns a list of concepts, where each concept is a list (a sequence) of terminal-nodes, that are assumed to express a single
* "concept" in the the text, like "yellow flower", "group of children", etc. (think about sentences like "a group of children
* traveled to the forest and found a yellow flower".
*
* The returned list of concepts are concepts governed by the given node, i.e., the given node is an ancestor of these concepts,
* in the parse tree. In other words, all the concepts that are part of the given subtree (the given node is the subtree root).
*
* Usually, only one concept is returned, when the given subtree is an argument subtree. However, in case of coordination ("and",
* "or", etc.) more than one concepts is returned. For example "I have a son and a daughter": the second argument of the predicate
* "have" is "a son and a daughter", but it contains two concepts, 1) "son", 2) "daughter".
*
* Each concept is trimmed from non-content terminal nodes, like determiners.
*
*
* @param subtree
* @return
*/
public List> findItems(TreeTravelNode subtree)
{
List> grossConcepts = findItemsRegardslessContent(subtree, true);
List> ret = new ArrayList<>(grossConcepts.size());
for (List concept : grossConcepts)
{
ret.add(trimList(concept, terminalIsContentPredicate));
}
return ret;
}
/**
* Converts the given list of concepts, which is returned by {@link #findItems(TreeTravelNode)}, into a human-readable string.
* @param concepts
* @return
*/
public static String itemsToString(final List> concepts)
{
StringBuilder sb = new StringBuilder();
boolean multipleConcepts = (concepts.size()>1);
boolean multipleConceptsFirstIteration = true;
for (List concept : concepts)
{
if (multipleConcepts)
{
if (multipleConceptsFirstIteration) {multipleConceptsFirstIteration=false;}
else {sb.append(", ");}
sb.append("{");
}
boolean firstIteration = true;
for (TreeTravelNode node : concept)
{
if (!node.getItself().getItem().isTerminal()) throw new TextUnderstandingException("Non terminal node in concept.");
if (firstIteration) {firstIteration=false;}
else {sb.append(" ");}
sb.append(node.getItself().getItem().getTerminal().getToken());
}
if (multipleConcepts) {sb.append("}");}
}
return sb.toString();
}
/////////////// PRIVATE ///////////////
private List> findItemsRegardslessContent(TreeTravelNode subtree, boolean forceIncludeFirst)
{
if (subtree.getItself().getItem().isTerminal())
{
return Collections.singletonList(Collections.singletonList(subtree));
}
else
{
final String symbol = subtree.getItself().getItem().getSymbol();
if ( (!forceIncludeFirst) && CLAUSE_SYMBOLS.contains(symbol))
{
return Collections.emptyList();
}
else
{
List> ret = new LinkedList<>();
List currentItem = new LinkedList<>();
ret.add(currentItem);
boolean first = true;
for (TreeTravelNode child : subtree.getChildren())
{
if (nodeStartsCoordination(child))
{
currentItem = new LinkedList<>();
ret.add(currentItem);
}
List> ofChild = findItemsRegardslessContent(child, (first&&forceIncludeFirst) );
boolean firstConcept = true;
for (List ofChildConcept : ofChild)
{
if (!firstConcept)
{
currentItem = new LinkedList<>();
ret.add(currentItem);
}
currentItem.addAll(ofChildConcept);
firstConcept = false;
}
first = false;
}
return ret;
}
}
}
/**
* Trims a list by the given predicate. All the elements at the beginning of the list that are tested as false
* by the predicate are not included in the returned list. Similarly, all the elements at the end of the list that are
* tested as false are not included in the returned list.
*
* If the given list is not empty, this method does not return an empty list. If all should be trimmed, then
* the first item will not be trimmed.
*
* @param originalList a list
* @param predicateToSurvive a predicate over the elements of the list, such that all elements at the beginning and the end of
* the list that are tested as false by the predicate are not included in the returned list.
*
* @return A list similar to the original list, but trimmed at its beginning and its end.
*/
private static List trimList(List originalList, Predicate predicateToSurvive)
{
if (null==originalList) return null;
if (originalList.isEmpty()) return originalList;
int endExclusive = originalList.size();
ListIterator backwardIterator = originalList.listIterator(originalList.size());
while (backwardIterator.hasPrevious())
{
T item = backwardIterator.previous();
if (predicateToSurvive.test(item))
{
break;
}
--endExclusive;
}
int begin = 0;
ListIterator forwardIterator = originalList.listIterator();
while (forwardIterator.hasNext())
{
T item = forwardIterator.next();
if (predicateToSurvive.test(item))
{
break;
}
++begin;
}
if (begin>=endExclusive)
{
return Collections.singletonList(originalList.get(0)); // don't return an empty list.
}
else
{
List ret = new ArrayList<>(endExclusive-begin);
ListIterator iterator = originalList.listIterator();
int index = 0;
while (iterator.hasNext())
{
if (index>=endExclusive)
{
break;
}
T item = iterator.next();
if (index>=begin)
{
ret.add(item);
}
++index;
}
return ret;
}
}
private static boolean terminalIsContent(Terminal terminal)
{
final String token = terminal.getToken();
boolean letterOrDigitDetected = false;
for (char c : token.toCharArray())
{
if (Character.isLetterOrDigit(c))
{
letterOrDigitDetected = true;
break;
}
}
if (letterOrDigitDetected)
{
if (!NON_CONTENT_POS_TAGS.contains(terminal.getTag()))
{
return true;
}
}
return false;
}
private static boolean nodeStartsCoordination(TreeTravelNode node)
{
TreeItem item = node.getItself().getItem();
if (item.isTerminal())
{
if (COORDINATION_TAG.equals(item.getTerminal().getTag()))
{
return true;
}
}
return false;
}
@SafeVarargs
private static Set buildSet(T...ts)
{
Set ret = new LinkedHashSet<>();
for (T t : ts) {ret.add(t);}
return ret;
}
private static class TerminalIsContentPredicate implements Predicate
{
@Override
public boolean test(TreeTravelNode t)
{
TreeItem item = t.getItself().getItem();
if (!item.isTerminal()) throw new TextUnderstandingException("Encountered a non-terminal node in concept.");
return terminalIsContent(item.getTerminal());
}
}
private static final TerminalIsContentPredicate terminalIsContentPredicate = new TerminalIsContentPredicate();
}