edu.stanford.nlp.ling.tokensregex.SequenceMatcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.util.*;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import static edu.stanford.nlp.ling.tokensregex.SequenceMatcher.FindType.FIND_NONOVERLAPPING;
/**
* A generic sequence matcher.
*
*
* Similar to Java's {@code Matcher} except it matches sequences over an arbitrary type {@code T}
* instead of characters.
* For a type {@code T} to be matchable, it has to have a corresponding {@code NodePattern} that indicates
* whether a node is matched or not.
*
*
*
* A matcher is created as follows:
*
* SequencePattern p = SequencePattern.compile("...");
* SequenceMatcher m = p.getMatcher(List sequence);
*
*
*
*
* Functions for searching
*
* boolean matches()
* boolean find()
* boolean find(int start)
*
* Functions for retrieving matched patterns
*
* int groupCount()
* List<T> groupNodes(), List<T> groupNodes(int g)
* String group(), String group(int g)
* int start(), int start(int g), int end(), int end(int g)
*
* Functions for replacing
*
* List<T> replaceFirst(List<T> seq), List<T> replaceAll(List<T> seq)
* List<T> replaceFirstExtended(List<MatchReplacement<T>> seq), List<T> replaceAllExtended(List<MatchReplacement<T>> seq)
*
* Functions for defining the region of the sequence to search over
* (default region is entire sequence)
*
* void region(int start, int end)
* int regionStart()
* int regionEnd()
*
*
*
*
* NOTE: When find is used, matches are attempted starting from the specified start index of the sequence.
* The match with the earliest starting index is returned.
*
*
* @author Angel Chang
*/
public class SequenceMatcher extends BasicSequenceMatchResult {
// Logger for this class
private static final Logger logger = Logger.getLogger(SequenceMatcher.class.getName());
// True once the most recent find/matches attempt has run to completion
boolean matchingCompleted = false;
// True if the most recent find/matches attempt found a match
boolean matched = false;
boolean matchWithResult = false; // If result of matches should be kept
// Index at which the next find() resumes; set to -1 by a find that found nothing
int nextMatchStart = 0;
// Start of the region to search; changed through region(int, int)
int regionStart = 0;
// End (exclusive) of the region to search; the constructor sets it to elements.size()
int regionEnd = -1;
// TODO: Check and fix implementation for FIND_ALL
/**
 * Type of search to perform
 * <ul>
 * <li>FIND_NONOVERLAPPING - Find nonoverlapping matches (default)</li>
 * <li>FIND_ALL - Find all potential matches.
 * Greedy/reluctant quantifiers are not enforced
 * (perhaps should add syntax where some of them are enforced...)</li>
 * </ul>
 */
public enum FindType { FIND_NONOVERLAPPING, FIND_ALL }
// Search mode that find() and findMatchStart(...) dispatch on
FindType findType = FIND_NONOVERLAPPING;
// For FIND_ALL: iterator over the match indices reported by curMatchStates after a successful search
Iterator curMatchIter = null;
// For FIND_ALL: the states object saved by findMatchStartNoBacktracking
MatchedStates curMatchStates = null;
// Branching limit for searching with back tracking. Higher value makes the search faster but uses more memory.
// A negative value turns off the branch-size check in findMatchStartBacktracking.
int branchLimit = 2;
/**
 * Creates a matcher that applies {@code pattern} to {@code elements}.
 * The search region is initialised to the whole sequence, and the priority,
 * score and variable group bindings are taken from the pattern.
 *
 * @param pattern  compiled pattern to match with
 * @param elements sequence to match against; must not change while matching is in progress
 * @throws IllegalArgumentException if {@code elements} is null
 * @throws NullPointerException if {@code pattern} is null
 */
protected SequenceMatcher(SequencePattern pattern, List<? extends T> elements)
{
  // Validate before touching any state so a rejected argument leaves nothing half-initialised
  if (elements == null) {
    throw new IllegalArgumentException("Cannot match against null elements");
  }
  this.pattern = pattern;
  // NOTE: It is important elements DO NOT change as we do matches
  // TODO: Should we just make a copy of the elements?
  this.elements = elements;
  this.regionEnd = elements.size();
  this.priority = pattern.priority;
  this.score = pattern.weight;
  this.varGroupBindings = pattern.varGroupBindings;
  // One slot per group declared by the pattern
  matchedGroups = new MatchedGroup[pattern.totalGroups];
}
/**
 * Sets the branching limit used by the backtracking search.
 * The backtracking search only checks the limit when it is non-negative.
 *
 * @param blimit new branching limit
 */
public void setBranchLimit(int blimit) {
  branchLimit = blimit;
}
/**
 * Interface that specifies what to replace a matched pattern with.
 *
 * @param <T> type of the elements in the matched sequence
 */
public static interface MatchReplacement<T> {
  /**
   * Append to replacement list.
   *
   * @param match Current matched sequence
   * @param list replacement list
   */
  public void append(SequenceMatchResult<T> match, List list);
}
/**
 * Replacement item is a sequence of items.
 *
 * @param <T> type of the replacement items
 */
public static class BasicMatchReplacement<T> implements MatchReplacement<T> {
  List<T> replacement;

  /**
   * @param replacement items to insert in place of each match
   */
  public BasicMatchReplacement(T... replacement) {
    this.replacement = Arrays.asList(replacement);
  }

  /**
   * @param replacement items to insert in place of each match (the list is kept, not copied)
   */
  public BasicMatchReplacement(List<T> replacement) {
    this.replacement = replacement;
  }

  /**
   * Append to replacement list our list of replacement items.
   *
   * @param match Current matched sequence (not consulted)
   * @param list replacement list
   */
  @Override
  public void append(SequenceMatchResult<T> match, List list) {
    list.addAll(replacement);
  }
}
/**
 * Replacement item is a matched group specified with a group name.
 *
 * @param <T> type of the elements in the matched sequence
 */
public static class NamedGroupMatchReplacement<T> implements MatchReplacement<T> {
  String groupName;

  /**
   * @param groupName name of the group whose matched nodes are inserted
   */
  public NamedGroupMatchReplacement(String groupName) {
    this.groupName = groupName;
  }

  /**
   * Append to replacement list the matched group with the specified group name.
   *
   * @param match Current matched sequence
   * @param list replacement list
   */
  @Override
  public void append(SequenceMatchResult<T> match, List list) {
    List<? extends T> nodes = match.groupNodes(groupName);
    // NOTE(review): groupNodes presumably returns null for a group that took no part
    // in the match - confirm; such a group contributes nothing instead of failing in addAll.
    if (nodes != null) {
      list.addAll(nodes);
    }
  }
}
/**
 * Replacement item is a matched group specified with a group id.
 *
 * @param <T> type of the elements in the matched sequence
 */
public static class GroupMatchReplacement<T> implements MatchReplacement<T> {
  int group;

  /**
   * @param group id of the group whose matched nodes are inserted
   */
  public GroupMatchReplacement(int group) {
    this.group = group;
  }

  /**
   * Append to replacement list the matched group with the specified group id.
   *
   * @param match Current matched sequence
   * @param list replacement list
   */
  @Override
  public void append(SequenceMatchResult<T> match, List list) {
    List<? extends T> nodes = match.groupNodes(group);
    // NOTE(review): groupNodes presumably returns null for a group that took no part
    // in the match - confirm; such a group contributes nothing instead of failing in addAll.
    if (nodes != null) {
      list.addAll(nodes);
    }
  }
}
/**
* Replaces all occurrences of the pattern with the specified list
* of replacement items (can include matched groups).
* @param replacement What to replace the matched sequence with
* @return New list with all occurrences of the pattern replaced
* @see #replaceFirst(java.util.List)
* @see #replaceFirstExtended(java.util.List)
* @see #replaceAllExtended(java.util.List)
*/
public List replaceAllExtended(List> replacement) {
List res = new ArrayList();
FindType oldFindType = findType;
findType = FindType.FIND_NONOVERLAPPING;
int index = 0;
while (find()) {
// Copy from current index to found index
res.addAll(elements().subList(index, start()));
for (MatchReplacement r:replacement) {
r.append(this, res);
}
index = end();
}
res.addAll(elements().subList(index, elements().size()));
findType = oldFindType;
return res;
}
/**
* Replaces the first occurrence of the pattern with the specified list
* of replacement items (can include matched groups).
* @param replacement What to replace the matched sequence with
* @return New list with the first occurrence of the pattern replaced
* @see #replaceFirst(java.util.List)
* @see #replaceAll(java.util.List)
* @see #replaceAllExtended(java.util.List)
*/
public List replaceFirstExtended(List> replacement) {
List res = new ArrayList();
FindType oldFindType = findType;
findType = FindType.FIND_NONOVERLAPPING;
int index = 0;
if (find()) {
// Copy from current index to found index
res.addAll(elements().subList(index, start()));
for (MatchReplacement r:replacement) {
r.append(this, res);
}
index = end();
}
res.addAll(elements().subList(index, elements().size()));
findType = oldFindType;
return res;
}
/**
 * Replaces all occurrences of the pattern with the specified list.
 * Use {@link #replaceAllExtended(java.util.List)} to replace with matched groups.
 * The search is always done in {@code FIND_NONOVERLAPPING} mode; the matcher's
 * own find type is put back afterwards, even if matching throws.
 * Searching continues from the matcher's current position (the matcher is not reset).
 *
 * @param replacement What to replace the matched sequence with
 * @return New list with all occurrences of the pattern replaced
 * @see #replaceAllExtended(java.util.List)
 * @see #replaceFirst(java.util.List)
 * @see #replaceFirstExtended(java.util.List)
 */
public List replaceAll(List replacement) {
  List res = new ArrayList();
  FindType oldFindType = findType;
  findType = FindType.FIND_NONOVERLAPPING;
  try {
    int index = 0;
    while (find()) {
      // Copy from current index to found index
      res.addAll(elements().subList(index, start()));
      res.addAll(replacement);
      index = end();
    }
    // Copy whatever follows the last match
    res.addAll(elements().subList(index, elements().size()));
  } finally {
    // Restore the caller's search mode on every exit path
    findType = oldFindType;
  }
  return res;
}
/**
 * Replaces the first occurrence of the pattern with the specified list.
 * Use {@link #replaceFirstExtended(java.util.List)} to replace with matched groups.
 * The search is always done in {@code FIND_NONOVERLAPPING} mode; the matcher's
 * own find type is put back afterwards, even if matching throws.
 * Searching continues from the matcher's current position (the matcher is not reset).
 *
 * @param replacement What to replace the matched sequence with
 * @return New list with the first occurrence of the pattern replaced
 * @see #replaceAll(java.util.List)
 * @see #replaceAllExtended(java.util.List)
 * @see #replaceFirstExtended(java.util.List)
 */
public List replaceFirst(List replacement) {
  List res = new ArrayList();
  FindType oldFindType = findType;
  findType = FindType.FIND_NONOVERLAPPING;
  try {
    int index = 0;
    if (find()) {
      // Copy from current index to found index
      res.addAll(elements().subList(index, start()));
      res.addAll(replacement);
      index = end();
    }
    // Copy whatever follows the match (or everything, if nothing matched)
    res.addAll(elements().subList(index, elements().size()));
  } finally {
    // Restore the caller's search mode on every exit path
    findType = oldFindType;
  }
  return res;
}
/**
 * @return the type of search that {@link #find()} currently performs
 */
public FindType getFindType() {
  return this.findType;
}
/**
 * Selects the type of search that {@link #find()} performs.
 *
 * @param findType search type to use from now on
 */
public void setFindType(FindType findType) {
  this.findType = findType;
}
/**
 * @return true if the result of matches should be kept
 */
public boolean isMatchWithResult() {
  return this.matchWithResult;
}
/**
 * Sets whether the result of matches should be kept.
 *
 * @param matchWithResult true to keep the result of matches
 */
public void setMatchWithResult(boolean matchWithResult) {
  this.matchWithResult = matchWithResult;
}
/**
 * Resets the matcher and then searches for the pattern from the given start index.
 *
 * @param start index at which to start the search
 * @return true if a match is found (false otherwise)
 * @throws IndexOutOfBoundsException if start is {@literal <} 0 or larger than the size of the sequence
 * @see #find()
 */
public boolean find(int start)
{
  final boolean startInRange = (start >= 0) && (start <= elements.size());
  if (!startInRange) {
    throw new IndexOutOfBoundsException("Invalid region start=" + start + ", need to be between 0 and " + elements.size());
  }
  reset();
  return find(start, false);
}
/**
 * Searches for a match and records the outcome in the matcher state.
 * On success the position for the next search is the end of the match for
 * {@code FIND_NONOVERLAPPING}, otherwise one past the start of the match;
 * on failure it is set to -1.
 *
 * @param start      index at which to start the search
 * @param matchStart if true, only a match beginning exactly at {@code start} is attempted;
 *                   otherwise every index from {@code start} up to the region end is tried in order
 * @return true if a match is found (false otherwise)
 */
protected boolean find(int start, boolean matchStart)
{
  matched = false;
  matchingCompleted = false;
  boolean found = false;
  if (matchStart) {
    found = findMatchStart(start, false);
  } else {
    // Try successive start positions until one matches or the region is exhausted
    int pos = start;
    while (!found && pos < regionEnd) {
      found = findMatchStart(pos, false);
      pos++;
    }
  }
  // State must be updated before start()/end() are consulted below
  matched = found;
  matchingCompleted = true;
  if (!found) {
    nextMatchStart = -1;
  } else if (findType == FindType.FIND_NONOVERLAPPING) {
    nextMatchStart = end();
  } else {
    nextMatchStart = start() + 1;
  }
  return found;
}
/**
 * Searches for the pattern starting at the position recorded by the previous search.
 * A negative recorded position means there is nothing left to find.
 *
 * @return true if a match is found (false otherwise)
 */
private boolean findNextNonOverlapping()
{
  return (nextMatchStart >= 0) && find(nextMatchStart, false);
}
/**
 * Finds the next match for {@code FIND_ALL}.
 * Alternatives left over from the last successful search are handed out first;
 * once they are used up a new search is started from the recorded next position.
 *
 * @return true if a match is found (false otherwise)
 */
private boolean findNextAll()
{
  final boolean hasPending = (curMatchIter != null) && curMatchIter.hasNext();
  if (hasPending) {
    // Expose the next alternative of the previous search
    int branch = curMatchIter.next();
    curMatchStates.setMatchedGroups(branch);
    return true;
  }
  if (nextMatchStart < 0) {
    return false;
  }
  boolean found = find(nextMatchStart, false);
  if (!found) {
    return false;
  }
  // Fresh search succeeded: start iterating over its match indices
  Collection branches = curMatchStates.getMatchIndices();
  curMatchIter = branches.iterator();
  int first = curMatchIter.next();
  curMatchStates.setMatchedGroups(first);
  return true;
}
/**
* Applies the matcher and returns all non overlapping matches
* @return a Iterable of match results
*/
public Iterable> findAllNonOverlapping() {
Iterator> iter = new Iterator>() {
SequenceMatchResult next;
private SequenceMatchResult getNext() {
boolean found = find();
if (found) {
return toBasicSequenceMatchResult();
} else {
return null;
}
}
@Override
public boolean hasNext() {
if (next == null) {
next = getNext();
return (next != null);
} else {
return true;
}
}
@Override
public SequenceMatchResult next() {
if (!hasNext()) { throw new NoSuchElementException(); }
SequenceMatchResult res = next;
next = null;
return res;
}
public void remove() {
throw new UnsupportedOperationException();
}
};
return new IterableIterator>(iter);
}
/**
 * Searches for the next occurrence of the pattern, using the search
 * strategy selected by the current find type.
 *
 * @return true if a match is found (false otherwise)
 * @throws UnsupportedOperationException if the find type is not handled
 * @see #find(int)
 */
public boolean find()
{
  final boolean found;
  switch (findType) {
    case FIND_NONOVERLAPPING:
      found = findNextNonOverlapping();
      break;
    case FIND_ALL:
      found = findNextAll();
      break;
    default:
      throw new UnsupportedOperationException("Unsupported findType " + findType);
  }
  return found;
}
/**
 * Attempts a match beginning at {@code start}, delegating to the
 * backtracking search for {@code FIND_NONOVERLAPPING} and to the
 * non-backtracking search for {@code FIND_ALL}.
 *
 * @param start          index at which the match has to begin
 * @param matchAllTokens passed through to the selected search
 * @return true if a match is found (false otherwise)
 * @throws UnsupportedOperationException if the find type is not handled
 */
protected boolean findMatchStart(int start, boolean matchAllTokens)
{
  final boolean found;
  switch (findType) {
    case FIND_NONOVERLAPPING:
      found = findMatchStartBacktracking(start, matchAllTokens);
      break;
    case FIND_ALL:
      // TODO: Should use backtracking here too, need to keep track of todo stack
      // so we can recover after finding a match
      found = findMatchStartNoBacktracking(start, matchAllTokens);
      break;
    default:
      throw new UnsupportedOperationException("Unsupported findType " + findType);
  }
  return found;
}
/**
 * Attempts a match beginning at {@code start} without backtracking -
 * alternative matches are stored as we go.
 * The states object used for the search is saved in {@code curMatchStates}
 * so that {@code FIND_ALL} can walk over its alternatives afterwards.
 *
 * @param start          index at which the match has to begin
 * @param matchAllTokens if true, the search never stops early and runs until the
 *                       region end or until no states are left
 * @return true if a match is found (false otherwise)
 */
protected boolean findMatchStartNoBacktracking(int start, boolean matchAllTokens)
{
  // Fixed toggle: stop early only when isAllMatch() holds (isMatch() is used when false)
  boolean matchAll = true;
  MatchedStates cStates = getStartStates();
  // Save cStates for FIND_ALL ....
  curMatchStates = cStates;
  for (int i = start; i < regionEnd; i++) {
    cStates.match(i);
    // No states left: nothing further can match from this start
    if (cStates.size() == 0) {
      break;
    }
    if (!matchAllTokens) {
      if ((matchAll && cStates.isAllMatch())
          || (!matchAll && cStates.isMatch())) {
        cStates.completeMatch();
        return true;
      }
    }
  }
  cStates.completeMatch();
  return cStates.isMatch();
}
/**
 * Attempts a match beginning at {@code start}, doing some backtracking.
 * Whenever the number of branches exceeds a non-negative {@code branchLimit},
 * part of the states is split off and pushed on a stack; a split-off part is
 * resumed (from the position after its {@code curPosition}) only if the part
 * being worked on did not lead to a match.
 * NOTE(review): {@code split(branchLimit)} presumably keeps up to
 * {@code branchLimit} branches and returns the rest - confirm against MatchedStates.
 *
 * @param start          index at which the match has to begin
 * @param matchAllTokens if true, the search never stops early and runs until the
 *                       region end or until no states are left
 * @return true if a match is found (false otherwise)
 * @throws RuntimeInterruptedException if the current thread has been interrupted
 */
protected boolean findMatchStartBacktracking(int start, boolean matchAllTokens)
{
  // Fixed toggle: stop early only when isAllMatch() holds (isMatch() is used when false)
  boolean matchAll = true;
  Stack<MatchedStates> todo = new Stack<MatchedStates>();
  MatchedStates cStates = getStartStates();
  // Position is "last consumed index", so the first index matched below is start
  cStates.curPosition = start-1;
  todo.push(cStates);
  while (!todo.empty()) {
    cStates = todo.pop();
    int s = cStates.curPosition+1;
    for (int i = s; i < regionEnd; i++) {
      if (Thread.interrupted()) {
        throw new RuntimeInterruptedException();
      }
      cStates.match(i);
      // No states left: this part of the search is a dead end
      if (cStates.size() == 0) {
        break;
      }
      if (!matchAllTokens) {
        if ((matchAll && cStates.isAllMatch())
            || (!matchAll && cStates.isMatch())) {
          cStates.completeMatch();
          return true;
        }
      }
      if (branchLimit >= 0 && cStates.branchSize() > branchLimit) {
        // Too many branches: set part of them aside for later
        MatchedStates s2 = cStates.split(branchLimit);
        todo.push(s2);
      }
    }
    if (cStates.isMatch()) {
      cStates.completeMatch();
      return true;
    }
    cStates.clean();
  }
  return false;
}
/**
 * Checks if the pattern matches the entire region
 * (by default the region is the entire sequence).
 *
 * @return true if the entire region is matched (false otherwise)
 * @see #find()
 * @see #region(int, int)
 */
public boolean matches()
{
  matched = false;
  matchingCompleted = false;
  // Start at the region start: the match is required below to begin there,
  // so starting anywhere else could never succeed for a region that does not begin at 0
  boolean status = findMatchStart(regionStart, true);
  if (status) {
    // Check if entire region is matched
    status = ((matchedGroups[0].matchBegin == regionStart) && (matchedGroups[0].matchEnd == regionEnd));
  }
  matchingCompleted = true;
  matched = status;
  return status;
}
/**
 * Forgets the current match by setting every entry of the matched groups
 * (and of the matched results, if present) to null.
 */
private void clearMatched()
{
  Arrays.fill(matchedGroups, null);
  if (matchedResults != null) {
    Arrays.fill(matchedResults, null);
  }
}
/**
 * Describes the current matching state; used as the message of the
 * IllegalStateException thrown when match results are not available.
 *
 * @return message for the current state
 */
private String getStateMessage()
{
  if (matchingCompleted) {
    return matched ? "Match successful" : "No match found";
  }
  return "Matching not completed";
}
/**
 * Sets the region to search in. The next search starts at the region start.
 *
 * @param start start index
 * @param end end index (exclusive)
 * @throws IndexOutOfBoundsException if start or end lies outside of 0..size of the sequence,
 *         or if start is larger than end
 */
public void region(int start, int end)
{
  final int size = elements.size();
  if (!(0 <= start && start <= size)) {
    throw new IndexOutOfBoundsException("Invalid region start=" + start + ", need to be between 0 and " + size);
  }
  if (!(0 <= end && end <= size)) {
    throw new IndexOutOfBoundsException("Invalid region end=" + end + ", need to be between 0 and " + size);
  }
  if (end < start) {
    throw new IndexOutOfBoundsException("Invalid region end=" + end + ", need to be larger then start=" + start);
  }
  // All checks passed: only now change the matcher state
  regionStart = start;
  nextMatchStart = start;
  regionEnd = end;
}
/**
 * @return end index (exclusive) of the region being searched
 */
public int regionEnd()
{
  return this.regionEnd;
}
/**
 * @return start index of the region being searched
 */
public int regionStart()
{
  return this.regionStart;
}
/**
 * Returns a copy of the current match results. Use this method
 * to save away match results for later use, since future operations
 * using the SequenceMatcher change the match results.
 *
 * @return Copy of the current match results
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public BasicSequenceMatchResult toBasicSequenceMatchResult() {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.toBasicSequenceMatchResult();
}
/**
 * Start index of the given group in the current match.
 *
 * @param group group id
 * @return whatever the superclass reports as the start of the group
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public int start(int group) {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.start(group);
}
/**
 * End index of the given group in the current match.
 *
 * @param group group id
 * @return whatever the superclass reports as the end of the group
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public int end(int group) {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.end(group);
}
/**
 * Nodes matched by the given group in the current match.
 *
 * @param group group id
 * @return whatever the superclass reports as the nodes of the group
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public List groupNodes(int group) {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.groupNodes(group);
}
/**
 * Value associated with the given group in the current match.
 *
 * @param group group id
 * @return whatever the superclass reports as the value of the group
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public Object groupValue(int group) {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.groupValue(group);
}
/**
 * Information about the given group in the current match.
 *
 * @param group group id
 * @return whatever the superclass reports as the info of the group
 * @throws IllegalStateException if matching has not completed or no match was found
 */
public MatchedGroupInfo groupInfo(int group) {
  if (!matchingCompleted || !matched) {
    throw new IllegalStateException(getStateMessage());
  }
  return super.groupInfo(group);
}
public List