// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Carrot2 project.
*
* Copyright (C) 2002-2021, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* https://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.language;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.carrot2.util.StringUtils;
/**
* This dictionary implementation is a middle ground between the complexity of regular expressions
* and sheer speed of plain text matching. It offers case sensitive and case insensitive matching,
* as well as globs (wildcards matching any token sequence).
*
*
* <p>The following wildcards are available:
*
* <ul>
*   <li>{@code *} - matches zero or more tokens (possessive match),
*   <li>{@code *?} - matches zero or more tokens (reluctant match),
*   <li>{@code +} - matches one or more tokens (possessive match),
*   <li>{@code +?} - matches one or more tokens (reluctant match),
*   <li>{@code ?} - matches exactly one token (possessive).
* </ul>
*
* <p>In addition, token type matching is provided in the form of:
*
* <ul>
*   <li>{@code {name}} - matches a token with flags named {@code name}.
* </ul>
*
* <p>Token flags are an int bitfield.
*/
public class GlobDictionary implements Predicate {
private final Function tokenNormalization;
private final Function termSplitter;
private Map> tokenToPatterns;
private Map> pureTypePatterns;
/**
 * Creates a dictionary from the given patterns, using a custom token normalization and a custom
 * term splitter.
 *
 * @param patterns Patterns to compile into the dictionary. The stream is consumed.
 * @param tokenNormalization Token normalization function. It is applied to pattern tokens that
 *     use normalized matching and to input tokens passed to {@code normalize(String[])}.
 * @param termSplitter Function that splits raw input into tokens; used by {@code
 *     split(CharSequence)}.
 */
public GlobDictionary(
    Stream<WordPattern> patterns,
    Function<String, String> tokenNormalization,
    Function<CharSequence, String[]> termSplitter) {
  this.tokenNormalization = tokenNormalization;
  this.termSplitter = termSplitter;
  compile(patterns, tokenNormalization);
}
/**
 * Creates a dictionary from the given patterns, using the default token normalization and the
 * default term splitter.
 *
 * @param patterns Patterns to compile into the dictionary.
 */
public GlobDictionary(Stream<WordPattern> patterns) {
  this(patterns, defaultTokenNormalization(), defaultTermSplitter());
}
/**
 * Returns the default term splitter. The input is split on the space character (U+0020) only:
 * runs of spaces act as a single separator and leading or trailing spaces never produce empty
 * tokens.
 *
 * @return A function converting a character sequence into the array of its space-separated
 *     tokens (an empty array if the input contains no tokens).
 */
public static Function<CharSequence, String[]> defaultTermSplitter() {
  return chs -> {
    String seq = chs.toString();
    List<String> tokens = new ArrayList<>();
    int max = seq.length();
    int p = 0;
    while (p < max) {
      // Skip the run of separators preceding the next token.
      while (p < max && seq.charAt(p) == ' ') {
        p++;
      }
      int start = p;
      // Consume the token itself.
      while (p < max && seq.charAt(p) != ' ') {
        p++;
      }
      // start == p happens only when the input ended with separators.
      if (start < p) {
        tokens.add(seq.substring(start, p));
      }
    }
    return tokens.toArray(String[]::new);
  };
}
/**
 * Checks whether any pattern in this dictionary matches the input.
 *
 * <p>The input is split into tokens and normalized with this dictionary's own functions. No token
 * types are passed to the search, and the search stops at the first matching pattern.
 *
 * @param input The raw input to check.
 * @return {@code true} if at least one pattern matches, {@code false} otherwise.
 */
@Override
public boolean test(CharSequence input) {
  final String[] verbatim = split(input);
  final String[] normalized = normalize(verbatim);
  // Any match is sufficient here, so abort as soon as one is reported.
  return find(verbatim, normalized, null, (pattern) -> true);
}
/**
 * Find all matching patterns, optionally aborting prematurely.
 *
 * <p>The search runs in two phases. First, for every normalized input token the patterns indexed
 * under that token are tried. Then, if token types were provided, patterns indexed by a type
 * bitfield are tried, but only those whose key bits all occur among the input's token types.
 *
 * <p>NOTE(review): the first phase looks patterns up once per input token occurrence, so a
 * pattern may be reported to {@code earlyAbort} more than once when a token repeats in the input.
 *
 * @param inputTerms Input terms (verbatim).
 * @param normalizedTerms Normalized terms (must use the same normalizer as the dictionary).
 * @param types Token types (bitfield) used in {@link MatchType#ANY_OF_TYPE}. May be {@code
 *     null}, in which case patterns indexed by type are not consulted.
 * @param earlyAbort A predicate invoked with every matching pattern; returning {@code true}
 *     stops the search immediately.
 * @return Returns {@code true} if at least one match was found, {@code false} otherwise.
 */
public boolean find(
    String[] inputTerms,
    String[] normalizedTerms,
    int[] types,
    Predicate<WordPattern> earlyAbort) {
  boolean found = false;

  // Phase 1: candidates selected through the token -> patterns inverted index.
  for (String normalizedToken : normalizedTerms) {
    var patterns = tokenToPatterns.get(normalizedToken);
    if (patterns == null) {
      continue;
    }
    for (WordPattern pattern : patterns) {
      if (pattern.matches(inputTerms, normalizedTerms, types)) {
        found = true;
        if (earlyAbort.test(pattern)) {
          return true;
        }
      }
    }
  }

  // Phase 2: candidates selected by token type; requires the caller to supply types.
  if (!pureTypePatterns.isEmpty() && types != null) {
    // Union of all type bits present anywhere in the input.
    int allTypeBits = 0;
    for (int type : types) {
      allTypeBits |= type;
    }

    for (var e : pureTypePatterns.entrySet()) {
      int bitField = e.getKey();
      // Cheap pre-filter: skip entries whose key contains a bit absent from the whole input.
      if ((bitField & allTypeBits) != bitField) {
        continue;
      }
      for (WordPattern pattern : e.getValue()) {
        if (pattern.matches(inputTerms, normalizedTerms, types)) {
          found = true;
          if (earlyAbort.test(pattern)) {
            return true;
          }
        }
      }
    }
  }

  return found;
}
/**
 * Splits the input into tokens using this dictionary's term splitter.
 *
 * @param input The raw input.
 * @return The tokens produced by the term splitter.
 */
public String[] split(CharSequence input) {
  final String[] terms = termSplitter.apply(input);
  return terms;
}
/**
 * Applies this dictionary's token normalization to each token.
 *
 * @param tokens The tokens to normalize; the array itself is not modified.
 * @return A new array of the same length holding the normalized image of each token, in order.
 */
public String[] normalize(String[] tokens) {
  final String[] result = new String[tokens.length];
  int index = 0;
  for (String token : tokens) {
    result[index++] = tokenNormalization.apply(token);
  }
  return result;
}
/**
 * Returns a description of this dictionary. Only the token-indexed patterns are included in the
 * output.
 */
@Override
public String toString() {
  final StringBuilder description = new StringBuilder("GlobDictionary: ");
  description.append(tokenToPatterns);
  return description.toString();
}
/**
* For each pattern, create an inverted index containing: {@code normalized(token) -> patternList}
* so that we can quickly compute the list of candidate patterns that can (but may not) match a
* given input.
*
*
* <p>This is similar in nature to this: http://swtch.com/~rsc/regexp/regexp4.html
*/
private void compile(Stream patterns, Function tokenNormalization) {
HashMap cache = new HashMap<>();
Function normalize =
(s) -> {
String normalized = tokenNormalization.apply(s);
return cache.computeIfAbsent(normalized, (x) -> normalized);
};
// Fail on invalid inputs.
patterns = patterns.peek(GlobDictionary::checkInvalid);
// Rewrite patterns so that tokens with NORMALIZED matching have a prenormalized image.
patterns =
patterns.map(
(pattern) -> {
List modifiedTokens = new ArrayList<>(pattern.tokens.size());
boolean hadChanges = false;
for (var t : pattern.tokens()) {
if (t.matchType == GlobDictionary.MatchType.NORMALIZED) {
hadChanges = true;
modifiedTokens.add(new Token(normalize.apply(t.image), t.matchType, t.typeBits));
} else {
modifiedTokens.add(t);
}
}
return hadChanges ? new WordPattern(modifiedTokens, pattern.payload) : pattern;
});
// Sort patterns on input for hash consistency.
patterns = patterns.sorted();
// Create a simple inverted index from tokens to the patterns they occur in.
HashMap> tokenToPatterns = new HashMap<>();
HashMap> pureTypePatterns = new HashMap<>();
patterns.forEach(
(pattern) -> {
Set