
com.metreeca.text.tokenizers.PatternTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-text Show documentation
Show all versions of metreeca-text Show documentation
A lightweight NLP framework.
The newest version!
/*
* Copyright © 2013-2022 Metreeca srl
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metreeca.text.tokenizers;
import com.metreeca.text.Chunk;
import com.metreeca.text.Token;
import java.util.*;
import java.util.function.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.String.format;
public final class PatternTokenizer implements Function {
private static final Pattern BranchPattern=Pattern.compile(
"\\(\\?<(?[a-zA-Z][a-zA-Z0-9]*)>(?.*)\\)"
);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private Pattern pattern=Pattern.compile("");
private final Map> group2mapper=new HashMap<>();
public PatternTokenizer defaults() {
return urls()
.emails()
.codes()
.numbers()
.symbols()
.words()
.pattern("\\.{3}") // ellipses
.pattern("\\S"); // other
}
public PatternTokenizer urls() {
return pattern("(?\\bhttps?://[-+&@#/%?=~_|!:,.;\\p{LD}]*[-+&@#/%=~_|\\p{LD}])");
}
public PatternTokenizer emails() {
final String body="\\p{LD}[-+_\\p{LD}]"; // letter ord digit the letter, digit or separator
final String head="\\b"+body+"*(?:\\."+body+"*)*"; // dot separated words
final String tail="\\.\\p{L}{2,}"; // leading dot with at least two letters
return pattern("(?\\b"+head+"@"+head+tail+"\\b)");
}
public PatternTokenizer codes() { // codes, acronyms, …
final String body="[-._&/\\p{LD}]"; // separators, letters and digits
final String head="\\b(?:\\p{LD}"+body+"*)?"; // body with leading letter or digit at word boundary
final String tail="(?:"+body+"*\\p{LD})?\\b"; // body with trailing letter or digit at word boundary
final String nonLeadingUppercaseLetter=body+"\\p{Lu}";
final String atLeastALetterAndADigit="\\p{L}"+body+"*\\p{Digit}";
final String atLeastADigitAndALetter="\\p{Digit}"+body+"*\\p{L}";
return pattern("(?"+head+"(?:"
+nonLeadingUppercaseLetter
+"|"+atLeastALetterAndADigit
+"|"+atLeastADigitAndALetter
+")"+tail+")"
);
}
public PatternTokenizer numbers() {
return pattern("(?[-+]?(?:\\d*(:?[.,]\\d{3})*[.,])?\\d+\\b)");
}
public PatternTokenizer symbols() {
return pattern("(?[-+]?\\p{S})");
}
public PatternTokenizer words() {
return pattern("\\b(?:\\p{L}-)?\\p{L}\\p{Ll}*\\b");
}
public PatternTokenizer pattern(final String pattern) {
return pattern(pattern, (t, m) -> t);
}
public PatternTokenizer pattern(final String pattern, final Function mapper) {
return pattern(pattern, (t, m) -> mapper.apply(t));
}
public PatternTokenizer pattern(final String pattern, final BiFunction mapper) {
if ( pattern == null ) {
throw new NullPointerException("null pattern");
}
final StringBuilder builder=new StringBuilder(this.pattern.pattern());
if ( builder.length() > 0 ) {
builder.append('|');
}
final String group=format("p%d", group2mapper.size());
builder.append("(?<").append(group).append('>');
final Matcher matcher=BranchPattern.matcher(pattern);
if ( matcher.matches() ) {
final String type=matcher.group("type");
group2mapper.put(group, (t, m) -> mapper.apply(t.type(type), m));
builder.append(matcher.group("pattern"));
} else {
group2mapper.put(group, mapper);
builder.append(pattern);
}
builder.append(')');
this.pattern=Pattern.compile(builder.toString());
return this;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@Override public Chunk apply(final Token token) {
if ( token == null ) {
throw new NullPointerException("null token");
}
if ( pattern.pattern().isEmpty() ) { defaults(); }
final Collection tokens=new ArrayList<>();
for (final Matcher matcher=pattern.matcher(token.text()); matcher.find(); ) {
tokens.add(mapper(matcher).apply(
token.clip(matcher.start(), matcher.end())
));
}
return new Chunk(tokens);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private UnaryOperator mapper(final Matcher matcher) {
for (final Map.Entry> entry : group2mapper.entrySet()) {
if ( matcher.group(entry.getKey()) != null ) {
return token -> entry.getValue().apply(token, matcher);
}
}
return token -> token;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy