
com.metreeca.text.Token Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-text Show documentation
Show all versions of metreeca-text Show documentation
A lightweight NLP framework.
The newest version!
/*
* Copyright © 2013-2022 Metreeca srl
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metreeca.text;
import com.metreeca.rest.actions.Clean;
import java.io.*;
import java.net.URL;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import static com.metreeca.rest.actions.Clean.normalize;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Arrays.asList;
import static java.util.Collections.synchronizedMap;
import static java.util.stream.Collectors.toSet;
@SuppressWarnings("FieldNotUsedInToString")
public final class Token implements Range {
private static final Map> corpora=synchronizedMap(new HashMap<>());
public static Predicate corpus(final URL corpus) {
if ( corpus == null ) {
throw new NullPointerException("null corpus");
}
return corpora.computeIfAbsent(corpus.toExternalForm(), url -> {
try ( final BufferedReader reader=new BufferedReader(new InputStreamReader(corpus.openStream(), UTF_8)) ) {
return corpus(reader.lines()
.filter(line -> !line.isEmpty())
.filter(line -> !line.startsWith("#"))
.collect(toSet())
);
} catch ( final IOException e ) {
throw new UncheckedIOException(e);
}
});
}
public static Predicate corpus(final String... entries) {
if ( entries == null || Arrays.stream(entries).anyMatch(Objects::isNull) ) {
throw new NullPointerException("null entries");
}
return corpus(asList(entries));
}
public static Predicate corpus(final Collection entries) {
if ( entries == null || entries.stream().anyMatch(Objects::isNull) ) {
throw new NullPointerException("null entries");
}
final Collection set=entries.stream()
.map(Clean::normalize)
.map(Clean::lower)
.collect(toSet());
return token -> set.contains(Clean.lower(normalize(token.text(false))));
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private final String text;
private final String root;
private final String type;
private final double weight;
private final int lower;
private final int upper;
public Token(final String text) {
if ( text == null ) {
throw new NullPointerException("null text");
}
this.text=text;
this.root="";
this.lower=0;
this.upper=text.length();
this.type="";
this.weight=0;
}
private Token(
final String text, final int lower, final int upper,
final String root, final String type, final double weight
) {
this.text=text;
this.root=root;
this.lower=lower;
this.upper=upper;
this.type=type;
this.weight=weight;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@Override public int lower() {
return lower;
}
@Override public int upper() {
return upper;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public boolean is(final String... types) {
if ( types == null ) {
throw new NullPointerException("null types");
}
return !type.isEmpty() && (types.length == 0 || asList(types).contains(type));
}
public boolean isLower() {
return !text.isEmpty() && Character.isLowerCase(text.charAt(0));
}
public boolean isUpper() {
return !text.isEmpty() && Character.isUpperCase(text.charAt(0));
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public String text() {
return text;
}
public String text(final Predicate verbatim) {
return text(verbatim.test(this));
}
public String text(final boolean verbatim) {
return verbatim || root.isEmpty() ? text : root;
}
public Token text(final String text) {
if ( text == null ) {
throw new NullPointerException("null text");
}
return new Token(text, lower, upper, root, type, weight);
}
public String root() {
return root;
}
public Token root(final String root) {
if ( type == null ) {
throw new NullPointerException("null root");
}
return new Token(text, lower, upper, root, type, weight);
}
public String type() {
return type;
}
public Token type(final String type) {
if ( type == null ) {
throw new NullPointerException("null type");
}
return new Token(text, lower, upper, root, type, weight);
}
public double weight() {
return weight;
}
public Token weight(final double weight) {
return new Token(text, lower, upper, root, type, weight);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public R map(final Function mapper) {
if ( mapper == null ) {
throw new NullPointerException("null mapper");
}
return mapper.apply(this);
}
public Token clip(final int lower, final int upper) {
if ( lower < 0 || lower > upper || upper > this.upper-this.lower ) {
throw new IllegalArgumentException("illegal bounds ["+lower+","+upper+")");
}
return new Token(
text.substring(lower, upper), this.lower+lower, this.lower+upper,
"", "", 0
);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@Override public String toString() {
return String.format("(%s %s)", type.isEmpty() ? "*" : type, text);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy