org.languagetool.AnalyzedSentence Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber, Marcin Miłkowski (http://www.languagetool.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.ApiStatus;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.*;
/**
* A sentence that has been tokenized and analyzed.
*
* @author Daniel Naber
*/
public final class AnalyzedSentence {
// objects of this type are cached, so everything needs to be immutable
private final AnalyzedTokenReadings[] tokens;
private final AnalyzedTokenReadings[] preDisambigTokens;
private final AnalyzedTokenReadings[] nonBlankTokens;
private final AnalyzedTokenReadings[] nonBlankPreDisambigTokens;
private final int[] whPositions; // maps positions without whitespace to positions that include whitespaces
private final Map> tokenOffsets;
private final Map> lemmaOffsets;
/**
* Creates an AnalyzedSentence from the given {@link AnalyzedTokenReadings}. Whitespace is also a token.
*/
public AnalyzedSentence(AnalyzedTokenReadings[] tokens) {
this(tokens, tokens);
}
public AnalyzedSentence(AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings[] preDisambigTokens) {
this.tokens = tokens;
this.preDisambigTokens = preDisambigTokens;
int whCounter = 0;
int nonWhCounter = 0;
int[] mapping = new int[tokens.length + 1];
this.whPositions = mapping;
this.nonBlankTokens = getNonBlankReadings(tokens, whCounter, nonWhCounter, mapping).toArray(new AnalyzedTokenReadings[0]);
this.nonBlankPreDisambigTokens = getNonBlankReadings(preDisambigTokens, whCounter, nonWhCounter, mapping).toArray(new AnalyzedTokenReadings[0]);
tokenOffsets = indexTokens(nonBlankTokens);
lemmaOffsets = indexLemmas(nonBlankTokens);
}
@NotNull
private List getNonBlankReadings(AnalyzedTokenReadings[] tokens, int whCounter, int nonWhCounter, int[] mapping) {
List l = new ArrayList<>();
for (AnalyzedTokenReadings token : tokens) {
if (!token.isWhitespace() || token.isSentenceStart() || token.isSentenceEnd() || token.isParagraphEnd()) {
l.add(token);
mapping[nonWhCounter] = whCounter;
nonWhCounter++;
}
whCounter++;
}
return l;
}
private AnalyzedSentence(AnalyzedTokenReadings[] tokens, int[] mapping, AnalyzedTokenReadings[] nonBlankTokens, AnalyzedTokenReadings[] nonBlankPreDisambigTokens) {
this.tokens = tokens;
this.preDisambigTokens = tokens;
this.whPositions = mapping;
this.nonBlankTokens = nonBlankTokens;
this.nonBlankPreDisambigTokens = nonBlankPreDisambigTokens;
tokenOffsets = indexTokens(nonBlankTokens);
lemmaOffsets = indexLemmas(nonBlankTokens);
}
private static Map> indexTokens(AnalyzedTokenReadings[] tokens) {
Map> result = new HashMap<>(tokens.length);
for (int i = 0; i < tokens.length; i++) {
result.computeIfAbsent(tokens[i].getToken().toLowerCase(), __ -> new ArrayList<>(1)).add(i);
}
return makeUnmodifiable(result);
}
private static Map> indexLemmas(AnalyzedTokenReadings[] tokens) {
Map> result = new HashMap<>(tokens.length);
for (int i = 0; i < tokens.length; i++) {
AnalyzedTokenReadings tr = tokens[i];
int readingsLength = tr.getReadingsLength();
for (int j = 0; j < readingsLength; j++) {
AnalyzedToken token = tr.getAnalyzedToken(j);
String lemma = token.getLemma();
String key = (lemma != null ? lemma : token.getToken()).toLowerCase();
List list = result.computeIfAbsent(key, __ -> new ArrayList<>(1));
if (list.isEmpty() || list.get(list.size() - 1) != i) {
list.add(i);
}
}
}
return makeUnmodifiable(result);
}
private static Map> makeUnmodifiable(Map> result) {
for (Map.Entry> entry : result.entrySet()) {
entry.setValue(Collections.unmodifiableList(entry.getValue()));
}
return Collections.unmodifiableMap(result);
}
/**
* The method copies {@link AnalyzedSentence} and returns the copy.
* Useful for performing local immunization (for example).
*
* @param sentence {@link AnalyzedSentence} to be copied
* @return a new object which is a copy
* @since 2.5
*/
public AnalyzedSentence copy(AnalyzedSentence sentence) {
AnalyzedTokenReadings[] copyTokens = new AnalyzedTokenReadings[sentence.getTokens().length];
for (int i = 0; i < copyTokens.length; i++) {
AnalyzedTokenReadings analyzedTokens = sentence.getTokens()[i];
copyTokens[i] = new AnalyzedTokenReadings(analyzedTokens, analyzedTokens.getReadings(), "");
}
return new AnalyzedSentence(copyTokens, sentence.whPositions, sentence.getTokensWithoutWhitespace(), sentence.getPreDisambigTokensWithoutWhitespace());
}
/**
* Returns the {@link AnalyzedTokenReadings} of the analyzed text. Whitespace
* is also a token.
*/
public AnalyzedTokenReadings[] getTokens() {
// It would be better to return a clone here to make this object immutable,
// but this would be bad for performance:
return tokens;
}
/**
* @since 4.5
*/
public AnalyzedTokenReadings[] getPreDisambigTokens() {
// It would be better to return a clone here to make this object immutable,
// but this would be bad for performance:
return preDisambigTokens;
}
/**
* Returns the {@link AnalyzedTokenReadings} of the analyzed text, with
* whitespace tokens removed but with the artificial SENT_START
* token included.
*/
public AnalyzedTokenReadings[] getTokensWithoutWhitespace() {
return nonBlankTokens.clone();
}
/**
* @since 4.5
*/
public AnalyzedTokenReadings[] getPreDisambigTokensWithoutWhitespace() {
return nonBlankPreDisambigTokens.clone();
}
/**
* Get a position of a non-whitespace token in the original sentence with
* whitespace.
*
* @param nonWhPosition position of a non-whitespace token
* @return position in the original sentence.
*/
public int getOriginalPosition(int nonWhPosition) {
return whPositions[nonWhPosition];
}
@Override
public String toString() {
return toString(",");
}
/**
* Return string representation without chunk information.
* @since 2.3
*/
public String toShortString(String readingDelimiter) {
return toString(readingDelimiter, false);
}
private volatile String text;
/**
* Return the original text.
* @since 2.7
*/
public String getText() {
String result = text;
if (result == null) {
text = result = calcText();
}
return result;
}
private String calcText() {
StringBuilder sb = new StringBuilder();
for (AnalyzedTokenReadings element : tokens) {
sb.append(element.getToken());
}
return sb.toString();
}
/** Text length taking position fixes (for removed soft hyphens etc.) into account, so
* this is _not_ always equal to {@code getText()}.
* @since 5.1
*/
public int getCorrectedTextLength() {
int len = 0;
for (int i = 0; i < tokens.length; i++) {
AnalyzedTokenReadings element = tokens[i];
len += element.getCleanToken().length();
if (i == tokens.length - 1) { // only apply at end, so the position fix at every token doesn't add up
len += element.getPosFix();
}
}
return len;
}
/**
* Return string representation without any analysis information, just the original text.
* @since 2.6
*/
String toTextString() {
return getText();
}
/**
* Return string representation with chunk information.
*/
public String toString(String readingDelimiter) {
return toString(readingDelimiter, true);
}
private String toString(String readingDelimiter, boolean includeChunks) {
StringBuilder sb = new StringBuilder();
for (AnalyzedTokenReadings element : tokens) {
if (!element.isWhitespace()) {
sb.append(element.getToken());
sb.append('[');
}
Iterator iterator = element.iterator();
while (iterator.hasNext()) {
AnalyzedToken token = iterator.next();
String posTag = token.getPOSTag();
if (element.isSentenceStart()) {
sb.append("");
} else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
sb.append("");
} else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
sb.append("");
} else if (posTag == null && !includeChunks) {
sb.append(token.getToken());
} else {
if (!element.isWhitespace()) {
sb.append(token);
if (iterator.hasNext()) {
sb.append(readingDelimiter);
}
}
}
}
if (!element.isWhitespace()) {
if (includeChunks && element.getChunkTags().size() > 0) {
sb.append(',');
sb.append(StringUtils.join(element.getChunkTags(), "|"));
}
if (element.isImmunized()) {
sb.append("{!}");
}
sb.append(']');
} else {
sb.append(' ');
}
}
return sb.toString();
}
/**
* Get disambiguator actions log.
*/
public String getAnnotations() {
StringBuilder sb = new StringBuilder(40);
sb.append("Disambiguator log: \n");
for (AnalyzedTokenReadings element : tokens) {
if (!element.isWhitespace() &&
!"".equals(element.getHistoricalAnnotations())) {
sb.append(element.getHistoricalAnnotations());
sb.append('\n');
}
}
return sb.toString();
}
/**
* Get the lowercase tokens of this sentence in a set.
* Used internally for performance optimization.
* @since 2.4
*/
public Set getTokenSet() {
return tokenOffsets.keySet();
}
/**
* Get the lowercase lemmas of this sentence in a set.
* Used internally for performance optimization.
* @since 2.5
*/
public Set getLemmaSet() {
return lemmaOffsets.keySet();
}
/**
* @return all offsets in {@link #getTokensWithoutWhitespace()} where tokens with the given text occur (case-insensitive),
* or {@code null} if there are no such occurrences
* @since 5.3
*/
@Nullable
@ApiStatus.Internal
public List getTokenOffsets(String token) {
return tokenOffsets.get(token);
}
/**
* @return all offsets in {@link #getTokensWithoutWhitespace()} where tokens with the given lemma occur (case-insensitive),
* or {@code null} if there are no such occurrences
* @since 5.3
*/
@Nullable
@ApiStatus.Internal
public List getLemmaOffsets(String token) {
return lemmaOffsets.get(token);
}
@SuppressWarnings("ControlFlowStatementWithoutBraces")
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
AnalyzedSentence other = (AnalyzedSentence) o;
// tokenSet and lemmaSet are a subset of tokens and don't need to be included
return Arrays.equals(nonBlankTokens, other.nonBlankTokens)
&& Arrays.equals(tokens, other.tokens)
&& Arrays.equals(whPositions, other.whPositions);
}
@Override
public int hashCode() {
// tokenSet and lemmaSet are a subset of tokens and don't need to be included
return Objects.hash(nonBlankTokens, tokens, whPositions);
}
}