org.languagetool.AnalyzedSentence Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber, Marcin Miłkowski (http://www.languagetool.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.tools.StringTools;
import java.util.*;
/**
* A sentence that has been tokenized and analyzed.
*
* @author Daniel Naber
*/
public final class AnalyzedSentence {
private final AnalyzedTokenReadings[] tokens;
private final AnalyzedTokenReadings[] nonBlankTokens;
private final int[] whPositions; // maps positions without whitespace to positions that include whitespaces
private final Set tokenSet;
private final Set lemmaSet;
/**
* Creates an AnalyzedSentence from the given {@link AnalyzedTokenReadings}. Whitespace is also a token.
*/
public AnalyzedSentence(AnalyzedTokenReadings[] tokens) {
this.tokens = tokens;
int whCounter = 0;
int nonWhCounter = 0;
int[] mapping = new int[tokens.length + 1];
List l = new ArrayList<>();
for (AnalyzedTokenReadings token : tokens) {
if (!token.isWhitespace() || token.isSentenceStart() || token.isSentenceEnd() || token.isParagraphEnd()) {
l.add(token);
mapping[nonWhCounter] = whCounter;
nonWhCounter++;
}
whCounter++;
}
this.whPositions = mapping;
this.nonBlankTokens = l.toArray(new AnalyzedTokenReadings[l.size()]);
this.tokenSet = getTokenSet(tokens);
this.lemmaSet = getLemmaSet(tokens);
}
private AnalyzedSentence(AnalyzedTokenReadings[] tokens, int[] mapping, AnalyzedTokenReadings[] nonBlankTokens) {
this.tokens = tokens;
this.whPositions = mapping;
this.nonBlankTokens = nonBlankTokens;
this.tokenSet = getTokenSet(tokens);
this.lemmaSet = getLemmaSet(tokens);
}
private Set getTokenSet(AnalyzedTokenReadings[] tokens) {
Set tokenSet = new HashSet<>();
for (AnalyzedTokenReadings token : tokens) {
tokenSet.add(token.getToken().toLowerCase());
}
return Collections.unmodifiableSet(tokenSet);
}
private Set getLemmaSet(AnalyzedTokenReadings[] tokens) {
Set lemmaSet = new HashSet<>();
for (AnalyzedTokenReadings token : tokens) {
for (AnalyzedToken lemmaTok : token.getReadings()) {
if (lemmaTok.getLemma() != null) {
lemmaSet.add(lemmaTok.getLemma().toLowerCase());
} else {
lemmaSet.add(lemmaTok.getToken().toLowerCase());
}
}
}
return Collections.unmodifiableSet(lemmaSet);
}
/**
* The method copies {@link AnalyzedSentence} and returns the copy.
* Useful for performing local immunization (for example).
*
* @param sentence {@link AnalyzedSentence} to be copied
* @return a new object which is a copy
* @since 2.5
*/
public AnalyzedSentence copy(AnalyzedSentence sentence) {
AnalyzedTokenReadings[] copyTokens = new AnalyzedTokenReadings[sentence.getTokens().length];
for (int i = 0; i < copyTokens.length; i++) {
AnalyzedTokenReadings analyzedTokens = sentence.getTokens()[i];
copyTokens[i] = new AnalyzedTokenReadings(analyzedTokens.getReadings(), analyzedTokens.getStartPos());
copyTokens[i].setHistoricalAnnotations(analyzedTokens.getHistoricalAnnotations());
copyTokens[i].setChunkTags(analyzedTokens.getChunkTags());
if (analyzedTokens.isImmunized()) {
copyTokens[i].immunize();
}
if (analyzedTokens.isIgnoredBySpeller()) {
copyTokens[i].ignoreSpelling();
}
copyTokens[i].setWhitespaceBefore(sentence.getTokens()[i].isWhitespaceBefore());
}
return new AnalyzedSentence(copyTokens, sentence.whPositions, sentence.getTokensWithoutWhitespace());
}
/**
* Returns the {@link AnalyzedTokenReadings} of the analyzed text. Whitespace
* is also a token.
*/
public AnalyzedTokenReadings[] getTokens() {
// It would be better to return a clone here to make this object immutable,
// but this would be bad for performance:
return tokens;
}
/**
* Returns the {@link AnalyzedTokenReadings} of the analyzed text, with
* whitespace tokens removed but with the artificial SENT_START
* token included.
*/
public AnalyzedTokenReadings[] getTokensWithoutWhitespace() {
return nonBlankTokens.clone();
}
/**
* Get a position of a non-whitespace token in the original sentence with
* whitespace.
*
* @param nonWhPosition position of a non-whitespace token
* @return position in the original sentence.
*/
public int getOriginalPosition(int nonWhPosition) {
return whPositions[nonWhPosition];
}
@Override
public String toString() {
return toString(",");
}
/**
* Return string representation without chunk information.
* @since 2.3
*/
public String toShortString(String readingDelimiter) {
return toString(readingDelimiter, false);
}
/**
* Return the original text.
* @since 2.7
*/
public String getText() {
StringBuilder sb = new StringBuilder();
for (AnalyzedTokenReadings element : tokens) {
sb.append(element.getToken());
}
return sb.toString();
}
/**
* Return string representation without any analysis information, just the original text.
* @since 2.6
*/
String toTextString() {
return getText();
}
/**
* Return string representation with chunk information.
*/
public String toString(String readingDelimiter) {
return toString(readingDelimiter, true);
}
private String toString(String readingDelimiter, boolean includeChunks) {
StringBuilder sb = new StringBuilder();
for (AnalyzedTokenReadings element : tokens) {
if (!element.isWhitespace()) {
sb.append(element.getToken());
sb.append('[');
}
Iterator iterator = element.iterator();
while (iterator.hasNext()) {
AnalyzedToken token = iterator.next();
String posTag = token.getPOSTag();
if (element.isSentenceStart()) {
sb.append("");
} else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
sb.append("");
} else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
sb.append("");
} else if (posTag == null && !includeChunks) {
sb.append(token.getToken());
} else {
if (!element.isWhitespace()) {
sb.append(token);
if (iterator.hasNext()) {
sb.append(readingDelimiter);
}
}
}
}
if (!element.isWhitespace()) {
if (includeChunks && element.getChunkTags().size() > 0) {
sb.append(',');
sb.append(StringUtils.join(element.getChunkTags(), "|"));
}
if (element.isImmunized()) {
sb.append("{!}");
}
sb.append(']');
} else {
sb.append(' ');
}
}
return sb.toString();
}
/**
* Get disambiguator actions log.
*/
public String getAnnotations() {
StringBuilder sb = new StringBuilder(40);
sb.append("Disambiguator log: \n");
for (AnalyzedTokenReadings element : tokens) {
if (!element.isWhitespace() &&
!"".equals(element.getHistoricalAnnotations())) {
sb.append(element.getHistoricalAnnotations());
sb.append('\n');
}
}
return sb.toString();
}
/**
* Get the lowercase tokens of this sentence in a set.
* Used internally for performance optimization.
* @since 2.4
*/
public Set getTokenSet() {
return tokenSet;
}
/**
* Get the lowercase lemmas of this sentence in a set.
* Used internally for performance optimization.
* @since 2.5
*/
public Set getLemmaSet() {
return lemmaSet;
}
@SuppressWarnings("ControlFlowStatementWithoutBraces")
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null) return false;
if (getClass() != o.getClass()) return false;
AnalyzedSentence other = (AnalyzedSentence) o;
// tokenSet and lemmaSet are a subset of tokens and don't need to be included
return Arrays.equals(nonBlankTokens, other.nonBlankTokens)
&& Arrays.equals(tokens, other.tokens)
&& Arrays.equals(whPositions, other.whPositions);
}
@Override
public int hashCode() {
// tokenSet and lemmaSet are a subset of tokens and don't need to be included
return Objects.hash(nonBlankTokens, tokens, whPositions);
}
/**
* Returns true if sentences ends with a paragraph break.
* @since 4.3
*/
public boolean hasParagraphEndMark(Language lang) {
return StringTools.isParagraphEnd(getText(), lang.getSentenceTokenizer().singleLineBreaksMarksPara());
}
}