/* LanguageTool, a natural language style checker
 * Copyright (C) 2014 Daniel Naber, Marcin Miłkowski (http://www.languagetool.org)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.ApiStatus;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.util.*;

/**
 * A sentence that has been tokenized and analyzed.
 * 
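 * <p>Illustrative usage sketch, not part of the original file: instances are
 * normally obtained via {@code JLanguageTool#getAnalyzedSentence(String)}
 * rather than constructed directly. The language module and the omitted
 * exception handling are assumptions of this example:
 * <pre>{@code
 * JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
 * AnalyzedSentence sentence = lt.getAnalyzedSentence("This is a test.");
 * for (AnalyzedTokenReadings readings : sentence.getTokensWithoutWhitespace()) {
 *   System.out.println(readings.getToken());
 * }
 * }</pre>
 *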
 * @author Daniel Naber
 */
public final class AnalyzedSentence {

  // objects of this type are cached, so everything needs to be immutable
  private final AnalyzedTokenReadings[] tokens;
  private final AnalyzedTokenReadings[] preDisambigTokens;
  private final AnalyzedTokenReadings[] nonBlankTokens;
  private final AnalyzedTokenReadings[] nonBlankPreDisambigTokens;
  private final int[] whPositions;  // maps positions that ignore whitespace to positions in the full token array (which includes whitespace)
  private final Map<String, List<Integer>> tokenOffsets;
  private final Map<String, List<Integer>> lemmaOffsets;

  /**
   * Creates an AnalyzedSentence from the given {@link AnalyzedTokenReadings}. Whitespace is also a token.
   */
  public AnalyzedSentence(AnalyzedTokenReadings[] tokens) {
    this(tokens, tokens);
  }
  
  public AnalyzedSentence(AnalyzedTokenReadings[] tokens, AnalyzedTokenReadings[] preDisambigTokens) {
    this.tokens = tokens;
    this.preDisambigTokens = preDisambigTokens;
    int whCounter = 0;
    int nonWhCounter = 0;
    int[] mapping = new int[tokens.length + 1];
    this.whPositions = mapping;
    this.nonBlankTokens = getNonBlankReadings(tokens, whCounter, nonWhCounter, mapping).toArray(new AnalyzedTokenReadings[0]);
    this.nonBlankPreDisambigTokens = getNonBlankReadings(preDisambigTokens, whCounter, nonWhCounter, mapping).toArray(new AnalyzedTokenReadings[0]);
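    // note (added comment): ints are passed by value, so both calls above start
    // counting at 0; the two token arrays presumably share the same whitespace
    // layout, so the second call fills 'mapping' with the same values as the first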
    tokenOffsets = indexTokens(nonBlankTokens);
    lemmaOffsets = indexLemmas(nonBlankTokens);
  }

  @NotNull
  private List<AnalyzedTokenReadings> getNonBlankReadings(AnalyzedTokenReadings[] tokens, int whCounter, int nonWhCounter, int[] mapping) {
    List<AnalyzedTokenReadings> l = new ArrayList<>();
    for (AnalyzedTokenReadings token : tokens) {
      if (!token.isWhitespace() || token.isSentenceStart() || token.isSentenceEnd() || token.isParagraphEnd()) {
        l.add(token);
        mapping[nonWhCounter] = whCounter;
        nonWhCounter++;
      }
      whCounter++;
    }
    return l;
  }

  private AnalyzedSentence(AnalyzedTokenReadings[] tokens, int[] mapping, AnalyzedTokenReadings[] nonBlankTokens, AnalyzedTokenReadings[] nonBlankPreDisambigTokens) {
    this.tokens = tokens;
    this.preDisambigTokens = tokens;
    this.whPositions = mapping;
    this.nonBlankTokens = nonBlankTokens;
    this.nonBlankPreDisambigTokens = nonBlankPreDisambigTokens;
    tokenOffsets = indexTokens(nonBlankTokens);
    lemmaOffsets = indexLemmas(nonBlankTokens);
  }

  private static Map<String, List<Integer>> indexTokens(AnalyzedTokenReadings[] tokens) {
    Map<String, List<Integer>> result = new HashMap<>(tokens.length);
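    // map each lowercased token text to the list of its offsets in the
    // non-whitespace token array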
    for (int i = 0; i < tokens.length; i++) {
      result.computeIfAbsent(tokens[i].getToken().toLowerCase(), __ -> new ArrayList<>(1)).add(i);
    }
    return makeUnmodifiable(result);
  }

  private static Map<String, List<Integer>> indexLemmas(AnalyzedTokenReadings[] tokens) {
    Map<String, List<Integer>> result = new HashMap<>(tokens.length);
    for (int i = 0; i < tokens.length; i++) {
      AnalyzedTokenReadings tr = tokens[i];
      int readingsLength = tr.getReadingsLength();
      for (int j = 0; j < readingsLength; j++) {
        AnalyzedToken token = tr.getAnalyzedToken(j);
        String lemma = token.getLemma();
        String key = (lemma != null ? lemma : token.getToken()).toLowerCase();
        List<Integer> list = result.computeIfAbsent(key, __ -> new ArrayList<>(1));
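        // several readings of one token may share a lemma; readings of a token
        // are processed consecutively, so checking the last entry is enough to
        // avoid duplicate offsets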
        if (list.isEmpty() || list.get(list.size() - 1) != i) {
          list.add(i);
        }
      }
    }
    return makeUnmodifiable(result);
  }

  private static Map<String, List<Integer>> makeUnmodifiable(Map<String, List<Integer>> result) {
    for (Map.Entry<String, List<Integer>> entry : result.entrySet()) {
      entry.setValue(Collections.unmodifiableList(entry.getValue()));
    }
    return Collections.unmodifiableMap(result);
  }

  /**
   * Copies the given {@link AnalyzedSentence} and returns the copy.
   * Useful, for example, for performing local immunization.
   *
   * @param sentence {@link AnalyzedSentence} to be copied
   * @return a new object which is a copy
   * @since  2.5
   */
  public AnalyzedSentence copy(AnalyzedSentence sentence) {
    AnalyzedTokenReadings[] copyTokens = new AnalyzedTokenReadings[sentence.getTokens().length];
    for (int i = 0; i < copyTokens.length; i++) {
      AnalyzedTokenReadings analyzedTokens = sentence.getTokens()[i];
      copyTokens[i] = new AnalyzedTokenReadings(analyzedTokens, analyzedTokens.getReadings(), "");
    }
    return new AnalyzedSentence(copyTokens, sentence.whPositions, sentence.getTokensWithoutWhitespace(), sentence.getPreDisambigTokensWithoutWhitespace());
  }

  /**
   * Returns the {@link AnalyzedTokenReadings} of the analyzed text. Whitespace
   * is also a token.
   */
  public AnalyzedTokenReadings[] getTokens() {
    // It would be better to return a clone here to make this object immutable,
    // but this would be bad for performance:
    return tokens;
  }

  /**
   * @since 4.5
   */
  public AnalyzedTokenReadings[] getPreDisambigTokens() {
    // It would be better to return a clone here to make this object immutable,
    // but this would be bad for performance:
    return preDisambigTokens;
  }

  /**
   * Returns the {@link AnalyzedTokenReadings} of the analyzed text, with
   * whitespace tokens removed but with the artificial SENT_START
   * token included.
   */
  public AnalyzedTokenReadings[] getTokensWithoutWhitespace() {
    return nonBlankTokens.clone();
  }

  /**
   * @since 4.5
   */
  public AnalyzedTokenReadings[] getPreDisambigTokensWithoutWhitespace() {
    return nonBlankPreDisambigTokens.clone();
  }

  /**
   * Get the position of a non-whitespace token in the original sentence,
   * i.e. in the token array that includes whitespace tokens.
   *
   * @param nonWhPosition position of a non-whitespace token
   * @return position in the original sentence.
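   *
   * <p>Illustrative example (token values invented): if the full token array is
   * {@code [SENT_START, "Hi", " ", "there"]}, the non-whitespace array is
   * {@code [SENT_START, "Hi", "there"]}, and {@code getOriginalPosition(2)}
   * returns 3.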
   */
  public int getOriginalPosition(int nonWhPosition) {
    return whPositions[nonWhPosition];
  }

  @Override
  public String toString() {
    return toString(",");
  }

  /**
   * Return string representation without chunk information.
   * @since 2.3
   */
  public String toShortString(String readingDelimiter) {
    return toString(readingDelimiter, false);
  }

  private volatile String text;

  /**
   * Return the original text.
   * @since 2.7
   */
  public String getText() {
    String result = text;
    if (result == null) {
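      // benign data race (racy single-check idiom): calcText() is
      // deterministic, so concurrent callers may compute the value twice but
      // always agree; the volatile field ensures safe publication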
      text = result = calcText();
    }
    return result;
  }

  private String calcText() {
    StringBuilder sb = new StringBuilder();
    for (AnalyzedTokenReadings element : tokens) {
      sb.append(element.getToken());
    }
    return sb.toString();
  }

  /** Text length taking position fixes (for removed soft hyphens etc.) into account, so
   * this is _not_ always equal to {@code getText().length()}.
   * @since 5.1
   */
  public int getCorrectedTextLength() {
    int len = 0;
    for (int i = 0; i < tokens.length; i++) {
      AnalyzedTokenReadings element = tokens[i];
      len += element.getCleanToken().length();
      if (i == tokens.length - 1) {  // only apply at end, so the position fix at every token doesn't add up
        len += element.getPosFix();
      }
    }
    return len;
  }

  /**
   * Return string representation without any analysis information, just the original text.
   * @since 2.6
   */
  String toTextString() {
    return getText();
  }

  /**
   * Return string representation with chunk information.
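   *
   * <p>Sketch of the output shape, derived from the implementation below
   * (exact reading strings depend on the language's tagger): each
   * non-whitespace token renders as {@code token[reading1,reading2,...,chunks]}
   * with readings joined by {@code readingDelimiter}, whitespace as a single
   * space, and {@code <S>}, {@code </S>} and {@code <P/>} marking sentence
   * start, sentence end and paragraph end.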
   */
  public String toString(String readingDelimiter) {
    return toString(readingDelimiter, true);
  }

  private String toString(String readingDelimiter, boolean includeChunks) {
    StringBuilder sb = new StringBuilder();
    for (AnalyzedTokenReadings element : tokens) {
      if (!element.isWhitespace()) {
        sb.append(element.getToken());
        sb.append('[');
      }
      Iterator<AnalyzedToken> iterator = element.iterator();
      while (iterator.hasNext()) {
        AnalyzedToken token = iterator.next();
        String posTag = token.getPOSTag();
        if (element.isSentenceStart()) {
          sb.append("");
        } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
          sb.append("");
        } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
          sb.append("

"); } else if (posTag == null && !includeChunks) { sb.append(token.getToken()); } else { if (!element.isWhitespace()) { sb.append(token); if (iterator.hasNext()) { sb.append(readingDelimiter); } } } } if (!element.isWhitespace()) { if (includeChunks && element.getChunkTags().size() > 0) { sb.append(','); sb.append(StringUtils.join(element.getChunkTags(), "|")); } if (element.isImmunized()) { sb.append("{!}"); } sb.append(']'); } else { sb.append(' '); } } return sb.toString(); } /** * Get disambiguator actions log. */ public String getAnnotations() { StringBuilder sb = new StringBuilder(40); sb.append("Disambiguator log: \n"); for (AnalyzedTokenReadings element : tokens) { if (!element.isWhitespace() && !"".equals(element.getHistoricalAnnotations())) { sb.append(element.getHistoricalAnnotations()); sb.append('\n'); } } return sb.toString(); } /** * Get the lowercase tokens of this sentence in a set. * Used internally for performance optimization. * @since 2.4 */ public Set getTokenSet() { return tokenOffsets.keySet(); } /** * Get the lowercase lemmas of this sentence in a set. * Used internally for performance optimization. * @since 2.5 */ public Set getLemmaSet() { return lemmaOffsets.keySet(); } /** * @return all offsets in {@link #getTokensWithoutWhitespace()} where tokens with the given text occur (case-insensitive), * or {@code null} if there are no such occurrences * @since 5.3 */ @Nullable @ApiStatus.Internal public List getTokenOffsets(String token) { return tokenOffsets.get(token); } /** * @return all offsets in {@link #getTokensWithoutWhitespace()} where tokens with the given lemma occur (case-insensitive), * or {@code null} if there are no such occurrences * @since 5.3 */ @Nullable @ApiStatus.Internal public List getLemmaOffsets(String token) { return lemmaOffsets.get(token); } @SuppressWarnings("ControlFlowStatementWithoutBraces") @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } AnalyzedSentence other = (AnalyzedSentence) o; // tokenSet and lemmaSet are a subset of tokens and don't need to be included return Arrays.equals(nonBlankTokens, other.nonBlankTokens) && Arrays.equals(tokens, other.tokens) && Arrays.equals(whPositions, other.whPositions); } @Override public int hashCode() { // tokenSet and lemmaSet are a subset of tokens and don't need to be included return Objects.hash(nonBlankTokens, tokens, whPositions); } }