org.languagetool.AnalyzedTokenReadings Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

package org.languagetool;

import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.languagetool.chunking.ChunkTag;
import org.languagetool.tools.StringTools;

import static org.languagetool.JLanguageTool.*;

/**
 * An array of {@link AnalyzedToken}s used to store multiple POS tags and lemmas
 * for a given single token.
 * 
 * @author Marcin Milkowski
 */
public final class AnalyzedTokenReadings implements Iterable {

  private final boolean isWhitespace;
  private final boolean isLinebreak;
  private final boolean isSentStart;

  private AnalyzedToken[] anTokReadings;
  private int startPos;
  private String token;
  private List chunkTags = new ArrayList<>();
  private boolean isSentEnd;
  private boolean isParaEnd;
  private boolean isWhitespaceBefore;
  private boolean isPosTagUnknown;

  // If true, then the token is marked up as immune against tests:
  // it should never be matched by any rule. Used to have generalized
  // mechanism for exceptions in rules.
  private boolean isImmunized;

  // If true, then the token is marked up as ignored in all spelling rules:
  // other rules can freely match it.
  private boolean isIgnoredBySpeller;

  // Used to hold the string representation of the disambiguator actions on a token.
  private String historicalAnnotations = "";

  // True if the token has the same lemma value for all tokens.
  // Can be used internally to optimize matching.
  private boolean hasSameLemmas;

  public AnalyzedTokenReadings(AnalyzedToken[] tokens, int startPos) {
    this(Arrays.asList(tokens), startPos);
  }

  public AnalyzedTokenReadings(AnalyzedToken token, int startPos) {
    this(Collections.singletonList(token), startPos);
  }

  public AnalyzedTokenReadings(List tokens, int startPos) {
    anTokReadings = tokens.toArray(new AnalyzedToken[tokens.size()]);
    this.startPos = startPos;
    token = anTokReadings[0].getToken();
    isWhitespace = StringTools.isWhitespace(token);
    isWhitespaceBefore = anTokReadings[0].isWhitespaceBefore();
    isLinebreak = "\n".equals(token) || "\r\n".equals(token) || "\r".equals(token) || "\n\r".equals(token);
    isSentStart = SENTENCE_START_TAGNAME.equals(anTokReadings[0].getPOSTag());
    isParaEnd = hasPosTag(PARAGRAPH_END_TAGNAME);
    isSentEnd = hasPosTag(SENTENCE_END_TAGNAME);
    isPosTagUnknown = tokens.size() == 1 && tokens.get(0).getPOSTag() == null;
    setNoRealPOStag();
    hasSameLemmas = areLemmasSame();
  }

  AnalyzedTokenReadings(AnalyzedToken token) {
    this(Collections.singletonList(token), 0);
  }

  public List getReadings() {
    return Arrays.asList(anTokReadings);
  }

  /**
   * Get a token reading.
   * @see #getReadingsLength() getReadingsLength() for how many token readings there are
   */
  public AnalyzedToken getAnalyzedToken(int idx) {
    return anTokReadings[idx];
  }

  /**
   * Checks if the token has a particular POS tag.
   * 
   * @param posTag POS tag to look for
   */
  public boolean hasPosTag(String posTag) {
    boolean found = false;
    for (AnalyzedToken reading : anTokReadings) {
      if (reading.getPOSTag() != null) {
        found = posTag.equals(reading.getPOSTag());
        if (found) {
          break;
        }
      }
    }
    return found;
  }

  /**
   * Checks if one of the token's readings has a particular lemma.
   *
   * @param lemma lemma POS tag to look for
   */
  public boolean hasLemma(String lemma) {
    boolean found = false;
    for (AnalyzedToken reading : anTokReadings) {
      if (reading.getLemma() != null) {
        found = lemma.equals(reading.getLemma());
        if (found) {
          break;
        }
      }
    }
    return found;
  }

  /**
   * Checks if one of the token's readings has one of the given lemmas
   *
   * @param lemmas to look for
   */
  public boolean hasAnyLemma(String... lemmas) {
    boolean found = false;
    for(String lemma : lemmas) {
      for (AnalyzedToken reading : anTokReadings) {
        if (reading.getLemma() != null) {
          found = lemma.equals(reading.getLemma());
          if (found) {
            return found;
          }
        }
      }
    }
    return found;
  }

  /**
   * Checks if the token has a particular POS tag, where only a part of the given POS tag needs to match.
   *
   * @param posTag POS tag substring to look for
   * @since 1.8
   */
  public boolean hasPartialPosTag(String posTag) {
    boolean found = false;
    for (AnalyzedToken reading : anTokReadings) {
      if (reading.getPOSTag() != null) {
        found = reading.getPOSTag().contains(posTag);
        if (found) {
          break;
        }
      }
    }
    return found;
  }

 /**
  * Checks if the token has any of the given particular POS tags (only a part of the given POS tag needs to match)
  *
  * @param posTags POS tag substring to look for
  * @since 4.0
  */
  public boolean hasAnyPartialPosTag(String... posTags) {
    for (String posTag : posTags) {
      if (hasPartialPosTag(posTag)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks if the token has a POS tag starting with the given string.
   *
   * @param posTag POS tag substring to look for
   * @since 4.0
   */
  public boolean hasPosTagStartingWith(String posTag) {
    boolean found = false;
    for (AnalyzedToken reading : anTokReadings) {
      if (reading.getPOSTag() != null) {
        found = reading.getPOSTag().startsWith(posTag);
        if (found) {
          break;
        }
      }
    }
    return found;
  }

  /**
   * Checks if at least one of the readings matches a given POS tag regex.
   *
   * @param posTagRegex POS tag regular expression to look for
   * @since 2.9
   */
  public boolean matchesPosTagRegex(String posTagRegex) {
    Pattern pattern = Pattern.compile(posTagRegex);
    boolean found = false;
    for (AnalyzedToken reading : anTokReadings) {
      if (reading.getPOSTag() != null) {
        found = pattern.matcher(reading.getPOSTag()).matches();
        if (found) {
          break;
        }
      }
    }
    return found;
  }

  /**
   * Add a new reading.
   * @param token new reading, given as {@link AnalyzedToken}
   */
  public void addReading(AnalyzedToken token) {
    List l = new ArrayList<>(Arrays.asList(anTokReadings).subList(0, anTokReadings.length - 1));
    if (anTokReadings[anTokReadings.length - 1].getPOSTag() != null) {
      l.add(anTokReadings[anTokReadings.length - 1]);
    }
    token.setWhitespaceBefore(isWhitespaceBefore);
    l.add(token);
    anTokReadings = l.toArray(new AnalyzedToken[l.size()]);
    if (token.getToken().length() > this.token.length()) { //in case a longer token is added
      this.token = token.getToken();
    }
    anTokReadings[anTokReadings.length - 1].setWhitespaceBefore(isWhitespaceBefore);
    isParaEnd = hasPosTag(PARAGRAPH_END_TAGNAME);
    isSentEnd = hasPosTag(SENTENCE_END_TAGNAME);
    setNoRealPOStag();
    hasSameLemmas = areLemmasSame();
  }

  /**
   * Removes a reading from the list of readings. Note: if the token
   * has only one reading, then a new reading with an empty POS tag
   * and an empty lemma is created.
   * @param token reading to be removed
   */
  public void removeReading(AnalyzedToken token) {
    List l = new ArrayList<>();
    AnalyzedToken tmpTok = new AnalyzedToken(token.getToken(), token.getPOSTag(), token.getLemma());
    tmpTok.setWhitespaceBefore(isWhitespaceBefore);
    boolean removedSentEnd = false;
    boolean removedParaEnd = false;
    for (AnalyzedToken anTokReading : anTokReadings) {
      if (!anTokReading.matches(tmpTok)) {
        l.add(anTokReading);
      } else if (SENTENCE_END_TAGNAME.equals(anTokReading.getPOSTag())) {
        removedSentEnd = true;
      } else if (PARAGRAPH_END_TAGNAME.equals(anTokReading.getPOSTag())) {
        removedParaEnd = true;
      }
    }
    if (l.isEmpty()) {
      l.add(new AnalyzedToken(this.token, null, null));
      l.get(0).setWhitespaceBefore(isWhitespaceBefore);
    }
    anTokReadings = l.toArray(new AnalyzedToken[l.size()]);
    setNoRealPOStag();
    if (removedSentEnd) {
      isSentEnd = false;
      setSentEnd();
    }
    if (removedParaEnd) {
      isParaEnd = false;
      setParagraphEnd();
    }
    hasSameLemmas = areLemmasSame();
  }

  /**
   * Removes all readings but the one that matches the token given.
   * @param token Token to be matched
   * @since 1.5
   */
  public void leaveReading(AnalyzedToken token) {
    List l = new ArrayList<>();
    AnalyzedToken tmpTok = new AnalyzedToken(token.getToken(), token.getPOSTag(), token.getLemma());
    tmpTok.setWhitespaceBefore(isWhitespaceBefore);
    for (AnalyzedToken anTokReading : anTokReadings) {
      if (anTokReading.matches(tmpTok)) {
        l.add(anTokReading);
      }
    }
    if (l.isEmpty()) {
      l.add(new AnalyzedToken(this.token, null, null));
      l.get(0).setWhitespaceBefore(isWhitespaceBefore);
    }
    anTokReadings = l.toArray(new AnalyzedToken[l.size()]);
    setNoRealPOStag();
    hasSameLemmas = areLemmasSame();
  }

  /**
   * Number of readings.
   */
  public int getReadingsLength() {
    return anTokReadings.length;
  }

  public boolean isWhitespace() {
    return isWhitespace;
  }

  /**
   * Returns true if the token equals {@code \n}, {@code \r}, {@code \n\r}, or {@code \r\n}.
   */
  public boolean isLinebreak() {
    return isLinebreak;
  }

  /**
   * @since 2.3
   */
  public boolean isSentenceStart() {
    return isSentStart;
  }

  /**
   * @return true when the token is a last token in a paragraph.
   * @since 2.3
   */
  public boolean isParagraphEnd() {
    return isParaEnd;
  }

  /**
   * Add a reading with a paragraph end token unless this is already a paragraph end.
   * @since 2.3
   */
  public void setParagraphEnd() {
    if (!isParagraphEnd()) {
      AnalyzedToken paragraphEnd = new AnalyzedToken(getToken(),
          PARAGRAPH_END_TAGNAME, getAnalyzedToken(0).getLemma());
      addReading(paragraphEnd);
    }
  }

  /**
   * @return true when the token is a last token in a sentence.
   * @since 2.3
   */
  public boolean isSentenceEnd() {
    return isSentEnd;
  }

  /**
   * @return true if the token is LibreOffice/OpenOffice field code.
   * @since 0.9.9
   */
  public boolean isFieldCode() {
    return "\u0001".equals(token) || "\u0002".equals(token);
  }

  /**
   * Add a SENT_END tag.
   */
  public void setSentEnd() {
    if (!isSentenceEnd()) {
      AnalyzedToken sentenceEnd = new AnalyzedToken(getToken(),
          SENTENCE_END_TAGNAME, getAnalyzedToken(0).getLemma());
      addReading(sentenceEnd);
    }
  }

  public int getStartPos() {
    return startPos;
  }

  /** @since 2.9 */
  public int getEndPos() {
    return startPos + token.length();
  }

  public void setStartPos(int position) {
    startPos = position;
  }

  public String getToken() {
    return token;
  }

  public void setWhitespaceBefore(boolean isWhiteSpaceBefore) {
    isWhitespaceBefore = isWhiteSpaceBefore;
    for (AnalyzedToken aTok : anTokReadings) {
      aTok.setWhitespaceBefore(isWhiteSpaceBefore);
    }
  }

  public boolean isWhitespaceBefore() {
    return isWhitespaceBefore;
  }

  public void immunize() {
    isImmunized = true;
  }

  public boolean isImmunized() {
    return isImmunized;
  }

  /**
   * Make the token ignored by all spelling rules.
   * @since 2.5
   */
  public void ignoreSpelling() {
    isIgnoredBySpeller = true;
  }

  /**
   * Test if the token can be ignored by spelling rules.
   * @return true if the token should be ignored.
   * @since 2.5
   */
  public boolean isIgnoredBySpeller() {
    return isIgnoredBySpeller;
  }

  /**
   * Test if the token's POStag equals null.
   * @return true if the token does not have a POStag
   * @since 3.9
   */
  public boolean isPosTagUnknown() {
    return isPosTagUnknown;
  }

  /**
   * Sets the flag on AnalyzedTokens to make matching
   * on {@code UNKNOWN} POS tag correct in the Element class.
   */
  private void setNoRealPOStag() {
    boolean hasNoPOStag = !isLinebreak();
    for (AnalyzedToken an: anTokReadings) {
      String posTag = an.getPOSTag();
      if (PARAGRAPH_END_TAGNAME.equals(posTag) ||
          SENTENCE_END_TAGNAME.equals(posTag)) {
        continue;
      }
      if (posTag != null) {
        hasNoPOStag = false;
      }
    }
    for (AnalyzedToken an: anTokReadings) {
      an.setNoPOSTag(hasNoPOStag);
    }
  }

  /**
   * Used to track disambiguator actions.
   * @return the historicalAnnotations
   */
  public String getHistoricalAnnotations() {
    return historicalAnnotations;
  }

  /**
   * Used to track disambiguator actions.
   * @param historicalAnnotations the historicalAnnotations to set
   */
  public void setHistoricalAnnotations(String historicalAnnotations) {
    this.historicalAnnotations = historicalAnnotations;
  }

  /**
   * @since 2.3
   */
  public void setChunkTags(List chunkTags) {
    this.chunkTags = Objects.requireNonNull(chunkTags);
  }

  /**
   * @since 2.3
   */
  public List getChunkTags() {
    return chunkTags;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(token);
    sb.append('[');
    for (AnalyzedToken element : anTokReadings) {
      sb.append(element);
      if (!element.isWhitespaceBefore()) {
        sb.append('*');
      }
      sb.append(',');
    }
    sb.delete(sb.length() - 1, sb.length());
    if (chunkTags.size() > 0) {
      sb.append(',');
      sb.append(StringUtils.join(chunkTags, "|"));
    }
    sb.append(']');
    if (isImmunized()) {
      sb.append("{!},");
    }
    return sb.toString();
  }

  /**
   * @return true if AnalyzedTokenReadings has some real POS tag (= not null or a special tag)
   * @since 2.3
   */
  public boolean isTagged() {
    for (AnalyzedToken element : anTokReadings) {
      if (!element.hasNoTag()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Used to configure the internal variable for lemma equality.
   * @return true if all {@link AnalyzedToken} lemmas are the same.
   * @since 2.5
   */
  private boolean areLemmasSame() {
    String previousLemma = anTokReadings[0].getLemma();
    if (previousLemma == null) {
      for (AnalyzedToken element : anTokReadings) {
        if (element.getLemma() != null) {
          return false;
        }
      }
      return true;
    }
    for (AnalyzedToken element : anTokReadings) {
      if (!previousLemma.equals(element.getLemma())) {
        return false;
      }
    }
    return true;
  }

  /**
   * Used to optimize pattern matching.
   * 
   * @return true if all {@link AnalyzedToken} lemmas are the same.
   */
  public boolean hasSameLemmas() {
    return hasSameLemmas;
  }

  @Override
  public int hashCode() {
    return Arrays.hashCode(anTokReadings) +
           Objects.hash(isLinebreak, isParaEnd, isSentEnd, isSentStart, isWhitespace, isWhitespaceBefore, chunkTags, startPos, token);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) { return true; }
    if (obj == null) { return false; }
    if (getClass() != obj.getClass()) {
      return false;
    }
    AnalyzedTokenReadings other = (AnalyzedTokenReadings) obj;
    return new EqualsBuilder()
      .append(anTokReadings, other.anTokReadings)
      .append(isLinebreak, other.isLinebreak)
      .append(isParaEnd, other.isParaEnd)
      .append(isSentEnd, other.isSentEnd)
      .append(isSentStart, other.isSentStart)
      .append(isWhitespace, other.isWhitespace)
      .append(isWhitespaceBefore, other.isWhitespaceBefore)
      .append(isImmunized, other.isImmunized)
      .append(startPos, other.startPos)
      .append(chunkTags, other.chunkTags)
      .append(hasSameLemmas, other.hasSameLemmas)
      .append(isIgnoredBySpeller, other.isIgnoredBySpeller)
      .append(token, other.token)
      .isEquals();
  }

  /**
   * @since 2.3
   */
  @Override
  public Iterator iterator() {
    AtomicInteger i = new AtomicInteger(0);
    return new Iterator() {
      @Override
      public boolean hasNext() {
        return i.get() < getReadingsLength();
      }
      @Override
      public AnalyzedToken next() {
        try {
          return anTokReadings[i.getAndAdd(1)];
        } catch (ArrayIndexOutOfBoundsException e) {
          throw new NoSuchElementException("No such element: " + i + ", element count: " + anTokReadings.length);
        }
      }
      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }
}