org.languagetool.rules.ParagraphRepeatBeginningRule Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;

import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;

/**
 * Check if to paragraphs begin with the same word.
 * If the first word is an article it checks if the first two words are identical
 * 
 * @author Fred Kruse
 * @since 4.1
 */
public class ParagraphRepeatBeginningRule extends TextLevelRule {

  public ParagraphRepeatBeginningRule(ResourceBundle messages) {
    super(messages);
    super.setCategory(Categories.STYLE.getCategory(messages));
    setLocQualityIssueType(ITSIssueType.Style);
    setDefaultOff();
  }

  @Override
  public String getId() {
    return "PARAGRAPH_REPEAT_BEGINNING_RULE";
  }

  @Override
  public String getDescription() {
    return messages.getString("repetition_paragraph_beginning_desc");
  }
  
  private static int getParagraphBegin(AnalyzedTokenReadings[] tokens, int start) {
    if (start < 1) {
      start = 1;
    }
    for(int i = start; i < tokens.length; i++) {
      if ("\n".equals(tokens[i].getToken()) || "\r\n".equals(tokens[i].getToken()) || "\n\r".equals(tokens[i].getToken())) {
        for (i++; i < tokens.length && tokens[i].isWhitespace(); i++);
        return i;
      }
    }
    return -1;
  }
  
  
  public boolean isArticle(AnalyzedTokenReadings token) {
    return token.hasPosTagStartingWith("DT");
  }
  
  private int numCharEqualBeginning(AnalyzedTokenReadings[] tokens, int start, AnalyzedTokenReadings[] nextTokens, int nextStart) throws IOException {
    if(tokens.length < 2 || nextTokens.length < 2) {
      return 0;
    }
    int i = start;
    if (i < 1) {
      i = 1;
    }
    for (; i < tokens.length && (tokens[i].isWhitespace() || tokens[i].isSentenceStart()); i++);
    if (i >= tokens.length) {
      return 0;
    }
    String token = tokens[i].getToken();
    if (token.length() == 1) {
      return 0;
    }
    int j = nextStart;
    if (j < 1) {
      j = 1;
    }
    for (; j < nextTokens.length && (nextTokens[j].isWhitespace() || nextTokens[j].isSentenceStart()); j++);
    if (j >= nextTokens.length) {
      return 0;
    }
    String nextToken = nextTokens[j].getToken();
    if (token.equals(nextToken)) {
      if (!isArticle(tokens[i])) {
        return tokens[i].getEndPos() - tokens[i].getStartPos();
      } else {
        int startPos = tokens[i].getStartPos();
        for (i++; i < tokens.length && tokens[i].isWhitespace(); i++);
        if (i >= tokens.length) {
          return 0;
        }
        for (j++; j < nextTokens.length && nextTokens[j].isWhitespace(); j++);
        if (j >= nextTokens.length) {
          return 0;
        }
        token = tokens[i].getToken();
        nextToken = nextTokens[j].getToken();
        if (token.equals(nextToken) && token.length() > 1) {
          return tokens[i].getEndPos() - startPos;
        }
      }
    }
    return 0;

  }

  @Override
  public RuleMatch[] match(List sentences) throws IOException {
    List ruleMatches = new ArrayList<>();
    if (sentences.size() < 1) {
      return toRuleMatchArray(ruleMatches);
    }
    int pos = 0;
    int nextPos = 0;
    int lastParaBegin = 1;
    int paraBegin = -1;
    int nextParaBegin = 1;
    int numNextSentence = 0;
    AnalyzedTokenReadings[] lastTokens = null;
    AnalyzedSentence nextSentence = sentences.get(0);
    AnalyzedSentence sentence = null;
    AnalyzedTokenReadings[] tokens = null;
    AnalyzedTokenReadings[] nextTokens = nextSentence.getTokens();
    while (numNextSentence < sentences.size()) {
      if (numNextSentence > 0 || paraBegin >= 0) {
        lastTokens = tokens;
        lastParaBegin = paraBegin;
      }
      sentence = nextSentence;
      tokens = nextTokens;
      paraBegin = nextParaBegin;
      nextPos = 0;
      boolean isPara = false;
      while (numNextSentence < sentences.size() && !isPara) {
        nextParaBegin = getParagraphBegin(nextTokens, nextParaBegin);
        if(nextParaBegin >= 0) {
          isPara = true;
        }
        if (nextParaBegin < 0 || nextParaBegin >= nextTokens.length) {
          numNextSentence++;
          if(numNextSentence < sentences.size()) {
            if(nextParaBegin >= nextTokens.length) {
              nextParaBegin = 1;
            }
            nextPos += nextSentence.getText().length();
            nextSentence = sentences.get(numNextSentence);
            nextTokens = nextSentence.getTokens();
          } else {
            nextTokens = null;
          }
        }
      }
      if (tokens.length > 2) {
        int endPos = 0;
        int num = 0;
        num = paraBegin;
        if (num < 1) {
          num = 1;
        }
        if (lastTokens != null) {
          endPos = numCharEqualBeginning(tokens, paraBegin, lastTokens, lastParaBegin);
          if (endPos > 0) {
            for(; tokens[num].isWhitespace() || tokens[num].isSentenceStart(); num++);
            int startPos = pos + tokens[num].getStartPos();
            String msg = messages.getString("repetition_paragraph_beginning_last_msg");
            RuleMatch ruleMatch = new RuleMatch(this, sentence, startPos, startPos+endPos, msg);
            ruleMatches.add(ruleMatch);
          }
        }
        if (endPos == 0 && nextTokens != null) {
          endPos = numCharEqualBeginning(tokens, paraBegin, nextTokens, nextParaBegin);
          if (endPos > 0) {
            for(; tokens[num].isWhitespace() || tokens[num].isSentenceStart(); num++);
            int startPos = pos + tokens[num].getStartPos();
            String msg = messages.getString("repetition_paragraph_beginning_next_msg");
            RuleMatch ruleMatch = new RuleMatch(this, sentence, startPos, startPos+endPos, msg);
            ruleMatches.add(ruleMatch);
          }
        }
      }
      pos += nextPos;
    }
    return toRuleMatchArray(ruleMatches);
  }

}