All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.rules.en.EnglishWordRepeatRule Maven / Gradle / Ivy

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.en;

import java.util.ResourceBundle;
import java.util.regex.Pattern;

import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.rules.Example;
import org.languagetool.rules.WordRepeatRule;

/**
 * Word repeat rule for English, to avoid false alarms in the generic word repetition rule.
 */
public class EnglishWordRepeatRule extends WordRepeatRule {

  private static final Pattern SINGLE_CHAR = Pattern.compile("(?i)^[a-z]$");

  public EnglishWordRepeatRule(ResourceBundle messages, Language language) {
    super(messages, language);
    addExamplePair(Example.wrong("This is is just an example sentence."),
                   Example.fixed("This is just an example sentence."));
  }

  @Override
  public String getId() {
    return "ENGLISH_WORD_REPEAT_RULE";
  }

  @Override
  public boolean ignore(AnalyzedTokenReadings[] tokens, int position) {
    if (position == 0) {
      return false;
    }

    // TODO:
    // What that is is a ...
    // but you you're my best friend ...
    // I'm so so happy
    // I'm very very happy
    String word = tokens[position].getToken();

    if ((repetitionOf("did", tokens, position) || repetitionOf("do", tokens, position)
        || repetitionOf("does", tokens, position)) && (position + 1 < tokens.length)
        && tokens[position + 1].getToken().equalsIgnoreCase("n't")) {
      return true;
    } else if (repetitionOf("her", tokens, position) && posIsIn(tokens, position - 2, "VB", "VBP", "VBZ", "VBG", "VBD", "VBN") && posIsIn(tokens, position + 1, "NN", "NNS", "NN:U", "NN:UN", "NNP")) {
      return true;   // "Please pass her her phone."
    } else if (repetitionOf("had", tokens, position) && posIsIn(tokens, position - 2, "PRP", "NN")) {
      return true;   // "If I had had time, I would have gone to see him."
    } else if (repetitionOf("that", tokens, position) && posIsIn(tokens, position+1, "MD", "NN", "PRP$", "JJ", "VBZ", "VBD")) {
      return true;   // "I don't think that that is a problem."
    } else if (repetitionOf("can", tokens, position) && posIsIn(tokens, position-1, "NN")) {
      return true; // "The can can hold the water."
    } else if (repetitionOf("hip", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("hooray")) {
      return true;
    } else if (repetitionOf("bam", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("bigelow")) {
      return true;
    } else if (repetitionOf("wild", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("west")) {
      return true; // In the wild wild west (https://en.wikipedia.org/wiki/Wild_Wild_West)
    } else if (repetitionOf("far", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("away")) {
      return true;
    } else if (repetitionOf("so", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("much")) {
      return true;
    } else if (repetitionOf("so", tokens, position) && (position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase("many")) {
      return true;
    } else if (repetitionOf("s", tokens, position) && position > 1 && tokens[position - 2].getToken().matches("['’`´‘]")) {
      return true; // It's S.T.E.A.M.
    } else if (repetitionOf("in", tokens, position) && position > 2 && tokens[position - 3].getToken().matches("log(ged|s)?|sign(ed|s)?")) {
      return true; // log them in in the
    } else if (repetitionOf("in", tokens, position) && position > 1 && tokens[position - 2].getToken().matches("log(ged|s)?|sign(ed|s)?")) {
      return true; // log in in the
    } else if (repetitionOf("a", tokens, position) && position > 1 && tokens[position - 2].getToken().equals(".")) {
      return true; // "a.k.a a"
    } else if (repetitionOf("on", tokens, position) && position > 1 && tokens[position - 2].getToken().equals(".")) {
      return true; // "You can contact E.ON on Instagram"
    } else if (tokens[position - 1].getToken().equalsIgnoreCase(word) && (((position + 1 < tokens.length) && tokens[position + 1].getToken().equalsIgnoreCase(word)) || (position > 1 && tokens[position - 2].getToken().equalsIgnoreCase(word)))) {
      // three time word repetition
      return true;
    } else if (SINGLE_CHAR.matcher(tokens[position].getToken()).matches() && position > 1 &&
        SINGLE_CHAR.matcher(tokens[position - 2].getToken()).matches() &&
        (position + 1 < tokens.length) && SINGLE_CHAR.matcher(tokens[position + 1].getToken()).matches()) {
      // spelling with spaces in between: "b a s i c a l l y"
      return true;
    } else if (repetitionOf("blah", tokens, position)) {
      return true;   // "blah blah"
    } else if (repetitionOf("mau", tokens, position)) {
      return true;   // "blah blah"
    } else if (repetitionOf("uh", tokens, position)) {
      return true;   // "uh uh"
    } else if (repetitionOf("paw", tokens, position)) {
      return true;   // "paw paw"
    } else if (repetitionOf("cha", tokens, position)) {
      return true;   // "cha cha"
    } else if (repetitionOf("yum", tokens, position)) {
      return true;   // "yum yum"
    } else if (repetitionOf("wop", tokens, position)) {
      return true;   // "wop wop"
    } else if (repetitionOf("woop", tokens, position)) {
      return true;   // "woop woop"
    } else if (repetitionOf("fnarr", tokens, position)) {
      return true;   // "fnarr fnarr" https://www.lexico.com/definition/fnarr_fnarr
    } else if (repetitionOf("fnar", tokens, position)) {
      return true;   // "fnar fnar"
    } else if (repetitionOf("ha", tokens, position)) {
      return true;   // "ha ha"
    } else if (repetitionOf("omg", tokens, position)) {
      return true;   // "omg omg"
    } else if (repetitionOf("boo", tokens, position)) {
      return true;   // "boo boo"
    } else if (repetitionOf("tick", tokens, position)) {
      return true;   // "tick tick"
    } else if (repetitionOf("twinkle", tokens, position)) {
      return true;   // "twinkle twinkle little star"
    } else if (repetitionOf("ta", tokens, position)) {
      return true;
    } else if (repetitionOf("la", tokens, position)) {
      return true;
    } else if (repetitionOf("x", tokens, position)) {
      return true;
    } else if (repetitionOf("hi", tokens, position)) {
      return true;   // "hi hi"
    } else if (repetitionOf("ho", tokens, position)) {
      return true;   // "ho ho"
    } else if (repetitionOf("heh", tokens, position)) {
      return true;
    } else if (repetitionOf("jay", tokens, position)) {
      return true; // Jay Jay (name)
    } else if (repetitionOf("walla", tokens, position)) {
      return true; // Walla Walla is a city in Washington State
    } else if (repetitionOf("sri", tokens, position)) {
      return true; // Sri Sri (name)
    } else if (repetitionOf("hey", tokens, position)) {
      return true;
    } else if (repetitionOf("hah", tokens, position)) {
      return true;
    } else if (repetitionOf("heh", tokens, position)) {
      return true;
    } else if (repetitionOf("oh", tokens, position)) {
      return true;
    } else if (repetitionOf("ouh", tokens, position)) {
      return true;
    } else if (repetitionOf("chop", tokens, position)) {
      return true;
    } else if (repetitionOf("ring", tokens, position)) {
      return true;
    } else if (repetitionOf("beep", tokens, position)) {
      return true;
    } else if (repetitionOf("bleep", tokens, position)) {
      return true;
    } else if (repetitionOf("yeah", tokens, position)) {
      return true;
    } else if (repetitionOf("wait", tokens, position) && position == 2) {
      return true;
    } else if (repetitionOf("quack", tokens, position)) {
      return true;
    } else if (repetitionOf("meow", tokens, position)) {
      return true;
    } else if (repetitionOf("squawk", tokens, position)) {
      return true;
    } else if (repetitionOf("whoa", tokens, position)) {
      return true;
    } else if (repetitionOf("si", tokens, position)) {
      return true;
    } else if (repetitionOf("honk", tokens, position)) {
      return true;
    } else if (repetitionOf("brum", tokens, position)) {
      return true;
    } else if (repetitionOf("chi", tokens, position)) {
      // name
      return true;
    } else if (repetitionOf("santorio", tokens, position)) {
      // name
      return true;
    } else if (repetitionOf("lapu", tokens, position)) {
      // city
      return true;
    } else if (repetitionOf("chow", tokens, position)) {
      // dog breed https://en.wikipedia.org/wiki/Chow_Chow
      return true;
    } else if (repetitionOf("beep", tokens, position)) {
      return true;
    } else if (repetitionOf("shh", tokens, position)) {
      return true;
    } else if (repetitionOf("yummy", tokens, position)) {
      return true;
    } else if (repetitionOf("boom", tokens, position)) {
      return true;
    } else if (repetitionOf("bye", tokens, position)) {
      return true;
    } else if (repetitionOf("ah", tokens, position)) {
      return true;
    } else if (repetitionOf("aah", tokens, position)) {
      return true;
    } else if (repetitionOf("bang", tokens, position)) {
      return true;
    } else if (repetitionOf("woof", tokens, position)) {
      return true;
    } else if (repetitionOf("wink", tokens, position)) {
      return true;
    } else if (repetitionOf("yes", tokens, position)) {
      return true;
    } else if (repetitionOf("tsk", tokens, position)) {
      return true;
    } else if (repetitionOf("hush", tokens, position)) {
      return true;
    } else if (repetitionOf("ding", tokens, position)) {
      return true;
    } else if (repetitionOf("choo", tokens, position)) {
      return true;
    } else if (repetitionOf("miu", tokens, position)) {
      return true;
    } else if (repetitionOf("tuk", tokens, position)) {
      return true;
    } else if (repetitionOf("yadda", tokens, position)) {
      return true;   // "yadda yadda"
    } else if (repetitionOf("walla", tokens, position)) {
      return true;   // "walla walla"
    } else if (repetitionOf("doo", tokens, position)) {
      return true;   // "doo doo"
    } else if (repetitionOf("sapiens", tokens, position)) {
      return true;   // "Homo sapiens sapiens"
    } else if (repetitionOf("tse", tokens, position)) {
      return true;   // "tse tse"
    } else if (repetitionOf("no", tokens, position)) {
      return true;   // "no no"
    } else if (tokens[position].getToken().endsWith("ay")) {
      if (tokens[position - 1].getToken().equals("may") && tokens[position].getToken().equals("May")) {
        return true;   // "may May"
      }
      if (tokens[position - 1].getToken().equals("May") && tokens[position].getToken().equals("may")) {
        return true;   // "May may"
      }
      if (tokens[1].getToken().equals("May") && tokens[2].getToken().equals("May")) {
        return true;   // "May May" SENT_START
      }
    } else if (tokens[position].getToken().endsWith("ill")) {
      return (position > 0 && tokens[position - 1].getToken().equals("will") && tokens[position].getToken().equals("Will")) // will Wills
        || (tokens[position - 1].getToken().equals("Will") && tokens[position].getToken().equals("will")) // Will will ...
        || (tokens[1].getToken().equals("Will") && tokens[2].getToken().equals("Will")); // "Will Will" SENT_START
    }
    return super.ignore(tokens, position);
  }

  private boolean posIsIn(AnalyzedTokenReadings[] tokens, int position, String... posTags) {
    if (position >= 0 && position < tokens.length) {
      for (String posTag : posTags) {
        if (tokens[position].hasPartialPosTag(posTag)) {
          return true;
        }
      }
    }
    return false;
  }

  private boolean repetitionOf(String word, AnalyzedTokenReadings[] tokens, int position) {
    return position > 0 && tokens[position - 1].getToken().equalsIgnoreCase(word) && tokens[position].getToken().equalsIgnoreCase(word);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy