All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.rules.en.UpperCaseNgramRule Maven / Gradle / Ivy

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2020 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.en;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.LinguServices;
import org.languagetool.UserConfig;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.*;
import org.languagetool.rules.ngrams.Probability;
import org.languagetool.rules.patterns.PatternToken;
import org.languagetool.rules.patterns.PatternTokenBuilder;
import org.languagetool.rules.spelling.CachingWordListLoader;
import org.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.util.*;
import java.util.function.Supplier;
import java.util.regex.Pattern;

import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.pos;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.csRegex;

/**
 * Finds some(!) words written uppercase that should be spelled lowercase.
 * @since 5.0
 */
public class UpperCaseNgramRule extends Rule {

  public static final int THRESHOLD = 50;

  private static MorfologikAmericanSpellerRule spellerRule = null;
  private static LinguServices linguServices = null;

  private static final Pattern PUNCT_PATTERN = Pattern.compile("[.!?:]");
  private static final Set exceptions = new HashSet<>(Arrays.asList(
    "Bin", "Spot",  // names
    "Go",           // common usage, as in "Go/No Go decision"
    "French", "Roman", "Hawking", "Square", "Japan", "Premier", "Allied"
  ));
  private static final Pattern TYPICAL_LOWERCASE = Pattern.compile("and|or|the|of|on|with|to|it|in|for|as|at|his|her|its|into|&|/");

  private static AhoCorasickDoubleArrayTrie exceptionTrie = null;
  private static final List> ANTI_PATTERNS = Arrays.asList(
    Arrays.asList(
      token("Hugs"), token("and"), token("Kisses")
    ),
    Arrays.asList( // Please go to File and select Options. 
      token("go"),
      token("to"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].+"),
      token(","),
      tokenRegex("[Aa]nd|[Oo]r|&"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // "The goal is to Develop, Discuss and Learn.""
      tokenRegex("[A-Z].+"),
      token(","),
      tokenRegex("[A-Z].+"),
      tokenRegex("[Aa]nd|[Oo]r|&|,"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // "The goal is to Develop, Discuss and Learn.""
      tokenRegex("[A-Z].+"),
      token(")"),
      token(","),
      tokenRegex("[A-Z].+"),
      tokenRegex("[Aa]nd|[Oo]r|&|,"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      csRegex("[A-Z].+"),
      new PatternTokenBuilder().token("-").min(0).build(),
      token(">"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList(
      csRegex("[A-Z].+"),
      tokenRegex("[→⇾⇉⇒]"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList(
      csRegex("[A-Z].+"),
      tokenRegex("[’']"),
      csRegex("s")
    ),
    Arrays.asList(
      pos("SENT_START"), // Two-word phrases with "?" or "!": "What Happened?", "Catch Up!" (can be headlines)
      tokenRegex("[A-Z].+"),
      tokenRegex("[A-Z].+"),
      tokenRegex("[\\!\\?]")
    ),
    Arrays.asList(
      pos("SENT_START"), // Step1 - Watch the full episode.
      tokenRegex(".*\\w.*"),
      tokenRegex("-|–"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // Step 1 - Watch the full episode.
      tokenRegex(".*\\w.*"),
      tokenRegex("[0-9]+"),
      tokenRegex("-|–"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // Markdowm headline # Show some
      token("#"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // Markdowm headline ## Show some
      token("#"),
      token("#"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // Markdowm headline ## Show some
      token("#"),
      token("#"),
      token("#"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // Scene 4, Lines 93-96
      tokenRegex("[A-Z].+"),
      tokenRegex("\\d+"),
      tokenRegex("-|–|,"),
      tokenRegex("[A-Z].+"),
      tokenRegex("\\d+")
    ),
    Arrays.asList( // 1.- Sign up for ...
      tokenRegex("\\d+"),
      token("."),
      tokenRegex("-|–"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // French Quote + Uppercase word
      tokenRegex("«"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // H1 What's wrong?
      tokenRegex("H[1-6]"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // ii) Enabling you to ...
      tokenRegex("[a-z]{1,2}"),
      token(")"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // bullet point
      token("•"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // Dash + Uppercase word
      tokenRegex("-|–"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("Step|Grade|Phase|Reason"), // I finished Step 6
      tokenRegex("\\d+")
    ),
    Arrays.asList(
      tokenRegex("the|our|their"), // Let's talk to the Onboarding team.
      tokenRegex("[A-Z].+"),
      tokenRegex("team|department")
    ),
    Arrays.asList(
      pos("SENT_START"), // 12.3 Game.
      tokenRegex("\\d+"),
      tokenRegex("\\.|/"),
      tokenRegex("\\d+"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // Lesson #1 - Learn the alphabet.
      tokenRegex(".*\\w.*"),
      token("#"),
      tokenRegex("[0-9]+"),
      tokenRegex("-|–"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      token("BBC"),
      token("Culture")
    ),
    Arrays.asList(
      token("Time"),
      tokenRegex("magazines?")
    ),
    Arrays.asList( // name of TV series
      token("Dublin"),
      token("Murders")
    ),
    Arrays.asList(
      token("Amazon"),
      token("Live")
    ),
    Arrays.asList( // Company name
      token("Volvo"),
      token("Buses")
    ),
    Arrays.asList( // video game
      token("Heavy"),
      token("Rain")
    ),
    Arrays.asList(
      csRegex("[A-Z].+"),
      token("/"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // "Order #76540"
      csRegex("[A-Z].+"),
      token("#"),
      tokenRegex("\\d+")
    ),
    Arrays.asList( // "He plays games at Games.co.uk."
      csRegex("[A-Z].+"),
      token("."),
      tokenRegex("com?|de|us|gov|net|info|org|es|mx|ca|uk|at|ch|it|pl|ru|nl|ie|be|fr|ai|dev|io|pt|mil|club|jp|es|se|dk|no")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].+"),  // He's Ben (Been)
      token("("),
      tokenRegex("[A-Z].+"),
      token(")")
    ),
    Arrays.asList(
      token("["),
      tokenRegex("[A-Z].+"),
      token("]")
    ),
    Arrays.asList(
      token("Pay"),
      token("per"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("Hi|Hello|Heya?"),
      token(","),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // "C stands for Curse."
      tokenRegex("[A-Z]"),
      tokenRegex("is|stands"),
      token("for"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      pos("SENT_START"), // The Story: (short headlines with colon)
      tokenRegex("[A-Z].+"),
      tokenRegex("[A-Z].+"),
      token(":")
    ),
    Arrays.asList(
      pos("SENT_START"), // Stop & Jot: (short headlines with colon)
      tokenRegex("[A-Z].+"),
      token("&"),
      tokenRegex("[A-Z].+"),
      token(":")
    ),
    Arrays.asList(
      pos("SENT_START"), // Easy to Use: (short headlines with colon)
      tokenRegex("[A-Z].+"),
      tokenRegex("[a-z].+"),
      tokenRegex("[A-Z].+"),
      token(":")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].+"),  // e.g. "Top 10% Lunch Deals"
      tokenRegex("\\d+%?"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("[0-9]+"),  // e.g. "6) Have a beer"
      tokenRegex("[)\\]]"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("[a-z]"),  // e.g. "a) Have a beer"
      tokenRegex("[)\\]]"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("[\\(\\]]"),  // e.g. "(b) Have a beer"
      tokenRegex("[a-z0-9]"),
      tokenRegex("[)\\]]")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].+"),  // e.g. "Freelance 2.0"
      tokenRegex("[0-9]+"),
      tokenRegex("."),
      tokenRegex("[0-9]+")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].*"),  // e.g. "You Don't Know" or "Kuiper’s Belt"
      tokenRegex("['’`´‘]"),
      tokenRegex("t|d|ve|s|re|m|ll"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("[A-Z].+"),  // e.g. "Culture, People , Nature", probably a title
      token(","),
      tokenRegex("[A-Z].+"),
      token(","),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      tokenRegex("The"),  // e.g. "The Sea is Watching", probably a title
      tokenRegex("[A-Z].+"),
      token("is"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      token("Professor"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList(
      token("Time"),
      token("magazine")
    ),
    Arrays.asList( // My name is Gentle.
      token("name"),
      token("is"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // They called it Greet.
      tokenRegex("calls?|called|calling|name[ds]?|naming"),
      token("it|him|her|them|me|us|that|this"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // ... to something called Faded
      tokenRegex("some(thing|body|one)"),
      tokenRegex("called|named"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // It is called Ranked mode
      csRegex("is|was|been|were|are"),
      csRegex("calls?|called|calling|name[ds]?|naming"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // What is Foreshadowing?
      tokenRegex("Who|What"),
      tokenRegex("is|are|was|were"),
      tokenRegex("[A-Z].+"),
      token("?")
    ),
    Arrays.asList( // His name is Carp.
      token("name"),
      tokenRegex("is|was"),
      tokenRegex("[A-Z].+")
    ),
    Arrays.asList( // FDM Group
      tokenRegex("[A-Z].*"),
      token("Group")
    ),
    Arrays.asList( // Enter key
      tokenRegex("Enter|Escape|Shift|Control|Meta|Backspace"),
      token("key")
    ),
    Arrays.asList( // Victor or Rabbit as everyone calls him.
      pos("NNP"),
      tokenRegex("or|and|&"),
      tokenRegex("[A-Z].*")
    ),
    Arrays.asList( // Hashtags
      token("#"),
      tokenRegex("[A-Z].*")
    ),
    Arrays.asList(
      tokenRegex("Teams|Maps|Canvas|Remind|Tile|Switch|Gems?|Glamour|Divvy|Solo|Splash|Phrase||Spotlight|Outreach|Grab") // Microsoft Teams, Google Maps, Remind App, Nintendo Switch (not tagged as NNP), Gems (Ruby Gems)
    ),
    Arrays.asList(
      pos("SENT_START"), // Music and Concepts.
      tokenRegex("[A-Z].*"),
      tokenRegex("or|and|&"),
      tokenRegex("[A-Z].*"),
      pos("SENT_END")
    ),
    Arrays.asList( // Please click Send
      csRegex("click(ed|s)?|type(d|s)|hit"),
      tokenRegex("[A-Z].*")
    ),
    Arrays.asList( // Please click on Send
      csRegex("click(ed|s)?"),
      tokenRegex("on|at"),
      tokenRegex("[A-Z].*")
    ),
    Arrays.asList( // Chronicle of a Death Foretold
      csRegex("Chronicle"),
      token("of"),
      tokenRegex("the|an?"),
      tokenRegex("[A-Z].*")
    ),
    Arrays.asList( // Please see Question 2, 
      csRegex("[A-Z].*"),
      tokenRegex("\\d+")
    ),
    Arrays.asList( // Please see Question #2, 
      csRegex("[A-Z].*"),
      token("#"),
      tokenRegex("\\d+")
    ),
    Arrays.asList( // company departments used like proper nouns
      csRegex("Finance|Marketing|Engineering|Controlling|Support|Accounting")
    ),
    Arrays.asList( // They used Draft.js to solve it.
      csRegex("[A-Z].*"),
      token("."),
      tokenRegex("js")
    ),
    Arrays.asList( // And mine is Wed.
      csRegex("Wed")
    ),
    Arrays.asList( // Ender's Game
      new PatternTokenBuilder().posRegex("NN.*").csTokenRegex("[A-Z].+").build(),
      token("'s"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Title Case: How to Become an Millionaire
      csRegex("How"),
      csRegex("to"),
      new PatternTokenBuilder().pos("VB").csTokenRegex("[A-Z].+").build(),
      csRegex("an?|my|y?our|her|his|the|from|by|about"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Do
      csRegex("Do|Does|Did|Can|[CW]ould"),
      csRegex("n't"),
      new PatternTokenBuilder().pos("VB").csTokenRegex("[A-Z].+").build(),
      pos("IN"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Title Case: How to Become an Millionaire
      csRegex("Let"),
      csRegex("'s"),
      new PatternTokenBuilder().pos("VB").csTokenRegex("[A-Z].+").build(),
      pos("IN"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Keys
      csRegex("Enter|Return|Escape|Shift")
    ),
    Arrays.asList( // You Can't Judge a Book by the Cover
      csRegex("[A-Z].+"),
      csRegex("Ca|Wo|Do|Should|[CW]ould|Must|Did|Does|Need"),
      csRegex("n't"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // You Can't Judge a Book by the Cover
      csRegex("Ca|Wo|Do|Should|[CW]ould|Must|Did|Does|Need"),
      csRegex("n't"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList(
      token("="),
      csRegex("[A-Z].+")
    ),
    Arrays.asList(
      csRegex("Peters")
    ),
    Arrays.asList( // What Does an Effective Cover Letter Look Like?
      csRegex("Who|What|When|Where|Why|How"),
      csRegex("Is|Are|Was|Were|Do(es)?|Have|Has"),
      new PatternTokenBuilder().pos("DT").min(0).build(),
      csRegex("[A-Z].+"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // PARENTHESES: (4 hrs/wk) Manage all IT affairs / Exercise (Engage in exercises...
      tokenRegex("\\(|\\)"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Arrows
      token("-"),
      token(">"),
      csRegex("[A-Z].+")
    ),
    Arrays.asList( // Quotes
      new PatternTokenBuilder().tokenRegex("[\"“”„]").setSkip(-1).build(),
      new PatternTokenBuilder().tokenRegex("[A-Z].+").setSkip(-1).build(),
      tokenRegex("[\"“”„]")
    )
  );

  private final Language lang;
  private final LanguageModel lm;
  private final Supplier> antiPatterns;

  public UpperCaseNgramRule(ResourceBundle messages, LanguageModel lm, Language lang, UserConfig userConfig) {
    super(messages);
    super.setCategory(Categories.CASING.getCategory(messages));
    this.lm = lm;
    this.lang = lang;
    setLocQualityIssueType(ITSIssueType.Misspelling);
    addExamplePair(Example.wrong("This Prototype was developed by Miller et al."),
                   Example.fixed("This prototype was developed by Miller et al."));
    antiPatterns = cacheAntiPatterns(lang, ANTI_PATTERNS);

    initTrie();
    initSpeller();
    if (userConfig != null && linguServices == null) {
      linguServices = userConfig.getLinguServices();
    }
  }

  private void initSpeller() {
    if (spellerRule == null) {
      synchronized (UpperCaseNgramRule.class) {
        if (spellerRule == null) {
          try {
            spellerRule = new MorfologikAmericanSpellerRule(messages, lang);
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        }
      }
    }
  }

  private void initTrie() {
    if (exceptionTrie == null) {
      synchronized (UpperCaseNgramRule.class) {
        if (exceptionTrie == null) {
          exceptionTrie = new AhoCorasickDoubleArrayTrie<>();
          CachingWordListLoader cachingWordListLoader = new CachingWordListLoader();
          List words = new ArrayList<>();
          words.addAll(cachingWordListLoader.loadWords("en/specific_case.txt"));
          words.addAll(cachingWordListLoader.loadWords("spelling_global.txt"));
          Map map = new HashMap<>();
          for (String word : words) {
            map.put(word, word);
          }
          exceptionTrie.build(map);
        }
      }
    }
  }

  @Override
  public List getAntiPatterns() {
    return antiPatterns.get();
  }

  @Override
  public final String getId() {
    return "EN_UPPER_CASE_NGRAM";
  }

  @Override
  public String getDescription() {
    return "Checks wrong uppercase spelling of words that are not proper nouns";
  }

  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List matches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    boolean atSentStart = true;

    boolean isSentence = isSentence(tokens);
    if (!isSentence) {
      // might be a headline, so skip it
      return toRuleMatchArray(matches);
    }

    for (int i = 0; i < tokens.length; i++) {
      AnalyzedTokenReadings token = tokens[i];
      String tokenStr = token.getToken();
      //System.out.println(i + ": " + prevIsUpperCase(tokens, i) + " - " + tokenStr);
      if (tokenStr.length() > 0
          && !token.isImmunized()
          && Character.isUpperCase(tokenStr.charAt(0))
          && !StringTools.isAllUppercase(tokenStr)
          && !atSentStart
          && token.hasPosTagStartingWith("VB")   // start only with these to avoid false alarms. TODO: extend
          && !token.hasPosTagStartingWith("NNP")
          && token.isTagged()
          && (!prevIsUpperCase(tokens, i) || (prevIsUpperCase(tokens, i) && i == 2))  // probably a name, like "Sex Pistols", but not c
          && !nextIsUpperCase(tokens, i)
          && !prevIsOneOf(tokens, i, Arrays.asList(":", "née", "of", "\"", "'"))  // probably a title like "The history of XYZ"
          && !nextIsOneOfThenUppercase(tokens, i, Arrays.asList("of"))
          && !tokenStr.matches("I")
          && !exceptions.contains(tokenStr)
          && !trieMatches(sentence.getText(), token)
          && !maybeTitle(tokens, i)
          && !isMisspelled(StringTools.lowercaseFirstChar(tokenStr))    // e.g. "German" is correct, "german" isn't
      ) {
        if (i + 1 < tokens.length) {
          List ucList = Arrays.asList(tokens[i - 1].getToken(), tokenStr, tokens[i + 1].getToken());
          List lcList = Arrays.asList(tokens[i - 1].getToken(), StringTools.lowercaseFirstChar(tokenStr), tokens[i + 1].getToken());
          Probability ucProb = lm.getPseudoProbability(ucList);
          Probability lcProb = lm.getPseudoProbability(lcList);
          double ratio = lcProb.getProb() / ucProb.getProb();
          //System.out.println("-->" + ucProb + ", lc: " + lcProb + " ==> " + ratio);
          if (ratio > THRESHOLD) {
            String msg = "Only proper nouns start with an uppercase character (there are exceptions for headlines).";
            RuleMatch match = new RuleMatch(this, sentence, token.getStartPos(), token.getEndPos(), msg);
            match.setSuggestedReplacement(StringTools.lowercaseFirstChar(tokenStr));
            matches.add(match);
          }
        }
      }
      if (!token.isSentenceStart() && !tokenStr.isEmpty() && !token.isNonWord()) {
        atSentStart = false;
      }
    }
    return toRuleMatchArray(matches);
  }
  
  boolean isMisspelled(String word) throws IOException {
    synchronized (spellerRule) {
      return linguServices == null ? spellerRule.isMisspelled(word) : !linguServices.isCorrectSpell(word, lang);
    }
  }

  // a very rough guess whether the word at the given position might be part of a title
  boolean maybeTitle(AnalyzedTokenReadings[] tokens, int i) {
    return firstLongWordToLeftIsUppercase(tokens, i) || firstLongWordToRightIsUppercase(tokens, i);
  }

  boolean firstLongWordToLeftIsUppercase(AnalyzedTokenReadings[] tokens, int pos) {
    for (int i = pos - 1; i > 1; i--) {
      if (isShortWord(tokens[i])) {
        continue;
      }
      return StringTools.startsWithUppercase(tokens[i].getToken());
    }
    return false;
  }

  boolean firstLongWordToRightIsUppercase(AnalyzedTokenReadings[] tokens, int pos) {
    for (int i = pos + 1; i < tokens.length; i++) {
      if (isShortWord(tokens[i])) {
        continue;
      }
      return StringTools.startsWithUppercase(tokens[i].getToken());
    }
    return false;
  }

  private boolean isShortWord(AnalyzedTokenReadings token) {
    // ignore words typically spelled lowercase even in titles
    return token.getToken().trim().isEmpty() || TYPICAL_LOWERCASE.matcher(token.getToken()).matches();
  }

  private boolean trieMatches(String text, AnalyzedTokenReadings token) {
    List> hits = exceptionTrie.parseText(text);
    for (AhoCorasickDoubleArrayTrie.Hit hit : hits) {
      if (hit.begin <= token.getStartPos() && hit.end >= token.getEndPos()) {
        return true;
      }
    }
    return false;
  }

  private boolean prevIsOneOf(AnalyzedTokenReadings[] tokens, int i, List strings) {
    return i > 0 && strings.contains(tokens[i-1].getToken());
  }

  // e.g. "The history of Xyz"
  //           ^^^^^^^
  private boolean nextIsOneOfThenUppercase(AnalyzedTokenReadings[] tokens, int i, List strings) {
    return i + 2 < tokens.length && strings.contains(tokens[i+1].getToken()) && StringTools.startsWithUppercase(tokens[i+2].getToken());
  }

  private boolean prevIsUpperCase(AnalyzedTokenReadings[] tokens, int i) {
    return i > 0 && StringTools.startsWithUppercase(tokens[i-1].getToken());
  }

  private boolean nextIsUpperCase(AnalyzedTokenReadings[] tokens, int i) {
    return i + 1 < tokens.length && StringTools.startsWithUppercase(tokens[i+1].getToken());
  }

  private boolean isSentence(AnalyzedTokenReadings[] tokens) {
    boolean isSentence = false;
    for (int i = tokens.length - 1; i > 0; i--) {
      if (PUNCT_PATTERN.matcher(tokens[i].getToken()).matches()) {
        isSentence = true;
        break;
      }
      if (!tokens[i].isParagraphEnd() && !tokens[i].isNonWord()) {
        break;
      }
    }
    return isSentence;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy