org.languagetool.rules.ca.MorfologikCatalanSpellerRule Maven / Gradle / Ivy

Go to download
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

package org.languagetool.rules.ca;

import org.languagetool.*;
import org.languagetool.rules.SuggestedReplacement;
import org.languagetool.rules.spelling.SpellingCheckRule;
import org.languagetool.rules.spelling.morfologik.MorfologikSpellerRule;
import org.languagetool.tagging.ca.CatalanTagger;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public final class MorfologikCatalanSpellerRule extends MorfologikSpellerRule {

  private final String dictFilename;
  private static final String SPELLING_FILE = "/ca/spelling.txt";
  private static final Pattern QUOTE_OR_HYPHEN = Pattern.compile("['-]");

  @Override
  public List getAdditionalSpellingFileNames() {
    return Arrays.asList("/ca/"+SpellingCheckRule.CUSTOM_SPELLING_FILE, SpellingCheckRule.GLOBAL_SPELLING_FILE,
      "/ca/multiwords.txt", "/ca/spelling-special.txt");
  }
  private static final List PARTICULA_INICIAL = Arrays.asList("no", "en", "a", "el", "els", "al", "als", "pel",
    "pels", "del", "dels", "del", "de", "per", "un", "uns", "una", "unes", "la", "les", "teu", "meu", "seu", "teus", "meus", "seus");
  private static final List PREFIX_AMB_ESPAI = Arrays.asList("pod", "ultra", "eco", "tele", "anti", "re", "des",
    "avant", "auto", "ex", "extra", "macro", "mega", "meta", "micro", "multi", "mono", "mini", "post", "retro", "semi", "super", "trans", "pro", "g", "l", "m");

  private static final List PRONOM_INICIAL = Arrays.asList("em", "et", "es", "se", "ens", "us", "vos", "li", "hi",
    "ho", "el", "la", "els", "les");
  private static final Pattern APOSTROF_INICI_VERBS = Pattern.compile("^([lnts])[90]?(h?[aeiouàéèíòóú].*)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

  private static final Pattern APOSTROF_INICI_VERBS_M = Pattern.compile("^(m)[90]?(h?[aeiouàéèíòóú].*)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_INICI_NOM_SING = Pattern.compile("^([ld])[90]?(h?[aeiouàéèíòóú]...+)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_INICI_NOM_PLURAL = Pattern.compile("^(d)[90]?(h?[aeiouàéèíòóú].+)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_FINAL = Pattern.compile("^(...+[aei])[90]?(l|ls|m|ns|n|t)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_FINAL_S = Pattern.compile("^(.+e)[90]?(s)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final Pattern GUIONET_FINAL = Pattern.compile(
      "^([\\p{L}·]+)[’']?(hi|ho|la|les|li|lo|los|me|ne|nos|se|te|vos)$",
      Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

  private static final Pattern GUIONET_FINAL_GERUNDI = Pattern.compile(
    "^([\\p{L}·]+n)(hi|ho|la|les|li|lo|los|me|ne|nos|se|te|vos)$",
    Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
  private static final List SPLIT_DIGITS_AT_END = Arrays.asList("en", "de", "del", "al", "dels", "als", "a", "i", "o", "amb");
  private static final Pattern VERB_INDSUBJ = Pattern.compile("V.[SI].*");
  private static final Pattern VERB_INDSUBJ_M = Pattern.compile("V.[SI].[123]S.*|V.[SI].[23]P.*");
  private static final Pattern NOM_SING = Pattern.compile("V.[NG].*|V.P..S..|N..[SN].*|A...[SN].|PX..S...|DD..S.");
  private static final Pattern NOM_PLURAL = Pattern.compile("V.P..P..|N..[PN].*|A...[PN].|PX..P...|DD..P.");
  private static final Pattern VERB_INFGERIMP = Pattern.compile("V.[NGM].*");
  private static final Pattern VERB_INF = Pattern.compile("V.N.*");
  private static final Pattern VERB_GER = Pattern.compile("V.G.*");

  /* lemma exceptions */
  public static final String[] LemmasToIgnore =  new String[] {"enterar", "sentar", "conseguir", "alcançar", "entimar"};
  public static final String[] LemmasToAllow =  new String[] {"enter", "sentir"};

  private static final List inalambric = Arrays.asList("inalàmbric", "inalàmbrica", "inalàmbrics", "inal" +
    "àmbriques", "inalàmbricament", "inalàmbricamente");

  private static final List PronomsDarrere = Arrays.asList("losels", "losles", "nosels", "nosles", "vosels",
    "vosens", "vosles", "lesen", "leshi", "liles", "losel", "losen", "loshi", "losho", "losla", "lsels", "lsles",
    "meles", "nosel", "nosen", "noshi", "nosho", "nosla", "nosli", "nsels", "nsles", "seles", "sevos", "teles",
    "usels", "usens", "usles", "vosel", "vosem", "vosen", "voshi", "vosho", "vosla", "vosli", "lahi", "lihi", "liho",
    "lila", "lils", "lsel", "lsen", "lshi", "lsho", "lsla", "mela", "meli", "mels", "nsel", "nsen", "nshi", "nsho",
    "nsla", "nsli", "sela", "seli", "sels", "sens", "seus", "tela", "teli", "tels", "tens", "usel", "usem", "usen",
    "ushi", "usho", "usla", "usli", "lan", "len", "les", "lhi", "lil", "lin", "los", "mel", "men", "mhi", "mho", "nhi"
    , "nos", "sel", "sem", "sen", "set", "shi", "sho", "tel", "tem", "ten", "thi", "tho", "vos", "hi", "ho", "la",
    "li", "lo", "ls", "me", "ne", "ns", "se", "te", "us");
  // "l", "m", "n", "s", "t"

  private CatalanTagger tagger;

  public MorfologikCatalanSpellerRule(ResourceBundle messages, Language language, UserConfig userConfig,
      List altLanguages) throws IOException {
    super(messages, language, userConfig, altLanguages);
    this.setIgnoreTaggedWords();
    if (language.getVariant() != null) {
      tagger = CatalanTagger.INSTANCE_VAL;
    } else {
      tagger = CatalanTagger.INSTANCE_CAT;
    }
    dictFilename = "/ca/" + language.getShortCodeWithCountryAndVariant() + "_spelling" + JLanguageTool.DICTIONARY_FILENAME_EXTENSION;
  }

  @Override
  public String getFileName() {
    return dictFilename;
  }

  @Override
  public String getSpellingFileName() {
    return SPELLING_FILE;
  }

  @Override
  public String getId() {
    return "MORFOLOGIK_RULE_CA_ES";
  }

  @Override
  // Use this rule in LO/OO extension despite being a spelling rule
  public boolean useInOffice() {
    return true;
  }

  @Override
  protected List orderSuggestions(List suggestions, String word) {
    // move some run-on-words suggestions to the top
    List newSuggestions = new ArrayList<>();
    String wordWithouDiacriticsString = StringTools.removeDiacritics(word);
    for (int i = 0; i < suggestions.size(); i++) {
      String replacement = suggestions.get(i).getReplacement();
      if (inalambric.contains(replacement.toLowerCase())) {
        newSuggestions = new ArrayList<>();
        newSuggestions.add(new SuggestedReplacement("sense fils"));
        newSuggestions.add(new SuggestedReplacement("sense fil"));
        newSuggestions.add(new SuggestedReplacement("sense cables"));
        newSuggestions.add(new SuggestedReplacement("autònom"));
        return newSuggestions;
      }
      // remove always
      if (replacement.equalsIgnoreCase("como")) {
        continue;
      }
      boolean ignoreSuggestion = false;
      String[] wordsToAnalyze = replacement.split("[ '-]");
      List newatrs = tagger.tag(Arrays.asList(wordsToAnalyze));
      for (AnalyzedTokenReadings newatr : newatrs) {
        if (newatr.hasAnyLemma(LemmasToIgnore) && !newatr.hasAnyLemma(LemmasToAllow)) {
          ignoreSuggestion = true;
          break;
        }
      }
      if (ignoreSuggestion) {
        continue;
      }
      // l'_ : remove superfluous space
      if (replacement.contains("' ")) {
        suggestions.get(i).setReplacement(replacement.replace("' ", "'"));
      }
      String[] parts = replacement.split(" ");
      if (parts.length == 2) {
        if (parts[1].toLowerCase().equals("s")) {
          continue;
        }
        // remove wrong split prefixes
        if (PREFIX_AMB_ESPAI.contains(parts[0].toLowerCase())) {
          continue;
        }
      }

      // Don't change first suggestions if they match word without diacritics
      int posNewSugg = 0;
      while (newSuggestions.size() > posNewSugg
          && StringTools.removeDiacritics(newSuggestions.get(posNewSugg).getReplacement())
              .equalsIgnoreCase(wordWithouDiacriticsString)) {
        posNewSugg++;
      }

      // move some split words to first place
      if (parts.length == 2) {
        if (parts[1].length() > 1 && PARTICULA_INICIAL.contains(parts[0].toLowerCase())) {
          String newSuggestion = parts[1];
          List atkn = tagger.tag(Arrays.asList(newSuggestion));
          boolean isBalear = atkn.get(0).hasPosTag("VMIP1S0B") && !atkn.get(0).hasPosTagStartingWith("N");
          if (!isBalear) {
            newSuggestions.add(posNewSugg, suggestions.get(i));
            continue;
          }
        }
        if (parts[1].length() > 1 && PRONOM_INICIAL.contains(parts[0].toLowerCase())) {
          String newSuggestion = parts[1];
          List atkn = tagger.tag(Arrays.asList(newSuggestion));
          if (atkn.get(0).matchesPosTagRegex(VERB_INDSUBJ)) {
            newSuggestions.add(posNewSugg, suggestions.get(i));
            continue;
          }
        }
      }

      String suggWithoutDiacritics = StringTools.removeDiacritics(replacement);
      if (StringTools.removeDiacritics(word).equalsIgnoreCase(suggWithoutDiacritics)) {
        newSuggestions.add(posNewSugg, suggestions.get(i));
        continue;
      }

      // move words with apostrophe or hyphen to second position
      String cleanSuggestion = QUOTE_OR_HYPHEN.matcher(replacement).replaceAll("");
      if (i > 1 && suggestions.size() > 2 && cleanSuggestion.equalsIgnoreCase(word)) {
        if (posNewSugg == 0) {
          posNewSugg = 1;
        }
        newSuggestions.add(posNewSugg, suggestions.get(i));
        continue;
      }
      // move "queda'n" to second position, but not anar-se'n
      if (i == 1) {
        if (!suggestions.get(0).getReplacement().contains("-") && (suggestions.get(0).getReplacement().endsWith("'n")
          || suggestions.get(0).getReplacement().endsWith("'t"))) {
          newSuggestions.add(0, suggestions.get(i));
          continue;
        }
      }
      newSuggestions.add(suggestions.get(i));
    }
    return newSuggestions;
  }

  @Override
  protected List getAdditionalTopSuggestions(List suggestions, String word)
      throws IOException {
    List suggestionsList = suggestions.stream().map(SuggestedReplacement::getReplacement)
        .collect(Collectors.toList());
    return SuggestedReplacement.convert(getAdditionalTopSuggestionsString(suggestionsList, word));
  }

  private List getAdditionalTopSuggestionsString(List suggestions, String word) throws IOException {
    /*
     * if (word.length() < 5) { return Collections.emptyList(); }
     */
    String[] parts = StringTools.splitCamelCase(word);
    if (parts.length > 1 && parts[0].length()>1) {
      boolean isNotMisspelled = true;
      for(String part: parts) {
        isNotMisspelled &= !speller1.isMisspelled(part);
      }
      if (isNotMisspelled) {
        return Collections.singletonList(String.join(" ",parts));
      }
    }
    parts = StringTools.splitDigitsAtEnd(word);
    if (parts.length > 1) {
      if (tagger.tag(Arrays.asList(parts[0])).get(0).isTagged()
      && (parts[0].length() > 2 || SPLIT_DIGITS_AT_END.contains(parts[0].toLowerCase()))) {
        return Collections.singletonList(String.join(" ",parts));
      }
    }
    String suggestion = "";
    suggestion = findSuggestion(suggestion, word, APOSTROF_INICI_VERBS, VERB_INDSUBJ, 2, "'", "");
    suggestion = findSuggestion(suggestion, word, APOSTROF_INICI_VERBS_M, VERB_INDSUBJ_M, 2, "'", "");
    suggestion = findSuggestion(suggestion, word, APOSTROF_INICI_NOM_SING, NOM_SING, 2, "'", "");
    suggestion = findSuggestion(suggestion, word, APOSTROF_INICI_NOM_PLURAL, NOM_PLURAL, 2, "'", "");
    suggestion = findSuggestion(suggestion, word, APOSTROF_FINAL, VERB_INFGERIMP, 1, "'", "");
    suggestion = findSuggestion(suggestion, word, APOSTROF_FINAL_S, VERB_INF, 1, "'", "");
    suggestion = findSuggestion(suggestion, word, GUIONET_FINAL_GERUNDI, VERB_GER, 1, "-", "t");
    suggestion = findSuggestion(suggestion, word, GUIONET_FINAL, VERB_INFGERIMP, 1, "-", "");
    suggestion = findSuggestionMultiplePronouns(suggestion, word);
    if (!suggestion.isEmpty()) {
      return Collections.singletonList(suggestion);
    }
    return Collections.emptyList();
  }

  private String findSuggestion(String suggestion, String word, Pattern wordPattern, Pattern postagPattern,
      int suggestionPosition, String separator, String addStr) throws IOException {
    if (!suggestion.isEmpty()) {
      return suggestion;
    }
    Matcher matcher = wordPattern.matcher(word);
    if (matcher.matches()) {
      String newSuggestion = matcher.group(suggestionPosition) + addStr;
      AnalyzedTokenReadings newatr = tagger.tag(Arrays.asList(newSuggestion)).get(0);
      if ((!newatr.hasPosTag("VMIP1S0B") || newSuggestion.equalsIgnoreCase("fer") || newSuggestion.equalsIgnoreCase("ajust")
          || newSuggestion.equalsIgnoreCase("gran")) && matchPostagRegexp(newatr, postagPattern)) {
        return matcher.group(1) + addStr + separator + matcher.group(2);
      }
    }
    return "";
  }

  // anarsen -> anar-se'n, danarsen -> d'anar-se'n
  private String findSuggestionMultiplePronouns(String suggestion, String word) {
    if (!suggestion.isEmpty()) {
      return suggestion;
    }
    String lcword = word.toLowerCase();
    String pronouns = endsWithPronoun(lcword);
    String verb = lcword.substring(0, word.length() - pronouns.length());
    AnalyzedTokenReadings newatr = tagger.tag(Arrays.asList(verb)).get(0);
    if (matchPostagRegexp(newatr, VERB_INFGERIMP)) {
      return verb + PronomsFeblesHelper.transformDarrere(pronouns, verb);
    }
    if (verb.length() < 5) {
      return "";
    }
    if (lcword.startsWith("d") || lcword.startsWith("l")) {
      verb = verb.substring(1, verb.length());
      newatr = tagger.tag(Arrays.asList(verb)).get(0);
      if (matchPostagRegexp(newatr, VERB_INF)) {
        return lcword.substring(0, 1) + "'" + verb + PronomsFeblesHelper.transformDarrere(pronouns, verb);
      }
    }
    return "";
  }

  private String endsWithPronoun(String s) {
    for (String pronoun : PronomsDarrere) {
      if (s.endsWith(pronoun)) {
        return pronoun;
      }
    }
    return "";
  }

  /**
   * Match POS tag with regular expression
   */
  private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
    for (AnalyzedToken analyzedToken : aToken) {
      String posTag = analyzedToken.getPOSTag();
      if (posTag == null) {
        posTag = "UNKNOWN";
      }
      final Matcher m = pattern.matcher(posTag);
      if (m.matches()) {
        return true;
      }
    }
    return false;
  }

  // Do not tokenize new words from spelling.txt...
  // Multi-token words should be in multiwords.txt
  protected boolean tokenizeNewWords() {
    return false;
  }

}