// org.languagetool.tokenizers.ca.CatalanWordTokenizer (Maven / Gradle / Ivy)
//
// Go to download
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tokenizers.ca;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.languagetool.tagging.ca.CatalanTagger;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;

import static org.languagetool.tools.StringTools.CHARS_NOT_FOR_SPELLING;

/**
 * Tokenizes a sentence into words. Punctuation and whitespace gets its own token.
 * Special treatment for hyphens and apostrophes in Catalan.
 *
 * @author Jaume Ortolà 
 */
public class CatalanWordTokenizer extends WordTokenizer {

  // Characters allowed inside a word-like token (used as the body of a regex character class).
  private static final String wordCharacters = "§©@€£\\$_\\p{L}\\d·\\-\u0300-\u036F\u00A8\u2070-\u209F°%‰‱&\uFFFD\u00AD\u00AC";
  // A raw token is either a maximal run of word characters or one single non-word character.
  private static final Pattern tokenizerPattern = Pattern.compile("[" + wordCharacters + "]+|[^" + wordCharacters + "]");
  //all possible forms of "pronoms febles" after a verb.
  private static final String PF = "(['’]en|['’]hi|['’]ho|['’]l|['’]ls|['’]m|['’]n|['’]ns|['’]s|['’]t|-el|-els|-em|-en|-ens|-hi|-ho|-l|-la|-les|-li|-lo|-los|-m|-me|-n|-ne|-nos|-s|-se|-t|-te|-us|-vos)";

  // Literal placeholders inserted by tokenize() before splitting; each is mapped back
  // to the character it stands for once the raw tokens have been extracted.
  private static final Pattern PATTERN_1 = Pattern.compile("xxCA_APOS_RECTExx", Pattern.LITERAL);
  private static final Pattern PATTERN_2 = Pattern.compile("xxCA_APOS_RODOxx", Pattern.LITERAL);
  private static final Pattern PATTERN_3 = Pattern.compile("xxCA_HYPHENxx", Pattern.LITERAL);
  private static final Pattern PATTERN_4 = Pattern.compile("xxCA_DECIMALPOINTxx", Pattern.LITERAL);
  private static final Pattern PATTERN_5 = Pattern.compile("xxCA_DECIMALCOMMAxx", Pattern.LITERAL);
  private static final Pattern PATTERN_6 = Pattern.compile("xxCA_SPACExx", Pattern.LITERAL);
  private static final Pattern PATTERN_7 = Pattern.compile("xxELA_GEMINADAxx", Pattern.LITERAL);
  private static final Pattern PATTERN_8 = Pattern.compile("xxELA_GEMINADA_UPPERCASExx", Pattern.LITERAL);
  private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00AD");
  private static final Pattern CURLY_SINGLE_QUOTE = Pattern.compile("’", Pattern.LITERAL);
  private static final Pattern LL = Pattern.compile("l-l", Pattern.LITERAL);

  // Number of entries in the patterns array; they are tried in index order and the first hit wins.
  private static final int maxPatterns = 11;
  private final Pattern[] patterns = new Pattern[maxPatterns];

  //Patterns to avoid splitting words in certain special cases
  // allows correcting typographical errors in "ela geminada"
  private static final Pattern ELA_GEMINADA = Pattern.compile("([aeiouàéèíóòúïüAEIOUÀÈÉÍÒÓÚÏÜ])l[.\u2022\u22C5\u2219\uF0D7]l([aeiouàéèíóòúïü])",Pattern.UNICODE_CASE);
  private static final Pattern ELA_GEMINADA_UPPERCASE = Pattern.compile("([AEIOUÀÈÉÍÒÓÚÏÜ])L[.\u2022\u22C5\u2219\uF0D7]L([AEIOUÀÈÉÍÒÓÚÏÜ])",Pattern.UNICODE_CASE);
  // apostrophe
  private static final Pattern APOSTROF_RECTE = Pattern.compile("([\\p{L}])'([\\p{L}\"‘“«])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_RODO = Pattern.compile("([\\p{L}])’([\\p{L}\"‘“«])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  // apostrophe before number 1. Ex.: d'1 km, és l'1 de gener, és d'1.4 kg
  private static final Pattern APOSTROF_RECTE_1 = Pattern.compile("([dlDL])'(\\d[\\d\\s\\.,]?)",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern APOSTROF_RODO_1 = Pattern.compile("([dlDL])’(\\d[\\d\\s\\.,]?)",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  // decimal point between digits
  private static final Pattern DECIMAL_POINT= Pattern.compile("([\\d])\\.([\\d])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  // decimal comma between digits
  private static final Pattern DECIMAL_COMMA= Pattern.compile("([\\d]),([\\d])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  // space between digits
  // the first is an exception to the two next patterns: a space after four digits is
  // temporarily hidden behind xxCA_SPACE0xx so the next two patterns cannot consume it
  private static final Pattern SPACE_DIGITS0= Pattern.compile("([\\d]{4}) ",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern SPACE_DIGITS= Pattern.compile("([\\d]) ([\\d][\\d][\\d])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern SPACE_DIGITS2= Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern SPACE0 = Pattern.compile("xxCA_SPACE0xx");
  // Sàsser-l'Alguer
  private static final Pattern HYPHEN_L= Pattern.compile("([\\p{L}]+)(-)([Ll]['’])([\\p{L}]+)",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

  /**
   * Compiles the ordered list of splitting patterns. For each raw token,
   * {@link #tokenize(String)} tries them in index order and uses the capture
   * groups of the first one that matches as the sub-tokens.
   */
  public CatalanWordTokenizer() {

    // Apostrophe at the beginning of a word. Ex.: l'home, s'estima, n'omple, hivern, etc.
    // It creates 2 tokens: l'home
    patterns[0] = Pattern.compile("^([lnmtsd]['’])([^'’\\-]*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    // Exceptions to (Match verb+1 pronom feble)
    // It creates 1 token: qui-sap-lo
    // The alternation is wrapped in a non-capturing group so that ^ and $ anchor BOTH
    // branches. Without it, find() matched only the prefix "qui-sap-lo" of "qui-sap-los"
    // and the rest of the word was dropped from the output. Group numbers are unchanged.
    patterns[1] = Pattern.compile("^(?:(qui-sap-lo|qui-sap-la|qui-sap-los|qui-sap-les)|(Castella)(-)(la))$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    // Match verb+3 pronoms febles (rare but possible!). Ex: Emporta-te'ls-hi.
    // It creates 4 tokens: Emporta-te'ls-hi
    patterns[2] = Pattern.compile("^([lnmtsd]['’])(.{2,})"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
    patterns[3] = Pattern.compile("^(.{2,})"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    // Match verb+2 pronoms febles. Ex: Emporta-te'ls.
    // It creates 3 tokens: Emporta-te'ls
    patterns[4] = Pattern.compile("^([lnmtsd]['’])(.{2,})"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
    patterns[5] = Pattern.compile("^(.{2,})"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    // match verb+1 pronom feble. Ex: Emporta't, vés-hi, porta'm.
    // It creates 2 tokens: Emporta't
    // ^(.+[^cbfhjkovwyzCBFHJKOVWYZ])
    patterns[6] = Pattern.compile("^([lnmtsd]['’])(.{2,})"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
    patterns[7] = Pattern.compile("^(.+[^wo])"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    // d'emportar
    patterns[8] = Pattern.compile("^([lnmtsd]['’])(.*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    //contractions: al, als, pel, pels, del, dels, cal (!), cals (!)
    patterns[9] = Pattern.compile("^(a|de|pe)(ls?)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

    //contraction: can
    patterns[10] = Pattern.compile("^(ca)(n)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

  }

  /**
   * Splits a text into tokens.
   * <p>
   * Characters that must not split a word (apostrophes between letters, decimal
   * separators, spaces grouping digits, the separator of an "ela geminada") are first
   * replaced by placeholder strings such as {@code xxCA_APOS_RECTExx},
   * {@code xxCA_APOS_RODOxx}, {@code xxCA_DECIMALPOINTxx}, {@code xxCA_DECIMALCOMMAxx},
   * {@code xxCA_SPACExx} and {@code xxELA_GEMINADAxx}. The text is then cut into raw
   * tokens, the placeholders are mapped back, and each raw token is further split by the
   * first matching entry of the pattern list built in the constructor.
   * <p>
   * Note: the separator of an "ela geminada" is always restored as a plain dot
   * ({@code l.l} / {@code L.L}), whichever of the accepted separator characters was
   * present in the input.
   *
   * @param text Text to tokenize
   * @return List of tokens.
   */
  @Override
  public List<String> tokenize(final String text) {
    final List<String> l = new ArrayList<>();
    // replace hyphen, non-break hyphen -> hyphen-minus
    String auxText = text.replace('\u2010', '\u002d');
    auxText = auxText.replace('\u2011', '\u002d');

    // Protect characters that must stay inside a token. The order of these
    // replacements is significant and must not be changed.
    auxText = ELA_GEMINADA.matcher(auxText).replaceAll("$1xxELA_GEMINADAxx$2");
    auxText = ELA_GEMINADA_UPPERCASE.matcher(auxText).replaceAll("$1xxELA_GEMINADA_UPPERCASExx$2");
    auxText = APOSTROF_RECTE.matcher(auxText).replaceAll("$1xxCA_APOS_RECTExx$2");
    auxText = APOSTROF_RECTE_1.matcher(auxText).replaceAll("$1xxCA_APOS_RECTExx$2");
    auxText = APOSTROF_RODO.matcher(auxText).replaceAll("$1xxCA_APOS_RODOxx$2");
    auxText = APOSTROF_RODO_1.matcher(auxText).replaceAll("$1xxCA_APOS_RODOxx$2");
    auxText = DECIMAL_POINT.matcher(auxText).replaceAll("$1xxCA_DECIMALPOINTxx$2");
    auxText = DECIMAL_COMMA.matcher(auxText).replaceAll("$1xxCA_DECIMALCOMMAxx$2");
    // Hide the space that follows four digits, join digit groups, then un-hide it.
    auxText = SPACE_DIGITS0.matcher(auxText).replaceAll("$1xxCA_SPACE0xx");
    auxText = SPACE_DIGITS2.matcher(auxText).replaceAll("$1xxCA_SPACExx$2xxCA_SPACExx$3");
    auxText = SPACE_DIGITS.matcher(auxText).replaceAll("$1xxCA_SPACExx$2");
    auxText = SPACE0.matcher(auxText).replaceAll(" ");

    final Matcher tokenizerMatcher = tokenizerPattern.matcher(auxText);
    while (tokenizerMatcher.find()) {
      String s = tokenizerMatcher.group();
      // A lone character in the range U+FE00..U+FE0F is glued to the previous token
      // instead of becoming a token of its own.
      if (!l.isEmpty() && s.length() == 1 && s.codePointAt(0) >= 0xFE00 && s.codePointAt(0) <= 0xFE0F) {
        l.set(l.size() - 1, l.get(l.size() - 1) + s);
        continue;
      }
      // Map the placeholders back to the characters they stand for.
      s = PATTERN_1.matcher(s).replaceAll("'");
      s = PATTERN_2.matcher(s).replaceAll("’");
      s = PATTERN_3.matcher(s).replaceAll("-");
      s = PATTERN_4.matcher(s).replaceAll(".");
      s = PATTERN_5.matcher(s).replaceAll(",");
      s = PATTERN_6.matcher(s).replaceAll(" ");
      s = PATTERN_7.matcher(s).replaceAll("l.l");
      s = PATTERN_8.matcher(s).replaceAll("L.L");

      // Leading hyphens become separate tokens immediately ...
      while (s.length() > 1 && s.startsWith("-")) {
        l.add("-");
        s = s.substring(1);
      }
      // ... trailing hyphens are counted now and emitted after the word itself.
      int hyphensAtEnd = 0;
      while (s.length() > 1 && s.endsWith("-")) {
        s = s.substring(0, s.length() - 1);
        hyphensAtEnd++;
      }

      // Try the splitting patterns in order; stop at the first one that matches.
      Matcher matcher = null;
      boolean matchFound = false;
      for (int j = 0; j < maxPatterns && !matchFound; j++) {
        matcher = patterns[j].matcher(s);
        matchFound = matcher.find();
      }
      if (matchFound) {
        // Every participating capture group becomes (at least) one token.
        for (int i = 1; i <= matcher.groupCount(); i++) {
          final String groupStr = matcher.group(i);
          if (groupStr != null) {
            l.addAll(wordsToAdd(groupStr));
          }
        }
      } else {
        l.addAll(wordsToAdd(s));
      }
      while (hyphensAtEnd > 0) {
        l.add("-");
        hyphensAtEnd--;
      }
    }
    return joinEMailsAndUrls(l);
  }

  /* Returns true if the tagger reports the given word as tagged.
   * Callers hold the lock on this instance (see wordsToAdd). */
  private boolean isTagged(String word) {
    return CatalanTagger.INSTANCE_CAT.tag(Arrays.asList(word)).get(0).isTagged();
  }

  /* Splits a word containing hyphen(-) if it doesn't exist in the dictionary.
   * Split apostrophe in the last char.
   * Returns the resulting tokens in order; the list is empty for an empty input. */
  private List<String> wordsToAdd(String s) {
    final List<String> l = new ArrayList<>();
    synchronized (this) { //speller is not thread-safe
      if (!s.isEmpty()) {
        if (!s.contains("-") && !s.endsWith("'") && !s.endsWith("’")) {
          l.add(s);
        } else {
          // words containing hyphen (-) are looked up in the dictionary
          // (soft hyphens removed, curly apostrophe normalized to a straight one)
          if (isTagged(CURLY_SINGLE_QUOTE.matcher(SOFT_HYPHEN.matcher(s).replaceAll("")).replaceAll("'"))) {
            l.add(s);
          }
          // some camel-case words containing hyphen (is there any better fix?)
          else if (s.equalsIgnoreCase("mers-cov") || s.equalsIgnoreCase("mcgraw-hill")
              || s.equalsIgnoreCase("sars-cov-2") || s.equalsIgnoreCase("sars-cov")
              || s.equalsIgnoreCase("ph-metre") || s.equalsIgnoreCase("ph-metres")) {
            l.add(s);
          }
          // words with "ela geminada" with typo: col-legi (col·legi)
          else if (isTagged(LL.matcher(SOFT_HYPHEN.matcher(s).replaceAll("")).replaceAll("l·l"))) {
            l.add(s);
          // apostrophe in the last char
          } else if ((s.endsWith("'") || s.endsWith("’")) && s.length() > 1) {
            l.addAll(wordsToAdd(s.substring(0, s.length() - 1)));
            l.add(s.substring(s.length() - 1));
          } else {
            final Matcher matcher = HYPHEN_L.matcher(s);
            if (matcher.matches()) {
              // e.g. Sàsser-l'Alguer: each group is processed again on its own
              for (int i = 1; i <= matcher.groupCount(); i++) {
                l.addAll(wordsToAdd(matcher.group(i)));
              }
            } else {
              // if not found, the word is split at every hyphen; hyphens are kept as tokens
              final StringTokenizer st2 = new StringTokenizer(s, "-", true);
              while (st2.hasMoreElements()) {
                l.add(st2.nextToken());
              }
            }
          }
        }
      }
      return l;
    }
  }

}