All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.tokenizers.en.EnglishWordTokenizer Maven / Gradle / Ivy

/* LanguageTool, a natural language style checker
 * Copyright (C) 2013 Marcin Milkowski (http://www.languagetool.org)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tokenizers.en;

import org.languagetool.tagging.en.EnglishTagger;
import org.languagetool.tokenizers.WordTokenizer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.*;

/**
 * @author Marcin Milkowski
 * @since 2.5
 */
public class EnglishWordTokenizer extends WordTokenizer {

  private static final Pattern SINGLE_QUOTE = compile("'");
  private static final Pattern CURLY_QUOTE = compile("’");
  private static final Pattern APOSTYPEW = compile("\u0001\u0001APOSTYPEW\u0001\u0001");
  private static final Pattern APOSTYPOG = compile("\u0001\u0001APOSTYPOG\u0001\u0001");
  private static final Pattern SOFT_HYPHEN = compile("\u00AD");
  private static final List patternList = Arrays.asList(
      compile("^(fo['’]c['’]sle|rec['’][ds]|OK['’]d|cc['’][ds]|DJ['’][d]|[pd]m['’]d|rsvp['’]d)$", CASE_INSENSITIVE | UNICODE_CASE),
      compile(
          "^(['’]?)(are|is|were|was|do|does|did|have|has|had|wo|would|ca|could|sha|should|must|ai|ought|might|need|may|am|dare|das|dass|hai|used|use)(n['’]t)$",
          CASE_INSENSITIVE | UNICODE_CASE),
      compile("^(.+)(['’]m|['’]re|['’]ll|['’]ve|['’]d|['’]s)(['’-]?)$",
          CASE_INSENSITIVE | UNICODE_CASE),
      compile("^(['’]t)(was|were|is)$", CASE_INSENSITIVE | UNICODE_CASE));

  //the string used to tokenize characters
  private final String enTokenizingChars = super.getTokenizingCharacters() + "_"; // underscore;

//  public EnglishWordTokenizer() {
//
//  }

  /**
   * Tokenizes text. The English tokenizer differs from the standard one in two
   * respects:
   * 
    *
  1. it does not treat the hyphen as part of the word if the hyphen is at the * end of the word;
  2. *
  3. it includes n-dash as a tokenizing character, as it is used without a * whitespace in English. *
* * @param text String of words to tokenize. */ @Override public List tokenize(String text) { List l = new ArrayList<>(); String auxText = text; auxText = SINGLE_QUOTE.matcher(auxText).replaceAll("\u0001\u0001APOSTYPEW\u0001\u0001"); auxText = CURLY_QUOTE.matcher(auxText).replaceAll("\u0001\u0001APOSTYPOG\u0001\u0001"); //auxText = auxText.replaceAll("-", "\u0001\u0001HYPHEN\u0001\u0001"); String s; String groupStr; final StringTokenizer st = new StringTokenizer(auxText, enTokenizingChars, true); while (st.hasMoreElements()) { s = APOSTYPEW.matcher(st.nextToken()).replaceAll("'"); s = APOSTYPOG.matcher(s).replaceAll("’"); //.replaceAll("\u0001\u0001HYPHEN\u0001\u0001", "-"); boolean matchFound = false; Matcher matcher = null; if (s.contains("'") || s.contains("’")) { for (Pattern pattern : patternList) { matcher = pattern.matcher(s); matchFound = matcher.find(); if (matchFound) { break; } } } if (matchFound) { for (int i = 1; i <= matcher.groupCount(); i++) { groupStr = matcher.group(i); l.addAll(wordsToAdd(groupStr)); } } else { l.addAll(wordsToAdd(s)); } } return joinEMailsAndUrls(l); } /* Splits a word containing hyphen(-’') if it doesn't exist in the dictionary. */ private List wordsToAdd(String s) { final List l = new ArrayList<>(); int hyphensAtEnd = 0; synchronized (this) { // speller is not thread-safe if (!s.isEmpty()) { while (s.startsWith("-")) { l.add("-"); s = s.substring(1); } while (s.endsWith("-")) { s = s.substring(0,s.length()-1); hyphensAtEnd++; } if (!s.isEmpty()) { if (!s.contains("-") && !s.contains("'") && !s.contains("’")) { l.add(s); } else { String normalized = SOFT_HYPHEN.matcher(s).replaceAll(""); normalized = CURLY_QUOTE.matcher(normalized).replaceAll("'"); if (EnglishTagger.INSTANCE.tag(Arrays.asList(normalized)).get(0) .isTagged()) { l.add(s); } // some camel-case words containing hyphen (is there any better fix?) else if (s.equalsIgnoreCase("mers-cov") || s.equalsIgnoreCase("mcgraw-hill") || s.equalsIgnoreCase("sars-cov-2") || s.equalsIgnoreCase("sars-cov") || s.equalsIgnoreCase("ph-metre") || s.equalsIgnoreCase("ph-metres") || s.equalsIgnoreCase("anti-ivg") || s.equalsIgnoreCase("anti-uv") || s.equalsIgnoreCase("anti-vih") || s.equalsIgnoreCase("al-qaida")) { l.add(s); } else { // if not found, the word is split // final StringTokenizer st2 = new StringTokenizer(s, "-’'", true); final StringTokenizer st2 = new StringTokenizer(s, "’'", true); while (st2.hasMoreElements()) { l.add(st2.nextToken()); } } } } } while (hyphensAtEnd > 0) { l.add("-"); hyphensAtEnd--; } return l; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy