
// org.languagetool.tokenizers.en.EnglishWordTokenizer (Maven / Gradle / Ivy)
/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Marcin Milkowski (http://www.languagetool.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tokenizers.en;
import org.languagetool.tagging.en.EnglishTagger;
import org.languagetool.tokenizers.WordTokenizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.*;
/**
 * Word tokenizer for English text.
 *
 * <p>The text is split on the tokenizing characters inherited from {@link WordTokenizer}
 * plus the underscore. Tokens that contain a straight or typographic apostrophe are
 * matched against a list of contraction patterns and, on a match, broken into the
 * pattern's capture groups. Every resulting piece is then passed through
 * {@code wordsToAdd}, which separates leading/trailing hyphens and decides whether a
 * piece containing a hyphen or an apostrophe is kept whole.
 *
 * @author Marcin Milkowski
 * @since 2.5
 */
public class EnglishWordTokenizer extends WordTokenizer {

  /** Placeholder that stands in for the straight apostrophe while the text is split. */
  private static final String APOSTYPEW_MARKER = "\u0001\u0001APOSTYPEW\u0001\u0001";
  /** Placeholder that stands in for the typographic apostrophe while the text is split. */
  private static final String APOSTYPOG_MARKER = "\u0001\u0001APOSTYPOG\u0001\u0001";

  private static final Pattern SINGLE_QUOTE = Pattern.compile("'");
  private static final Pattern CURLY_QUOTE = Pattern.compile("’");
  private static final Pattern APOSTYPEW = Pattern.compile(APOSTYPEW_MARKER);
  private static final Pattern APOSTYPOG = Pattern.compile(APOSTYPOG_MARKER);
  private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00AD");

  /** Flags shared by all contraction patterns. */
  private static final int CONTRACTION_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

  /**
   * Patterns tried in order against tokens that contain an apostrophe; the first one
   * that matches determines how the token is broken up (one piece per capture group).
   */
  private static final List<Pattern> CONTRACTION_PATTERNS = Arrays.asList(
      // whole-word forms that are captured as a single group, i.e. not broken up here
      Pattern.compile(
          "^(fo['’]c['’]sle|rec['’][ds]|OK['’]d|cc['’][ds]|DJ['’][d]|[pd]m['’]d|rsvp['’]d)$",
          CONTRACTION_FLAGS),
      // optional leading apostrophe + verb stem + n't
      Pattern.compile(
          "^(['’]?)(are|is|were|was|do|does|did|have|has|had|wo|would|ca|could|sha|should|must|ai|ought|might|need|may|am|dare|das|dass|hai|used|use)(n['’]t)$",
          CONTRACTION_FLAGS),
      // stem + clitic ('m, 're, 'll, 've, 'd, 's) + optional trailing apostrophe or hyphen
      Pattern.compile("^(.+)(['’]m|['’]re|['’]ll|['’]ve|['’]d|['’]s)(['’-]?)$",
          CONTRACTION_FLAGS),
      // 't + was/were/is
      Pattern.compile("^(['’]t)(was|were|is)$", CONTRACTION_FLAGS));

  /**
   * Hyphenated words that are kept as one token even when the tagger does not know them
   * (compared case-insensitively).
   */
  private static final List<String> HYPHENATED_EXCEPTIONS = Arrays.asList(
      "mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres",
      "anti-ivg", "anti-uv", "anti-vih", "al-qaida");

  /** The characters used to split the text: the inherited ones plus the underscore. */
  private final String enTokenizingChars = super.getTokenizingCharacters() + "_";

  /**
   * Tokenizes text.
   *
   * <p>Compared with splitting on the inherited tokenizing characters alone, this method:
   * <ul>
   *   <li>additionally splits on the underscore;</li>
   *   <li>breaks tokens containing an apostrophe into the capture groups of the first
   *       matching contraction pattern;</li>
   *   <li>emits hyphens at the start or end of a piece as separate {@code "-"} tokens.</li>
   * </ul>
   * The collected tokens are finally handed to {@code joinEMailsAndUrls}.
   *
   * @param text String of words to tokenize.
   * @return the list of tokens as returned by {@code joinEMailsAndUrls}
   */
  @Override
  public List<String> tokenize(String text) {
    final List<String> tokens = new ArrayList<>();
    // Swap both apostrophe kinds for placeholders before splitting and restore them per
    // token afterwards — presumably because the inherited tokenizing characters include
    // the apostrophes and would otherwise cut contractions apart (TODO confirm).
    String protectedText = SINGLE_QUOTE.matcher(text).replaceAll(APOSTYPEW_MARKER);
    protectedText = CURLY_QUOTE.matcher(protectedText).replaceAll(APOSTYPOG_MARKER);
    final StringTokenizer st = new StringTokenizer(protectedText, enTokenizingChars, true);
    while (st.hasMoreElements()) {
      String token = APOSTYPEW.matcher(st.nextToken()).replaceAll("'");
      token = APOSTYPOG.matcher(token).replaceAll("’");
      final Matcher contraction = findContraction(token);
      if (contraction == null) {
        tokens.addAll(wordsToAdd(token));
      } else {
        for (int i = 1; i <= contraction.groupCount(); i++) {
          tokens.addAll(wordsToAdd(contraction.group(i)));
        }
      }
    }
    return joinEMailsAndUrls(tokens);
  }

  /**
   * Returns a matcher positioned on the first contraction pattern found in the token, or
   * {@code null} when the token has no apostrophe or no pattern matches.
   */
  private static Matcher findContraction(String token) {
    if (!token.contains("'") && !token.contains("’")) {
      return null;
    }
    for (Pattern pattern : CONTRACTION_PATTERNS) {
      final Matcher matcher = pattern.matcher(token);
      if (matcher.find()) {
        return matcher;
      }
    }
    return null;
  }

  /**
   * Turns one piece of text into the tokens to emit for it.
   *
   * <p>Leading and trailing hyphens become separate {@code "-"} tokens. What remains is
   * kept whole if it contains no hyphen and no apostrophe, if the English tagger tags
   * it, or if it is one of the listed hyphenated exceptions. Otherwise it is split at
   * apostrophes (the apostrophes themselves are emitted as tokens); inner hyphens do not
   * cause a split.
   *
   * @param s the piece to process; an empty string yields an empty list
   * @return the tokens for {@code s}, in order
   */
  private List<String> wordsToAdd(String s) {
    final List<String> parts = new ArrayList<>();
    String word = s;
    while (word.startsWith("-")) {
      parts.add("-");
      word = word.substring(1);
    }
    int hyphensAtEnd = 0;
    while (word.endsWith("-")) {
      word = word.substring(0, word.length() - 1);
      hyphensAtEnd++;
    }
    if (!word.isEmpty()) {
      final boolean plain = !word.contains("-") && !word.contains("'") && !word.contains("’");
      if (plain || isTagged(word) || isHyphenatedException(word)) {
        parts.add(word);
      } else {
        // if not found, the word is split at apostrophes only
        final StringTokenizer st = new StringTokenizer(word, "’'", true);
        while (st.hasMoreElements()) {
          parts.add(st.nextToken());
        }
      }
    }
    // trailing hyphens go after the word they were attached to
    for (int i = 0; i < hyphensAtEnd; i++) {
      parts.add("-");
    }
    return parts;
  }

  /**
   * Asks the English tagger whether it tags the word, after removing soft hyphens and
   * replacing typographic apostrophes with straight ones.
   */
  private boolean isTagged(String word) {
    String normalized = SOFT_HYPHEN.matcher(word).replaceAll("");
    normalized = CURLY_QUOTE.matcher(normalized).replaceAll("'");
    // The original code guards this call because "speller is not thread-safe".
    // NOTE(review): the lock is per tokenizer instance while EnglishTagger.INSTANCE is
    // shared — confirm whether a shared lock is needed.
    synchronized (this) {
      return EnglishTagger.INSTANCE.tag(Arrays.asList(normalized)).get(0).isTagged();
    }
  }

  /** Checks the word against the hyphenated exceptions, ignoring case. */
  private static boolean isHyphenatedException(String word) {
    // some camel-case words containing hyphen (is there any better fix?)
    for (String exception : HYPHENATED_EXCEPTIONS) {
      if (exception.equalsIgnoreCase(word)) {
        return true;
      }
    }
    return false;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy