org.languagetool.tokenizers.uk.UkrainianWordTokenizer Maven / Gradle / Ivy
The newest version!
/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tokenizers.uk;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;
/**
* Tokenizes a sentence into words.
* Punctuation and whitespace gets its own token.
* Specific to Ukrainian: apostrophes (0x27 and U+2019) not in the list as they are part of the word
*
* @author Andriy Rysin
*/
public class UkrainianWordTokenizer implements Tokenizer {
private static final String SPLIT_CHARS =
// "(?/|\\\\…°$€₴=№§¿¡~×]"
+ "|%(?![-\u2013][а-яіїєґ])" // allow 5%-й
+ "|(?", Pattern.CASE_INSENSITIVE);
// tokenize initials with dot before last name, e.g. "А.", "Ковальчук"
private static final Pattern INITIALS_DOT_PATTERN_SP_2 = Pattern.compile("([А-ЯІЇЄҐ])\\.([\\h\\v]{0,5}[А-ЯІЇЄҐ])\\.([\\h\\v]{0,5}[А-ЯІЇЄҐ][а-яіїєґ']+)");
private static final Pattern INITIALS_DOT_PATTERN_SP_1 = Pattern.compile("([А-ЯІЇЄҐ])\\.([\\h\\v]{0,5}[А-ЯІЇЄҐ][а-яіїєґ']+)");
// tokenize initials with dot after last name, e.g. "Ковальчук", "А."
private static final Pattern INITIALS_DOT_PATTERN_RSP_2 = Pattern.compile("([А-ЯІЇЄҐ][а-яіїєґ']+)([\\h\\v]?[А-ЯІЇЄҐ])\\.([\\h\\v]?[А-ЯІЇЄҐ])\\.");
private static final Pattern INITIALS_DOT_PATTERN_RSP_1 = Pattern.compile("([А-ЯІЇЄҐ][а-яіїєґ']+)([\\h\\v]?[А-ЯІЇЄҐ])\\.");
private static final String INITIALS_DOT_REPL_SP_2 = "$1" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER + "$3";
private static final String INITIALS_DOT_REPL_SP_1 = "$1" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER + "$2";
private static final String INITIALS_DOT_REPL_RSP_2 = "$1" + BREAKING_PLACEHOLDER + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER + "$3" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER;
private static final String INITIALS_DOT_REPL_RSP_1 = "$1" + BREAKING_PLACEHOLDER + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER;
// abbreviation dot
private static final Pattern ABBR_DOT_VO_PATTERN1 = Pattern.compile("([вВу])\\.([\\h\\v]*о)\\.");
private static final Pattern ABBR_DOT_VO_PATTERN2 = Pattern.compile("(к)\\.([\\h\\v]*с)\\.");
private static final Pattern ABBR_DOT_VO_PATTERN3 = Pattern.compile("(ч|ст)\\.([\\h\\v]*л)\\.");
// private static final Pattern ABBR_DOT_VO_PATTERN4 = Pattern.compile("(р)\\.([\\s\u00A0\u202F]*х)\\.");
private static final Pattern ABBR_DOT_TYS_PATTERN1 = Pattern.compile("([0-9IІ][\\h\\v]+)(тис|арт)\\.");
private static final Pattern ABBR_DOT_TYS_PATTERN2 = Pattern.compile("(тис|арт)\\.([\\h\\v]+[а-яіїєґ0-9])");
private static final Pattern ABBR_DOT_ART_PATTERN = Pattern.compile("([Аа]рт|[Мм]ал|[Рр]ис|[Сс]пр)\\.([\\h]*(№[\\h]*)?[0-9])");
private static final Pattern ABBR_DOT_MAN_PATTERN = Pattern.compile("(Ман)\\.([\\h]*(Сіті|[Юю]н))");
private static final Pattern ABBR_DOT_LAT_PATTERN = Pattern.compile("([^а-яіїєґА-ЯІЇЄҐ'\u0301-]лат)\\.([\\h\\v]+[a-zA-Z])");
private static final Pattern ABBR_DOT_PROF_PATTERN = Pattern.compile("(?]*|(mailto:)?[\\p{L}\\d._-]+@[\\p{L}\\d_-]+(\\.[\\p{L}\\d_-]+)+", Pattern.CASE_INSENSITIVE);
private static final int URL_START_REPLACE_CHAR = 0xE300;
private static final Pattern LEADING_DASH_PATTERN = Pattern.compile("^([\u2014\u2013])([а-яіїєґА-ЯІЇЄҐA-Z])");
private static final Pattern LEADING_DASH_PATTERN_2 = Pattern.compile("^(-)([А-ЯІЇЄҐA-Z])");
private static final Pattern NUMBER_MISSING_SPACE = Pattern.compile("((?:[\\h\\v\uE110]|^)[а-яїієґА-ЯІЇЄҐ'-]*[а-яїієґ']?[а-яїієґ])([0-9]+(?![а-яіїєґА-ЯІЇЄҐa-zA-Z»\"“]))");
private static final Pattern WEB_ENTITIES = Pattern.compile("([а-яіїєґ])\\.(НЕТ|net|Інфо|Info|City|Life|UA|юа|лі|media|com|фм|ru|Ру)\\b", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final Pattern WEB_ENTITIES2 = Pattern.compile("\\.([a-z_-]+)\\.(ua)", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
public UkrainianWordTokenizer() {
}
@Override
public List tokenize(String text) {
HashMap urls = new HashMap<>();
if( ! text.trim().isEmpty() ) {
text = adjustTextForTokenizing(text, urls);
}
List tokenList = new ArrayList<>();
List tokens = splitWithDelimiters(text, SPLIT_CHARS_REGEX);
for(String token: tokens) {
if( token.equals(BREAKING_PLACEHOLDER) )
continue;
token = token.replace(DECIMAL_COMMA_SUBST, ',');
token = token.replace(NON_BREAKING_SLASH_SUBST, '/');
token = token.replace(NON_BREAKING_COLON_SUBST, ':');
token = token.replace(NON_BREAKING_SPACE_SUBST, ' ');
token = token.replace(LEFT_BRACE_SUBST, '(');
token = token.replace(RIGHT_BRACE_SUBST, ')');
token = token.replace(LEFT_ANGLE_SUBST, '<');
token = token.replace(RIGHT_ANGLE_SUBST, '>');
token = token.replace(SLASH_SUBST, '/');
// outside of if as we also replace back sentence-ending abbreviations
token = token.replace(NON_BREAKING_DOT_SUBST, '.');
token = token.replace(SOFT_HYPHEN_WRAP_SUBST, SOFT_HYPHEN_WRAP);
token = token.replace(NON_BREAKING_PLACEHOLDER, "");
token = token.replace(NON_BREAKING_PLACEHOLDER2, "");
if( ! urls.isEmpty() ) {
for(Entry entry : urls.entrySet()) {
token = token.replace(entry.getKey(), entry.getValue());
}
}
tokenList.add( token );
}
return tokenList;
}
private String adjustTextForTokenizing(String text, HashMap urls) {
text = cleanup(text);
if( "\u2014\u2013-".indexOf(text.charAt(0)) >=0 ) {
Matcher matcher = LEADING_DASH_PATTERN.matcher(text);
if( matcher.find() ) {
text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
}
else {
matcher = LEADING_DASH_PATTERN_2.matcher(text);
if( matcher.find() ) {
text = matcher.replaceFirst("$1"+BREAKING_PLACEHOLDER+"$2");
}
}
}
if( text.contains(",") ) {
text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
}
// check for urls
if( text.contains("http") || text.contains("www") || text.contains("@") || text.contains("ftp") ) { // https?|ftp
Matcher matcher = URL_PATTERN.matcher(text);
int urlReplaceChar = URL_START_REPLACE_CHAR;
while( matcher.find() ) {
String urlGroup = matcher.group();
String replaceChar = String.valueOf((char)urlReplaceChar);
urls.put(replaceChar, urlGroup);
text = matcher.replaceFirst(replaceChar);
urlReplaceChar++;
matcher = URL_PATTERN.matcher(text);
}
}
if( text.indexOf('\u2014') != -1 ) {
text = text.replaceAll("\u2014([\\h\\v])", BREAKING_PLACEHOLDER + "\u2014$1");
}
boolean nDashPresent = text.indexOf('\u2013') != -1;
if( text.indexOf('-') != -1 || nDashPresent ) {
text = DASH_NUMBERS_PATTERN.matcher(text).replaceAll(DASH_NUMBERS_REPL);
if( nDashPresent ) {
text = N_DASH_SPACE_PATTERN.matcher(text).replaceAll(N_DASH_SPACE_REPL);
text = N_DASH_SPACE_PATTERN2.matcher(text).replaceAll(N_DASH_SPACE_REPL);
}
}
if( text.indexOf("с/г") != -1 ) {
text = text.replaceAll("с/г", "с" +NON_BREAKING_SLASH_SUBST + "г");
}
if( text.indexOf("Л/ДНР") != -1 ) {
text = text.replaceAll("Л/ДНР", "Л" +NON_BREAKING_SLASH_SUBST + "ДНР");
}
if( text.indexOf("р.") != -1 ) {
Matcher matcher = YEAR_WITH_R.matcher(text);
if( matcher.find() ) {
text = matcher.replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");
}
}
// leave only potential hashtags together
text = text.replace("#", BREAKING_PLACEHOLDER + "#");
// leave numbers with following % together
if( text.indexOf('%') >= 0 ) {
text = text.replaceAll("%([^-])", "%" + BREAKING_PLACEHOLDER + "$1");
}
text = COMPOUND_WITH_QUOTES1.matcher(text).replaceAll("$1$2\uE120$3\uE120$4\uE120");
text = COMPOUND_WITH_QUOTES2.matcher(text).replaceAll("$1\uE120$2\uE120$3\uE120$4");
if( text.indexOf('[') != -1 ) {
text = WORDS_WITH_BRACKETS_PATTERN.matcher(text).replaceAll("$1\\[\uE120$2\\]\uE120");
}
// if period is not the last character in the sentence
int dotIndex = text.indexOf('.');
String textRtrimmed = text.replaceFirst("[\\h\\v]*$", "");
boolean dotInsideSentence = dotIndex >= 0 && dotIndex < textRtrimmed.length()-1;
if( dotInsideSentence
|| (dotIndex == textRtrimmed.length()-1
&& ABBR_AT_THE_END.matcher(text).find()) ) { // ugly - special case for тис. та ініціалів
text = DOTTED_NUMBERS_PATTERN3.matcher(text).replaceAll("$1.\uE120$2.\uE120$3");
text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1.\uE120$2");
text = ABBR_DOT_NAR_PATTERN_1.matcher(text).replaceAll("$1.\uE120\uE110");
text = ABBR_DOT_NAR_PATTERN_2.matcher(text).replaceAll("$1.\uE120\uE110$2");
text = ABBR_DOT_2_SMALL_LETTERS_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110"); //.replaceFirst("(([смкд]|мк)?м\\.[\\h\\v]*)\uE120\uE110$", "$1");
text = ABBR_DOT_VO_PATTERN1.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
text = ABBR_DOT_VO_PATTERN2.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
text = ABBR_DOT_VO_PATTERN3.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
text = ABBR_DOT_ART_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_MAN_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_TYS_PATTERN1.matcher(text).replaceAll("$1$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
text = ABBR_DOT_TYS_PATTERN2.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_LAT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_PROF_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_GUB_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
text = ABBR_DOT_DASH_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2");
text = INITIALS_DOT_PATTERN_SP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_2);
text = INITIALS_DOT_PATTERN_SP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_1);
text = INITIALS_DOT_PATTERN_RSP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_2);
text = INITIALS_DOT_PATTERN_RSP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_1);
// text = ABBR_DOT_INVALID_DOT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
text = ABBR_DOT_KUB_SM_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
text = ABBR_DOT_S_G_PATTERN.matcher(text).replaceAll("$1" + NON_BREAKING_DOT_SUBST + "$2" + NON_BREAKING_DOT_SUBST + BREAKING_PLACEHOLDER);
text = ABBR_DOT_CHL_KOR_PATTERN.matcher(text).replaceAll("$1.\uE120$2.\uE120\uE110");
text = ABBR_DOT_PN_ZAH_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2.\uE120\uE110");
text = ABBR_DOT_I_T_P_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
text = ABBR_DOT_I_T_CH_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
text = ABBR_DOT_T_ZV_PATTERN.matcher(text).replaceAll("$1\uE120\uE110$2\uE120\uE110");
text = ABBR_DOT_RED_AVT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
text = ABBR_DOT_NON_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");
text = ABBR_DOT_NON_ENDING_PATTERN_2.matcher(text).replaceAll("$1\uE120\uE110$2");
text = INVALID_MLN_DOT_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110$2");
}
if( dotInsideSentence ) {
text = WEB_ENTITIES.matcher(text).replaceAll("$1.\uE120$2");
text = WEB_ENTITIES2.matcher(text).replaceAll(".\uE120$1.\uE120$2");
}
text = ABBR_DOT_ENDING_PATTERN.matcher(text).replaceAll("$1.\uE120\uE110");
// 2 000 000
Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
if( spacedDecimalMatcher.find() ) {
StringBuffer sb = new StringBuffer();
do {
String splitNumber = spacedDecimalMatcher.group(0);
String splitNumberAdjusted = splitNumber.replace(' ', NON_BREAKING_SPACE_SUBST);
splitNumberAdjusted = splitNumberAdjusted.replace('\u00A0', NON_BREAKING_SPACE_SUBST);
splitNumberAdjusted = splitNumberAdjusted.replace('\u202F', NON_BREAKING_SPACE_SUBST);
spacedDecimalMatcher.appendReplacement(sb, splitNumberAdjusted);
} while( spacedDecimalMatcher.find() );
spacedDecimalMatcher.appendTail(sb);
text = sb.toString();
}
// 12:25
if( text.contains(":") ) {
text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
}
// ВКПБ(о)
if( text.contains("(") ) {
text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
}
if( text.contains("<") ) {
text = XML_TAG_PATTERN.matcher(text).replaceAll(BREAKING_PLACEHOLDER + LEFT_ANGLE_SUBST + "$1" + RIGHT_ANGLE_SUBST + BREAKING_PLACEHOLDER);
text = text.replace(LEFT_ANGLE_SUBST+"/", "" + LEFT_ANGLE_SUBST + SLASH_SUBST);
text = text.replace("/" + RIGHT_ANGLE_SUBST, "" + SLASH_SUBST + RIGHT_ANGLE_SUBST);
}
if( text.contains("-") ) {
text = text.replaceAll("([а-яіїєґА-ЯІЇЄҐ])([»\"-]+-)", "$1" + BREAKING_PLACEHOLDER + "$2");
text = text.replaceAll("([»\"-]+-)([а-яіїєґА-ЯІЇЄҐ])", "$1" + BREAKING_PLACEHOLDER + "$2");
}
if( text.contains(SOFT_HYPHEN_WRAP) ) {
text = text.replaceAll("(?= 0 ) {
text = APOSTROPHE_BEGIN_PATTERN.matcher(text).replaceAll("$1'" + BREAKING_PLACEHOLDER + "$2");
text = APOSTROPHE_END_PATTER.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "'$2");
}
if( text.contains("+") ) {
text = text.replaceAll("\\+(?=[а-яіїєґА-ЯІЇЄҐ0-9])", BREAKING_PLACEHOLDER + "+" + BREAKING_PLACEHOLDER);
}
// -20C
if( text.length() > 1 && (text.contains("-") || text.contains("\u2013")) ) {
text = text.replaceAll("(?<=(^|[\\h\\v]))([-\u2013])(?=[0-9])", "$2" + BREAKING_PLACEHOLDER);
}
text = NUMBER_MISSING_SPACE.matcher(text).replaceAll("$1" + BREAKING_PLACEHOLDER + "$2");
return text;
}
private static String cleanup(String text) {
text = text
.replace('\u2019', '\'')
.replace('\u02BC', '\'')
.replace('\u2018', '\'')
// .replace('`', '\'')
// .replace('´', '\'')
.replace('\u201A', ',') // SINGLE LOW-9 QUOTATION MARK sometimes used as a comma
.replace('\u2011', '-'); // we handle \u2013 in tagger so we can base our rule on it
text = WEIRD_APOSTROPH_PATTERN.matcher(text).replaceAll("$1\uE120$2\uE120$3");
return text;
}
private static List splitWithDelimiters(String str, Pattern delimPattern) {
List parts = new ArrayList();
Matcher matcher = delimPattern.matcher(str);
int lastEnd = 0;
while (matcher.find()) {
int start = matcher.start();
if (lastEnd != start) {
String nonDelim = str.substring(lastEnd, start);
parts.add(nonDelim);
}
String delim = matcher.group();
parts.add(delim);
lastEnd = matcher.end();
}
if (lastEnd != str.length()) {
String nonDelim = str.substring(lastEnd);
parts.add(nonDelim);
}
return parts;
}
}