edu.emory.mathcs.nlp.component.tokenizer.Tokenizer Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.component.tokenizer;
import edu.emory.mathcs.nlp.common.constant.CharConst;
import edu.emory.mathcs.nlp.common.constant.StringConst;
import edu.emory.mathcs.nlp.common.util.CharUtils;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.common.util.MetaUtils;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Currency;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Dictionary;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Emoticon;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Unit;
import edu.emory.mathcs.nlp.component.tokenizer.token.Token;
import edu.emory.mathcs.nlp.component.tokenizer.token.TokenIndex;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import org.magicwerk.brownies.collections.GapList;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author Jinho D. Choi ({@code [email protected]})
*/
abstract public class Tokenizer
{
/** Extra characters that {@link #isSymbolInBetween(char)} accepts in addition to brackets, arrows, double quotes and hyphens. */
protected final CharSet S_SYMBOL_IN_BETWEEN = new CharOpenHashSet(new char[]{CharConst.SEMICOLON, CharConst.COMMA, CharConst.TILDA, CharConst.EQUAL, CharConst.PLUS, CharConst.AMPERSAND, CharConst.PIPE, CharConst.FW_SLASH});
/** Pattern matched by {@link #preservePeriod(char[], int, String)} against the text in front of a period. */
protected final Pattern P_ABBREVIATION = PatternUtils.createClosedPattern("\\p{Alnum}([\\.|-]\\p{Alnum})*");
/** Two digits optionally followed by an apostrophe and/or {@code s}; checked when the symbol in front of a chunk is an apostrophe. */
protected final Pattern P_YEAR = PatternUtils.createClosedPattern("\\d\\d['\u2019]?[sS]?");
/** Two- or four-digit number, a hyphen, and a second number; used by {@link #tokenizeYears(List, String, int)}. */
protected final Pattern P_YEAR_YEAR = PatternUtils.createClosedPattern("(\\d{2}|\\d{4})(-)(\\d{2}|\\d{4}|\\d{2}[sS])");
/** Emoticon dictionary queried by {@link #getMetaRange(String)}. */
protected Emoticon d_emoticon;
/** Currency dictionary used when splitting morphemes and when a chunk ends with a dollar sign. */
protected Currency d_currency;
/** Unit dictionary used when splitting morphemes. */
protected Unit d_unit;
/** Chunks that must never be split; holds the trimmed lines of {@code preserve.txt} (see {@link #initPreserve()}). */
protected Set<String> d_preserve;
/**
 * Creates the emoticon, currency and unit dictionaries and loads the preserve list
 * through {@link #initPreserve()}.
 */
public Tokenizer()
{
	this.d_emoticon = new Emoticon();
	this.d_currency = new Currency();
	this.d_unit     = new Unit();
	this.d_preserve = initPreserve();
}
/**
 * Reads the {@code preserve.txt} resource under {@link Dictionary#ROOT} and collects
 * every line, trimmed, into a set.
 * The reader is closed when reading finishes or fails; an {@link IOException} is reported
 * on standard error and whatever was read up to that point is returned.
 * @return the set of trimmed lines (never {@code null}).
 */
private Set<String> initPreserve()
{
	Set<String> set = new HashSet<>();
	// try-with-resources: the original never closed this reader.
	try (BufferedReader reader = IOUtils.createBufferedReader(IOUtils.getInputStreamsFromResource(Dictionary.ROOT+"preserve.txt")))
	{
		String line;
		while ((line = reader.readLine()) != null)
			set.add(line.trim());
	}
	catch (IOException e) {e.printStackTrace();}
	return set;
}
// ============================== Public ==============================
/**
 * Tokenizes the input stream line by line and gives every token offsets relative to the
 * beginning of the stream.
 * The first line starts at offset {@code 0}; each later line starts at the end of the previous
 * line plus the length of the {@code line.separator} system property.
 * NOTE(review): {@link BufferedReader#readLine()} drops the actual terminator, so offsets drift
 * if the stream uses line breaks of a different length than the platform separator.
 * The stream is always closed, even when reading fails; an {@link IOException} is reported on
 * standard error and the tokens collected so far are returned.
 * @param in the stream to read; closed by this method.
 * @return a list of tokens in the specific input stream.
 */
public List<Token> tokenize(InputStream in)
{
	ArrayList<Token> tokens = new ArrayList<>();
	// Loop-invariant, so looked up once instead of once per line.
	final int separatorLength = System.getProperty("line.separator").length();
	boolean firstLine = true;
	int start, end = 0;
	try (BufferedReader reader = IOUtils.createBufferedReader(in))
	{
		String line;
		while ((line = reader.readLine()) != null)
		{
			start = firstLine ? 0 : end + separatorLength;
			end = start + line.length();
			firstLine = false;
			List<Token> t = tokenizeWhiteSpaces(line, start);
			if (!t.isEmpty()) tokens.addAll(t);
		}
	}
	catch (IOException e) {e.printStackTrace();}
	tokens.trimToSize();
	return tokens;
}
/**
 * Tokenizes a single string; token offsets are relative to the beginning of {@code s}.
 * @param s the text to tokenize.
 * @return a list of tokens in the specific string.
 */
public List<Token> tokenize(String s)
{
	return tokenizeWhiteSpaces(s, 0);
}
public List> segmentize(InputStream in)
{
return segmentize(tokenize(in));
}
public List> segmentize(String s)
{
return segmentize(tokenize(s));
}
/**
 * Tokenizes each line of the input stream independently and prints the tokens of that line,
 * joined by {@code delim}, as one line on {@code out}.
 * The reader created here is not closed, so {@code in} stays open for the caller.
 * An {@link IOException} is reported on standard error and ends the processing.
 * @param in the stream to read line by line.
 * @param out the stream receiving one output line per input line.
 * @param delim the delimiter placed between tokens.
 * @param flag 0: word-form, 1: simplified word-form, 2: decapitalized simplified word-form.
 */
public void tokenizeLine(InputStream in, PrintStream out, String delim, int flag)
{
	BufferedReader reader = IOUtils.createBufferedReader(in);
	String line;
	try
	{
		while ((line = reader.readLine()) != null)
		{
			List<Token> tokens = tokenize(line);
			if (flag > 0)
			{
				for (Token token : tokens)
				{
					String s = token.getWordForm();
					// Only flags 1 and 2 rewrite the form; the second argument is true for flag 2 only.
					if (flag == 1 || flag == 2) s = StringUtils.toSimplifiedForm(s, flag == 2);
					token.setWordForm(s);
				}
			}
			out.println(Joiner.join(tokens, delim));
		}
	}
	catch (IOException e) {e.printStackTrace();}
}
abstract public List> segmentize(List tokens);
// ============================== Tokenize ==============================
/**
 * Splits {@code s} at white spaces and sends every non-empty chunk to
 * {@link #tokenizeMetaInfo(List, String, int, int)}; when at least one token was produced
 * the list is post-processed by {@link #finalize(List, String)}.
 * Called by {@link #tokenize(InputStream)} and {@link #tokenize(String)}.
 * @param s the text to split.
 * @param start the offset assigned to the first character of {@code s}.
 * @return the tokens found in {@code s}; empty if {@code s} contains only white spaces.
 */
private List<Token> tokenizeWhiteSpaces(String s, int start)
{
	List<Token> tokens = new GapList<>();
	char[] cs = s.toCharArray();
	// i and bIndex are absolute offsets; subtract start to index into s / cs.
	int i, end = start + cs.length, bIndex = start;
	for (i = start; i < end; i++)
	{
		if (CharUtils.isWhiteSpace(cs[i - start]))
		{
			if (bIndex < i) tokenizeMetaInfo(tokens, s.substring(bIndex - start, i - start), bIndex, i);
			bIndex = i + 1;
		}
	}
	// Trailing chunk: pass the absolute end offset, as the in-loop call does
	// (the original passed the chunk length here instead).
	if (bIndex < end) tokenizeMetaInfo(tokens, s.substring(bIndex - start), bIndex, end);
	if (!tokens.isEmpty()) finalize(tokens, s);
	return tokens;
}
/**
 * Looks for a meta span (see {@link #getMetaRange(String)}) inside a white-space-free chunk.
 * When a span exists, the text before it and the text after it go through
 * {@link #tokenizeSymbols(List, String, TokenIndex)} and the span itself becomes one token;
 * otherwise the whole chunk goes through {@link #tokenizeSymbols(List, String, TokenIndex)}.
 * Called by {@link #tokenizeWhiteSpaces(String, int)}.
 * @param tokens the list receiving the tokens.
 * @param s the chunk to tokenize.
 * @param bIndex2 the offset of the first character of {@code s}.
 * @param i not read by this method.
 */
private void tokenizeMetaInfo(List tokens, String s, int bIndex2, int i)
{
	final TokenIndex offset = new TokenIndex(bIndex2);
	final int[] range = getMetaRange(s);

	if (range == null)
	{
		tokenizeSymbols(tokens, s, offset);
		return;
	}

	final int metaBegin = range[0];
	final int metaEnd = range[1];

	if (metaBegin > 0)
		tokenizeSymbols(tokens, s.substring(0, metaBegin), offset);

	// The span is kept as one token starting wherever the prefix left the running offset.
	final String meta = s.substring(metaBegin, metaEnd);
	final int tokenBegin = offset.getVal();
	final int tokenEnd = tokenBegin + metaEnd - metaBegin;
	tokens.add(new Token(meta, tokenBegin, tokenEnd));
	offset.setVal(tokenEnd);

	if (metaEnd < s.length())
		tokenizeSymbols(tokens, s.substring(metaEnd), offset);
}
/**
 * Finds the part of {@code s} that must be kept as a single token.
 * Checks, in this order: a network-protocol prefix or a preserve-list entry (whole string),
 * an emoticon range, and the first hyperlink match.
 * Called by {@link #tokenizeMetaInfo(List, String, int, int)}.
 * @return {@code {begin, end}} of the span, or {@code null} if none of the checks applies.
 */
private int[] getMetaRange(String s)
{
	final boolean wholeChunk = MetaUtils.startsWithNetworkProtocol(s) || d_preserve.contains(s);
	if (wholeChunk)
		return new int[]{0, s.length()};

	final int[] emoticon = d_emoticon.getEmoticonRange(s);
	if (emoticon != null)
		return emoticon;

	final Matcher link = MetaUtils.HYPERLINK.matcher(s);
	return link.find() ? new int[]{link.start(), link.end()} : null;
}
/**
 * Splits a chunk into a leading symbol run, inner spans and a trailing symbol run.
 * A chunk made of symbols only is handed to {@code addSymbols}; otherwise the span
 * boundaries are collected and handed to {@code tokenizeSymbolsAux}.
 * Called by {@link #tokenizeMetaInfo(List, String, int, int)}.
 * @param tokens the list receiving the tokens.
 * @param s the chunk to split.
 * @param bIndex2 the running offset of the first character of {@code s}.
 */
private void tokenizeSymbols(List tokens, String s, TokenIndex bIndex2)
{
	final char[] chars = s.toCharArray();
	final int length = s.length();
	final int firstNonSymbol = getFirstNonSymbolIndex(chars);

	if (firstNonSymbol != length)
	{
		final int lastSequence = getLastSymbolSequenceIndex(chars);
		final List<int[]> spans = new ArrayList<>();

		spans.add(new int[]{0, firstNonSymbol});                  // leading symbols
		addNextSymbolSequenceIndices(spans, chars, firstNonSymbol+1, lastSequence-1);
		spans.add(new int[]{lastSequence, length});               // trailing symbols
		tokenizeSymbolsAux(tokens, s, chars, spans, bIndex2);
	}
	else
	{
		addSymbols(tokens, s, bIndex2);
	}
}
/**
 * Finds the index of the first character that is not a symbol according to {@link #isSymbol(char)}.
 * Called by {@link #tokenizeSymbols(List, String, TokenIndex)}.
 * NOTE(review): this listing had lost the text between the header of this loop and the tail
 * of the loop in {@code getLastSymbolSequenceIndex}; both methods are restored here from the
 * surviving fragments and the original javadoc - confirm against the upstream source.
 * @return {@code 0} if the first character in {@code cs} is not a symbol;
 *         {@code cs.length} if all characters in {@code cs} are symbols.
 */
private int getFirstNonSymbolIndex(char[] cs)
{
	int i, len = cs.length;
	for (i=0; i<len; i++)
	{
		if (!isSymbol(cs[i]))
			return i;
	}
	return i;
}
/**
 * Finds the index at which the trailing run of symbols begins, i.e. one past the last
 * character that is not a symbol according to {@link #isSymbol(char)}.
 * Called by {@link #tokenizeSymbols(List, String, TokenIndex)}.
 * @return {@code cs.length} if the last character in {@code cs} is not a symbol;
 *         {@code 0} if all characters in {@code cs} are symbols.
 */
private int getLastSymbolSequenceIndex(char[] cs)
{
	int i;
	for (i=cs.length-1; i>=0; i--)
	{
		if (!isSymbol(cs[i]))
			return i+1;
	}
	// i is -1 here (every character is a symbol), so this yields 0.
	return i+1;
}
/**
 * Called by {@link #tokenizeSymbols(List, String)}.
 * NOTE(review): this region is damaged. Text between a '<' and the following '>' appears to have
 * been stripped from the listing, which fuses what look like three methods
 * (addNextSymbolSequenceIndices, tokenizeSymbolsAux and adjustFirstNonSymbolIndex) into one
 * non-compiling span. Restore it from the upstream source before editing the logic.
 */
private void addNextSymbolSequenceIndices(List indices, char[] cs, int bIndex, int eIndex)
{
int i, j;
// NOTE(review): the loop condition and body are lost; the rest of the next line is the tail of
// the parameter list of a method taking (tokens, s, cs, indices, bIndex2).
for (i=bIndex; i tokens, String s, char[] cs, List indices, TokenIndex bIndex2)
{
// size is the index of the last entry in indices.
int i, pg, ng, bIndex, eIndex, size = indices.size() - 1;
boolean pb, nb;
int[] pi, ni;
String t;
// NOTE(review): the loop header and the assignments of pi, ni, pg, ng, bIndex, eIndex, t and
// most of the pb expression are lost on the next line.
for (i=0; i 0 : pg == 1;
// For the last pair any positive ng qualifies; elsewhere ng must be exactly 1.
nb = (i+1 == size) ? ng > 0 : ng == 1;
// When flagged, the end of the preceding span / the start of the following span is re-adjusted
// so that a neighbouring symbol can stay attached to the text t.
if (pb) pi[1] = adjustFirstNonSymbolIndex(cs, bIndex, t);
if (nb) ni[0] = adjustLastSymbolSequenceIndex(cs, eIndex, t);
}
}
// NOTE(review): the next line fuses a lost loop with the first branch of what appears to be
// adjustFirstNonSymbolIndex(cs, beginIndex, t); the declarations of sym, curr and gap are lost.
for (i=0; i 0)
{
// A positive gap moves the begin index back by that many characters.
beginIndex -= gap;
}
else if (CharUtils.isPreDigitSymbol(sym))
{
if (CharUtils.isDigit(curr)) beginIndex--; // -1, .1, +1
}
else if ((sym == CharConst.AT || sym == CharConst.POUND))
{
if (CharUtils.isAlphabet(curr)) beginIndex--; // @A, #A
}
else if (CharUtils.isApostrophe(sym))
{
// Keeps the apostrophe when t matches the two-digit year pattern P_YEAR.
if (P_YEAR.matcher(t).find()) beginIndex--;
}
return beginIndex;
}
/**
 * Decides how many symbols following the text {@code t} stay attached to it.
 * The subclass hook {@link #adjustLastSymbolSequenceGap(char[], int, String)} is asked first;
 * if it returns a positive gap, that gap is added. Otherwise a dollar sign is kept when the
 * lower-cased text is accepted by the currency dictionary, and a period is kept when
 * {@link #preservePeriod(char[], int, String)} says so.
 * @param cs the characters of the chunk being tokenized.
 * @param endIndex the index in {@code cs} of the first symbol after {@code t}.
 * @param t the text preceding the symbol.
 * @return the (possibly advanced) end index.
 */
protected int adjustLastSymbolSequenceIndex(char[] cs, int endIndex, String t)
{
	final String lowered = StringUtils.toLowerCase(t);
	final char symbol = cs[endIndex];
	final int gap = adjustLastSymbolSequenceGap(cs, endIndex, t);

	if (gap > 0)
		return endIndex + gap;

	if (symbol == CharConst.DOLLAR)
		return d_currency.isCurrencyDollar(lowered) ? endIndex + 1 : endIndex;

	if (symbol == CharConst.PERIOD)
		return preservePeriod(cs, endIndex, t) ? endIndex + 1 : endIndex;

	return endIndex;
}
/**
 * Subclass hook. Called by {@link #adjustFirstNonSymbolIndex(char[], int, String)}.
 * NOTE(review): the calling code is damaged in this listing; the surviving part subtracts a
 * positive gap from the begin index and skips the remaining rules - presumably that gap is
 * this method's return value (verify against the upstream source).
 */
abstract protected int adjustFirstNonSymbolGap(char[] cs, int beginIndex, String t);
/**
 * Subclass hook. Called by {@link #adjustLastSymbolSequenceIndex(char[], int, String)} before its
 * own dollar and period rules.
 * @return a gap; when it is greater than {@code 0} the caller adds it to the end index and
 *         applies no further rule, otherwise the caller falls through to its own rules.
 */
abstract protected int adjustLastSymbolSequenceGap(char[] cs, int endIndex, String t);
// ----------------------------------- Add symbols -----------------------------------
/**
 * Adds the symbols in {@code s} as tokens. Called by {@link #tokenizeSymbols(List, String)}.
 * A single-character string becomes one token of length 1 and the running offset advances by 1.
 * NOTE(review): the multi-character branch is damaged. Text between a '<' and the following '>'
 * appears to have been stripped from the listing, so the loop body, the end of this method and the
 * start of a commented-out getSpanIndex variant are missing. Restore it from the upstream source.
 * @return the running offset after the added token (single-character case).
 */
private int addSymbols(List tokens, String s, TokenIndex bIndex2)
{
if (s.length() == 1)
{
// One symbol: emit it as is and advance the shared offset by one character.
Token Token = new Token(s, bIndex2.getVal(), bIndex2.getVal() + 1);
tokens.add(Token);
bIndex2.setVal(bIndex2.getVal() + 1);
return bIndex2.getVal();
}
int i, j, flag, len = s.length(), bIndex = 0;
char[] cs = s.toCharArray();
// NOTE(review): truncated line - everything after "i" up to a '>' inside commented-out code is lost.
for (i=0; ileftBound; i--)
// {
// if (!isConsecutive(cs, i, c, finalMark))
// return i+1;
// }
//
// return i+1;
// }
// ----------------------------------- Add morphemes -----------------------------------
/**
 * Adds {@code s} to {@code tokens}, splitting it further when one of the splitters applies.
 * A one-character string is added directly. Otherwise the currency dictionary, the unit
 * dictionary, {@link #tokenizeDigit(List, String, char[], TokenIndex)} and
 * {@link #tokenizeWordsMore(List, String, String, char[], TokenIndex)} are tried in that order,
 * stopping at the first one that reports success; if none does, {@code s} is added whole.
 * @param tokens the list receiving the tokens.
 * @param s the text to add.
 * @param bIndex2 the running offset of the first character of {@code s}; advanced past {@code s}.
 * @return the running offset after the added tokens.
 */
private int addMorphemes(List tokens, String s, TokenIndex bIndex2)
{
	if (s.length() != 1)
	{
		final char[] lcs = s.toCharArray();
		// toLowerCase lower-cases lcs in place and reports whether anything changed.
		final String lower = CharUtils.toLowerCase(lcs) ? new String(lcs) : s;

		final boolean split =
				tokenize(tokens, s, lower, lcs, d_currency, bIndex2) ||
				tokenize(tokens, s, lower, lcs, d_unit, bIndex2) ||
				tokenizeDigit(tokens, s, lcs, bIndex2) ||
				tokenizeWordsMore(tokens, s, lower, lcs, bIndex2);

		if (split)
			return bIndex2.getVal();
	}

	// Single character, or no splitter applied: keep the text as one token.
	final int begin = bIndex2.getVal();
	final int end = begin + s.length();
	tokens.add(new Token(s, begin, end));
	bIndex2.setVal(end);
	return bIndex2.getVal();
}
/**
 * Lets a dictionary split the text and, when it does, appends the pieces as tokens.
 * Called by {@link #addMorphemes(List, String, TokenIndex)}.
 * @param tokens the list receiving the tokens.
 * @param original the text as written.
 * @param lower the lower-cased text.
 * @param lcs the characters of the lower-cased text.
 * @param tokenizer the dictionary asked to split the text.
 * @param bIndex2 the running offset; advanced past the pieces when a split happens.
 * @return {@code true} if the dictionary produced pieces, {@code false} if it returned {@code null}.
 */
protected boolean tokenize(List tokens, String original, String lower, char[] lcs, Dictionary tokenizer, TokenIndex bIndex2)
{
	final String[] pieces = tokenizer.tokenize(original, lower, lcs);

	if (pieces == null)
		return false;

	final int next = addAll(tokens, pieces, bIndex2.getVal());
	bIndex2.setVal(next);
	return true;
}
/**
 * Appends one token per array element, laying the tokens out back to back.
 * @param tokens the list receiving the tokens.
 * @param array the word forms to add, in order.
 * @param bIndex2 the offset of the first word form.
 * @return the offset right after the last added token ({@code bIndex2} if the array is empty).
 */
public int addAll(List tokens, String[] array, int bIndex2)
{
	int offset = bIndex2;

	for (int k = 0; k < array.length; k++)
	{
		final String form = array[k];
		final int next = offset + form.length();
		tokens.add(new Token(form, offset, next));
		offset = next;
	}

	return offset;
}
/**
 * Splits one marker character (see {@link #tokenizeDigitAux(char)}) off the front or the back
 * of a string whose remaining characters pass {@code CharUtils.containsDigitPunctuationOnly}.
 * The leading position is tried first. Called by {@link #addMorphemes(List, String, TokenIndex)}.
 * @param tokens the list receiving the two tokens.
 * @param original the text as written.
 * @param lcs the characters of the text.
 * @param bIndex2 the running offset; advanced past {@code original} when a split happens.
 * @return {@code true} if two tokens were added, {@code false} if nothing was added.
 */
private boolean tokenizeDigit(List tokens, String original, char[] lcs, TokenIndex bIndex2)
{
	final int len = lcs.length;
	if (len < 2) return false;

	// Position at which the text is cut into two tokens.
	final int cut;

	if (tokenizeDigitAux(lcs[0]) && CharUtils.containsDigitPunctuationOnly(lcs, 1, len))
		cut = 1;
	else if (tokenizeDigitAux(lcs[len-1]) && CharUtils.containsDigitPunctuationOnly(lcs, 0, len-1))
		cut = len - 1;
	else
		return false;

	final int begin = bIndex2.getVal();
	final int middle = begin + cut;
	tokens.add(new Token(original.substring(0, cut), begin, middle));
	bIndex2.setVal(middle);

	final int end = middle + original.length() - cut;
	tokens.add(new Token(original.substring(cut), middle, end));
	bIndex2.setVal(end);
	return true;
}
/**
 * Helper of {@link #tokenizeDigit(List, String, char[], TokenIndex)}.
 * @return {@code true} if {@code c} is a pound sign, dollar sign, percent sign, asterisk or equal sign.
 */
private boolean tokenizeDigitAux(char c)
{
	if (c == CharConst.POUND)    return true;
	if (c == CharConst.DOLLAR)   return true;
	if (c == CharConst.PERCENT)  return true;
	if (c == CharConst.ASTERISK) return true;
	return c == CharConst.EQUAL;
}
/**
 * Subclass hook tried last by {@link #addMorphemes(List, String)}, after the currency
 * dictionary, the unit dictionary and the digit splitter have all declined.
 * @return {@code true} if the implementation handled the text itself (the caller then adds
 *         nothing); {@code false} to let the caller add {@code original} as one token.
 *         Presumably an implementation returning {@code true} must also add its tokens and
 *         advance {@code bIndex2}, as the sibling splitters do - verify against subclasses.
 */
abstract protected boolean tokenizeWordsMore(List tokens, String original, String lower, char[] lcs, TokenIndex bIndex2);
// ----------------------------------- Finalize -----------------------------------
/**
 * Post-processes the token list. Called by {@link #tokenize(String)}.
 * NOTE(review): this region is damaged. Text between a '<' and the following '>' appears to have
 * been stripped from the listing, so the body of this loop and the header of the next method are
 * fused into one non-compiling line. Restore it from the upstream source before editing.
 */
private void finalize(List tokens, String input)
{
int i, j, size = tokens.size();
String token, lower;
// NOTE(review): truncated line - the rest of it is the tail of the parameter list of a
// method taking (tokens, token, lower, index).
for (i=0; i tokens, String token, String lower, int index)
{
// Splits the token "no." into "no" and "." unless the next token starts with a digit.
if (lower.equals("no.") && (index+1 == tokens.size() || !CharUtils.isDigit(tokens.get(index + 1).getWordForm().charAt(0))))
{
Token currToken = tokens.get(index);
// The word part ends one character before the original end offset
// (StringUtils.trim(.., 1) presumably drops the last character - confirm).
Token Token = new Token(StringUtils.trim(
currToken.getWordForm(), 1),
currToken.getStartOffset(), currToken.getEndOffset() - 1);
tokens.set(index, Token);
// The period becomes its own token covering the last character.
Token nextInterval = new Token(StringConst.PERIOD,
currToken.getEndOffset() - 1, currToken.getEndOffset());
tokens.add(index + 1, nextInterval);
// 1 = one token was inserted.
return 1;
}
return 0;
}
/**
 * Merges a bracketed single character or digit sequence back into one token:
 * a left-bracket token, the current token and a right-bracket token are replaced by one token
 * covering all three. Called from the finalization pass.
 * @param tokens the token list, modified in place.
 * @param token the word form of the token at {@code index}.
 * @param index the index of the current token.
 * @param input not read by this method.
 * @return {@code -1} if the three tokens were merged (the merged token now sits at
 *         {@code index-1}), {@code 0} if nothing changed.
 */
private int mergeBrackets(List<Token> tokens, String token, int index, String input)
{
	boolean candidate = token.length() == 1 || StringUtils.containsDigitOnly(token);

	// Needs both a previous and a next token.
	if (!candidate || index < 1 || index+1 >= tokens.size())
		return 0;

	Token prevToken = tokens.get(index - 1);
	Token nextToken = tokens.get(index + 1);

	if (CharUtils.isLeftBracket(prevToken.getWordForm().charAt(0)) && CharUtils.isRightgBracket(nextToken.getWordForm().charAt(0)))
	{
		Token currToken = tokens.get(index);
		Token merged = new Token(prevToken.getWordForm()+currToken.getWordForm()+nextToken.getWordForm(), prevToken.getStartOffset(), nextToken.getEndOffset());
		tokens.set(index - 1, merged);
		// Remove the current and the next token; after the first removal the next one has moved to index.
		tokens.remove(index);
		tokens.remove(index);
		return -1;
	}

	return 0;
}
/**
 * Splits a token matching {@link #P_YEAR_YEAR} into the matcher groups 2, 3 and 4 through
 * {@link #addTokens(Matcher, List, int, int...)}.
 * @param tokens the token list, modified in place when the pattern matches.
 * @param token the word form of the token at {@code index}.
 * @param index the index of the current token.
 * @return the value returned by {@code addTokens}, or {@code 0} if the pattern does not match.
 */
private int tokenizeYears(List tokens, String token, int index)
{
	final Matcher years = P_YEAR_YEAR.matcher(token);

	if (!years.find())
		return 0;

	return addTokens(years, tokens, index, 2, 3, 4);
}
/**
 * Rewrites the token at {@code index} from the matcher groups listed in {@code ids}.
 * The current token receives group {@code ids[0]} as its word form and its end offset is reset.
 * NOTE(review): the rest of the method is damaged. Text between a '<' and the following '>'
 * appears to have been stripped from the listing, so the loop over the remaining ids, the return
 * statement and the header of a following abstract method are fused into one line. Restore it
 * from the upstream source before editing.
 */
protected int addTokens(Matcher m, List tokens, int index, int... ids)
{
Token prev, curr;
curr = tokens.get(index);
curr.setWordForm(m.group(ids[0]));
curr.resetEndOffset();
// NOTE(review): truncated line - the rest of it is the tail of an abstract declaration
// taking (tokens, token, lower, index).
for (int i=1; i tokens, String token, String lower, int index);
/**
 * Splits a trailing period off the last token.
 * The split happens only when the last token is longer than one character, ends with a period
 * and the character before that period is not a final mark. Does nothing for an empty list.
 * Called from the finalization pass.
 * @param tokens the token list, modified in place.
 */
private void tokenizeLastPeriod(List<Token> tokens)
{
	// Guard: the original indexed position -1 on an empty list.
	if (tokens.isEmpty()) return;

	int lastIndex = tokens.size() - 1;
	Token lastInterval = tokens.get(lastIndex);
	String lastToken = lastInterval.getWordForm();
	int leng = lastToken.length();

	if (1 < leng && lastToken.charAt(leng - 1) == CharConst.PERIOD && !CharUtils.isFinalMark(lastToken.charAt(leng - 2)))
	{
		// The period occupies the last character of the original span.
		int periodOffset = lastInterval.getEndOffset() - 1;
		// StringUtils.trim(.., 1) presumably drops the trailing character - confirm.
		tokens.set(lastIndex, new Token(StringUtils.trim(lastToken, 1), lastInterval.getStartOffset(), periodOffset));
		tokens.add(lastIndex + 1, new Token(StringConst.PERIOD, periodOffset, lastInterval.getEndOffset()));
	}
}
// ----------------------------------- Preserve -----------------------------------
/**
 * Subclass hook. Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
 * NOTE(review): the calling code is not visible in this listing; by analogy with
 * preserveSymbolInDigits and preserveSymbolInAlphabets, {@code true} presumably means the symbol
 * at {@code index} must not split the surrounding text - verify against the upstream source.
 */
abstract protected boolean preserveSymbolInBetween(char[] cs, int index);
/**
 * Tells whether the symbol at {@code index} belongs to a numeric expression.
 * <ul>
 * <li>hyphen: an alphanumeric character before it and a digit after it;</li>
 * <li>forward slash: a digit on both sides;</li>
 * <li>comma: a digit before it and exactly three digits after it (a fourth following
 *     character, if any, must not be a digit).</li>
 * </ul>
 * @param cs the characters of the chunk.
 * @param index the index of the symbol in {@code cs}.
 * @return {@code true} if one of the rules above holds, {@code false} otherwise.
 */
private boolean preserveSymbolInDigits(char[] cs, int index)
{
	final char symbol = cs[index];
	final int size = cs.length;
	final boolean hasPrevious = index >= 1;

	if (CharUtils.isHyphen(symbol))
	{
		if (!hasPrevious || index+1 >= size) return false;
		return CharUtils.isAlnum(cs[index-1]) && CharUtils.isDigit(cs[index+1]);
	}

	if (symbol == CharConst.FW_SLASH)
	{
		if (!hasPrevious || index+1 >= size) return false;
		return CharUtils.isDigit(cs[index-1]) && CharUtils.isDigit(cs[index+1]);
	}

	if (symbol == CharConst.COMMA)
	{
		if (!hasPrevious || index+3 >= size) return false;
		// A fourth character after the comma must not extend the digit group.
		if (index+4 != size && CharUtils.isDigit(cs[index+4])) return false;
		return CharUtils.isDigit(cs[index-1])
			&& CharUtils.isDigit(cs[index+1])
			&& CharUtils.isDigit(cs[index+2])
			&& CharUtils.isDigit(cs[index+3]);
	}

	return false;
}
/**
 * Tells whether the symbol at {@code index} is an ampersand with an alphabetic character
 * on both sides.
 * @param cs the characters of the chunk.
 * @param index the index of the symbol in {@code cs}.
 * @return {@code true} only for an ampersand between two alphabetic characters.
 */
private boolean preserveSymbolInAlphabets(char[] cs, int index)
{
	if (cs[index] != CharConst.AMPERSAND)
		return false;

	if (index < 1 || index+1 >= cs.length)
		return false;

	return CharUtils.isAlphabet(cs[index-1]) && CharUtils.isAlphabet(cs[index+1]);
}
/**
 * Decides whether the period at {@code endIndex} stays attached to the preceding text {@code t}.
 * The character after the period is checked first: a separator mark keeps the period, a final
 * mark or quotation mark releases it. Otherwise the period is kept when {@code t} matches
 * {@link #P_ABBREVIATION}, or when {@code t} is 2 to 5 characters long and passes
 * {@code CharUtils.containsOnlyConsonants}.
 * Called by {@link #adjustLastSymbolSequenceIndex(char[], int, String)}.
 * @param cs the characters of the chunk.
 * @param endIndex the index of the period in {@code cs}.
 * @param t the text preceding the period.
 * @return {@code true} if the period should be kept with {@code t}.
 */
protected boolean preservePeriod(char[] cs, int endIndex, String t)
{
	final int next = endIndex + 1;

	if (next < cs.length)
	{
		final char following = cs[next];

		if (CharUtils.isSeparatorMark(following))
			return true;

		if (CharUtils.isFinalMark(following) || CharUtils.isQuotationMark(following))
			return false;
	}

	if (P_ABBREVIATION.matcher(t).find())
		return true;

	final int size = t.length();

	if (size < 2 || size > 5)
		return false;

	return CharUtils.containsOnlyConsonants(t);
}
// ----------------------------------- Boolean -----------------------------------
/**
 * Called by {@link #getFirstNonSymbolIndex(char[])} and {@link #getLastSymbolSequenceIndex(char[])}.
 * @return {@code true} if {@code c} is a punctuation, general punctuation, currency or arrow
 *         character according to {@code CharUtils}.
 */
private boolean isSymbol(char c)
{
	if (CharUtils.isPunctuation(c))        return true;
	if (CharUtils.isGeneralPunctuation(c)) return true;
	if (CharUtils.isCurrency(c))           return true;
	return CharUtils.isArrow(c);
}
/**
 * Tells whether a final mark at {@code index} is directly followed by another final mark,
 * a separator mark or a quotation mark.
 * @param cs the characters of the chunk.
 * @param index the index to inspect.
 * @return {@code false} if the character at {@code index} is not a final mark or is the last one.
 */
private boolean isEllipsis(char[] cs, int index)
{
	if (!CharUtils.isFinalMark(cs[index]) || index+1 >= cs.length)
		return false;

	final char following = cs[index+1];

	if (CharUtils.isFinalMark(following))     return true;
	if (CharUtils.isSeparatorMark(following)) return true;
	return CharUtils.isQuotationMark(following);
}
/**
 * @return {@code true} if {@code c} is a bracket, an arrow, a double quotation mark, a hyphen,
 *         or a member of {@link #S_SYMBOL_IN_BETWEEN}.
 */
private boolean isSymbolInBetween(char c)
{
	if (CharUtils.isBracket(c) || CharUtils.isArrow(c))
		return true;

	if (CharUtils.isDoubleQuotationMark(c) || CharUtils.isHyphen(c))
		return true;

	return S_SYMBOL_IN_BETWEEN.contains(c);
}
/**
 * Tells whether the character at {@code index} continues a run of symbols.
 * @param cs the characters of the chunk.
 * @param index the index to inspect.
 * @param c the character the run started with.
 * @param finalMark if {@code true}, any final mark continues the run; otherwise only {@code c} does.
 * @return {@code true} if the character at {@code index} continues the run.
 */
private boolean isConsecutive(char[] cs, int index, char c, boolean finalMark)
{
	if (finalMark)
		return CharUtils.isFinalMark(cs[index]);

	return c == cs[index];
}
/**
 * Classifies a symbol.
 * @return {@code 1} for a final mark; {@code 2} for a bracket, separator mark, quotation mark
 *         or prime; {@code 0} for anything else.
 */
private int getSymbolFlag(char c)
{
	if (CharUtils.isFinalMark(c))
		return 1;

	final boolean standalone = CharUtils.isBracket(c)
			|| CharUtils.isSeparatorMark(c)
			|| CharUtils.isQuotationMark(c)
			|| c == CharConst.PRIME;

	return standalone ? 2 : 0;
}
/**
 * @param s the string to inspect.
 * @return {@code true} if every character of {@code s} is a final mark (also for an empty
 *         string), {@code false} as soon as one character is not.
 */
protected boolean isFinalMarksOnly(String s)
{
	final int size = s.length();

	for (int k = 0; k < size; k++)
	{
		if (!CharUtils.isFinalMark(s.charAt(k)))
			return false;
	}

	return true;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy