edu.emory.mathcs.nlp.component.tokenizer.EnglishTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp4j-api Show documentation
The newest version!
/**
 * Copyright 2014, Emory University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.mathcs.nlp.component.tokenizer;

import edu.emory.mathcs.nlp.common.constant.CharConst;
import edu.emory.mathcs.nlp.common.util.Language;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Abbreviation;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.Compound;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.EnglishApostrophe;
import edu.emory.mathcs.nlp.component.tokenizer.dictionary.EnglishHyphen;
import edu.emory.mathcs.nlp.component.tokenizer.token.Token;
import edu.emory.mathcs.nlp.component.tokenizer.token.TokenIndex;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @since 3.0.0
 * @author Jinho D. Choi ({@code [email protected]})
 */
public class EnglishTokenizer extends Tokenizer
{
	/** Opening bracket/quote symbols; index-aligned with {@link #R_BRACKETS}. Shared by all instances and never modified. */
	private static final String[] L_BRACKETS = {"\"","(","{","["};
	/** Closing bracket/quote symbols; index-aligned with {@link #L_BRACKETS}. Shared by all instances and never modified. */
	private static final String[] R_BRACKETS = {"\"",")","}","]"};
	/**
	 * Two or more letters, then one or more of {@code . ! ?}, then two or more letters.
	 * Compiled once per class rather than once per tokenizer instance.
	 * NOTE(review): the pattern is wrapped by PatternUtils.createClosedPattern, which is not
	 * shown here; presumably it anchors the expression to the whole token -- confirm.
	 */
	private static final Pattern  P_MID_SYM  = PatternUtils.createClosedPattern("(\\p{Alpha}{2,}+)([\\.\\!\\?]+)(\\p{Alpha}{2,}+)");

	/** Dictionary handed to {@code tokenize(...)} first by {@code tokenizeWordsMore}. */
	private EnglishApostrophe d_apostrophe;
	/** Dictionary consulted for tokens that end with a period (see {@code adjustLastSymbolSequenceGap}). */
	private Abbreviation      d_abbreviation;
	/** Dictionary handed to {@code tokenize(...)} second by {@code tokenizeWordsMore}; built for {@code Language.ENGLISH}. */
	private Compound          d_compound;
	/** Dictionary that decides whether a symbol between characters is preserved (see {@code preserveSymbolInBetween}). */
	private EnglishHyphen     d_hyphen;
	
	
	/**
	 * Creates an English tokenizer and instantiates the four dictionaries it
	 * relies on: apostrophe, abbreviation, compound (for {@code Language.ENGLISH})
	 * and hyphen.
	 */
	public EnglishTokenizer()
	{
		this.d_apostrophe   = new EnglishApostrophe();
		this.d_abbreviation = new Abbreviation();
		this.d_compound     = new Compound(Language.ENGLISH);
		this.d_hyphen       = new EnglishHyphen();
	}
	
//	============================== Abstract ==============================
	
	/**
	 * Hook from {@code Tokenizer}; this implementation ignores all of its
	 * arguments and never requests an adjustment.
	 * NOTE(review): the exact meaning of the returned gap is defined by the
	 * base class, which is not visible here.
	 *
	 * @param cs         unused
	 * @param beginIndex unused
	 * @param t          unused
	 * @return always {@code 0}
	 */
	@Override
	protected int adjustFirstNonSymbolGap(char[] cs, int beginIndex, String t)
	{
		final int noAdjustment = 0;
		return noAdjustment;
	}
	
	/**
	 * Returns {@code 1} when the character at {@code cs[endIndex]} is a period
	 * and the lower-cased form of {@code t} is reported by the abbreviation
	 * dictionary as an abbreviation ending with a period; otherwise {@code 0}.
	 * NOTE(review): presumably the base class uses the {@code 1} to keep the
	 * period attached to the abbreviation -- confirm against {@code Tokenizer}.
	 *
	 * @param cs       character buffer; {@code cs[endIndex]} is read unconditionally
	 * @param endIndex index of the symbol to inspect
	 * @param t        candidate token text, lower-cased before the dictionary lookup
	 * @return {@code 1} for a period-terminated abbreviation, else {@code 0}
	 */
	@Override
	protected int adjustLastSymbolSequenceGap(char[] cs, int endIndex, String t)
	{
		// Only a period can trigger an adjustment; skip the lookup for anything else.
		if (cs[endIndex] != CharConst.PERIOD)
			return 0;
		
		String lowered = StringUtils.toLowerCase(t);
		return d_abbreviation.isAbbreviationEndingWithPeriod(lowered) ? 1 : 0;
	}

	/**
	 * Delegates to the hyphen dictionary to decide whether the symbol at
	 * {@code cs[index]} is preserved.
	 *
	 * @param cs    character buffer passed through to the dictionary
	 * @param index position passed through to the dictionary
	 * @return whatever {@code EnglishHyphen.preserveHyphen(cs, index)} returns
	 */
	@Override
	protected boolean preserveSymbolInBetween(char[] cs, int index)
	{
		final boolean keep = d_hyphen.preserveHyphen(cs, index);
		return keep;
	}
	
	/**
	 * Tries the inherited {@code tokenize(...)} with the apostrophe dictionary
	 * first and, only if that returns {@code false}, with the compound
	 * dictionary.
	 *
	 * @param tokens   list passed through to {@code tokenize(...)}
	 * @param original passed through unchanged
	 * @param lower    passed through unchanged
	 * @param lcs      passed through unchanged
	 * @param bIndex2  passed through unchanged
	 * @return {@code true} if either dictionary attempt returned {@code true}
	 */
	@Override
	protected boolean tokenizeWordsMore(List tokens, String original, String lower, char[] lcs, TokenIndex bIndex2)
	{
		// Apostrophe handling takes precedence; the compound pass is skipped when it succeeds.
		if (tokenize(tokens, original, lower, lcs, d_apostrophe, bIndex2))
			return true;
		
		return tokenize(tokens, original, lower, lcs, d_compound, bIndex2);
	}
	
	/**
	 * Looks for the {@code P_MID_SYM} pattern (letters, a run of {@code . ! ?},
	 * letters) in {@code token}; on a match, hands the matcher to the inherited
	 * {@code addTokens(...)} with group numbers 2, 3 and 4.
	 * NOTE(review): the groups start at 2 rather than 1, presumably because
	 * PatternUtils.createClosedPattern adds an enclosing group -- confirm.
	 *
	 * @param tokens list passed through to {@code addTokens(...)}
	 * @param token  text matched against the pattern
	 * @param lower  not used by this implementation
	 * @param index  passed through to {@code addTokens(...)}
	 * @return the result of {@code addTokens(...)} on a match, otherwise {@code 0}
	 */
	@Override
	protected int tokenizeMiddleSymbol(List tokens, String token, String lower, int index)
	{
		Matcher matcher = P_MID_SYM.matcher(token);
		
		if (!matcher.find())
			return 0;
		
		return addTokens(matcher, tokens, index, 2, 3, 4);
	}
	
//	============================== Segmentize ==============================
	
	/*
	 * NOTE(review): this region is not valid Java as it stands. The generic
	 * type arguments are missing from the signature and from the "sentences"
	 * local ("List>"), and two loop headers break off right after "i" and run
	 * straight into unrelated code. As a result the rest of segmentize, the
	 * header and most of the body of an array-building helper, and the header
	 * of the final bracket check are absent. Restore this span from the
	 * upstream file before compiling; the missing logic cannot be inferred
	 * from what remains.
	 */
	@Override
	public List> segmentize(List tokens)
	{
		// Result list; its element type was lost together with the generics.
		List> sentences = new ArrayList<>();
		// One int slot per entry in R_BRACKETS.
		int[] brackets = new int[R_BRACKETS.length];
		int bIndex, i, size = tokens.size();
		boolean isTerminal = false;
		String token;
		
		// Truncated: the loop condition and body are missing; the text resumes
		// inside the parameter list of a helper taking (tokens, beginIndex, endIndex).
		for (i=0, bIndex=0; i tokens, int beginIndex, int endIndex)
	{
		// Sized to hold the half-open range [beginIndex, endIndex).
		Token[] array = new Token[endIndex - beginIndex];
		
		// Truncated: the copy loop is missing; the text resumes inside a check
		// that returns true when str equals R_BRACKETS[i] (the left-hand operand
		// of the "&&" is only partly visible).
		for (int i=beginIndex,j=0; i 0 && str.equals(R_BRACKETS[i]))
				return true;
		}
		
		return false;
	}
}