/*******************************************************************************
* Copyright (c) 2017 Buddhist Digital Resource Center (BDRC)
*
* If this file is a derivation of another work the license header will appear
* below; otherwise, this work is licensed under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with the
* License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package io.bdrc.lucene.sa;
import java.io.DataInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.bdrc.lucene.sa.CmdParser.DiffStruct;
import io.bdrc.lucene.sa.PartOfSpeechAttribute.PartOfSpeech;
import io.bdrc.lucene.stemmer.Row;
import io.bdrc.lucene.stemmer.Trie;
/**
 * A maximal-matching word tokenizer for Sanskrit that uses a {@link Trie}.
 *
 * The expected input is an SLP string;
 * {@link SkrtSyllableTokenizer#isSLP(int)} is used to filter out non-SLP characters.
 *
 * The information needed for unsandhying finals and initials is taken from
 * {@code resources/sanskrit-stemming-data/output/total_output.txt} (a submodule).
 *
 * Due to its design, this tokenizer doesn't deal with contextual ambiguities.
 * For example, "nagaraM" could either be a word of its own or "na" + "garaM",
 * but by default it is always parsed as a single word.
 *
 * In order to get the correct segmentation, we provide a mechanism to include
 * custom entries in the Trie that contain multi-token lemmas. The provided
 * information is then used by this tokenizer to correctly tokenize the
 * problematic passages.
 *
 * See the sanskrit-stemming-data repository for more information.
 *
 * Derived from Lucene 6.4.1 CharTokenizer, but differs in that it uses a
 * RollingCharBuffer so that tokens spanning the IO_BUFFER_SIZE (4096 chars)
 * boundary can still be found.
*
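 * A minimal usage sketch following the standard Lucene consumption pattern
 * (the SLP input string below is only an illustration):
 * <pre>{@code
 * SkrtWordTokenizer tokenizer = new SkrtWordTokenizer();
 * tokenizer.setReader(new StringReader("Darmakzetre kurukzetre"));
 * CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
 * tokenizer.reset();
 * while (tokenizer.incrementToken()) {
 *     System.out.println(term.toString());
 * }
 * tokenizer.end();
 * tokenizer.close();
 * }</pre>
 *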
* @author Élie Roux
* @author Drupchen
*
*/
public final class SkrtWordTokenizer extends Tokenizer {
private boolean debug = false;
String compiledTrieName = "skrt-compiled-trie.dump";
private static Trie defaultTrie;
private Trie scanner;
static final Logger logger = LoggerFactory.getLogger(SkrtWordTokenizer.class);
/* attributes allowing to modify the values of the generated terms */
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final PositionIncrementAttribute incrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Default constructor: uses the default compiled Trie loaded at class level
*/
public SkrtWordTokenizer() {
this.scanner = getTrie();
ioBuffer = new RollingCharBuffer();
ioBuffer.reset(input);
}
public SkrtWordTokenizer(boolean debug) {
this();
this.debug = debug;
}
/**
 * Builds a Trie from a file containing raw Trie data
 * (might take a long time, depending on the size of the Trie to build).
 *
 * Does so in memory, without storing the Trie to an external file.
 *
 * Best used with small Tries, such as for testing purposes.
 *
 * sanskrit-stemming-data
 * should be used to parse custom data into the accepted format.
*
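 * A hypothetical example (the file name is illustrative only):
 * <pre>{@code
 * SkrtWordTokenizer tok = new SkrtWordTokenizer("my-trie-entries.txt");
 * }</pre>
 *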
* @param filename the file containing the entries of the Trie
* @throws FileNotFoundException the file containing the Trie can't be found
* @throws IOException the file containing the Trie can't be read
*/
public SkrtWordTokenizer(String filename) throws FileNotFoundException, IOException {
this.scanner = BuildCompiledTrie.buildTrie(filename);
ioBuffer = new RollingCharBuffer();
ioBuffer.reset(input);
}
public SkrtWordTokenizer(boolean debug, String filename) throws FileNotFoundException, IOException {
this(filename);
this.debug = debug;
}
/**
* Opens an already compiled Trie that was saved to disk.
*
* sanskrit-stemming-data
 * should be used to parse custom data into the accepted format.
 *
 * The compiled Trie should then be built and saved to disk
 * with {@link BuildCompiledTrie#main(String[])}.
*
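 * For example (assuming the compiled dump sits in the working directory):
 * <pre>{@code
 * InputStream stream = new FileInputStream("skrt-compiled-trie.dump");
 * SkrtWordTokenizer tok = new SkrtWordTokenizer(stream);
 * }</pre>
 *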
 * @param trieStream an InputStream (a FileInputStream, for example) containing the compiled Trie
* @throws FileNotFoundException the file containing the Trie can't be found
* @throws IOException the file containing the Trie can't be read
*/
public SkrtWordTokenizer(InputStream trieStream) throws FileNotFoundException, IOException {
this.scanner = getTrie(trieStream);
ioBuffer = new RollingCharBuffer();
ioBuffer.reset(input);
}
public SkrtWordTokenizer(boolean debug, InputStream trieStream) throws FileNotFoundException, IOException {
this(trieStream);
this.debug = debug;
}
/**
* Uses the given Trie
* @param trie a Trie built using {@link BuildCompiledTrie}
*/
public SkrtWordTokenizer(Trie trie) {
this.scanner = trie;
ioBuffer = new RollingCharBuffer();
ioBuffer.reset(input);
}
public SkrtWordTokenizer(boolean debug, Trie trie) {
this(trie);
this.debug = debug;
}
private Trie getTrie() {
if (defaultTrie != null)
return defaultTrie;
Trie trie = null;
InputStream stream = null;
stream = CommonHelpers.getResourceOrFile(compiledTrieName);
if (stream == null) {
final String msg = "The default compiled Trie is not found. Either rebuild the Jar or run BuildCompiledTrie.main()"
+ "\n\tAborting...";
logger.error(msg);
return null;
} else {
trie = getTrie(stream);
}
defaultTrie = trie;
return defaultTrie;
}
private Trie getTrie(InputStream stream) {
Trie trie = null;
long start = System.currentTimeMillis();
try {
trie = new Trie(new DataInputStream(stream));
} catch (IOException e) {
logger.error("error in inputstream conversion for Trie", e);
return null;
}
long end = System.currentTimeMillis();
String msg = "Trie loaded in: " + (end - start) / 1000 + "s.";
logger.info(msg);
System.out.println(msg);
return trie;
}
/* current token related */
private int tokenStart;
private StringBuilder tokenBuffer = new StringBuilder();
private Row rootRow, currentRow;
private int cmdIndex, foundMatchCmdIndex;
private boolean foundMatch;
private boolean afterNonwordMatch;
/* nonMaxMatch related */
private boolean foundNonMaxMatch, wentToMaxDownTheTrie;
private StringBuilder nonMaxBuffer = new StringBuilder();
private int nonMaxTokenStart, nonMaxBufferIndex, nonMaxFoundMatchCmdIndex, nonMaxNonWordLength;
/* tokens related */
private LinkedHashMap<String, Integer[]> potentialTokens = new LinkedHashMap<String, Integer[]>();
// contains : {startingIndex, endingIndex, tokenLength, (isItAMatchInTheTrie ? 1 : 0),
// (isItAMatchInTheTrie ? theIndexOfTheCmd : -1)}
/* nonWords related */
private int nonWordStart;
private StringBuilder nonWordBuffer = new StringBuilder();
/* totalTokens related */
private LinkedList<PreToken> totalTokens = new LinkedList<PreToken>();
private boolean hasTokenToEmit;
/* initials related */
private LinkedHashMap<String, Integer> initials = null; // keys filter duplicate initials; a value of 1 flags an idempotent sandhi
private Iterator<Entry<String, Integer>> initialsIterator = null;
private StringCharacterIterator initialCharsIterator = null;
private static int sandhiIndex = -1;
private int initialsOrigBufferIndex = -1, initialsOrigTokenStart = -1;
private StringBuilder initialsOrigBuffer = new StringBuilder();
private HashSet<String> storedInitials = null;
private static boolean mergesInitials = false;
private int finalsIndex = -1;
private int firstInitialIndex;
private boolean applyOtherInitial;
/* ioBuffer related (contains the input string) */
private RollingCharBuffer ioBuffer;
private int bufferIndex = 0, finalOffset = 0;
private int charCount;
int MAX_WORD_LEN = 255;
/* previous state related*/
private int storedNoMatchState, noMatchTokenStart, noMatchBufferIndex, noMatchFoundMatchCmdIndex;
private StringBuilder noMatchBuffer = new StringBuilder();
private int idempotentIdx = -1;
private boolean previousIsSpace;
/**
* Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this to,
* e.g., lowercase tokens.
* @param c current character
* @return normalized c
*/
protected int normalize(int c) {
return c;
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
/* B.3. ADDING REMAINING EXTRA TOKENS */
if (hasTokenToEmit) {
addExtraToken();
if (hasTokenToEmit) {
return true;
} else {
totalTokens.clear(); // and resume looping over ioBuffer
}
}
if (bufferIndex - 4 >= 0) {
if (sandhiIndex != -1 && sandhiIndex < bufferIndex) {
ioBuffer.freeBefore(sandhiIndex);
sandhiIndex = -1;
} else if (idempotentIdx != -1 && idempotentIdx < bufferIndex) {
ioBuffer.freeBefore(idempotentIdx - 4);
} else {
ioBuffer.freeBefore(bufferIndex - 4);
}
}
tokenStart = -1;
rootRow = scanner.getRow(scanner.getRoot());
currentRow = null;
cmdIndex = -1;
foundMatchCmdIndex = -1;
foundMatch = false;
afterNonwordMatch = false;
previousIsSpace = false;
nonWordBuffer.setLength(0);
nonWordStart = -1;
nonMaxBuffer.setLength(0);
nonMaxTokenStart = -1;
nonMaxBufferIndex = -1;
nonMaxFoundMatchCmdIndex = -1;
nonMaxNonWordLength = -1;
foundNonMaxMatch = false;
wentToMaxDownTheTrie = false;
firstInitialIndex = -1;
applyOtherInitial = false;
noMatchBuffer.setLength(0);
noMatchTokenStart = -1;
noMatchBufferIndex = -1;
storedNoMatchState = -1;
noMatchFoundMatchCmdIndex = -1;
initialsOrigBuffer.setLength(0);
initialsOrigTokenStart = -1;
initialsOrigBufferIndex = -1;
charCount = -1;
// furthest char visited when iterating over the initials.
// ensures we do MaxMatch in case the match of the last initial is shorter
int longestIdx = -1;
tokenBuffer.setLength(0);
boolean potentialTokensContainMatches = false;
@SuppressWarnings("unused") // these two variables are not used.
boolean match = false;
@SuppressWarnings("unused") // they only provide humans an easy way to understand what is happening
boolean continuing = false;
@SuppressWarnings("unused")
char currentChar;
if (debug) System.out.println("----------------------");
/* A. FINDING TOKENS */
while (true) {
/* A.1. FILLING c WITH CHARS FROM ioBuffer OR FROM UNSANDHIED INITIALS
*
* In case there are initials to consume:
* - store the current state
* - replace c with first initial
* - resume looping over ioBuffer
* - when a token or a nonWord ends AND there are more initials:
* - restore state
* - do as before
*/
int c = ioBuffer.get(bufferIndex); // take next char in ioBuffer
currentChar = (char) c;
charCount = Character.charCount(c);
bufferIndex += charCount; // increment bufferIndex for next value of c
ifIsNeededInitializeStartingIndexOfNonword();
if (initialsOrigBufferIndex == -1)
storeCurrentState();
if (debug) System.out.print((char) c);
/* when ioBuffer is empty (end of input, ...) */
if (c == -1) {
if (initials == null || initials.isEmpty()) {
bufferIndex -= charCount;
cutOffTokenFromNonWordBuffer();
if ((tokenBuffer.length() == 0 && nonWordBuffer.length() == 0) || isLoneInitial()) {
finalOffset = correctOffset(bufferIndex);
initials = null; // discard all initial-related content
initialsIterator = null;
initialCharsIterator = null;
return false;
}
break;
} else {
bufferIndex -= 1;
}
}
/* Deals with spaces and soft hyphens at word boundaries */
if (isValidCharWithinSandhi(c)) {
if (currentCharIsSpaceWithinSandhi(c)) {
nonWordStart = -1;
if (sandhiIndex != -1) sandhiIndex += charCount;
previousIsSpace = true;
continue; // if there is a space in the sandhied substring, moves beyond the space
} else if (tokenBuffer.length() != 0 || nonWordBuffer.length() != 0) {
if (foundMatch || foundNonMaxMatch) {
if (!foundMatch && foundNonMaxMatch) {
restoreNonMaxMatchState();
}
cutOffTokenFromNonWordBuffer();
if (nonWordBuffer.length() >= 1 && storedInitials != null && !storedInitials.contains(nonWordBuffer.toString())) {
addNonwordToPotentialTokensIfThereIsOne();
}
if (isLoneInitial()) {
tokenBuffer.setLength(0);
}
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
}
if (initialsNotEmpty()) {
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
restoreInitialsOrigState();
reinitializeState();
resetNonWordBuffer(0);
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
continue;
} else if (tokenBuffer.length() == 1 && storedInitials != null && storedInitials.contains(tokenBuffer.toString())) {
tokenBuffer.setLength(0);
} else if (thereIsNoTokenAndNoNonword()) {
foundNonMaxMatch = false;
continue;
} else {
break;
}
}
}
if (thereAreInitialsToConsume()) {
if (initialIsNotFollowedBySandhied(c)) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
if (foundMatch || foundNonMaxMatch) {
if (!foundMatch && foundNonMaxMatch) {
restoreNonMaxMatchState();
}
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
restoreInitialsOrigState();
reinitializeState();
resetNonWordBuffer(0);
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
if (isValidCharWithinSandhi(ioBuffer.get(bufferIndex))) {
bufferIndex = longestIdx;
}
continue;
} else if (startConsumingInitials()) {
/* we enter here when starting to consume the first initial (when all initials are consumed, initials == []) */
if (sandhiIndex != -1) {
c = ioBuffer.get(sandhiIndex);
bufferIndex = sandhiIndex + charCount;
sandhiIndex = -1;
if (debug) System.out.print("=>" + (char) c);
}
storeCurrentState();
firstInitialIndex = bufferIndex;
final boolean isIdemSandhi = initializeInitialCharsIteratorIfNeeded();
if (isIdemSandhi) {
bufferIndex = idempotentIdx;
if (initials == null || initials.isEmpty()) idempotentIdx = -1;
if (previousIsSpace) {
ioBuffer.get(bufferIndex); // update ioBuffer
bufferIndex += charCount;
}
}
c = applyInitialChar();
if (debug) System.out.print("=>" + (char) c);
} else if (stillConsumingInitials() || applyOtherInitial) {
/* we enter here while not all chars of the current initial are consumed yet */
final boolean isIdemSandhi = initializeInitialCharsIteratorIfNeeded();
if (isIdemSandhi) {
// only adjust forward; only adjust when the sandhi merges and
// we need to stay on the same character to apply the sandhis
if (bufferIndex < idempotentIdx) {
bufferIndex = idempotentIdx;
}
if (initials == null || initials.isEmpty()) idempotentIdx = -1;
}
c = applyInitialChar();
if (nonWordBuffer.length() > 0) decrement(nonWordBuffer);
if (debug) System.out.print("=>" + (char) c);
applyOtherInitial = false;
}
}
tokenBuffer.append((char) normalize(c));
nonWordBuffer.append((char) c); // later remove chars belonging to a token
if (debug) System.out.println("");
/* A.2. PROCESSING c */
/* A.2.1) if it's a token char */
if (isSLPTokenChar(c)) {
if (isSLPModifier(c)) {
decrement(tokenBuffer);
decrement(nonWordBuffer);
continue;
}
/* Go one step down the Trie */
if (isStartOfTokenOrIsNonwordChar()) {
/* we enter on two occasions: at the actual start of a token and at each new non-word character. */
tokenStart = bufferIndex - charCount; // update for potential word starting here
match = tryToFindMatchIn(rootRow, c); // if foundMatch == true, there is a match
continuing = tryToContinueDownTheTrie(rootRow, c); // if currentRow != null, can continue
incrementTokenIndices();
ifIsNeededInitializeStartingIndexOfNonword();
} else {
/* we enter here on all other occasions: we don't know if word chars will be a match or not */
match = tryToFindMatchIn(currentRow, c);
continuing = tryToContinueDownTheTrie(currentRow, c);
if (reachedNonwordCharacter()) {
if (!foundNonMaxMatch && storedNoMatchState == 1) {
restoreNoMatchState();
storedNoMatchState = 0;
continue;
} else if (longestIdx != -1) {
if (allCharsFromCurrentInitialAreConsumed() || (initialsIterator == null || initialCharsIterator == null)) {
if (allInitialsAreConsumed()) {
cutOffTokenFromNonWordBuffer();
if (!foundAToken() && !foundNonMaxMatch) {
tokenBuffer.setLength(0);
}
if (longestIdx != -1) {
resetNonWordBuffer(0);
}
bufferIndex = longestIdx;
longestIdx = -1;
if (potentialTokens.isEmpty()
&& tokenBuffer.length() == 0
&& nonWordBuffer.length() == 0) {
continue;
} else {
if (foundNonMaxMatch) {
restoreNonMaxMatchState();
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
}
break;
}
} else {
if (foundNonMaxMatch) {
restoreNonMaxMatchState();
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
}
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
foundNonMaxMatch = false;
resetNonWordBuffer(0);
applyOtherInitial = true;
longestIdx = -1;
continue;
}
}
} else if (!foundNonMaxMatch) {
match = tryToFindMatchIn(rootRow, c);
continuing = tryToContinueDownTheTrie(rootRow, c);
tokenBuffer.setLength(0);
tokenStart = bufferIndex - 1;
if (foundMatch) {
afterNonwordMatch = true;
}
}
if (currentRow == null) {
wentToMaxDownTheTrie = true;
}
storedNoMatchState = -1;
if (tokenBuffer.length() == 0) {
tokenBuffer.append((char) c);
} else {
tokenBuffer.setLength(0); // because no word ever started in the first place
}
}
}
/* Decide what to do with the SLP chars currently processed */
if (wentBeyondLongestMatch()) {
if (foundNonMaxMatch) {
restoreNonMaxMatchState();
}
if (storedInitials != null && storedInitials.contains(nonWordBuffer.toString())) {
foundNonMaxMatch = false;
foundMatch = false;
foundMatchCmdIndex = -1;
tokenBuffer.setLength(0);
tokenStart = -1;
wentToMaxDownTheTrie = false;
resetNonWordBuffer(0);
nonWordStart = -1;
continue;
}
ifNoInitialsCleanupPotentialTokensAndNonwords();
if (thereIsNoTokenAndNoNonword()) {
foundNonMaxMatch = false;
continue; // resume looping over ioBuffer
/* the first initial led to a dead end */
} else if (initials == null && initialsIterator != null && initialsIterator.hasNext()) {
foundNonMaxMatch = false;
resetNonWordBuffer(0);
continue; // resume looping over ioBuffer
} else if (isLoneInitial()) {
foundNonMaxMatch = false;
foundMatch = false;
foundMatchCmdIndex = -1;
continue;
} else if (thereAreRemainingInitialsToTest()) {
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
restoreInitialsOrigState();
reinitializeState();
resetNonWordBuffer(0);
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
continue;
} else {
cutOffTokenFromNonWordBuffer();
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
addNonwordToPotentialTokensIfThereIsOne();
break;
}
} else if (thereAreRemainingInitialsToTest()) {
restoreInitialsOrigState();
reinitializeState();
resetNonWordBuffer(0);
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
continue;
} else if (reachedNonwordCharacter()) {
tokenBuffer.setLength(0); // because no word ever started in the first place
tokenStart += charCount;
} else if (foundAToken()) {
if (!afterNonwordMatch || foundMatch) {
cutOffTokenFromNonWordBuffer();
}
if (isLoneInitial()) {
tokenBuffer.setLength(0);
}
if (allCharsFromCurrentInitialAreConsumed()) {
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords(); // same as above
storedInitials = null;
sandhiIndex = -1;
if (thereIsNoTokenAndNoNonword()) {
continue; // resume looping over ioBuffer
} else {
break; // and resume looping over ioBuffer
}
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
if (wentToMaxDownTheTrie && initialsNotEmpty()) {
foundNonMaxMatch = false;
resetNonWordBuffer(0);
}
wentToMaxDownTheTrie = false;
foundNonMaxMatch = false;
applyOtherInitial = true;
} else {
ifNoInitialsCleanupPotentialTokensAndNonwords();
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
storedInitials = null;
sandhiIndex = -1;
break;
}
} else { // we are within a potential token
if (reachedEndOfInputString()) {
if (allCharsFromCurrentInitialAreConsumed()) {
addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
storedInitials = null;
sandhiIndex = -1;
break;
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
if (wentToMaxDownTheTrie && initialsNotEmpty()) {
foundNonMaxMatch = false;
resetNonWordBuffer(0);
}
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
} else {
if (foundNonMaxMatch) {
restoreNonMaxMatchState();
cutOffTokenFromNonWordBuffer();
ifNoInitialsCleanupPotentialTokensAndNonwords();
break;
} else {
ifNoInitialsCleanupPotentialTokensAndNonwords();
if (!foundMatch && !foundNonMaxMatch) {
tokenBuffer.setLength(0);
}
break;
}
}
}
}
/* tokenBuffer corner case: buffer overflow! */
if (tokenBuffer.length() >= MAX_WORD_LEN) { // check with >=, since a surrogate pair could break an == test
break;
}
/* A.2.2) if it is not a token char */
} else if (foundNonMaxMatch) {
restoreNonMaxMatchState();
if (matchIsLoneInitial()) {
tokenBuffer.setLength(0);
foundNonMaxMatch = false;
foundMatchCmdIndex = -1;
storedNoMatchState = -1;
decrement(nonWordBuffer);
nonWordStart = -1;
if (allCharsFromCurrentInitialAreConsumed()) {
addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
storedInitials = null;
sandhiIndex = -1;
if (thereIsNoTokenAndNoNonword() || longestIdx == -1) {
continue; // resume looping over ioBuffer
} else {
if (longestIdx != -1) {
resetNonWordBuffer(0);
}
break; // and resume looping over ioBuffer
}
} else {
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetNonWordBuffer(0);
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
continue;
}
} else {
continue;
}
}
ifNoInitialsCleanupPotentialTokensAndNonwords();
if (nonWordBuffer.toString().equals(tokenBuffer.toString()) && nonWordStart != tokenStart) {
nonWordStart = tokenStart;
}
if (allCharsFromCurrentInitialAreConsumed()) {
cutOffTokenFromNonWordBuffer();
addNonwordToPotentialTokensIfThereIsOne();
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
storedInitials = null;
sandhiIndex = -1;
if (thereIsNoTokenAndNoNonword()) {
continue; // resume looping over ioBuffer
} else {
break; // and resume looping over ioBuffer
}
} else {
if (potentialTokensContainMatches && initials != null) {
sandhiIndex = bufferIndex;
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetNonWordBuffer(0);
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
continue;
}
} else {
break;
}
} else if (isNonSLPprecededBySLP() && !isValidCharWithinSandhi(c)) { // we have a nonword token
decrement(tokenBuffer);
decrement(nonWordBuffer);
if (allCharsFromCurrentInitialAreConsumed()) {
if (nonwordIsLoneInitial()) {
tokenBuffer.setLength(0);
resetNonWordBuffer(0);
foundMatchCmdIndex = -1;
continue;
}
addNonwordToPotentialTokensIfThereIsOne();
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
storedInitials = null;
sandhiIndex = -1;
if (foundNonMaxMatch) {
restoreNonMaxMatchState();
if (isLoneInitial()) {
tokenBuffer.setLength(0);
}
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
} else {
tokenBuffer.setLength(0);
}
if (thereIsNoTokenAndNoNonword()) {
foundNonMaxMatch = false;
continue;
} else {
break;
}
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetNonWordBuffer(0);
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
} else {
ifNoInitialsCleanupPotentialTokensAndNonwords();
tokenBuffer.setLength(0); // there was no match in the first place (we are after "if (foundNonMaxMatch)")
if (isSLPModifier(c)) {
continue; // move on and do as if the modifier didn't exist
} else {
break;
}
}
} else if (isNonSLPprecededByNotEmptyNonWord()) {
decrement(tokenBuffer);
decrement(nonWordBuffer);
if (allCharsFromCurrentInitialAreConsumed() && c != ' ') {
potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
decrement(nonWordBuffer);
if (allInitialsAreConsumed()) {
ifNoInitialsCleanupPotentialTokensAndNonwords();
storedInitials = null;
sandhiIndex = -1;
if (thereIsNoTokenAndNoNonword()) {
continue; // resume looping over ioBuffer
} else {
break; // and resume looping over ioBuffer
}
}
if (longestIdx < bufferIndex)
longestIdx = bufferIndex;
resetNonWordBuffer(0);
resetInitialCharsIterator();
restoreInitialsOrigState();
reinitializeState();
wentToMaxDownTheTrie = false;
applyOtherInitial = true;
} else {
ifNoInitialsCleanupPotentialTokensAndNonwords();
tokenBuffer.setLength(0);
if (potentialTokens.isEmpty() && tokenBuffer.length() == 0 && nonWordBuffer.length() == 0) {
reinitializeState();
continue;
} else {
break;
}
}
} else {
decrement(tokenBuffer);
decrement(nonWordBuffer);
nonWordStart = -1;
}
}
/* B. HANDING THEM TO LUCENE */
initials = null; // all initials are consumed. reinitialize for next call of reconstructLemmas()
initialsIterator = null;
initialCharsIterator = null;
if (bufferIndex < longestIdx)
bufferIndex = longestIdx;
longestIdx = -1;
/* B.1. FILLING totalTokens */
if (unsandhyingInitialsYieldedPotentialTokens()) {
if (potentialTokensContainMatches) {
if (nonWordPrecedes()) {
ifThereIsNonwordAddItToTotalTokens();
}
unsandhiFinalsAndAddLemmatizedMatchesToTotalTokens();
} else {
ifThereIsNonwordAddItToTotalTokens();
}
potentialTokens.clear(); // all potential tokens have been consumed, empty the variable
ifSandhiMergesStayOnSameCurrentChar(); // so we can unsandhi the initial and find the start of next word
finalsIndex = bufferIndex; // save index of finals for currentCharIsSpaceWithinSandhi()
} else { // general case: no potential tokens
boolean aNonwordWasAdded = false;
if ((nonWordBuffer.length() > 0 && tokenBuffer.length() <= 0) ||
(nonWordBuffer.length() > 0 && tokenBuffer.length() > 0 && nonWordStart < tokenStart)) {
aNonwordWasAdded = ifThereIsNonwordAddItToTotalTokens();
}
boolean lemmasWereAdded = ifUnsandhyingFinalsYieldsLemmasAddThemToTotalTokens();
if (lemmasWereAdded) {
ifSandhiMergesStayOnSameCurrentChar(); // so we can unsandhi the initial and find the start of next word
finalsIndex = bufferIndex; // save index of finals for currentCharIsSpaceWithinSandhi()
} else if (aNonwordWasAdded) { // if a non-word was added, there was a match but no sandhi
}
}
/* B.2. EXITING incrementToken() WITH THE TOKEN (OR THE FIRST ONE FROM totalTokens) */
ifThereAreInitialsFillIterator();
ifEndOfInputReachedEmptyInitials();
if (thereAreTokensToReturn()) {
hasTokenToEmit = true;
/* B.2.a. ADJUSTING THE LIST OF PreTokens */
/* deal with preverbs and other custom defined entries */
processMultiTokenLemmas();
removeOverlappingNonwords();
/* B.2.b. RETURNING A TOKEN */
final PreToken firstToken = totalTokens.removeFirst();
final Integer[] metaData = firstToken.getMetadata();
fillTermAttributeWith(firstToken.getString(), metaData);
changeTypeOfToken(metaData[3]);
changePartOfSpeech(metaData[4]);
incrAtt.setPositionIncrement(1);
return true; // we exit incrementToken()
} else { // there is no non-word nor extra lemma to add. there was no sandhi for this token
assert(tokenStart != -1);
finalizeSettingTermAttribute();
return true; // we exit incrementToken()
}
}
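/**
 * Removes non-word PreTokens (those whose POS metadata is -1) whose offsets
 * fall entirely within the span of another token in totalTokens; when two
 * non-words cover the same span, only one of them is kept.
 */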
private void removeOverlappingNonwords() {
TreeSet<Integer> toDelete = new TreeSet<Integer>(Collections.reverseOrder());
for (int i=0; i < totalTokens.size(); i++) {
final Integer[] aMetadata = totalTokens.get(i).getMetadata();
final int aStart = aMetadata[0];
final int aEnd = aMetadata[1];
final int aPos = aMetadata[4];
if (aPos == -1) {
for (int j=0; j < totalTokens.size(); j++) {
if (i != j) {
final Integer[] bMetadata = totalTokens.get(j).getMetadata();
final int bStart = bMetadata[0];
final int bEnd = bMetadata[1];
final int bPos = bMetadata[4];
if (aStart >= bStart && aEnd <= bEnd) {
if (bPos != -1) {
toDelete.add(i);
} else {
if (!toDelete.contains(i) && !toDelete.contains(j)) {
toDelete.add(i);
}
}
}
}
}
}
}
for (int idx: toDelete) {
totalTokens.remove(idx);
}
}
/**
* Splits tokens that have a multi-token lemma.
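 *
 * Judging from the parsing below, each sub-token in the stored lemma string is
 * expected to look like {@code <string><pos>_<startIdx>><endIdx>}, with
 * sub-tokens joined by "⟾".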
*/
private void processMultiTokenLemmas() {
for (int i=0; i < totalTokens.size(); i++) {
PreToken token = totalTokens.get(i);
if (token.getString().contains("⟾")) {
String[] rawTokens = token.getString().split("⟾");
LinkedList<PreToken> newTokens = new LinkedList<PreToken>();
for (String rawToken: rawTokens) {
Integer[] metaData = token.getMetadata().clone();
final int base = metaData[0];
final int underscore = rawToken.indexOf('_');
final int arrow = rawToken.indexOf('>');
final String string = rawToken.substring(0, underscore - 1);
final int pos = Integer.valueOf(rawToken.substring(underscore - 1, underscore ));
final int startIdx = Integer.valueOf(rawToken.substring(underscore + 1, arrow )) - 1;
final int endIdx = Integer.valueOf(rawToken.substring(arrow + 1, rawToken.length()));
final int tokenLen = string.length();
// replace with new start and end
metaData[0] = base + startIdx;
metaData[1] = base + endIdx;
metaData[2] = tokenLen;
metaData[4] = pos;
newTokens.add(new PreToken(string, metaData));
}
// replace the original token with the new ones
totalTokens.remove(i);
int j = 0;
for (PreToken newToken: newTokens) {
if (!isDupe(newToken))
totalTokens.add(i + j, newToken);
j ++;
}
}
}
}
TreeSet<String> reconstructLemmas(String cmd, String inflected) throws NumberFormatException, IOException {
return reconstructLemmas(cmd, inflected, -1);
}
public static class LemmaInfo implements Comparable<LemmaInfo> {
String lemma;
Integer pos;
public LemmaInfo(String lemma, int pos) {
this.lemma = lemma;
this.pos = pos;
}
@Override
public int compareTo(LemmaInfo arg0) {
int posDiff = pos.compareTo(arg0.pos);
if (posDiff != 0)
return posDiff;
return lemma.compareTo(arg0.lemma);
}
}
/**
* Reconstructs all the possible sandhied strings for the first word using CmdParser.parse(),
* iterates through them, checking if the sandhied string is found in the sandhiable range,
* only reconstructs the lemmas if there is a match.
*
 * Each time an idempotent sandhi is indicated by its group in the cmd, all the possibilities
 * are generated and a DiffStruct is created and stored in the second element of diffLists.
 *
 * @param cmd the cmd of the current word
 * @param inflected the inflected word to be lemmatized
 * @param tokenEndIdx the index in ioBuffer at which the token ends (-1 defaults to bufferIndex)
 *
 * @return the list of all the possible lemmas given the current context
*/
TreeSet<String> reconstructLemmas(String cmd, String inflected, int tokenEndIdx) throws NumberFormatException, IOException {
TreeSet<String> totalLemmas = new TreeSet<String>(); // uses a Set to avoid duplicates
CmdParser parser = new CmdParser();
if (tokenEndIdx == -1) tokenEndIdx = bufferIndex;
// (a hack needed because we don't want to generate the idempotent sandhis of ALL sandhis in the cmd.
// we only need those from the sandhis that were applied)
// a list of two elements,
// the first contains all the contexts from the sandhis of the cmd
// the second will contain all the contexts from the idempotent sandhis from the sandhis that fit in the current context.
List<TreeMap<String, TreeSet<DiffStruct>>> diffLists = Arrays.asList(parser.parse(inflected, cmd),
        new TreeMap<String, TreeSet<DiffStruct>>());
for (TreeMap<String, TreeSet<DiffStruct>> diffList: diffLists) {
for (Entry<String, TreeSet<DiffStruct>> current: diffList.entrySet()) {
String sandhied = current.getKey();
TreeSet<DiffStruct> diffs = current.getValue();
boolean foundAsandhi = false;
for (DiffStruct diff: diffs) {
if (diff.sandhiType == 0 && diff.toAdd.isEmpty() && diff.nbToDelete == 0 && diff.initial.isEmpty()) {
final String lemma = inflected.substring(0, inflected.length()-diff.nbToDelete)+diff.toAdd+"_"+diff.pos;
totalLemmas.add(lemma);
continue; // there is no sandhi, so skip the rest of this diff
}
if (containsSandhiedCombination(ioBuffer, tokenEndIdx - 1, sandhied, diff.sandhiType)) {
foundAsandhi = true;
if (!diff.initial.isEmpty() || diff.idempotentGroup == -2) {
if (initials == null) {
initials = new LinkedHashMap<String, Integer>();
storedInitials = new HashSet<String>();
}
if (diff.idempotentGroup == -2) {
initials.put(diff.initial, 1);
idempotentIdx = bufferIndex + 1;
} else {
initials.put(diff.initial, -1);
}
storedInitials.add(diff.initial);
}
if (diff.idempotentGroup != -2) {
final String lemma = inflected.substring(0, inflected.length()-diff.nbToDelete)+diff.toAdd+"_"+diff.pos;
totalLemmas.add(lemma);
}
if (diff.idempotentGroup > 0) { // filters groups -1 and 0 (no sandhi)
TreeMap<String, TreeSet<DiffStruct>> idemDiffList = diffLists.get(1);
final HashMap<String, String> idemSandhis = parser.getIdemSandhied(inflected, diff.idempotentGroup);
for (Entry<String, String> idem: idemSandhis.entrySet()) {
final String initial = idem.getKey().substring(idem.getKey().length()-1);
TreeSet<DiffStruct> structs = new TreeSet<DiffStruct>();
structs.add(new DiffStruct(0, null, initial, 10, diff.pos, -2));
idemDiffList.put(idem.getKey(), structs);
}
diffLists.set(1, idemDiffList);
}
}
}
if (foundAsandhi) break;
}
}
return totalLemmas;
}
/**
* Tells whether sandhied could be found between the two words.
* Does it by generating all the legal combinations, filtering spaces and checking for equality.
*
* See SandhiedCombinationTests for how these figures were obtained
*
 * @param ioBuffer given as a parameter so the tests can supply their own buffer
 * @param bufferIndex the index in ioBuffer around which the combinations are checked
 * @param sandhied the sandhied string to search for
 * @param sandhiType the type of the sandhi to check
 * @return true if sandhied is one of the combinations; false otherwise
*/
static boolean containsSandhiedCombination(RollingCharBuffer ioBuffer, int bufferIndex, String sandhied, int sandhiType) throws IOException {
switch(sandhiType) {
case 0:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // no sandhi, but lemmatization required
case 1:
if (isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0)) { // vowel sandhi
if (sandhied.length() == 1) {
mergesInitials = true;
}
return true;
} else {
return false;
}
case 2:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // consonant sandhi 1
case 3:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // consonant sandhi 1 vowels
case 4:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // consonant sandhi 2
case 5:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // visarga sandhi
case 6:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // visarga sandhi 2
case 7:
// (consonant clusters are always reduced to the first consonant)
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // absolute finals sandhi
case 8:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // "cC"-words sandhi
case 9:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -4); // special sandhi: "punar"
case 10:
return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // idempotent sandhi
default:
return false;
}
}
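/*
 * Compares sandhied with the contents of ioBuffer starting at
 * (bufferIndex + start), transparently skipping spaces and soft hyphens
 * found in the buffer; a negative start appears to mean that the sandhied
 * form overlaps the final chars of the current word.
 */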
static boolean isSandhiedCombination(RollingCharBuffer ioBuffer, int bufferIndex, String sandhied, int start) throws IOException {
int j = 0;
int nbIgnoredSpaces = 0;
while (j < sandhied.length()) {
final int res = ioBuffer.get(bufferIndex+start+j+nbIgnoredSpaces);
if (isValidCharWithinSandhi(res)) {
nbIgnoredSpaces++;
continue;
}
if (res == -1)
return false;
if (res != sandhied.codePointAt(j))
return false;
j++;
}
return true;
}
private void decrement(StringBuilder buffer) {
buffer.setLength(buffer.length() - charCount);
}
private void ifNoInitialsCleanupPotentialTokensAndNonwords() {
if (storedInitials != null) {
/* cleanup potentialTokens */
for (String key: storedInitials) {
if (potentialTokens.containsKey(key)) {
potentialTokens.remove(key);
}
if (tokenBuffer.toString().equals(key)) {
tokenBuffer.setLength(0);
}
}
/* cleanup nonwords */
final String nonword = nonWordBuffer.toString();
if (storedInitials.contains(nonword)) {
resetNonWordBuffer(0);
}
}
}
private void ifEndOfInputReachedEmptyInitials() throws IOException {
if (ioBuffer.get(bufferIndex) == -1) {
initials = null;
initialsIterator = null;
}
}
private void finalizeSettingTermAttribute() {
finalOffset = correctOffset(tokenStart + tokenBuffer.length());
offsetAtt.setOffset(correctOffset(tokenStart), finalOffset);
termAtt.setEmpty().append(tokenBuffer.toString());
}
private void changeTypeOfToken(int t) {
if (t == 0) {
typeAtt.setType("non-word");
} else if (t == 1) {
typeAtt.setType("word");
} else if (t == 2) {
typeAtt.setType("lemma");
}
}
private void changePartOfSpeech(int t) {
if (t == 0) {
posAtt.setPartOfSpeech(PartOfSpeech.Indeclinable);
} else if (t == 1) {
posAtt.setPartOfSpeech(PartOfSpeech.Noun);
} else if (t == 2) {
posAtt.setPartOfSpeech(PartOfSpeech.Pronoun);
} else if (t == 3) {
posAtt.setPartOfSpeech(PartOfSpeech.Verb);
} else if (t == 4) {
posAtt.setPartOfSpeech(PartOfSpeech.Preposition);
} else {
posAtt.setPartOfSpeech(PartOfSpeech.Unknown);
}
}
private void fillTermAttributeWith(String string, Integer[] metaData) {
termAtt.setEmpty().append(string); // add the token string
termAtt.setLength(metaData[2]); // declare its size
finalOffset = correctOffset(metaData[1]); // get final offset
offsetAtt.setOffset(correctOffset(metaData[0]), finalOffset); // set its offsets (initial & final)
}
private void ifThereAreInitialsFillIterator() {
if (initials != null && !initials.isEmpty()) {
initialsIterator = initials.entrySet().iterator(); // one sandhi can yield many unsandhied initials
}
}
private void ifSandhiMergesStayOnSameCurrentChar() throws IOException {
if (charCount != -1 && mergesInitials) {
if (ioBuffer.get(bufferIndex) != -1) { // if end of input is not reached
bufferIndex -= charCount;
if (sandhiIndex > -1 && bufferIndex + charCount == sandhiIndex) sandhiIndex -= charCount;
}
mergesInitials = false; // reinitialize variable
}
}
private boolean ifUnsandhyingFinalsYieldsLemmasAddThemToTotalTokens() throws NumberFormatException, IOException {
String cmd = scanner.getCommandVal(foundMatchCmdIndex);
if (cmd != null) {
String token = tokenBuffer.toString();
if (!token.isEmpty()) {
if (debug) System.out.println("form found: " + token + "\n");
final Set<String> lemmas = reconstructLemmas(cmd, token);
if (lemmas.size() != 0) {
for (String l: lemmas) {
final int underscore = l.lastIndexOf('_');
final String lemma = l.substring(0, underscore);
final int pos = Integer.valueOf(l.substring(underscore + 1));
final PreToken newToken = new PreToken(lemma,
new Integer[] {tokenStart, tokenStart + tokenBuffer.length(), lemma.length(), 2, pos});
if (!isDupe(newToken))
totalTokens.add(newToken);
// use same start-end indices, since all are from the same inflected form
}
return true;
}
}
}
return false;
}
private boolean ifThereIsNonwordAddItToTotalTokens() {
boolean containsNonWord = false;
final String nonWord = nonWordBuffer.toString();
if (nonWord.length() > 0) {
final PreToken newToken = new PreToken(nonWord,
new Integer[] {nonWordStart, nonWordStart + nonWordBuffer.length(), nonWord.length(), 0, -1});
if (!isDupe(newToken))
totalTokens.add(newToken);
// ignore all potential tokens. add the non-word with sandhied initials
containsNonWord = true;
}
for (Entry<String, Integer[]> potential: potentialTokens.entrySet()) {
if (potential.getValue()[3] == 0) {
final PreToken newToken = new PreToken(potential.getKey(), potential.getValue());
if (!isDupe(newToken)) {
totalTokens.add(newToken);
containsNonWord = true;
}
}
}
return containsNonWord;
}
private void unsandhiFinalsAndAddLemmatizedMatchesToTotalTokens() throws NumberFormatException, IOException {
for (Entry<String, Integer[]> entry: potentialTokens.entrySet()) {
final String key = entry.getKey();
final Integer[] value = entry.getValue();
if (debug) System.out.println("form found: " + key);
if (value[3] == 1) {
String cmd = scanner.getCommandVal(value[4]);
final Set<String> lemmas = reconstructLemmas(cmd, key, value[1]);
if (lemmas.size() != 0) {
for (String l: lemmas) { // multiple lemmas are possible: finals remain unanalyzed
final int underscore = l.lastIndexOf('_');
final String lemma = l.substring(0, underscore);
final int pos = Integer.valueOf(l.substring(underscore + 1));
final PreToken newToken = new PreToken(lemma,
new Integer[] {value[0], value[1], lemma.length(), 2, pos});
if (!isDupe(newToken))
totalTokens.add(newToken);
// use same indices for all (all are from the same inflected form)
}
} else { // there is no applicable sandhi. the form is returned as-is.
final int pos = Integer.valueOf(cmd.substring(cmd.lastIndexOf('#')+1));
final PreToken newToken = new PreToken(key, new Integer[] {value[0], value[1], value[2], 1, pos});
if (!isDupe(newToken))
totalTokens.add(newToken);
mergesInitials = false;
}
} else {
if (debug) System.out.println("can't be lemmatized\n");
}
}
}
private boolean isDupe(PreToken newToken) {
// determine if newToken already exists.
// this looks for equality in both metadata and token string
boolean isDupe = false;
int idx = 0;
while (!isDupe && idx < totalTokens.size()) {
if (newToken.compareTo(totalTokens.get(idx)) == 0) {
isDupe = true;
}
idx ++;
}
return isDupe;
}
private void cutOffTokenFromNonWordBuffer() {
int newSize = nonWordBuffer.length() - tokenBuffer.length();
newSize = newSize < 0 ? 0: newSize; // ensure the new size is never negative
nonWordBuffer.setLength(newSize);
// end of non-word can be: a matching word starts OR a non-SLP char follows a non-word.
}
private void ifIsNeededInitializeStartingIndexOfNonword() {
if (nonWordStart == -1) { // the starting index of a non-word token does not increment
nonWordStart = bufferIndex;
if (!previousIsSpace) {
nonWordStart -= charCount;
previousIsSpace = false;
}
}
}
private void incrementTokenIndices() {
if (tokenStart == -1) {
tokenStart = bufferIndex - charCount;
}
}
private boolean tryToContinueDownTheTrie(Row row, int c) {
int ref = row.getRef((char) c);
currentRow = (ref >= 0) ? scanner.getRow(ref) : null;
return currentRow != null;
}
private boolean tryToFindMatchIn(Row row, int c) {
cmdIndex = row.getCmd((char) c);
foundMatch = (cmdIndex >= 0);
if (foundMatch) {
foundMatchCmdIndex = cmdIndex;
foundNonMaxMatch = storeNonMaxMatchState();
if (storedNoMatchState == -1) {
storeNoMatchState();
storedNoMatchState = 1;
}
return true;
}
return false;
}
private boolean storeNoMatchState() {
noMatchBufferIndex = bufferIndex;
noMatchTokenStart = (tokenStart == -1) ? 0: tokenStart;
noMatchBuffer.setLength(0);
noMatchBuffer.append(tokenBuffer);
noMatchFoundMatchCmdIndex = foundMatchCmdIndex;
return true;
}
private void restoreNoMatchState() {
bufferIndex = noMatchBufferIndex;
tokenStart = noMatchTokenStart;
currentRow = rootRow;
foundMatchCmdIndex = noMatchFoundMatchCmdIndex;
}
private boolean storeNonMaxMatchState() {
nonMaxBufferIndex = bufferIndex;
nonMaxTokenStart = (tokenStart == -1) ? 0: tokenStart;
nonMaxBuffer.setLength(0);
nonMaxBuffer.append(tokenBuffer);
nonMaxFoundMatchCmdIndex = foundMatchCmdIndex;
nonMaxNonWordLength = nonWordBuffer.length();
return true;
}
private void restoreNonMaxMatchState() {
bufferIndex = nonMaxBufferIndex;
tokenStart = nonMaxTokenStart;
currentRow = rootRow;
tokenBuffer.setLength(0);
tokenBuffer.append(nonMaxBuffer);
foundMatchCmdIndex = nonMaxFoundMatchCmdIndex;
nonWordBuffer.setLength(nonMaxNonWordLength);
}
private void storeCurrentState() {
initialsOrigBufferIndex = bufferIndex - 1;
initialsOrigTokenStart = (tokenStart == -1) ? 0: tokenStart;
initialsOrigBuffer.setLength(0);
initialsOrigBuffer.append(tokenBuffer);
}
/* returns to the beginning of the token in ioBuffer */
private void restoreInitialsOrigState() {
bufferIndex = initialsOrigBufferIndex;
tokenStart = initialsOrigTokenStart;
currentRow = rootRow;
tokenBuffer.setLength(0);
tokenBuffer.append(initialsOrigBuffer);
}
private void reinitializeState() {
currentRow = rootRow;
cmdIndex = -1;
foundMatchCmdIndex = -1;
foundMatch = false;
afterNonwordMatch = false;
nonMaxBuffer.setLength(0);
nonMaxTokenStart = -1;
nonMaxBufferIndex = -1;
nonMaxFoundMatchCmdIndex = -1;
nonMaxNonWordLength = -1;
foundNonMaxMatch = false;
firstInitialIndex = -1;
noMatchBuffer.setLength(0);
noMatchTokenStart = -1;
noMatchBufferIndex = -1;
storedNoMatchState = -1;
noMatchFoundMatchCmdIndex = -1;
initialsOrigBuffer.setLength(0);
initialsOrigTokenStart = -1;
initialsOrigBufferIndex = -1;
}
private void resetNonWordBuffer(int i) {
if (nonWordBuffer.length() - i > 0) {
nonWordBuffer.setLength(i);
} else {
nonWordBuffer.setLength(0);
}
}
private void addNonwordToPotentialTokensIfThereIsOne() {
if (nonWordBuffer.length() != 0 && nonWordStart < tokenStart) {
potentialTokens.put(nonWordBuffer.toString(),
new Integer[] {nonWordStart, nonWordStart + nonWordBuffer.length(), nonWordBuffer.length(), 0, -1});
}
}
private boolean addFoundTokenToPotentialTokensIfThereIsOne() {
if (tokenBuffer.length() > 0) { // avoid empty tokens
final String potentialToken = tokenBuffer.toString();
potentialTokens.put(potentialToken,
new Integer[] {tokenStart, tokenStart + tokenBuffer.length(), potentialToken.length(), 1, foundMatchCmdIndex});
return true;
}
return false;
}
private boolean initializeInitialCharsIteratorIfNeeded() {
boolean isIdem = false;
if (initialCharsIterator == null) {
Entry<String, Integer> entry = initialsIterator.next();
if (entry.getValue() == 1) isIdem = true;
initialCharsIterator = new StringCharacterIterator(entry.getKey());
// initialize the iterator with the first initials
initialsIterator.remove(); // remove the initials just fed to the initialsCharsIterator
} else if (initialsIterator.hasNext()) {
/* either first time or initialCharsIterator has been reset AND there are more initials to process */
Entry<String, Integer> entry = initialsIterator.next();
if (entry.getValue() == 1) isIdem = true;
initialCharsIterator.setText(entry.getKey());
// fill with new initials. happens if we reach the end of a token (either a Trie match or a non-word)
initialsIterator.remove(); // remove the initials just fed to the initialsCharsIterator
}
return isIdem;
}
private int applyInitialChar() throws IOException {
int initial = initialCharsIterator.current();
if (initial == CharacterIterator.DONE) {
initial = ioBuffer.get(bufferIndex);
}
if (initialCharsIterator.getIndex() == initialCharsIterator.getEndIndex()) {
initialCharsIterator.setIndex(0);
} else {
initialCharsIterator.setIndex(initialCharsIterator.getIndex()+1); // increment iterator index
}
return initial;
// charCount is not updated with new value of c since we only process SLP, so there are never surrogate pairs
}
private void addExtraToken() {
if (totalTokens.peekFirst() != null) {
final PreToken nextToken = totalTokens.removeFirst();
final Integer[] metaData = nextToken.getMetadata();
termAtt.setEmpty().append(nextToken.getString());
changePartOfSpeech(metaData[4]);
changeTypeOfToken(metaData[3]);
termAtt.setLength(metaData[2]);
finalOffset = correctOffset(metaData[1]);
offsetAtt.setOffset(correctOffset(metaData[0]), finalOffset);
incrAtt.setPositionIncrement(0);
} else {
hasTokenToEmit = false;
}
}
final private boolean isLoneInitial() {
boolean isInitial = false;
if (storedInitials != null) {
String tokenStr = tokenBuffer.toString();
for (String initial: storedInitials) {
if (tokenStr.equals(initial) && nonWordBuffer.length() == 0) {
isInitial = true;
}
}
}
return isInitial;
}
final private boolean nonWordPrecedes() {
int nonWordStartIdx = -1;
int wordStartIdx = -1;
for (Integer[] value: potentialTokens.values()) {
if (value[3] == 0) {
nonWordStartIdx = value[0];
} else if (value[3] == 1) {
wordStartIdx = value[0];
}
}
return nonWordStartIdx != -1 && wordStartIdx > nonWordStartIdx;
}
final private boolean thereAreRemainingInitialsToTest() {
/* To remember: returns false if (foundMatch == true), even if there are remaining initials */
return wentToMaxDownTheTrie && foundMatch == false && initialsNotEmpty();
}
final private boolean initialsNotEmpty() {
return initials != null && storedInitials != null && !initials.isEmpty() && initials.size() <= storedInitials.size() - 1;
}
final private boolean nonwordIsLoneInitial() {
return storedInitials != null && storedInitials.contains(nonWordBuffer.toString());
}
final private boolean matchIsLoneInitial() {
return tokenBuffer.length() == 1 && storedInitials != null && storedInitials.contains(tokenBuffer.toString());
}
final private boolean isSLPTokenChar(int c) {
return SkrtSyllableTokenizer.charType.get(c) != null;
// SLP modifiers are excluded because they are not considered to be part of a word/token.
// TODO: if a modifier occurs between two sandhied words, the second word won't be considered sandhied
}
final private boolean currentCharIsSpaceWithinSandhi(int c) {
return finalsIndex + 1 == bufferIndex && isValidCharWithinSandhi(c);
}
final private static boolean isValidCharWithinSandhi(int c) {
return c == ' ' || c == '-';
}
final private boolean isSLPModifier(int c) {
return SkrtSyllableTokenizer.charType.get(c) != null && SkrtSyllableTokenizer.charType.get(c) == SkrtSyllableTokenizer.MODIFIER;
}
final private boolean thereIsNoTokenAndNoNonword() {
return tokenBuffer.length() == 0 && nonWordBuffer.length() == 0;
}
final private boolean wentBeyondLongestMatch() {
return foundNonMaxMatch && wentToMaxDownTheTrie && foundMatch == false;
}
final private boolean thereAreTokensToReturn() {
return !totalTokens.isEmpty();
}
final private boolean reachedNonwordCharacter() { // we can't continue down the Trie, yet we don't have any match
return currentRow == null && foundMatch == false;
}
final private boolean unsandhyingInitialsYieldedPotentialTokens() {
return !potentialTokens.isEmpty();
}
final private boolean isNonSLPprecededByNotEmptyNonWord() {
return currentRow == null && nonWordBuffer.length() - charCount > 0;
}
final private boolean isNonSLPprecededBySLP() {
return tokenBuffer.length() > 1;
}
final private boolean reachedEndOfInputString() throws IOException {
return ioBuffer.get(bufferIndex) == -1;
}
final private boolean allCharsFromCurrentInitialAreConsumed() {
return initials != null && initialCharsIterator.current() == CharacterIterator.DONE;
}
final private boolean isStartOfTokenOrIsNonwordChar() {
return tokenBuffer.length() == 1;
}
final private boolean startConsumingInitials() {
return initialCharsIterator == null;
}
final private boolean stillConsumingInitials() {
return initialCharsIterator.getIndex() < initialCharsIterator.getEndIndex();
}
final private boolean initialIsNotFollowedBySandhied(int c) {
return isValidCharWithinSandhi(c) && (firstInitialIndex == -1 || bufferIndex == firstInitialIndex + 1);
}
final private boolean allInitialsAreConsumed() {
return initialsIterator != null && !initialsIterator.hasNext();
}
final private void resetInitialCharsIterator() {
if (initialCharsIterator != null) initialCharsIterator.setIndex(0);
}
final private boolean thereAreInitialsToConsume() throws IOException {
return initials != null && !initials.isEmpty();
}
final private boolean foundAToken() throws IOException {
return (currentRow == null && foundMatch) || (foundMatch && reachedEndOfInputString());
}
@Override
public final void end() throws IOException {
super.end();
offsetAtt.setOffset(finalOffset, finalOffset); // set final offset
}
@Override
public void reset() throws IOException {
super.reset();
bufferIndex = 0;
finalOffset = 0;
ioBuffer.reset(input); // make sure to reset the IO buffer!!
totalTokens = new LinkedList<PreToken>();
finalsIndex = -1;
hasTokenToEmit = false; // for emitting multiple tokens
idempotentIdx = -1;
}
public static class PreToken implements Comparable<PreToken> {
String tokenString;
Integer[] tokenMetaData;
public PreToken(String string, Integer[] metaData) {
this.tokenString = string;
this.tokenMetaData = metaData;
}
public String getString() {
return tokenString;
}
public Integer[] getMetadata() {
return tokenMetaData;
}
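/* Note: this compareTo only distinguishes equal (0) from non-equal (1);
 * it is used for duplicate detection in isDupe(), not for ordering. */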
@Override
public int compareTo(PreToken o) {
boolean meta = Arrays.equals(tokenMetaData, o.tokenMetaData);
boolean str = tokenString.equals(o.tokenString);
if (meta && str) {
return 0;
}
return 1;
}
}
}