/*******************************************************************************
 * Copyright (c) 2017 Buddhist Digital Resource Center (BDRC)
 *
 * If this file is a derivation of another work the license header will appear
 * below; otherwise, this work is licensed under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with the
 * License.
 *
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package io.bdrc.lucene.sa;

import java.io.DataInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.bdrc.lucene.sa.CmdParser.DiffStruct;
import io.bdrc.lucene.sa.PartOfSpeechAttribute.PartOfSpeech;
import io.bdrc.lucene.stemmer.Row;
import io.bdrc.lucene.stemmer.Trie;

/**
 * A maximal-matching word tokenizer for Sanskrit that uses a {@link Trie}.
 *
 * <p>
 * The expected input is an SLP string.
 * {@link SkrtSyllableTokenizer#isSLP(int)} is used to filter out non-SLP characters.
 *
 * <p>
 * The information needed to unsandhi finals and initials is taken from
 * {@code resources/sanskrit-stemming-data/output/total_output.txt} (a submodule).
 *
 * <p>
 * Due to its design, this tokenizer doesn't deal with contextual ambiguities.
 * For example, "nagaraM" could either be a word of its own or "na" + "garaM",
 * but in the default behavior it is always parsed as a single word.
 * In order to get the correct segmentation, we provide a mechanism to include
 * custom entries containing multi-token lemmas in the Trie. The provided
 * information is then used by this tokenizer to correctly tokenize the
 * problematic passages.
 *
 * <p>
 * See the project documentation for more information.
 *
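 * <p>
 * A minimal usage sketch (the surrounding analysis setup is an assumption of
 * this example, not something this class prescribes; any attribute-based
 * consumer of the token stream works the same way):
 * <pre>{@code
 * SkrtWordTokenizer tokenizer = new SkrtWordTokenizer();     // default compiled Trie
 * tokenizer.setReader(new StringReader("nagaraM"));          // SLP input
 * CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
 * tokenizer.reset();
 * while (tokenizer.incrementToken()) {
 *     System.out.println(term.toString());                   // one token/lemma per call
 * }
 * tokenizer.end();
 * tokenizer.close();
 * }</pre>
 *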
 * <p>
 * Derived from Lucene 6.4.1 CharTokenizer, but differs by using a {@link RollingCharBuffer}
 * in order to also find tokens that straddle the IO_BUFFER_SIZE (4096 chars) boundary.
 *
 * @author Élie Roux
 * @author Drupchen
 */
public final class SkrtWordTokenizer extends Tokenizer {
    private boolean debug = false;
    String compiledTrieName = "skrt-compiled-trie.dump";
    private static Trie defaultTrie;
    private Trie scanner;

    static final Logger logger = LoggerFactory.getLogger(SkrtWordTokenizer.class);

    /* attributes allowing to modify the values of the generated terms */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
    private final PositionIncrementAttribute incrAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Default constructor: uses the default compiled Trie loaded at class level.
     */
    public SkrtWordTokenizer() {
        this.scanner = getTrie();
        ioBuffer = new RollingCharBuffer();
        ioBuffer.reset(input);
    }

    public SkrtWordTokenizer(boolean debug) {
        this();
        this.debug = debug;
    }
    /**
     * Builds a Trie from a file containing raw Trie data
     * (might take a long time, depending on the size of the Trie to build).
     * <p>
     * Does so in memory, without storing the Trie to an external file.
     * <p>
     * Best used with small Tries, such as for testing purposes.
     * <p>
     * sanskrit-stemming-data should be used to parse custom data into the accepted format.
     *
     * @param filename the file containing the entries of the Trie
     * @throws FileNotFoundException the file containing the Trie can't be found
     * @throws IOException the file containing the Trie can't be read
     */
    public SkrtWordTokenizer(String filename) throws FileNotFoundException, IOException {
        this.scanner = BuildCompiledTrie.buildTrie(filename);
        ioBuffer = new RollingCharBuffer();
        ioBuffer.reset(input);
    }

    public SkrtWordTokenizer(boolean debug, String filename) throws FileNotFoundException, IOException {
        this(filename);
        this.debug = debug;
    }
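    /*
     * A hedged sketch of the constructor above (the path and file content are
     * hypothetical; the entries must follow the sanskrit-stemming-data output
     * format):
     *
     *     // each line of the file is one raw Trie entry
     *     SkrtWordTokenizer tok = new SkrtWordTokenizer("src/test/resources/custom-entries.txt");
     *
     * Building in memory is convenient for tests; for the full lexicon, prefer
     * the compiled-Trie constructors below, which skip the costly build step.
     */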
    /**
     * Opens an already compiled Trie that was saved to disk.
     * <p>
     * sanskrit-stemming-data should be used to parse custom data into the accepted format.
     * <p>
     * The compiled Trie should first be built and saved to disk
     * with {@link BuildCompiledTrie#main(String[])}.
     *
     * @param trieStream an InputStream (a FileInputStream, for example) containing the compiled Trie
     * @throws FileNotFoundException the file containing the Trie can't be found
     * @throws IOException the file containing the Trie can't be read
     */
    public SkrtWordTokenizer(InputStream trieStream) throws FileNotFoundException, IOException {
        this.scanner = getTrie(trieStream);
        ioBuffer = new RollingCharBuffer();
        ioBuffer.reset(input);
    }

    public SkrtWordTokenizer(boolean debug, InputStream trieStream) throws FileNotFoundException, IOException {
        this(trieStream);
        this.debug = debug;
    }

    /**
     * Uses the given Trie.
     *
     * @param trie a Trie built using {@link BuildCompiledTrie}
     */
    public SkrtWordTokenizer(Trie trie) {
        this.scanner = trie;
        ioBuffer = new RollingCharBuffer();
        ioBuffer.reset(input);
    }

    public SkrtWordTokenizer(boolean debug, Trie trie) {
        this(trie);
        this.debug = debug;
    }

    private Trie getTrie() {
        if (defaultTrie != null)
            return defaultTrie;
        Trie trie = null;
        InputStream stream = null;
        stream = CommonHelpers.getResourceOrFile(compiledTrieName);
        if (stream == null) {
            final String msg = "The default compiled Trie is not found. "
                    + "Either rebuild the Jar or run BuildCompiledTrie.main()"
                    + "\n\tAborting...";
            logger.error(msg);
            return null;
        } else {
            trie = getTrie(stream);
        }
        defaultTrie = trie;
        return defaultTrie;
    }

    private Trie getTrie(InputStream stream) {
        Trie trie = null;
        long start = System.currentTimeMillis();
        try {
            trie = new Trie(new DataInputStream(stream));
        } catch (IOException e) {
            logger.error("error in inputstream conversion for Trie", e);
            return null;
        }
        long end = System.currentTimeMillis();
        String msg = "Trie loaded in: " + (end - start) / 1000 + "s.";
        logger.info(msg);
        System.out.println(msg);
        return trie;
    }
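    /*
     * A hedged sketch of loading a pre-compiled Trie (the path is hypothetical;
     * the dump is the file produced by BuildCompiledTrie):
     *
     *     try (InputStream in = new FileInputStream("skrt-compiled-trie.dump")) {
     *         SkrtWordTokenizer tok = new SkrtWordTokenizer(in);
     *         // ... use tok, then close it
     *     }
     *
     * Loading the compiled dump avoids rebuilding the Trie from raw entries
     * on every instantiation.
     */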
    /* current token related */
    private int tokenStart;
    private StringBuilder tokenBuffer = new StringBuilder();
    private Row rootRow, currentRow;
    private int cmdIndex, foundMatchCmdIndex;
    private boolean foundMatch;
    private boolean afterNonwordMatch;

    /* nonMaxMatch related */
    private boolean foundNonMaxMatch, wentToMaxDownTheTrie;
    private StringBuilder nonMaxBuffer = new StringBuilder();
    private int nonMaxTokenStart, nonMaxBufferIndex, nonMaxFoundMatchCmdIndex, nonMaxNonWordLength;

    /* tokens related */
    private LinkedHashMap<String, Integer[]> potentialTokens = new LinkedHashMap<String, Integer[]>();
    // contains: {startingIndex, endingIndex, tokenLength,
    //            (isItAMatchInTheTrie ? 1 : 0), (isItAMatchInTheTrie ? theIndexOfTheCmd : -1)}

    /* nonWords related */
    private int nonWordStart;
    private StringBuilder nonWordBuffer = new StringBuilder();

    /* totalTokens related */
    private LinkedList<PreToken> totalTokens = new LinkedList<PreToken>();
    private boolean hasTokenToEmit;

    /* initials related */
    private LinkedHashMap<String, Integer> initials = null; // used like a HashSet to filter duplicate initials
    private Iterator<Entry<String, Integer>> initialsIterator = null;
    private StringCharacterIterator initialCharsIterator = null;
    private static int sandhiIndex = -1;
    private int initialsOrigBufferIndex = -1, initialsOrigTokenStart = -1;
    private StringBuilder initialsOrigBuffer = new StringBuilder();
    private HashSet<String> storedInitials = null;
    private static boolean mergesInitials = false;
    private int finalsIndex = -1;
    private int firstInitialIndex;
    private boolean applyOtherInitial;

    /* ioBuffer related (contains the input string) */
    private RollingCharBuffer ioBuffer;
    private int bufferIndex = 0, finalOffset = 0;
    private int charCount;
    int MAX_WORD_LEN = 255;

    /* previous state related */
    private int storedNoMatchState, noMatchTokenStart, noMatchBufferIndex, noMatchFoundMatchCmdIndex;
    private StringBuilder noMatchBuffer = new StringBuilder();
    private Integer idempotentIdx = -1;
    private boolean previousIsSpace;

    /**
     * Called on each token character to normalize it before it is added to the
     * token. The default implementation does nothing. Subclasses may use this
     * to, e.g., lowercase tokens.
     *
     * @param c the current character
     * @return the normalized c
     */
    protected int normalize(int c) {
        return c;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        /* B.3. ADDING REMAINING EXTRA TOKENS */
        if (hasTokenToEmit == true) {
            addExtraToken();
            if (hasTokenToEmit == true) {
                return true;
            } else {
                totalTokens.clear(); // and resume looping over ioBuffer
            }
        }

        if (bufferIndex - 4 >= 0) {
            if (sandhiIndex != -1 && sandhiIndex < bufferIndex) {
                ioBuffer.freeBefore(sandhiIndex);
                sandhiIndex = -1;
            } else if (idempotentIdx != -1 && idempotentIdx < bufferIndex) {
                ioBuffer.freeBefore(idempotentIdx - 4);
            } else {
                ioBuffer.freeBefore(bufferIndex - 4);
            }
        }

        tokenStart = -1;
        rootRow = scanner.getRow(scanner.getRoot());
        currentRow = null;
        cmdIndex = -1;
        foundMatchCmdIndex = -1;
        foundMatch = false;
        afterNonwordMatch = false;
        previousIsSpace = false;

        nonWordBuffer.setLength(0);
        nonWordStart = -1;

        nonMaxBuffer.setLength(0);
        nonMaxTokenStart = -1;
        nonMaxBufferIndex = -1;
        nonMaxFoundMatchCmdIndex = -1;
        nonMaxNonWordLength = -1;
        foundNonMaxMatch = false;
        wentToMaxDownTheTrie = false;
        firstInitialIndex = -1;
        applyOtherInitial = false;

        noMatchBuffer.setLength(0);
        noMatchTokenStart = -1;
        noMatchBufferIndex = -1;
        storedNoMatchState = -1;
        noMatchFoundMatchCmdIndex = -1;

        initialsOrigBuffer.setLength(0);
        initialsOrigTokenStart = -1;
        initialsOrigBufferIndex = -1;
        charCount = -1;

        // furthest char visited when iterating over the initials;
        // ensures we do MaxMatch in case the match of the last initial is shorter
        int longestIdx = -1;
        tokenBuffer.setLength(0);
        boolean potentialTokensContainMatches = false;
        @SuppressWarnings("unused") // match and continuing are never read:
        boolean match = false;
        @SuppressWarnings("unused") // they only give humans an easy way to see what is happening
        boolean continuing = false;
        @SuppressWarnings("unused")
        char currentChar;

        if (debug) System.out.println("----------------------");
        /* A. FINDING TOKENS */
        while (true) {
            /* A.1. FILLING c WITH CHARS FROM ioBuffer OR FROM THE UNSANDHIED INITIALS
             *
             * In case there are initials to consume:
             *   - store the current state
             *   - replace c with the first initial
             *   - resume looping over ioBuffer
             *   - when a token or a nonWord ends AND there are more initials:
             *       - restore the state
             *       - do as before
             */
            int c = ioBuffer.get(bufferIndex); // take the next char in ioBuffer
            currentChar = (char) c;
            charCount = Character.charCount(c);
            bufferIndex += charCount; // increment bufferIndex for the next value of c
            ifIsNeededInitializeStartingIndexOfNonword();
            if (initialsOrigBufferIndex == -1)
                storeCurrentState();

            if (debug) System.out.print((char) c);

            /* when ioBuffer is empty (end of input, ...) */
            if (c == -1) {
                if (initials == null || initials.isEmpty()) {
                    bufferIndex -= charCount;
                    cutOffTokenFromNonWordBuffer();
                    if ((tokenBuffer.length() == 0 && nonWordBuffer.length() == 0) || isLoneInitial()) {
                        finalOffset = correctOffset(bufferIndex);
                        initials = null; // discard all initial-related content
                        initialsIterator = null;
                        initialCharsIterator = null;
                        return false;
                    }
                    break;
                } else {
                    bufferIndex -= 1;
                }
            }

            /* Deals with spaces and soft hyphens at word boundaries */
            if (isValidCharWithinSandhi(c)) {
                if (currentCharIsSpaceWithinSandhi(c)) {
                    nonWordStart = -1;
                    if (sandhiIndex != -1)
                        sandhiIndex += charCount;
                    previousIsSpace = true;
                    continue; // if there is a space in the sandhied substring, moves beyond the space
                } else if (tokenBuffer.length() != 0 || nonWordBuffer.length() != 0) {
                    if (foundMatch || foundNonMaxMatch) {
                        if (!foundMatch && foundNonMaxMatch) {
                            restoreNonMaxMatchState();
                        }
                        cutOffTokenFromNonWordBuffer();
                        if (nonWordBuffer.length() >= 1 && storedInitials != null
                                && !storedInitials.contains(nonWordBuffer.toString())) {
                            addNonwordToPotentialTokensIfThereIsOne();
                        }
                        if (isLoneInitial()) {
                            tokenBuffer.setLength(0);
                        }
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                    }
                    if (initialsNotEmpty()) {
                        if (longestIdx < bufferIndex)
                            longestIdx = bufferIndex;
                        restoreInitialsOrigState();
                        reinitializeState();
                        resetNonWordBuffer(0);
                        wentToMaxDownTheTrie = false;
                        applyOtherInitial = true;
                        continue;
                    } else if (tokenBuffer.length() == 1 && storedInitials != null
                            && storedInitials.contains(tokenBuffer.toString())) {
                        tokenBuffer.setLength(0);
                    } else if (thereIsNoTokenAndNoNonword()) {
                        foundNonMaxMatch = false;
                        continue;
                    } else {
                        break;
                    }
                }
            }

            if (thereAreInitialsToConsume()) {
                if (initialIsNotFollowedBySandhied(c)) {
                    ifNoInitialsCleanupPotentialTokensAndNonwords();
                    if (foundMatch || foundNonMaxMatch) {
                        if (!foundMatch && foundNonMaxMatch) {
                            restoreNonMaxMatchState();
                        }
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                    }
                    if (longestIdx < bufferIndex)
                        longestIdx = bufferIndex;
                    restoreInitialsOrigState();
                    reinitializeState();
                    resetNonWordBuffer(0);
                    wentToMaxDownTheTrie = false;
                    applyOtherInitial = true;
                    if (isValidCharWithinSandhi(ioBuffer.get(bufferIndex))) {
                        bufferIndex = longestIdx;
                    }
                    continue;
                } else if (startConsumingInitials()) {
                    /* we enter here on the first initial
                     * (when all initials are consumed, initials == []) */
                    if (sandhiIndex != -1) {
                        c = ioBuffer.get(sandhiIndex);
                        bufferIndex = sandhiIndex + charCount;
                        sandhiIndex = -1;
                        if (debug) System.out.print("=>" + (char) c);
                    }
                    storeCurrentState();
                    firstInitialIndex = bufferIndex;
                    final boolean isIdemSandhi = initializeInitialCharsIteratorIfNeeded();
                    if (isIdemSandhi) {
                        bufferIndex = idempotentIdx;
                        if (initials == null || initials.isEmpty())
                            idempotentIdx = -1;
                        if (previousIsSpace) {
                            ioBuffer.get(bufferIndex); // update ioBuffer
                            bufferIndex += charCount;
                        }
                    }
                    c = applyInitialChar();
                    if (debug) System.out.print("=>" + (char) c);

                } else if (stillConsumingInitials() || applyOtherInitial) {
                    /* we enter here as long as not all the initial chars have been consumed */
                    final boolean isIdemSandhi = initializeInitialCharsIteratorIfNeeded();
                    if (isIdemSandhi) {
                        // only adjust forward; only adjust when the sandhi merges and
                        // we need to stay on the same character to apply the sandhis
                        if (bufferIndex < idempotentIdx) {
                            bufferIndex = idempotentIdx;
                        }
                        if (initials == null || initials.isEmpty())
                            idempotentIdx = -1;
                    }
                    c = applyInitialChar();
                    if (nonWordBuffer.length() > 0)
                        decrement(nonWordBuffer);
                    if (debug) System.out.print("=>" + (char) c);
                    applyOtherInitial = false;
                }
            }

            tokenBuffer.append((char) normalize(c));
            nonWordBuffer.append((char) c); // chars belonging to a token are removed later
            if (debug) System.out.println("");

            /* A.2. PROCESSING c */

            /* A.2.1) if it's a token char */
            if (isSLPTokenChar(c)) {
                if (isSLPModifier(c)) {
                    decrement(tokenBuffer);
                    decrement(nonWordBuffer);
                    continue;
                }

                /* Go one step down the Trie */
                if (isStartOfTokenOrIsNonwordChar()) {
                    /* we enter on two occasions: at the actual start of a token
                     * and at each new non-word character */
                    tokenStart = bufferIndex - charCount; // update for a potential word starting here
                    match = tryToFindMatchIn(rootRow, c); // if foundMatch == true, there is a match
                    continuing = tryToContinueDownTheTrie(rootRow, c); // if currentRow != null, we can continue
                    incrementTokenIndices();
                    ifIsNeededInitializeStartingIndexOfNonword();
                } else {
                    /* we enter here on all other occasions:
                     * we don't know yet whether the word chars are a match or not */
                    match = tryToFindMatchIn(currentRow, c);
                    continuing = tryToContinueDownTheTrie(currentRow, c);
                    if (reachedNonwordCharacter()) {
                        if (!foundNonMaxMatch && storedNoMatchState == 1) {
                            restoreNoMatchState();
                            storedNoMatchState = 0;
                            continue;
                        } else if (longestIdx != -1) {
                            if (allCharsFromCurrentInitialAreConsumed()
                                    || (initialsIterator == null || initialCharsIterator == null)) {
                                if (allInitialsAreConsumed()) {
                                    cutOffTokenFromNonWordBuffer();
                                    if (!foundAToken() && !foundNonMaxMatch) {
                                        tokenBuffer.setLength(0);
                                    }
                                    if (longestIdx != -1) {
                                        resetNonWordBuffer(0);
                                    }
                                    bufferIndex = longestIdx;
                                    longestIdx = -1;
                                    if (potentialTokens.isEmpty() && tokenBuffer.length() == 0
                                            && nonWordBuffer.length() == 0) {
                                        continue;
                                    } else {
                                        if (foundNonMaxMatch) {
                                            restoreNonMaxMatchState();
                                            potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                                        }
                                        break;
                                    }
                                } else {
                                    if (foundNonMaxMatch) {
                                        restoreNonMaxMatchState();
                                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                                    }
                                    resetInitialCharsIterator();
                                    restoreInitialsOrigState();
                                    reinitializeState();
                                    foundNonMaxMatch = false;
                                    resetNonWordBuffer(0);
                                    applyOtherInitial = true;
                                    longestIdx = -1;
                                    continue;
                                }
                            }
                        } else if (!foundNonMaxMatch) {
                            match = tryToFindMatchIn(rootRow, c);
                            continuing = tryToContinueDownTheTrie(rootRow, c);
                            tokenBuffer.setLength(0);
                            tokenStart = bufferIndex - 1;
                            if (foundMatch) {
                                afterNonwordMatch = true;
                            }
                        }
                        if (currentRow == null) {
                            wentToMaxDownTheTrie = true;
                        }
                        storedNoMatchState = -1;
                        if (tokenBuffer.length() == 0) {
                            tokenBuffer.append((char) c);
                        } else {
                            tokenBuffer.setLength(0); // because no word ever started in the first place
                        }
                    }
                }

                /* Decide what to do with the SLP chars currently processed */
                if (wentBeyondLongestMatch()) {
                    if (foundNonMaxMatch) {
                        restoreNonMaxMatchState();
                    }
                    if (storedInitials != null && storedInitials.contains(nonWordBuffer.toString())) {
                        foundNonMaxMatch = false;
                        foundMatch = false;
                        foundMatchCmdIndex = -1;
                        tokenBuffer.setLength(0);
                        tokenStart = -1;
                        wentToMaxDownTheTrie = false;
                        resetNonWordBuffer(0);
                        nonWordStart = -1;
                        continue;
                    }
                    ifNoInitialsCleanupPotentialTokensAndNonwords();
                    if (thereIsNoTokenAndNoNonword()) {
                        foundNonMaxMatch = false;
                        continue; // resume looping over ioBuffer
                    } else if (initials == null && initialsIterator != null && initialsIterator.hasNext()) {
                        /* the first initial led to a dead end */
                        foundNonMaxMatch = false;
                        resetNonWordBuffer(0);
                        continue; // resume looping over ioBuffer
                    } else if (isLoneInitial()) {
                        foundNonMaxMatch = false;
                        foundMatch = false;
                        foundMatchCmdIndex = -1;
                        continue;
                    } else if (thereAreRemainingInitialsToTest()) {
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                        if (longestIdx < bufferIndex)
                            longestIdx = bufferIndex;
                        restoreInitialsOrigState();
                        reinitializeState();
                        resetNonWordBuffer(0);
                        wentToMaxDownTheTrie = false;
                        applyOtherInitial = true;
                        continue;
                    } else {
                        cutOffTokenFromNonWordBuffer();
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                        addNonwordToPotentialTokensIfThereIsOne();
                        break;
                    }
                } else if (thereAreRemainingInitialsToTest()) {
                    restoreInitialsOrigState();
                    reinitializeState();
                    resetNonWordBuffer(0);
                    wentToMaxDownTheTrie = false;
                    applyOtherInitial = true;
                    continue;
                } else if (reachedNonwordCharacter()) {
                    tokenBuffer.setLength(0); // because no word ever started in the first place
                    tokenStart += charCount;
                } else if (foundAToken()) {
                    if (!afterNonwordMatch || foundMatch) {
                        cutOffTokenFromNonWordBuffer();
                    }
                    if (isLoneInitial()) {
                        tokenBuffer.setLength(0);
                    }
                    if (allCharsFromCurrentInitialAreConsumed()) {
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                        addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
                        if (allInitialsAreConsumed()) {
                            ifNoInitialsCleanupPotentialTokensAndNonwords(); // same as above
                            storedInitials = null;
                            sandhiIndex = -1;
                            if (thereIsNoTokenAndNoNonword()) {
                                continue; // resume looping over ioBuffer
                            } else {
                                break; // and resume looping over ioBuffer
                            }
                        }
                        if (longestIdx < bufferIndex)
                            longestIdx = bufferIndex;
                        resetInitialCharsIterator();
                        restoreInitialsOrigState();
                        reinitializeState();
                        if (wentToMaxDownTheTrie && initialsNotEmpty()) {
                            foundNonMaxMatch = false;
                            resetNonWordBuffer(0);
                        }
                        wentToMaxDownTheTrie = false;
                        foundNonMaxMatch = false;
                        applyOtherInitial = true;
                    } else {
                        ifNoInitialsCleanupPotentialTokensAndNonwords();
                        potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                        addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
                        storedInitials = null;
                        sandhiIndex = -1;
                        break;
                    }
                } else { // we are within a potential token
                    if (reachedEndOfInputString()) {
                        if (allCharsFromCurrentInitialAreConsumed()) {
                            addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
                            if (allInitialsAreConsumed()) {
                                ifNoInitialsCleanupPotentialTokensAndNonwords();
                                storedInitials = null;
                                sandhiIndex = -1;
                                break;
                            }
                            if (longestIdx < bufferIndex)
                                longestIdx = bufferIndex;
                            resetInitialCharsIterator();
                            restoreInitialsOrigState();
                            reinitializeState();
                            if (wentToMaxDownTheTrie && initialsNotEmpty()) {
                                foundNonMaxMatch = false;
                                resetNonWordBuffer(0);
                            }
                            wentToMaxDownTheTrie = false;
                            applyOtherInitial = true;
                        } else {
                            if (foundNonMaxMatch) {
                                restoreNonMaxMatchState();
                                cutOffTokenFromNonWordBuffer();
                                ifNoInitialsCleanupPotentialTokensAndNonwords();
                                break;
                            } else {
                                ifNoInitialsCleanupPotentialTokensAndNonwords();
                                if (!foundMatch && !foundNonMaxMatch) {
                                    tokenBuffer.setLength(0);
                                }
                                break;
                            }
                        }
                    }
                }

                /* tokenBuffer corner case: buffer overflow! */
                if (tokenBuffer.length() >= MAX_WORD_LEN) { // check for >=: a surrogate pair could break an == test
                    break;
                }

            /* A.2.2) if it is not a token char */
            } else if (foundNonMaxMatch) {
                restoreNonMaxMatchState();
                if (matchIsLoneInitial()) {
                    tokenBuffer.setLength(0);
                    foundNonMaxMatch = false;
                    foundMatchCmdIndex = -1;
                    storedNoMatchState = -1;
                    decrement(nonWordBuffer);
                    nonWordStart = -1;
                    if (allCharsFromCurrentInitialAreConsumed()) {
                        addNonwordToPotentialTokensIfThereIsOne(); // we do have a non-word token
                        if (allInitialsAreConsumed()) {
                            ifNoInitialsCleanupPotentialTokensAndNonwords();
                            storedInitials = null;
                            sandhiIndex = -1;
                            if (thereIsNoTokenAndNoNonword() || longestIdx == -1) {
                                continue; // resume looping over ioBuffer
                            } else {
                                if (longestIdx != -1) {
                                    resetNonWordBuffer(0);
                                }
                                break; // and resume looping over ioBuffer
                            }
                        } else {
                            if (longestIdx < bufferIndex)
                                longestIdx = bufferIndex;
                            resetNonWordBuffer(0);
                            resetInitialCharsIterator();
                            restoreInitialsOrigState();
                            reinitializeState();
                            wentToMaxDownTheTrie = false;
                            applyOtherInitial = true;
                            continue;
                        }
                    } else {
                        continue;
                    }
                }
                ifNoInitialsCleanupPotentialTokensAndNonwords();
                if (nonWordBuffer.toString().equals(tokenBuffer.toString()) && nonWordStart != tokenStart) {
                    nonWordStart = tokenStart;
                }
                if (allCharsFromCurrentInitialAreConsumed()) {
                    cutOffTokenFromNonWordBuffer();
                    addNonwordToPotentialTokensIfThereIsOne();
                    potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                    if (allInitialsAreConsumed()) {
                        ifNoInitialsCleanupPotentialTokensAndNonwords();
                        storedInitials = null;
                        sandhiIndex = -1;
                        if (thereIsNoTokenAndNoNonword()) {
                            continue; // resume looping over ioBuffer
                        } else {
                            break; // and resume looping over ioBuffer
                        }
                    } else {
                        if (potentialTokensContainMatches && initials != null) {
                            sandhiIndex = bufferIndex;
                        }
                        if (longestIdx < bufferIndex)
                            longestIdx = bufferIndex;
                        resetNonWordBuffer(0);
                        resetInitialCharsIterator();
                        restoreInitialsOrigState();
                        reinitializeState();
                        wentToMaxDownTheTrie = false;
                        applyOtherInitial = true;
                        continue;
                    }
                } else {
                    break;
                }
            } else if (isNonSLPprecededBySLP() && !isValidCharWithinSandhi(c)) { // we have a nonword token
                decrement(tokenBuffer);
                decrement(nonWordBuffer);
                if (allCharsFromCurrentInitialAreConsumed()) {
                    if (nonwordIsLoneInitial()) {
                        tokenBuffer.setLength(0);
                        resetNonWordBuffer(0);
                        foundMatchCmdIndex = -1;
                        continue;
                    }
                    addNonwordToPotentialTokensIfThereIsOne();
                    if (allInitialsAreConsumed()) {
                        ifNoInitialsCleanupPotentialTokensAndNonwords();
                        storedInitials = null;
                        sandhiIndex = -1;
                        if (foundNonMaxMatch) {
                            restoreNonMaxMatchState();
                            if (isLoneInitial()) {
                                tokenBuffer.setLength(0);
                            }
                            potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                        } else {
                            tokenBuffer.setLength(0);
                        }
                        if (thereIsNoTokenAndNoNonword()) {
                            foundNonMaxMatch = false;
                            continue;
                        } else {
                            break;
                        }
                    }
                    if (longestIdx < bufferIndex)
                        longestIdx = bufferIndex;
                    resetNonWordBuffer(0);
                    resetInitialCharsIterator();
                    restoreInitialsOrigState();
                    reinitializeState();
                    wentToMaxDownTheTrie = false;
                    applyOtherInitial = true;
                } else {
                    ifNoInitialsCleanupPotentialTokensAndNonwords();
                    tokenBuffer.setLength(0); // there was no match in the first place (we are after "if (foundNonMaxMatch)")
                    if (isSLPModifier(c)) {
                        continue; // move on and do as if the modifier didn't exist
                    } else {
                        break;
                    }
                }
            } else if (isNonSLPprecededByNotEmptyNonWord()) {
                decrement(tokenBuffer);
                decrement(nonWordBuffer);
                if (allCharsFromCurrentInitialAreConsumed() && c != ' ') {
                    potentialTokensContainMatches = addFoundTokenToPotentialTokensIfThereIsOne();
                    decrement(nonWordBuffer);
                    if (allInitialsAreConsumed()) {
                        ifNoInitialsCleanupPotentialTokensAndNonwords();
                        storedInitials = null;
                        sandhiIndex = -1;
                        if (thereIsNoTokenAndNoNonword()) {
                            continue; // resume looping over ioBuffer
                        } else {
                            break; // and resume looping over ioBuffer
                        }
                    }
                    if (longestIdx < bufferIndex)
                        longestIdx = bufferIndex;
                    resetNonWordBuffer(0);
                    resetInitialCharsIterator();
                    restoreInitialsOrigState();
                    reinitializeState();
                    wentToMaxDownTheTrie = false;
                    applyOtherInitial = true;
                } else {
                    ifNoInitialsCleanupPotentialTokensAndNonwords();
                    tokenBuffer.setLength(0);
                    if (potentialTokens.isEmpty() && tokenBuffer.length() == 0 && nonWordBuffer.length() == 0) {
                        reinitializeState();
                        continue;
                    } else {
                        break;
                    }
                }
            } else {
                decrement(tokenBuffer);
                decrement(nonWordBuffer);
                nonWordStart = -1;
            }
        }

        /* B. HANDING THEM TO LUCENE */
        initials = null; // all initials are consumed; reinitialize for the next call of reconstructLemmas()
        initialsIterator = null;
        initialCharsIterator = null;
        if (bufferIndex < longestIdx)
            bufferIndex = longestIdx;
        longestIdx = -1;

        /* B.1. FILLING totalTokens */
        if (unsandhyingInitialsYieldedPotentialTokens()) {
            if (potentialTokensContainMatches) {
                if (nonWordPrecedes()) {
                    ifThereIsNonwordAddItToTotalTokens();
                }
                unsandhiFinalsAndAddLemmatizedMatchesToTotalTokens();
            } else {
                ifThereIsNonwordAddItToTotalTokens();
            }
            potentialTokens.clear(); // all potential tokens have been consumed, empty the variable
            ifSandhiMergesStayOnSameCurrentChar(); // so we can unsandhi the initial and find the start of the next word
            finalsIndex = bufferIndex; // save the index of the finals for currentCharIsSpaceWithinSandhi()
        } else { // general case: no potential tokens
            boolean aNonwordWasAdded = false;
            if ((nonWordBuffer.length() > 0 && tokenBuffer.length() <= 0)
                    || (nonWordBuffer.length() > 0 && tokenBuffer.length() > 0 && nonWordStart < tokenStart)) {
                aNonwordWasAdded = ifThereIsNonwordAddItToTotalTokens();
            }
            boolean lemmasWereAdded = ifUnsandhyingFinalsYieldsLemmasAddThemToTotalTokens();
            if (lemmasWereAdded) {
                ifSandhiMergesStayOnSameCurrentChar(); // so we can unsandhi the initial and find the start of the next word
                finalsIndex = bufferIndex; // save the index of the finals for currentCharIsSpaceWithinSandhi()
            } else if (aNonwordWasAdded) {
                // if a non-word was added, there was a match but no sandhi
            }
        }

        /* B.2. EXITING incrementToken() WITH THE TOKEN (OR THE FIRST ONE FROM totalTokens) */
        ifThereAreInitialsFillIterator();
        ifEndOfInputReachedEmptyInitials();
        if (thereAreTokensToReturn()) {
            hasTokenToEmit = true;

            /* B.2.a. ADJUSTING THE LIST OF PreTokens */
            /* deal with preverbs and other custom-defined entries */
            processMultiTokenLemmas();
            removeOverlappingNonwords();
            /* B.2.b. RETURNING A TOKEN */
            final PreToken firstToken = totalTokens.removeFirst();
            final Integer[] metaData = firstToken.getMetadata();
            fillTermAttributeWith(firstToken.getString(), metaData);
            changeTypeOfToken(metaData[3]);
            changePartOfSpeech(metaData[4]);
            incrAtt.setPositionIncrement(1);
            return true; // we exit incrementToken()
        } else { // there is no non-word nor extra lemma to add; there was no sandhi for this token
            assert (tokenStart != -1);
            finalizeSettingTermAttribute();
            return true; // we exit incrementToken()
        }
    }

    private void removeOverlappingNonwords() {
        TreeSet<Integer> toDelete = new TreeSet<Integer>(Collections.reverseOrder());
        for (int i = 0; i < totalTokens.size(); i++) {
            final Integer[] aMetadata = totalTokens.get(i).getMetadata();
            final int aStart = aMetadata[0];
            final int aEnd = aMetadata[1];
            final int aPos = aMetadata[4];
            if (aPos == -1) {
                for (int j = 0; j < totalTokens.size(); j++) {
                    if (i != j) {
                        final Integer[] bMetadata = totalTokens.get(j).getMetadata();
                        final int bStart = bMetadata[0];
                        final int bEnd = bMetadata[1];
                        final int bPos = bMetadata[4];
                        if (aStart >= bStart && aEnd <= bEnd) {
                            if (bPos != -1) {
                                toDelete.add(i);
                            } else {
                                if (!toDelete.contains(i) && !toDelete.contains(j)) {
                                    toDelete.add(i);
                                }
                            }
                        }
                    }
                }
            }
        }
        for (int idx : toDelete) {
            totalTokens.remove(idx);
        }
    }

    /**
     * Splits tokens that have a multi-token lemma.
     */
    private void processMultiTokenLemmas() {
        for (int i = 0; i < totalTokens.size(); i++) {
            PreToken token = totalTokens.get(i);
            if (token.getString().contains("⟾")) {
                String[] rawTokens = token.getString().split("⟾");
                LinkedList<PreToken> newTokens = new LinkedList<PreToken>();
                for (String rawToken : rawTokens) {
                    Integer[] metaData = token.getMetadata().clone();
                    final int base = metaData[0];
                    final int underscore = rawToken.indexOf('_');
                    final int arrow = rawToken.indexOf('>');
                    final String string = rawToken.substring(0, underscore - 1);
                    final int pos = Integer.valueOf(rawToken.substring(underscore - 1, underscore));
                    final int startIdx = Integer.valueOf(rawToken.substring(underscore + 1, arrow)) - 1;
                    final int endIdx = Integer.valueOf(rawToken.substring(arrow + 1, rawToken.length()));
                    final int tokenLen = string.length();
                    // replace with the new start and end
                    metaData[0] = base + startIdx;
                    metaData[1] = base + endIdx;
                    metaData[2] = tokenLen;
                    metaData[4] = pos;
                    newTokens.add(new PreToken(string, metaData));
                }
                // replace the original token with the new ones
                totalTokens.remove(i);
                int j = 0;
                for (PreToken newToken : newTokens) {
                    if (!isDupe(newToken))
                        totalTokens.add(i + j, newToken);
                    j++;
                }
            }
        }
    }
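    /*
     * Reading processMultiTokenLemmas() above, a stored multi-token lemma is
     * assumed to look like this (a hypothetical entry that follows the parsing
     * logic, not an attested one from the data):
     *
     *     "gam3_1>3⟾tvA0_4>6"
     *
     * i.e. "⟾"-separated chunks of the form <string><pos>_<startIdx>><endIdx>,
     * where <pos> is the single character right before '_' (fed to
     * changePartOfSpeech()), <startIdx> appears to be 1-based (hence the "- 1")
     * and <endIdx> is relative to the start of the original token.
     */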
    TreeSet<String> reconstructLemmas(String cmd, String inflected) throws NumberFormatException, IOException {
        return reconstructLemmas(cmd, inflected, -1);
    }

    public static class LemmaInfo implements Comparable<LemmaInfo> {
        String lemma;
        Integer pos;

        public LemmaInfo(String lemma, int pos) {
            this.lemma = lemma;
            this.pos = pos;
        }

        @Override
        public int compareTo(LemmaInfo arg0) {
            int posDiff = pos.compareTo(arg0.pos);
            if (posDiff != 0)
                return posDiff;
            return lemma.compareTo(arg0.lemma);
        }
    }

    /**
     * Reconstructs all the possible sandhied strings for the first word using CmdParser.parse(),
     * iterates through them, checking if the sandhied string is found in the sandhiable range,
     * and only reconstructs the lemmas if there is a match.
     * <p>
     * Each time an idempotent sandhi is indicated by its group in the cmd, all the possibilities
     * are generated and a DiffStruct is created and stored in diffLists[1].
     *
     * @param cmd the cmd of the current word
     * @param inflected the inflected word to be lemmatized
     * @param tokenEndIdx the index of the end of the token in ioBuffer (-1 to default to bufferIndex)
     * @return the set of all the possible lemmas given the current context
     */
    TreeSet<String> reconstructLemmas(String cmd, String inflected, int tokenEndIdx)
            throws NumberFormatException, IOException {
        TreeSet<String> totalLemmas = new TreeSet<String>(); // uses a Set to avoid duplicates
        CmdParser parser = new CmdParser();
        if (tokenEndIdx == -1)
            tokenEndIdx = bufferIndex;
        // (a hack needed because we don't want to generate the idempotent sandhis of ALL
        // the sandhis in the cmd; we only need those from the sandhis that were applied)
        // a list of two elements:
        // the first contains all the contexts from the sandhis of the cmd,
        // the second will contain all the contexts from the idempotent sandhis
        // of the sandhis that fit in the current context
        List<TreeMap<String, TreeSet<DiffStruct>>> diffLists = Arrays.asList(parser.parse(inflected, cmd),
                new TreeMap<String, TreeSet<DiffStruct>>());
        for (TreeMap<String, TreeSet<DiffStruct>> diffList : diffLists) {
            for (Entry<String, TreeSet<DiffStruct>> current : diffList.entrySet()) {
                String sandhied = current.getKey();
                TreeSet<DiffStruct> diffs = current.getValue();
                boolean foundAsandhi = false;
                for (DiffStruct diff : diffs) {
                    if (diff.sandhiType == 0 && diff.toAdd.isEmpty() && diff.nbToDelete == 0
                            && diff.initial.isEmpty()) {
                        final String lemma = inflected.substring(0, inflected.length() - diff.nbToDelete)
                                + diff.toAdd + "_" + diff.pos;
                        totalLemmas.add(lemma);
                        continue; // there is no sandhi, so we skip the rest of this diff
                    }
                    if (containsSandhiedCombination(ioBuffer, tokenEndIdx - 1, sandhied, diff.sandhiType)) {
                        foundAsandhi = true;
                        if (!diff.initial.isEmpty() || diff.idempotentGroup == -2) {
                            if (initials == null) {
                                initials = new LinkedHashMap<String, Integer>();
                                storedInitials = new HashSet<String>();
                            }
                            if (diff.idempotentGroup == -2) {
                                initials.put(diff.initial, 1);
                                idempotentIdx = bufferIndex + 1;
                            } else {
                                initials.put(diff.initial, -1);
                            }
                            storedInitials.add(diff.initial);
                        }
                        if (diff.idempotentGroup != -2) {
                            final String lemma = inflected.substring(0, inflected.length() - diff.nbToDelete)
                                    + diff.toAdd + "_" + diff.pos;
                            totalLemmas.add(lemma);
                        }
                        if (diff.idempotentGroup > 0) { // filters groups -1 and 0 (no sandhi)
                            TreeMap<String, TreeSet<DiffStruct>> idemDiffList = diffLists.get(1);
                            final HashMap<String, ?> idemSandhis = parser.getIdemSandhied(inflected,
                                    diff.idempotentGroup);
                            for (Entry<String, ?> idem : idemSandhis.entrySet()) {
                                final String initial = idem.getKey().substring(idem.getKey().length() - 1);
                                TreeSet<DiffStruct> structs = new TreeSet<DiffStruct>();
                                structs.add(new DiffStruct(0, null, initial, 10, diff.pos, -2));
                                idemDiffList.put(idem.getKey(), structs);
                            }
                            diffLists.set(1, idemDiffList);
                        }
                    }
                }
                if (foundAsandhi)
                    break;
            }
        }
        return totalLemmas;
    }

    /**
     * Tells whether sandhied could be found between the two words.
     * Does so by generating all the legal combinations, filtering spaces
     * and checking for equality.
     * <p>
     * See SandhiedCombinationTests for how these figures were obtained.
     *
     * @param ioBuffer given as a parameter so the tests can supply their own buffer
     * @param bufferIndex the index of the last character of the current token
     * @param sandhied the sandhied string to look for at the word boundary
     * @param sandhiType the type of sandhi indicated by the cmd
     * @return true if sandhied is one of the combinations; false otherwise
     */
    static boolean containsSandhiedCombination(RollingCharBuffer ioBuffer, int bufferIndex, String sandhied,
            int sandhiType) throws IOException {
        switch (sandhiType) {
        case 0:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // no sandhi, but lemmatization required
        case 1:
            if (isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0)) { // vowel sandhi
                if (sandhied.length() == 1) {
                    mergesInitials = true;
                }
                return true;
            } else {
                return false;
            }
        case 2:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // consonant sandhi 1
        case 3:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // consonant sandhi 1 vowels
        case 4:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // consonant sandhi 2
        case 5:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // visarga sandhi
        case 6:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -1); // visarga sandhi 2
        case 7:
            // (consonant clusters are always reduced to the first consonant)
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // absolute finals sandhi
        case 8:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // "cC"-words sandhi
        case 9:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, -4); // special sandhi: "punar"
        case 10:
            return isSandhiedCombination(ioBuffer, bufferIndex, sandhied, 0); // idempotent sandhi
        default:
            return false;
        }
    }

    static boolean isSandhiedCombination(RollingCharBuffer ioBuffer, int bufferIndex, String sandhied, int start)
            throws IOException {
        int j = 0;
        int nbIgnoredSpaces = 0;
        while (j < sandhied.length()) {
            final int res = ioBuffer.get(bufferIndex + start + j + nbIgnoredSpaces);
            if (isValidCharWithinSandhi(res)) { // skip spaces and hyphens within the sandhied combination
                nbIgnoredSpaces++;
                continue;
            }
            if (res == -1)
                return false;
            if (res != sandhied.codePointAt(j))
                return false;
            j++;
        }
        return true;
    }
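    /*
     * A worked example for the two methods above (a sketch; the word pair is
     * illustrative and the cmd details are assumed, not taken from the data):
     * with vowel sandhi a + i -> e, "ca" + "iti" surfaces as "ceti". When the
     * Trie match "ca" ends, its cmd proposes the sandhied string "e" with
     * sandhiType 1, so the check amounts to:
     *
     *     containsSandhiedCombination(ioBuffer, tokenEndIdx - 1, "e", 1);
     *
     * isSandhiedCombination() then compares "e" against the chars at the word
     * boundary in ioBuffer (skipping spaces and hyphens). Since it matches and
     * is one char long, mergesInitials is set to true and the same position is
     * re-read, this time as the unsandhied initial "i" of the next word.
     */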
    private void decrement(StringBuilder buffer) {
        buffer.setLength(buffer.length() - charCount);
    }

    private void ifNoInitialsCleanupPotentialTokensAndNonwords() {
        if (storedInitials != null) {
            /* cleanup potentialTokens */
            for (String key : storedInitials) {
                if (potentialTokens.containsKey(key)) {
                    potentialTokens.remove(key);
                }
                if (tokenBuffer.toString().equals(key)) {
                    tokenBuffer.setLength(0);
                }
            }
            /* cleanup nonwords */
            final String nonword = nonWordBuffer.toString();
            if (storedInitials.contains(nonword)) {
                resetNonWordBuffer(0);
            }
        }
    }

    private void ifEndOfInputReachedEmptyInitials() throws IOException {
        if (ioBuffer.get(bufferIndex) == -1) {
            initials = null;
            initialsIterator = null;
        }
    }

    private void finalizeSettingTermAttribute() {
        finalOffset = correctOffset(tokenStart + tokenBuffer.length());
        offsetAtt.setOffset(correctOffset(tokenStart), finalOffset);
        termAtt.setEmpty().append(tokenBuffer.toString());
    }

    private void changeTypeOfToken(int t) {
        if (t == 0) {
            typeAtt.setType("non-word");
        } else if (t == 1) {
            typeAtt.setType("word");
        } else if (t == 2) {
            typeAtt.setType("lemma");
        }
    }

    private void changePartOfSpeech(int t) {
        if (t == 0) {
            posAtt.setPartOfSpeech(PartOfSpeech.Indeclinable);
        } else if (t == 1) {
            posAtt.setPartOfSpeech(PartOfSpeech.Noun);
        } else if (t == 2) {
            posAtt.setPartOfSpeech(PartOfSpeech.Pronoun);
        } else if (t == 3) {
            posAtt.setPartOfSpeech(PartOfSpeech.Verb);
        } else if (t == 4) {
            posAtt.setPartOfSpeech(PartOfSpeech.Preposition);
        } else {
            posAtt.setPartOfSpeech(PartOfSpeech.Unknown);
        }
    }

    private void fillTermAttributeWith(String string, Integer[] metaData) {
        termAtt.setEmpty().append(string); // add the token string
        termAtt.setLength(metaData[2]); // declare its size
        finalOffset = correctOffset(metaData[1]); // get the final offset
        offsetAtt.setOffset(correctOffset(metaData[0]), finalOffset); // set the offsets (initial & final)
    }

    private void ifThereAreInitialsFillIterator() {
        if (initials != null && !initials.isEmpty()) {
            initialsIterator = initials.entrySet().iterator(); // one sandhi can yield many unsandhied initials
        }
    }

    private void ifSandhiMergesStayOnSameCurrentChar() throws IOException {
        if (charCount != -1 && mergesInitials) {
            if (ioBuffer.get(bufferIndex) != -1) { // if the end of the input is not reached
                bufferIndex -= charCount;
                if (sandhiIndex > -1 && bufferIndex + charCount == sandhiIndex)
                    sandhiIndex -= charCount;
            }
            mergesInitials = false; // reinitialize the variable
        }
    }

    private boolean ifUnsandhyingFinalsYieldsLemmasAddThemToTotalTokens() throws NumberFormatException, IOException {
        String cmd = scanner.getCommandVal(foundMatchCmdIndex);
        if (cmd != null) {
            String token = tokenBuffer.toString();
            if (!token.isEmpty()) {
                if (debug) System.out.println("form found: " + token + "\n");
                final Set<String> lemmas = reconstructLemmas(cmd, token);
                if (lemmas.size() != 0) {
                    for (String l : lemmas) {
                        final int underscore = l.lastIndexOf('_');
                        final String lemma = l.substring(0, underscore);
                        final int pos = Integer.valueOf(l.substring(underscore + 1));
                        final PreToken newToken = new PreToken(lemma,
                                new Integer[] {tokenStart, tokenStart + tokenBuffer.length(), lemma.length(), 2, pos});
                        if (!isDupe(newToken))
                            totalTokens.add(newToken); // use the same start-end indices, since all lemmas come from the same inflected form
                    }
                    return true;
                }
            }
        }
        return false;
    }

    private boolean ifThereIsNonwordAddItToTotalTokens() {
        boolean containsNonWord = false;
        final String nonWord = nonWordBuffer.toString();
        if (nonWord.length() > 0) {
            final PreToken newToken = new PreToken(nonWord,
                    new Integer[] {nonWordStart, nonWordStart + nonWordBuffer.length(), nonWord.length(), 0, -1});
            if (!isDupe(newToken))
                totalTokens.add(newToken); // ignore all potential tokens; add the non-word with its sandhied initials
            containsNonWord = true;
        }
        for (Entry<String, Integer[]> potential : potentialTokens.entrySet()) {
            if (potential.getValue()[3] == 0) {
                final PreToken newToken = new PreToken(potential.getKey(), potential.getValue());
                if (!isDupe(newToken)) {
                    totalTokens.add(newToken);
                    containsNonWord = true;
                }
            }
        }
        return containsNonWord;
    }

    private void unsandhiFinalsAndAddLemmatizedMatchesToTotalTokens() throws NumberFormatException, IOException {
        for (Entry<String, Integer[]> entry : potentialTokens.entrySet()) {
            final String key = entry.getKey();
            final Integer[] value = entry.getValue();
            if (debug) System.out.println("form found: " + key);
            if (value[3] == 1) {
                String cmd = scanner.getCommandVal(value[4]);
                final Set<String> lemmas = reconstructLemmas(cmd, key, value[1]);
                if (lemmas.size() != 0) {
                    for (String l : lemmas) { // multiple lemmas are possible: the finals remain unanalyzed
                        final int underscore = l.lastIndexOf('_');
                        final String lemma = l.substring(0, underscore);
                        final int pos = Integer.valueOf(l.substring(underscore + 1));
                        final PreToken newToken = new PreToken(lemma,
                                new Integer[] {value[0], value[1], lemma.length(), 2, pos});
                        if (!isDupe(newToken))
                            totalTokens.add(newToken); // use the same indices for all (all are from the same inflected form)
                    }
                } else { // there is no applicable sandhi; the form is returned as-is
                    final int pos = Integer.valueOf(cmd.substring(cmd.lastIndexOf('#') + 1));
                    final PreToken newToken = new PreToken(key, new Integer[] {value[0], value[1], value[2], 1, pos});
                    if (!isDupe(newToken))
                        totalTokens.add(newToken);
                    mergesInitials = false;
                }
            } else {
                if (debug) System.out.println("can't be lemmatized\n");
            }
        }
    }

    private boolean isDupe(PreToken newToken) {
        // determines if newToken already exists:
        // looks for equality in both the metadata and the token string
        boolean isDupe = false;
        int idx = 0;
        while (!isDupe && idx < totalTokens.size()) {
            if (newToken.compareTo(totalTokens.get(idx)) == 0) {
                isDupe = true;
            }
            idx++;
        }
        return isDupe;
    }

    private void cutOffTokenFromNonWordBuffer() {
        int newSize = nonWordBuffer.length() - tokenBuffer.length();
        newSize = newSize < 0 ? 0 : newSize; // ensure the new size is never negative
        // the end of a non-word can be: a matching word starts (potentialEnd == true)
        // OR a nonSLP char follows a nonWord
        nonWordBuffer.setLength(newSize);
    }

    private void ifIsNeededInitializeStartingIndexOfNonword() {
        if (nonWordStart == -1) { // the starting index of a non-word token does not increment
            nonWordStart = bufferIndex;
            if (!previousIsSpace) {
                nonWordStart -= charCount;
                previousIsSpace = false;
            }
        }
    }

    private void incrementTokenIndices() {
        if (tokenStart == -1) {
            tokenStart = bufferIndex - charCount;
        }
    }

    private boolean tryToContinueDownTheTrie(Row row, int c) {
        int ref = row.getRef((char) c);
        currentRow = (ref >= 0) ? scanner.getRow(ref) : null;
        return currentRow != null;
    }

    private boolean tryToFindMatchIn(Row row, int c) {
        cmdIndex = row.getCmd((char) c);
        foundMatch = (cmdIndex >= 0);
        if (foundMatch) {
            foundMatchCmdIndex = cmdIndex;
            foundNonMaxMatch = storeNonMaxMatchState();
            if (storedNoMatchState == -1) {
                storeNoMatchState();
                storedNoMatchState = 1;
            }
            return true;
        }
        return false;
    }

    private boolean storeNoMatchState() {
        noMatchBufferIndex = bufferIndex;
        noMatchTokenStart = (tokenStart == -1) ? 0 : tokenStart;
        noMatchBuffer.setLength(0);
        noMatchBuffer.append(tokenBuffer);
        noMatchFoundMatchCmdIndex = foundMatchCmdIndex;
        return true;
    }

    private void restoreNoMatchState() {
        bufferIndex = noMatchBufferIndex;
        tokenStart = noMatchTokenStart;
        currentRow = rootRow;
        foundMatchCmdIndex = noMatchFoundMatchCmdIndex;
    }

    private boolean storeNonMaxMatchState() {
        nonMaxBufferIndex = bufferIndex;
        nonMaxTokenStart = (tokenStart == -1) ? 0 : tokenStart;
        nonMaxBuffer.setLength(0);
        nonMaxBuffer.append(tokenBuffer);
        nonMaxFoundMatchCmdIndex = foundMatchCmdIndex;
        nonMaxNonWordLength = nonWordBuffer.length();
        return true;
    }

    private void restoreNonMaxMatchState() {
        bufferIndex = nonMaxBufferIndex;
        tokenStart = nonMaxTokenStart;
        currentRow = rootRow;
        tokenBuffer.setLength(0);
        tokenBuffer.append(nonMaxBuffer);
        foundMatchCmdIndex = nonMaxFoundMatchCmdIndex;
        nonWordBuffer.setLength(nonMaxNonWordLength);
    }

    private void storeCurrentState() {
        initialsOrigBufferIndex = bufferIndex - 1;
        initialsOrigTokenStart = (tokenStart == -1) ? 0 : tokenStart;
        initialsOrigBuffer.setLength(0);
        initialsOrigBuffer.append(tokenBuffer);
    }

    /* returns to the beginning of the token in ioBuffer */
    private void restoreInitialsOrigState() {
        bufferIndex = initialsOrigBufferIndex;
        tokenStart = initialsOrigTokenStart;
        currentRow = rootRow;
        tokenBuffer.setLength(0);
        tokenBuffer.append(initialsOrigBuffer);
    }

    private void reinitializeState() {
        currentRow = rootRow; // TEST
        cmdIndex = -1;
        foundMatchCmdIndex = -1;
        foundMatch = false;
        afterNonwordMatch = false;

        nonMaxBuffer.setLength(0);
        nonMaxTokenStart = -1;
        nonMaxBufferIndex = -1;
        nonMaxFoundMatchCmdIndex = -1;
        nonMaxNonWordLength = -1;
        foundNonMaxMatch = false;
        firstInitialIndex = -1;

        noMatchBuffer.setLength(0);
        noMatchTokenStart = -1;
        noMatchBufferIndex = -1;
        storedNoMatchState = -1;
        noMatchFoundMatchCmdIndex = -1;

        initialsOrigBuffer.setLength(0);
        initialsOrigTokenStart = -1;
        initialsOrigBufferIndex = -1;
    }

    private void resetNonWordBuffer(int i) {
        if (nonWordBuffer.length() - i > 0) {
            nonWordBuffer.setLength(i);
        } else {
            nonWordBuffer.setLength(0);
        }
    }

    private void addNonwordToPotentialTokensIfThereIsOne() {
        if (nonWordBuffer.length() != 0 && nonWordStart < tokenStart) {
            potentialTokens.put(nonWordBuffer.toString(),
                    new Integer[] {nonWordStart, nonWordStart + nonWordBuffer.length(), nonWordBuffer.length(), 0, -1});
        }
    }

    private boolean addFoundTokenToPotentialTokensIfThereIsOne() {
        if (tokenBuffer.length() > 0) { // avoid empty tokens
            final String potentialToken = tokenBuffer.toString();
            potentialTokens.put(potentialToken,
                    new Integer[] {tokenStart, tokenStart + tokenBuffer.length(), potentialToken.length(), 1, foundMatchCmdIndex});
            return true;
        }
        return false;
    }

    private boolean initializeInitialCharsIteratorIfNeeded() {
        boolean isIdem = false;
        if (initialCharsIterator == null) {
            Entry<String, Integer> entry = initialsIterator.next();
            if (entry.getValue() == 1)
                isIdem = true;
            initialCharsIterator = new StringCharacterIterator(entry.getKey()); // initialize the iterator with the first initials
            initialsIterator.remove(); // remove the initials just fed to the initialCharsIterator
        } else if (initialsIterator.hasNext()) {
            /* either the first time, or the initialCharsIterator has been reset
             * AND there are more initials to process */
            Entry<String, Integer> entry = initialsIterator.next();
            if (entry.getValue() == 1)
                isIdem = true;
            initialCharsIterator.setText(entry.getKey()); // fill with the new initials;
            // happens when we reach the end of a token (either a Trie match or a non-word)
            initialsIterator.remove(); // remove the initials just fed to the initialCharsIterator
        }
        return isIdem;
    }

    private int applyInitialChar() throws IOException {
        int initial = initialCharsIterator.current();
        if (initial == CharacterIterator.DONE) {
            initial = ioBuffer.get(bufferIndex);
        }
        if (initialCharsIterator.getIndex() == initialCharsIterator.getEndIndex()) {
            initialCharsIterator.setIndex(0);
        } else {
            initialCharsIterator.setIndex(initialCharsIterator.getIndex() + 1); // increment the iterator index
        }
        // charCount is not updated with the new value of c: we only process SLP,
        // so there are never surrogate pairs
        return initial;
    }

    private void addExtraToken() {
        if (totalTokens.peekFirst() != null) {
            final PreToken nextToken = totalTokens.removeFirst();
            final Integer[] metaData = nextToken.getMetadata();
            termAtt.setEmpty().append(nextToken.getString());
            changePartOfSpeech(metaData[4]);
            changeTypeOfToken(metaData[3]);
            termAtt.setLength(metaData[2]);
            finalOffset = correctOffset(metaData[1]);
            offsetAtt.setOffset(correctOffset(metaData[0]), finalOffset);
            incrAtt.setPositionIncrement(0);
        } else {
            hasTokenToEmit = false;
        }
    }

    private final boolean isLoneInitial() {
        boolean isInitial = false;
        if (storedInitials != null) {
            String tokenStr = tokenBuffer.toString();
            for (String initial : storedInitials) {
                if (tokenStr.equals(initial) && nonWordBuffer.length() == 0) {
                    isInitial = true;
                }
            }
        }
        return isInitial;
    }

    private final boolean nonWordPrecedes() {
        int nonWordStartIdx = -1;
        int wordStartIdx = -1;
        for (Integer[] value : potentialTokens.values()) {
            if (value[3] == 0) {
                nonWordStartIdx = value[0];
            } else if (value[3] == 1) {
                wordStartIdx = value[0];
            }
        }
        return nonWordStartIdx != -1 && wordStartIdx > nonWordStartIdx;
    }

    private final boolean thereAreRemainingInitialsToTest() {
        /* To remember: returns false if (foundMatch == true), even if there are remaining initials */
        return wentToMaxDownTheTrie && foundMatch == false && initialsNotEmpty();
    }

    private final boolean initialsNotEmpty() {
        return initials != null && storedInitials != null && !initials.isEmpty()
                && initials.size() <= storedInitials.size() - 1;
    }

    private final boolean nonwordIsLoneInitial() {
        return storedInitials != null && storedInitials.contains(nonWordBuffer.toString());
    }

    private final boolean matchIsLoneInitial() {
        return tokenBuffer.length() == 1 && storedInitials != null
                && storedInitials.contains(tokenBuffer.toString());
    }

    private final boolean isSLPTokenChar(int c) {
        // SLP modifiers are excluded because they are not considered to be part of a word/token.
        // TODO: if a modifier occurs between two sandhied words, the second word won't be considered sandhied
        return SkrtSyllableTokenizer.charType.get(c) != null;
    }
    private final boolean currentCharIsSpaceWithinSandhi(int c) {
        return finalsIndex + 1 == bufferIndex && isValidCharWithinSandhi(c);
    }

    private static final boolean isValidCharWithinSandhi(int c) {
        return c == ' ' || c == '-';
    }

    private final boolean isSLPModifier(int c) {
        return SkrtSyllableTokenizer.charType.get(c) != null
                && SkrtSyllableTokenizer.charType.get(c) == SkrtSyllableTokenizer.MODIFIER;
    }

    private final boolean thereIsNoTokenAndNoNonword() {
        return tokenBuffer.length() == 0 && nonWordBuffer.length() == 0;
    }

    private final boolean wentBeyondLongestMatch() {
        return foundNonMaxMatch && wentToMaxDownTheTrie && foundMatch == false;
    }

    private final boolean thereAreTokensToReturn() {
        return !totalTokens.isEmpty();
    }

    private final boolean reachedNonwordCharacter() {
        // we can't continue down the Trie, yet we don't have any match
        return currentRow == null && foundMatch == false;
    }

    private final boolean unsandhyingInitialsYieldedPotentialTokens() {
        return !potentialTokens.isEmpty();
    }

    private final boolean isNonSLPprecededByNotEmptyNonWord() {
        return currentRow == null && nonWordBuffer.length() - charCount > 0;
    }

    private final boolean isNonSLPprecededBySLP() {
        return tokenBuffer.length() > 1;
    }

    private final boolean reachedEndOfInputString() throws IOException {
        return ioBuffer.get(bufferIndex) == -1;
    }

    private final boolean allCharsFromCurrentInitialAreConsumed() {
        return initials != null && initialCharsIterator.current() == CharacterIterator.DONE;
    }

    private final boolean isStartOfTokenOrIsNonwordChar() {
        return tokenBuffer.length() == 1;
    }

    private final boolean startConsumingInitials() {
        return initialCharsIterator == null;
    }

    private final boolean stillConsumingInitials() {
        return initialCharsIterator.getIndex() < initialCharsIterator.getEndIndex();
    }

    private final boolean initialIsNotFollowedBySandhied(int c) {
        return isValidCharWithinSandhi(c) && (firstInitialIndex == -1 || bufferIndex == firstInitialIndex + 1);
    }

    private final boolean allInitialsAreConsumed() {
        return initialsIterator != null && !initialsIterator.hasNext();
    }

    private final void resetInitialCharsIterator() {
        if (initialCharsIterator != null)
            initialCharsIterator.setIndex(0);
    }

    private final boolean thereAreInitialsToConsume() throws IOException {
        return initials != null && !initials.isEmpty();
    }

    private final boolean foundAToken() throws IOException {
        return (currentRow == null && foundMatch == true) || (foundMatch == true && reachedEndOfInputString());
    }

    @Override
    public final void end() throws IOException {
        super.end();
        offsetAtt.setOffset(finalOffset, finalOffset); // set the final offset
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        bufferIndex = 0;
        finalOffset = 0;
        ioBuffer.reset(input); // make sure to reset the IO buffer!!
        totalTokens = new LinkedList<PreToken>();
        finalsIndex = -1;
        hasTokenToEmit = false; // for emitting multiple tokens
        idempotentIdx = -1;
    }

    public static class PreToken implements Comparable<PreToken> {
        String tokenString;
        Integer[] tokenMetaData;

        public PreToken(String string, Integer[] metaData) {
            this.tokenString = string;
            this.tokenMetaData = metaData;
        }

        public String getString() {
            return tokenString;
        }

        public Integer[] getMetadata() {
            return tokenMetaData;
        }

        @Override
        public int compareTo(PreToken o) {
            boolean meta = Arrays.equals(tokenMetaData, o.tokenMetaData);
            boolean str = tokenString.equals(o.tokenString);
            if (meta && str) {
                return 0;
            }
            return 1;
        }
    }
}