io.bdrc.lucene.sa.SkrtSyllableTokenizer Maven / Gradle / Ivy
Show all versions of lucene-sa Show documentation
/*******************************************************************************
* Copyright (c) 2017 Buddhist Digital Resource Center (BDRC)
*
* If this file is a derivation of another work the license header will appear
* below; otherwise, this work is licensed under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with the
* License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package io.bdrc.lucene.sa;
import java.io.IOException;
import java.util.HashMap;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
*
* A Syllable Tokenizer for Sanskrit encoded in SLP1.
*
* Does not implement complex syllabification rules; it applies the same
* syllabification as the Sanskrit Library.
*
* Derived from Lucene 6.4.1 analysis.util.CharTokenizer
*
* @author Hélios Hildt
* @author Élie Roux
*
*/
public final class SkrtSyllableTokenizer extends Tokenizer {

    /** Not referenced inside this class; kept because it is part of the public API. */
    public static final int DEFAULT_MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 4096;

    // valid SLP characters' types
    public final static int VOWEL = 0;
    public final static int SPECIALPHONEME = 1;
    public final static int CONSONANT = 2;
    public final static int MODIFIER = 3;
    // SLP punctuation
    public final static int PUNCT = 4;

    // states returned by afterConsonantCluster()
    public final static int CLUSTER_N_VOWEL = 20;
    public final static int CLUSTER_N_PUNCT = 21;
    public final static int CLUSTER_N_END = 22;
    public final static int NOT_A_CLUSTER = 23;

    // states returned by syllEndingCombinations()
    public final static int SLP_N_NONSLP = 10;
    public final static int MODIFIER_N_CONSONANT = 11;
    public final static int SPECIALPHONEME_N_CONSONANT = 12;
    public final static int VOWEL_N_CONSONANT = 13;
    public final static int NOT_SYLL_END = 14;

    /** Characters treated as punctuation when looking past a consonant (code point to {@link #PUNCT}). */
    private static final HashMap<Integer, Integer> skrtPunct = punctMap();

    /** Every legal SLP1 character (code point) mapped to its type constant. */
    static final HashMap<Integer, Integer> charType = createMap();

    /** Upper bound on the number of chars buffered into a single token. */
    private final int maxTokenLen = 10;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);

    // offset: stream position of ioBuffer[0]; bufferIndex: read cursor inside ioBuffer;
    // dataLen: number of valid chars in ioBuffer; finalOffset: offset published by end()
    private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
    // last code point seen by incrementToken(), -1 before anything was read
    private int previousChar = -1;

    /**
     * Construct a new SkrtSyllableTokenizer.
     */
    public SkrtSyllableTokenizer() {
    }

    /** Registers every character of {@code chars} in {@code map} with the given type. */
    private static void register(HashMap<Integer, Integer> map, String chars, int type) {
        for (int i = 0; i < chars.length(); i++) {
            map.put((int) chars.charAt(i), type);
        }
    }

    private static final HashMap<Integer, Integer> punctMap() {
        HashMap<Integer, Integer> punct = new HashMap<>();
        register(punct, ". ,", PUNCT);
        return punct;
    }

    private static final HashMap<Integer, Integer> createMap() {
        HashMap<Integer, Integer> types = new HashMap<>();
        // vowels
        register(types, "aAiIuUfFxXeEoO", VOWEL);
        // special class for anusvara & visarga, jihvamuliya, upadhmaniya
        register(types, "MHVZ~", SPECIALPHONEME);
        // consonants
        register(types, "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvL|Szsh", CONSONANT);
        // Modifiers
        // NOTE(review): '5' is absent from this list, exactly as in the original table - confirm it is intended
        register(types, "_=!#1234/\\^6789+", MODIFIER);
        return types;
    }

    /**
     * Produces the next syllable.
     *
     * Characters are read from the input through {@code ioBuffer}. Legal SLP1
     * characters are accumulated into the term; the token is closed when a
     * syllable boundary is detected before the current character, when the
     * term reaches {@code maxTokenLen}, when a non-SLP1 character follows at
     * least one buffered character, or when the input is exhausted.
     *
     * @return true if a token was produced, false once the input is exhausted
     * @throws IOException if reading from the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        int length = 0;
        int start = -1; // set as soon as the first character of the token is buffered
        int end = -1;
        char[] buffer = termAtt.buffer();
        while (true) {
            if (bufferIndex >= dataLen) {
                offset += dataLen;
                CharacterUtils.fill(ioBuffer, input); // supplementary-character aware read
                if (ioBuffer.getLength() == 0) {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (length > 0) {
                        break;
                    } else {
                        finalOffset = correctOffset(offset);
                        return false;
                    }
                }
                dataLen = ioBuffer.getLength();
                bufferIndex = 0;
            }
            final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
            final int charCount = Character.charCount(c);
            bufferIndex += charCount;

            if (isSLP(c)) { // if it's a token char
                if (length == 0) { // start of token
                    assert start == -1;
                    start = offset + bufferIndex - charCount;
                    end = start;
                } else if (length >= buffer.length - 1) { // check if a supplementary could run out of bounds
                    buffer = termAtt.resizeBuffer(2 + length); // make sure a supplementary fits in the buffer
                }
                end += charCount;
                length += Character.toChars(c, buffer, length); // buffer it

                // Syllabation logic. The boundary is only honoured when something
                // precedes c in this token: right after a maxTokenLen cut,
                // previousChar belongs to the previous token and rolling c back
                // would emit an empty token.
                if (length > charCount && endsSyllableBefore(c, bufferIndex - charCount)) {
                    // previousChar is the end of the current syllable
                    // setting the cursor one step back and ending this token/syllable
                    bufferIndex -= charCount;
                    length -= charCount;
                    end -= charCount;
                    previousChar = c;
                    break;
                }

                if (length >= maxTokenLen) { // >= rather than == because a surrogate pair could jump over the limit
                    previousChar = c;
                    break;
                }
            } else if (length > 0) { // non-SLP character after buffered characters
                previousChar = c;
                break; // return 'em
            }
            previousChar = c;
        }
        termAtt.setLength(length);
        assert start != -1;
        finalOffset = correctOffset(end);
        offsetAtt.setOffset(correctOffset(start), finalOffset);
        return true;
    }

    /**
     * Publishes the final offset computed by the last {@link #incrementToken()} call.
     *
     * @throws IOException declared by the overridden method
     */
    @Override
    public void end() throws IOException {
        super.end();
        // set final offset
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /**
     * Clears all per-stream state so the instance can tokenize a new input.
     *
     * @throws IOException declared by the overridden method
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        bufferIndex = 0;
        offset = 0;
        dataLen = 0;
        finalOffset = 0;
        previousChar = -1;
        ioBuffer.reset(); // drop whatever is left from the previous input
    }

    /**
     * Decides whether the current syllable ends between {@code previousChar} and {@code c}.
     *
     * A boundary requires that the characters from {@code cIdx} on are either a
     * consonant cluster followed by a vowel or no cluster at all, and that the
     * pair (previousChar, c) is one of the syllable-ending combinations.
     *
     * @param c the code point that was just buffered
     * @param cIdx index of c inside ioBuffer
     * @return true if the token must be closed before c
     */
    private boolean endsSyllableBefore(int c, int cIdx) {
        final int cluster = afterConsonantCluster(ioBuffer, cIdx);
        if (cluster != CLUSTER_N_VOWEL && cluster != NOT_A_CLUSTER) {
            // trailing consonants (before punctuation or end of buffer) stay with the current syllable
            return false;
        }
        switch (syllEndingCombinations(previousChar, c)) {
            case VOWEL_N_CONSONANT:
            case SPECIALPHONEME_N_CONSONANT:
            case MODIFIER_N_CONSONANT:
            case SLP_N_NONSLP:
                return true;
            default:
                return false;
        }
    }

    /** Returns true if {@code c} is registered in {@code charType} with the given type. */
    private static boolean hasType(int c, int type) {
        final Integer found = charType.get(c);
        return found != null && found.intValue() == type;
    }

    /**
     * Filters only legal SLP1 characters.
     *
     * @param c the code point to test
     * @return true if c is a SLP character, else false
     */
    public static boolean isSLP(int c) {
        return charType.containsKey(c);
    }

    /**
     * Finds all combinations that correspond to a syllable ending
     *
     * | char1\char2    | nonSLP | MODIFIER | CONSONANT | SPECIALPHONEME | VOWEL |
     * |----------------|--------|----------|-----------|----------------|-------|
     * | nonSLP         |        |          |           |                |       |
     * | MODIFIER       |   X    |          |     X     |                |       |
     * | CONSONANT      |   X    |          |           |                |       |
     * | SPECIALPHONEME |   X    |          |     X     |                |       |
     * | VOWEL          |   X    |          |     X     |                |       |
     * |--------------------------------------------------------------------------
     *
     * @param char1 corresponds to previousChar
     * @param char2 corresponds to c
     * @return {@link #SLP_N_NONSLP}, {@link #MODIFIER_N_CONSONANT},
     *         {@link #SPECIALPHONEME_N_CONSONANT} or {@link #VOWEL_N_CONSONANT}
     *         if a syllable ends between char1 and char2, else {@link #NOT_SYLL_END}
     */
    public int syllEndingCombinations(int char1, int char2) {
        final Integer type1 = charType.get(char1);
        final Integer type2 = charType.get(char2);
        if (type1 != null && type2 == null) {
            return SLP_N_NONSLP;
        }
        if (type1 == null || type2 == null || type2.intValue() != CONSONANT) {
            return NOT_SYLL_END;
        }
        switch (type1.intValue()) {
            case MODIFIER:
                return MODIFIER_N_CONSONANT;
            case SPECIALPHONEME:
                return SPECIALPHONEME_N_CONSONANT;
            case VOWEL:
                return VOWEL_N_CONSONANT;
            default:
                return NOT_SYLL_END;
        }
    }

    /**
     * Checks whether the next consonants constitute a trailing cluster of consonants or not.
     *
     * Scans forward from {@code currentIdx} and, for each consonant found,
     * looks at the character right after it: end of the buffered data, a vowel
     * or a legal punctuation. Only the data currently held by
     * {@code inputBuffer} is inspected.
     *
     * @param inputBuffer the buffer being tokenized
     * @param currentIdx index at which the scan starts
     * @return {@link #CLUSTER_N_END}, {@link #CLUSTER_N_VOWEL} or
     *         {@link #CLUSTER_N_PUNCT} for the first consonant that is followed by
     *         one of those, else {@link #NOT_A_CLUSTER}
     */
    private int afterConsonantCluster(CharacterBuffer inputBuffer, int currentIdx) {
        final char[] chars = inputBuffer.getBuffer();
        final int limit = inputBuffer.getLength();
        for (int idx = currentIdx; idx < limit; idx++) {
            if (!hasType(chars[idx], CONSONANT)) {
                continue;
            }
            if (idx + 1 == limit) {
                return CLUSTER_N_END;
            }
            final int next = chars[idx + 1];
            if (hasType(next, VOWEL)) {
                return CLUSTER_N_VOWEL;
            }
            if (skrtPunct.containsKey(next)) {
                return CLUSTER_N_PUNCT;
            }
        }
        return NOT_A_CLUSTER;
    }
}