// org.apache.lucene.analysis.th.ThaiTokenizer (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.th;
import java.text.BreakIterator;
import java.util.Locale;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArrayIterator;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
/**
 * Tokenizer that uses {@link BreakIterator} to tokenize Thai text.
 *
 * <p>The input is first split into sentences with a root-locale sentence
 * {@link BreakIterator}; each sentence is then split into words with a Thai
 * word {@link BreakIterator}. Segments whose first code point is neither a
 * letter nor a digit are skipped and never emitted as tokens.
 *
 * <p>WARNING: this tokenizer may not be supported by all JREs.
 * It is known to work with Sun/Oracle and Harmony JREs.
 * If your application needs to be fully portable, consider using ICUTokenizer instead,
 * which uses an ICU Thai BreakIterator that will always be available.
 */
public class ThaiTokenizer extends SegmentingTokenizerBase {
  /**
   * True if the JRE supports a working dictionary-based breakiterator for Thai.
   * If this is false, this tokenizer will not work at all!
   */
  public static final boolean DBBI_AVAILABLE;

  /**
   * Prototype Thai word breaker. It is never used for tokenizing directly;
   * every tokenizer instance works on its own clone.
   */
  private static final BreakIterator proto =
      BreakIterator.getWordInstance(Locale.forLanguageTag("th"));

  static {
    // check that we have a working dictionary-based break iterator for thai:
    // the probe text is two words with no separator between them, and index 4
    // is the position between them, so a boundary there is taken as proof
    // that real segmentation is available.
    proto.setText("ภาษาไทย");
    DBBI_AVAILABLE = proto.isBoundary(4);
  }

  /** used for breaking the text into sentences */
  private static final BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT);

  /** Per-instance clone of {@link #proto}, re-targeted at each new sentence. */
  private final BreakIterator wordBreaker;

  /** Character iterator over the current sentence, handed to {@link #wordBreaker}. */
  private final CharArrayIterator wrapper = CharArrayIterator.newWordInstance();

  /** Start index of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
  int sentenceStart;

  /** End index of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
  int sentenceEnd;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /**
   * Creates a new ThaiTokenizer
   *
   * @throws UnsupportedOperationException if {@link #DBBI_AVAILABLE} is false
   */
  public ThaiTokenizer() {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
  }

  /**
   * Creates a new ThaiTokenizer, supplying the AttributeFactory
   *
   * @param factory the attribute factory passed on to the base tokenizer
   * @throws UnsupportedOperationException if {@link #DBBI_AVAILABLE} is false
   */
  public ThaiTokenizer(AttributeFactory factory) {
    // The availability check cannot precede this call: super(...) must be the first statement.
    super(factory, (BreakIterator) sentenceProto.clone());
    if (!DBBI_AVAILABLE) {
      throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
    }
    wordBreaker = (BreakIterator) proto.clone();
  }

  /**
   * Records the bounds of the next sentence and points the word breaker at it.
   *
   * @param sentenceStart index in {@code buffer} where the sentence begins
   * @param sentenceEnd index in {@code buffer} where the sentence ends
   */
  @Override
  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
    this.sentenceStart = sentenceStart;
    this.sentenceEnd = sentenceEnd;
    // The wrapper exposes only this sentence, so every position the word
    // breaker reports is relative to sentenceStart.
    wrapper.setText(buffer, sentenceStart, sentenceEnd - sentenceStart);
    wordBreaker.setText(wrapper);
  }

  /**
   * Moves to the next word of the current sentence and fills in the term and
   * offset attributes for it.
   *
   * @return {@code true} if a token was produced, {@code false} once the word
   *     breaker has no further boundaries in this sentence
   */
  @Override
  protected boolean incrementWord() {
    int start = wordBreaker.current();
    if (start == BreakIterator.DONE) {
      return false; // BreakIterator exhausted
    }

    // find the next set of boundaries, skipping over non-tokens
    // (a segment counts as a token only if its first code point is a letter or digit)
    int end = wordBreaker.next();
    while (end != BreakIterator.DONE
        && !Character.isLetterOrDigit(
            Character.codePointAt(buffer, sentenceStart + start, sentenceEnd))) {
      start = end;
      end = wordBreaker.next();
    }

    if (end == BreakIterator.DONE) {
      return false; // BreakIterator exhausted
    }

    clearAttributes();
    // start/end are sentence-relative, so shift by sentenceStart to index into buffer
    termAtt.copyBuffer(buffer, sentenceStart + start, end - start);
    // NOTE(review): `offset` is not declared in this class; presumably it is the base
    // class's position of `buffer` within the overall input - confirm against
    // SegmentingTokenizerBase.
    offsetAtt.setOffset(
        correctOffset(offset + sentenceStart + start),
        correctOffset(offset + sentenceStart + end));
    return true;
  }
}