// org.apache.lucene.analysis.cn.ChineseTokenizer
package org.apache.lucene.analysis.cn;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Tokenize Chinese text as individual chinese characters.
*
*
* The difference between ChineseTokenizer and
* CJKTokenizer is that they have different
* token parsing logic.
*
*
* For example, if the Chinese text
* "C1C2C3C4" is to be indexed:
*
* - The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
*
* - The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
*
*
*
* Therefore the index created by CJKTokenizer is much larger.
*
*
* The problem is that when searching for C1, C1C2, C1C3,
* C4C2, C1C2C3 ... the ChineseTokenizer works, but the
* CJKTokenizer will not work.
*
* @version 1.0
*
*/
public final class ChineseTokenizer extends Tokenizer {

    /** Creates a tokenizer that reads characters from {@code in}. */
    public ChineseTokenizer(Reader in) {
        super(in);
        init();
    }

    /** Creates a tokenizer that shares the attributes of {@code source}. */
    public ChineseTokenizer(AttributeSource source, Reader in) {
        super(source, in);
        init();
    }

    /** Creates a tokenizer that uses {@code factory} to create its attribute instances. */
    public ChineseTokenizer(AttributeFactory factory, Reader in) {
        super(factory, in);
        init();
    }

    /** Registers the term and offset attributes this tokenizer populates. */
    private void init() {
        termAtt = addAttribute(TermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
    }

    // Position in the overall character stream, read position in the current
    // I/O buffer, and number of chars last read (-1 once EOF has been seen).
    private int offset = 0, bufferIndex = 0, dataLen = 0;

    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;

    // Accumulates the characters of the token currently being built.
    private final char[] buffer = new char[MAX_WORD_LEN];
    // Raw read buffer for the underlying Reader.
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length; // number of chars buffered for the current token
    private int start;  // stream offset at which the current token began

    private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;

    /** Appends {@code c}, lower-cased, to the current token buffer. */
    private final void push(char c) {
        if (length == 0) start = offset - 1;         // record start of token
        buffer[length++] = Character.toLowerCase(c); // buffer it
    }

    /**
     * Emits the buffered token through the term/offset attributes.
     *
     * @return true if a token was emitted, false if the buffer was empty
     */
    private final boolean flush() {
        if (length > 0) {
            termAtt.setTermBuffer(buffer, 0, length);
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
            return true;
        } else {
            return false;
        }
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        length = 0;
        start = offset;

        while (true) {
            final char c;
            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
                // FIX: offset was pre-incremented for a character that does not
                // exist; undo it so the final offset is not one past the end of
                // the stream (same fix Lucene applied in LUCENE-2207).
                offset--;
                return flush();
            } else {
                c = ioBuffer[bufferIndex++];
            }

            switch (Character.getType(c)) {

                case Character.DECIMAL_DIGIT_NUMBER:
                case Character.LOWERCASE_LETTER:
                case Character.UPPERCASE_LETTER:
                    // ASCII-style runs (digits/letters) accumulate into one token.
                    push(c);
                    if (length == MAX_WORD_LEN) return flush();
                    break;

                case Character.OTHER_LETTER:
                    if (length > 0) {
                        // A digit/letter token is pending: push this CJK char
                        // back so it is re-read next call, and emit the token.
                        bufferIndex--;
                        offset--;
                        return flush();
                    }
                    // Each CJK character is emitted as its own token.
                    push(c);
                    return flush();

                default:
                    // Any other character terminates a pending token.
                    if (length > 0) return flush();
                    break;
            }
        }
    }

    @Override
    public final void end() {
        // FIX: the final offset must also go through correctOffset(), otherwise
        // it disagrees with the corrected per-token offsets whenever a
        // CharFilter is in the analysis chain.
        final int finalOffset = correctOffset(offset);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = bufferIndex = dataLen = 0;
    }

    @Override
    public void reset(Reader input) throws IOException {
        super.reset(input);
        reset();
    }
}