All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.ngram.NGramTokenizer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;


import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * 

On the contrary to {@link NGramTokenFilter}, this class sets offsets so * that characters between startOffset and endOffset in the original stream are * the same as the term chars. *

For example, "abcde" would be tokenized as (minGram=2, maxGram=3): *

* * * * *
Termababcbcbcdcdcdede
Position increment1111111
Position length1111111
Offsets[0,2[[0,3[[1,3[[1,4[[2,4[[2,5[[3,5[
* *

This tokenizer changed a lot in Lucene 4.4 in order to:

    *
  • tokenize in a streaming fashion to support streams which are larger * than 1024 chars (limit of the previous version), *
  • count grams based on unicode code points instead of java chars (and * never split in the middle of surrogate pairs), *
  • give the ability to {@link #isTokenChar(int) pre-tokenize} the stream * before computing n-grams.
*

Additionally, this class doesn't trim trailing whitespaces and emits * tokens in a different order, tokens are now emitted by increasing start * offsets while they used to be emitted by increasing lengths (which prevented * from supporting large input streams). */ // non-final to allow for overriding isTokenChar, but all other methods should be final public class NGramTokenizer extends Tokenizer { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; private CharacterUtils.CharacterBuffer charBuffer; private int[] buffer; // like charBuffer, but converted to code points private int bufferStart, bufferEnd; // remaining slice in buffer private int offset; private int gramSize; private int minGram, maxGram; private boolean exhausted; private int lastCheckedChar; // last offset in the buffer that we checked private int lastNonTokenChar; // last offset that we found to not be a token char private boolean edgesOnly; // leading edges n-grams only private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) { init(minGram, maxGram, edgesOnly); } /** * Creates NGramTokenizer with given min and max n-grams. * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ public NGramTokenizer(int minGram, int maxGram) { this(minGram, maxGram, false); } NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) { super(factory); init(minGram, maxGram, edgesOnly); } /** * Creates NGramTokenizer with given min and max n-grams. * @param factory {@link org.apache.lucene.util.AttributeFactory} to use * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) { this(factory, minGram, maxGram, false); } /** * Creates NGramTokenizer with default min and max n-grams. */ public NGramTokenizer() { this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } private void init(int minGram, int maxGram, boolean edgesOnly) { if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new IllegalArgumentException("minGram must not be greater than maxGram"); } this.minGram = minGram; this.maxGram = maxGram; this.edgesOnly = edgesOnly; charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader buffer = new int[charBuffer.getBuffer().length]; // Make the term att large enough termAtt.resizeBuffer(2 * maxGram); } @Override public final boolean incrementToken() throws IOException { clearAttributes(); // termination of this loop is guaranteed by the fact that every iteration // either advances the buffer (calls consumes()) or increases gramSize while (true) { // compact if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) { System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart); bufferEnd -= bufferStart; lastCheckedChar -= bufferStart; lastNonTokenChar -= bufferStart; bufferStart = 0; // fill in remaining space exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd); // convert to code points bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd); } // should we go to the next offset? if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) { if (bufferStart + 1 + minGram > bufferEnd) { assert exhausted; return false; } consume(); gramSize = minGram; } updateLastNonTokenChar(); // retry if the token to be emitted was going to not only contain token chars final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize); final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1; if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) { consume(); gramSize = minGram; continue; } final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0); termAtt.setLength(length); posIncAtt.setPositionIncrement(1); posLenAtt.setPositionLength(1); offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length)); ++gramSize; return true; } } private void updateLastNonTokenChar() { final int termEnd = bufferStart + gramSize - 1; if (termEnd > lastCheckedChar) { for (int i = termEnd; i > lastCheckedChar; --i) { if (!isTokenChar(buffer[i])) { lastNonTokenChar = i; break; } } lastCheckedChar = termEnd; } } /** Consume one code point. */ private void consume() { offset += Character.charCount(buffer[bufferStart++]); } /** Only collect characters which satisfy this condition. */ protected boolean isTokenChar(int chr) { return true; } @Override public final void end() throws IOException { super.end(); assert bufferStart <= bufferEnd; int endOffset = offset; for (int i = bufferStart; i < bufferEnd; ++i) { endOffset += Character.charCount(buffer[i]); } endOffset = correctOffset(endOffset); // set final offset offsetAtt.setOffset(endOffset, endOffset); } @Override public final void reset() throws IOException { super.reset(); bufferStart = bufferEnd = buffer.length; lastNonTokenChar = lastCheckedChar = bufferStart - 1; offset = 0; gramSize = minGram; exhausted = false; charBuffer.reset(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy