All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.search.highlight.TokenStreamFromTermVector Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

/**
 * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
 * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
 * because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
 * for them and if not then won't get them.  This TokenStream supports an efficient {@link #reset()}, so there's
 * no need to wrap with a caching impl.
 * 

* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps * in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds * them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides * the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy * the same final position; otherwise tokens become adjacent. * * @lucene.internal */ public final class TokenStreamFromTermVector extends TokenStream { private final Terms vector; private final CharTermAttribute termAttribute; private final PositionIncrementAttribute positionIncrementAttribute; private final int maxStartOffset; private OffsetAttribute offsetAttribute;//maybe null private PayloadAttribute payloadAttribute;//maybe null private CharsRefBuilder termCharsBuilder;//term data here private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null private TokenLL firstToken = null; // the head of a linked-list private TokenLL incrementToken = null; private boolean initialized = false;//lazy /** * Constructor. The uninversion doesn't happen here; it's delayed till the first call to * {@link #incrementToken}. * * @param vector Terms that contains the data for * creating the TokenStream. Must have positions and/or offsets. * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit. */ public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException { this.maxStartOffset = maxStartOffset < 0 ? 
Integer.MAX_VALUE : maxStartOffset; assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*"; if (!vector.hasPositions() && !vector.hasOffsets()) { throw new IllegalArgumentException("The term vector needs positions and/or offsets."); } assert vector.hasFreqs(); this.vector = vector; termAttribute = addAttribute(CharTermAttribute.class); positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); } public Terms getTermVectorTerms() { return vector; } @Override public void reset() throws IOException { incrementToken = null; super.reset(); } //We delay initialization because we can see which attributes the consumer wants, particularly payloads private void init() throws IOException { assert !initialized; short dpEnumFlags = PostingsEnum.POSITIONS; if (vector.hasOffsets()) { dpEnumFlags |= PostingsEnum.OFFSETS; offsetAttribute = addAttribute(OffsetAttribute.class); } if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) { dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too payloadAttribute = getAttribute(PayloadAttribute.class); payloadsBytesRefArray = new BytesRefArray(Counter.newCounter()); spareBytesRefBuilder = new BytesRefBuilder(); } // We put term data here termCharsBuilder = new CharsRefBuilder(); termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position TokenLL[] positionedTokens = initTokensArray(); int lastPosition = -1; final TermsEnum termsEnum = vector.iterator(); BytesRef termBytesRef; PostingsEnum dpEnum = null; CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call //int sumFreq = 0; while ((termBytesRef = termsEnum.next()) != null) { //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj) // note: if term vectors supported seek by ord then we might just keep an 
int and seek by ord on-demand tempCharsRefBuilder.grow(termBytesRef.length); final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars()); final int termCharsOff = termCharsBuilder.length(); termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen); dpEnum = termsEnum.postings(dpEnum, dpEnumFlags); assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier dpEnum.nextDoc(); final int freq = dpEnum.freq(); //sumFreq += freq; for (int j = 0; j < freq; j++) { int pos = dpEnum.nextPosition(); TokenLL token = new TokenLL(); token.termCharsOff = termCharsOff; token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE); if (offsetAttribute != null) { token.startOffset = dpEnum.startOffset(); if (token.startOffset > maxStartOffset) { continue;//filter this token out; exceeds threshold } token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE); if (pos == -1) { pos = token.startOffset >> 3;//divide by 8 } } if (payloadAttribute != null) { final BytesRef payload = dpEnum.getPayload(); token.payloadIndex = payload == null ? 
-1 : payloadsBytesRefArray.append(payload); } //Add token to an array indexed by position if (positionedTokens.length <= pos) { //grow, but not 2x since we think our original length estimate is close TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)]; System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1); positionedTokens = newPositionedTokens; } positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]); lastPosition = Math.max(lastPosition, pos); } } // System.out.println(String.format( // "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f", // sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq, // (originalPositionEstimate/(lastPosition + 1.0f)))); // Step 2: Link all Tokens into a linked-list and set position increments as we go int prevTokenPos = -1; TokenLL prevToken = null; for (int pos = 0; pos <= lastPosition; pos++) { TokenLL token = positionedTokens[pos]; if (token == null) { continue; } //link if (prevToken != null) { assert prevToken.next == null; prevToken.next = token; //concatenate linked-list } else { assert firstToken == null; firstToken = token; } //set increments if (vector.hasPositions()) { token.positionIncrement = pos - prevTokenPos; while (token.next != null) { token = token.next; token.positionIncrement = 0; } } else { token.positionIncrement = 1; while (token.next != null) { prevToken = token; token = token.next; if (prevToken.startOffset == token.startOffset) { token.positionIncrement = 0; } else { token.positionIncrement = 1; } } } prevTokenPos = pos; prevToken = token; } initialized = true; } private TokenLL[] initTokensArray() throws IOException { // Estimate the number of position slots we need from term stats. We use some estimation factors taken from // Wikipedia that reduce the likelihood of needing to expand the array. 
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq(); assert sumTotalTermFreq != -1; final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this // This estimate is based on maxStartOffset. Err on the side of this being larger than needed. final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0); // Take the smaller of the two estimates, but no smaller than 64 return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))]; } @Override public boolean incrementToken() throws IOException { if (incrementToken == null) { if (!initialized) { init(); assert initialized; } incrementToken = firstToken; if (incrementToken == null) { return false; } } else if (incrementToken.next != null) { incrementToken = incrementToken.next; } else { return false; } clearAttributes(); termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen); positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement); if (offsetAttribute != null) { offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc); } if (payloadAttribute != null) { if (incrementToken.payloadIndex == -1) { payloadAttribute.setPayload(null); } else { payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex)); } } return true; } private static class TokenLL { // This class should weigh 32 bytes, including object header int termCharsOff; // see termCharsBuilder short termCharsLen; int positionIncrement; int startOffset; short endOffsetInc; // add to startOffset to get endOffset int payloadIndex; TokenLL next; /** Given the head of a linked-list (possibly null) this inserts the token at the correct * spot to maintain the desired order, and returns the head (which could be this token if it's the smallest). * O(N^2) complexity but N should be a handful at most. 
*/ TokenLL insertIntoSortedLinkedList(final TokenLL head) { assert next == null; if (head == null) { return this; } else if (this.compareOffsets(head) <= 0) { this.next = head; return this; } TokenLL prev = head; while (prev.next != null && this.compareOffsets(prev.next) > 0) { prev = prev.next; } this.next = prev.next; prev.next = this; return head; } /** by startOffset then endOffset */ int compareOffsets(TokenLL tokenB) { int cmp = Integer.compare(this.startOffset, tokenB.startOffset); if (cmp == 0) { cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc); } return cmp; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy