org.apache.lucene.search.highlight.TokenStreamFromTermVector Maven / Gradle / Ivy
Show all versions of lucene-highlighter Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
/**
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
* because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
* for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
*
* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
* in positions, this is fine. And it assumes there aren't large numbers of tokens at the same position, since it adds
* them to a linked-list per position in O(N^2) complexity. When there aren't positions in the term vector, it divides
* the startOffset by 8 to use as a temporary substitute. In that case, tokens with the same startOffset will occupy
* the same final position; otherwise tokens become adjacent.
*
* @lucene.internal
*/
public final class TokenStreamFromTermVector extends TokenStream {
private final Terms vector;
private final CharTermAttribute termAttribute;
private final PositionIncrementAttribute positionIncrementAttribute;
private final int maxStartOffset;
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
private CharsRefBuilder termCharsBuilder;//term data here
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
private boolean initialized = false;//lazy
/**
* Constructor. The uninversion doesn't happen here; it's delayed till the first call to
* {@link #incrementToken}.
*
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
* @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
*/
public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
assert vector.hasFreqs();
this.vector = vector;
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}
public Terms getTermVectorTerms() { return vector; }
@Override
public void reset() throws IOException {
incrementToken = null;
super.reset();
}
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
}
// We put term data here
termCharsBuilder = new CharsRefBuilder();
termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
TokenLL[] positionedTokens = initTokensArray();
int lastPosition = -1;
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
tempCharsRefBuilder.grow(termBytesRef.length);
final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
final int termCharsOff = termCharsBuilder.length();
termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
//sumFreq += freq;
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
token.termCharsOff = termCharsOff;
token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
if (token.startOffset > maxStartOffset) {
continue;//filter this token out; exceeds threshold
}
token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
}
if (payloadAttribute != null) {
final BytesRef payload = dpEnum.getPayload();
token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
}
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
//grow, but not 2x since we think our original length estimate is close
TokenLL[] newPositionedTokens = new TokenLL[(int)((pos + 1) * 1.5f)];
System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
positionedTokens = newPositionedTokens;
}
positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
lastPosition = Math.max(lastPosition, pos);
}
}
// System.out.println(String.format(
// "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
// sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
// (originalPositionEstimate/(lastPosition + 1.0f))));
// Step 2: Link all Tokens into a linked-list and set position increments as we go
int prevTokenPos = -1;
TokenLL prevToken = null;
for (int pos = 0; pos <= lastPosition; pos++) {
TokenLL token = positionedTokens[pos];
if (token == null) {
continue;
}
//link
if (prevToken != null) {
assert prevToken.next == null;
prevToken.next = token; //concatenate linked-list
} else {
assert firstToken == null;
firstToken = token;
}
//set increments
if (vector.hasPositions()) {
token.positionIncrement = pos - prevTokenPos;
while (token.next != null) {
token = token.next;
token.positionIncrement = 0;
}
} else {
token.positionIncrement = 1;
while (token.next != null) {
prevToken = token;
token = token.next;
if (prevToken.startOffset == token.startOffset) {
token.positionIncrement = 0;
} else {
token.positionIncrement = 1;
}
}
}
prevTokenPos = pos;
prevToken = token;
}
initialized = true;
}
private TokenLL[] initTokensArray() throws IOException {
// Estimate the number of position slots we need from term stats. We use some estimation factors taken from
// Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
assert sumTotalTermFreq != -1;
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
// This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
// Take the smaller of the two estimates, but no smaller than 64
return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}
@Override
public boolean incrementToken() throws IOException {
if (incrementToken == null) {
if (!initialized) {
init();
assert initialized;
}
incrementToken = firstToken;
if (incrementToken == null) {
return false;
}
} else if (incrementToken.next != null) {
incrementToken = incrementToken.next;
} else {
return false;
}
clearAttributes();
termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
payloadAttribute.setPayload(null);
} else {
payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
}
}
return true;
}
private static class TokenLL {
// This class should weigh 32 bytes, including object header
int termCharsOff; // see termCharsBuilder
short termCharsLen;
int positionIncrement;
int startOffset;
short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;
TokenLL next;
/** Given the head of a linked-list (possibly null) this inserts the token at the correct
* spot to maintain the desired order, and returns the head (which could be this token if it's the smallest).
* O(N^2) complexity but N should be a handful at most.
*/
TokenLL insertIntoSortedLinkedList(final TokenLL head) {
assert next == null;
if (head == null) {
return this;
} else if (this.compareOffsets(head) <= 0) {
this.next = head;
return this;
}
TokenLL prev = head;
while (prev.next != null && this.compareOffsets(prev.next) > 0) {
prev = prev.next;
}
this.next = prev.next;
prev.next = this;
return head;
}
/** by startOffset then endOffset */
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
return cmp;
}
}
}