org.apache.lucene.search.uhighlight.AnalysisOffsetStrategy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-highlighter Show documentation
Show all versions of lucene-highlighter Show documentation
This is the highlighter for apache lucene java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Provides a base class for analysis based offset strategies to extend from.
* Requires an Analyzer and provides an override-able method for altering how
* the TokenStream is created.
*
* @lucene.internal
*/
public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
protected final Analyzer analyzer;
public AnalysisOffsetStrategy(UHComponents components, Analyzer analyzer) {
super(components);
this.analyzer = analyzer;
if (analyzer.getOffsetGap(getField()) != 1) { // note: 1 is the default. It is RARELY changed.
throw new IllegalArgumentException(
"offset gap of the provided analyzer should be 1 (field " + getField() + ")");
}
}
@Override
public final UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.ANALYSIS;
}
protected TokenStream tokenStream(String content) throws IOException {
// If there is no splitChar in content then we needn't wrap:
int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
if (splitCharIdx == -1) {
return analyzer.tokenStream(getField(), content);
}
TokenStream subTokenStream = analyzer.tokenStream(getField(), content.substring(0, splitCharIdx));
return new MultiValueTokenStream(subTokenStream, getField(), analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
}
/**
* Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
* exposes a TokenStream that matches what would get indexed considering the
* {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
* 1; an exception will be thrown if it isn't.
*
* It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
* more work. The underlying components see a Reader not a String -- and the String is easy to
* split up without redundant buffering.
*
* @lucene.internal
*/
// TODO we could make this go away. MemoryIndexOffsetStrategy could simply split and analyze each value into the
// MemoryIndex. TokenStreamOffsetStrategy's hack TokenStreamPostingsEnum could incorporate this logic,
// albeit with less code, less hack.
private static final class MultiValueTokenStream extends TokenFilter {
private final String fieldName;
private final Analyzer indexAnalyzer;
private final String content;
private final char splitChar;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private int startValIdx = 0;
private int endValIdx;
private int remainingPosInc = 0;
private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
String content, char splitChar, int splitCharIdx) {
super(subTokenStream); // subTokenStream is already initialized to operate on the first value
this.fieldName = fieldName;
this.indexAnalyzer = indexAnalyzer;
this.content = content;
this.splitChar = splitChar;
this.endValIdx = splitCharIdx;
}
@Override
public void reset() throws IOException {
if (startValIdx != 0) {
throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
// ... although we could if a need for it arises.
}
super.reset();
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (input.incrementToken()) {
// Position tracking:
if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
remainingPosInc = 0;//reset
}
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset()
);
return true;
}
if (endValIdx == content.length()) {//no more
return false;
}
input.end(); // might adjust position increment
remainingPosInc += posIncAtt.getPositionIncrement();
input.close();
remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
// Get new tokenStream based on next segment divided by the splitChar
startValIdx = endValIdx + 1;
endValIdx = content.indexOf(splitChar, startValIdx);
if (endValIdx == -1) {//EOF
endValIdx = content.length();
}
TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
// This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
// very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
// since we used it as our input in the constructor.
// Were this not the case, we'd have to copy every attribute of interest since we can't alter the
// AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
// If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
// us to easily set the char[] reference without literally copying char by char.
throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
indexAnalyzer.getReuseStrategy());
}
tokenStream.reset();
} // while loop to increment token of this new value
}
@Override
public void end() throws IOException {
super.end();
// Offset tracking:
offsetAtt.setOffset(
startValIdx + offsetAtt.startOffset(),
startValIdx + offsetAtt.endOffset());
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy