org.apache.lucene.search.uhighlight.LengthGoalBreakIterator Maven / Gradle / Ivy
Show all versions of lucene-highlighter Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
* Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are
* too short. It's still possible to get a short passage but only at the very end of the input text.
*
* Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in
* a way compatible with the {@link UnifiedHighlighter}. Some assumptions are checked with Java
* assertions.
*
* @lucene.experimental
*/
public class LengthGoalBreakIterator extends BreakIterator {

  /** Underlying iterator that supplies the candidate break positions. */
  private final BreakIterator baseIter;

  /** Minimum passage length or target passage length, depending on {@code isMinimumLength}. */
  private final int lengthGoal;

  /** How much text to align before match-fragment, valid in range [0, 1]. */
  private final float fragmentAlignment;

  /** If false then the goal is a "closest to" length instead of a minimum length. */
  private final boolean isMinimumLength;

  /** The boundary this iterator last settled on; this is what {@link #current()} reports. */
  private int currentCache;

  /**
   * Breaks will be at least {@code minLength} apart (to the extent possible), while trying to
   * position the match inside the fragment according to {@code fragmentAlignment}.
   *
   * @param baseIter the iterator to wrap; its current position seeds {@link #current()}
   * @param minLength the minimum distance between breaks
   * @param fragmentAlignment share of the length goal placed before the match, in [0, 1]
   * @return the wrapping iterator
   * @throws IllegalArgumentException if {@code fragmentAlignment} is outside [0, 1] or not finite
   */
  public static LengthGoalBreakIterator createMinLength(
      BreakIterator baseIter, int minLength, float fragmentAlignment) {
    return new LengthGoalBreakIterator(
        baseIter, minLength, fragmentAlignment, true, baseIter.current());
  }

  /**
   * Breaks will be on average {@code targetLength} apart; the closest break to this target (before
   * or after) is chosen. The match will be positioned according to {@code fragmentAlignment} as
   * much as possible.
   *
   * @param baseIter the iterator to wrap; its current position seeds {@link #current()}
   * @param targetLength the desired distance between breaks
   * @param fragmentAlignment share of the length goal placed before the match, in [0, 1]
   * @return the wrapping iterator
   * @throws IllegalArgumentException if {@code fragmentAlignment} is outside [0, 1] or not finite
   */
  public static LengthGoalBreakIterator createClosestToLength(
      BreakIterator baseIter, int targetLength, float fragmentAlignment) {
    return new LengthGoalBreakIterator(
        baseIter, targetLength, fragmentAlignment, false, baseIter.current());
  }

  /**
   * Stores the configuration after validating {@code fragmentAlignment}.
   *
   * @throws IllegalArgumentException if {@code fragmentAlignment} is outside [0, 1] or not finite
   */
  private LengthGoalBreakIterator(
      BreakIterator baseIter,
      int lengthGoal,
      float fragmentAlignment,
      boolean isMinimumLength,
      int currentCache) {
    this.baseIter = baseIter;
    this.currentCache = currentCache;
    this.lengthGoal = lengthGoal;
    // The isFinite check is what rejects NaN: both range comparisons are false for NaN.
    if (fragmentAlignment < 0.f || fragmentAlignment > 1.f || !Float.isFinite(fragmentAlignment)) {
      throw new IllegalArgumentException("fragmentAlignment must be >= zero and <= one");
    }
    this.fragmentAlignment = fragmentAlignment;
    this.isMinimumLength = isMinimumLength;
  }

  // note: the only methods that will get called are setText(txt), getText(),
  // getSummaryPassagesNoHighlight: current(), first(), next()
  // highlightOffsetsEnums: preceding(int), and following(int)
  // Nonetheless we make some attempt to implement the rest; mostly delegating.

  @Override
  public String toString() {
    String goalDesc = isMinimumLength ? "minLen" : "targetLen";
    return getClass().getSimpleName()
        + "{"
        + goalDesc
        + "="
        + lengthGoal
        + ", fragAlign="
        + fragmentAlignment
        + ", baseIter="
        + baseIter
        + "}";
  }

  /** Returns a copy that wraps a clone of the base iterator and carries over the cached position. */
  @Override
  public Object clone() {
    return new LengthGoalBreakIterator(
        (BreakIterator) baseIter.clone(),
        lengthGoal,
        fragmentAlignment,
        isMinimumLength,
        currentCache);
  }

  @Override
  public CharacterIterator getText() {
    return baseIter.getText();
  }

  /** Hands the text to the base iterator and re-reads its current position. */
  @Override
  public void setText(String newText) {
    baseIter.setText(newText);
    currentCache = baseIter.current();
  }

  /** Hands the text to the base iterator and re-reads its current position. */
  @Override
  public void setText(CharacterIterator newText) {
    baseIter.setText(newText);
    currentCache = baseIter.current();
  }

  /** Returns the cached position; the base iterator is not consulted. */
  @Override
  public int current() {
    return currentCache;
  }

  @Override
  public int first() {
    return currentCache = baseIter.first();
  }

  @Override
  public int last() {
    return currentCache = baseIter.last();
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates. */
  @Override
  public int next(int n) {
    assert false : "Not supported";
    return baseIter.next(n); // probably wrong
  }

  // Called by getSummaryPassagesNoHighlight to generate default summary.
  /**
   * Advances to a break that is {@code lengthGoal} past the cached position, subject to the
   * min-length / closest-to policy.
   *
   * @return the new break, or {@link #DONE} if the cached position is already the last boundary
   */
  @Override
  public int next() {
    // Sum in long: currentCache + lengthGoal wraps negative for a very large lengthGoal, which
    // would slip past the end-of-text check and hand a negative offset to the base iterator.
    return following(currentCache, saturate((long) currentCache + lengthGoal));
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates. */
  @Override
  public int previous() {
    assert false : "Not supported";
    return baseIter.previous();
  }

  /**
   * Finds the break that ends a passage whose match ends at {@code matchEndIndex}. The target is
   * placed after the match by the share of {@code lengthGoal} that {@code fragmentAlignment}
   * leaves for the text following the match.
   *
   * @param matchEndIndex offset the chosen break must lie beyond
   * @return the chosen break, or {@link #DONE} (see {@link #following(int, int)})
   */
  @Override
  public int following(int matchEndIndex) {
    final int lengthAfterMatch = (int) (lengthGoal * (1.f - fragmentAlignment));
    // Summed in long and saturated so a huge lengthGoal cannot wrap the target negative.
    return following(matchEndIndex, saturate((long) matchEndIndex + 1 + lengthAfterMatch));
  }

  /**
   * Picks a break at or near {@code targetIdx} and caches it.
   *
   * @param matchEndIndex a break before the target is only eligible if it is beyond this offset
   * @param targetIdx the ideal break position
   * @return the chosen break; {@link #DONE} if the target is at/after the end of the text while
   *     the cached position is already the last boundary, or if the base iterator reports DONE
   */
  private int following(int matchEndIndex, int targetIdx) {
    if (targetIdx >= getText().getEndIndex()) {
      if (currentCache == baseIter.last()) {
        return DONE;
      }
      return currentCache = baseIter.last();
    }
    final int afterIdx = baseIter.following(targetIdx - 1);
    if (afterIdx == DONE) {
      currentCache = baseIter.last();
      return DONE;
    }
    if (afterIdx == targetIdx) { // right on the money
      return currentCache = afterIdx;
    }
    if (isMinimumLength) { // thus never undershoot
      return currentCache = afterIdx;
    }

    // note: it is a shame that we invoke preceding() *one more time*; BI's are sometimes expensive.

    // Find closest break to target
    final int beforeIdx = baseIter.preceding(targetIdx);
    if (targetIdx - beforeIdx < afterIdx - targetIdx && beforeIdx > matchEndIndex) {
      return currentCache = beforeIdx;
    }
    return currentCache = afterIdx;
  }

  // called at start of new Passage given first word start offset
  /**
   * Finds the break that starts a passage whose match begins at {@code matchStartIndex}. The
   * target is placed before the match by the {@code fragmentAlignment} share of {@code
   * lengthGoal}.
   *
   * @param matchStartIndex offset the chosen break must lie before
   * @return the chosen break; {@link #DONE} if the target is at/before offset zero while the
   *     cached position is already the first boundary, or if the base iterator reports DONE
   */
  @Override
  public int preceding(int matchStartIndex) {
    final int lengthBeforeMatch = (int) (lengthGoal * fragmentAlignment);
    // Same overflow-safe arithmetic as following(int).
    final int targetIdx = saturate((long) matchStartIndex - 1 - lengthBeforeMatch);
    if (targetIdx <= 0) {
      if (currentCache == baseIter.first()) {
        return DONE;
      }
      return currentCache = baseIter.first();
    }
    final int beforeIdx = baseIter.preceding(targetIdx + 1);
    if (beforeIdx == DONE) {
      currentCache = baseIter.first();
      return DONE;
    }
    if (beforeIdx == targetIdx) { // right on the money
      return currentCache = beforeIdx;
    }
    if (isMinimumLength) { // thus never undershoot
      return currentCache = beforeIdx;
    }

    // note: it is a shame that we invoke following() *one more time*; BI's are sometimes expensive.

    // Find closest break to target
    final int afterIdx = baseIter.following(targetIdx - 1);
    if (afterIdx - targetIdx < targetIdx - beforeIdx && afterIdx < matchStartIndex) {
      return currentCache = afterIdx;
    }
    return currentCache = beforeIdx;
  }

  /** Not supported: trips an assertion when assertions are enabled, otherwise delegates. */
  @Override
  public boolean isBoundary(int offset) {
    assert false : "Not supported";
    return baseIter.isBoundary(offset);
  }

  /** Narrows {@code value} to an {@code int}, saturating at the limits of the {@code int} range. */
  private static int saturate(long value) {
    return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, value));
  }
}