All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner Maven / Gradle / Ivy

There is a newer version: 2.18.0
Show newest version
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
 * A custom break iterator that is used to find break-delimited passages bounded by
 * a provided maximum length in the {@link UnifiedHighlighter} context.
 * This class uses a {@link BreakIterator} to find the last break after the provided offset
 * that would create a passage smaller than maxLen.
 * If the {@link BreakIterator} cannot find a passage smaller than the maximum length,
 * a secondary break iterator is used to re-split the passage at the first boundary after
 * maximum length.
 * 

* This is useful to split passages created by {@link BreakIterator}s like `sentence` that * can create big outliers on semi-structured text. *

* * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}. *

* TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not * depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses * only the first matching offset to derive the start and end of each passage. **/ public class BoundedBreakIteratorScanner extends BreakIterator { private final BreakIterator mainBreak; private final BreakIterator innerBreak; private final int maxLen; private int lastPrecedingOffset = -1; private int windowStart = -1; private int windowEnd = -1; private int innerStart = -1; private int innerEnd = 0; private BoundedBreakIteratorScanner(BreakIterator mainBreak, BreakIterator innerBreak, int maxLen) { this.mainBreak = mainBreak; this.innerBreak = innerBreak; this.maxLen = maxLen; } @Override public CharacterIterator getText() { return mainBreak.getText(); } @Override public void setText(CharacterIterator newText) { reset(); mainBreak.setText(newText); innerBreak.setText(newText); } @Override public void setText(String newText) { reset(); mainBreak.setText(newText); innerBreak.setText(newText); } private void reset() { lastPrecedingOffset = -1; windowStart = -1; windowEnd = -1; innerStart = -1; innerEnd = 0; } /** * Must be called with increasing offset. See {@link FieldHighlighter} for usage. */ @Override public int preceding(int offset) { if (offset < lastPrecedingOffset) { throw new IllegalArgumentException("offset < lastPrecedingOffset: " + "usage doesn't look like UnifiedHighlighter"); } if (offset > windowStart && offset < windowEnd) { innerStart = innerEnd; innerEnd = windowEnd; } else { windowStart = innerStart = mainBreak.preceding(offset); windowEnd = innerEnd = mainBreak.following(offset - 1); // expand to next break until we reach maxLen while (innerEnd - innerStart < maxLen) { int newEnd = mainBreak.following(innerEnd); if (newEnd == DONE || (newEnd - innerStart) > maxLen) { break; } windowEnd = innerEnd = newEnd; } } if (innerEnd - innerStart > maxLen) { // the current split is too big, // so starting from the current term we try to find boundaries on the left first if (offset - maxLen > innerStart) { innerStart = Math.max(innerStart, innerBreak.preceding(offset - maxLen)); } // and then we try to expand the passage to the right with the remaining size int remaining = Math.max(0, maxLen - (offset - innerStart)); if (offset + remaining < windowEnd) { innerEnd = Math.min(windowEnd, innerBreak.following(offset + remaining)); } } lastPrecedingOffset = offset - 1; return innerStart; } /** * Can be invoked only after a call to preceding(offset+1). * See {@link FieldHighlighter} for usage. */ @Override public int following(int offset) { if (offset != lastPrecedingOffset || innerEnd == -1) { throw new IllegalArgumentException("offset != lastPrecedingOffset: " + "usage doesn't look like UnifiedHighlighter"); } return innerEnd; } /** * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen. * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}. */ public static BreakIterator getSentence(Locale locale, int maxLen) { final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale); final BreakIterator wBreak = BreakIterator.getWordInstance(locale); return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); } @Override public int current() { // Returns the last offset of the current split return this.innerEnd; } @Override public int first() { throw new IllegalStateException("first() should not be called in this context"); } @Override public int next() { throw new IllegalStateException("next() should not be called in this context"); } @Override public int last() { throw new IllegalStateException("last() should not be called in this context"); } @Override public int next(int n) { throw new IllegalStateException("next(n) should not be called in this context"); } @Override public int previous() { throw new IllegalStateException("previous() should not be called in this context"); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy