org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder Maven / Gradle / Ivy
Show all versions of lucene-highlighter Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
/**
* Base FragmentsBuilder implementation that supports colored pre/post tags and multivalued fields.
*
* Uses {@link BoundaryScanner} to determine fragments.
*/
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
/**
 * Default tags written before / after each highlighted term. The overloads that take no explicit
 * tag arrays pass these on; an entry is picked by index modulo the array length.
 */
protected String[] preTags, postTags;
/**
 * Twenty-entry set of pre tags intended for per-query-term coloring.
 *
 * <p>NOTE(review): every entry here is an empty string, which would make highlighting invisible.
 * This looks like markup lost during text extraction; confirm the intended values against the
 * upstream source before relying on this constant.
 */
public static final String[] COLORED_PRE_TAGS = {
"", "",
"",
"", "",
"",
"", "",
"",
"", "",
"",
"", "",
"",
"", "",
"",
"", ""
};
/**
 * Post tag paired with {@link #COLORED_PRE_TAGS}.
 *
 * <p>NOTE(review): also an empty string; presumably a stripped closing tag. Confirm.
 */
public static final String[] COLORED_POST_TAGS = {""};
/** Character appended after every field value when the values are concatenated into one buffer. */
private char multiValuedSeparator = ' ';
/** Scanner used to move the raw fragment start/end offsets to nearby boundaries. */
private final BoundaryScanner boundaryScanner;
/**
 * When true and the document has more than one value for the field, fragment infos are split per
 * value before fragments are built.
 */
private boolean discreteMultiValueHighlighting = false;
/**
 * Creates a builder with the default single pre/post tag pair and a {@link
 * SimpleBoundaryScanner}.
 */
protected BaseFragmentsBuilder() {
  this(new String[] {""}, new String[] {""}, new SimpleBoundaryScanner());
}

/**
 * Creates a builder with the given tags and a {@link SimpleBoundaryScanner}.
 *
 * @param preTags tags written before a highlighted term
 * @param postTags tags written after a highlighted term
 */
protected BaseFragmentsBuilder(String[] preTags, String[] postTags) {
  this(preTags, postTags, new SimpleBoundaryScanner());
}

/**
 * Creates a builder with the default single pre/post tag pair and the given scanner.
 *
 * @param boundaryScanner scanner used to adjust fragment start and end offsets
 */
protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner) {
  this(new String[] {""}, new String[] {""}, boundaryScanner);
}

/**
 * Creates a builder with the given tags and scanner. The arguments are stored as passed; no
 * copies are made and no validation is performed.
 *
 * @param preTags tags written before a highlighted term
 * @param postTags tags written after a highlighted term
 * @param boundaryScanner scanner used to adjust fragment start and end offsets
 */
protected BaseFragmentsBuilder(
    String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
  this.boundaryScanner = boundaryScanner;
  this.preTags = preTags;
  this.postTags = postTags;
}
/**
 * Validates that a tags argument is either a single {@code String} or a {@code String[]}.
 *
 * @param tags the value to check
 * @return {@code tags} itself when it has an accepted type
 * @throws IllegalArgumentException if {@code tags} is neither a String nor a String[] (this
 *     includes {@code null})
 */
static Object checkTagsArgument(Object tags) {
  final boolean accepted = tags instanceof String || tags instanceof String[];
  if (!accepted) {
    throw new IllegalArgumentException("type of preTags/postTags must be a String or String[]");
  }
  return tags;
}
/**
 * Returns the fragment infos that fragments should be built from. The fragment-building code
 * passes in the (possibly per-value split) infos and renders the leading entries of the returned
 * list in list order, so implementations decide ordering here.
 *
 * @param src the candidate fragment infos
 * @return the fragment infos in the order they should be rendered
 */
public abstract List<WeightedFragInfo> getWeightedFragInfoList(List<WeightedFragInfo> src);
// Encoder handed to the full createFragment(s) overloads when the caller supplies none.
// NOTE(review): presumably DefaultEncoder leaves text unchanged (hence "NULL") - confirm.
private static final Encoder NULL_ENCODER = new DefaultEncoder();
/**
 * Builds a single fragment using this builder's own pre/post tags and the default encoder.
 *
 * @return the first fragment, or {@code null} if none could be built
 * @throws IOException if reading the stored field fails
 */
@Override
public String createFragment(
    IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList)
    throws IOException {
  final String[] pre = this.preTags;
  final String[] post = this.postTags;
  return createFragment(reader, docId, fieldName, fieldFragList, pre, post, NULL_ENCODER);
}
/**
 * Builds up to {@code maxNumFragments} fragments using this builder's own pre/post tags and the
 * default encoder.
 *
 * @return the fragments, or {@code null} if the document has no stored value for the field
 * @throws IOException if reading the stored field fails
 */
@Override
public String[] createFragments(
    IndexReader reader,
    int docId,
    String fieldName,
    FieldFragList fieldFragList,
    int maxNumFragments)
    throws IOException {
  final String[] pre = this.preTags;
  final String[] post = this.postTags;
  return createFragments(
      reader, docId, fieldName, fieldFragList, maxNumFragments, pre, post, NULL_ENCODER);
}
/**
 * Builds a single fragment with the supplied tags and encoder by requesting exactly one fragment
 * from the multi-fragment overload.
 *
 * @return the first fragment, or {@code null} when no fragment was produced
 * @throws IOException if reading the stored field fails
 */
@Override
public String createFragment(
    IndexReader reader,
    int docId,
    String fieldName,
    FieldFragList fieldFragList,
    String[] preTags,
    String[] postTags,
    Encoder encoder)
    throws IOException {
  final String[] single =
      createFragments(reader, docId, fieldName, fieldFragList, 1, preTags, postTags, encoder);
  return (single != null && single.length > 0) ? single[0] : null;
}
/**
 * Builds highlighted fragments for one field of one document.
 *
 * <p>The stored values of the field are loaded, the fragment infos are optionally split per value
 * (discrete multi-value highlighting), ordered by {@link #getWeightedFragInfoList(List)}, and the
 * leading {@code maxNumFragments} of them are rendered in that order.
 *
 * @param maxNumFragments upper bound on the number of fragments; zero is accepted and yields an
 *     empty array when the field has stored values
 * @param preTags tags written before highlighted terms
 * @param postTags tags written after highlighted terms
 * @param encoder encoder applied to the text parts of each fragment
 * @return the rendered fragments, or {@code null} if the document has no stored value for the
 *     field
 * @throws IllegalArgumentException if {@code maxNumFragments} is negative
 * @throws IOException if reading the stored field fails
 */
@Override
public String[] createFragments(
    IndexReader reader,
    int docId,
    String fieldName,
    FieldFragList fieldFragList,
    int maxNumFragments,
    String[] preTags,
    String[] postTags,
    Encoder encoder)
    throws IOException {
  if (maxNumFragments < 0) {
    throw new IllegalArgumentException(
        "maxNumFragments(" + maxNumFragments + ") must be positive number.");
  }

  // Typed lists: with raw List the element reads and the toArray result below do not type-check.
  List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
  Field[] values = getFields(reader, docId, fieldName);
  if (values.length == 0) {
    return null;
  }

  if (discreteMultiValueHighlighting && values.length > 1) {
    fragInfos = discreteMultiValueHighlighting(fragInfos, values);
  }
  fragInfos = getWeightedFragInfoList(fragInfos);

  int limitFragments = Math.min(maxNumFragments, fragInfos.size());
  List<String> fragments = new ArrayList<>(limitFragments);

  // The buffer and the value cursor are shared across fragments so that field values are
  // concatenated only once, as far as the furthest fragment requires.
  StringBuilder buffer = new StringBuilder();
  int[] nextValueIndex = {0};
  for (int n = 0; n < limitFragments; n++) {
    WeightedFragInfo fragInfo = fragInfos.get(n);
    fragments.add(
        makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
  }
  return fragments.toArray(new String[0]);
}
/**
 * Loads the stored string values of {@code fieldName} for one document.
 *
 * <p>Only the visitor's string callback is overridden, so only values delivered through it are
 * collected. Each value is wrapped in a new {@link Field} whose type is a copy of {@link
 * TextField#TYPE_STORED} with the term-vector flag taken from the field's {@link FieldInfo}.
 *
 * @param reader reader to load the document from
 * @param docId id of the document
 * @param fieldName name of the field whose values are wanted
 * @return the collected values in the order the visitor received them; empty if there are none
 * @throws IOException if reading the stored fields fails
 */
protected Field[] getFields(IndexReader reader, int docId, final String fieldName)
    throws IOException {
  // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
  // Typed list: a raw List would make toArray(...) return Object[] instead of Field[].
  final List<Field> fields = new ArrayList<>();
  reader
      .storedFields()
      .document(
          docId,
          new StoredFieldVisitor() {
            @Override
            public void stringField(FieldInfo fieldInfo, String value) {
              Objects.requireNonNull(value, "String value should not be null");
              FieldType ft = new FieldType(TextField.TYPE_STORED);
              ft.setStoreTermVectors(fieldInfo.hasTermVectors());
              fields.add(new Field(fieldInfo.name, value, ft));
            }

            @Override
            public Status needsField(FieldInfo fieldInfo) {
              // Visit only the requested field; every other stored field is skipped.
              return fieldInfo.name.equals(fieldName) ? Status.YES : Status.NO;
            }
          });
  return fields.toArray(new Field[0]);
}
/**
 * Renders one fragment: the fragment's source text with every term offset wrapped in the pre/post
 * tag selected by the owning sub-info's sequence number. Text outside and inside the tags is
 * passed through {@code encoder}; the tags themselves are not.
 *
 * @param buffer shared buffer holding the concatenated field values read so far
 * @param index single-element cursor into {@code values}; advanced as values are consumed
 * @param values stored values of the field
 * @param fragInfo the fragment to render
 * @param preTags tags written before highlighted terms
 * @param postTags tags written after highlighted terms
 * @param encoder encoder applied to the text parts
 * @return the rendered fragment
 */
protected String makeFragment(
    StringBuilder buffer,
    int[] index,
    Field[] values,
    WeightedFragInfo fragInfo,
    String[] preTags,
    String[] postTags,
    Encoder encoder) {
  final int requestedStart = fragInfo.getStartOffset();
  // Filled in by getFragmentSourceMSO with the start offset actually used for the source text.
  final int[] modifiedStartOffset = {requestedStart};
  final String src =
      getFragmentSourceMSO(
          buffer, index, values, requestedStart, fragInfo.getEndOffset(), modifiedStartOffset);

  final StringBuilder out = new StringBuilder();
  int cursor = 0; // position in src up to which text has already been emitted
  for (SubInfo subInfo : fragInfo.getSubInfos()) {
    for (Toffs to : subInfo.termsOffsets()) {
      // Term offsets are field-relative; shift them so they index into src.
      final int matchStart = to.getStartOffset() - modifiedStartOffset[0];
      final int matchEnd = to.getEndOffset() - modifiedStartOffset[0];

      out.append(encoder.encodeText(src.substring(cursor, matchStart)));
      out.append(getPreTag(preTags, subInfo.seqnum()));
      out.append(encoder.encodeText(src.substring(matchStart, matchEnd)));
      out.append(getPostTag(postTags, subInfo.seqnum()));

      cursor = matchEnd;
    }
  }
  // Trailing text after the last highlighted term.
  out.append(encoder.encodeText(src.substring(cursor)));
  return out.toString();
}
/**
 * Returns the source text for a fragment and reports the start offset that was actually used.
 *
 * <p>Field values are appended to {@code buffer}, each followed by the multi-value separator,
 * until the buffer reaches {@code endOffset} or the values run out. The end offset is then
 * either the usable buffer length (if the buffer is shorter than requested) or whatever the
 * boundary scanner returns; the start offset always comes from the boundary scanner.
 *
 * @param buffer shared buffer holding the concatenated field values read so far
 * @param index single-element cursor into {@code values}; advanced as values are consumed
 * @param values stored values of the field
 * @param startOffset requested start offset
 * @param endOffset requested end offset
 * @param modifiedStartOffset single-element output receiving the start offset actually used
 * @return the text between the adjusted start and end offsets
 */
protected String getFragmentSourceMSO(
    StringBuilder buffer,
    int[] index,
    Field[] values,
    int startOffset,
    int endOffset,
    int[] modifiedStartOffset) {
  while (buffer.length() < endOffset && index[0] < values.length) {
    buffer.append(values[index[0]++].stringValue());
    buffer.append(getMultiValuedSeparator());
  }
  int bufferLength = buffer.length();
  // we added the multi value char to the last buffer, ignore it
  // Guard: if no value has been consumed yet there is no "last" value to inspect, and reading
  // values[-1] would throw ArrayIndexOutOfBoundsException.
  final int lastConsumed = index[0] - 1;
  if (lastConsumed >= 0 && values[lastConsumed].fieldType().tokenized()) {
    bufferLength--;
  }
  int eo =
      bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset(buffer, endOffset);
  modifiedStartOffset[0] = boundaryScanner.findStartOffset(buffer, startOffset);
  return buffer.substring(modifiedStartOffset[0], eo);
}
/**
 * Returns the raw source text between two offsets, without any boundary adjustment.
 *
 * <p>Field values are appended to {@code buffer}, each followed by the multi-value separator,
 * until the buffer reaches {@code endOffset} or the values run out.
 *
 * @param buffer shared buffer holding the concatenated field values read so far
 * @param index single-element cursor into {@code values}; advanced as values are consumed
 * @param values stored values of the field
 * @param startOffset start offset (inclusive)
 * @param endOffset end offset (exclusive); clipped to the buffer length
 * @return the text between the two offsets
 */
protected String getFragmentSource(
    StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset) {
  for (; buffer.length() < endOffset && index[0] < values.length; index[0]++) {
    buffer.append(values[index[0]].stringValue()).append(multiValuedSeparator);
  }
  final int clippedEnd = Math.min(buffer.length(), endOffset);
  return buffer.substring(startOffset, clippedEnd);
}
/**
 * Splits fragment infos so that no returned info crosses the boundary between two values of a
 * multi-valued field.
 *
 * <p>Values are laid out one after another, each non-empty value taking its length plus one
 * (for the separator) and each empty value taking one. A fragment info lying entirely inside one
 * value is kept as is. Otherwise, for each value it overlaps, a new info is created whose range
 * and term offsets are clamped to that value and whose boost is the sum of the boosts of the
 * sub-infos it received.
 *
 * <p>Term offsets and sub-infos are removed through iterators obtained from the incoming
 * fragment infos once they are fully consumed; if those accessors return live lists, the input
 * objects are modified by this call.
 *
 * @param fragInfos fragment infos computed over the concatenated field text
 * @param fields the stored values of the field, in order
 * @return the per-value fragment infos, sorted by ascending start offset
 */
protected List<WeightedFragInfo> discreteMultiValueHighlighting(
    List<WeightedFragInfo> fragInfos, Field[] fields) {
  Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<>();
  for (Field field : fields) {
    fieldNameToFragInfos.put(field.name(), new ArrayList<>());
  }

  fragInfos:
  for (WeightedFragInfo fragInfo : fragInfos) {
    int fieldStart;
    int fieldEnd = 0;
    for (Field field : fields) {
      if (field.stringValue().isEmpty()) {
        // An empty value still occupies one position (its separator).
        fieldEnd++;
        continue;
      }
      fieldStart = fieldEnd;
      fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.

      // Fast path: the whole fragment lies inside this value.
      if (fragInfo.getStartOffset() >= fieldStart
          && fragInfo.getEndOffset() >= fieldStart
          && fragInfo.getStartOffset() <= fieldEnd
          && fragInfo.getEndOffset() <= fieldEnd) {
        fieldNameToFragInfos.get(field.name()).add(fragInfo);
        continue fragInfos;
      }

      if (fragInfo.getSubInfos().isEmpty()) {
        continue fragInfos;
      }

      Toffs firstToffs = fragInfo.getSubInfos().get(0).termsOffsets().get(0);
      if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
        // Neither the fragment nor its first remaining term has reached this value's range.
        continue;
      }

      int fragStart = fieldStart;
      if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
        fragStart = fragInfo.getStartOffset();
      }

      int fragEnd = fieldEnd;
      if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
        fragEnd = fragInfo.getEndOffset();
      }

      List<SubInfo> subInfos = new ArrayList<>();
      Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
      // The boost of the new info will be the sum of the boosts of its SubInfos
      float boost = 0.0f;
      while (subInfoIterator.hasNext()) {
        SubInfo subInfo = subInfoIterator.next();
        List<Toffs> toffsList = new ArrayList<>();
        Iterator<Toffs> toffsIterator = subInfo.termsOffsets().iterator();
        while (toffsIterator.hasNext()) {
          Toffs toffs = toffsIterator.next();
          if (toffs.getStartOffset() >= fieldEnd) {
            // We've gone past this value so its not worth iterating any more.
            break;
          }
          boolean startsAfterField = toffs.getStartOffset() >= fieldStart;
          boolean endsBeforeField = toffs.getEndOffset() < fieldEnd;
          if (startsAfterField && endsBeforeField) {
            // The Toff is entirely within this value.
            toffsList.add(toffs);
            toffsIterator.remove();
          } else if (startsAfterField) {
            /*
             * The Toffs starts within this value but ends after this value
             * so we clamp the returned Toffs to this value and leave the
             * Toffs in the iterator for the next value of this field.
             */
            toffsList.add(new Toffs(toffs.getStartOffset(), fieldEnd - 1));
          } else if (endsBeforeField) {
            /*
             * The Toffs starts before this value but ends in this value
             * which means we're really continuing from where we left off
             * above. Since we use the remainder of the offset we can remove
             * it from the iterator.
             */
            toffsList.add(new Toffs(fieldStart, toffs.getEndOffset()));
            toffsIterator.remove();
          } else {
            /*
             * The Toffs spans the whole value so we clamp on both sides.
             * This is basically a combination of both arms of the loop
             * above.
             */
            toffsList.add(new Toffs(fieldStart, fieldEnd - 1));
          }
        }
        if (!toffsList.isEmpty()) {
          subInfos.add(new SubInfo(subInfo.text(), toffsList, subInfo.seqnum(), subInfo.boost()));
          boost += subInfo.boost();
        }
        if (subInfo.termsOffsets().isEmpty()) {
          subInfoIterator.remove();
        }
      }
      WeightedFragInfo weightedFragInfo =
          new WeightedFragInfo(fragStart, fragEnd, subInfos, boost);
      fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
    }
  }

  List<WeightedFragInfo> result = new ArrayList<>();
  for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
    result.addAll(weightedFragInfos);
  }
  // Integer.compare instead of subtraction: same ordering, no overflow risk.
  result.sort(
      (info1, info2) -> Integer.compare(info1.getStartOffset(), info2.getStartOffset()));

  return result;
}
/**
 * Sets the character appended after each field value when values are concatenated.
 *
 * @param separator the separator character
 */
public void setMultiValuedSeparator(char separator) {
  this.multiValuedSeparator = separator;
}

/**
 * Returns the character appended after each field value when values are concatenated.
 *
 * @return the separator character
 */
public char getMultiValuedSeparator() {
  return this.multiValuedSeparator;
}
/**
 * Tells whether fragment infos are split per value for multi-valued fields.
 *
 * @return {@code true} if discrete multi-value highlighting is enabled
 */
public boolean isDiscreteMultiValueHighlighting() {
  return this.discreteMultiValueHighlighting;
}

/**
 * Enables or disables splitting of fragment infos per value for multi-valued fields.
 *
 * @param discreteMultiValueHighlighting {@code true} to enable
 */
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
  this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
}
/**
 * Picks a pre tag from this builder's own pre tags.
 *
 * @param num number used to select the tag
 * @return the selected pre tag
 */
protected String getPreTag(int num) {
  return getPreTag(this.preTags, num);
}

/**
 * Picks a post tag from this builder's own post tags.
 *
 * @param num number used to select the tag
 * @return the selected post tag
 */
protected String getPostTag(int num) {
  return getPostTag(this.postTags, num);
}
/**
 * Picks a pre tag by cycling through {@code preTags}: the entry at {@code num} modulo the array
 * length.
 *
 * @param preTags candidate pre tags
 * @param num number used to select the tag
 * @return the selected pre tag
 */
protected String getPreTag(String[] preTags, int num) {
  return preTags[num % preTags.length];
}

/**
 * Picks a post tag by cycling through {@code postTags}: the entry at {@code num} modulo the array
 * length.
 *
 * @param postTags candidate post tags
 * @param num number used to select the tag
 * @return the selected post tag
 */
protected String getPostTag(String[] postTags, int num) {
  return postTags[num % postTags.length];
}
}