// Source listing: org.apache.lucene.search.vectorhighlight.AbstractFragmentsBuilder (Maven / Gradle / Ivy)
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
import java.io.IOException;
import java.util.*;
/**
* Abstract {@link FragmentsBuilder} implementation that detects whether highlight hits occurred on a field that is
* multivalued (Basically fields that have the same name) and splits the highlight snippets according to a single field
* boundary. This avoids that a highlight hit is shown as one hit whilst it is actually a hit on multiple fields.
*/
public abstract class AbstractFragmentsBuilder extends BaseFragmentsBuilder {
private boolean discreteMultiValueHighlighting = true;
protected AbstractFragmentsBuilder(){
super();
}
protected AbstractFragmentsBuilder(BoundaryScanner boundaryScanner){
super(boundaryScanner);
}
protected AbstractFragmentsBuilder( String[] preTags, String[] postTags ){
super(preTags, postTags);
}
public AbstractFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner bs) {
super( preTags, postTags, bs );
}
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
}
public String[] createFragments(IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
String[] preTags, String[] postTags, Encoder encoder) throws IOException {
if (maxNumFragments < 0) {
throw new IllegalArgumentException("maxNumFragments(" + maxNumFragments + ") must be positive number.");
}
List fragments = new ArrayList(maxNumFragments);
List fragInfos = fieldFragList.getFragInfos();
Field[] values = getFields(reader, docId, fieldName);
if (values.length == 0) {
return null;
}
if (discreteMultiValueHighlighting && values.length > 1) {
fragInfos = discreteMultiValueHighlighting(fragInfos, values);
}
fragInfos = getWeightedFragInfoList(fragInfos);
StringBuilder buffer = new StringBuilder();
int[] nextValueIndex = {0};
for (int n = 0; n < maxNumFragments && n < fragInfos.size(); n++) {
FieldFragList.WeightedFragInfo fragInfo = fragInfos.get(n);
fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
}
return fragments.toArray(new String[fragments.size()]);
}
protected List discreteMultiValueHighlighting(List fragInfos, Field[] fields) {
Map> fieldNameToFragInfos = new HashMap>();
for (Field field : fields) {
fieldNameToFragInfos.put(field.name(), new ArrayList());
}
fragInfos:
for (FieldFragList.WeightedFragInfo fragInfo : fragInfos) {
int fieldStart;
int fieldEnd = 0;
for (Field field : fields) {
if (field.stringValue().isEmpty()) {
fieldEnd++;
continue;
}
fieldStart = fieldEnd;
fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.
if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
fieldNameToFragInfos.get(field.name()).add(fragInfo);
continue fragInfos;
}
if (fragInfo.getSubInfos().isEmpty()) {
continue fragInfos;
}
FieldPhraseList.WeightedPhraseInfo.Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
continue;
}
int fragStart = fieldStart;
if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
fragStart = fragInfo.getStartOffset();
}
int fragEnd = fieldEnd;
if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
fragEnd = fragInfo.getEndOffset();
}
List subInfos = new ArrayList();
WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, fragInfo.getTotalBoost(), subInfos);
Iterator subInfoIterator = fragInfo.getSubInfos().iterator();
while (subInfoIterator.hasNext()) {
FieldFragList.WeightedFragInfo.SubInfo subInfo = subInfoIterator.next();
List toffsList = new ArrayList();
Iterator toffsIterator = subInfo.getTermsOffsets().iterator();
while (toffsIterator.hasNext()) {
FieldPhraseList.WeightedPhraseInfo.Toffs toffs = toffsIterator.next();
if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
toffsList.add(toffs);
toffsIterator.remove();
}
}
if (!toffsList.isEmpty()) {
subInfos.add(new FieldFragList.WeightedFragInfo.SubInfo(subInfo.text, toffsList, subInfo.getSeqnum()));
}
if (subInfo.getTermsOffsets().isEmpty()) {
subInfoIterator.remove();
}
}
fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
}
}
List result = new ArrayList();
for (List weightedFragInfos : fieldNameToFragInfos.values()) {
result.addAll(weightedFragInfos);
}
Collections.sort(result, new Comparator() {
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
return info1.getStartOffset() - info2.getStartOffset();
}
});
return result;
}
private static class WeightedFragInfo extends FieldFragList.WeightedFragInfo {
private final static List EMPTY = Collections.emptyList();
private WeightedFragInfo(int startOffset, int endOffset, float totalBoost, List subInfos) {
super(startOffset, endOffset, EMPTY);
this.subInfos = subInfos;
this.totalBoost = totalBoost;
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy