org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder Maven / Gradle / Ivy
Show all versions of lucene-highlighter Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
/**
* Base FragmentsBuilder implementation that supports colored pre/post
* tags and multivalued fields.
*
* Uses {@link BoundaryScanner} to determine fragments.
*/
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
protected String[] preTags, postTags;
public static final String[] COLORED_PRE_TAGS = {
"", "", "",
"", "", "",
"", "", "",
"", "", "",
"", "", "",
"", "", "",
"", ""
};
public static final String[] COLORED_POST_TAGS = { "" };
private char multiValuedSeparator = ' ';
private final BoundaryScanner boundaryScanner;
private boolean discreteMultiValueHighlighting = false;
protected BaseFragmentsBuilder(){
this( new String[]{ "" }, new String[]{ "" } );
}
protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
this(preTags, postTags, new SimpleBoundaryScanner());
}
protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
this( new String[]{ "" }, new String[]{ "" }, boundaryScanner );
}
protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
this.preTags = preTags;
this.postTags = postTags;
this.boundaryScanner = boundaryScanner;
}
static Object checkTagsArgument( Object tags ){
if( tags instanceof String ) return tags;
else if( tags instanceof String[] ) return tags;
throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
}
public abstract List getWeightedFragInfoList( List src );
private static final Encoder NULL_ENCODER = new DefaultEncoder();
@Override
public String createFragment( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList ) throws IOException {
return createFragment( reader, docId, fieldName, fieldFragList,
preTags, postTags, NULL_ENCODER );
}
@Override
public String[] createFragments( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments )
throws IOException {
return createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments,
preTags, postTags, NULL_ENCODER );
}
@Override
public String createFragment( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, String[] preTags, String[] postTags,
Encoder encoder ) throws IOException {
String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1,
preTags, postTags, encoder );
if( fragments == null || fragments.length == 0 ) return null;
return fragments[0];
}
@Override
public String[] createFragments( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
if( maxNumFragments < 0 ) {
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
}
List fragInfos = fieldFragList.getFragInfos();
Field[] values = getFields( reader, docId, fieldName );
if( values.length == 0 ) {
return null;
}
if (discreteMultiValueHighlighting && values.length > 1) {
fragInfos = discreteMultiValueHighlighting(fragInfos, values);
}
fragInfos = getWeightedFragInfoList(fragInfos);
int limitFragments = maxNumFragments < fragInfos.size() ? maxNumFragments : fragInfos.size();
List fragments = new ArrayList<>( limitFragments );
StringBuilder buffer = new StringBuilder();
int[] nextValueIndex = { 0 };
for( int n = 0; n < limitFragments; n++ ){
WeightedFragInfo fragInfo = fragInfos.get( n );
fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder ) );
}
return fragments.toArray( new String[fragments.size()] );
}
protected Field[] getFields( IndexReader reader, int docId, final String fieldName) throws IOException {
// according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
final List fields = new ArrayList<>();
reader.document(docId, new StoredFieldVisitor() {
@Override
public void stringField(FieldInfo fieldInfo, byte[] bytes) {
String value = new String(bytes, StandardCharsets.UTF_8);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(fieldInfo.hasVectors());
fields.add(new Field(fieldInfo.name, value, ft));
}
@Override
public Status needsField(FieldInfo fieldInfo) {
return fieldInfo.name.equals(fieldName) ? Status.YES : Status.NO;
}
});
return fields.toArray(new Field[fields.size()]);
}
protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
String[] preTags, String[] postTags, Encoder encoder ){
StringBuilder fragment = new StringBuilder();
final int s = fragInfo.getStartOffset();
int[] modifiedStartOffset = { s };
String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
int srcIndex = 0;
for( SubInfo subInfo : fragInfo.getSubInfos() ){
for( Toffs to : subInfo.getTermsOffsets() ){
fragment
.append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
.append( getPreTag( preTags, subInfo.getSeqnum() ) )
.append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
.append( getPostTag( postTags, subInfo.getSeqnum() ) );
srcIndex = to.getEndOffset() - modifiedStartOffset[0];
}
}
fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
return fragment.toString();
}
protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
int startOffset, int endOffset, int[] modifiedStartOffset ){
while( buffer.length() < endOffset && index[0] < values.length ){
buffer.append( values[index[0]++].stringValue() );
buffer.append( getMultiValuedSeparator() );
}
int bufferLength = buffer.length();
// we added the multi value char to the last buffer, ignore it
if (values[index[0] - 1].fieldType().tokenized()) {
bufferLength--;
}
int eo = bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset( buffer, endOffset );
modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
return buffer.substring( modifiedStartOffset[0], eo );
}
protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
int startOffset, int endOffset ){
while( buffer.length() < endOffset && index[0] < values.length ){
buffer.append( values[index[0]].stringValue() );
buffer.append( multiValuedSeparator );
index[0]++;
}
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
return buffer.substring( startOffset, eo );
}
protected List discreteMultiValueHighlighting(List fragInfos, Field[] fields) {
Map> fieldNameToFragInfos = new HashMap<>();
for (Field field : fields) {
fieldNameToFragInfos.put(field.name(), new ArrayList());
}
fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
int fieldStart;
int fieldEnd = 0;
for (Field field : fields) {
if (field.stringValue().isEmpty()) {
fieldEnd++;
continue;
}
fieldStart = fieldEnd;
fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.
if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
fieldNameToFragInfos.get(field.name()).add(fragInfo);
continue fragInfos;
}
if (fragInfo.getSubInfos().isEmpty()) {
continue fragInfos;
}
Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
continue;
}
int fragStart = fieldStart;
if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
fragStart = fragInfo.getStartOffset();
}
int fragEnd = fieldEnd;
if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
fragEnd = fragInfo.getEndOffset();
}
List subInfos = new ArrayList<>();
Iterator subInfoIterator = fragInfo.getSubInfos().iterator();
float boost = 0.0f; // The boost of the new info will be the sum of the boosts of its SubInfos
while (subInfoIterator.hasNext()) {
SubInfo subInfo = subInfoIterator.next();
List toffsList = new ArrayList<>();
Iterator toffsIterator = subInfo.getTermsOffsets().iterator();
while (toffsIterator.hasNext()) {
Toffs toffs = toffsIterator.next();
if (toffs.getStartOffset() >= fieldEnd) {
// We've gone past this value so its not worth iterating any more.
break;
}
boolean startsAfterField = toffs.getStartOffset() >= fieldStart;
boolean endsBeforeField = toffs.getEndOffset() < fieldEnd;
if (startsAfterField && endsBeforeField) {
// The Toff is entirely within this value.
toffsList.add(toffs);
toffsIterator.remove();
} else if (startsAfterField) {
/*
* The Toffs starts within this value but ends after this value
* so we clamp the returned Toffs to this value and leave the
* Toffs in the iterator for the next value of this field.
*/
toffsList.add(new Toffs(toffs.getStartOffset(), fieldEnd - 1));
} else if (endsBeforeField) {
/*
* The Toffs starts before this value but ends in this value
* which means we're really continuing from where we left off
* above. Since we use the remainder of the offset we can remove
* it from the iterator.
*/
toffsList.add(new Toffs(fieldStart, toffs.getEndOffset()));
toffsIterator.remove();
} else {
/*
* The Toffs spans the whole value so we clamp on both sides.
* This is basically a combination of both arms of the loop
* above.
*/
toffsList.add(new Toffs(fieldStart, fieldEnd - 1));
}
}
if (!toffsList.isEmpty()) {
subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum(), subInfo.getBoost()));
boost += subInfo.getBoost();
}
if (subInfo.getTermsOffsets().isEmpty()) {
subInfoIterator.remove();
}
}
WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, boost);
fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
}
}
List result = new ArrayList<>();
for (List weightedFragInfos : fieldNameToFragInfos.values()) {
result.addAll(weightedFragInfos);
}
Collections.sort(result, new Comparator() {
@Override
public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
return info1.getStartOffset() - info2.getStartOffset();
}
});
return result;
}
public void setMultiValuedSeparator( char separator ){
multiValuedSeparator = separator;
}
public char getMultiValuedSeparator(){
return multiValuedSeparator;
}
public boolean isDiscreteMultiValueHighlighting() {
return discreteMultiValueHighlighting;
}
public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
}
protected String getPreTag( int num ){
return getPreTag( preTags, num );
}
protected String getPostTag( int num ){
return getPostTag( postTags, num );
}
protected String getPreTag( String[] preTags, int num ){
int n = num % preTags.length;
return preTags[n];
}
protected String getPostTag( String[] postTags, int num ){
int n = num % postTags.length;
return postTags[n];
}
}