All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;

/**
 * Base FragmentsBuilder implementation that supports colored pre/post tags and multivalued fields.
 *
 * 

Uses {@link BoundaryScanner} to determine fragments. */ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { protected String[] preTags, postTags; public static final String[] COLORED_PRE_TAGS = { "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "" }; public static final String[] COLORED_POST_TAGS = {""}; private char multiValuedSeparator = ' '; private final BoundaryScanner boundaryScanner; private boolean discreteMultiValueHighlighting = false; protected BaseFragmentsBuilder() { this(new String[] {""}, new String[] {""}); } protected BaseFragmentsBuilder(String[] preTags, String[] postTags) { this(preTags, postTags, new SimpleBoundaryScanner()); } protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner) { this(new String[] {""}, new String[] {""}, boundaryScanner); } protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) { this.preTags = preTags; this.postTags = postTags; this.boundaryScanner = boundaryScanner; } static Object checkTagsArgument(Object tags) { if (tags instanceof String) { return tags; } else if (tags instanceof String[]) { return tags; } throw new IllegalArgumentException("type of preTags/postTags must be a String or String[]"); } public abstract List getWeightedFragInfoList(List src); private static final Encoder NULL_ENCODER = new DefaultEncoder(); @Override public String createFragment( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList) throws IOException { return createFragment(reader, docId, fieldName, fieldFragList, preTags, postTags, NULL_ENCODER); } @Override public String[] createFragments( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList, int maxNumFragments) throws IOException { return createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments, preTags, postTags, NULL_ENCODER); } @Override public String createFragment( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList, String[] preTags, String[] postTags, Encoder encoder) throws IOException { String[] fragments = createFragments(reader, docId, fieldName, fieldFragList, 1, preTags, postTags, encoder); if (fragments == null || fragments.length == 0) return null; return fragments[0]; } @Override public String[] createFragments( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList, int maxNumFragments, String[] preTags, String[] postTags, Encoder encoder) throws IOException { if (maxNumFragments < 0) { throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number."); } List fragInfos = fieldFragList.getFragInfos(); Field[] values = getFields(reader, docId, fieldName); if (values.length == 0) { return null; } if (discreteMultiValueHighlighting && values.length > 1) { fragInfos = discreteMultiValueHighlighting(fragInfos, values); } fragInfos = getWeightedFragInfoList(fragInfos); int limitFragments = maxNumFragments < fragInfos.size() ? maxNumFragments : fragInfos.size(); List fragments = new ArrayList<>(limitFragments); StringBuilder buffer = new StringBuilder(); int[] nextValueIndex = {0}; for (int n = 0; n < limitFragments; n++) { WeightedFragInfo fragInfo = fragInfos.get(n); fragments.add( makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder)); } return fragments.toArray(new String[fragments.size()]); } protected Field[] getFields(IndexReader reader, int docId, final String fieldName) throws IOException { // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? final List fields = new ArrayList<>(); reader .storedFields() .document( docId, new StoredFieldVisitor() { @Override public void stringField(FieldInfo fieldInfo, String value) { Objects.requireNonNull(value, "String value should not be null"); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.setStoreTermVectors(fieldInfo.hasVectors()); fields.add(new Field(fieldInfo.name, value, ft)); } @Override public Status needsField(FieldInfo fieldInfo) { return fieldInfo.name.equals(fieldName) ? Status.YES : Status.NO; } }); return fields.toArray(new Field[fields.size()]); } protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder) { StringBuilder fragment = new StringBuilder(); final int s = fragInfo.getStartOffset(); int[] modifiedStartOffset = {s}; String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset); int srcIndex = 0; for (SubInfo subInfo : fragInfo.getSubInfos()) { for (Toffs to : subInfo.getTermsOffsets()) { fragment .append( encoder.encodeText( src.substring(srcIndex, to.getStartOffset() - modifiedStartOffset[0]))) .append(getPreTag(preTags, subInfo.getSeqnum())) .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0]))) .append(getPostTag(postTags, subInfo.getSeqnum())); srcIndex = to.getEndOffset() - modifiedStartOffset[0]; } } fragment.append(encoder.encodeText(src.substring(srcIndex))); return fragment.toString(); } protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset, int[] modifiedStartOffset) { while (buffer.length() < endOffset && index[0] < values.length) { buffer.append(values[index[0]++].stringValue()); buffer.append(getMultiValuedSeparator()); } int bufferLength = buffer.length(); // we added the multi value char to the last buffer, ignore it if (values[index[0] - 1].fieldType().tokenized()) { bufferLength--; } int eo = bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset(buffer, endOffset); modifiedStartOffset[0] = boundaryScanner.findStartOffset(buffer, startOffset); return buffer.substring(modifiedStartOffset[0], eo); } protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset) { while (buffer.length() < endOffset && index[0] < values.length) { buffer.append(values[index[0]].stringValue()); buffer.append(multiValuedSeparator); index[0]++; } int eo = buffer.length() < endOffset ? buffer.length() : endOffset; return buffer.substring(startOffset, eo); } protected List discreteMultiValueHighlighting( List fragInfos, Field[] fields) { Map> fieldNameToFragInfos = new HashMap<>(); for (Field field : fields) { fieldNameToFragInfos.put(field.name(), new ArrayList()); } fragInfos: for (WeightedFragInfo fragInfo : fragInfos) { int fieldStart; int fieldEnd = 0; for (Field field : fields) { if (field.stringValue().isEmpty()) { fieldEnd++; continue; } fieldStart = fieldEnd; fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name. if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart && fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) { fieldNameToFragInfos.get(field.name()).add(fragInfo); continue fragInfos; } if (fragInfo.getSubInfos().isEmpty()) { continue fragInfos; } Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0); if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) { continue; } int fragStart = fieldStart; if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) { fragStart = fragInfo.getStartOffset(); } int fragEnd = fieldEnd; if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) { fragEnd = fragInfo.getEndOffset(); } List subInfos = new ArrayList<>(); Iterator subInfoIterator = fragInfo.getSubInfos().iterator(); // The boost of the new info will be the sum of the boosts of its SubInfos float boost = 0.0f; while (subInfoIterator.hasNext()) { SubInfo subInfo = subInfoIterator.next(); List toffsList = new ArrayList<>(); Iterator toffsIterator = subInfo.getTermsOffsets().iterator(); while (toffsIterator.hasNext()) { Toffs toffs = toffsIterator.next(); if (toffs.getStartOffset() >= fieldEnd) { // We've gone past this value so its not worth iterating any more. break; } boolean startsAfterField = toffs.getStartOffset() >= fieldStart; boolean endsBeforeField = toffs.getEndOffset() < fieldEnd; if (startsAfterField && endsBeforeField) { // The Toff is entirely within this value. toffsList.add(toffs); toffsIterator.remove(); } else if (startsAfterField) { /* * The Toffs starts within this value but ends after this value * so we clamp the returned Toffs to this value and leave the * Toffs in the iterator for the next value of this field. */ toffsList.add(new Toffs(toffs.getStartOffset(), fieldEnd - 1)); } else if (endsBeforeField) { /* * The Toffs starts before this value but ends in this value * which means we're really continuing from where we left off * above. Since we use the remainder of the offset we can remove * it from the iterator. */ toffsList.add(new Toffs(fieldStart, toffs.getEndOffset())); toffsIterator.remove(); } else { /* * The Toffs spans the whole value so we clamp on both sides. * This is basically a combination of both arms of the loop * above. */ toffsList.add(new Toffs(fieldStart, fieldEnd - 1)); } } if (!toffsList.isEmpty()) { subInfos.add( new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum(), subInfo.getBoost())); boost += subInfo.getBoost(); } if (subInfo.getTermsOffsets().isEmpty()) { subInfoIterator.remove(); } } WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, boost); fieldNameToFragInfos.get(field.name()).add(weightedFragInfo); } } List result = new ArrayList<>(); for (List weightedFragInfos : fieldNameToFragInfos.values()) { result.addAll(weightedFragInfos); } Collections.sort( result, new Comparator() { @Override public int compare( FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) { return info1.getStartOffset() - info2.getStartOffset(); } }); return result; } public void setMultiValuedSeparator(char separator) { multiValuedSeparator = separator; } public char getMultiValuedSeparator() { return multiValuedSeparator; } public boolean isDiscreteMultiValueHighlighting() { return discreteMultiValueHighlighting; } public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) { this.discreteMultiValueHighlighting = discreteMultiValueHighlighting; } protected String getPreTag(int num) { return getPreTag(preTags, num); } protected String getPostTag(int num) { return getPostTag(postTags, num); } protected String getPreTag(String[] preTags, int num) { int n = num % preTags.length; return preTags[n]; } protected String getPostTag(String[] postTags, int num) { int n = num % postTags.length; return postTags[n]; } }