/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.completion2x;
import com.carrotsearch.hppc.ObjectLongHashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x;
import org.elasticsearch.search.suggest.completion.CompletionStats;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext;
import org.elasticsearch.search.suggest.completion.FuzzyOptions;
import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion2x.context.ContextMapping.ContextQuery;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
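/**
 * A {@link CompletionLookupProvider} for the legacy 2.x completion format:
 * it writes one FST per completion field via {@link XAnalyzingSuggester.XBuilder}
 * and loads them back as {@link XAnalyzingSuggester} (or {@link XFuzzySuggester})
 * lookups at search time.
 */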
public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider {
// bit flags used when serializing the suggester options (written by consumer(), read by load())
public static final int SERIALIZE_PRESERVE_SEPARATORS = 1;
public static final int SERIALIZE_HAS_PAYLOADS = 2;
public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;
private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
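// -1 is Lucene's documented "no limit" value for the number of analyzed graph paths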
private static final int MAX_GRAPH_EXPANSIONS = -1;
public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION_START = 1;
public static final int CODEC_VERSION_SERIALIZED_LABELS = 2;
public static final int CODEC_VERSION_CHECKSUMS = 3;
public static final int CODEC_VERSION_LATEST = CODEC_VERSION_CHECKSUMS;
private final boolean preserveSep;
private final boolean preservePositionIncrements;
private final int maxSurfaceFormsPerAnalyzedForm;
private final int maxGraphExpansions;
private final boolean hasPayloads;
private final XAnalyzingSuggester prototype;
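/*
 * Note: the prototype suggester is never used to run lookups; it only backs
 * toFiniteStrings(TokenStream), so documents are analyzed at index time with
 * exactly the options configured for this provider.
 */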
public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean preservePositionIncrements, boolean hasPayloads) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.hasPayloads = hasPayloads;
this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to be fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
}
@Override
public String getName() {
return "analyzing";
}
public boolean getPreserveSep() {
return preserveSep;
}
public boolean getPreservePositionsIncrements() {
return preservePositionIncrements;
}
public boolean hasPayloads() {
return hasPayloads;
}
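/**
 * Returns a {@link FieldsConsumer} that writes the completion data to the
 * given output. File layout: codec header; per field an FST followed by its
 * meta data (max analyzed paths, max surface forms, max graph expansions,
 * option flags, and the special labels); a field-name-to-offset table; a
 * long pointing back at that table; and the codec footer.
 */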
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
return new FieldsConsumer() {
private Map<String, Long> fieldOffsets = new HashMap<>();
@Override
public void close() throws IOException {
try {
/*
* write the offsets per field such that we know where
* we need to load the FSTs from
*/
long pointer = output.getFilePointer();
output.writeVInt(fieldOffsets.size());
for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
output.writeString(entry.getKey());
output.writeVLong(entry.getValue());
}
output.writeLong(pointer);
CodecUtil.writeFooter(output);
} finally {
IOUtils.close(output);
}
}
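/*
 * For every term of each completion field, walk the postings, decode the
 * SuggestPayload (surface form, payload bytes, weight) stored as a position
 * payload, and feed each surface form into the suggester builder.
 */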
@Override
public void write(Fields fields) throws IOException {
for(String field : fields) {
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
PostingsEnum docsEnum = null;
final SuggestPayload spare = new SuggestPayload();
int maxAnalyzedPathsForOneInput = 0;
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
int docCount = 0;
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
builder.startTerm(term);
int docFreq = 0;
while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
for (int i = 0; i < docsEnum.freq(); i++) {
final int position = docsEnum.nextPosition();
AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
// multi fields have the same surface form so we sum up here
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
}
docFreq++;
docCount = Math.max(docCount, docsEnum.docID()+1);
}
builder.finishTerm(docFreq);
}
/*
 * Here we are done processing the field and we can
 * build the FST and write it to disk.
 */
FST<Pair<Long, BytesRef>> build = builder.build();
assert build != null || docCount == 0 : "the FST is null but docCount is not 0, actual value: [" + docCount + "]";
/*
 * It's possible that the FST is null when two segments are merged
 * and all docs that have a value in this field are deleted. This
 * causes a consumer to be created that never consumes any values,
 * so the FST builder returns null.
 */
if (build != null) {
fieldOffsets.put(field, output.getFilePointer());
build.save(output);
/* write some more meta-info */
output.writeVInt(maxAnalyzedPathsForOneInput);
output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
output.writeInt(maxGraphExpansions); // can be negative
int options = 0;
options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
output.writeVInt(XAnalyzingSuggester.END_BYTE);
output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
}
}
}
};
}
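/**
 * Reads the data written by {@link #consumer(IndexOutput)}: checks the codec
 * header (and, from version 3 on, the whole-file checksum), seeks to the
 * pointer stored just before the footer to locate the field-offset table,
 * and loads one FST per field into an {@link AnalyzingSuggestHolder}.
 */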
@Override
public LookupFactory load(IndexInput input) throws IOException {
long sizeInBytes = 0;
int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
if (version >= CODEC_VERSION_CHECKSUMS) {
CodecUtil.checksumEntireFile(input);
}
final long metaPointerPosition = input.length() - (version >= CODEC_VERSION_CHECKSUMS ? 8 + CodecUtil.footerLength() : 8);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<>();
input.seek(metaPointerPosition);
long metaPointer = input.readLong();
input.seek(metaPointer);
int numFields = input.readVInt();
Map<Long, String> meta = new TreeMap<>();
for (int i = 0; i < numFields; i++) {
String name = input.readString();
long offset = input.readVLong();
meta.put(offset, name);
}
for (Map.Entry<Long, String> entry : meta.entrySet()) {
input.seek(entry.getKey());
FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
int maxAnalyzedPathsForOneInput = input.readVInt();
int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
int maxGraphExpansions = input.readInt();
int options = input.readVInt();
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPARATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
// the first version did not include these four fields, so fall back to the old defaults (from before the
// AnalyzingSuggester was updated in Lucene, so we cannot use the suggester's current defaults)
int sepLabel, payloadSep, endByte, holeCharacter;
switch (version) {
case CODEC_VERSION_START:
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
holeCharacter = '\u001E';
break;
default:
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
holeCharacter = input.readVInt();
}
AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements,
maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput,
fst, sepLabel, payloadSep, endByte, holeCharacter);
sizeInBytes += fst.ramBytesUsed();
lookupMap.put(entry.getValue(), holder);
}
final long ramBytesUsed = sizeInBytes;
return new LookupFactory() {
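/*
 * Builds a fresh suggester per request: an XFuzzySuggester when fuzzy
 * options are set, otherwise a plain XAnalyzingSuggester, both backed by
 * the FST loaded for this field. Context queries, if required by the
 * field, are compiled into a query-prefix automaton.
 */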
@Override
public Lookup getLookup(CompletionFieldMapper2x.CompletionFieldType fieldType, CompletionSuggestionContext suggestionContext) {
AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(fieldType.name());
if (analyzingSuggestHolder == null) {
return null;
}
int flags = analyzingSuggestHolder.getPreserveSeparator() ? XAnalyzingSuggester.PRESERVE_SEP : 0;
final XAnalyzingSuggester suggester;
final Automaton queryPrefix = fieldType.requiresContext() ?
ContextQuery.toAutomaton(analyzingSuggestHolder.getPreserveSeparator(), suggestionContext.getContextQueries()) : null;
final FuzzyOptions fuzzyOptions = suggestionContext.getFuzzyOptions();
if (fuzzyOptions != null) {
suggester = new XFuzzySuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
fuzzyOptions.getEditDistance(), fuzzyOptions.isTranspositions(),
fuzzyOptions.getFuzzyPrefixLength(), fuzzyOptions.getFuzzyMinLength(), fuzzyOptions.isUnicodeAware(),
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel,
analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
analyzingSuggestHolder.holeCharacter);
} else {
suggester = new XAnalyzingSuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel,
analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte, analyzingSuggestHolder.holeCharacter);
}
return suggester;
}
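/*
 * Sums the RAM used by all loaded FSTs; when field name patterns are
 * given, per-field sizes are also recorded for every matching field.
 */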
@Override
public CompletionStats stats(String... fields) {
long sizeInBytes = 0;
ObjectLongHashMap<String> completionFields = null;
if (fields != null && fields.length > 0) {
completionFields = new ObjectLongHashMap<>(fields.length);
}
for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
sizeInBytes += entry.getValue().fst.ramBytesUsed();
if (fields == null || fields.length == 0) {
continue;
}
if (Regex.simpleMatch(fields, entry.getKey())) {
long fstSize = entry.getValue().fst.ramBytesUsed();
completionFields.addTo(entry.getKey(), fstSize);
}
}
return new CompletionStats(sizeInBytes, completionFields);
}
@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(MappedFieldType fieldType) {
return lookupMap.get(fieldType.name());
}
@Override
public long ramBytesUsed() {
return ramBytesUsed;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", lookupMap);
}
};
}
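/**
 * Holds one field's loaded FST together with the options and special labels
 * it was serialized with, and reports the FST's RAM usage via {@link Accountable}.
 */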
static class AnalyzingSuggestHolder implements Accountable {
final boolean preserveSep;
final boolean preservePositionIncrements;
final int maxSurfaceFormsPerAnalyzedForm;
final int maxGraphExpansions;
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
final int sepLabel;
final int payloadSep;
final int endByte;
final int holeCharacter;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements,
int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput,
FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep,
int endByte, int holeCharacter) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
this.maxGraphExpansions = maxGraphExpansions;
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
this.holeCharacter = holeCharacter;
}
public boolean getPreserveSeparator() {
return preserveSep;
}
public boolean getPreservePositionIncrements() {
return preservePositionIncrements;
}
public boolean hasPayloads() {
return hasPayloads;
}
@Override
public long ramBytesUsed() {
if (fst != null) {
return fst.ramBytesUsed();
} else {
return 0;
}
}
@Override
public Collection<Accountable> getChildResources() {
if (fst != null) {
return Collections.singleton(Accountables.namedAccountable("fst", fst));
} else {
return Collections.emptyList();
}
}
}
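/**
 * Expands the token stream into the set of all analyzed paths (finite
 * strings) using the prototype suggester, so index-time analysis matches
 * what the suggester expects at query time.
 */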
@Override
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
return prototype.toFiniteStrings(stream);
}
}