org.apache.lucene.search.suggest.document.CompletionFieldsConsumer Maven / Gradle / Ivy
Show all versions of lucene-suggest Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.document;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
/**
* Weighted FSTs for any indexed {@link SuggestField} is built on {@link
* #write(Fields,NormsProducer)}. A weighted FST maps the analyzed forms of a field to its surface
* form and document id. FSTs are stored in the CompletionDictionary (.lkp).
*
* The file offsets of a field's FST are stored in the CompletionIndex (.cmp) along with the
* field's internal number {@link FieldInfo#number} on {@link #close()}.
*/
final class CompletionFieldsConsumer extends FieldsConsumer {
private final String delegatePostingsFormatName;
private final Map seenFields = new HashMap<>();
private final SegmentWriteState state;
private IndexOutput dictOut;
private FieldsConsumer delegateFieldsConsumer;
private final String codecName;
CompletionFieldsConsumer(
String codecName, PostingsFormat delegatePostingsFormat, SegmentWriteState state)
throws IOException {
this.codecName = codecName;
this.delegatePostingsFormatName = delegatePostingsFormat.getName();
this.state = state;
String dictFile =
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
boolean success = false;
try {
this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
dictOut = state.directory.createOutput(dictFile, state.context);
CodecUtil.writeIndexHeader(
dictOut,
codecName,
COMPLETION_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
delegateFieldsConsumer.write(fields, norms);
for (String field : fields) {
CompletionTermWriter termWriter = new CompletionTermWriter();
Terms terms = fields.terms(field);
if (terms == null) {
// this can happen from ghost fields, where the incoming Fields iterator claims a field
// exists but it does not
continue;
}
TermsEnum termsEnum = terms.iterator();
// write terms
BytesRef term;
while ((term = termsEnum.next()) != null) {
termWriter.write(term, termsEnum);
}
// store lookup, if needed
long filePointer = dictOut.getFilePointer();
if (termWriter.finish(dictOut)) {
seenFields.put(
field,
new CompletionMetaData(
filePointer, termWriter.minWeight, termWriter.maxWeight, termWriter.type));
}
}
}
private boolean closed = false;
@Override
public void close() throws IOException {
if (closed) {
return;
}
closed = true;
String indexFile =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
boolean success = false;
try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
delegateFieldsConsumer.close();
CodecUtil.writeIndexHeader(
indexOut,
codecName,
COMPLETION_VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
/*
* we write the delegate postings format name so we can load it
* without getting an instance in the ctor
*/
indexOut.writeString(delegatePostingsFormatName);
// write # of seen fields
indexOut.writeVInt(seenFields.size());
// write field numbers and dictOut offsets
for (Map.Entry seenField : seenFields.entrySet()) {
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
indexOut.writeVInt(fieldInfo.number);
CompletionMetaData metaData = seenField.getValue();
indexOut.writeVLong(metaData.filePointer);
indexOut.writeVLong(metaData.minWeight);
indexOut.writeVLong(metaData.maxWeight);
indexOut.writeByte(metaData.type);
}
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(dictOut);
IOUtils.close(dictOut);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
private static class CompletionMetaData {
private final long filePointer;
private final long minWeight;
private final long maxWeight;
private final byte type;
private CompletionMetaData(long filePointer, long minWeight, long maxWeight, byte type) {
this.filePointer = filePointer;
this.minWeight = minWeight;
this.maxWeight = maxWeight;
this.type = type;
}
}
// builds an FST based on the terms written
private static class CompletionTermWriter {
private PostingsEnum postingsEnum = null;
private int docCount = 0;
private long maxWeight = 0;
private long minWeight = Long.MAX_VALUE;
private byte type;
private boolean first;
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final NRTSuggesterBuilder builder;
public CompletionTermWriter() throws IOException {
builder = new NRTSuggesterBuilder();
first = true;
}
/**
* Stores the built FST in output
Returns true if there was anything stored, false
* otherwise
*/
public boolean finish(IndexOutput output) throws IOException {
boolean stored = builder.store(output);
assert stored || docCount == 0
: "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
if (docCount == 0) {
minWeight = 0;
}
return stored;
}
/** Writes all postings (surface form, weight, document id) for term
*/
public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.PAYLOADS);
builder.startTerm(term);
int docFreq = 0;
while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int docID = postingsEnum.docID();
for (int i = 0; i < postingsEnum.freq(); i++) {
postingsEnum.nextPosition();
assert postingsEnum.getPayload() != null;
BytesRef payload = postingsEnum.getPayload();
ByteArrayDataInput input =
new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
int len = input.readVInt();
scratch.growNoCopy(len);
scratch.setLength(len);
input.readBytes(scratch.bytes(), 0, scratch.length());
long weight = input.readVInt() - 1;
maxWeight = Math.max(maxWeight, weight);
minWeight = Math.min(minWeight, weight);
byte type = input.readByte();
if (first) {
this.type = type;
first = false;
} else if (this.type != type) {
throw new IllegalArgumentException("single field name has mixed types");
}
builder.addEntry(docID, scratch.get(), weight);
}
docFreq++;
docCount = Math.max(docCount, docFreq + 1);
}
builder.finishTerm();
}
}
}