// org.apache.lucene.codecs.memory.FSTTermsReader (from the lucene-codecs artifact)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.memory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
/**
* FST-based terms dictionary reader.
*
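* The FST maps each term directly to its metadata. Each field's FST is read through an
* {@link OffHeapFSTStore} over the terms file, which is opened with
* {@link ReadAdvice#RANDOM_PRELOAD}.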
*
* @lucene.experimental
*/
public class FSTTermsReader extends FieldsProducer {
/** Per-field terms dictionaries, keyed (and therefore sorted) by field name. */
private final TreeMap<String, TermsReader> fields = new TreeMap<>();

/** Postings reader that decodes term metadata and produces postings; closed in {@link #close()}. */
private final PostingsReaderBase postingsReader;

/** The open terms file; every field's off-heap FST store is created over this input. */
private final IndexInput fstTermsInput;
/**
 * Opens the FST terms file of a segment and loads the terms dictionary of every field.
 *
 * <p>The index header and the whole-file checksum are verified first, then the postings reader
 * is initialised from the same input, and finally the per-field summaries and FSTs are read from
 * the directory section that {@link #seekDir(IndexInput)} positions on. If anything fails, the
 * terms input is closed before the exception propagates.
 *
 * @param state segment read state supplying the directory, segment info and field infos
 * @param postingsReader postings reader to initialise and to delegate term decoding to
 * @throws CorruptIndexException if a field number is unknown or a field summary is inconsistent
 * @throws IOException if the terms file cannot be opened or read
 */
public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
    throws IOException {
  final String termsFileName =
      IndexFileNames.segmentFileName(
          state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);

  this.postingsReader = postingsReader;
  this.fstTermsInput =
      state.directory.openInput(
          termsFileName, state.context.withReadAdvice(ReadAdvice.RANDOM_PRELOAD));

  IndexInput in = this.fstTermsInput;

  boolean success = false;
  try {
    CodecUtil.checkIndexHeader(
        in,
        FSTTermsWriter.TERMS_CODEC_NAME,
        FSTTermsWriter.TERMS_VERSION_START,
        FSTTermsWriter.TERMS_VERSION_CURRENT,
        state.segmentInfo.getId(),
        state.segmentSuffix);
    CodecUtil.checksumEntireFile(in);
    this.postingsReader.init(in, state);
    seekDir(in);

    final FieldInfos fieldInfos = state.fieldInfos;
    final int numFields = in.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = in.readVInt();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
      if (fieldInfo == null) {
        // an unknown field number would otherwise surface as a NullPointerException below
        throw new CorruptIndexException("invalid field number: " + fieldNumber, in);
      }
      long numTerms = in.readVLong();
      long sumTotalTermFreq = in.readVLong();
      // if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value
      long sumDocFreq =
          fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong();
      int docCount = in.readVInt();
      TermsReader current =
          new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount);
      TermsReader previous = fields.put(fieldInfo.name, current);
      checkFieldSummary(state.segmentInfo, in, current, previous);
    }
    success = true;
  } finally {
    if (success == false) {
      // do not leak the file handle when construction fails part-way
      IOUtils.closeWhileHandlingException(in);
    }
  }
}
/**
 * Positions {@code in} at the start of the field directory.
 *
 * <p>The directory offset is read as a long from the 8 bytes immediately preceding the codec
 * footer.
 *
 * @param in the terms file input to reposition
 * @throws IOException if seeking or reading fails
 */
private void seekDir(IndexInput in) throws IOException {
  final long offsetSlot = in.length() - CodecUtil.footerLength() - 8;
  in.seek(offsetSlot);
  final long directoryStart = in.readLong();
  in.seek(directoryStart);
}
/**
 * Sanity-checks the summary statistics of a freshly read field.
 *
 * @param info segment info providing {@code maxDoc}
 * @param in input used only to annotate the exception with the resource being read
 * @param field the field reader that was just created
 * @param previous the reader previously registered under the same field name, or null
 * @throws CorruptIndexException if the statistics are inconsistent or the field is a duplicate
 */
private void checkFieldSummary(
    SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous) throws IOException {
  final int docsWithField = field.docCount;
  final long postings = field.sumDocFreq;
  final long positions = field.sumTotalTermFreq;

  // the number of docs having the field cannot be negative nor exceed the segment's doc count
  final boolean docCountInRange = docsWithField >= 0 && docsWithField <= info.maxDoc();
  if (!docCountInRange) {
    throw new CorruptIndexException(
        "invalid docCount: " + docsWithField + " maxDoc: " + info.maxDoc(), in);
  }

  // every doc having the field contributes at least one posting
  if (postings < docsWithField) {
    throw new CorruptIndexException(
        "invalid sumDocFreq: " + postings + " docCount: " + docsWithField, in);
  }

  // every posting contributes at least one position
  if (positions < postings) {
    throw new CorruptIndexException(
        "invalid sumTotalTermFreq: " + positions + " sumDocFreq: " + postings, in);
  }

  // a non-null previous mapping means the same field name was written twice
  if (previous != null) {
    throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name, in);
  }
}
/**
 * Returns the names of all fields in this reader, in the sorted order of the backing map.
 *
 * <p>The key set is wrapped as unmodifiable, so the returned iterator cannot remove fields.
 */
@Override
public Iterator<String> iterator() {
  return Collections.unmodifiableSet(fields.keySet()).iterator();
}
/**
 * Looks up the terms dictionary of a field.
 *
 * @param field field name; must not be null (checked by assertion only)
 * @return the field's terms, or null when this reader has no such field
 */
@Override
public Terms terms(String field) throws IOException {
  assert field != null;
  final Terms fieldTerms = fields.get(field);
  return fieldTerms;
}
/** Returns the number of fields held by this reader. */
@Override
public int size() {
  final int fieldCount = this.fields.size();
  return fieldCount;
}
/**
 * Closes the postings reader and the terms file input, then forgets all per-field readers.
 *
 * @throws IOException if closing either resource fails
 */
@Override
public void close() throws IOException {
try {
IOUtils.close(postingsReader, fstTermsInput);
} finally {
// drop the per-field readers even when closing threw
fields.clear();
}
}
final class TermsReader extends Terms {
/** Field this dictionary belongs to. */
final FieldInfo fieldInfo;

/** Number of terms in the field, as recorded in the field summary. */
final long numTerms;

/** Sum of total term frequencies, as recorded in the field summary. */
final long sumTotalTermFreq;

/** Sum of document frequencies, as recorded in the field summary. */
final long sumDocFreq;

/** Number of documents that have this field, as recorded in the field summary. */
final int docCount;

/** Term dictionary: maps each term to its (still encoded) term data. */
final FST<FSTTermOutputs.TermData> dict;
/**
 * Creates the reader of one field and consumes that field's FST from {@code in}.
 *
 * <p>The FST metadata is read first; the FST body is then exposed through an off-heap store
 * starting at the current file pointer, and {@code in} is advanced past the body so that it is
 * left at the next field's summary.
 *
 * @param fieldInfo the field being read
 * @param in terms file input, positioned at the field's FST metadata
 * @param numTerms number of terms from the field summary
 * @param sumTotalTermFreq sum of total term frequencies from the field summary
 * @param sumDocFreq sum of document frequencies from the field summary
 * @param docCount number of documents with the field, from the field summary
 * @throws IOException if the FST cannot be read
 */
TermsReader(
    FieldInfo fieldInfo,
    IndexInput in,
    long numTerms,
    long sumTotalTermFreq,
    long sumDocFreq,
    int docCount)
    throws IOException {
  // plain summary values
  this.docCount = docCount;
  this.sumDocFreq = sumDocFreq;
  this.sumTotalTermFreq = sumTotalTermFreq;
  this.numTerms = numTerms;
  this.fieldInfo = fieldInfo;

  // FST: metadata first, then the body accessed in place through the input
  final FSTTermOutputs termOutputs = new FSTTermOutputs(fieldInfo);
  final var metadata = FST.readMetadata(in, termOutputs);
  final long bodyStart = in.getFilePointer();
  final OffHeapFSTStore store = new OffHeapFSTStore(in, bodyStart, metadata);
  this.dict = FST.fromFSTReader(metadata, store);

  // step over the FST body so the caller can continue with the next field
  in.skipBytes(store.size());
}
/** Returns a one-line summary of the field statistics, intended for debugging. */
@Override
public String toString() {
  final StringBuilder text = new StringBuilder("FSTTerms(terms=");
  text.append(numTerms);
  text.append(",postings=").append(sumDocFreq);
  text.append(",positions=").append(sumTotalTermFreq);
  text.append(",docs=").append(docCount);
  text.append(")");
  return text.toString();
}
/** Returns true when the field's index options are at least {@code DOCS_AND_FREQS}. */
@Override
public boolean hasFreqs() {
  final IndexOptions options = fieldInfo.getIndexOptions();
  return options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
/**
 * Returns true when the field's index options are at least
 * {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 */
@Override
public boolean hasOffsets() {
  final IndexOptions options = fieldInfo.getIndexOptions();
  return options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
/**
 * Returns true when the field's index options are at least
 * {@code DOCS_AND_FREQS_AND_POSITIONS}.
 */
@Override
public boolean hasPositions() {
  final IndexOptions options = fieldInfo.getIndexOptions();
  return options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
/** Returns whether the field info reports payloads for this field. */
@Override
public boolean hasPayloads() {
  final boolean payloads = fieldInfo.hasPayloads();
  return payloads;
}
/** Returns the number of terms recorded for this field. */
@Override
public long size() {
  return this.numTerms;
}
/** Returns the sum of total term frequencies recorded for this field. */
@Override
public long getSumTotalTermFreq() {
  return this.sumTotalTermFreq;
}
/** Returns the sum of document frequencies recorded for this field. */
@Override
public long getSumDocFreq() throws IOException {
  return this.sumDocFreq;
}
/** Returns the number of documents recorded as having this field. */
@Override
public int getDocCount() throws IOException {
  return this.docCount;
}
/** Returns a new, unpositioned enum over all terms of this field. */
@Override
public TermsEnum iterator() throws IOException {
  final TermsEnum allTerms = new SegmentTermsEnum();
  return allTerms;
}
/**
 * Returns an enum over the terms of this field that {@code compiled} accepts.
 *
 * @param compiled the automaton to intersect with; its type must be {@code NORMAL}
 * @param startTerm optional term to start from, passed on to the intersecting enum; may be null
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean normalAutomaton = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (normalAutomaton) {
    return new IntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
// Shared plumbing between the two terms enums and the postings reader (PBF)
abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum {
  /* Stats of the current term plus the metadata decoded by the postings reader */
  final BlockTermState state;

  /* Stats of the current term plus its still-encoded metadata */
  FSTTermOutputs.TermData meta;

  /* Reusable reader over meta.bytes, handed to the postings reader for decoding */
  ByteArrayDataInput bytesReader;

  BaseTermsEnum() throws IOException {
    state = postingsReader.newTermState();
    bytesReader = new ByteArrayDataInput();
    // meta is left for the subclass to set
  }

  /** Decodes the current term's metadata into {@link #state}. */
  abstract void decodeMetaData() throws IOException;

  /** Returns a copy of the fully decoded state of the current term. */
  @Override
  public TermState termState() throws IOException {
    decodeMetaData();
    final TermState snapshot = state.clone();
    return snapshot;
  }

  @Override
  public int docFreq() throws IOException {
    final int df = state.docFreq;
    return df;
  }

  /** Returns the total term frequency, falling back to docFreq when it is stored as -1. */
  @Override
  public long totalTermFreq() throws IOException {
    if (state.totalTermFreq == -1) {
      return state.docFreq;
    }
    return state.totalTermFreq;
  }

  @Override
  public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    // the postings reader needs the decoded state
    decodeMetaData();
    final PostingsEnum result = postingsReader.postings(fieldInfo, state, reuse, flags);
    return result;
  }

  @Override
  public ImpactsEnum impacts(int flags) throws IOException {
    // the postings reader needs the decoded state
    decodeMetaData();
    final ImpactsEnum result = postingsReader.impacts(fieldInfo, state, flags);
    return result;
  }

  /** Ordinals are not supported by this terms dictionary. */
  @Override
  public void seekExact(long ord) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Ordinals are not supported by this terms dictionary. */
  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }
}
// Iterates through all terms in this field
private final class SegmentTermsEnum extends BaseTermsEnum {
/* Current term, null when enum ends or unpositioned */
BytesRef term;
final BytesRefFSTEnum fstEnum;
/* True when current term's metadata is decoded */
boolean decoded;
/* True when current enum is 'positioned' by seekExact(TermState) */
boolean seekPending;
SegmentTermsEnum() throws IOException {
super();
this.fstEnum = new BytesRefFSTEnum<>(dict);
this.decoded = false;
this.seekPending = false;
this.meta = null;
}
@Override
public BytesRef term() throws IOException {
return term;
}
// Let PBF decode metadata from long[] and byte[]
@Override
void decodeMetaData() throws IOException {
if (!decoded && !seekPending) {
if (meta.bytes != null) {
bytesReader.reset(meta.bytes, 0, meta.bytes.length);
}
postingsReader.decodeTerm(bytesReader, fieldInfo, state, true);
decoded = true;
}
}
// Update current enum according to FSTEnum
void updateEnum(final InputOutput pair) {
if (pair == null) {
term = null;
} else {
term = pair.input;
meta = pair.output;
state.docFreq = meta.docFreq;
state.totalTermFreq = meta.totalTermFreq;
}
decoded = false;
seekPending = false;
}
@Override
public BytesRef next() throws IOException {
if (seekPending) { // previously positioned, but termOutputs not fetched
seekPending = false;
SeekStatus status = seekCeil(term);
assert status == SeekStatus.FOUND; // must positioned on valid term
}
updateEnum(fstEnum.next());
return term;
}
@Override
public boolean seekExact(BytesRef target) throws IOException {
updateEnum(fstEnum.seekExact(target));
return term != null;
}
@Override
public SeekStatus seekCeil(BytesRef target) throws IOException {
updateEnum(fstEnum.seekCeil(target));
if (term == null) {
return SeekStatus.END;
} else {
return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
}
}
@Override
public void seekExact(BytesRef target, TermState otherState) {
if (!target.equals(term)) {
state.copyFrom(otherState);
term = BytesRef.deepCopyOf(target);
seekPending = true;
}
}
}
// Iterates intersect result with automaton (cannot seek!)
private final class IntersectTermsEnum extends BaseTermsEnum {
/* Current term, null when enum ends or unpositioned */
BytesRefBuilder term;

/* True when current term's metadata is decoded */
boolean decoded;

/* True when there is pending term when calling next() */
boolean pending;

/* stack to record how current term is constructed,
 * used to accumulate metadata or rewind term:
 *   level == term.length + 1,
 *         == 0 when term is null */
Frame[] stack;
int level;

/* to which level the metadata is accumulated
 * so that we can accumulate metadata lazily */
int metaUpto;

/* term dict fst */
final FST<FSTTermOutputs.TermData> fst;
final FST.BytesReader fstReader;
final Outputs<FSTTermOutputs.TermData> fstOutputs;

/* query automaton to intersect with */
final ByteRunnable fsa;
/** One level of the traversal stack: an FST arc paired with the automaton state reached by it. */
private final class Frame {
  /* fst stats */
  FST.Arc<FSTTermOutputs.TermData> fstArc;

  /* output of the arc; loadMetaData() later folds the outputs of the lower frames into it */
  FSTTermOutputs.TermData output;

  /* automaton stats; -1 is used for "no state / rejected" */
  int fsaState;

  Frame() {
    this.fstArc = new FST.Arc<>();
    this.fsaState = -1;
  }

  @Override
  public String toString() {
    return "arc=" + fstArc + " state=" + fsaState;
  }
}
/**
 * Creates an enum over the terms of this field that are accepted by {@code compiled}.
 *
 * <p>The stack is primed with a virtual frame at index 0 and the FST start arc at index 1, so
 * the enum starts positioned on the empty term. {@code pending} records whether the term the
 * enum is positioned on after construction still has to be returned by the first
 * {@link #next()} call.
 *
 * @param compiled automaton to intersect with; its byte-runnable form is used
 * @param startTerm when non-null, the enum is first moved by {@code doSeekCeil(startTerm)}; a
 *     term equal to {@code startTerm} itself is never marked pending
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
super();
// if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
this.fst = dict;
this.fstReader = fst.getBytesReader();
this.fstOutputs = dict.outputs;
this.fsa = compiled.getByteRunnable();
// level is -1 here so that the first newFrame() hands out stack[0]
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
this.stack[i] = new Frame();
}
// stack[0]: virtual frame carrying the "no output" value
loadVirtualFrame(newFrame());
this.level++;
// stack[1]: FST start arc; pushing it creates the empty term and raises level to 1
pushFrame(loadFirstFrame(newFrame()));
this.meta = null;
this.metaUpto = 1;
this.decoded = false;
this.pending = false;
if (startTerm == null) {
// the empty term is pending if both the FST and the automaton accept it
pending = isAccept(topFrame());
} else {
doSeekCeil(startTerm);
// the landed-on term is pending only if it differs from startTerm and is accepted
pending =
(term == null || !startTerm.equals(term.get()))
&& isValid(topFrame())
&& isAccept(topFrame());
}
}
/** Returns the current term, or null when the enum has no position. */
@Override
public BytesRef term() throws IOException {
  if (term == null) {
    return null;
  }
  return term.get();
}
/** Lets the postings reader decode the current term's metadata, at most once per term. */
@Override
void decodeMetaData() throws IOException {
  assert term != null;
  if (decoded) {
    return; // already done for this term
  }
  final byte[] encoded = meta.bytes;
  if (encoded != null) {
    bytesReader.reset(encoded, 0, encoded.length);
  }
  postingsReader.decodeTerm(bytesReader, fieldInfo, state, true);
  decoded = true;
}
/**
 * Brings the accumulated output up to the current level and publishes it as the current term's
 * metadata. Frames above {@code metaUpto} have their own arc output combined with the
 * accumulated output of the frame below; a final arc additionally contributes its final output.
 */
void loadMetaData() {
  Frame below = stack[metaUpto];
  while (metaUpto != level) {
    metaUpto++;
    final Frame current = stack[metaUpto];
    current.output = fstOutputs.add(current.output, below.output);
    below = current;
  }

  meta =
      below.fstArc.isFinal()
          ? fstOutputs.add(below.output, below.fstArc.nextFinalOutput())
          : below.output;

  state.docFreq = meta.docFreq;
  state.totalTermFreq = meta.totalTermFreq;
}
/**
 * Moves the enum via {@code doSeekCeil(target)}, reloads the metadata and reports where it
 * landed.
 *
 * @return {@code END} when no term is current, {@code FOUND} when the current term equals
 *     {@code target}, {@code NOT_FOUND} otherwise
 */
@Override
public SeekStatus seekCeil(BytesRef target) throws IOException {
  decoded = false;
  doSeekCeil(target);
  loadMetaData();
  if (term == null) {
    return SeekStatus.END;
  }
  if (term.get().equals(target)) {
    return SeekStatus.FOUND;
  }
  return SeekStatus.NOT_FOUND;
}
/**
 * Advances to the next term accepted by both the FST and the automaton.
 *
 * <p>If a term is pending (the enum is already positioned on an accepted term that has not been
 * returned yet), that term is returned. Otherwise the FST is searched depth-first: descend to
 * the first child arc the automaton does not reject; when that is impossible, pop frames and
 * move to the next usable sibling arc.
 *
 * @return the next accepted term, or null when the traversal is exhausted
 */
@Override
public BytesRef next() throws IOException {
// if (TEST) System.out.println("Enum next()");
if (pending) {
pending = false;
loadMetaData();
return term();
}
// the metadata decoded so far belongs to the term being left
decoded = false;
DFS:
while (level > 0) {
// step 1: try to go one level deeper
Frame frame = newFrame();
if (loadExpandFrame(topFrame(), frame) != null) { // has valid target
pushFrame(frame);
if (isAccept(frame)) { // gotcha
break;
}
continue; // check next target
}
// step 2: no usable child, so backtrack until some frame has a usable sibling
frame = popFrame();
while (level > 0) {
if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling
pushFrame(frame);
if (isAccept(frame)) { // gotcha
break DFS;
}
continue DFS; // check next target
}
frame = popFrame();
}
// every frame was popped: nothing left to visit
return null;
}
loadMetaData();
return term();
}
/**
 * Moves the enum towards {@code target}, following the FST from the current top frame.
 *
 * <p>Target bytes are consumed one at a time while an arc with exactly that label exists and
 * the automaton does not reject it. If the whole target is consumed, the current term is
 * returned as is. Otherwise the enum continues from the first larger arc, or backtracks to a
 * sibling of an earlier frame, and returns the first accepted term found from there.
 *
 * @param target the term to seek to
 * @return the term the enum ends up on, or null when there is none
 */
private BytesRef doSeekCeil(BytesRef target) throws IOException {
// if (TEST) System.out.println("Enum doSeekCeil()");
Frame frame = null;
int label, upto = 0, limit = target.length;
while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
frame = newFrame();
label = target.bytes[target.offset + upto] & 0xff;
frame = loadCeilFrame(label, topFrame(), frame);
// stop at the first byte that has no arc with exactly this label
if (frame == null || frame.fstArc.label() != label) {
break;
}
assert isValid(frame); // target must be fetched from automaton
pushFrame(frame);
upto++;
}
if (upto == limit) { // got target
return term();
}
if (frame != null) { // got larger term('s prefix)
pushFrame(frame);
return isAccept(frame) ? term() : next();
}
// no ceiling arc at this depth: backtrack to the nearest frame that still has a sibling
while (level > 0) { // got target's prefix, advance to larger term
frame = popFrame();
while (level > 0 && !canRewind(frame)) {
frame = popFrame();
}
if (loadNextFrame(topFrame(), frame) != null) {
pushFrame(frame);
return isAccept(frame) ? term() : next();
}
}
return null;
}
/**
 * Initialises the virtual bottom frame: no automaton state and the "no output" value. The
 * constructor places it at the bottom of the stack.
 */
Frame loadVirtualFrame(Frame frame) {
  frame.fsaState = -1;
  frame.output = fstOutputs.getNoOutput();
  return frame;
}
/** Fills {@code frame} with the FST's first arc, its output, and automaton state 0. */
Frame loadFirstFrame(Frame frame) throws IOException {
  frame.fstArc = fst.getFirstArc(frame.fstArc);
  frame.output = frame.fstArc.output();
  final int initialFsaState = 0;
  frame.fsaState = initialFsaState;
  return frame;
}
/**
 * Fills {@code frame} with the first child arc of {@code top} that the automaton does not
 * reject.
 *
 * @return {@code frame}, or null when {@code top} cannot be extended or no child arc qualifies
 */
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
  if (canGrow(top) == false) {
    return null;
  }
  frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader);
  final int stepped = fsa.step(top.fsaState, frame.fstArc.label());
  frame.fsaState = stepped;
  if (stepped == -1) {
    // first child rejected by the automaton: look through its siblings instead
    return loadNextFrame(top, frame);
  }
  frame.output = frame.fstArc.output();
  return frame;
}
/**
 * Advances {@code frame} to its next sibling arc that the automaton does not reject when
 * stepping from {@code top}'s state.
 *
 * @return {@code frame}, or null when it has no sibling left or every remaining one is rejected
 */
Frame loadNextFrame(Frame top, Frame frame) throws IOException {
  if (!canRewind(frame)) {
    return null;
  }
  // canRewind guarantees at least one further arc, so the first read is always legal
  do {
    frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
    frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label());
  } while (frame.fsaState == -1 && !frame.fstArc.isLast());

  if (frame.fsaState == -1) {
    return null;
  }
  frame.output = frame.fstArc.output();
  return frame;
}
/**
 * Load frame for target arc(node) on fst, so that arc.label >= label and
 * !fsa.reject(arc.label)
 *
 * @param label the label to find a ceiling arc for
 * @param top the frame whose arc is followed
 * @param frame the frame to fill; its arc object is handed to {@code Util.readCeilArc}
 * @return {@code frame}, or null when no ceiling arc exists or the automaton rejects the
 *     ceiling arc and every later sibling
 */
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
  FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
  arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
  if (arc == null) {
    return null;
  }
  frame.fsaState = fsa.step(top.fsaState, arc.label());
  if (frame.fsaState == -1) {
    // ceiling arc rejected by the automaton: fall through to its siblings
    return loadNextFrame(top, frame);
  }
  frame.output = frame.fstArc.output();
  return frame;
}
/** True when the frame is a term for both sides: automaton accept state and final FST arc. */
boolean isAccept(Frame frame) {
  if (!fsa.isAccept(frame.fsaState)) {
    return false;
  }
  return frame.fstArc.isFinal();
}
/** True when the automaton has not rejected the frame's prefix (state is not -1). */
boolean isValid(Frame frame) {
  final boolean rejected = frame.fsaState == -1;
  return !rejected;
}
/** True when the frame can be extended: not rejected by the automaton and its arc leads on. */
boolean canGrow(Frame frame) {
  if (frame.fsaState == -1) {
    return false;
  }
  return FST.targetHasArcs(frame.fstArc);
}
/** True when the frame's arc has a following sibling, i.e. it is not the last arc of its node. */
boolean canRewind(Frame frame) {
  return frame.fstArc.isLast() == false;
}
/** Makes a loaded frame the new top: extends the term via {@code grow} and raises the level. */
void pushFrame(Frame frame) {
  final int label = frame.fstArc.label();
  term = grow(label);
  level += 1;
}
/**
 * Removes the top frame: shortens the term, lowers the level and caps {@code metaUpto} at the
 * new level.
 *
 * @return the frame that was just removed
 */
Frame popFrame() {
  term = shrink();
  level -= 1;
  metaUpto = Math.min(metaUpto, level);
  final Frame removed = stack[level + 1];
  return removed;
}
/**
 * Returns the frame slot directly above the current top, growing the stack (and pre-allocating
 * frames for the new slots) when it is full. The level itself is not changed.
 */
Frame newFrame() {
  final int slot = level + 1;
  if (slot == stack.length) {
    final int oldLength = stack.length;
    final int newLength =
        ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
    final Frame[] grown = new Frame[newLength];
    System.arraycopy(stack, 0, grown, 0, oldLength);
    for (int i = oldLength; i < newLength; i++) {
      grown[i] = new Frame();
    }
    stack = grown;
  }
  return stack[slot];
}
/** Returns the frame at the current level. */
Frame topFrame() {
  final Frame top = stack[level];
  return top;
}
/**
 * Extends the term by one byte. When there is no term yet, an empty one is created and
 * {@code label} is not appended.
 */
BytesRefBuilder grow(int label) {
  if (term != null) {
    term.append((byte) label);
    return term;
  }
  term = new BytesRefBuilder();
  return term;
}
/** Drops the last byte of the term; an already empty term becomes null (unpositioned). */
BytesRefBuilder shrink() {
  final int length = term.length();
  if (length != 0) {
    term.setLength(length - 1);
  } else {
    term = null;
  }
  return term;
}
}
}
static void walk(FST fst) throws IOException {
final ArrayList> queue = new ArrayList<>();
final BitSet seen = new BitSet();
final FST.BytesReader reader = fst.getBytesReader();
final FST.Arc startArc = fst.getFirstArc(new FST.Arc());
queue.add(startArc);
while (!queue.isEmpty()) {
final FST.Arc arc = queue.remove(0);
final long node = arc.target();
// System.out.println(arc);
if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
seen.set((int) node);
fst.readFirstRealTargetArc(node, arc, reader);
while (true) {
queue.add(new FST.Arc().copyFrom(arc));
if (arc.isLast()) {
break;
} else {
fst.readNextRealArc(arc, reader);
}
}
}
}
}
/** Returns the simple class name with the field count and the delegate postings reader. */
@Override
public String toString() {
  final StringBuilder text = new StringBuilder(getClass().getSimpleName());
  text.append("(fields=").append(fields.size());
  text.append(",delegate=").append(postingsReader);
  text.append(")");
  return text.toString();
}
/**
 * Delegates the integrity check to the postings reader. The terms file itself is not re-checked
 * here; its checksum is verified once, when the reader is constructed.
 */
@Override
public void checkIntegrity() throws IOException {
  this.postingsReader.checkIntegrity();
}
}