/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

/**
 * FST-based terms dictionary reader.
 *
 * <p>The FST directly maps each term and its metadata; it is memory resident.
 *
 * @lucene.experimental
 */
public class FSTTermsReader extends FieldsProducer {
  private final TreeMap<String, TermsReader> fields = new TreeMap<>();
  private final PostingsReaderBase postingsReader;
  private final IndexInput fstTermsInput;

  public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
      throws IOException {
    final String termsFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);

    this.postingsReader = postingsReader;
    this.fstTermsInput =
        state.directory.openInput(
            termsFileName, state.context.withReadAdvice(ReadAdvice.RANDOM_PRELOAD));
    IndexInput in = this.fstTermsInput;

    boolean success = false;
    try {
      CodecUtil.checkIndexHeader(
          in,
          FSTTermsWriter.TERMS_CODEC_NAME,
          FSTTermsWriter.TERMS_VERSION_START,
          FSTTermsWriter.TERMS_VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      CodecUtil.checksumEntireFile(in);
      this.postingsReader.init(in, state);
      seekDir(in);

      final FieldInfos fieldInfos = state.fieldInfos;
      final int numFields = in.readVInt();
      for (int i = 0; i < numFields; i++) {
        int fieldNumber = in.readVInt();
        FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
        long numTerms = in.readVLong();
        long sumTotalTermFreq = in.readVLong();
        // if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value
        long sumDocFreq =
            fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong();
        int docCount = in.readVInt();
        TermsReader current =
            new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount);
        TermsReader previous = fields.put(fieldInfo.name, current);
        checkFieldSummary(state.segmentInfo, in, current, previous);
      }
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(in);
      }
    }
  }

  private void seekDir(IndexInput in) throws IOException {
    in.seek(in.length() - CodecUtil.footerLength() - 8);
    in.seek(in.readLong());
  }

  private void checkFieldSummary(
      SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous)
      throws IOException {
    // #docs with field must be <= #docs
    if (field.docCount < 0 || field.docCount > info.maxDoc()) {
      throw new CorruptIndexException(
          "invalid docCount: " + field.docCount + " maxDoc: " + info.maxDoc(), in);
    }
    // #postings must be >= #docs with field
    if (field.sumDocFreq < field.docCount) {
      throw new CorruptIndexException(
          "invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount, in);
    }
    // #positions must be >= #postings
    if (field.sumTotalTermFreq < field.sumDocFreq) {
      throw new CorruptIndexException(
          "invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq,
          in);
    }
    if (previous != null) {
      throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name, in);
    }
  }

  @Override
  public Iterator<String> iterator() {
    return Collections.unmodifiableSet(fields.keySet()).iterator();
  }

  @Override
  public Terms terms(String field) throws IOException {
    assert field != null;
    return fields.get(field);
  }

  @Override
  public int size() {
    return fields.size();
  }

  @Override
  public void close() throws IOException {
    try {
      IOUtils.close(postingsReader, fstTermsInput);
    } finally {
      fields.clear();
    }
  }
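
  // File layout as read by the constructor above: a codec header, then the
  // postings reader's own header (postingsReader.init), then the per-field
  // summaries, each followed inline by that field's FST, and finally a codec
  // footer. The file pointer to the field-summary section is stored as a
  // fixed 8-byte long immediately before the footer, which is why seekDir()
  // first seeks to in.length() - CodecUtil.footerLength() - 8 and then
  // follows the long it reads there.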

  final class TermsReader extends Terms {
    final FieldInfo fieldInfo;
    final long numTerms;
    final long sumTotalTermFreq;
    final long sumDocFreq;
    final int docCount;
    final FST<FSTTermOutputs.TermData> dict;

    TermsReader(
        FieldInfo fieldInfo,
        IndexInput in,
        long numTerms,
        long sumTotalTermFreq,
        long sumDocFreq,
        int docCount)
        throws IOException {
      this.fieldInfo = fieldInfo;
      this.numTerms = numTerms;
      this.sumTotalTermFreq = sumTotalTermFreq;
      this.sumDocFreq = sumDocFreq;
      this.docCount = docCount;
      FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
      final var fstMetadata = FST.readMetadata(in, outputs);
      OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
      this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
      in.skipBytes(offHeapFSTStore.size());
    }

    @Override
    public String toString() {
      return "FSTTerms(terms="
          + numTerms
          + ",postings="
          + sumDocFreq
          + ",positions="
          + sumTotalTermFreq
          + ",docs="
          + docCount
          + ")";
    }

    @Override
    public boolean hasFreqs() {
      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    }

    @Override
    public boolean hasOffsets() {
      return fieldInfo
              .getIndexOptions()
              .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
          >= 0;
    }

    @Override
    public boolean hasPositions() {
      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    }

    @Override
    public boolean hasPayloads() {
      return fieldInfo.hasPayloads();
    }

    @Override
    public long size() {
      return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() {
      return sumTotalTermFreq;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return sumDocFreq;
    }

    @Override
    public int getDocCount() throws IOException {
      return docCount;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      return new SegmentTermsEnum();
    }

    @Override
    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
      if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
        throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
      }
      return new IntersectTermsEnum(compiled, startTerm);
    }
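
    // Term metadata is decoded lazily: the FST output for each term
    // (FSTTermOutputs.TermData) carries docFreq/totalTermFreq plus an opaque
    // byte[] that only the postings reader can interpret. The enums below
    // defer postingsReader.decodeTerm(...) until postings, impacts, or a
    // TermState are actually requested, so plain term iteration never pays
    // for metadata decoding. A consumption sketch:
    //
    //   TermsEnum te = iterator();
    //   for (BytesRef t = te.next(); t != null; t = te.next()) {
    //     PostingsEnum pe = te.postings(null, PostingsEnum.FREQS); // decodes here
    //   }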

    // Only wraps common operations for interacting with the postings reader (PBF)
    abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum {
      /* Current term stats + decoded metadata (customized by PBF) */
      final BlockTermState state;

      /* Current term stats + undecoded metadata (long[] & byte[]) */
      FSTTermOutputs.TermData meta;

      ByteArrayDataInput bytesReader;

      /** Decodes metadata into customized term state */
      abstract void decodeMetaData() throws IOException;

      BaseTermsEnum() throws IOException {
        this.state = postingsReader.newTermState();
        this.bytesReader = new ByteArrayDataInput();
        // NOTE: metadata will only be initialized in child class
      }

      @Override
      public TermState termState() throws IOException {
        decodeMetaData();
        return state.clone();
      }

      @Override
      public int docFreq() throws IOException {
        return state.docFreq;
      }

      @Override
      public long totalTermFreq() throws IOException {
        return state.totalTermFreq == -1 ? state.docFreq : state.totalTermFreq;
      }

      @Override
      public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
        decodeMetaData();
        return postingsReader.postings(fieldInfo, state, reuse, flags);
      }

      @Override
      public ImpactsEnum impacts(int flags) throws IOException {
        decodeMetaData();
        return postingsReader.impacts(fieldInfo, state, flags);
      }

      @Override
      public void seekExact(long ord) throws IOException {
        throw new UnsupportedOperationException();
      }

      @Override
      public long ord() {
        throw new UnsupportedOperationException();
      }
    }

    // Iterates through all terms in this field
    private final class SegmentTermsEnum extends BaseTermsEnum {
      /* Current term, null when enum ends or unpositioned */
      BytesRef term;

      final BytesRefFSTEnum<FSTTermOutputs.TermData> fstEnum;

      /* True when current term's metadata is decoded */
      boolean decoded;

      /* True when current enum is 'positioned' by seekExact(TermState) */
      boolean seekPending;

      SegmentTermsEnum() throws IOException {
        super();
        this.fstEnum = new BytesRefFSTEnum<>(dict);
        this.decoded = false;
        this.seekPending = false;
        this.meta = null;
      }

      @Override
      public BytesRef term() throws IOException {
        return term;
      }

      // Let PBF decode metadata from long[] and byte[]
      @Override
      void decodeMetaData() throws IOException {
        if (!decoded && !seekPending) {
          if (meta.bytes != null) {
            bytesReader.reset(meta.bytes, 0, meta.bytes.length);
          }
          postingsReader.decodeTerm(bytesReader, fieldInfo, state, true);
          decoded = true;
        }
      }

      // Update current enum according to FSTEnum
      void updateEnum(final InputOutput<FSTTermOutputs.TermData> pair) {
        if (pair == null) {
          term = null;
        } else {
          term = pair.input;
          meta = pair.output;
          state.docFreq = meta.docFreq;
          state.totalTermFreq = meta.totalTermFreq;
        }
        decoded = false;
        seekPending = false;
      }

      @Override
      public BytesRef next() throws IOException {
        if (seekPending) { // previously positioned, but termOutputs not fetched
          seekPending = false;
          SeekStatus status = seekCeil(term);
          assert status == SeekStatus.FOUND; // must be positioned on a valid term
        }
        updateEnum(fstEnum.next());
        return term;
      }

      @Override
      public boolean seekExact(BytesRef target) throws IOException {
        updateEnum(fstEnum.seekExact(target));
        return term != null;
      }

      @Override
      public SeekStatus seekCeil(BytesRef target) throws IOException {
        updateEnum(fstEnum.seekCeil(target));
        if (term == null) {
          return SeekStatus.END;
        } else {
          return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
        }
      }

      @Override
      public void seekExact(BytesRef target, TermState otherState) {
        if (!target.equals(term)) {
          state.copyFrom(otherState);
          term = BytesRef.deepCopyOf(target);
          seekPending = true;
        }
      }
    }
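
    // The intersect enum below walks the term FST and the query automaton in
    // lockstep: each Frame pairs one FST arc with the automaton state reached
    // by stepping on that arc's label, so the frame stack as a whole spells
    // out the current term. next() is a depth-first search over this product;
    // doSeekCeil() positions the stack on the smallest term >= the target
    // that the automaton does not reject.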

    // Iterates intersect result with automaton (cannot seek!)
    private final class IntersectTermsEnum extends BaseTermsEnum {
      /* Current term, null when enum ends or unpositioned */
      BytesRefBuilder term;

      /* True when current term's metadata is decoded */
      boolean decoded;

      /* True when there is pending term when calling next() */
      boolean pending;

      /* stack to record how current term is constructed,
       * used to accumulate metadata or rewind term:
       *   level == term.length + 1,
       *         == 0 when term is null */
      Frame[] stack;
      int level;

      /* to which level the metadata is accumulated
       * so that we can accumulate metadata lazily */
      int metaUpto;

      /* term dict fst */
      final FST<FSTTermOutputs.TermData> fst;
      final FST.BytesReader fstReader;
      final Outputs<FSTTermOutputs.TermData> fstOutputs;

      /* query automaton to intersect with */
      final ByteRunnable fsa;

      private final class Frame {
        /* fst stats */
        FST.Arc<FSTTermOutputs.TermData> fstArc;

        FSTTermOutputs.TermData output;

        /* automaton stats */
        int fsaState;

        Frame() {
          this.fstArc = new FST.Arc<>();
          this.fsaState = -1;
        }

        @Override
        public String toString() {
          return "arc=" + fstArc + " state=" + fsaState;
        }
      }

      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
        super();
        // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
        this.fst = dict;
        this.fstReader = fst.getBytesReader();
        this.fstOutputs = dict.outputs;
        this.fsa = compiled.getByteRunnable();
        this.level = -1;
        this.stack = new Frame[16];
        for (int i = 0; i < stack.length; i++) {
          this.stack[i] = new Frame();
        }

        loadVirtualFrame(newFrame());
        this.level++;
        pushFrame(loadFirstFrame(newFrame()));

        this.meta = null;
        this.metaUpto = 1;
        this.decoded = false;
        this.pending = false;

        if (startTerm == null) {
          pending = isAccept(topFrame());
        } else {
          doSeekCeil(startTerm);
          pending =
              (term == null || !startTerm.equals(term.get()))
                  && isValid(topFrame())
                  && isAccept(topFrame());
        }
      }

      @Override
      public BytesRef term() throws IOException {
        return term == null ? null : term.get();
      }

      @Override
      void decodeMetaData() throws IOException {
        assert term != null;
        if (!decoded) {
          if (meta.bytes != null) {
            bytesReader.reset(meta.bytes, 0, meta.bytes.length);
          }
          postingsReader.decodeTerm(bytesReader, fieldInfo, state, true);
          decoded = true;
        }
      }

      /** Lazily accumulate metadata when we reach an accepted term */
      void loadMetaData() {
        Frame last, next;
        last = stack[metaUpto];
        while (metaUpto != level) {
          metaUpto++;
          next = stack[metaUpto];
          next.output = fstOutputs.add(next.output, last.output);
          last = next;
        }
        if (last.fstArc.isFinal()) {
          meta = fstOutputs.add(last.output, last.fstArc.nextFinalOutput());
        } else {
          meta = last.output;
        }
        state.docFreq = meta.docFreq;
        state.totalTermFreq = meta.totalTermFreq;
      }
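
      // metaUpto records how far down the stack outputs have already been
      // summed: loadMetaData() folds outputs from stack[metaUpto] down to
      // stack[level], and popFrame() clamps metaUpto back to the new level,
      // so rewinding invalidates only the levels that actually changed.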

      @Override
      public SeekStatus seekCeil(BytesRef target) throws IOException {
        decoded = false;
        doSeekCeil(target);
        loadMetaData();
        if (term == null) {
          return SeekStatus.END;
        } else {
          return term.get().equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
        }
      }

      @Override
      public BytesRef next() throws IOException {
        // if (TEST) System.out.println("Enum next()");
        if (pending) {
          pending = false;
          loadMetaData();
          return term();
        }
        decoded = false;
        DFS:
        while (level > 0) {
          Frame frame = newFrame();
          if (loadExpandFrame(topFrame(), frame) != null) { // has valid target
            pushFrame(frame);
            if (isAccept(frame)) { // gotcha
              break;
            }
            continue; // check next target
          }
          frame = popFrame();
          while (level > 0) {
            if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling
              pushFrame(frame);
              if (isAccept(frame)) { // gotcha
                break DFS;
              }
              continue DFS; // check next target
            }
            frame = popFrame();
          }
          return null;
        }
        loadMetaData();
        return term();
      }
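
      // The DFS above first tries to extend the current term by one label
      // (loadExpandFrame); when no extension survives both the FST and the
      // automaton, it pops frames and tries sibling arcs (loadNextFrame)
      // until it lands on an accepted frame or exhausts the stack.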

      private BytesRef doSeekCeil(BytesRef target) throws IOException {
        // if (TEST) System.out.println("Enum doSeekCeil()");
        Frame frame = null;
        int label, upto = 0, limit = target.length;
        while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
          frame = newFrame();
          label = target.bytes[target.offset + upto] & 0xff;
          frame = loadCeilFrame(label, topFrame(), frame);
          if (frame == null || frame.fstArc.label() != label) {
            break;
          }
          assert isValid(frame); // target must be fetched from automaton
          pushFrame(frame);
          upto++;
        }
        if (upto == limit) { // got target
          return term();
        }
        if (frame != null) { // got larger term('s prefix)
          pushFrame(frame);
          return isAccept(frame) ? term() : next();
        }
        while (level > 0) { // got target's prefix, advance to larger term
          frame = popFrame();
          while (level > 0 && !canRewind(frame)) {
            frame = popFrame();
          }
          if (loadNextFrame(topFrame(), frame) != null) {
            pushFrame(frame);
            return isAccept(frame) ? term() : next();
          }
        }
        return null;
      }

      /** Virtual frame, never pop */
      Frame loadVirtualFrame(Frame frame) {
        frame.output = fstOutputs.getNoOutput();
        frame.fsaState = -1;
        return frame;
      }

      /** Load frame for start arc(node) on fst */
      Frame loadFirstFrame(Frame frame) throws IOException {
        frame.fstArc = fst.getFirstArc(frame.fstArc);
        frame.output = frame.fstArc.output();
        frame.fsaState = 0;
        return frame;
      }

      /** Load frame for target arc(node) on fst */
      Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
        if (!canGrow(top)) {
          return null;
        }
        frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader);
        frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label());
        // if (TEST) System.out.println(" loadExpand frame=" + frame);
        if (frame.fsaState == -1) {
          return loadNextFrame(top, frame);
        }
        frame.output = frame.fstArc.output();
        return frame;
      }

      /** Load frame for sibling arc(node) on fst */
      Frame loadNextFrame(Frame top, Frame frame) throws IOException {
        if (!canRewind(frame)) {
          return null;
        }
        while (!frame.fstArc.isLast()) {
          frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
          frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label());
          if (frame.fsaState != -1) {
            break;
          }
        }
        // if (TEST) System.out.println(" loadNext frame=" + frame);
        if (frame.fsaState == -1) {
          return null;
        }
        frame.output = frame.fstArc.output();
        return frame;
      }

      /**
       * Load frame for target arc(node) on fst, so that arc.label >= label and
       * !fsa.reject(arc.label)
       */
      Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
        FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
        arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
        if (arc == null) {
          return null;
        }
        frame.fsaState = fsa.step(top.fsaState, arc.label());
        // if (TEST) System.out.println(" loadCeil frame=" + frame);
        if (frame.fsaState == -1) {
          return loadNextFrame(top, frame);
        }
        frame.output = frame.fstArc.output();
        return frame;
      }

      boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts
        return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal();
      }

      boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject
        return /*frame != null &&*/ frame.fsaState != -1;
      }

      boolean canGrow(Frame frame) { // can walk forward on both fst&fsa
        return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
      }

      boolean canRewind(Frame frame) { // can jump to sibling
        return !frame.fstArc.isLast();
      }

      void pushFrame(Frame frame) {
        term = grow(frame.fstArc.label());
        level++;
        // if (TEST) System.out.println("  term=" + term + " level=" + level);
      }

      Frame popFrame() {
        term = shrink();
        level--;
        metaUpto = metaUpto > level ? level : metaUpto;
        // if (TEST) System.out.println("  term=" + term + " level=" + level);
        return stack[level + 1];
      }

      Frame newFrame() {
        if (level + 1 == stack.length) {
          final Frame[] temp =
              new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
          System.arraycopy(stack, 0, temp, 0, stack.length);
          for (int i = stack.length; i < temp.length; i++) {
            temp[i] = new Frame();
          }
          stack = temp;
        }
        return stack[level + 1];
      }

      Frame topFrame() {
        return stack[level];
      }

      BytesRefBuilder grow(int label) {
        if (term == null) {
          term = new BytesRefBuilder();
        } else {
          term.append((byte) label);
        }
        return term;
      }

      BytesRefBuilder shrink() {
        if (term.length() == 0) {
          term = null;
        } else {
          term.setLength(term.length() - 1);
        }
        return term;
      }
    }
  }

  static <T> void walk(FST<T> fst) throws IOException {
    final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
    final BitSet seen = new BitSet();
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
    queue.add(startArc);
    while (!queue.isEmpty()) {
      final FST.Arc<T> arc = queue.remove(0);
      final long node = arc.target();
      // System.out.println(arc);
      if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
        seen.set((int) node);
        fst.readFirstRealTargetArc(node, arc, reader);
        while (true) {
          queue.add(new FST.Arc<T>().copyFrom(arc));
          if (arc.isLast()) {
            break;
          } else {
            fst.readNextRealArc(arc, reader);
          }
        }
      }
    }
  }

  @Override
  public String toString() {
    return getClass().getSimpleName()
        + "(fields="
        + fields.size()
        + ",delegate="
        + postingsReader
        + ")";
  }

  @Override
  public void checkIntegrity() throws IOException {
    postingsReader.checkIntegrity();
  }
}
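
// Usage sketch (illustrative; not part of this file). FSTTermsReader is
// normally created by a PostingsFormat's fieldsProducer() rather than being
// constructed directly; in this package, FSTPostingsFormat pairs it with the
// codec's default postings reader:
//
//   PostingsReaderBase postings = ...; // supplied by the surrounding codec
//   FieldsProducer producer = new FSTTermsReader(state, postings);
//   try {
//     Terms terms = producer.terms("body"); // "body" is a hypothetical field
//     TermsEnum it = terms.iterator();
//     for (BytesRef term = it.next(); term != null; term = it.next()) {
//       // it.postings(...) decodes this term's metadata on demand
//     }
//   } finally {
//     producer.close(); // also closes the delegate postings reader
//   }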