org.apache.lucene.codecs.pulsing.PulsingPostingsWriter
package org.apache.lucene.codecs.pulsing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined.  Though this is
// presumably rare in practice...

/** 
 * Writer for the pulsing format. 
 * <p>
 * Wraps another postings implementation and decides 
 * (based on total number of occurrences), whether a term's 
 * postings should be inlined into the term dictionary,
 * or passed through to the wrapped writer.
 *
 * @lucene.experimental */
public final class PulsingPostingsWriter extends PostingsWriterBase {

  final static String CODEC = "PulsedPostingsWriter";

  // recording field summary
  final static String SUMMARY_EXTENSION = "smy";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  final static int VERSION_START = 0;
  final static int VERSION_META_ARRAY = 1;
  final static int VERSION_CURRENT = VERSION_META_ARRAY;

  private SegmentWriteState segmentState;
  private IndexOutput termsOut;

  private List<FieldMetaData> fields;
  private IndexOptions indexOptions;
  private boolean storePayloads;

  // information for wrapped PF, in current field
  private int longsSize;
  private long[] longs;
  boolean absolute;

  private static class PulsingTermState extends BlockTermState {
    private byte[] bytes;
    private BlockTermState wrappedState;

    @Override
    public String toString() {
      if (bytes != null) {
        return "inlined";
      } else {
        return "not inlined wrapped=" + wrappedState;
      }
    }
  }

  // one entry per position
  private final Position[] pending;
  private int pendingCount = 0;             // -1 once we've hit too many positions
  private Position currentDoc;              // first Position entry of current doc

  private static final class Position {
    BytesRef payload;
    int termFreq;                           // only incremented on first position for a given doc
    int pos;
    int docID;
    int startOffset;
    int endOffset;
  }

  private static final class FieldMetaData {
    int fieldNumber;
    int longsSize;
    FieldMetaData(int number, int size) {
      fieldNumber = number;
      longsSize = size;
    }
  }

  // TODO: -- lazy init this?  ie, if every single term
  // was inlined (eg for a "primary key" field) then we
  // never need to use this fallback?

  // Fallback writer for non-inlined terms:
  final PostingsWriterBase wrappedPostingsWriter;

  /** If the total number of positions (summed across all docs
   *  for this term) is <= maxPositions, then the postings are
   *  inlined into terms dict */
  public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
    pending = new Position[maxPositions];
    for(int i=0;i<maxPositions;i++) {
      pending[i] = new Position();
    }
    fields = new ArrayList<FieldMetaData>();

    // We simply wrap another postings writer, but only call
    // on it when tot positions is >= the cutoff:
    this.wrappedPostingsWriter = wrappedPostingsWriter;
    this.segmentState = state;
  }

  @Override
  public void init(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(pending.length); // encode maxPositions in header
    wrappedPostingsWriter.init(termsOut);
  }

  @Override
  public BlockTermState newTermState() throws IOException {
    PulsingTermState state = new PulsingTermState();
    state.wrappedState = wrappedPostingsWriter.newTermState();
    return state;
  }

  @Override
  public void startTerm() {
    //if (DEBUG) System.out.println("PW   startTerm");
    assert pendingCount == 0;
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public int setField(FieldInfo fieldInfo) {
    this.indexOptions = fieldInfo.getIndexOptions();
    //if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
    storePayloads = fieldInfo.hasPayloads();
    absolute = false;
    longsSize = wrappedPostingsWriter.setField(fieldInfo);
    longs = new long[longsSize];
    fields.add(new FieldMetaData(fieldInfo.number, longsSize));
    return 0;
    //DEBUG = BlockTreeTermsWriter.DEBUG;
  }

  private boolean DEBUG;

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    assert docID >= 0: "got docID=" + docID;

    /*
    if (termID != -1) {
      if (docID == 0) {
        baseDocID = termID;
      } else if (baseDocID + docID != termID) {
        throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID);
      }
    }
    */

    //if (DEBUG) System.out.println("PW doc=" + docID);

    if (pendingCount == pending.length) {
      push();
      //if (DEBUG) System.out.println("PW: wrapped.finishDoc");
      wrappedPostingsWriter.finishDoc();
    }

    if (pendingCount != -1) {
      assert pendingCount < pending.length;
      currentDoc = pending[pendingCount];
      currentDoc.docID = docID;
      if (indexOptions == IndexOptions.DOCS_ONLY) {
        pendingCount++;
      } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
        pendingCount++;
        currentDoc.termFreq = termDocFreq;
      } else {
        currentDoc.termFreq = termDocFreq;
      }
    } else {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.startDoc(docID, termDocFreq);
    }
  }

  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {

    //if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
    if (pendingCount == pending.length) {
      push();
    }

    if (pendingCount == -1) {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
    } else {
      // buffer up
      final Position pos = pending[pendingCount++];
      pos.pos = position;
      pos.startOffset = startOffset;
      pos.endOffset = endOffset;
      pos.docID = currentDoc.docID;
      if (payload != null && payload.length > 0) {
        if (pos.payload == null) {
          pos.payload = BytesRef.deepCopyOf(payload);
        } else {
          pos.payload.copyBytes(payload);
        }
      } else if (pos.payload != null) {
        pos.payload.length = 0;
      }
    }
  }

  @Override
  public void finishDoc() throws IOException {
    // if (DEBUG) System.out.println("PW finishDoc");
    if (pendingCount == -1) {
      wrappedPostingsWriter.finishDoc();
    }
  }

  private final RAMOutputStream buffer = new RAMOutputStream();

  // private int baseDocID;

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(BlockTermState _state) throws IOException {
    PulsingTermState state = (PulsingTermState) _state;

    // if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());

    assert pendingCount > 0 || pendingCount == -1;

    if (pendingCount == -1) {
      state.wrappedState.docFreq = state.docFreq;
      state.wrappedState.totalTermFreq = state.totalTermFreq;
      state.bytes = null;
      wrappedPostingsWriter.finishTerm(state.wrappedState);
    } else {
      // There were few enough total occurrences for this
      // term, so we fully inline our postings data into
      // terms dict, now:

      // TODO: it'd be better to share this encoding logic
      // in some inner codec that knows how to write a
      // single doc / single position, etc.  This way if a
      // given codec wants to store other interesting
      // stuff, it could use this pulsing codec to do so

      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
        int lastDocID = 0;
        int pendingIDX = 0;
        int lastPayloadLength = -1;
        int lastOffsetLength = -1;
        while(pendingIDX < pendingCount) {
          final Position doc = pending[pendingIDX];

          final int delta = doc.docID - lastDocID;
          lastDocID = doc.docID;

          // if (DEBUG) System.out.println("  write doc=" + doc.docID + " freq=" + doc.termFreq);

          if (doc.termFreq == 1) {
            buffer.writeVInt((delta<<1)|1);
          } else {
            buffer.writeVInt(delta<<1);
            buffer.writeVInt(doc.termFreq);
          }

          int lastPos = 0;
          int lastOffset = 0;
          for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
            final Position pos = pending[pendingIDX++];
            assert pos.docID == doc.docID;
            final int posDelta = pos.pos - lastPos;
            lastPos = pos.pos;
            final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
            if (storePayloads) {
              if (payloadLength != lastPayloadLength) {
                buffer.writeVInt((posDelta << 1)|1);
                buffer.writeVInt(payloadLength);
                lastPayloadLength = payloadLength;
              } else {
                buffer.writeVInt(posDelta << 1);
              }
            } else {
              buffer.writeVInt(posDelta);
            }

            if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
              int offsetDelta = pos.startOffset - lastOffset;
              int offsetLength = pos.endOffset - pos.startOffset;
              if (offsetLength != lastOffsetLength) {
                buffer.writeVInt(offsetDelta << 1 | 1);
                buffer.writeVInt(offsetLength);
              } else {
                buffer.writeVInt(offsetDelta << 1);
              }
              lastOffset = pos.startOffset;
              lastOffsetLength = offsetLength;
            }

            if (payloadLength > 0) {
              assert storePayloads;
              buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
            }
          }
        }
      } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
        int lastDocID = 0;
        for(int posIDX=0;posIDX<pendingCount;posIDX++) {
          final Position doc = pending[posIDX];
          final int delta = doc.docID - lastDocID;
          assert doc.termFreq != 0;
          if (doc.termFreq == 1) {
            buffer.writeVInt((delta<<1)|1);
          } else {
            buffer.writeVInt(delta<<1);
            buffer.writeVInt(doc.termFreq);
          }
          lastDocID = doc.docID;
        }
      } else if (indexOptions == IndexOptions.DOCS_ONLY) {
        int lastDocID = 0;
        for(int posIDX=0;posIDX<pendingCount;posIDX++) {
          final Position doc = pending[posIDX];
          buffer.writeVInt(doc.docID - lastDocID);
          lastDocID = doc.docID;
        }
      }

      state.bytes = new byte[(int) buffer.getFilePointer()];
      buffer.writeTo(state.bytes, 0);
      buffer.reset();
    }
    pendingCount = 0;
  }

  @Override
  public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
    PulsingTermState state = (PulsingTermState)_state;
    assert empty.length == 0;
    this.absolute = this.absolute || absolute;
    if (state.bytes == null) {
      wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
      for (int i = 0; i < longsSize; i++) {
        out.writeVLong(longs[i]);
      }
      buffer.writeTo(out);
      buffer.reset();
      this.absolute = false;
    } else {
      out.writeVInt(state.bytes.length);
      out.writeBytes(state.bytes, 0, state.bytes.length);
      this.absolute = this.absolute || absolute;
    }
  }

  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
    if (wrappedPostingsWriter instanceof PulsingPostingsWriter ||
        VERSION_CURRENT < VERSION_META_ARRAY) {
      return;
    }
    String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
    IndexOutput out = null;
    try {
      out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
      CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
      out.writeVInt(fields.size());
      for (FieldMetaData field : fields) {
        out.writeVInt(field.fieldNumber);
        out.writeVInt(field.longsSize);
      }
      out.close();
    } finally {
      IOUtils.closeWhileHandlingException(out);
    }
  }

  // Pushes pending positions to the wrapped codec
  private void push() throws IOException {
    // if (DEBUG) System.out.println("PW now push @ " + pendingCount + " wrapped=" + wrappedPostingsWriter);
    assert pendingCount == pending.length;

    wrappedPostingsWriter.startTerm();

    // Flush all buffered docs
    if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
      Position doc = null;

      for(Position pos : pending) {
        if (doc == null) {
          doc = pos;
          wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
        } else if (doc.docID != pos.docID) {
          assert pos.docID > doc.docID;
          wrappedPostingsWriter.finishDoc();
          doc = pos;
          wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
        }
        wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
      }
    } else {
      for(Position doc : pending) {
        wrappedPostingsWriter.startDoc(doc.docID, indexOptions == IndexOptions.DOCS_ONLY ? 0 : doc.termFreq);
      }
    }
    pendingCount = -1;
  }
}


