package org.apache.lucene.codecs.pulsing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined. Though this is
// presumably rare in practice...
/**
* Writer for the pulsing format.
*
 * Wraps another postings implementation and decides,
 * based on the total number of occurrences, whether a term's
 * postings should be inlined into the term dictionary
 * or passed through to the wrapped writer.
*
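 * <p>A minimal usage sketch (illustrative, not taken from this file:
 * the cutoff of 1 and the Lucene40PostingsWriter delegate are example
 * choices; any PostingsWriterBase may be wrapped):
 * <pre>
 * PostingsWriterBase wrapped = new Lucene40PostingsWriter(writeState);
 * // inline terms whose total position count is 1 (e.g. primary keys):
 * PostingsWriterBase pulsing = new PulsingPostingsWriter(1, wrapped);
 * </pre>
 *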
* @lucene.experimental */
public final class PulsingPostingsWriter extends PostingsWriterBase {
final static String CODEC = "PulsedPostingsWriter";
// To add a new version, increment from the last one, and
// change VERSION_CURRENT to point to your new version:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
private IndexOutput termsOut;
private IndexOptions indexOptions;
private boolean storePayloads;
private static class PendingTerm {
private final byte[] bytes;
public PendingTerm(byte[] bytes) {
this.bytes = bytes;
}
}
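// One entry per finished term in the current terms block: the term's
// inlined postings bytes, or null if the wrapped writer handled it: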
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
// one entry per position
private final Position[] pending;
private int pendingCount = 0; // -1 once we've hit too many positions
private Position currentDoc; // first Position entry of current doc
private static final class Position {
BytesRef payload;
int termFreq; // only incremented on first position for a given doc
int pos;
int docID;
int startOffset;
int endOffset;
}
// TODO: -- lazy init this? ie, if every single term
// was inlined (eg for a "primary key" field) then we
// never need to use this fallback? Fallback writer for
// non-inlined terms:
final PostingsWriterBase wrappedPostingsWriter;
/** If the total number of positions (summed across all docs
* for this term) is <= maxPositions, then the postings are
* inlined into terms dict */
public PulsingPostingsWriter(int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
pending = new Position[maxPositions];
for(int i=0;i<maxPositions;i++) {
  pending[i] = new Position();
}

// We simply wrap another postings writer, but only call
// on it when the total number of positions is >= the cutoff:
this.wrappedPostingsWriter = wrappedPostingsWriter;
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
termsOut.writeVInt(pending.length); // encode maxPositions in header
wrappedPostingsWriter.start(termsOut);
}
@Override
public void startTerm() {
//if (DEBUG) System.out.println("PW startTerm");
assert pendingCount == 0;
}
// TODO: -- should we NOT reuse across fields? would
// be cleaner
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.getIndexOptions();
//if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
storePayloads = fieldInfo.hasPayloads();
wrappedPostingsWriter.setField(fieldInfo);
//DEBUG = BlockTreeTermsWriter.DEBUG;
}
private boolean DEBUG;
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
assert docID >= 0: "got docID=" + docID;
/*
if (termID != -1) {
if (docID == 0) {
baseDocID = termID;
} else if (baseDocID + docID != termID) {
throw new RuntimeException("WRITE: baseDocID=" + baseDocID + " docID=" + docID + " termID=" + termID);
}
}
*/
//if (DEBUG) System.out.println("PW doc=" + docID);
if (pendingCount == pending.length) {
push();
//if (DEBUG) System.out.println("PW: wrapped.finishDoc");
wrappedPostingsWriter.finishDoc();
}
if (pendingCount != -1) {
assert pendingCount < pending.length;
currentDoc = pending[pendingCount];
currentDoc.docID = docID;
if (indexOptions == IndexOptions.DOCS_ONLY) {
pendingCount++;
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
pendingCount++;
currentDoc.termFreq = termDocFreq;
} else {
currentDoc.termFreq = termDocFreq;
}
} else {
// We've already seen too many docs for this term --
// just forward to our fallback writer
wrappedPostingsWriter.startDoc(docID, termDocFreq);
}
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("PW pos=" + position + " payload=" + (payload == null ? "null" : payload.length + " bytes"));
if (pendingCount == pending.length) {
push();
}
if (pendingCount == -1) {
// We've already seen too many docs for this term --
// just forward to our fallback writer
wrappedPostingsWriter.addPosition(position, payload, startOffset, endOffset);
} else {
// buffer up
final Position pos = pending[pendingCount++];
pos.pos = position;
pos.startOffset = startOffset;
pos.endOffset = endOffset;
pos.docID = currentDoc.docID;
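// Reuse this slot's payload instance across terms where possible,
// to avoid allocating a new BytesRef per buffered position: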
if (payload != null && payload.length > 0) {
if (pos.payload == null) {
pos.payload = BytesRef.deepCopyOf(payload);
} else {
pos.payload.copyBytes(payload);
}
} else if (pos.payload != null) {
pos.payload.length = 0;
}
}
}
@Override
public void finishDoc() throws IOException {
// if (DEBUG) System.out.println("PW finishDoc");
if (pendingCount == -1) {
wrappedPostingsWriter.finishDoc();
}
}
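// Scratch buffer where a term's inlined postings are encoded before
// being copied into pendingTerms and flushed to the terms dict: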
private final RAMOutputStream buffer = new RAMOutputStream();
// private int baseDocID;
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(TermStats stats) throws IOException {
// if (DEBUG) System.out.println("PW finishTerm docCount=" + stats.docFreq + " pendingCount=" + pendingCount + " pendingTerms.size()=" + pendingTerms.size());
assert pendingCount > 0 || pendingCount == -1;
if (pendingCount == -1) {
wrappedPostingsWriter.finishTerm(stats);
// Must add null entry to record terms that our
// wrapped postings impl added
pendingTerms.add(null);
} else {
// There were few enough total occurrences for this
// term, so we fully inline our postings data into
// terms dict, now:
// TODO: it'd be better to share this encoding logic
// in some inner codec that knows how to write a
// single doc / single position, etc. This way if a
// given codec wants to store other interesting
// stuff, it could use this pulsing codec to do so
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
int lastDocID = 0;
int pendingIDX = 0;
int lastPayloadLength = -1;
int lastOffsetLength = -1;
while(pendingIDX < pendingCount) {
final Position doc = pending[pendingIDX];
final int delta = doc.docID - lastDocID;
lastDocID = doc.docID;
// if (DEBUG) System.out.println(" write doc=" + doc.docID + " freq=" + doc.termFreq);
if (doc.termFreq == 1) {
buffer.writeVInt((delta<<1)|1);
} else {
buffer.writeVInt(delta<<1);
buffer.writeVInt(doc.termFreq);
}
int lastPos = 0;
int lastOffset = 0;
for(int posIDX=0;posIDX<doc.termFreq;posIDX++) {
  final Position pos = pending[pendingIDX++];
  assert pos.docID == doc.docID;
  final int posDelta = pos.pos - lastPos;
  lastPos = pos.pos;
  final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
  if (storePayloads) {
    if (payloadLength != lastPayloadLength) {
      // Low bit set flags a payload length change, so unchanged
      // lengths need not be rewritten per position:
      buffer.writeVInt((posDelta<<1)|1);
      buffer.writeVInt(payloadLength);
      lastPayloadLength = payloadLength;
    } else {
      buffer.writeVInt(posDelta<<1);
    }
  } else {
    buffer.writeVInt(posDelta);
  }
  if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
    // Same trick for offsets: low bit flags an offset length change:
    int offsetDelta = pos.startOffset - lastOffset;
    int offsetLength = pos.endOffset - pos.startOffset;
    if (offsetLength != lastOffsetLength) {
      buffer.writeVInt(offsetDelta << 1 | 1);
      buffer.writeVInt(offsetLength);
    } else {
      buffer.writeVInt(offsetDelta << 1);
    }
    lastOffset = pos.startOffset;
    lastOffsetLength = offsetLength;
  }
  if (payloadLength > 0) {
assert storePayloads;
buffer.writeBytes(pos.payload.bytes, 0, pos.payload.length);
}
}
}
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
int lastDocID = 0;
for(int posIDX=0;posIDX<pendingCount;posIDX++) {
  final Position doc = pending[posIDX];
  if (doc.termFreq == 1) {
    buffer.writeVInt(((doc.docID - lastDocID) << 1) | 1);
  } else {
    buffer.writeVInt((doc.docID - lastDocID) << 1);
    buffer.writeVInt(doc.termFreq);
  }
  lastDocID = doc.docID;
}
} else if (indexOptions == IndexOptions.DOCS_ONLY) {
int lastDocID = 0;
for(int posIDX=0;posIDX<pendingCount;posIDX++) {
  final Position doc = pending[posIDX];
  buffer.writeVInt(doc.docID - lastDocID);
  lastDocID = doc.docID;
}
}

// Copy the encoded postings out of the scratch buffer and record
// them as this term's inlined bytes:
final byte[] bytes = new byte[(int) buffer.getFilePointer()];
buffer.writeTo(bytes, 0);
buffer.reset();

pendingTerms.add(new PendingTerm(bytes));
}

pendingCount = 0;
}

@Override
public void close() throws IOException {
wrappedPostingsWriter.close();
}

@Override
public void flushTermsBlock(int start, int count) throws IOException {
// if (DEBUG) System.out.println("PW: flushTermsBlock start=" + start + " count=" + count + " pendingTerms.size()=" + pendingTerms.size());
int wrappedCount = 0;
assert buffer.getFilePointer() == 0;
assert start >= count;
final int limit = pendingTerms.size() - start + count;
for(int idx=pendingTerms.size()-start; idx<limit; idx++) {
  final PendingTerm term = pendingTerms.get(idx);
  if (term == null) {
    wrappedCount++;
  } else {
    buffer.writeVInt(term.bytes.length);
    buffer.writeBytes(term.bytes, 0, term.bytes.length);
  }
}

termsOut.writeVInt((int) buffer.getFilePointer());
buffer.writeTo(termsOut);
buffer.reset();

// TODO: this could be somewhat costly since
// pendingTerms.size() could be biggish?
int futureWrappedCount = 0;
for(int idx=limit;idx<pendingTerms.size();idx++) {
  if (pendingTerms.get(idx) == null) {
    futureWrappedCount++;
  }
}

// Remove the terms we just wrote:
pendingTerms.subList(pendingTerms.size()-start, limit).clear();

// TODO: can we avoid calling this if all terms
// were inlined...?  Eg the seek into the skip
// dictionary would then not be "bloated" by these
// skip nums
wrappedPostingsWriter.flushTermsBlock(futureWrappedCount+wrappedCount, wrappedCount);
}

// Pushes pending positions to the wrapped codec
private void push() throws IOException {
// if (DEBUG) System.out.println("PW now push @ " + pendingCount + " wrapped=" + wrappedPostingsWriter);
assert pendingCount == pending.length;

wrappedPostingsWriter.startTerm();

// Flush all buffered docs
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
  Position doc = null;
  for(Position pos : pending) {
    if (doc == null) {
      doc = pos;
      wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
    } else if (doc.docID != pos.docID) {
      assert pos.docID > doc.docID;
      wrappedPostingsWriter.finishDoc();
      doc = pos;
      wrappedPostingsWriter.startDoc(doc.docID, doc.termFreq);
    }
    wrappedPostingsWriter.addPosition(pos.pos, pos.payload, pos.startOffset, pos.endOffset);
  }
} else {
  for(int docIDX=0;docIDX<pendingCount;docIDX++) {
    wrappedPostingsWriter.startDoc(pending[docIDX].docID, indexOptions == IndexOptions.DOCS_ONLY ? 0 : pending[docIDX].termFreq);
    wrappedPostingsWriter.finishDoc();
  }
}

pendingCount = -1;
}
}