All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.solr.handler.tagger.TermPrefixCursor Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.util.Map;

import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;

/**
 * Cursor into the terms that advances by prefix.
 *
 * <p>Every call to {@link #advance(BytesRef)} extends an internal prefix buffer by one word
 * (words are joined with {@code SEPARATOR_CHAR}) and seeks the wrapped {@link TermsEnum} to it.
 * When the prefix is itself an indexed term with at least one live document, those doc ids are
 * exposed through {@link #getDocIds()}. An optional cache, keyed by the prefix bytes, avoids
 * re-reading postings for prefixes that were already looked up.
 */
class TermPrefixCursor {

  //Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
  // Maybe that could be added to Lucene.

  // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?

  private static final byte SEPARATOR_CHAR = ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
  private static final IntsRef EMPTY_INTSREF = new IntsRef();

  private final TermsEnum termsEnum;
  private final Bits liveDocs; // may be null, in which case no doc is filtered out
  private final Map<BytesRef, IntsRef> docIdsCache; // may be null, in which case nothing is cached

  private BytesRef prefixBuf; // the prefix sought so far; null means "no prefix yet"
  private final BytesRefBuilder prefixBufBuilder = new BytesRefBuilder(); // owns prefixBuf's bytes once copied
  private boolean prefixBufOnLoan; // if true, prefixBuf is the caller's BytesRef and must be copied before keeping
  private PostingsEnum postingsEnum; // reused across seeks
  private IntsRef docIds; // result of the last seek; null when the prefix is not itself a matching term

  /**
   * @param termsEnum   the terms to seek within
   * @param liveDocs    live-docs filter applied when reading postings, or null for no filtering
   * @param docIdsCache cache from prefix bytes to doc ids (may hold empty values), or null to disable
   */
  TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
    this.termsEnum = termsEnum;
    this.liveDocs = liveDocs;
    this.docIdsCache = docIdsCache;
  }

  /**
   * Appends the separator char (if not the first) plus the given word to the prefix buffer,
   * then seeks to it. If the seek fails, false is returned and this cursor
   * can be re-used as if in a new state. The {@code word} BytesRef is considered temporary,
   * and is not saved within this class.
   *
   * @param word the next word of the phrase being matched
   * @return true if the extended prefix is a term with live docs, or is followed by the
   *         separator in some longer term
   * @throws IOException if reading the terms or postings fails
   */
  boolean advance(BytesRef word) throws IOException {
    if (prefixBuf == null) { // first advance
      // Borrow 'word' for the seek; only pay for a copy when the seek succeeds.
      prefixBuf = word;
      prefixBufOnLoan = true;
      if (seekPrefix()) {
        ensureBufIsACopy();
        return true;
      }
      prefixBuf = null; // make sure 'word' isn't referenced past this call
      return false;
    }

    // subsequent advance: append to the existing (owned) prefix
    assert !prefixBufOnLoan;

    prefixBufBuilder.append(SEPARATOR_CHAR);
    prefixBufBuilder.append(word);
    prefixBuf = prefixBufBuilder.get();
    if (seekPrefix()) {
      return true;
    }
    prefixBuf = null; // reset; the builder is cleared lazily by the next first advance
    return false;
  }

  /** Replaces a borrowed prefixBuf with a copy held by prefixBufBuilder. No-op if already owned. */
  private void ensureBufIsACopy() {
    if (!prefixBufOnLoan) {
      return;
    }

    prefixBufBuilder.clear();
    prefixBufBuilder.copyBytes(prefixBuf);
    prefixBuf = prefixBufBuilder.get();
    prefixBufOnLoan = false;
  }

  /**
   * Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
   * Sets docIds: non-null only when prefixBuf itself is a term with at least one live doc.
   */
  private boolean seekPrefix() throws IOException {
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

    docIds = null; // invalidate
    switch (seekStatus) {
      case END:
        return false;

      case FOUND:
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
        if (docIds.length > 0) {
          return true;
        }

        // Pretend we didn't find it (no live docs); go to next term
        docIds = null;
        if (termsEnum.next() == null) { // case END
          return false;
        }
        // fall through to NOT_FOUND

      case NOT_FOUND:
        // termsEnum must start with prefixBuf + separator to continue
        return startsWithPrefixAndSeparator(termsEnum.term());
    }
    throw new IllegalStateException(seekStatus.toString());
  }

  /** True if {@code term} is strictly longer than prefixBuf, begins with it, and the next byte is the separator. */
  private boolean startsWithPrefixAndSeparator(BytesRef term) {
    if (term.length <= prefixBuf.length) {
      return false;
    }
    for (int i = 0; i < prefixBuf.length; i++) {
      if (prefixBuf.bytes[prefixBuf.offset + i] != term.bytes[term.offset + i]) {
        return false;
      }
    }
    return term.bytes[term.offset + prefixBuf.length] == SEPARATOR_CHAR;
  }

  /**
   * Returns an IntsRef either cached or reading postingsEnum. Not null, but may be empty.
   *
   * @param postingsEnum postings of the term the enum is currently positioned on
   * @param liveDocs     live-docs filter, or null to accept every doc
   */
  private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
    // (The cache can have empty IntsRefs)

    // lookup prefixBuf in a cache
    if (docIdsCache != null) {
      IntsRef cached = docIdsCache.get(prefixBuf);
      if (cached != null) {
        return cached;
      }
    }

    // read postingsEnum; docFreq is an upper bound on the number of live docs
    IntsRef result = new IntsRef(termsEnum.docFreq());
    int docId;
    while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      result.ints[result.length++] = docId;
    }
    if (result.length == 0) {
      result = EMPTY_INTSREF;
    }

    // cache
    if (docIdsCache != null) {
      // The key must own its bytes: prefixBuf is either the caller's temporary word or a view
      // onto prefixBufBuilder's array, and that array is overwritten when this cursor is reset
      // and re-used. A shallow clone would leave map keys whose content changes underneath them.
      docIdsCache.put(BytesRef.deepCopyOf(prefixBuf), result);
    }
    return result;
  }

  /**
   * The docIds of the last call to advance, if it returned true. It might be null, but
   * its length won't be 0. Treat as immutable.
   */
  IntsRef getDocIds() {
    assert docIds == null || docIds.length != 0;
    return docIds;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy