org.apache.solr.handler.tagger.TermPrefixCursor Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;

/**
 * Cursor into the terms that advances by prefix.
 *
 * <p>The cursor keeps a growing prefix buffer. The first {@link #advance(BytesRef)} sets the
 * prefix to the given word; every later call appends the separator byte plus the next word. After
 * each change the wrapped {@link TermsEnum} is positioned on the first term that either equals the
 * prefix or starts with the prefix followed by the separator byte.
 */
class TermPrefixCursor {

  // Note: this could be a lot more efficient if MemoryPostingsFormat supported ordinal lookup.
  // Maybe that could be added to Lucene.

  // TODO add bloom filter of hashcode of first ~ 6 bytes to avoid lookup into terms dict?

  private static final byte SEPARATOR_CHAR =
      ConcatenateGraphFilter.SEP_LABEL; // used to be ' '; TODO configurable?
  /** Shared instance returned whenever a term has no live documents. Treat as immutable. */
  private static final IntsRef EMPTY_INTSREF = new IntsRef();

  private final TermsEnum termsEnum;
  private final Bits liveDocs; // may be null, in which case no document is filtered out
  private final Map<BytesRef, IntsRef> docIdsCache; // may be null, which disables caching

  private BytesRef prefixBuf; // we append to this
  private final BytesRefBuilder prefixBufBuilder = new BytesRefBuilder();
  private boolean prefixBufOnLoan; // if true, PB is loaned; needs to be copied
  private PostingsEnum postingsEnum; // re-used across calls to termsEnum.postings(...)
  private IntsRef docIds; // result of the last seek; null when the exact prefix was not matched

  /**
   * Creates a cursor over the given terms.
   *
   * @param termsEnum the terms to seek within
   * @param liveDocs bits of live documents used to filter postings, or null to keep all documents
   * @param docIdsCache optional cache from a prefix to its live doc ids, or null for no caching;
   *     entries may hold empty {@link IntsRef}s
   */
  TermPrefixCursor(TermsEnum termsEnum, Bits liveDocs, Map<BytesRef, IntsRef> docIdsCache) {
    this.termsEnum = termsEnum;
    this.liveDocs = liveDocs;
    this.docIdsCache = docIdsCache;
  }

  /**
   * Appends the separator char (if not the first) plus the given word to the prefix buffer, then
   * seeks to it. If the seek fails, false is returned and this cursor can be re-used as if in a new
   * state. The {@code word} BytesRef is considered temporary, and is not saved within this class.
   *
   * @param word the next word to extend the prefix with
   * @return true if a term equal to, or prefixed by, the extended prefix was found
   * @throws IOException if reading the terms or postings fails
   */
  boolean advance(BytesRef word) throws IOException {
    if (prefixBuf == null) { // first advance
      // set prefixBuf to word temporarily. When advance() completes, we either null out or copy.
      prefixBuf = word;
      prefixBufOnLoan = true;
      if (seekPrefix()) { // ... and we have to
        ensureBufIsACopy();
        return true;
      }
      prefixBuf = null; // just to be darned sure 'word' isn't referenced here
      return false;
    }

    // subsequent advance: append to existing
    assert !prefixBufOnLoan;

    prefixBufBuilder.append(SEPARATOR_CHAR);
    prefixBufBuilder.append(word);
    prefixBuf = prefixBufBuilder.get();
    if (seekPrefix()) {
      return true;
    }
    prefixBuf = null; // back to the "new" state so the next advance() starts a fresh prefix
    return false;
  }

  /**
   * If prefixBuf currently points at the caller's temporary word, copies its bytes into
   * prefixBufBuilder (replacing whatever the builder held) and points prefixBuf at that copy.
   * Does nothing when prefixBuf is already our own.
   */
  private void ensureBufIsACopy() {
    if (!prefixBufOnLoan) {
      return;
    }

    prefixBufBuilder.clear();
    prefixBufBuilder.copyBytes(prefixBuf);
    prefixBuf = prefixBufBuilder.get();
    prefixBufOnLoan = false;
  }

  /**
   * Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char. Sets
   * docIds: non-null (and non-empty) only when the exact term was found with at least one live
   * document; null otherwise.
   *
   * @return true if positioned on such a term
   */
  @SuppressWarnings({"fallthrough"})
  private boolean seekPrefix() throws IOException {
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

    docIds = null; // invalidate
    switch (seekStatus) {
      case END:
        return false;

      case FOUND:
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
        if (docIds.length > 0) {
          return true;
        }

        // Pretend we didn't find it; go to next term
        docIds = null;
        if (termsEnum.next() == null) { // case END
          return false;
        }
        // fall through to NOT_FOUND

      case NOT_FOUND:
        // termsEnum must start with prefixBuf to continue
        BytesRef teTerm = termsEnum.term();

        // The term has to be strictly longer: prefix bytes, then the separator, then more.
        if (teTerm.length > prefixBuf.length) {
          for (int i = 0; i < prefixBuf.length; i++) {
            if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i]) {
              return false;
            }
          }
          return teTerm.bytes[teTerm.offset + prefixBuf.length] == SEPARATOR_CHAR;
        }
        return false;
    }
    throw new IllegalStateException(seekStatus.toString());
  }

  /**
   * Returns an IntsRef either cached or reading postingsEnum. Not null.
   *
   * <p>Documents rejected by {@code liveDocs} are skipped. When nothing remains, the shared empty
   * instance is returned (and cached, if a cache is present).
   *
   * @param postingsEnum postings of the term the terms enum is positioned on
   * @param liveDocs live-document filter, or null to accept every document
   */
  private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs)
      throws IOException {
    // (The cache can have empty IntsRefs)

    // lookup prefixBuf in a cache
    if (docIdsCache != null) {
      IntsRef cached = docIdsCache.get(prefixBuf);
      if (cached != null) {
        return cached;
      }
    }

    // read postingsEnum; docFreq is an upper bound on the number of live docs we can collect
    IntsRef result = new IntsRef(termsEnum.docFreq());
    int docId;
    while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      result.ints[result.length++] = docId;
    }
    if (result.length == 0) {
      result = EMPTY_INTSREF;
    }

    // cache
    if (docIdsCache != null) {
      // Store an independent copy of the key. prefixBuf may point at the caller's temporary word
      // or at prefixBufBuilder's bytes; the latter are cleared and refilled by ensureBufIsACopy()
      // when a failed cursor is re-used, so a key sharing that array could change after insertion.
      docIdsCache.put(BytesRef.deepCopyOf(prefixBuf), result);
    }
    return result;
  }

  /**
   * The docIds of the last call to advance, if it returned true. It might be null, but its length
   * won't be 0. Treat as immutable.
   */
  IntsRef getDocIds() {
    assert docIds == null || docIds.length != 0;
    return docIds;
  }
}