/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
// TODO: instead of inlining auto-prefix terms with normal terms,
// we could write them into their own virtual/private field. This
// would make search time a bit more complex, since we'd need to
// merge sort between two TermsEnums, but it would also make the stats
// API (used by CheckIndex -verbose) easier to implement since we could
// just walk this virtual field and gather its stats.
/** Used in the first pass when writing a segment to locate
* "appropriate" auto-prefix terms to pre-compile into the index.
* This visits every term in the index to find prefixes that
* match {@code >= min} and {@code <= max} terms. */
class AutoPrefixTermsWriter {
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
//static boolean DEBUG = false;
//static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2;
//static boolean DEBUG2 = true;
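// Illustrative example (not from the original source): with
// minItemsInPrefix=2 and maxItemsInPrefix=4 and the terms
// {"saw", "seed", "seek", "set", "sun"}, the prefix "se" is shared by
// three terms (within [2..4]) and is a candidate auto-prefix term, while
// "s" (five terms) exceeds the maximum on its own and would instead be
// covered by floored ranges of the form described by PrefixTerm below.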
/** Describes a range of term-space to match, either a simple prefix
* (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*,
* foo[n-z]*) when there are too many terms starting with foo*. */
public static final class PrefixTerm implements Comparable<PrefixTerm> {
/** Common prefix */
public final byte[] prefix;
/** If this is -2, this is a normal prefix (foo*); else it's the minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */
public final int floorLeadStart;
/** The lead byte (inclusive) of the suffix for the term range we match (e.g. 'm' in foo[d-m]*); this is ignored when
* floorLeadStart is -2. */
public final int floorLeadEnd;
public final BytesRef term;
/** Sole constructor. */
public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) {
this.prefix = prefix;
this.floorLeadStart = floorLeadStart;
this.floorLeadEnd = floorLeadEnd;
this.term = toBytesRef(prefix, floorLeadStart);
assert floorLeadEnd >= floorLeadStart;
assert floorLeadEnd >= 0;
assert floorLeadStart == -2 || floorLeadStart >= 0;
// We should never create empty-string prefix term:
assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != 0xff;
}
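// Illustrative constructions (not from the original source), using the
// bytes of "foo" as the prefix: a plain foo* prefix term would use
// floorLeadStart=-2 (with floorLeadEnd conventionally 0xff, covering every
// suffix lead byte, per the empty-prefix assert above), while a floored
// range such as foo[d-m]* passes the lead suffix bytes directly:
//
//   PrefixTerm all = new PrefixTerm(new byte[] {'f', 'o', 'o'}, -2, 0xff);
//   PrefixTerm floored = new PrefixTerm(new byte[] {'f', 'o', 'o'}, 'd', 'm');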
@Override
public String toString() {
String s = brToString(new BytesRef(prefix));
if (floorLeadStart == -2) {
s += "[-" + Integer.toHexString(floorLeadEnd) + "]";
} else {
s += "[" + Integer.toHexString(floorLeadStart) + "-" + Integer.toHexString(floorLeadEnd) + "]";
}
return s;
}
@Override
public int compareTo(PrefixTerm other) {
int cmp = term.compareTo(other.term);
if (cmp == 0) {
if (prefix.length != other.prefix.length) {
return prefix.length - other.prefix.length;
}
// On tie, sort the bigger floorLeadEnd earlier, since it
// spans more terms; during intersect we want to encounter it
// first so we can use it if the automaton accepts the larger range:
cmp = other.floorLeadEnd - floorLeadEnd;
}
return cmp;
}
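// Worked example (illustrative): foo[a-z]* and foo[a-m]* share the same
// leading term "fooa" and the same prefix length, so the tie-break on
// floorLeadEnd applies; since 'z' > 'm', foo[a-z]* sorts first, so
// intersect sees the wider range before the narrower one.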
/** Returns the leading term for this prefix term, e.g. "foo" (for
* the foo* prefix) or "foom" (for the foo[m-z]* case). */
private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) {
BytesRef br;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br = new BytesRef(prefix.length+1);
} else {
br = new BytesRef(prefix.length);
}
System.arraycopy(prefix, 0, br.bytes, 0, prefix.length);
br.length = prefix.length;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br.bytes[br.length++] = (byte) floorLeadStart;
}
return br;
}
public int compareTo(BytesRef term) {
return this.term.compareTo(term);
}
public TermsEnum getTermsEnum(TermsEnum in) {
final BytesRef prefixRef = new BytesRef(prefix);
return new FilteredTermsEnum(in) {
{
setInitialSeekTerm(term);
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (StringHelper.startsWith(term, prefixRef) &&
(floorLeadEnd == -1 || term.length == prefixRef.length || (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) {
return AcceptStatus.YES;
} else {
return AcceptStatus.END;
}
}
};
}
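// Illustrative use (not from the original source), assuming a Terms
// instance named "terms" for the field being scanned:
//
//   TermsEnum filtered = prefixTerm.getTermsEnum(terms.iterator());
//   BytesRef t;
//   while ((t = filtered.next()) != null) {
//     // every t starts with prefix and, for floored ranges, its lead
//     // suffix byte is <= floorLeadEnd
//   }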
}
// for debugging
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
final List<PrefixTerm> prefixes = new ArrayList<>();
private final int minItemsInPrefix;
private final int maxItemsInPrefix;
// Records the index into pending where the current prefix at that
// length "started"; for example, if the current term starts with 't',
// prefixStarts[0] is the index into pending for the first
// term/sub-block starting with 't'. We use this to figure out when
// to write a new block:
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
private int[] prefixStarts = new int[8];
private List