org.apache.lucene.codecs.uniformsplit.TermBytes Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Apache Lucene (module: codecs)
There is a newer version: 9.11.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;

/**
 * Term of a block line.
 * 
 * Contains the term bytes and the minimal distinguishing prefix (MDP) length
 * of this term.
 * 

 * The MDP is the minimal prefix that distinguishes a term from its immediate
 * previous term (terms are alphabetically sorted).
 * 

 * The incremental encoding suffix is the suffix starting at the last byte of
 * the MDP (inclusive).
 * 

 * Example:
 * For the block
 * 
 * client
 * color
 * company
 * companies
 * 
 * "color" - MDP is "co" - incremental encoding suffix is "olor".
 * 

 * "company" - MDP is "com" - incremental encoding suffix is "mpany".
 * 

 * "companies" - MDP is "compani" - incremental encoding suffix is "ies".
 *
 * @lucene.experimental
 */
public class TermBytes implements Accountable {

  private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(TermBytes.class);

  protected int mdpLength;
  protected BytesRef term;

  public TermBytes(int mdpLength, BytesRef term) {
    reset(mdpLength, term);
  }

  public TermBytes reset(int mdpLength, BytesRef term) {
    assert term.length > 0 && mdpLength > 0 || term.length == 0 && mdpLength == 0 : "Inconsistent mdpLength=" + mdpLength + ", term.length=" + term.length;
    assert term.length == 0 || mdpLength <= term.length : "Too large mdpLength=" + mdpLength + ", term.length=" + term.length;
    assert term.offset == 0;
    this.mdpLength = mdpLength;
    this.term = term;
    return this;
  }

  /**
   * @return This term MDP length.
   * @see TermBytes
   */
  public int getMdpLength() {
    return mdpLength;
  }

  /**
   * @return This term bytes.
   */
  public BytesRef getTerm() {
    return term;
  }

  /**
   * @return The offset of this term incremental encoding suffix.
   * @see TermBytes
   */
  public int getSuffixOffset() {
    return Math.max(mdpLength - 1, 0);
  }

  /**
   * @return The length of this term incremental encoding suffix.
   * @see TermBytes
   */
  public int getSuffixLength() {
    return term.length - getSuffixOffset();
  }

  /**
   * Computes the length of the minimal distinguishing prefix (MDP) between
   * a current term and its previous term (terms are alphabetically sorted).
   * 
   * Example: If previous="car" and current="cartridge", then MDP length is
   * 4. It is the length of the minimal prefix distinguishing "cartridge" from
   * "car", that is, the length of "cart".
   *
   * @see TermBytes
   */
  public static int computeMdpLength(BytesRef previousTerm, BytesRef currentTerm) {
    int mdpLength = previousTerm == null ? 1 : StringHelper.sortKeyLength(previousTerm, currentTerm);
    return Math.min(mdpLength, currentTerm.length);
  }

  @Override
  public long ramBytesUsed() {
    return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(term);
  }
}