org.apache.hadoop.hbase.codec.prefixtree.decode.PrefixTreeArraySearcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of alihbase-prefix-tree Show documentation
Prefix Tree Data Block Encoder
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.codec.prefixtree.decode;

import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
import org.apache.hadoop.hbase.codec.prefixtree.scanner.CellScannerPosition;
import org.apache.hadoop.hbase.codec.prefixtree.scanner.CellSearcher;

import com.google.common.primitives.UnsignedBytes;

/**
 * Searcher extends the capabilities of the Scanner + ReversibleScanner to add the ability to
 * position itself on a requested Cell without scanning through cells before it. The PrefixTree is
 * set up to be a Trie of rows, so finding a particular row is extremely cheap.
 * <p>
 * Once it finds the row, it does a binary search through the cells inside the row, which is not as
 * fast as the trie search, but faster than iterating through every cell like existing block
 * formats do. For this reason, this implementation is targeted towards schemas where rows are
 * narrow enough to have several or many per block, and where you are generally looking for the
 * entire row or the first cell. It will still be fast for wide rows or point queries, but could be
 * improved upon.
 */
@InterfaceAudience.Private
public class PrefixTreeArraySearcher extends PrefixTreeArrayReversibleScanner implements
    CellSearcher {

  /*************** construct ******************************/

  /**
   * Creates a searcher; every argument is forwarded unchanged to the reversible-scanner
   * superclass constructor.
   * @param blockMeta metadata of the block to be searched
   * @param rowTreeDepth passed through to the superclass
   * @param rowBufferLength passed through to the superclass
   * @param qualifierBufferLength passed through to the superclass
   * @param tagsBufferLength passed through to the superclass
   */
  public PrefixTreeArraySearcher(
      PrefixTreeBlockMeta blockMeta,
      int rowTreeDepth,
      int rowBufferLength,
      int qualifierBufferLength,
      int tagsBufferLength) {
    super(blockMeta, rowTreeDepth, rowBufferLength, qualifierBufferLength, tagsBufferLength);
  }


  /********************* CellSearcher methods *******************/

  /**
   * Seeks to {@code key} and reports whether an exact match was found. The seek itself is done
   * by {@link #positionAtOrAfter(Cell)}, so on a miss the searcher is left wherever that method
   * leaves it.
   * @param key Cell being searched for
   * @return true only when the seek result is {@link CellScannerPosition#AT}
   */
  @Override
  public boolean positionAt(Cell key) {
    final CellScannerPosition seekResult = positionAtOrAfter(key);
    return seekResult == CellScannerPosition.AT;
  }

  /**
   * Restarts from the first row node and descends the row trie one token at a time looking for
   * the row of {@code key}. When the row is found the search continues among that row's cells;
   * on any kind of row miss the matching "reverse" fix-up helper decides the final position.
   * @param key Cell being searched for
   * @return the position reported by the cell search or by the fix-up helper that ended the walk
   */
  @Override
  public CellScannerPosition positionAtOrBefore(Cell key) {
    reInitFirstNode();

    for (;;) {
      // row length up to and including the current token; used below as the offset of the
      // key byte that selects the next fan branch
      final int depthAtThisNode = rowLength;

      // a difference inside the current token ends the trie walk
      final int tokenDiff = compareToCurrentToken(key);
      if (tokenDiff != 0) {
        return fixRowTokenMissReverse(tokenDiff);
      }

      // exact row found, move on to qualifier & ts
      if (rowMatchesAfterCurrentPosition(key)) {
        return positionAtQualifierTimestamp(key, true);
      }

      if (currentRowNode.hasFan()) {
        // keep hunting for the rest of the row
        final byte nextKeyByte = CellUtil.getRowByte(key, depthAtThisNode);
        final int fanIndex = currentRowNode.whichFanNode(nextKeyByte);
        if (fanIndex < 0) {
          // no matching branch: a negative result encodes the insertion point as -(point) - 1
          return fixRowFanMissReverse(-fanIndex - 1);
        }
        // found a match, so dig deeper into the tree
        followFan(fanIndex);
        continue;
      }

      // dead end (no fan to descend into)
      if (hasOccurrences()) {
        // must be leaf or nub
        populateLastNonRowFields();
        return CellScannerPosition.BEFORE;
      }
      // TODO i don't think this case is exercised by any tests
      return fixRowFanMissReverse(0);
    }
  }

  /**
   * Forward-leaning twin of {@link #positionAtOrBefore(Cell)}: the trie walk is the same, but row
   * misses are resolved by the "forward" fix-up helpers. The two methods are deliberately kept
   * separate instead of sharing one body full of direction checks, favouring readability and
   * debugability.
   * @param key Cell being searched for
   * @return the position reported by the cell search or by the fix-up helper that ended the walk
   */
  @Override
  public CellScannerPosition positionAtOrAfter(Cell key) {
    reInitFirstNode();

    for (;;) {
      // row length up to and including the current token; used below as the offset of the
      // key byte that selects the next fan branch
      final int depthAtThisNode = rowLength;

      // a difference inside the current token ends the trie walk
      final int tokenDiff = compareToCurrentToken(key);
      if (tokenDiff != 0) {
        return fixRowTokenMissForward(tokenDiff);
      }

      // exact row found, move on to qualifier & ts
      if (rowMatchesAfterCurrentPosition(key)) {
        return positionAtQualifierTimestamp(key, false);
      }

      if (currentRowNode.hasFan()) {
        // keep hunting for the rest of the row
        final byte nextKeyByte = CellUtil.getRowByte(key, depthAtThisNode);
        final int fanIndex = currentRowNode.whichFanNode(nextKeyByte);
        if (fanIndex < 0) {
          // no matching branch: a negative result encodes the insertion point as -(point) - 1
          return fixRowFanMissForward(-fanIndex - 1);
        }
        // found a match, so dig deeper into the tree
        followFan(fanIndex);
        continue;
      }

      // dead end (no fan to descend into)
      if (hasOccurrences()) {
        if (rowLength < key.getRowLength()) {
          // the key's row is longer than this row, so step to the following row
          // NOTE(review): the boolean returned by nextRow() is ignored here, whereas the
          // fix-up helpers map a false result to AFTER_LAST - confirm this is intended
          nextRow();
        } else {
          populateFirstNonRowFields();
        }
        return CellScannerPosition.AFTER;
      }
      // TODO i don't think this case is exercised by any tests
      return fixRowFanMissForward(0);
    }
  }

  /**
   * Forward-only variant of {@link #positionAt(Cell)}.
   * @param key Cell being searched for
   * @return false without moving when the current position is already after {@code key};
   *         otherwise the result of {@link #positionAt(Cell)}
   */
  @Override
  public boolean seekForwardTo(Cell key) {
    // when we are already past the requested key there is nothing to do, so no seek is attempted
    return !currentPositionIsAfter(key) && positionAt(key);
  }

  /**
   * Forward-only variant of {@link #positionAtOrBefore(Cell)}.
   * @param key Cell being searched for
   * @return {@link CellScannerPosition#AFTER} without moving when the current position is
   *         already after {@code key}; otherwise the result of
   *         {@link #positionAtOrBefore(Cell)}
   */
  @Override
  public CellScannerPosition seekForwardToOrBefore(Cell key) {
    // Open question from the original author: is this guard needed, or should upper layers
    // avoid the situation? It is relatively expensive compared to the rest of the seek.
    return currentPositionIsAfter(key)
        ? CellScannerPosition.AFTER
        : positionAtOrBefore(key);
  }

  /**
   * Forward-only variant of {@link #positionAtOrAfter(Cell)}.
   * @param key Cell being searched for
   * @return {@link CellScannerPosition#AFTER} without moving when the current position is
   *         already after {@code key}; otherwise the result of
   *         {@link #positionAtOrAfter(Cell)}
   */
  @Override
  public CellScannerPosition seekForwardToOrAfter(Cell key) {
    // Open question from the original author: is this guard needed, or should upper layers
    // avoid the situation? It is relatively expensive compared to the rest of the seek.
    if (!currentPositionIsAfter(key)) {
      return positionAtOrAfter(key);
    }
    // our position is after the requested key, so can't do anything
    return CellScannerPosition.AFTER;
  }

  /**
   * Marks the searcher as sitting past the final cell. The scanner is first reset to its
   * before-first state and then the two boundary flags are flipped; the buffer contents are
   * irrelevant here, only {@code afterLast == true} and {@code beforeFirst == false} matter.
   */
  @Override
  public void positionAfterLastCell() {
    resetToBeforeFirstEntry();
    afterLast = true;
    beforeFirst = false;
  }


  /***************** Object methods ***************************/

  /**
   * Equality is entirely the superclass's decision; this class declares no fields of its own.
   * The override exists only to make that intent explicit (findbugs).
   * @param obj object to compare with
   * @return whatever {@code super.equals(obj)} returns
   */
  @Override
  public boolean equals(Object obj) {
    final boolean equalPerSuperclass = super.equals(obj);
    return equalPerSuperclass;
  }


  /****************** internal methods ************************/

  /**
   * @param cell Cell to compare the current position against
   * @return true when {@code compareTo(cell)} is strictly positive
   */
  protected boolean currentPositionIsAfter(Cell cell) {
    final int comparison = compareTo(cell);
    return 0 < comparison;
  }

  /**
   * Binary-searches the cells of the current row node, between index 0 and
   * {@code currentRowNode.getLastCellIndex()}, for {@code key}. Each probe populates the non-row
   * fields for the probed cell index and compares them to the key.
   * @param key Cell being searched for
   * @param beforeOnMiss when there is no exact match: true to settle on the cell before the key,
   *          false to settle on the cell after it
   * @return AT on an exact match; otherwise BEFORE or AFTER according to {@code beforeOnMiss},
   *         or BEFORE_FIRST / AFTER_LAST when stepping back / forward is not possible
   */
  protected CellScannerPosition positionAtQualifierTimestamp(Cell key, boolean beforeOnMiss) {
    int low = 0;
    int high = currentRowNode.getLastCellIndex();
    int cmp;

    for (;;) {
      final int probe = (high + low) / 2; // don't worry about overflow
      cmp = populateNonRowFieldsAndCompareTo(probe, key);

      if (cmp == 0) {
        // found exact match
        return CellScannerPosition.AT;
      }
      if (low == high) {
        // window of one cell: nothing further to probe
        break;
      }
      if (low + 1 == high) {
        // window of two cells: the probe above was the lower one, so try the upper one and
        // fall back to the lower one when the upper one is past the key
        cmp = populateNonRowFieldsAndCompareTo(high, key);
        if (cmp > 0) {
          cmp = populateNonRowFieldsAndCompareTo(low, key);
        }
        break;
      }
      if (cmp < 0) {
        // probed cell is before the key: keep going forward
        low = currentCellIndex;
      } else {
        // went past it, back up
        high = currentCellIndex;
      }
    }

    if (cmp == 0) {
      return CellScannerPosition.AT;
    }

    if (cmp < 0) {
      // we are before key
      if (beforeOnMiss) {
        return CellScannerPosition.BEFORE;
      }
      return advance() ? CellScannerPosition.AFTER : CellScannerPosition.AFTER_LAST;
    }

    // we are after key
    if (!beforeOnMiss) {
      return CellScannerPosition.AFTER;
    }
    return previous() ? CellScannerPosition.BEFORE : CellScannerPosition.BEFORE_FIRST;
  }

  /**
   * Decides whether the walk has arrived at the row of {@code key}. No bytes are compared here:
   * the method only checks that the current row node has occurrences and that the current
   * {@code rowLength} equals the key's row length. Both in-file callers invoke it right after
   * {@link #compareToCurrentToken(Cell)} returned 0 for the current token.
   * @param key Cell being searched for
   * @return true if the current node has occurrences and its row length equals the key's
   */
  protected boolean rowMatchesAfterCurrentPosition(Cell key) {
    return currentRowNode.hasOccurrences() && rowLength == key.getRowLength();
  }

  // TODO move part of this to Cell comparator?
  /**
   * Compare only the bytes within the window of the current token, i.e. the last
   * {@code currentRowNode.getTokenLength()} bytes of the row buffer up to {@code rowLength}.
   * Bytes are compared as unsigned values.
   * @param key Cell being searched for
   * @return a negative value if key is lessThan (before) this, 0 if the token matches, and a
   *         positive value if key is after. Only the sign is meaningful: the magnitude may be
   *         whatever {@code UnsignedBytes.compare} produces.
   */
  protected int compareToCurrentToken(Cell key) {
    final int tokenLength = currentRowNode.getTokenLength();
    final int tokenStart = rowLength - tokenLength;
    final int tokenEndExclusive = tokenStart + tokenLength;

    int offset = tokenStart;
    while (offset < tokenEndExclusive) {
      if (offset >= key.getRowLength()) {
        // key ran out of bytes inside the token: key was shorter, so it's first
        return -1;
      }
      final byte fromKey = CellUtil.getRowByte(key, offset);
      final byte fromBuffer = rowBuffer[offset];
      if (fromKey != fromBuffer) {
        return UnsignedBytes.compare(fromKey, fromBuffer);
      }
      ++offset;
    }

    // whole token matched; a node without occurrences whose row is already at least as long
    // as the key's row is still treated as "key was shorter"
    final boolean nodeHasCells = currentRowNode.hasOccurrences();
    if (!nodeHasCells && rowLength >= key.getRowLength()) {
      return -1;
    }
    return 0;
  }

  /**
   * Repeatedly follows the last fan branch of the current row node until a node without a fan
   * is reached.
   */
  protected void followLastFansUntilExhausted() {
    for (;;) {
      if (!currentRowNode.hasFan()) {
        return;
      }
      followLastFan();
    }
  }


  /****************** complete seek when token mismatch ******************/

  /**
   * Completes an at-or-before seek after the key differed from the current token.
   * @param searcherIsAfterInputKey the non-zero token comparison result:
   *          {@code < 0}: input key is before the searcher's position;
   *          {@code > 0}: input key is after the searcher's position
   * @return BEFORE when a cell was positioned on, BEFORE_FIRST when there is no previous row,
   *         or AFTER / AFTER_LAST when the current node has no occurrences and the searcher
   *         had to move to the next row
   */
  protected CellScannerPosition fixRowTokenMissReverse(int searcherIsAfterInputKey) {
    if (searcherIsAfterInputKey >= 0) {
      // searcher position is before the input key
      if (currentRowNode.hasOccurrences()) {
        populateFirstNonRowFields();
        return CellScannerPosition.BEFORE;
      }
      return nextRow() ? CellScannerPosition.AFTER : CellScannerPosition.AFTER_LAST;
    }

    // searcher position is after the input key, so back up
    // (the original comments describe the true flag as "position on last cell in row")
    if (!previousRow(true)) {
      return CellScannerPosition.BEFORE_FIRST;
    }
    populateLastNonRowFields();
    return CellScannerPosition.BEFORE;
  }

  /**
   * Completes an at-or-after seek after the key differed from the current token.
   * @param searcherIsAfterInputKey the non-zero token comparison result:
   *          {@code < 0}: input key is before the searcher's position;
   *          {@code > 0}: input key is after the searcher's position
   * @return AFTER when positioned on a cell or row, AFTER_LAST when there is no next row
   */
  protected CellScannerPosition fixRowTokenMissForward(int searcherIsAfterInputKey) {
    if (searcherIsAfterInputKey < 0) {
      // searcher position is after the input key
      if (currentRowNode.hasOccurrences()) {
        populateFirstNonRowFields();
        return CellScannerPosition.AFTER;
      }
    } else {
      // searcher position is before the input key, so drop this node and go forward
      discardCurrentRowNode(true);
    }

    // both remaining cases finish by moving to the next row
    return nextRow() ? CellScannerPosition.AFTER : CellScannerPosition.AFTER_LAST;
  }


  /****************** complete seek when fan mismatch ******************/

  /**
   * Completes an at-or-before seek after no fan branch matched the next byte of the key.
   * @param fanInsertionPoint index in the current node's fan where the missing byte would be
   *          inserted; 0 means the key falls before every branch
   * @return BEFORE when a cell was positioned on, BEFORE_FIRST when there is no previous row
   */
  protected CellScannerPosition fixRowFanMissReverse(int fanInsertionPoint) {
    if (fanInsertionPoint != 0) {
      // follow the previous fan, but then descend recursively forward
      followFan(fanInsertionPoint - 1);
      followLastFansUntilExhausted();
      populateLastNonRowFields();
      return CellScannerPosition.BEFORE;
    }

    // key falls before every branch: use this node's own cells if it has any
    if (currentRowNode.hasOccurrences()) {
      populateLastNonRowFields();
      return CellScannerPosition.BEFORE;
    }

    // otherwise we need to back up a row; true -> position on last cell in row
    if (!previousRow(true)) {
      return CellScannerPosition.BEFORE_FIRST;
    }
    populateLastNonRowFields();
    return CellScannerPosition.BEFORE;
  }

  /**
   * Completes an at-or-after seek after no fan branch matched the next byte of the key.
   * @param fanInsertionPoint index in the current node's fan where the missing byte would be
   *          inserted; a value at or beyond the fan-out means the key falls after every branch
   * @return AFTER when positioned on a cell or row, AFTER_LAST when nothing follows
   */
  protected CellScannerPosition fixRowFanMissForward(int fanInsertionPoint) {
    if (fanInsertionPoint >= currentRowNode.getFanOut()) {
      // key is past every branch of this node: drop the node and move to the next row
      discardCurrentRowNode(true);
      return nextRow() ? CellScannerPosition.AFTER : CellScannerPosition.AFTER_LAST;
    }

    // descend into the branch at the insertion point
    followFan(fanInsertionPoint);

    // use this node if it has occurrences, otherwise try to advance via nextRowInternal();
    // the short-circuit keeps nextRowInternal() from running when the node already has cells
    if (hasOccurrences() || nextRowInternal()) {
      populateFirstNonRowFields();
      return CellScannerPosition.AFTER;
    }
    return CellScannerPosition.AFTER_LAST;
  }

}