com.sindicetech.siren.analysis.attributes.NodeNumericTermAttribute Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of siren-core Show documentation
SIREn core module
The newest version!
/**
 * Copyright (c) 2014, Sindice Limited. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.sindicetech.siren.analysis.attributes;

import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

import com.sindicetech.siren.analysis.NumericTokenizer;
import com.sindicetech.siren.search.node.NodeNumericRangeQuery;

/**
 * Expert: This class provides an {@link Attribute} for the
 * {@link NumericTokenizer} for indexing numeric values that can be used by
 * {@link NodeNumericRangeQuery}.
 *
 * <p>
 * This attribute provides a stream of tokens which iterates over
 * the different precisions of a given numeric value.
 *
 * <p>
 * The string representation of each precision is prefixed by:
 * <ul>
 * <li> the numeric type of the value; and
 * <li> the precision step.
 * </ul>
 *
 * <p>
 * This prefix in effect encodes the numeric type and precision step inside
 * the dictionary. It is necessary for two reasons:
 * <ul>
 * <li> it avoids overlap between values of different numeric types, and
 * therefore avoids false positives;
 * <li> it enables better clustering of the values of a particular numeric
 * type in the dictionary.
 * </ul>
 */
public interface NodeNumericTermAttribute extends Attribute {

  /**
   * Returns the numeric type of the value.
   *
   * @return the {@link NumericType} of the value
   */
  NumericType getNumericType();

  /**
   * Returns the current shift value.
   *
   * <p>
   * Undefined before the first call to
   * {@link #incrementShift(CharTermAttribute)}.
   *
   * @return the current shift value
   */
  int getShift();

  /**
   * Returns the value size in bits (32 for {@code float}, {@code int}; 64 for
   * {@code double}, {@code long}).
   *
   * @return the size of the value in bits
   */
  int getValueSize();

  /**
   * Sets the precision step.
   *
   * @param precisionStep the precision step to use
   */
  void setPrecisionStep(int precisionStep);

  /**
   * Returns the precision step.
   *
   * @return the precision step
   */
  int getPrecisionStep();

  /**
   * Initialises this attribute.
   *
   * @param numericType the numeric type of the value
   * @param value the numeric value, passed as a {@code long}
   *        (NOTE(review): {@code float}/{@code double} values are presumably
   *        pre-converted to their sortable integer form by the caller -
   *        confirm against the implementation)
   * @param valSize the value size; presumably in bits, as reported by
   *        {@link #getValueSize()} - TODO confirm
   */
  void init(NumericType numericType, long value, int valSize);

  /**
   * Resets the current shift value to 0.
   */
  void resetShift();

  /**
   * Increments the shift and generates the next token.
   *
   * <p>
   * The original Lucene's {@link NumericTermAttribute} implements
   * {@link TermToBytesRefAttribute}. There is a conflict problem with the
   * {@link CharTermAttribute} used in higher-level SIREn's analyzers, which also
   * implements {@link TermToBytesRefAttribute}.
   * The problem is that the {@link AttributeSource} is not able to choose
   * between the two when requested an attribute implementing
   * {@link TermToBytesRefAttribute}, e.g., in TermsHashPerField.
   *
   * <p>
   * The current solution is to fill the {@link BytesRef} attribute of the
   * {@link CharTermAttribute} with the encoded numeric value.
   *
   * @param termAtt the {@link CharTermAttribute} to fill with the encoded
   *        numeric value of the next token
   * @return True if there are still tokens, false if we reach the end of the
   * stream.
   */
  boolean incrementShift(CharTermAttribute termAtt);

}