All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.stratio.cassandra.lucene.mapping.ClusteringMapper.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2014 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.cassandra.lucene.mapping

import java.nio.ByteBuffer

import com.google.common.base.MoreObjects
import com.google.common.primitives.Longs
import com.stratio.cassandra.lucene.mapping.ClusteringMapper._
import com.stratio.cassandra.lucene.util.ByteBufferUtils
import com.stratio.cassandra.lucene.util.ByteBufferUtils._
import org.apache.cassandra.schema.TableMetadata
import org.apache.cassandra.schema.ColumnMetadata
import org.apache.cassandra.db._
import org.apache.cassandra.db.filter.{ClusteringIndexNamesFilter, ClusteringIndexSliceFilter}
import org.apache.cassandra.db.marshal.{ByteBufferAccessor, CompositeType}
import org.apache.cassandra.dht.Token
import org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER
import org.apache.cassandra.utils.FastByteOperations._
import org.apache.lucene.document.{Document, Field, FieldType, StoredField}
import org.apache.lucene.index.FilteredTermsEnum.AcceptStatus
import org.apache.lucene.index._
import org.apache.lucene.search.BooleanClause.Occur.SHOULD
import org.apache.lucene.search.FieldComparator.TermValComparator
import org.apache.lucene.search._
import org.apache.lucene.util.{AttributeSource, BytesRef}

import scala.collection.JavaConverters._

/** Class for several clustering key mappings between Cassandra and Lucene.
  *
  * @param metadata the indexed table metadata
  * @author Andres de la Pena `[email protected]`
  */
class ClusteringMapper(metadata: TableMetadata) {

  /** The clustering key comparator of the indexed table. */
  val comparator: ClusteringComparator = metadata.comparator

  /** A composite type composed by the types of the clustering key columns. */
  val clusteringType: CompositeType = CompositeType.getInstance(comparator.subtypes)

  /** The clustering columns of the indexed table, as reported by the table metadata. */
  val clusteringColumns: List[ColumnMetadata] = metadata.clusteringColumns.asScala.toList

  /** Returns the Lucene [[IndexableField]]s representing the specified primary key.
    *
    * Two fields sharing the same name are produced: an indexed field whose value is the
    * clustering key prefixed by the collated partition token, and a stored field with the plain
    * clustering key, used to rebuild the clustering key from a retrieved document.
    *
    * @param key        the partition key
    * @param clustering the clustering key
    * @return the indexed field followed by the stored field
    */
  def indexableFields(key: DecoratedKey, clustering: Clustering[_]): List[IndexableField] = {

    // Stored field, used for clustering key retrieval
    val plainClustering = bytesRef(byteBuffer(clustering))
    val storedField = new StoredField(FIELD_NAME, plainClustering)

    // Indexed field, prefixed by the token collation. Only the valid window of the BytesRef
    // (offset/length) is copied, so a backing array that is larger than the value, or that does
    // not start at index zero, can neither overflow the buffer nor leak unrelated bytes.
    val prefixed = ByteBuffer.allocate(PREFIX_SIZE + plainClustering.length)
    prefixed.put(prefix(key.getToken))
    prefixed.put(plainClustering.bytes, plainClustering.offset, plainClustering.length)
    prefixed.flip()
    val indexedField = new Field(FIELD_NAME, bytesRef(prefixed), FIELD_TYPE)

    List(indexedField, storedField)
  }

  /** Returns the [[ByteBuffer]] representation of the specified [[Clustering]].
    *
    * @param clustering a clustering key
    * @return a byte buffer with the composite built from the clustering values
    */
  def byteBuffer(clustering: Clustering[_]): ByteBuffer = {
    CompositeType.build(ByteBufferAccessor.instance, clustering.getBufferArray: _*)
  }

  /** Returns the human-readable [[String]] representation of the specified optional
    * [[ClusteringPrefix]].
    *
    * @param prefix the optional clustering prefix
    * @return a [[String]] representing the prefix, or `null` if the prefix is not defined
    */
  def toString(prefix: Option[ClusteringPrefix[_]]): String = {
    prefix.map(_.toString(metadata)).orNull
  }

  /** Returns the clustering key represented by the specified [[ByteBuffer]].
    *
    * @param clustering a byte buffer containing a [[Clustering]] in composite format
    * @return the clustering key built from the components of the buffer
    */
  def clustering(clustering: ByteBuffer): Clustering[_] = {
    Clustering.make(clusteringType.split(clustering): _*)
  }

  /** Returns the clustering key contained in the specified [[Document]].
    *
    * @param document a document containing the clustering key to be get
    * @return the clustering key contained in the document
    */
  def clustering(document: Document): Clustering[_] = {
    // Named differently from the imported `bytesRef` helper to avoid shadowing it
    val storedValue = document.getBinaryValue(FIELD_NAME)
    clustering(ByteBufferUtils.byteBuffer(storedValue))
  }

  /** Returns a Lucene [[SortField]] to sort documents by primary key.
    *
    * @return the sort field
    */
  def sortField: SortField = {
    new ClusteringSort(this)
  }

  /** Returns a Lucene [[Query]] to retrieve all the rows in the specified partition slice.
    *
    * @param position the partition position
    * @param start    the optional start clustering prefix
    * @param stop     the optional stop clustering prefix
    * @return the Lucene query
    */
  def query(
      position: PartitionPosition,
      start: Option[ClusteringPrefix[_]],
      stop: Option[ClusteringPrefix[_]]): Query = {
    new ClusteringQuery(this, position, start, stop)
  }

  /** Returns a Lucene [[Query]] to retrieve all the rows in the specified clustering slice.
    *
    * @param key   the partition key
    * @param slice the slice
    * @return the Lucene query
    */
  def query(key: DecoratedKey, slice: Slice): Query = {
    query(key, Option(slice.start), Option(slice.end))
  }

  /** Returns a Lucene [[Query]] to retrieve all the rows in the specified clustering slice filter.
    *
    * The resulting query is the disjunction of the queries of each requested slice.
    *
    * @param key    the partition key
    * @param filter the slice filter
    * @return the Lucene query
    */
  def query(key: DecoratedKey, filter: ClusteringIndexSliceFilter): Query = {
    filter.requestedSlices.asScala
      .foldLeft(new BooleanQuery.Builder()) { (builder, slice) =>
        builder.add(query(key, slice), SHOULD)
      }
      .build()
  }

}

/** Companion object for [[ClusteringMapper]]. */
object ClusteringMapper {

  /** Name of the Lucene field holding the clustering key. */
  val FIELD_NAME: String = "_clustering"

  /** Type of the indexed Lucene field: a single untokenized, non-stored term indexed with
    * documents only, without norms, and with sorted doc values. Frozen after configuration.
    */
  val FIELD_TYPE: FieldType = {
    val fieldType = new FieldType
    fieldType.setOmitNorms(true)
    fieldType.setIndexOptions(IndexOptions.DOCS)
    fieldType.setTokenized(false)
    fieldType.setStored(false)
    fieldType.setDocValuesType(DocValuesType.SORTED)
    fieldType.freeze()
    fieldType
  }

  /** Length in bytes of the collated token that prefixes every indexed clustering term. */
  val PREFIX_SIZE: Int = 8

  /** Collates the specified token into a lexicographically sortable byte array.
    *
    * @param token a token
    * @return a lexicographically sortable 8 bytes array
    */
  @SuppressWarnings(Array("NumericOverflow"))
  def prefix(token: Token): Array[Byte] = {
    val value = TokenMapper.longValue(token)
    // Wrapping addition of Long.MinValue flips the sign bit of the token value
    Longs.toByteArray(Long.MinValue + value)
  }

  /** Gets the start [[ClusteringPrefix]] of the first partition of the specified [[DataRange]].
    *
    * @param range the data range
    * @return the start of the first requested slice, or the first requested row, or nothing if
    *         the clustering filter of the range start is of any other kind
    */
  def startClusteringPrefix(range: DataRange): Option[ClusteringPrefix[_]] = {
    clusteringFilter(range, range.startKey) match {
      case slices: ClusteringIndexSliceFilter =>
        val requested = slices.requestedSlices
        Some(requested.get(0).start)
      case names: ClusteringIndexNamesFilter =>
        Some(names.requestedRows.first)
      case _ =>
        None
    }
  }

  /** Gets the stop [[ClusteringPrefix]] of the last partition of the specified [[DataRange]].
    *
    * @param range the data range
    * @return the end of the last requested slice, or the last requested row, or nothing if the
    *         clustering filter of the range stop is of any other kind
    */
  def stopClusteringPrefix(range: DataRange): Option[ClusteringPrefix[_]] = {
    clusteringFilter(range, range.stopKey) match {
      case slices: ClusteringIndexSliceFilter =>
        val requested = slices.requestedSlices
        Some(requested.get(requested.size - 1).end)
      case names: ClusteringIndexNamesFilter =>
        Some(names.requestedRows.last)
      case _ =>
        None
    }
  }

  /** Resolves the clustering index filter that the data range applies at the given position.
    *
    * A position that is not already a decorated key is replaced by a decorated key with the same
    * token and an empty key value.
    */
  private def clusteringFilter(range: DataRange, position: PartitionPosition) = {
    val key: DecoratedKey = position match {
      case decorated: DecoratedKey => decorated
      case other => new BufferDecoratedKey(other.getToken, EMPTY_BYTE_BUFFER)
    }
    range.clusteringIndexFilter(key)
  }
}

/** [[SortField]] to sort by token and clustering key.
  *
  * @param mapper the primary key mapper to be used
  */
class ClusteringSort(mapper: ClusteringMapper) extends SortField(
  FIELD_NAME, new FieldComparatorSource {

    /** Builds a comparator that orders terms first by their collated token prefix and then by
      * the clustering key that follows it, using the clustering comparator of the mapper.
      */
    override def newComparator(
        fieldname: String,
        numHits: Int,
        sortPos: Int,
        reversed: Boolean): FieldComparator[_] = {
      new TermValComparator(numHits, fieldname, false) {
        override def compareValues(t1: BytesRef, t2: BytesRef): Int = {
          // A BytesRef is a window (offset, length) over its backing array, so every access
          // must start at `offset` rather than at index zero.
          val tokenComparison = compareUnsigned(
            t1.bytes, t1.offset, PREFIX_SIZE,
            t2.bytes, t2.offset, PREFIX_SIZE)
          if (tokenComparison != 0) {
            tokenComparison
          } else {
            // Same token: compare the clustering keys stored after the prefix
            val bb1 = ByteBuffer.wrap(t1.bytes, t1.offset + PREFIX_SIZE, t1.length - PREFIX_SIZE)
            val bb2 = ByteBuffer.wrap(t2.bytes, t2.offset + PREFIX_SIZE, t2.length - PREFIX_SIZE)
            mapper.comparator.compare(mapper.clustering(bb1), mapper.clustering(bb2))
          }
        }
      }
    }
  }) {

  /** @inheritdoc */
  override def toString: String = ""

  /** All the instances of this class are considered equal to each other. */
  override def equals(o: Any): Boolean = o match {
    case _: ClusteringSort => true
    case _ => false
  }

  /** Constant hash code, consistent with `equals`, which considers all the instances equal. */
  override def hashCode: Int = FIELD_NAME.hashCode
}

/** [[MultiTermQuery]] to get a range of clustering keys.
  *
  * @param mapper   the clustering key mapper to be used
  * @param position the partition position
  * @param start    the start clustering
  * @param stop     the stop clustering
  */
class ClusteringQuery(
    val mapper: ClusteringMapper,
    val position: PartitionPosition,
    val start: Option[ClusteringPrefix[_]],
    val stop: Option[ClusteringPrefix[_]]) extends MultiTermQuery(FIELD_NAME) {

  /** The token of the partition position. */
  val token: Token = position.getToken

  /** The collated token, used both as initial seek term and as expected term prefix. */
  val seek: Array[Byte] = ClusteringMapper.prefix(token)

  /** The clustering key comparator of the mapper. */
  val comparator: ClusteringComparator = mapper.comparator

  /** @inheritdoc */
  override def getTermsEnum(terms: Terms, attributes: AttributeSource): TermsEnum = {
    new FullKeyDataRangeFilteredTermsEnum(terms.iterator)
  }

  /** Important to avoid collisions in Lucene's query cache. */
  override def equals(o: Any): Boolean = o match {
    case q: ClusteringQuery => token == q.token && start == q.start && stop == q.stop
    case _ => false
  }

  /** Important to avoid collisions in Lucene's query cache.
    *
    * Combines the inherited hash code with the hash codes of the token and of the optional
    * start and stop prefixes, using zero for the undefined ones.
    */
  override def hashCode: Int = {
    val parts = Seq(
      token.hashCode,
      start.map(_.hashCode).getOrElse(0),
      stop.map(_.hashCode).getOrElse(0))
    parts.foldLeft(super.hashCode)((hash, part) => 31 * hash + part)
  }

  /** @inheritdoc */
  override def toString(field: String): String = {
    MoreObjects.toStringHelper(this)
      .add("field", field)
      .add("token", token)
      .add("start", mapper.toString(start))
      .add("stop", mapper.toString(stop))
      .toString
  }

  /** Terms enumeration accepting only the terms of the queried token whose clustering key is
    * between the start and stop prefixes of the query, when they are defined.
    *
    * @param tenum the terms enumeration to be filtered
    */
  class FullKeyDataRangeFilteredTermsEnum(tenum: TermsEnum) extends FilteredTermsEnum(tenum) {

    // Jump to the start of the partition
    setInitialSeekTerm(new BytesRef(seek))

    /** @inheritdoc */
    override def accept(term: BytesRef): AcceptStatus = {

      // Check token range. A BytesRef is a window (offset, length) over its backing array, so
      // the term must be read starting at `offset` rather than at index zero.
      val tokenComparison =
        compareUnsigned(term.bytes, term.offset, PREFIX_SIZE, seek, 0, PREFIX_SIZE)

      if (tokenComparison < 0) {
        AcceptStatus.NO
      } else if (tokenComparison > 0) {
        // Term with a greater token prefix: the enumeration is finished
        AcceptStatus.END
      } else {
        // Check clustering range. Terms out of the range are skipped without finishing the
        // enumeration, so the remaining terms of the same token are still evaluated.
        val bb = ByteBuffer.wrap(term.bytes, term.offset + PREFIX_SIZE, term.length - PREFIX_SIZE)
        val clustering = mapper.clustering(bb)
        val outOfRange =
          start.exists(comparator.compare(_, clustering) > 0) ||
            stop.exists(comparator.compare(_, clustering) < 0)
        if (outOfRange) AcceptStatus.NO else AcceptStatus.YES
      }
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy