All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.stratio.cassandra.lucene.IndexService.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2014 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.cassandra.lucene

import java.lang.management.ManagementFactory

import javax.management.{JMException, ObjectName}
import com.stratio.cassandra.lucene.index.{DocumentIterator, PartitionedIndex}
import com.stratio.cassandra.lucene.mapping._
import com.stratio.cassandra.lucene.search.Search
import com.stratio.cassandra.lucene.util._
import org.apache.cassandra.config.DatabaseDescriptor
import org.apache.cassandra.db._
import org.apache.cassandra.db.filter._
import org.apache.cassandra.db.partitions._
import org.apache.cassandra.db.rows._
import org.apache.cassandra.index.transactions.IndexTransaction
import org.apache.cassandra.schema.{ColumnMetadata, IndexMetadata}
import org.apache.cassandra.utils.FBUtilities
import org.apache.cassandra.utils.concurrent.OpOrder
import org.apache.lucene.document.Document
import org.apache.lucene.index.{IndexableField, Term}
import org.apache.lucene.search.{Query, Sort, SortField}

import scala.collection.JavaConverters._
import scala.collection.mutable

/** Lucene index service provider.
  *
  * @param table         the indexed table
  * @param indexMetadata the index metadata
  * @author Andres de la Pena `[email protected]`
  */
abstract class IndexService(
    val table: ColumnFamilyStore,
    val indexMetadata: IndexMetadata) extends IndexServiceMBean with Logging with Tracing {

  val metadata = table.metadata.get()
  val ksName = metadata.keyspace
  val cfName = metadata.name
  val idxName = indexMetadata.name
  val qualifiedName = s"$ksName.$cfName.$idxName"

  // Parse options
  val options = new IndexOptions(metadata, indexMetadata)

  // Setup schema
  val schema = options.schema
  val regulars = metadata.regularColumns().asScala.toSet
  val mappedRegulars = regulars.map(_.name.toString).filter(schema.mappedCells.contains)
  val mapsMultiCell = regulars.exists(x => x.`type`.isMultiCell && schema.mapsCell(x.name.toString))
  val mapsPrimaryKey = metadata.primaryKeyColumns().asScala.exists(x => schema.mapsCell(x.name.toString))

  val excludedDataCenter= options.excludedDataCenters.contains(DatabaseDescriptor.getLocalDataCenter)

  // Setup mapping
  val tokenMapper = new TokenMapper
  val partitionMapper = new PartitionMapper(metadata)
  val columnsMapper = new ColumnsMapper(schema, metadata)
  val expressionMapper = ExpressionMapper(metadata, indexMetadata)

  // Setup FS index and write queue
  val queue = TaskQueue.build(options.indexingThreads, options.indexingQueuesSize)
  val partitioner = options.partitioner
  val lucene = new PartitionedIndex(partitioner.numPartitions,
    idxName,
    options.path,
    options.schema.analyzer,
    options.refreshSeconds,
    options.ramBufferMB,
    options.maxMergeMB,
    options.maxCachedMB)

  // Delay JMX MBean creation
  var mBean: ObjectName = _

  // Setup indexing read-before-write lock provider
  val readBeforeWriteLocker = new Locker(DatabaseDescriptor.getConcurrentWriters * 128)

  def init() {

    // Initialize index
    try {
      val sort = new Sort(keySortFields.toArray: _*)
      if (!excludedDataCenter)
        lucene.init(sort, fieldsToLoad)
    } catch {
      case e: Exception =>
        logger.error(s"Initialization of Lucene FS directory for index '$idxName' has failed", e)
    }

    // Register JMX MBean
    try {
      val mBeanName = "com.stratio.cassandra.lucene:type=Lucene," +
        s"keyspace=$ksName,table=$cfName,index=$idxName"
      mBean = new ObjectName(mBeanName)
      ManagementFactory.getPlatformMBeanServer.registerMBean(this, this.mBean)
    } catch {
      case e: JMException => logger.error("Error while registering Lucene index JMX MBean", e)
    }
  }

  /** Returns the Lucene [[SortField]]s required to retrieve documents sorted by primary key.
    *
    * @return the sort fields
    */
  def keySortFields: List[SortField]

  /** Returns the names of the Lucene fields to be loaded from index during searches.
    *
    * @return the names of the fields to be loaded
    */
  def fieldsToLoad: java.util.Set[String]

  def keyIndexableFields(key: DecoratedKey, clustering: Clustering[_]): List[IndexableField]

  /** Returns if the specified column definition is mapped by this index.
    *
    * @param columnDef a column definition
    * @return `true` if the column is mapped, `false` otherwise
    */
  def dependsOn(columnDef: ColumnMetadata): Boolean = {
    schema.mapsCell(columnDef.name.toString)
  }

  /** Returns the validated search contained in the specified expression.
    *
    * @param expression a custom CQL expression
    * @return the validated expression
    */
  def validate(expression: RowFilter.Expression): Search = {
    expressionMapper.search(expression).validate(schema)
  }

  /** Returns a Lucene term uniquely identifying the specified row.
    *
    * @param key        the partition key
    * @param clustering the clustering key
    * @return a Lucene identifying term
    */
  def term(key: DecoratedKey, clustering: Clustering[_]): Term

  /** Returns a Lucene term identifying documents representing all the row's which are in the
    * partition the specified [[DecoratedKey]].
    *
    * @param key the partition key
    * @return a Lucene term representing `key`
    */
  def term(key: DecoratedKey): Term = {
    partitionMapper.term(key)
  }

  /** Returns if SSTables can contain additional columns of the specified row so read-before-write
    * is required prior to indexing.
    *
    * @param key the partition key
    * @param row the row
    * @return `true` if read-before-write is required, `false` otherwise
    */
  def needsReadBeforeWrite(key: DecoratedKey, row: Row): Boolean = {
    mapsMultiCell || !mappedRegulars.subsetOf(row.columns.asScala.map(_.name.toString).toSet)
  }

  /**
    * Returns true if the supplied Row contains at least one column that is present
    * in the index schema.
    *
    * @param row the row
    * @return `true` if the index must be updated, `false` otherwise
    */
  def doesAffectIndex(row: Row): Boolean = {
    !options.sparse || mapsPrimaryKey || row.columns().asScala.exists(c => mappedRegulars.contains(c.name.toString))
  }

  /** Returns the [[DecoratedKey]] contained in the specified Lucene document.
    *
    * @param document the document containing the partition key to be get
    * @return the partition key contained in the specified Lucene document
    */
  def decoratedKey(document: Document): DecoratedKey = {
    partitionMapper.decoratedKey(document)
  }

  /** Creates an new [[IndexWriter]] object for updates to a given partition.
    *
    * @param key             key of the partition being modified
    * @param nowInSec        current time of the update operation
    * @param orderGroup      operation group spanning the update operation
    * @param transactionType what kind of update is being performed on the base data
    * @return the newly created index writer
    */
  def writer(
      key: DecoratedKey,
      nowInSec: Int,
      orderGroup: OpOrder.Group,
      transactionType: IndexTransaction.Type): IndexWriter

  /** Deletes all the index contents. */
  def truncate() {
    if (!excludedDataCenter)
      queue.submitSynchronous(lucene.truncate)
  }

  /** Closes and removes all the index files. */
  def delete() {
    try {
      if (!excludedDataCenter)
        queue.close()
      ManagementFactory.getPlatformMBeanServer.unregisterMBean(mBean)
    } catch {
      case e: JMException => logger.error("Error while unregistering Lucene index MBean", e)
    } finally {
      if (!excludedDataCenter)
        lucene.delete()
    }
  }

  /** Upserts the specified row.
    *
    * @param key      the partition key
    * @param row      the row
    * @param nowInSec now in seconds
    */
  def upsert(key: DecoratedKey, row: Row, nowInSec: Int) {
    if (!excludedDataCenter) {
      queue.submitAsynchronous(key, () => {
          val partition = partitioner.partition(key)
          val clustering = row.clustering()
          val term = this.term(key, clustering)
          try {
            val columns = columnsMapper.columns(key, row, nowInSec)
            val fields = schema.indexableFields(columns)
            if (fields.isEmpty) {
              lucene.delete(partition, term)
            } else {
              val doc = new Document
              keyIndexableFields(key, clustering).foreach(doc.add)
              fields.forEach(f => doc.add(f))
              lucene.upsert(partition, term, doc)
            }
          } catch {
            case ex: Throwable => {
              ex.printStackTrace()
              logger.error("error in indexing: ", ex)
              throw ex
            }
          }
      })
    }
  }

  /** Deletes the partition identified by the specified key.
    *
    * @param key        the partition key
    * @param clustering the clustering key
    */
  def delete(key: DecoratedKey, clustering: Clustering[_]) {
    if (!excludedDataCenter)
      queue.submitAsynchronous(key, () => {
        val partition = partitioner.partition(key)
        val term = this.term(key, clustering)
        lucene.delete(partition, term)
      })
  }

  /** Deletes the partition identified by the specified key.
    *
    * @param key the partition key
    */
  def delete(key: DecoratedKey) {
    if (!excludedDataCenter)
      queue.submitAsynchronous(key, () => {
        val partition = partitioner.partition(key)
        val term = this.term(key)
        lucene.delete(partition, term)
      })
  }

  /** Returns a new index searcher for the specified read command.
    *
    * @param command    the read command being executed
    * @param controller the read execution controller
    * @return a searcher with which to perform the supplied command
    */
  def search(
      command: ReadCommand,
      controller: ReadExecutionController): UnfilteredPartitionIterator = {

    if (!excludedDataCenter) {
      // Parse search
      tracer.trace("Building Lucene search")
      val search = expressionMapper.search(command)
      val query = search.query(schema, this.query(command).orNull)
      val afters = this.after(search.paging, command)
      val sort = this.sort(search)
      val count = command.limits.count

      // Refresh if required
      if (search.refresh) {
        tracer.trace("Refreshing Lucene index searcher")
        refresh()
      }


      // Search
      tracer.trace(s"Lucene index searching for $count rows")
      val partitions = partitioner.partitions(command)
      val readers = afters.filter(a => partitions.contains(a._1))
      val documents = lucene.search(readers, query, sort, count)
      reader(documents, command, controller)
    } else {
      new IndexReaderExcludingDataCenter(command, table)
    }

  }

  /** Returns the key range query represented by the specified read command.
    *
    * @param command the read command } else {
    * @return the key range query
    */
  def query(command: ReadCommand): Option[Query] = command match {
    case command: SinglePartitionReadCommand => val key = command.partitionKey
      val filter = command.clusteringIndexFilter(key)
      Some(query(key, filter))
    case command: PartitionRangeReadCommand => query(command.dataRange)
    case _ => throw new IndexException(s"Unsupported read command ${command.getClass}")
  }

  /** Returns a query to get the documents satisfying the specified key and clustering filter.
    *
    * @param key    the partition key
    * @param filter the clustering key range
    * @return a query to get the documents satisfying the key range
    */
  def query(key: DecoratedKey, filter: ClusteringIndexFilter): Query

  /** Returns a query to get the documents satisfying the specified data range.
    *
    * @param dataRange the data range
    * @return a query to get the documents satisfying the data range
    */
  def query(dataRange: DataRange): Option[Query]

  def after(pagingState: IndexPagingState, command: ReadCommand): List[(Int, Option[Term])] = {
    val partitions = partitioner.partitions(command)
    val afters = if (pagingState == null)
      (0 until partitioner.numPartitions).map(_ => None).toList
    else
      pagingState.forCommand(command, partitioner).map(_.map { case ((_, k), c) => after(k, c) })
    partitions.map(i => (i, afters(i)))
  }

  /** Returns a Lucene query to retrieve the row identified by the specified paging state.
    *
    * @param key        the partition key
    * @param clustering the clustering key
    * @return the query to retrieve the row
    */
  def after(key: DecoratedKey, clustering: Clustering[_]): Term

  /** Returns the Lucene sort with the specified search sorting requirements followed by the
    * Cassandra's natural ordering based on partitioning token and cell name.
    *
    * @param search the search containing sorting requirements
    * @return a Lucene sort according to `search`
    */
  def sort(search: Search): Sort = {
    val sortFields = mutable.ListBuffer[SortField]()
    if (search.usesSorting) {
      sortFields ++= search.sortFields(schema).asScala
    }
    if (search.usesRelevance) {
      sortFields += SortField.FIELD_SCORE
    }
    sortFields ++= keySortFields
    new Sort(sortFields.toArray: _*)
  }

  /** Reads from the local SSTables the rows identified by the specified search.
    *
    * @param documents  the Lucene documents
    * @param command    the Cassandra command
    * @param controller the read execution controller
    * @return the local rows satisfying the search
    */
  def reader(
      documents: DocumentIterator,
      command: ReadCommand,
      controller: ReadExecutionController): IndexReader

  /** Ensures that values present in a partition update are valid according to the schema.
    *
    * @param update the partition update containing the values to be validated
    */
  def validate(update: PartitionUpdate) {
    val key = update.partitionKey
    val now = FBUtilities.nowInSeconds
    update.forEach(row => schema.validate(columnsMapper.columns(key, row, now)))
  }

  /** @inheritdoc */
  override def commit() {
    if (!excludedDataCenter)
      queue.submitSynchronous(lucene.commit)
  }

  /** @inheritdoc */
  override def getNumDocs: Long = {
    if (!excludedDataCenter) {
      lucene.getNumDocs
    } else 0
  }

  /** @inheritdoc */
  override def getNumDeletedDocs: Long = {
    if (!excludedDataCenter) {
      lucene.getNumDeletedDocs
    } else 0
  }

  /** @inheritdoc */
  override def forceMerge(maxNumSegments: Int, doWait: Boolean) {
    if (!excludedDataCenter)
      queue.submitSynchronous(() => lucene.forceMerge(maxNumSegments, doWait))
  }

  /** @inheritdoc */
  override def forceMergeDeletes(doWait: Boolean) {
    if (!excludedDataCenter)
      queue.submitSynchronous(() => lucene.forceMergeDeletes(doWait))
  }

  /** @inheritdoc */
  override def refresh() {
    if (!excludedDataCenter)
      queue.submitSynchronous(lucene.refresh)
  }

}

/** Companion object for [[IndexService]]. */
object IndexService {

  /** Returns a new index service for the specified indexed table and index metadata.
    *
    * @param table         the indexed table
    * @param indexMetadata the index metadata
    * @return the index service
    */
  def build(table: ColumnFamilyStore, indexMetadata: IndexMetadata): IndexService = {
    if (table.getComparator.subtypes.isEmpty) {
      new IndexServiceSkinny(table, indexMetadata)
    } else {
      new IndexServiceWide(table, indexMetadata)
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy