/*
* Copyright (C) 2014 Stratio (http://stratio.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.cassandra.lucene
import java.util.concurrent.Callable
import java.util.function.BiFunction
import java.util.{Collections, Optional}
import java.{util => java}
import com.stratio.cassandra.lucene.search.Search
import com.stratio.cassandra.lucene.util.Logging
import org.apache.cassandra.cql3.Operator
import org.apache.cassandra.db.SinglePartitionReadCommand.Group
import org.apache.cassandra.db._
import org.apache.cassandra.db.compaction.CompactionManager
import org.apache.cassandra.db.filter.RowFilter
import org.apache.cassandra.db.lifecycle.{SSTableSet, View}
import org.apache.cassandra.db.marshal.{AbstractType, UTF8Type}
import org.apache.cassandra.db.partitions._
import org.apache.cassandra.exceptions.{ConfigurationException, InvalidRequestException}
import org.apache.cassandra.index.Index.{Indexer, Searcher}
import org.apache.cassandra.index.internal.CollatedViewIndexBuilder
import org.apache.cassandra.index.transactions.IndexTransaction
import org.apache.cassandra.index.{IndexRegistry, Index => CassandraIndex}
import org.apache.cassandra.io.sstable.ReducingKeyIterator
import org.apache.cassandra.schema.{ColumnMetadata, IndexMetadata, TableMetadata}
import org.apache.cassandra.utils.FBUtilities
/** [[CassandraIndex]] that uses Apache Lucene as backend. It allows, among other things,
* multi-column and full-text searches.
*
* @param table the indexed table
* @param indexMetadata the index's metadata
* @author Andres de la Pena `[email protected]`
*/
class Index(table: ColumnFamilyStore, indexMetadata: IndexMetadata)
extends CassandraIndex with Logging {
// Set Lucene query handler as CQL query handler
IndexQueryHandler.activate()
logger.debug(s"Building Lucene index ${table.metadata} $indexMetadata")
val service = try IndexService.build(table, indexMetadata) catch {
case e: Exception => throw new IndexException(e)
}
val name = service.qualifiedName
/** Return a task to perform any initialization work when a new index instance is created. This
* may involve costly operations such as (re)building the index, and is performed asynchronously
* by SecondaryIndexManager.
*
* @return a task to perform any necessary initialization work
*/
override def getInitializationTask: Callable[_] = {
if (isBuilt || table.isEmpty) {
return null
}
getBuildIndexTask()
}
private[this] def getBuildIndexTask(): Callable[Unit] =
new Callable[Unit] {
  override def call(): Unit = {
    table.forceBlockingFlush
    // Reference the canonical SSTables so they can't be released while the build is running
    val viewFragment = table.selectAndReference(View.selectFunction(SSTableSet.CANONICAL))
    val sstables = viewFragment.refs
    try {
      if (sstables.isEmpty) {
        logger.info("No SSTable data for {}.{} to build index {} from, marking empty index as built",
          table.metadata.keyspace, table.metadata.name, indexMetadata.name)
        return
      }
      logger.info("Submitting index build of {}", table.name)
      val builder = new CollatedViewIndexBuilder(table,
        Collections.singleton(Index.this),
        new ReducingKeyIterator(sstables),
        java.Collections.unmodifiableCollection(sstables))
      val future = CompactionManager.instance.submitIndexBuild(builder)
      FBUtilities.waitOnFuture(future)
    } finally {
      viewFragment.close()
      sstables.close()
    }
    logger.info("Index build of {} complete", indexMetadata.name)
  }
}
private def isBuilt = SystemKeyspace.isIndexBuilt(table.keyspace.getName, indexMetadata.name)
/** Returns the IndexMetadata which configures and defines the index instance. This should be the
* same object passed as the argument to setIndexMetadata.
*
* @return the index's metadata
*/
override def getIndexMetadata: IndexMetadata = indexMetadata
/** Return a task to reload the internal metadata of an index. Called when the base table metadata
* is modified or when the configuration of the Index is updated. Implementations should return a
* task which performs any necessary work to be done due to updating the configuration(s), such as
* (re)building. This task is performed asynchronously by SecondaryIndexManager.
*
* @return task to be executed by the index manager during a reload
*/
override def getMetadataReloadTask(indexMetadata: IndexMetadata): Callable[_] = () => {
// TODO: Return getBuildIndexTask if the index metadata has changed
logger.debug(s"Reloading Lucene index $name metadata: $indexMetadata")
}
/** An index must be registered in order to be able to either subscribe to update events on the
* base table and/or to provide IndexSearcher functionality for reads. The double dispatch
* involved here, where the Index actually performs its own registration by calling back to the
* supplied IndexRegistry's own registerIndex method, is so that the decision as to whether or
* not to register an index belongs to the implementation, not the manager.
*
* @param registry the index registry to register the instance with
*/
override def register(registry: IndexRegistry) {
registry.registerIndex(this)
}
/** If the index implementation uses a local table to store its index data this method should
* return a handle to it. If not, an empty Optional should be returned. Typically, this is useful
* for the built-in Index implementations.
*
* @return the Index's backing storage table
*/
override def getBackingTable: Optional[ColumnFamilyStore] = Optional.empty()
/** Return a task which performs a blocking flush of the index's data to persistent storage.
*
* @return task to be executed by the index manager to perform the flush
*/
override def getBlockingFlushTask: Callable[_] = () => {
logger.info(s"Flushing Lucene index $name")
service.commit()
}
/** Return a task which invalidates the index, indicating it should no longer be considered
* usable. This should include any cleanup and releasing of resources required when dropping an
* index.
*
* @return task to be executed by the index manager to invalidate the index
*/
override def getInvalidateTask: Callable[_] = () => {
logger.info(s"Invalidating Lucene index $name")
service.delete()
}
/** Return a task to truncate the index with the specified truncation timestamp. Called when the
* base table is truncated.
*
* @param truncatedAt timestamp of the truncation operation. This will be the same timestamp used
* in the truncation of the base table.
* @return task to be executed by the index manager when the base table is truncated.
*/
override def getTruncateTask(truncatedAt: Long): Callable[_] = () => {
logger.info(s"Truncating Lucene index $name")
service.truncate()
}
/** Return true if this index can be built or rebuilt when the index manager determines it is
* necessary. Returning false enables the index implementation (or some other component) to
* control if and when SSTable data is incorporated into the index.
*
* This is called by SecondaryIndexManager in buildIndexBlocking, buildAllIndexesBlocking and
* rebuildIndexesBlocking where a return value of false causes the index to be excluded from the
* set of those which will process the SSTable data.
*
* @return if the index should be included in the set which processes SSTable data
*/
override def shouldBuildBlocking: Boolean = {
true
}
/** Called to determine whether this index targets a specific column. Used during schema
* operations such as when dropping or renaming a column, to check if the index will be affected
* by the change. Typically, if an index answers that it does depend upon a column, then schema
* operations on that column are not permitted until the index is dropped or altered.
*
* @param column the column definition to check
* @return true if the index depends on the supplied column being present; false if the column
* may be safely dropped or modified without adversely affecting the index
*/
override def dependsOn(column: ColumnMetadata): Boolean = {
// TODO: Could return true only for key and/or mapped columns?
logger.trace(s"Asking if the index depends on column $column")
service.dependsOn(column)
}
/** Called to determine whether this index can provide a searcher to execute a query on the
* supplied column using the specified operator. This forms part of the query validation done
* before a CQL select statement is executed.
*
* @param column the target column of a search query predicate
* @param operator the operator of a search query predicate
* @return true if this index is capable of supporting such expressions, false otherwise
*/
override def supportsExpression(column: ColumnMetadata, operator: Operator): Boolean = {
logger.trace(s"Asking if the index supports the expression $column $operator")
service.expressionMapper.supports(column, operator)
}
/** If the index supports custom search expressions using the {{{SELECT * FROM table WHERE
* expr(index_name, expression)}}} syntax, this method should return the expected type of the
* expression argument. For example, if the index supports custom expressions as Strings, calls
* to this method should return `UTF8Type.instance`. If the index implementation does not support
* custom expressions, then it should return null.
*
* @return the type of custom expressions supported by this index, or null if custom expressions
* are not supported.
*/
override def customExpressionValueType: AbstractType[_] = {
logger.trace("Requesting the custom expressions value type")
UTF8Type.instance
}
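// Illustrative only (hypothetical keyspace, table and index names): the custom expression is
// received as a UTF-8 string containing a Lucene search in the plugin's JSON syntax, e.g.
//
//   SELECT * FROM ks.users
//   WHERE expr(users_index, '{filter: {type: "match", field: "name", value: "Alice"}}');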
/** Transform an initial RowFilter into the filter that will still need to be applied to a set of
* Rows after the index has performed its initial scan. Used in ReadCommand#executeLocal to
* reduce the amount of filtering performed on the results of the index query.
*
* @param filter the initial filter belonging to a ReadCommand
* @return the (hopefully) reduced filter that would still need to be applied after the index was
* used to narrow the initial result set
*/
override def getPostIndexQueryFilter(filter: RowFilter): RowFilter = {
logger.trace(s"Getting the post index query filter for $filter")
service.expressionMapper.postIndexQueryFilter(filter)
}
/** Return an estimate of the number of results this index is expected to return for any given
* query that it can be used to answer. Used in conjunction with indexes() and
* supportsExpression() to determine the most selective index for a given ReadCommand.
* Additionally, this is also used by StorageProxy.estimateResultsPerRange to calculate the
* initial concurrency factor for range requests.
*
* @return the estimated average number of results an IndexSearcher may return for any given query
*/
override def getEstimatedResultRows: Long = {
logger.trace("Getting the estimated result rows")
1
}
/** Called at write time to ensure that values present in the update are valid according to the
* rules of all registered indexes which will process it. The partition key as well as the
* clustering and cell values for each row in the update may be checked by index implementations.
*
* @param update PartitionUpdate containing the values to be validated by registered indexes.
* @throws InvalidRequestException If the update doesn't pass through the validation.
*/
override def validate(update: PartitionUpdate) {
logger.trace(s"Validating $update")
try {
service.validate(update)
} catch {
case e: Exception =>
logger.debug(s"Invalid partition update: $update", e)
throw new InvalidRequestException(e.getMessage)
}
}
/** Creates a new indexer object for updates to a given partition.
*
* @param key key of the partition being modified
* @param columns the regular and static columns the created indexer will have to deal
* with. This can be empty as an update might only contain partition,
* range and row deletions, but the indexer is guaranteed to not get any
* cells for a column that is not part of columns.
* @param nowInSec current time of the update operation
* @param transactionType indicates what kind of update is being performed on the base data i.e.
* a write time insert/update/delete or the result of compaction
* @return the newly created indexer or `null` if the index is not interested in the update (this
* could be because the index doesn't care about that particular partition, doesn't care
* about that type of transaction, ...).
*/
override def indexerFor(key: DecoratedKey,
columns: RegularAndStaticColumns,
nowInSec: Int,
ctx: WriteContext,
transactionType: IndexTransaction.Type): Indexer = {
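// The generic WriteContext is narrowed to CassandraWriteContext to obtain the operation group
// under which the Lucene writer performs its updates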
service.writer(key, nowInSec, ctx.asInstanceOf[CassandraWriteContext].getGroup, transactionType)
}
override def supportsReplicaFilteringProtection(rowFilter: RowFilter): Boolean = false
/** Return a function which performs post processing on the results of a partition range read
* command. In the future, this may be used as a generalized mechanism for transforming results on
* the coordinator prior to returning them to the caller.
*
* This is used on the coordinator during execution of a range command to perform post processing
* of merged results obtained from the necessary replicas. Currently this is the only place where
* results are transformed in this way, but that may change over time as usage is generalized. See
* CASSANDRA-8717 for further discussion.
*
* The function takes a PartitionIterator of the results from the replicas which has already been
* collated and reconciled, along with the command being executed. It returns another
* PartitionIterator containing the results of the transformation (which may be the same as the
* input if the transformation is a no-op).
*/
override def postProcessorFor(command: ReadCommand)
: BiFunction[PartitionIterator, ReadCommand, PartitionIterator] = {
new ReadCommandPostProcessor(service)
}
def postProcessorFor(group: Group): BiFunction[PartitionIterator, Group, PartitionIterator] = {
new GroupPostProcessor(service)
}
/** Factory method for query time search helper. Custom index implementations should perform any
* validation of query expressions here and throw a meaningful InvalidRequestException when any
* expression is invalid.
*
* @param command the read command being executed
* @return an IndexSearcher with which to perform the supplied command
* @throws InvalidRequestException if the command's expressions are invalid according to the
* specific syntax supported by the index implementation.
*/
override def searcherFor(command: ReadCommand): Searcher = {
logger.trace(s"Getting searcher for $command")
try {
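// Searcher has a single abstract method, so this lambda is the searcher itself; the actual
// Lucene search runs only when Cassandra invokes it with the read execution controller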
controller => service.search(command, controller)
} catch {
case e: Exception =>
logger.error(s"Error getting searcher for command: $command", e)
throw new InvalidRequestException(e.getMessage)
}
}
/** Validates the specified custom expression.
*
* @param expression the expression to be validated
* @return the valid search represented by `expression`
* @throws InvalidRequestException if the expression is not valid
*/
def validate(expression: RowFilter.Expression): Search = {
try {
service.validate(expression)
} catch {
case e: Exception =>
logger.debug(s"Invalid index expression: $expression", e)
throw new InvalidRequestException(e.getMessage)
}
}
}
/** Companion object for [[Index]]. */
object Index extends Logging {
/** Validates the specified index options.
*
* @param options the options to be validated
* @param metadata the metadata of the table to be indexed
* @return the validated options
* @throws ConfigurationException if the options are not valid
*/
def validateOptions(options: java.Map[String, String],
metadata: TableMetadata): java.Map[String, String] = {
logger.debug("Validating Lucene index options")
try {
IndexOptions.validate(options, metadata)
} catch {
case e: IndexException =>
logger.error(s"Invalid index options: $options", e)
throw new ConfigurationException(e.getMessage)
}
logger.debug("Lucene index options are valid")
Collections.emptyMap[String, String]
}
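// Illustrative only (hypothetical keyspace, table and option values): the options validated here
// are the ones supplied when the index is created with CQL, e.g.
//
//   CREATE CUSTOM INDEX users_index ON ks.users ()
//   USING 'com.stratio.cassandra.lucene.Index'
//   WITH OPTIONS = {
//     'refresh_seconds': '1',
//     'schema': '{fields: {name: {type: "text"}}}'
//   };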
}