/*
 * Copyright (C) 2014 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.cassandra.lucene;

import com.stratio.cassandra.lucene.column.Columns;
import com.stratio.cassandra.lucene.index.*;
import com.stratio.cassandra.lucene.key.PartitionMapper;
import com.stratio.cassandra.lucene.key.TokenMapper;
import com.stratio.cassandra.lucene.schema.Schema;
import com.stratio.cassandra.lucene.search.Search;
import com.stratio.cassandra.lucene.search.SearchBuilder;
import com.stratio.cassandra.lucene.util.*;
import com.stratio.cassandra.lucene.util.TaskQueue;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.cql3.Operator;
import org.apache.cassandra.cql3.statements.IndexTarget;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ClusteringIndexFilter;
import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.filter.RowFilter;
import org.apache.cassandra.db.filter.RowFilter.Expression;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.db.partitions.PartitionIterator;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.index.transactions.IndexTransaction;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.concurrent.OpOrder;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.management.JMException;
import javax.management.ObjectName;
import java.lang.management.ManagementFactory;
import java.nio.ByteBuffer;
import java.util.*;

import static org.apache.lucene.search.SortField.FIELD_SCORE;

/**
 * Lucene {@link Index} service provider.
 *
 * @author Andres de la Pena
 */
abstract class IndexService implements IndexServiceMBean {
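
    // A concrete IndexService is obtained through build(), which selects the skinny-row or wide-row
    // implementation, and is prepared with init(). Writes are queued per partition key in the TaskQueue and
    // applied to the Lucene FSIndex (upsert/delete), reads build an Index.Searcher from the JSON search found
    // in the CQL row filter, and postProcess() merges and re-sorts node-local results on the coordinator.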

    protected static final Logger logger = LoggerFactory.getLogger(IndexService.class);

    private static final String POST_PROCESSING_FIELD = "_id";
    private static final Set<String> POST_PROCESSING_FIELDS = Collections.singleton(POST_PROCESSING_FIELD);

    final String qualifiedName;
    final TokenMapper tokenMapper;
    final PartitionMapper partitionMapper;
    protected final ColumnFamilyStore table;
    protected final CFMetaData metadata;
    protected final Schema schema;
    private final FSIndex lucene;
    private final String name;
    private final String column;
    private final ColumnDefinition columnDefinition;
    private final TaskQueue queue;
    private final boolean mapsMultiCells;
    private String mbeanName;
    private ObjectName mbean;

    /**
     * Constructor using the specified indexed table and index metadata.
     *
     * @param indexedTable the indexed table
     * @param indexMetadata the index metadata
     */
    IndexService(ColumnFamilyStore indexedTable, IndexMetadata indexMetadata) {

        table = indexedTable;
        metadata = table.metadata;
        name = indexMetadata.name;
        column = column(indexMetadata);
        columnDefinition = columnDefinition(metadata, column);
        qualifiedName = String.format("%s.%s.%s", metadata.ksName, metadata.cfName, indexMetadata.name);
        mbeanName = String.format("com.stratio.cassandra.lucene:type=Lucene,keyspace=%s,table=%s,index=%s",
                                  metadata.ksName,
                                  metadata.cfName,
                                  name);

        // Parse options
        IndexOptions options = new IndexOptions(metadata, indexMetadata);

        // Setup mapping
        schema = options.schema;
        tokenMapper = new TokenMapper();
        partitionMapper = new PartitionMapper(metadata);
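        // Check whether any mapped column is a collection or other multi-cell type, since indexing those
        // requires reading the stored row before writing (see needsReadBeforeWrite)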
        mapsMultiCells = metadata.allColumns()
                                 .stream()
                                 .filter(x -> schema.mappedCells().contains(x.name.toString()))
                                 .anyMatch(x -> x.type.isMultiCell());

        // Setup FS index and write queue
        queue = new TaskQueue(options.indexingThreads, options.indexingQueuesSize);
        lucene = new FSIndex(name,
                             options.path,
                             options.schema.analyzer(),
                             options.refreshSeconds,
                             options.ramBufferMB,
                             options.maxMergeMB,
                             options.maxCachedMB);
    }

    private static String column(IndexMetadata indexMetadata) {
        String column = indexMetadata.options.get(IndexTarget.TARGET_OPTION_NAME);
        return StringUtils.isBlank(column) ? null : column;
    }

    private static ColumnDefinition columnDefinition(CFMetaData metadata, String name) {
        if (StringUtils.isNotBlank(name)) {
            for (ColumnDefinition def : metadata.allColumns()) {
                if (def.name.toString().equals(name)) {
                    return def;
                }
            }
        }
        return null;
    }

    void init() {

        // Initialize index
        List<SortField> keySortFields = keySortFields();
        Sort keySort = new Sort(keySortFields.toArray(new SortField[keySortFields.size()]));
        try {
            lucene.init(keySort, fieldsToLoad());
        } catch (Exception e) {
            logger.error(String.format(
                    "Initialization of Lucene FS directory for index '%s' has failed, " +
                    "this could be caused by on-disk data corruption, " +
                    "or by an upgrade to an incompatible version, " +
                    "try to drop the failing index and create it again:",
                    name), e);
        }

        // Register JMX MBean
        try {
            mbean = new ObjectName(mbeanName);
            ManagementFactory.getPlatformMBeanServer().registerMBean(this, this.mbean);
        } catch (JMException e) {
            logger.error("Error while registering Lucene index JMX MBean", e);
        }
    }

    /**
     * Returns a new index service for the specified indexed table and index metadata.
     *
     * @param table the indexed table
     * @param indexMetadata the index metadata
     * @return the index service
     */
    static IndexService build(ColumnFamilyStore table, IndexMetadata indexMetadata) {
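        // A clustering comparator without subtypes means the table has no clustering columns, so each
        // partition holds a single row (skinny); otherwise the wide-row variant, which also maps the
        // clustering key, is used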
        return table.getComparator().subtypes().isEmpty()
               ? new IndexServiceSkinny(table, indexMetadata)
               : new IndexServiceWide(table, indexMetadata);
    }

    /**
     * Returns if the specified column definition is mapped by this index.
     *
     * @param columnDef a column definition
     * @return {@code true} if the column is mapped, {@code false} otherwise
     */
    boolean dependsOn(ColumnDefinition columnDef) {
        return schema.maps(columnDef);
    }

    /**
     * Returns if the specified {@link Expression} is targeted to this index.
     *
     * @param expression a CQL query expression
     * @return {@code true} if {@code expression} is targeted to this index, {@code false} otherwise
     */
    private boolean supportsExpression(Expression expression) {
        return supportsExpression(expression.column(), expression.operator());
    }

    /**
     * Returns if a CQL expression with the specified {@link ColumnDefinition} and {@link Operator} is targeted to this
     * index.
     *
     * @param columnDef the expression column definition
     * @param operator the expression operator
     * @return {@code true} if the expression is targeted to this index, {@code false} otherwise
     */
    boolean supportsExpression(ColumnDefinition columnDef, Operator operator) {
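        // Only equality against the optionally declared target column is handled here; the value of that dummy
        // text column is expected to carry the search as a JSON string (see validate and expression below)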
        return column != null &&
               operator == Operator.EQ &&
               column.equals(columnDef.name.toString()) &&
               columnDef.cellValueType() instanceof UTF8Type;
    }

    /**
     * Returns a copy of the specified {@link RowFilter} without any Lucene {@link Expression}s.
     *
     * @param filter a row filter
     * @return a copy of {@code filter} without Lucene {@link Expression}s
     */
    RowFilter getPostIndexQueryFilter(RowFilter filter) {
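        // Expressions answered by this index are stripped so that Cassandra does not re-apply them to the rows
        // returned by the index; any remaining expressions are left for standard post-filtering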
        if (column != null) {
            for (Expression expression : filter) {
                if (supportsExpression(expression)) {
                    filter = filter.without(expression);
                }
            }
        }
        return filter;
    }

    /**
     * Returns the validated {@link Search} contained in the specified expression.
     *
     * @param expression a custom CQL expression
     * @return the validated {@link Search}
     */
    Search validate(RowFilter.Expression expression) {
        ByteBuffer value = expression instanceof RowFilter.CustomExpression
                           ? ((RowFilter.CustomExpression) expression).getValue()
                           : expression.getIndexValue();
        String json = UTF8Type.instance.compose(value);
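        // For illustration, json might look like {filter: {type: "match", field: "name", value: "Alice"}};
        // the supported condition types are defined by the search module rather than by this class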
        Search search = SearchBuilder.fromJson(json).build();
        search.validate(schema);
        return search;
    }

    /**
     * Returns the names of the Lucene fields to be loaded from index during searches.
     *
     * @return the names of the fields to be loaded
     */
    abstract Set<String> fieldsToLoad();

    /**
     * Returns the Lucene {@link SortField}s required to retrieve documents sorted by Cassandra's primary key.
     *
     * @return the sort fields
     */
    abstract List<SortField> keySortFields();

    /**
     * Returns a {@link Columns} representing the specified {@link Row}.
     *
     * @param key the partition key
     * @param row the {@link Row}
     * @return the columns representing the specified {@link Row}
     */
    abstract Columns columns(DecoratedKey key, Row row);

    /**
     * Returns the Lucene {@link Document} representing the specified {@link Row}. Only the fields required by the
     * post processing phase of the specified {@link Search} will be added.
     *
     * @param key the partition key
     * @param row the {@link Row}
     * @param search a search
     * @return a document
     */
    private Document document(DecoratedKey key, Row row, Search search) {
        Document document = new Document();
        Columns columns = columns(key, row);
        keyIndexableFields(key, row).forEach(document::add);
        schema.postProcessingIndexableFields(columns, search).forEach(document::add);
        return document;
    }

    protected abstract List<IndexableField> keyIndexableFields(DecoratedKey key, Row row);

    /**
     * Returns a Lucene {@link Term} uniquely identifying the specified {@link Row}.
     *
     * @param key the partition key
     * @param row the {@link Row}
     * @return a Lucene identifying {@link Term}
     */
    abstract Term term(DecoratedKey key, Row row);

    /**
     * Returns a Lucene {@link Term} identifying the documents representing all the {@link Row}s which are in the
     * partition of the specified {@link DecoratedKey}.
     *
     * @param key the partition key
     * @return a Lucene {@link Term} representing {@code key}
     */
    Term term(DecoratedKey key) {
        return partitionMapper.term(key);
    }

    /**
     * Returns if SSTables can contain additional columns of the specified {@link Row}, so read-before-write is
     * required prior to indexing.
     *
     * @param key the partition key
     * @param row the {@link Row}
     * @return {@code true} if read-before-write is required, {@code false} otherwise
     */
    boolean needsReadBeforeWrite(DecoratedKey key, Row row) {
        if (mapsMultiCells) {
            return true;
        } else {
            Columns columns = columns(key, row);
            return schema.mappedCells().stream().anyMatch(x -> columns.withCellName(x).isEmpty());
        }
    }

    /**
     * Returns a {@link NavigableSet} of the specified clusterings, sorted by the table metadata.
     *
     * @param clusterings the clusterings to be included in the set
     * @return the navigable sorted set
     */
    NavigableSet<Clustering> clusterings(Clustering... clusterings) {
        NavigableSet<Clustering> sortedClusterings = new TreeSet<>(metadata.comparator);
        if (clusterings.length > 0) {
            sortedClusterings.addAll(Arrays.asList(clusterings));
        }
        return sortedClusterings;
    }

    /**
     * Returns the {@link DecoratedKey} contained in the specified Lucene {@link Document}.
     *
     * @param document the {@link Document} containing the partition key to be retrieved
     * @return the {@link DecoratedKey} contained in the specified Lucene {@link Document}
     */
    DecoratedKey decoratedKey(Document document) {
        return partitionMapper.decoratedKey(document);
    }

    /**
     * Creates a new {@code IndexWriter} object for updates to a given partition.
     *
     * @param key key of the partition being modified
     * @param nowInSec current time of the update operation
     * @param opGroup operation group spanning the update operation
     * @param transactionType what kind of update is being performed on the base data
     * @return the newly created {@code IndexWriter}
     */
    abstract IndexWriter indexWriter(DecoratedKey key,
                                     int nowInSec,
                                     OpOrder.Group opGroup,
                                     IndexTransaction.Type transactionType);

    /** Deletes all the index contents. */
    final void truncate() {
        queue.submitSynchronous(lucene::truncate);
    }

    /** Closes and removes all the index files. */
    final void delete() {
        try {
            queue.shutdown();
            ManagementFactory.getPlatformMBeanServer().unregisterMBean(mbean);
        } catch (JMException e) {
            logger.error("Error while unregistering Lucene index MBean", e);
        } finally {
            lucene.delete();
        }
    }

    /**
     * Upserts the specified {@link Row}.
     *
     * @param key the partition key
     * @param row the row to be upserted
     * @param nowInSec now in seconds
     */
    void upsert(DecoratedKey key, Row row, int nowInSec) {
        queue.submitAsynchronous(key, () -> {
            Term term = term(key, row);
            Columns columns = columns(key, row).withoutDeleted(nowInSec);
            List<IndexableField> fields = schema.indexableFields(columns);
            if (fields.isEmpty()) {
                // No mapped columns remain alive, so any existing document for this row is removed
                lucene.delete(term);
            } else {
                Document document = new Document();
                fields.forEach(document::add);
                keyIndexableFields(key, row).forEach(document::add);
                lucene.upsert(term, document);
            }
        });
    }

    /**
     * Deletes the row identified by the specified partition key and {@link Row}.
     *
     * @param key the partition key
     * @param row the row to be deleted
     */
    void delete(DecoratedKey key, Row row) {
        queue.submitAsynchronous(key, () -> {
            Term term = term(key, row);
            lucene.delete(term);
        });
    }

    /**
     * Deletes the partition identified by the specified key.
     *
     * @param key the partition key
     */
    void delete(DecoratedKey key) {
        queue.submitAsynchronous(key, () -> {
            Term term = term(key);
            lucene.delete(term);
        });
    }

    /**
     * Returns a new {@link Index.Searcher} for the specified {@link ReadCommand}.
     *
     * @param command the read command being executed
     * @return a searcher with which to perform the supplied command
     */
    Index.Searcher searcher(ReadCommand command) {

        // Parse search
        Tracer.trace("Building Lucene search");
        String expression = expression(command);
        Search search = SearchBuilder.fromJson(expression).build();
        Query query = search.query(schema, query(command).orElse(null));
        Query after = after(search.paging(), command);
        Sort sort = sort(search);
        int count = command.limits().count();

        // Refresh if required
        if (search.refresh()) {
            Tracer.trace("Refreshing Lucene index searcher");
            refresh();
        }

        // Search
        Tracer.trace("Lucene index searching for {} rows", count);
        DocumentIterator documents = lucene.search(after, query, sort, count);
        return (ReadExecutionController readExecutionController) -> indexReader(documents,
                                                                                command,
                                                                                readExecutionController);
    }

    private Search search(ReadCommand command) {
        return SearchBuilder.fromJson(expression(command)).build();
    }

    private Search search(SinglePartitionReadCommand.Group group) {
        return SearchBuilder.fromJson(expression(group)).build();
    }

    private String expression(ReadCommand command) {
        for (Expression expression : command.rowFilter().getExpressions()) {
            if (expression.isCustom()) {
                RowFilter.CustomExpression customExpression = (RowFilter.CustomExpression) expression;
                if (name.equals(customExpression.getTargetIndex().name)) {
                    ByteBuffer bb = customExpression.getValue();
                    return UTF8Type.instance.compose(bb);
                }
            }
            if (supportsExpression(expression)) {
                ByteBuffer bb = expression.getIndexValue();
                return UTF8Type.instance.compose(bb);
            }
        }
        throw new IndexException("Lucene search expression not found in command expressions");
    }

    private String expression(SinglePartitionReadCommand.Group group) {
        String result = null;
        for (ReadCommand command : group.commands) {
            String expression = expression(command);
            if (result == null) {
                result = expression;
            } else if (!result.equals(expression)) {
                throw new IndexException("Unable to process command group with different index clauses");
            }
        }
        return result;
    }

    /**
     * Returns the key range query represented by the specified {@link ReadCommand}.
     *
     * @param command the read command
     * @return the key range query
     */
    private Optional<Query> query(ReadCommand command) {
        if (command instanceof SinglePartitionReadCommand) {
            DecoratedKey key = ((SinglePartitionReadCommand) command).partitionKey();
            ClusteringIndexFilter clusteringFilter = command.clusteringIndexFilter(key);
            return Optional.of(query(key, clusteringFilter));
        } else if (command instanceof PartitionRangeReadCommand) {
            DataRange dataRange = ((PartitionRangeReadCommand) command).dataRange();
            return query(dataRange);
        } else {
            throw new IndexException("Unsupported read command {}", command.getClass());
        }
    }

    /**
     * Returns a Lucene {@link Query} to get the {@link Document}s satisfying the specified {@link DecoratedKey} and
     * {@link ClusteringIndexFilter}.
     *
     * @param key the partition key
     * @param filter the clustering key range
     * @return a query to get the {@link Document}s satisfying the key range
     */
    abstract Query query(DecoratedKey key, ClusteringIndexFilter filter);

    /**
     * Returns a Lucene {@link Query} to get the {@link Document}s satisfying the specified {@link DataRange}.
     *
     * @param dataRange the {@link DataRange}
     * @return a query to get the {@link Document}s satisfying the {@code dataRange}
     */
    abstract Optional<Query> query(DataRange dataRange);

    private Query after(IndexPagingState pagingState, ReadCommand command) {
        try {
            if (pagingState != null) {
                Pair<DecoratedKey, Clustering> position = pagingState.forCommand(command);
                return position == null ? null : after(position.left, position.right).orElse(null);
            }
            return null;
        } catch (RuntimeException e) {
            throw new IndexException(e, "Invalid paging state");
        }
    }

    /**
     * Returns a Lucene {@link Query} to retrieve the row identified by the specified paging state.
     *
     * @param key the partition key
     * @param clustering the clustering key
     * @return the query to retrieve the row
     */
    abstract Optional<Query> after(DecoratedKey key, Clustering clustering);

    /**
     * Returns the Lucene {@link Sort} with the specified {@link Search} sorting requirements followed by
     * Cassandra's natural ordering based on partitioning token and cell name.
     *
     * @param search the {@link Search} containing sorting requirements
     * @return a Lucene sort according to {@code search}
     */
    private Sort sort(Search search) {
        List<SortField> sortFields = new ArrayList<>();
        if (search.usesSorting()) {
            sortFields.addAll(search.sortFields(schema));
        }
        if (search.usesRelevance()) {
            sortFields.add(FIELD_SCORE);
        }
        sortFields.addAll(keySortFields());
        return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
    }

    /**
     * Retrieves from the local storage the {@link Row}s in the specified partition slice.
     *
     * @param key the partition key
     * @param clusterings the clustering keys
     * @param nowInSec max allowed time in seconds
     * @return a {@link Row} iterator
     */
    UnfilteredRowIterator read(DecoratedKey key, NavigableSet<Clustering> clusterings, int nowInSec) {
        ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings, false);
        ColumnFilter columnFilter = ColumnFilter.all(metadata);
        SinglePartitionReadCommand readCommand = SinglePartitionReadCommand.create(metadata,
                                                                                   nowInSec,
                                                                                   key,
                                                                                   columnFilter,
                                                                                   filter);
        try (ReadExecutionController controller = readCommand.executionController()) {
            return readCommand.queryMemtableAndDisk(table, controller);
        }
    }

    /**
     * Retrieves from the local storage all the {@link Row}s in the specified partition.
     *
     * @param key the partition key
     * @param nowInSec max allowed time in seconds
     * @return a {@link Row} iterator
     */
    UnfilteredRowIterator read(DecoratedKey key, int nowInSec) {
        return read(key, clusterings(Clustering.EMPTY), nowInSec);
    }

    /**
     * Reads from the local SSTables the rows identified by the specified search.
     *
     * @param documents the Lucene documents
     * @param command the Cassandra command
     * @param controller the Cassandra execution controller
     * @return the local {@link Row}s satisfying the search
     */
    abstract IndexReader indexReader(DocumentIterator documents,
                                     ReadCommand command,
                                     ReadExecutionController controller);

    /**
     * Post processes in the coordinator node the results of a distributed search. Gets the k globally best results
     * from all the k best node-local results.
     *
     * @param partitions the node results iterator
     * @param group the read command group
     * @return the k globally best results
     */
    PartitionIterator postProcess(PartitionIterator partitions, SinglePartitionReadCommand.Group group) {

        // Skip unneeded post processing if only one partition is involved
        if (group.commands.size() <= 1) {
            return partitions;
        }

        Search search = search(group);
        int limit = group.limits().count();
        int nowInSec = group.nowInSec();
        return postProcess(partitions, search, limit, nowInSec);
    }

    /**
     * Post processes in the coordinator node the results of a distributed search. Gets the k globally best results
     * from all the k best node-local results.
     *
     * @param partitions the node results iterator
     * @param command the read command
     * @return the k globally best results
     */
    PartitionIterator postProcess(PartitionIterator partitions, ReadCommand command) {

        // Skip unneeded post processing if only one partition is involved
        if (command instanceof SinglePartitionReadCommand) {
            return partitions;
        }

        Search search = search(command);
        int limit = command.limits().count();
        int nowInSec = command.nowInSec();
        return postProcess(partitions, search, limit, nowInSec);
    }

    private PartitionIterator postProcess(PartitionIterator partitions, Search search, int limit, int nowInSec) {
        if (search.requiresFullScan()) {
            List<Pair<DecoratedKey, SimpleRowIterator>> collectedRows = collect(partitions);

            // Skip if the search doesn't require any kind of sorting
            if (search.requiresPostProcessing() && !collectedRows.isEmpty()) {
                return process(search, limit, nowInSec, collectedRows);
            }
        }
        return partitions;
    }

    private List<Pair<DecoratedKey, SimpleRowIterator>> collect(PartitionIterator partitions) {
        List<Pair<DecoratedKey, SimpleRowIterator>> rows = new ArrayList<>();
        TimeCounter time = TimeCounter.create().start();
        try {
            while (partitions.hasNext()) {
                try (RowIterator partition = partitions.next()) {
                    DecoratedKey key = partition.partitionKey();
                    while (partition.hasNext()) {
                        SimpleRowIterator rowIterator = new SimpleRowIterator(partition);
                        rows.add(Pair.create(key, rowIterator));
                    }
                }
            }
        } finally {
            logger.debug("Collected {} rows in {}", rows.size(), time.stop());
        }
        return rows;
    }

    private SimplePartitionIterator process(Search search,
                                            int limit,
                                            int nowInSec,
                                            List<Pair<DecoratedKey, SimpleRowIterator>> collectedRows) {
        TimeCounter time = TimeCounter.create().start();
        List<SimpleRowIterator> processedRows = new LinkedList<>();
        try {

            // Index collected rows in memory
            RAMIndex index = new RAMIndex(schema.analyzer());
            Integer id = 0;
            for (Pair<DecoratedKey, SimpleRowIterator> pair : collectedRows) {
                DecoratedKey key = pair.left;
                SimpleRowIterator rowIterator = pair.right;
                Row row = rowIterator.getRow();
                Document document = document(key, row, search);
                document.add(new StoredField(POST_PROCESSING_FIELD, id++));
                index.add(document);
            }

            // Repeat search to sort partial results
            Query query = search.postProcessingQuery(schema);
            Sort sort = sort(search);
            List<Pair<Document, ScoreDoc>> documents = index.search(query, sort, limit, POST_PROCESSING_FIELDS);
            index.close();

            // Collect post processed results
            for (Pair<Document, ScoreDoc> pair : documents) {
                Document document = pair.left;
                Float score = pair.right.score;
                id = Integer.parseInt(document.get(POST_PROCESSING_FIELD));
                SimpleRowIterator rowIterator = collectedRows.get(id).right;
                rowIterator.setDecorator(row -> decorate(row, score, nowInSec));
                processedRows.add(rowIterator);
            }
        } finally {
            Tracer.trace("Lucene post-process {} collected rows to {} result rows",
                         collectedRows.size(),
                         processedRows.size());
            logger.debug("Post-processed {} collected rows to {} result rows in {}",
                         collectedRows.size(),
                         processedRows.size(),
                         time.stop());
        }
        return new SimplePartitionIterator(processedRows);
    }

    private Row decorate(Row row, Float score, int nowInSec) {

        // Skip if there is no target column or no score to attach
        if (column == null || score == null) {
            return row;
        }

        // Copy the row, adding the score as a cell of the indexed dummy column
        long timestamp = row.primaryKeyLivenessInfo().timestamp();
        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSec);
        builder.newRow(row.clustering());
        builder.addRowDeletion(row.deletion());
        builder.addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo());
        row.cells().forEach(builder::addCell);
        ByteBuffer value = UTF8Type.instance.decompose(Float.toString(score));
        builder.addCell(BufferCell.live(columnDefinition, timestamp, value));
        return builder.build();
    }

    /**
     * Ensures that values present in the specified {@link PartitionUpdate} are valid according to the {@link Schema}.
     *
     * @param update the partition update containing the values to be validated
     */
    void validate(PartitionUpdate update) {
        DecoratedKey key = update.partitionKey();
        for (Row row : update) {
            schema.validate(columns(key, row));
        }
    }

    /** {@inheritDoc} */
    @Override
    public final void commit() {
        queue.submitSynchronous(lucene::commit);
    }

    /** {@inheritDoc} */
    @Override
    public int getNumDocs() {
        return lucene.getNumDocs();
    }

    /** {@inheritDoc} */
    @Override
    public int getNumDeletedDocs() {
        return lucene.getNumDeletedDocs();
    }

    /** {@inheritDoc} */
    @Override
    public void forceMerge(int maxNumSegments, boolean doWait) {
        queue.submitSynchronous(() -> lucene.forceMerge(maxNumSegments, doWait));
    }

    /** {@inheritDoc} */
    @Override
    public void forceMergeDeletes(boolean doWait) {
        queue.submitSynchronous(() -> lucene.forceMergeDeletes(doWait));
    }

    /** {@inheritDoc} */
    @Override
    public void refresh() {
        queue.submitSynchronous(lucene::refresh);
    }
}




