org.apache.cassandra.index.sasi.disk.PerSSTableIndexWriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.disk;

import java.io.File;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.*;

import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.compaction.OperationType;
import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.db.rows.Unfiltered;
import org.apache.cassandra.index.sasi.analyzer.AbstractAnalyzer;
import org.apache.cassandra.index.sasi.conf.ColumnIndex;
import org.apache.cassandra.index.sasi.utils.CombinedTermIterator;
import org.apache.cassandra.index.sasi.utils.TypeUtil;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.FSError;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.Uninterruptibles;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PerSSTableIndexWriter implements SSTableFlushObserver
{
    private static final Logger logger = LoggerFactory.getLogger(PerSSTableIndexWriter.class);

    private static final int POOL_SIZE = 8;
    private static final ThreadPoolExecutor INDEX_FLUSHER_MEMTABLE;
    private static final ThreadPoolExecutor INDEX_FLUSHER_GENERAL;

    static
    {
        INDEX_FLUSHER_GENERAL = new JMXEnabledThreadPoolExecutor(POOL_SIZE, POOL_SIZE, 1, TimeUnit.MINUTES,
                                                                 new LinkedBlockingQueue<>(),
                                                                 new NamedThreadFactory("SASI-General"),
                                                                 "internal");
        INDEX_FLUSHER_GENERAL.allowCoreThreadTimeOut(true);

        INDEX_FLUSHER_MEMTABLE = new JMXEnabledThreadPoolExecutor(POOL_SIZE, POOL_SIZE, 1, TimeUnit.MINUTES,
                                                                  new LinkedBlockingQueue<>(),
                                                                  new NamedThreadFactory("SASI-Memtable"),
                                                                  "internal");
        INDEX_FLUSHER_MEMTABLE.allowCoreThreadTimeOut(true);
    }

    private final int nowInSec = FBUtilities.nowInSeconds();

    private final Descriptor descriptor;
    private final OperationType source;

    private final AbstractType keyValidator;

    @VisibleForTesting
    protected final Map indexes;

    private DecoratedKey currentKey;
    private long currentKeyPosition;
    private boolean isComplete;

    public PerSSTableIndexWriter(AbstractType keyValidator,
                                 Descriptor descriptor,
                                 OperationType source,
                                 Map supportedIndexes)
    {
        this.keyValidator = keyValidator;
        this.descriptor = descriptor;
        this.source = source;
        this.indexes = Maps.newHashMapWithExpectedSize(supportedIndexes.size());
        for (Map.Entry entry : supportedIndexes.entrySet())
            indexes.put(entry.getKey(), newIndex(entry.getValue()));
    }

    public void begin()
    {}

    public void startPartition(DecoratedKey key, long curPosition)
    {
        currentKey = key;
        currentKeyPosition = curPosition;
    }

    public void nextUnfilteredCluster(Unfiltered unfiltered)
    {
        if (!unfiltered.isRow())
            return;

        Row row = (Row) unfiltered;

        indexes.forEach((column, index) -> {
            ByteBuffer value = ColumnIndex.getValueOf(column, row, nowInSec);
            if (value == null)
                return;

            if (index == null)
                throw new IllegalArgumentException("No index exists for column " + column.name.toString());

            index.add(value.duplicate(), currentKey, currentKeyPosition);
        });
    }

    public void complete()
    {
        if (isComplete)
            return;

        currentKey = null;

        try
        {
            CountDownLatch latch = new CountDownLatch(indexes.size());
            for (Index index : indexes.values())
                index.complete(latch);

            Uninterruptibles.awaitUninterruptibly(latch);
        }
        finally
        {
            indexes.clear();
            isComplete = true;
        }
    }

    public Index getIndex(ColumnMetadata columnDef)
    {
        return indexes.get(columnDef);
    }

    public Descriptor getDescriptor()
    {
        return descriptor;
    }

    @VisibleForTesting
    protected Index newIndex(ColumnIndex columnIndex)
    {
        return new Index(columnIndex);
    }

    @VisibleForTesting
    protected class Index
    {
        @VisibleForTesting
        protected final String outputFile;

        private final ColumnIndex columnIndex;
        private final AbstractAnalyzer analyzer;
        private final long maxMemorySize;

        @VisibleForTesting
        protected final Set> segments;
        private int segmentNumber = 0;

        private OnDiskIndexBuilder currentBuilder;

        public Index(ColumnIndex columnIndex)
        {
            this.columnIndex = columnIndex;
            this.outputFile = descriptor.filenameFor(columnIndex.getComponent());
            this.analyzer = columnIndex.getAnalyzer();
            this.segments = new HashSet<>();
            this.maxMemorySize = maxMemorySize(columnIndex);
            this.currentBuilder = newIndexBuilder();
        }

        public void add(ByteBuffer term, DecoratedKey key, long keyPosition)
        {
            if (term.remaining() == 0)
                return;

            boolean isAdded = false;

            analyzer.reset(term);
            while (analyzer.hasNext())
            {
                ByteBuffer token = analyzer.next();
                int size = token.remaining();

                if (token.remaining() >= OnDiskIndexBuilder.MAX_TERM_SIZE)
                {
                    logger.info("Rejecting value (size {}, maximum {}) for column {} (analyzed {}) at {} SSTable.",
                            FBUtilities.prettyPrintMemory(term.remaining()),
                            FBUtilities.prettyPrintMemory(OnDiskIndexBuilder.MAX_TERM_SIZE),
                            columnIndex.getColumnName(),
                            columnIndex.getMode().isAnalyzed,
                            descriptor);
                    continue;
                }

                if (!TypeUtil.isValid(token, columnIndex.getValidator()))
                {
                    if ((token = TypeUtil.tryUpcast(token, columnIndex.getValidator())) == null)
                    {
                        logger.info("({}) Failed to add {} to index for key: {}, value size was {}, validator is {}.",
                                    outputFile,
                                    columnIndex.getColumnName(),
                                    keyValidator.getString(key.getKey()),
                                    FBUtilities.prettyPrintMemory(size),
                                    columnIndex.getValidator());
                        continue;
                    }
                }

                currentBuilder.add(token, key, keyPosition);
                isAdded = true;
            }

            if (!isAdded || currentBuilder.estimatedMemoryUse() < maxMemorySize)
                return; // non of the generated tokens were added to the index or memory size wasn't reached

            segments.add(getExecutor().submit(scheduleSegmentFlush(false)));
        }

        @VisibleForTesting
        protected Callable scheduleSegmentFlush(final boolean isFinal)
        {
            final OnDiskIndexBuilder builder = currentBuilder;
            currentBuilder = newIndexBuilder();

            final String segmentFile = filename(isFinal);

            return () -> {
                long start = System.nanoTime();

                try
                {
                    File index = new File(segmentFile);
                    return builder.finish(index) ? new OnDiskIndex(index, columnIndex.getValidator(), null) : null;
                }
                catch (Exception | FSError e)
                {
                    logger.error("Failed to build index segment {}", segmentFile, e);
                    return null;
                }
                finally
                {
                    if (!isFinal)
                        logger.info("Flushed index segment {}, took {} ms.", segmentFile, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
                }
            };
        }

        public void complete(final CountDownLatch latch)
        {
            logger.info("Scheduling index flush to {}", outputFile);

            getExecutor().submit((Runnable) () -> {
                long start1 = System.nanoTime();

                OnDiskIndex[] parts = new OnDiskIndex[segments.size() + 1];

                try
                {
                    // no parts present, build entire index from memory
                    if (segments.isEmpty())
                    {
                        scheduleSegmentFlush(true).call();
                        return;
                    }

                    // parts are present but there is something still in memory, let's flush that inline
                    if (!currentBuilder.isEmpty())
                    {
                        @SuppressWarnings("resource")
                        OnDiskIndex last = scheduleSegmentFlush(false).call();
                        segments.add(Futures.immediateFuture(last));
                    }

                    int index = 0;
                    ByteBuffer combinedMin = null, combinedMax = null;

                    for (Future f : segments)
                    {
                        @SuppressWarnings("resource")
                        OnDiskIndex part = f.get();
                        if (part == null)
                            continue;

                        parts[index++] = part;
                        combinedMin = (combinedMin == null || keyValidator.compare(combinedMin, part.minKey()) > 0) ? part.minKey() : combinedMin;
                        combinedMax = (combinedMax == null || keyValidator.compare(combinedMax, part.maxKey()) < 0) ? part.maxKey() : combinedMax;
                    }

                    OnDiskIndexBuilder builder = newIndexBuilder();
                    builder.finish(Pair.create(combinedMin, combinedMax),
                                   new File(outputFile),
                                   new CombinedTermIterator(parts));
                }
                catch (Exception | FSError e)
                {
                    logger.error("Failed to flush index {}.", outputFile, e);
                    FileUtils.delete(outputFile);
                }
                finally
                {
                    logger.info("Index flush to {} took {} ms.", outputFile, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start1));

                    for (int segment = 0; segment < segmentNumber; segment++)
                    {
                        @SuppressWarnings("resource")
                        OnDiskIndex part = parts[segment];

                        if (part != null)
                            FileUtils.closeQuietly(part);

                        FileUtils.delete(outputFile + "_" + segment);
                    }

                    latch.countDown();
                }
            });
        }

        private ExecutorService getExecutor()
        {
            return source == OperationType.FLUSH ? INDEX_FLUSHER_MEMTABLE : INDEX_FLUSHER_GENERAL;
        }

        private OnDiskIndexBuilder newIndexBuilder()
        {
            return new OnDiskIndexBuilder(keyValidator, columnIndex.getValidator(), columnIndex.getMode().mode);
        }

        public String filename(boolean isFinal)
        {
            return outputFile + (isFinal ? "" : "_" + segmentNumber++);
        }
    }

    protected long maxMemorySize(ColumnIndex columnIndex)
    {
        // 1G for memtable and configuration for compaction
        return source == OperationType.FLUSH ? 1073741824L : columnIndex.getMode().maxCompactionFlushMemoryInBytes;
    }

    public int hashCode()
    {
        return descriptor.hashCode();
    }

    public boolean equals(Object o)
    {
        return !(o == null || !(o instanceof PerSSTableIndexWriter)) && descriptor.equals(((PerSSTableIndexWriter) o).descriptor);
    }
}