/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.streams.state.internals;

import java.util.Optional;
import org.apache.kafka.common.utils.AbstractIterator;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.errors.InvalidStateStoreException;
import org.apache.kafka.streams.errors.ProcessorStateException;
import org.apache.kafka.streams.state.TimestampedBytesStore;
import org.apache.kafka.streams.state.internals.metrics.RocksDBMetricsRecorder;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.DBOptions;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.WriteBatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;

import static java.util.Arrays.asList;
import static org.apache.kafka.streams.state.TimestampedBytesStore.convertToTimestampedFormat;

/**
 * A persistent key-(value-timestamp) store based on RocksDB.
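 * <p>
 * Values are stored in a dedicated "keyValueWithTimestamp" column family. If the store is opened
 * on top of data written by the plain (non-timestamped) {@link RocksDBStore}, the old default
 * column family may still contain entries; in that case the store opens in upgrade mode and
 * migrates entries into the timestamped column family lazily on reads and writes.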
 */
public class RocksDBTimestampedStore extends RocksDBStore implements TimestampedBytesStore {
    private static final Logger log = LoggerFactory.getLogger(RocksDBTimestampedStore.class);

    public RocksDBTimestampedStore(final String name,
                                   final String metricsScope) {
        super(name, metricsScope);
    }

    RocksDBTimestampedStore(final String name,
                            final String parentDir,
                            final RocksDBMetricsRecorder metricsRecorder) {
        super(name, parentDir, metricsRecorder);
    }

    @Override
    void openRocksDB(final DBOptions dbOptions,
                     final ColumnFamilyOptions columnFamilyOptions) {
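        // Open the default (plain key-value) column family together with the timestamped
        // "keyValueWithTimestamp" column family; if the latter does not exist yet, the catch
        // block below creates it.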
        final List<ColumnFamilyDescriptor> columnFamilyDescriptors = asList(
            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, columnFamilyOptions),
            new ColumnFamilyDescriptor("keyValueWithTimestamp".getBytes(StandardCharsets.UTF_8), columnFamilyOptions));
        final List<ColumnFamilyHandle> columnFamilies = new ArrayList<>(columnFamilyDescriptors.size());

        try {
            db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), columnFamilyDescriptors, columnFamilies);
            setDbAccessor(columnFamilies.get(0), columnFamilies.get(1));
        } catch (final RocksDBException e) {
            if ("Column family not found: keyValueWithTimestamp".equals(e.getMessage())) {
                try {
                    db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), columnFamilyDescriptors.subList(0, 1), columnFamilies);
                    columnFamilies.add(db.createColumnFamily(columnFamilyDescriptors.get(1)));
                } catch (final RocksDBException fatal) {
                    throw new ProcessorStateException("Error opening store " + name + " at location " + dbDir.toString(), fatal);
                }
                setDbAccessor(columnFamilies.get(0), columnFamilies.get(1));
            } else {
                throw new ProcessorStateException("Error opening store " + name + " at location " + dbDir.toString(), e);
            }
        }
    }

    private void setDbAccessor(final ColumnFamilyHandle noTimestampColumnFamily,
                               final ColumnFamilyHandle withTimestampColumnFamily) {
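        // If the plain column family still holds data, migration to the timestamped format is not
        // finished yet: use the dual accessor that reads from both column families. Otherwise only
        // the timestamped column family is needed and the plain one can be closed right away.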
        final RocksIterator noTimestampsIter = db.newIterator(noTimestampColumnFamily);
        noTimestampsIter.seekToFirst();
        if (noTimestampsIter.isValid()) {
            log.info("Opening store {} in upgrade mode", name);
            dbAccessor = new DualColumnFamilyAccessor(noTimestampColumnFamily, withTimestampColumnFamily);
        } else {
            log.info("Opening store {} in regular mode", name);
            dbAccessor = new SingleColumnFamilyAccessor(withTimestampColumnFamily);
            noTimestampColumnFamily.close();
        }
        noTimestampsIter.close();
    }


    private class DualColumnFamilyAccessor implements RocksDBAccessor {
        private final ColumnFamilyHandle oldColumnFamily;
        private final ColumnFamilyHandle newColumnFamily;

        private DualColumnFamilyAccessor(final ColumnFamilyHandle oldColumnFamily,
                                         final ColumnFamilyHandle newColumnFamily) {
            this.oldColumnFamily = oldColumnFamily;
            this.newColumnFamily = newColumnFamily;
        }

        @Override
        public void put(final byte[] key,
                        final byte[] valueWithTimestamp) {
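            // Writes always remove the key from the old (plain) column family and apply the update
            // to the new (timestamped) one, so the old column family drains as keys are touched.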
            if (valueWithTimestamp == null) {
                try {
                    db.delete(oldColumnFamily, wOptions, key);
                } catch (final RocksDBException e) {
                    // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                    throw new ProcessorStateException("Error while removing key from store " + name, e);
                }
                try {
                    db.delete(newColumnFamily, wOptions, key);
                } catch (final RocksDBException e) {
                    // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                    throw new ProcessorStateException("Error while removing key from store " + name, e);
                }
            } else {
                try {
                    db.delete(oldColumnFamily, wOptions, key);
                } catch (final RocksDBException e) {
                    // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                    throw new ProcessorStateException("Error while removing key from store " + name, e);
                }
                try {
                    db.put(newColumnFamily, wOptions, key, valueWithTimestamp);
                    StoreQueryUtils.updatePosition(position, context);
                } catch (final RocksDBException e) {
                    // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                    throw new ProcessorStateException("Error while putting key/value into store " + name, e);
                }
            }
        }

        @Override
        public void prepareBatch(final List<KeyValue<Bytes, byte[]>> entries,
                                 final WriteBatch batch) throws RocksDBException {
            for (final KeyValue<Bytes, byte[]> entry : entries) {
                Objects.requireNonNull(entry.key, "key cannot be null");
                addToBatch(entry.key.get(), entry.value, batch);
            }
        }

        @Override
        public byte[] get(final byte[] key) throws RocksDBException {
            return get(key, Optional.empty());
        }

        @Override
        public byte[] get(final byte[] key, final ReadOptions readOptions) throws RocksDBException {
            return get(key, Optional.of(readOptions));
        }

        private byte[] get(final byte[] key, final Optional<ReadOptions> readOptions) throws RocksDBException {
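            // Check the timestamped column family first; on a miss, fall back to the plain column
            // family and migrate the entry into the timestamped format.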
            final byte[] valueWithTimestamp = readOptions.isPresent() ? db.get(newColumnFamily, readOptions.get(), key) : db.get(newColumnFamily, key);
            if (valueWithTimestamp != null) {
                return valueWithTimestamp;
            }

            final byte[] plainValue = readOptions.isPresent() ? db.get(oldColumnFamily, readOptions.get(), key) : db.get(oldColumnFamily, key);
            if (plainValue != null) {
                final byte[] valueWithUnknownTimestamp = convertToTimestampedFormat(plainValue);
                // this only works because the changelog topic already contains correct data;
                // for other format changes, we cannot take this shortcut and can only migrate data
                // from the old to the new store on put()
                put(key, valueWithUnknownTimestamp);
                return valueWithUnknownTimestamp;
            }

            return null;
        }

        @Override
        public byte[] getOnly(final byte[] key) throws RocksDBException {
            final byte[] valueWithTimestamp = db.get(newColumnFamily, key);
            if (valueWithTimestamp != null) {
                return valueWithTimestamp;
            }

            final byte[] plainValue = db.get(oldColumnFamily, key);
            if (plainValue != null) {
                return convertToTimestampedFormat(plainValue);
            }

            return null;
        }

        @Override
        public ManagedKeyValueIterator<Bytes, byte[]> range(final Bytes from,
                                                            final Bytes to,
                                                            final boolean forward) {
            return new RocksDBDualCFRangeIterator(
                name,
                db.newIterator(newColumnFamily),
                db.newIterator(oldColumnFamily),
                from,
                to,
                forward,
                true);
        }

        @Override
        public void deleteRange(final byte[] from, final byte[] to) {
            try {
                db.deleteRange(oldColumnFamily, wOptions, from, to);
            } catch (final RocksDBException e) {
                // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                throw new ProcessorStateException("Error while removing key from store " + name, e);
            }
            try {
                db.deleteRange(newColumnFamily, wOptions, from, to);
            } catch (final RocksDBException e) {
                // String formatting is done in the wrapping stores, so the formatted message is thrown from the wrapping stores.
                throw new ProcessorStateException("Error while removing key from store " + name, e);
            }
        }

        @Override
        public ManagedKeyValueIterator<Bytes, byte[]> all(final boolean forward) {
            final RocksIterator innerIterWithTimestamp = db.newIterator(newColumnFamily);
            final RocksIterator innerIterNoTimestamp = db.newIterator(oldColumnFamily);
            if (forward) {
                innerIterWithTimestamp.seekToFirst();
                innerIterNoTimestamp.seekToFirst();
            } else {
                innerIterWithTimestamp.seekToLast();
                innerIterNoTimestamp.seekToLast();
            }
            return new RocksDBDualCFIterator(name, innerIterWithTimestamp, innerIterNoTimestamp, forward);
        }

        @Override
        public ManagedKeyValueIterator<Bytes, byte[]> prefixScan(final Bytes prefix) {
            final Bytes to = incrementWithoutOverflow(prefix);
            return new RocksDBDualCFRangeIterator(
                name,
                db.newIterator(newColumnFamily),
                db.newIterator(oldColumnFamily),
                prefix,
                to,
                true,
                false
            );
        }

        @Override
        public long approximateNumEntries() throws RocksDBException {
            return db.getLongProperty(oldColumnFamily, "rocksdb.estimate-num-keys")
                + db.getLongProperty(newColumnFamily, "rocksdb.estimate-num-keys");
        }

        @Override
        public void flush() throws RocksDBException {
            db.flush(fOptions, oldColumnFamily);
            db.flush(fOptions, newColumnFamily);
        }

        @Override
        public void addToBatch(final byte[] key,
                               final byte[] value,
                               final WriteBatch batch) throws RocksDBException {
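            // Same migration rule as put(): always delete from the old column family and apply
            // the write (or delete) to the new one.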
            if (value == null) {
                batch.delete(oldColumnFamily, key);
                batch.delete(newColumnFamily, key);
            } else {
                batch.delete(oldColumnFamily, key);
                batch.put(newColumnFamily, key, value);
            }
        }

        @Override
        public void close() {
            oldColumnFamily.close();
            newColumnFamily.close();
        }
    }

    private class RocksDBDualCFIterator extends AbstractIterator<KeyValue<Bytes, byte[]>>
        implements ManagedKeyValueIterator<Bytes, byte[]> {
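        // Iterates both column families in parallel and merges them into a single key-ordered
        // view; values from the plain column family are converted to the timestamped format.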

        // RocksDB's JNI interface does not expose getters/setters that allow the
        // comparator to be pluggable, and the default is lexicographic, so it's
        // safe to just force lexicographic comparator here for now.
        private final Comparator<byte[]> comparator = Bytes.BYTES_LEXICO_COMPARATOR;

        private final String storeName;
        private final RocksIterator iterWithTimestamp;
        private final RocksIterator iterNoTimestamp;
        private final boolean forward;

        private volatile boolean open = true;

        private byte[] nextWithTimestamp;
        private byte[] nextNoTimestamp;
        private KeyValue<Bytes, byte[]> next;
        private Runnable closeCallback = null;

        RocksDBDualCFIterator(final String storeName,
                              final RocksIterator iterWithTimestamp,
                              final RocksIterator iterNoTimestamp,
                              final boolean forward) {
            this.iterWithTimestamp = iterWithTimestamp;
            this.iterNoTimestamp = iterNoTimestamp;
            this.storeName = storeName;
            this.forward = forward;
        }

        @Override
        public synchronized boolean hasNext() {
            if (!open) {
                throw new InvalidStateStoreException(String.format("RocksDB iterator for store %s has closed", storeName));
            }
            return super.hasNext();
        }

        @Override
        public synchronized KeyValue<Bytes, byte[]> next() {
            return super.next();
        }

        @Override
        protected KeyValue<Bytes, byte[]> makeNext() {
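            // Peek at the next key of each underlying iterator and emit the smaller one (the
            // larger one for reverse iteration); if a key exists in both column families, the
            // plain column family's entry is emitted first.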
            if (nextNoTimestamp == null && iterNoTimestamp.isValid()) {
                nextNoTimestamp = iterNoTimestamp.key();
            }

            if (nextWithTimestamp == null && iterWithTimestamp.isValid()) {
                nextWithTimestamp = iterWithTimestamp.key();
            }

            if (nextNoTimestamp == null && !iterNoTimestamp.isValid()) {
                if (nextWithTimestamp == null && !iterWithTimestamp.isValid()) {
                    return allDone();
                } else {
                    next = KeyValue.pair(new Bytes(nextWithTimestamp), iterWithTimestamp.value());
                    nextWithTimestamp = null;
                    if (forward) {
                        iterWithTimestamp.next();
                    } else {
                        iterWithTimestamp.prev();
                    }
                }
            } else {
                if (nextWithTimestamp == null) {
                    next = KeyValue.pair(new Bytes(nextNoTimestamp), convertToTimestampedFormat(iterNoTimestamp.value()));
                    nextNoTimestamp = null;
                    if (forward) {
                        iterNoTimestamp.next();
                    } else {
                        iterNoTimestamp.prev();
                    }
                } else {
                    if (forward) {
                        if (comparator.compare(nextNoTimestamp, nextWithTimestamp) <= 0) {
                            next = KeyValue.pair(new Bytes(nextNoTimestamp), convertToTimestampedFormat(iterNoTimestamp.value()));
                            nextNoTimestamp = null;
                            iterNoTimestamp.next();
                        } else {
                            next = KeyValue.pair(new Bytes(nextWithTimestamp), iterWithTimestamp.value());
                            nextWithTimestamp = null;
                            iterWithTimestamp.next();
                        }
                    } else {
                        if (comparator.compare(nextNoTimestamp, nextWithTimestamp) >= 0) {
                            next = KeyValue.pair(new Bytes(nextNoTimestamp), convertToTimestampedFormat(iterNoTimestamp.value()));
                            nextNoTimestamp = null;
                            iterNoTimestamp.prev();
                        } else {
                            next = KeyValue.pair(new Bytes(nextWithTimestamp), iterWithTimestamp.value());
                            nextWithTimestamp = null;
                            iterWithTimestamp.prev();
                        }
                    }
                }
            }
            return next;
        }

        @Override
        public synchronized void close() {
            if (closeCallback == null) {
                throw new IllegalStateException("RocksDBDualCFIterator expects close callback to be set immediately upon creation");
            }
            closeCallback.run();

            iterNoTimestamp.close();
            iterWithTimestamp.close();
            open = false;
        }

        @Override
        public Bytes peekNextKey() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return next.key;
        }

        @Override
        public void onClose(final Runnable closeCallback) {
            this.closeCallback = closeCallback;
        }
    }

    private class RocksDBDualCFRangeIterator extends RocksDBDualCFIterator {
        // RocksDB's JNI interface does not expose getters/setters that allow the
        // comparator to be pluggable, and the default is lexicographic, so it's
        // safe to just force lexicographic comparator here for now.
        private final Comparator<byte[]> comparator = Bytes.BYTES_LEXICO_COMPARATOR;
        private final byte[] rawLastKey;
        private final boolean forward;
        private final boolean toInclusive;

        RocksDBDualCFRangeIterator(final String storeName,
                                   final RocksIterator iterWithTimestamp,
                                   final RocksIterator iterNoTimestamp,
                                   final Bytes from,
                                   final Bytes to,
                                   final boolean forward,
                                   final boolean toInclusive) {
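            // For forward iteration, seek to `from` and stop once the key passes `to`; for reverse
            // iteration, seek backwards from `to` and stop once the key drops below `from`.
            // `toInclusive` controls whether the upper bound itself may be returned: range() uses
            // an inclusive bound, while prefixScan() passes the incremented prefix as an exclusive bound.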
            super(storeName, iterWithTimestamp, iterNoTimestamp, forward);
            this.forward = forward;
            this.toInclusive = toInclusive;
            if (forward) {
                if (from == null) {
                    iterWithTimestamp.seekToFirst();
                    iterNoTimestamp.seekToFirst();
                } else {
                    iterWithTimestamp.seek(from.get());
                    iterNoTimestamp.seek(from.get());
                }
                rawLastKey = to == null ? null : to.get();
            } else {
                if (to == null) {
                    iterWithTimestamp.seekToLast();
                    iterNoTimestamp.seekToLast();
                } else {
                    iterWithTimestamp.seekForPrev(to.get());
                    iterNoTimestamp.seekForPrev(to.get());
                }
                rawLastKey = from == null ? null : from.get();
            }
        }

        @Override
        protected KeyValue<Bytes, byte[]> makeNext() {
            final KeyValue<Bytes, byte[]> next = super.makeNext();

            if (next == null) {
                return allDone();
            } else if (rawLastKey == null) {
                // null means the range endpoint is open
                return next;
            } else {
                if (forward) {
                    if (comparator.compare(next.key.get(), rawLastKey) < 0) {
                        return next;
                    } else if (comparator.compare(next.key.get(), rawLastKey) == 0) {
                        return toInclusive ? next : allDone();
                    } else {
                        return allDone();
                    }
                } else {
                    if (comparator.compare(next.key.get(), rawLastKey) >= 0) {
                        return next;
                    } else {
                        return allDone();
                    }
                }
            }
        }
    }
}



