/*
 * BunchedMap.java
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2015-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.apple.foundationdb.map;

import com.apple.foundationdb.annotation.API;
import com.apple.foundationdb.KeySelector;
import com.apple.foundationdb.KeyValue;
import com.apple.foundationdb.MutationType;
import com.apple.foundationdb.ReadTransaction;
import com.apple.foundationdb.StreamingMode;
import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.TransactionContext;
import com.apple.foundationdb.async.AsyncIterable;
import com.apple.foundationdb.async.AsyncPeekCallbackIterator;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.async.AsyncPeekIterator;
import com.apple.foundationdb.subspace.Subspace;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.tuple.ByteArrayUtil2;
import com.apple.foundationdb.util.LogMessageKeys;

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * An implementation of a FoundationDB-backed map that bunches close keys together to minimize the
 * overhead of storing keys with a common prefix. The most straight-forward way to store a map in
 * FoundationDB is to store one key-value pair in some subspace of the database for each key
 * and value of the map. However, this can lead to problems if there are either too many keys or
 * if the subspace prefix is too large, as that prefix will be repeated many times (once for each key
 * in the map).
 *
 * <p>
 * This structure "bunches" adjacent keys together so that one key in the database is responsible
 * for storing multiple entries in the map, which effectively amortizes the cost of this subspace prefix
 * across multiple map entries. In particular, the map will choose "signpost" keys in the map. For each
 * signpost, a key in the database is constructed that is the subspace prefix concatenated with the
 * serialized key. This key is then responsible for storing every entry in the map for which the key
 * is greater than or equal to the signpost key but less than the next signpost key. The signposts are
 * chosen dynamically as keys are added and removed from the map. In particular, there is a target
 * "bunch size" that is a parameter to the map, and upon inserting, the map will see if there is a
 * bunch that the given key can be placed in without exceeding the bunch size. If not, it will create
 * one by adding a new signpost key.
 * </p>
 *
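 * <p>
 * As a hedged illustration of the layout (the bunch size of 2 and the string keys here are
 * arbitrary choices, not anything mandated by this class), a map containing
 * {@code a -> 1, b -> 2, c -> 3} might be stored under two database keys, with
 * {@code a} and {@code c} serving as the signposts:
 * </p>
 *
 * <pre>
 *     subspacePrefix + serialize("a")  ->  [(a, 1), (b, 2)]
 *     subspacePrefix + serialize("c")  ->  [(c, 3)]
 * </pre>
 *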
 * <p>
 * The cost for bunching entries this way is that it requires that the client perform additional database
 * reads while inserting, so mutations have a higher latency than under the simpler scheme, and two clients
 * attempting to modify the same map are likely to experience contention. It is also more expensive to read
 * a single key from the map (as the read now will also read the data for keys in the same bunch as the desired
 * key). A full scan of the map requires less data be transferred over the wire as the subspace prefix can
 * be sent fewer times, so scan-heavy use-cases might not experience much of an overhead at all.
 * </p>
 *
 * <p>
 * Most methods of this class take a subspace. For the most part, these methods assume that there is one
 * durable instance of a {@code BunchedMap} within the bounds of the subspace provided. The exception
 * to this is the
 * {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
 * family of methods. See the documentation on those methods for more information.
 * </p>
 *
 * <p>
 * This class is not thread-safe in the general case. Assuming that the serializer and key-comparator are
 * both thread-safe, this class is safe to use from multiple transactions at once or with multiple subspaces
 * concurrently within a single transaction. However, it is unsafe to modify two keys within the same subspace
 * in the same transaction from multiple threads.
 * </p>
 *
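 * <p>
 * A minimal usage sketch (assuming an open {@code Database db} and an application-provided
 * {@code BunchedSerializer} for {@code Tuple} keys and values; neither is supplied by this class):
 * </p>
 *
 * <pre>{@code
 *     BunchedSerializer<Tuple, Tuple> serializer = ...; // application-provided
 *     BunchedMap<Tuple, Tuple> map = new BunchedMap<>(serializer, Comparator.naturalOrder(), 10);
 *     Subspace mapSubspace = new Subspace(Tuple.from("my-map"));
 *     map.put(db, mapSubspace, Tuple.from("key"), Tuple.from("value")).join();
 *     Optional<Tuple> value = map.get(db, mapSubspace, Tuple.from("key")).join(); // contains "value"
 * }</pre>
 *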
 * @param <K> type of keys in the map
 * @param <V> type of values in the map
 */
@API(API.Status.EXPERIMENTAL)
public class BunchedMap<K, V> {
    private static final int MAX_VALUE_SIZE = 10_000; // The actual max value size is 100_000, but let's stay clear of that
    private static final byte[] ZERO_ARRAY = new byte[]{0x00};

    @Nonnull
    private final Comparator<K> keyComparator;
    @Nonnull
    private final BunchedSerializer<K, V> serializer;
    private final int bunchSize;

    /**
     * Create a bunched map with the given serializer, key comparator, and bunch size. The provided serializer
     * is used to serialize keys and values when writing to the database and to deserialize them when
     * reading. The comparator is used to maintain keys in a sorted order. The sorted order of keys, however,
     * should be consistent with the byte order of serialized keys (when using unsigned lexicographic comparison),
     * as that comparison method is used by the map when it is more efficient. The bunch size is the maximum number
     * of map keys within any bunch of keys within the database. This value is not stored durably in the database,
     * and it is safe to change this value over time (and to have different writers using different values for the
     * bunch size concurrently), though one writer might undo the work of another writer or make different decisions
     * when splitting up values or adding to bunches.
     *
     * @param serializer serializer to use when reading or writing data
     * @param keyComparator comparator used to order keys
     * @param bunchSize maximum size of a bunch within the database
     */
    public BunchedMap(@Nonnull BunchedSerializer<K, V> serializer, @Nonnull Comparator<K> keyComparator, int bunchSize) {
        this.serializer = serializer;
        this.keyComparator = keyComparator;
        this.bunchSize = bunchSize;
    }

    private static <T> List<T> makeMutable(@Nonnull List<T> list) {
        if (list instanceof ArrayList) {
            return list;
        } else {
            return new ArrayList<>(list);
        }
    }

    private CompletableFuture<Optional<KeyValue>> entryForKey(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull K key) {
        byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key));
        tr.addReadConflictKey(keyBytes);
        // We need to use a range read rather than a getKey with a single key selector
        // because we need to return the key back as well as the value.
        // In practice, this range request should always return a single element, but
        // in rare cases, concurrent updates near and around the endpoints might
        // result in additional elements being returned.
        AsyncIterable<KeyValue> iterable = tr.snapshot().getRange(
                KeySelector.lastLessOrEqual(keyBytes),
                KeySelector.firstGreaterThan(keyBytes),
                ReadTransaction.ROW_LIMIT_UNLIMITED,
                false,
                StreamingMode.WANT_ALL);
        return iterable.asList().thenApply(keyValues -> {
            if (keyValues.isEmpty()) {
                // There aren't any entries before this key in the database.
                return Optional.empty();
            } else {
                // The last (and probably only) result of the range read should be
                // the greatest key that is less than or equal to keyBytes.
                KeyValue kv = keyValues.get(keyValues.size() - 1);
                if (ByteArrayUtil.compareUnsigned(kv.getKey(), keyBytes) > 0) {
                    throw new BunchedMapException("signpost key found for key is greater than original key")
                            .addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
                            .addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
                            .addLogInfo("signpostKey", ByteArrayUtil2.loggable(kv.getKey()));
                }
                if (ByteArrayUtil.startsWith(kv.getKey(), subspaceKey)) {
                    // The candidate key is in the correct subspace, so this is the signpost key for the given key.
                    return Optional.of(kv);
                } else {
                    // The candidate key is not in the correct subspace, so we must be looking for a
                    // key that is smaller than the smallest key currently in the map (which is
                    // vacuously the case if the map is empty).
                    return Optional.empty();
                }
            }
        });
    }

    // Grand Theory of Conflict Ranges
    //
    // Because the map must do range scans that can potentially touch much larger ranges than
    // is necessary, all reads are done at snapshot isolation level and then conflict ranges are
    // added as needed to try and decrease contention. This logic is a little complicated, so here
    // is an attempt to explain the reasoning behind it. The main goal is to enforce the following
    // invariants:
    //
    //   1. For any given DB key, all map keys greater than or equal to that key but strictly less
    //      than the next DB key are in the DB value associated with that key.
    //   2. Any map read that depends on the exact value of a map key being read that is changed
    //      by a concurrent transaction will trigger a conflict at commit time.
    //   3. After a user has written a value to the map, subsequent operations should preserve that
    //      value.
    //
    // In some sense, condition 1 is that the integrity of the data structure should be preserved,
    // condition 2 is somewhat analogous to serializability, and condition 3 is analogous to
    // linearizability and durability. This leads to the following set of conflict ranges, specified
    // here in an order that is supposed to reflect how straightforward each added conflict range is:
    //
    //   a. When reading a map key, add a read conflict key to the corresponding key in the
    //      DB regardless of DB keys actually read. When modifying a map key, add a write
    //      conflict key to that same key. This gets us (2).
    //   b. When modifying a DB key, we will end up issuing a write that re-writes all
    //      values in its range, so add a read conflict range over those keys so that any
    //      modifications to those keys that happen between read time and commit time
    //      are not overwritten by the re-write. This gets us (3).
    //   c. When adding a map key to the end of a DB key's range or merging an existing
    //      DB key's range into a new key, a write conflict range must be added for the "gaps"
    //      that existed between the key ranges. This is necessary because without it,
    //      a concurrent modification can read the range that the DB key is now responsible
    //      for and write to it in a way that violates (1).
    //
    // There exist semi-formal proofs as to why these conflict ranges are sufficient to
    // guarantee the three invariants proposed, but they are too large to fit into this
    // comment. But in addition to proving them correct, a fair amount of testing has gone into
    // trying to verify that they work as intended through randomized testing.
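
    // As a concrete (hedged) illustration of rule (c): suppose the bunch ["a", "b"] is stored
    // at DB key A = prefix + serialize("a") and the bunch ["e", "f"] at E = prefix + serialize("e"),
    // and this transaction appends map key "c" to the bunch at A. Rule (c) adds a write conflict
    // range over the former "gap" between "b" and "e", so a concurrent transaction that read that
    // gap (and, say, concluded it was free to claim it for a new bunch) will conflict with this
    // one at commit time instead of silently violating invariant (1).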

    private void addEntryListReadConflictRange(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
                                               @Nonnull List<Map.Entry<K, V>> entryList) {
        byte[] end = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(entryList.size() - 1).getKey()), ZERO_ARRAY);
        tr.addReadConflictRange(keyBytes, end);
    }

    private void insertAlone(@Nonnull Transaction tr, @Nonnull byte[] keyBytes, @Nonnull Map.Entry<K, V> entry) {
        tr.addReadConflictKey(keyBytes);
        tr.set(keyBytes, serializer.serializeEntries(Collections.singletonList(entry)));
    }

    private void writeEntryListWithoutChecking(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
                                               @Nonnull byte[] oldKey, @Nonnull byte[] newKey,
                                               @Nonnull List<Map.Entry<K, V>> entryList, @Nonnull byte[] serializedBytes) {
        // The order of these operations is fairly important as, it turns out, adding an explicit
        // read conflict range will skip over values that have already been written. This
        // means that we will miss the value that is the actual key we are writing if we
        // do these in the wrong order.
        // TODO: Adding an explicit read conflict range skips the keys in write cache (https://github.com/apple/foundationdb/issues/126)
        addEntryListReadConflictRange(tr, subspaceKey, newKey, entryList);
        if (!Arrays.equals(oldKey, newKey)) {
            tr.clear(oldKey);
        }
        tr.set(newKey, serializedBytes);
        if (!Arrays.equals(keyBytes, newKey)) {
            tr.addWriteConflictKey(keyBytes);
        }
    }

    private void writeEntryList(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
                                @Nonnull byte[] oldKey, @Nonnull byte[] newKey, @Nonnull List<Map.Entry<K, V>> entryList,
                                @Nullable KeyValue kvAfter, boolean isFirst, boolean isLast) {
        byte[] serializedBytes = serializer.serializeEntries(entryList);
        if (serializedBytes.length > MAX_VALUE_SIZE) {
            if (isFirst || entryList.size() == 1) {
                insertAlone(tr, keyBytes, entryList.get(0));
            } else if (isLast) {
                insertAfter(tr, subspaceKey, keyBytes, kvAfter, entryList.get(entryList.size() - 1));
            } else {
                // Splits the keys down the middle. In principle, this might result in keys that exceed
                // the maximum value size (if there are weird non-linearities in the serializer--for example,
                // if it compresses). However, in practice, this will almost always produce two keys that
                // are under the maximum value size. If one or more of the two keys exceed the correct size, but
                // each value is still less than the FDB maximum value size (which is likely given that
                // MAX_VALUE_SIZE is much less than the actual FDB maximum value size), then everything will just
                // work. If either one exceeds that maximum value size, then the insertion fails and an error
                // bubbles up to the user. This is worse than there not being an error, but it is safe.
                int splitPoint = entryList.size() / 2;
                List<Map.Entry<K, V>> firstEntries = entryList.subList(0, splitPoint);
                byte[] firstSerialized = serializer.serializeEntries(firstEntries);
                List<Map.Entry<K, V>> secondEntries = entryList.subList(splitPoint, entryList.size());
                byte[] secondSerialized = serializer.serializeEntries(secondEntries);
                writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, oldKey, newKey, firstEntries, firstSerialized);
                byte[] secondKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(secondEntries.get(0).getKey()));
                writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, secondKey, secondKey, secondEntries, secondSerialized);
            }
        } else {
            if (serializer.canAppend() && isLast && entryList.size() > 1 && Arrays.equals(oldKey, newKey)) {
                // Note: APPEND_IF_FITS will silently fail if the size of the value is greater than the maximum
                // value size. It is therefore *very* important that we check what the size will be before
                // calling this method to make sure that the total size is not too large. Otherwise, we might
                // lose data.
                addEntryListReadConflictRange(tr, subspaceKey, newKey, entryList);
                tr.mutate(MutationType.APPEND_IF_FITS, newKey, serializer.serializeEntry(entryList.get(entryList.size() - 1)));
                tr.addWriteConflictKey(keyBytes);
            } else {
                writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, oldKey, newKey, entryList, serializedBytes);
            }
            // When appending before the beginning or writing after the end, we are essentially asserting
            // that this key will be responsible for an additional range of map keys. Concurrent transactions
            // might also claim this section of the logical key range in incompatible ways if we do not
            // declare write conflict ranges here.
            if (isFirst && entryList.size() >= 2) {
                tr.addWriteConflictRange(keyBytes, ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(1).getKey())));
            }
            if (isLast && entryList.size() >= 2) {
                tr.addWriteConflictRange(ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(entryList.size() - 2).getKey())), keyBytes);
            }
        }
    }

    private void insertAfter(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
                             @Nullable KeyValue kvAfter, @Nonnull Map.Entry<K, V> entry) {
        if (kvAfter == null) {
            insertAlone(tr, keyBytes, entry);
        } else {
            K afterKey = serializer.deserializeKey(kvAfter.getKey(), subspaceKey.length);
            List<Map.Entry<K, V>> afterEntryList = serializer.deserializeEntries(afterKey, kvAfter.getValue());
            if (afterEntryList.size() >= bunchSize) {
                // The next list of entries is too large. Write to a separate KV pair.
                insertAlone(tr, keyBytes, entry);
            } else {
                // Bunch this entry with the next one.
                List<Map.Entry<K, V>> newEntryList = new ArrayList<>(afterEntryList.size() + 1);
                newEntryList.add(entry);
                newEntryList.addAll(afterEntryList);
                writeEntryList(tr, subspaceKey, keyBytes, kvAfter.getKey(), keyBytes, newEntryList, null, true, false);
            }
        }
    }

    @Nonnull
    private Optional<V> insertEntry(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes, @Nonnull K key,
                                    @Nonnull V value, @Nullable KeyValue kvBefore, @Nullable KeyValue kvAfter, @Nonnull Map.Entry<K, V> entry) {
        if (kvBefore == null) {
            insertAfter(tr, subspaceKey, keyBytes, kvAfter, entry);
            return Optional.empty();
        } else {
            K beforeKey = serializer.deserializeKey(kvBefore.getKey(), subspaceKey.length);
            List<Map.Entry<K, V>> beforeEntryList = serializer.deserializeEntries(beforeKey, kvBefore.getValue());
            int insertIndex = 0;
            while (insertIndex < beforeEntryList.size() && keyComparator.compare(key, beforeEntryList.get(insertIndex).getKey()) > 0) {
                insertIndex++;
            }
            if (insertIndex < beforeEntryList.size() && keyComparator.compare(key, beforeEntryList.get(insertIndex).getKey()) == 0) {
                // This key is already in the map, so we are going to end up re-writing it iff the value is different.
                Map.Entry<K, V> oldEntry = beforeEntryList.get(insertIndex);
                V oldValue = oldEntry.getValue();
                if (!oldEntry.getValue().equals(value)) {
                    beforeEntryList = makeMutable(beforeEntryList);
                    beforeEntryList.set(insertIndex, entry);
                    writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(), beforeEntryList, kvAfter, false, false);
                } else {
                    // We are choosing to not re-write the key because it
                    // is already the value we wanted anyway. Add a
                    // read conflict key to it so that if something else
                    // changes it, this transaction will need to be retried
                    // to set it back.
                    tr.addReadConflictKey(keyBytes);
                }
                return Optional.of(oldValue);
            } else if (insertIndex < beforeEntryList.size()) {
                // This key is going to be inserted somewhere in the middle.
                beforeEntryList = makeMutable(beforeEntryList);
                beforeEntryList.add(insertIndex, entry);
                if (beforeEntryList.size() <= bunchSize) {
                    // Insert the entry in the middle and serialize.
                    writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(), beforeEntryList, kvAfter, false, false);
                } else {
                    // Split this entry in half (roughly) and insert both halves.
                    int splitPoint = beforeEntryList.size() / 2;
                    writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(), beforeEntryList.subList(0, splitPoint), null, false, false);
                    List<Map.Entry<K, V>> secondEntries = beforeEntryList.subList(splitPoint, beforeEntryList.size());
                    byte[] secondKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(secondEntries.get(0).getKey()));
                    writeEntryList(tr, subspaceKey, keyBytes, secondKey, secondKey, secondEntries, kvAfter, false, false);
                }
                return Optional.empty();
            } else {
                // This key is going to be inserted after all of the keys in the before entry.
                if (beforeEntryList.size() < bunchSize) {
                    // Append to the end of the current list.
                    List<Map.Entry<K, V>> newEntryList = new ArrayList<>(beforeEntryList.size() + 1);
                    newEntryList.addAll(beforeEntryList);
                    newEntryList.add(entry);
                    writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(), newEntryList, kvAfter, false, true);
                } else {
                    // This key would make the bunch too large. Insert it into the next one.
                    insertAfter(tr, subspaceKey, keyBytes, kvAfter, entry);
                }
                return Optional.empty();
            }
        }
    }

    /**
     * Inserts or updates a key into a map with a new value.
     * This will find an appropriate
     * bunch to insert the key into (or create one if one doesn't exist or if all of the candidates
     * are full). It will do work to make sure that the placement is locally optimal (that is, it
     * will choose between the one or two bunches closest to the key when performing its bunching).
     * It makes no attempt to fix suboptimal bunches elsewhere within the map. If the map already
     * contains {@code key}, it will overwrite the existing key with the new value. This will
     * return the old value if one is present.
     *
     * <p>
     * Note that this method is not thread-safe if multiple threads call it with the same
     * transaction and subspace. (Multiple calls with different transactions or subspaces are safe.)
     * </p>
     *
     * <p>
     * Note that this call is asynchronous. It will return a {@link CompletableFuture} that will be
     * completed when this task has completed.
     * </p>
     *
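     * <p>
     * For example (a sketch that assumes an open {@code Database db} and a {@code BunchedMap}
     * over {@code Tuple} keys and values):
     * </p>
     *
     * <pre>{@code
     *     Optional<Tuple> previous = map.put(db, mapSubspace, Tuple.from("k"), Tuple.from("v")).join();
     *     // previous is empty if "k" was absent; otherwise it contains the overwritten value
     * }</pre>
     *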
     * @param tcx database or transaction to use when performing the insertion
     * @param subspace subspace within which the map's data are located
     * @param key key of the map entry to insert
     * @param value value of the map entry to insert
     * @return a future that will complete with an optional that will either contain the previous value
     *     associated with the key or be empty if there was not a previous value
     */
    @Nonnull
    public CompletableFuture<Optional<V>> put(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key, @Nonnull V value) {
        return tcx.runAsync(tr -> {
            byte[] subspaceKey = subspace.pack();
            byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key));
            // We need to know the key (and value) that is less than or equal to
            // the key we are trying to insert in our map as well as the key (and value)
            // that is greater than our key in the map. Many insertions will not actually
            // require both of these, but it is better to grab them both at once using
            // a single range read that (with a very high likelihood) will hit a single
            // storage server than to do the two reads separately.
            //
            // Note that we do this read at snapshot isolation level, so if we read too much,
            // that won't be a problem in terms of conflict ranges (we will just add the
            // correct conflict ranges later).
            //
            // In practice, the range read will almost always return at most 2 results. Because
            // of how range reads with key selectors are implemented, there is a slight
            // possibility that there will be more than two if, for example, additional
            // keys are added (within this transaction) to the RYW cache.
            return tr.snapshot().getRange(
                    KeySelector.lastLessOrEqual(keyBytes),
                    KeySelector.firstGreaterThan(keyBytes).add(1),
                    ReadTransaction.ROW_LIMIT_UNLIMITED,
                    false,
                    StreamingMode.WANT_ALL
            ).asList().thenApply(keyValues -> {
                KeyValue kvBefore = null;
                KeyValue kvAfter = null;
                for (KeyValue next : keyValues) {
                    if (ByteArrayUtil.startsWith(next.getKey(), subspaceKey)) {
                        if (ByteArrayUtil.compareUnsigned(keyBytes, next.getKey()) < 0) {
                            kvAfter = next;
                            break; // no need to continue after kvAfter is set
                        }
                        if (ByteArrayUtil.compareUnsigned(next.getKey(), keyBytes) <= 0) {
                            kvBefore = next;
                        }
                    }
                }
                // If either of these trigger, then it means that I screwed up the logic here in
                // picking the correct keys and values.
                if (kvBefore != null && (ByteArrayUtil.compareUnsigned(keyBytes, kvBefore.getKey()) < 0 || !ByteArrayUtil.startsWith(kvBefore.getKey(), subspaceKey))) {
                    throw new BunchedMapException("database key before map key compared incorrectly")
                            .addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
                            .addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
                            .addLogInfo("kvBefore", ByteArrayUtil2.loggable(kvBefore.getKey()));
                }
                if (kvAfter != null && (ByteArrayUtil.compareUnsigned(keyBytes, kvAfter.getKey()) >= 0 || !ByteArrayUtil.startsWith(kvAfter.getKey(), subspaceKey))) {
                    throw new BunchedMapException("database key after map key compared incorrectly")
                            .addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
                            .addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
                            .addLogInfo("kvAfter", ByteArrayUtil2.loggable(kvAfter.getKey()));
                }
                Map.Entry<K, V> newEntry = new AbstractMap.SimpleImmutableEntry<>(key, value);
                return insertEntry(tr, subspaceKey, keyBytes, key, value, kvBefore, kvAfter, newEntry);
            });
        });
    }

    /**
     * Determines whether a key is contained within the map. This method is safe to run concurrently
     * with other map operations in other threads.
     * However, if there are concurrent
     * {@link #put(TransactionContext, Subspace, Object, Object) put}
     * or {@link #remove(TransactionContext, Subspace, Object) remove}
     * calls, then there are no guarantees as to whether this will return {@code true} or {@code false}.
     *
     * @param tcx database or transaction to use when performing reads
     * @param subspace subspace within which the map's data are located
     * @param key the key to check for membership within the map
     * @return a future that will be completed to {@code true} if the map contains {@code key}
     *     and {@code false} otherwise
     */
    @Nonnull
    public CompletableFuture<Boolean> containsKey(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
        final byte[] subspaceKey = subspace.getKey();
        return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key)
                .thenApply(optionalEntry -> optionalEntry
                        .map(kv -> {
                            K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
                            return serializer.deserializeKeys(mapKey, kv.getValue()).contains(key);
                        })
                        .orElse(false)
                )
        );
    }

    /**
     * Retrieves the value associated with a key from the map. This method is safe to run concurrently
     * with other map operations. However, if there are concurrent
     * {@link #put(TransactionContext, Subspace, Object, Object) put}
     * or {@link #remove(TransactionContext, Subspace, Object) remove}
     * operations, then there are no guarantees as to whether this operation will see the result of the
     * concurrent operation or not.
     *
     * @param tcx database or transaction to use when performing reads
     * @param subspace subspace within which the map's data are located
     * @param key the key within the map to retrieve the value of
     * @return a future that will be completed with an optional that will be present with the value
     *     associated with the key in the database or empty if the key is not contained within the map
     */
    @Nonnull
    public CompletableFuture<Optional<V>> get(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
        final byte[] subspaceKey = subspace.getKey();
        return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key)
                .thenApply(optionalEntry -> optionalEntry
                        .flatMap(kv -> {
                            K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
                            final List<Map.Entry<K, V>> entryList = serializer.deserializeEntries(mapKey, kv.getValue());
                            return entryList.stream()
                                    .filter(entry -> entry.getKey().equals(key))
                                    .findAny()
                                    .map(Map.Entry::getValue);
                        })
                )
        );
    }

    /**
     * Removes a key from the map. This returns a future that will contain an optional with the
     * old value associated with the key within the map (prior to deletion) if one is present or
     * will be empty if the key was not contained within the map.
     *
     * <p>
     * Note that this method is not thread-safe if multiple threads call it with the same
     * transaction and subspace. (Multiple calls with different transactions or subspaces are safe.)
     * </p>
     *
     * <p>
     * Note that this call is asynchronous. It will return a {@link CompletableFuture} that will be completed
     * when this task has completed.
     * </p>
     *
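     * <p>
     * For example (a sketch assuming the same setup as the {@code put} example above):
     * </p>
     *
     * <pre>{@code
     *     Optional<Tuple> oldValue = map.remove(db, mapSubspace, Tuple.from("k")).join();
     *     // oldValue contains the removed value, or is empty if "k" was not in the map
     * }</pre>
     *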
     * @param tcx database or transaction to use when removing the key
     * @param subspace subspace within which the map's data are located
     * @param key the key to remove from the map
     * @return a future that will be completed with an optional that will be present with the value associated
     *     with the key in the database (prior to removal) or will be empty if the key was not present
     */
    @Nonnull
    public CompletableFuture<Optional<V>> remove(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
        final byte[] subspaceKey = subspace.getKey();
        return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key).thenApply(optionalEntry -> optionalEntry.flatMap((KeyValue kv) -> {
            K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
            List<Map.Entry<K, V>> entryList = serializer.deserializeEntries(mapKey, kv.getValue());
            int foundIndex = -1;
            for (int i = 0; i < entryList.size(); i++) {
                if (entryList.get(i).getKey().equals(key)) {
                    foundIndex = i;
                    break;
                }
            }
            if (foundIndex != -1) {
                final Map.Entry<K, V> oldEntry = entryList.get(foundIndex);
                // The value that gets written is based on the contents of the entries read, so
                // we need to add a read range with all of the values we are currently writing.
                addEntryListReadConflictRange(tr, subspaceKey, kv.getKey(), entryList);
                tr.addWriteConflictKey(ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key)));
                if (entryList.size() == 1) {
                    // The only key that was in the range was the key that
                    // we are currently removing, so just remove it.
                    tr.clear(kv.getKey());
                } else {
                    // We have other items in the entry. Remove the entry
                    // we actually care about and serialize the rest.
                    entryList = makeMutable(entryList);
                    entryList.remove(foundIndex);
                    if (foundIndex == 0) {
                        tr.clear(kv.getKey());
                        byte[] newKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(0).getKey()));
                        tr.set(newKey, serializer.serializeEntries(entryList));
                    } else {
                        tr.set(kv.getKey(), serializer.serializeEntries(entryList));
                    }
                }
                return Optional.of(oldEntry.getValue());
            } else {
                return Optional.empty();
            }
        })));
    }

    /**
     * Verify the integrity of the bunched map. This will read through all of the database keys associated
     * with the map and verify that all of the keys are in order. If it encounters an error, it will
     * complete with an exception. Otherwise, the returned future will complete normally.
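     *
     * <p>
     * For example (a sketch assuming an open {@code Database db}):
     * </p>
     *
     * <pre>{@code
     *     map.verifyIntegrity(db, mapSubspace).join(); // completes exceptionally if keys are out of order
     * }</pre>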
     *
     * @param tcx database or transaction to use when verifying the map
     * @param subspace subspace within which the map's data are located
     * @return a future that will complete when the integrity check has finished
     */
    @Nonnull
    public CompletableFuture<Void> verifyIntegrity(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace) {
        return tcx.runAsync(tr -> {
            AtomicReference<K> lastKey = new AtomicReference<>(null);
            byte[] subspaceKey = subspace.getKey();
            AsyncIterable<KeyValue> iterable = tr.getRange(subspace.range());
            return AsyncUtil.forEach(iterable, kv -> {
                K boundaryKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
                if (lastKey.get() != null && keyComparator.compare(boundaryKey, lastKey.get()) < 0) {
                    throw new BunchedMapException("boundary key out of order")
                            .addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
                            .addLogInfo("lastKey", lastKey.get())
                            .addLogInfo("boundaryKey", boundaryKey);
                }
                lastKey.set(boundaryKey);
                List<K> keys = serializer.deserializeKeys(boundaryKey, kv.getValue());
                for (K key : keys) {
                    if (keyComparator.compare(key, lastKey.get()) < 0) {
                        throw new BunchedMapException("keys within bunch out of order")
                                .addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
                                .addLogInfo("lastKey", lastKey.get())
                                .addLogInfo("nextKey", key);
                    }
                    lastKey.set(key);
                }
            }, tr.getExecutor());
        });
    }

    private void flushEntryList(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey,
                                @Nonnull List<Map.Entry<K, V>> currentEntryList, @Nonnull AtomicReference<K> lastKey) {
        byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(currentEntryList.get(0).getKey()));
        writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, keyBytes, keyBytes, currentEntryList, serializer.serializeEntries(currentEntryList));
        lastKey.set(currentEntryList.get(currentEntryList.size() - 1).getKey());
        currentEntryList.clear();
    }

    /**
     * Compact the values within the map into as few keys as possible. This will scan through and re-write
     * the keys to be optimal. This feature is experimental at the moment, but it should be used to better
     * pack entries if needed.
     *
     * @param tcx database or transaction to use when compacting data
     * @param subspace subspace within which the map's data are located
     * @param keyLimit maximum number of database keys to read in a single transaction
     * @param continuation the continuation returned from a previous call or {@code null}
     *     to start from the beginning of the subspace
     * @return future that will complete with a continuation that can be used to complete
     *     the compaction across multiple transactions ({@code null} if finished)
     */
    @Nonnull
    protected CompletableFuture<byte[]> compact(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace,
                                                int keyLimit, @Nullable byte[] continuation) {
        return tcx.runAsync(tr -> {
            byte[] subspaceKey = subspace.getKey();
            byte[] begin = (continuation == null) ? subspaceKey : continuation;
            byte[] end = subspace.range().end;
            final AsyncIterable<KeyValue> iterable = tr.snapshot().getRange(begin, end, keyLimit);
            List<Map.Entry<K, V>> currentEntryList = new ArrayList<>(bunchSize);
            // The estimated size can be off (and will be off for many implementations of BunchedSerializer),
            // but it is just a heuristic to know when to split, so that's fine (I claim).
            AtomicInteger currentEntrySize = new AtomicInteger(0);
            AtomicInteger readKeys = new AtomicInteger(0);
            AtomicReference<byte[]> lastReadKeyBytes = new AtomicReference<>(null);
            AtomicReference<K> lastKey = new AtomicReference<>(null);
            return AsyncUtil.forEach(iterable, kv -> {
                final K boundaryKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
                final List<Map.Entry<K, V>> entriesFromKey = serializer.deserializeEntries(boundaryKey, kv.getValue());
                readKeys.incrementAndGet();
                if (entriesFromKey.size() >= bunchSize && currentEntryList.isEmpty()) {
                    // Nothing can be done. Just move on.
                    lastReadKeyBytes.set(null);
                    return;
                }
                if (lastReadKeyBytes.get() == null) {
                    lastReadKeyBytes.set(kv.getKey());
                }
                final byte[] endKeyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entriesFromKey.get(entriesFromKey.size() - 1).getKey()), ZERO_ARRAY);
                tr.addReadConflictRange(lastReadKeyBytes.get(), endKeyBytes);
                tr.addWriteConflictRange(lastReadKeyBytes.get(), kv.getKey());
                lastReadKeyBytes.set(endKeyBytes);
                tr.clear(kv.getKey());
                for (Map.Entry<K, V> entry : entriesFromKey) {
                    byte[] serializedEntry = serializer.serializeEntry(entry);
                    if (currentEntrySize.get() + serializedEntry.length > MAX_VALUE_SIZE && !currentEntryList.isEmpty()) {
                        flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
                        currentEntrySize.set(0);
                    }
                    currentEntryList.add(entry);
                    currentEntrySize.addAndGet(serializedEntry.length);
                    if (currentEntryList.size() == bunchSize) {
                        flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
                        currentEntrySize.set(0);
                    }
                }
            }, tr.getExecutor()).thenApply(vignore -> {
                if (!currentEntryList.isEmpty()) {
                    flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
                }
                // Return a valid continuation if there might be more keys.
                if (lastKey.get() != null && keyLimit != ReadTransaction.ROW_LIMIT_UNLIMITED && readKeys.get() == keyLimit) {
                    return ByteArrayUtil.join(subspaceKey, serializer.serializeKey(lastKey.get()), ZERO_ARRAY);
                } else {
                    return null;
                }
            });
        });
    }

    /**
     * Scans the map and returns an iterator over all entries. This has the same behavior as the
     * {@link #scan(ReadTransaction, Subspace, byte[], int, boolean) scan()} method that takes more
     * parameters, but it will return an iterator that has no limit and always returns entries in ascending
     * order by key.
     *
     * @param tr transaction to use when scanning the map
     * @param subspace subspace in which the map's data are located
     * @return an iterator over the entries in the map
     */
    @Nonnull
    public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace) {
        return scan(tr, subspace, null, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
    }

    /**
     * Scans the map and returns an iterator over all entries. This has the same behavior as the
     * {@link #scan(ReadTransaction, Subspace, byte[], int, boolean) scan()} method that takes more
     * parameters, but it will return an iterator that has no limit and always returns entries in ascending
     * order by key.
     *
     * @param tr transaction to use when scanning the map
     * @param subspace subspace in which the map's data are located
     * @param continuation continuation from a previous scan (or {@code null} to start from the beginning)
     * @return an iterator over the entries in the map
     */
    @Nonnull
    public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nullable byte[] continuation) {
        return scan(tr, subspace, continuation, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
    }

    /**
     * Scans the map and returns an iterator over all entries. All entries will be returned sorted by
     * key. If {@code reverse} is {@code true}, it will return keys in descending order instead of in
     * ascending order. It will return at most {@code limit} keys from the map. Note that because of
     * bunching, this will probably require reading fewer keys from the database. Scans that require
     * multiple transactions can provide a continuation from a previous scan. The returned iterator
     * has a {@link BunchedMapIterator#getContinuation() getContinuation()} method that can be used to get an
     * appropriate value for that parameter.
     *
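     * <p>
     * For example (a sketch assuming an open {@code Database db} and {@code Tuple} keys and values;
     * {@code AsyncUtil.collectRemaining} drains the iterator into a list), collecting up to 10 entries:
     * </p>
     *
     * <pre>{@code
     *     List<Map.Entry<Tuple, Tuple>> entries = db.read(tr ->
     *             AsyncUtil.collectRemaining(map.scan(tr, mapSubspace, null, 10, false)).join());
     * }</pre>
     *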
     * @param tr transaction to use when scanning the map
     * @param subspace subspace in which the map's data are located
     * @param continuation continuation from a previous scan (or {@code null} to start from the beginning)
     * @param limit maximum number of keys to return or {@link ReadTransaction#ROW_LIMIT_UNLIMITED} if no limit
     * @param reverse {@code true} if keys are wanted in descending instead of ascending order
     * @return an iterator over the entries in the map
     */
    @Nonnull
    public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace,
                                         @Nullable byte[] continuation, int limit, boolean reverse) {
        byte[] subspaceKey = subspace.getKey();
        AsyncIterable<KeyValue> rangeReadIterable;
        K continuationKey;
        if (continuation == null) {
            continuationKey = null;
            rangeReadIterable = tr.getRange(subspace.range(), ReadTransaction.ROW_LIMIT_UNLIMITED, reverse);
        } else {
            continuationKey = serializer.deserializeKey(continuation);
            byte[] continuationKeyBytes = ByteArrayUtil.join(subspaceKey, continuation);
            if (reverse) {
                rangeReadIterable = tr.getRange(subspaceKey, continuationKeyBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
            } else {
                rangeReadIterable = tr.getRange(KeySelector.lastLessOrEqual(continuationKeyBytes),
                        KeySelector.firstGreaterOrEqual(subspace.range().end),
                        ReadTransaction.ROW_LIMIT_UNLIMITED, false);
            }
        }
        return new BunchedMapIterator<>(
                AsyncPeekIterator.wrap(rangeReadIterable.iterator()),
                tr,
                subspace,
                subspaceKey,
                serializer,
                keyComparator,
                continuationKey,
                limit,
                reverse
        );
    }

    /**
     * Scans multiple maps and returns an iterator over all of them. This behaves the same way as the
     * {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
     * method that takes additional {@code subspaceStart} and {@code subspaceEnd} parameters,
     * but this will always scan from the beginning of {@code subspace} to the end, assumes that
     * there is no limit to the number of entries to return, and always returns items in ascending order.
     *
     * @param tr transaction to use when scanning the maps
     * @param subspace subspace containing one or more maps
     * @param splitter object to determine which map a given key is in
     * @param <T> type of the tag of each map subspace
     * @return an iterator over the entries in multiple maps
     */
    @Nonnull
    public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace,
                                                          @Nonnull SubspaceSplitter<T> splitter) {
        return scanMulti(tr, subspace, splitter, null, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
    }

    /**
     * Scans multiple maps and returns an iterator over all of them. This behaves the same way as the
     * {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
     * method that takes additional {@code subspaceStart} and {@code subspaceEnd} parameters,
     * but this will always scan from the beginning of {@code subspace} to the end.
     *
     * @param tr transaction to use when scanning the maps
     * @param subspace subspace containing one or more maps
     * @param splitter object to determine which map a given key is in
     * @param continuation continuation from a previous scan (or {@code null} to start from the beginning)
     * @param limit maximum number of entries to return
     * @param reverse {@code true} if the entries should be returned in descending order or {@code false} otherwise
     * @param <T> type of the tag of each map subspace
     * @return an iterator over the entries in multiple maps
     */
    @Nonnull
    public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace,
                                                          @Nonnull SubspaceSplitter<T> splitter,
                                                          @Nullable byte[] continuation, int limit, boolean reverse) {
        return scanMulti(tr, subspace, splitter, null, null, continuation, limit, reverse);
    }

    /**
     * Scans multiple maps and returns an iterator over all of them. The returned iterator
     * will produce one item for each entry in each map. To do so, the maps must be backed by contiguous
     * subspaces within {@code subspace}. The scan will start at the first subspace greater
     * than or equal to the subspace formed by concatenating the raw prefix of {@code subspace}
     * with {@code subspaceStart} (or with nothing if {@code subspaceStart} is
     * {@code null}) and will end with the last subspace less than the subspace formed
     * by concatenating the raw prefix of {@code subspace} with {@code subspaceEnd}
     * (or with nothing if {@code subspaceEnd} is {@code null}). The provided
     * {@link SubspaceSplitter} should be able to determine the subspace of the map that
     * contains a given key and (optionally) provide some tag that can be used to associate
     * that map with some value that might be used to distinguish one map from the other (if
     * the application needs it).
     *
     * <p>
     * For example, suppose there are ten {@code BunchedMap}s that all begin with a raw prefix
     * {@code prefix} followed by a {@link com.apple.foundationdb.tuple.Tuple Tuple}-encoded
     * integer (0 through 9). If one wanted to scan the four maps starting with map 3 and ending
     * with map 6, one would supply the following parameters (a splitter along these lines is
     * sketched below):
     * </p>
     *
     * <ul>
     *     <li>{@code subspace} should be set to {@code new Subspace(prefix)}</li>
     *     <li>{@code splitter} should be implemented so that if given a key within map {@code n},
     *         {@link SubspaceSplitter#subspaceOf(byte[]) subspaceOf(key)} returns {@code subspace.subspace(Tuple.from(n))}
     *         and {@link SubspaceSplitter#subspaceTag(Subspace) subspaceTag(subspaceOf(key))} returns {@code n}</li>
     *     <li>{@code subspaceStart} should be set to {@code Tuple.from(3).pack()}</li>
     *     <li>{@code subspaceEnd} should be set to {@code Tuple.from(7).pack()}</li>
     * </ul>
     *
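     * <p>
     * A hedged sketch of such a splitter (it assumes {@code prefix} is empty and that the map
     * keys are themselves {@code Tuple}-encoded, so that every database key parses as a tuple
     * whose first element is the map's integer; a real implementation would strip the prefix first):
     * </p>
     *
     * <pre>{@code
     *     SubspaceSplitter<Long> splitter = new SubspaceSplitter<Long>() {
     *         @Override
     *         public Subspace subspaceOf(byte[] keyBytes) {
     *             Tuple t = Tuple.fromBytes(keyBytes);
     *             return new Subspace(Tuple.from(t.getLong(0)));
     *         }
     *
     *         @Override
     *         public Long subspaceTag(Subspace subspace) {
     *             return Tuple.fromBytes(subspace.getKey()).getLong(0);
     *         }
     *     };
     * }</pre>
     *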
     * <p>
     * Furthermore, this method can be used across multiple calls or transactions by setting the
     * {@code continuation} parameter to the result of {@link BunchedMapMultiIterator#getContinuation() getContinuation()}
     * from a previous scan. (One should pass {@code null} to restart the scan from the beginning.)
     * The iterator will return at most {@code limit} entries. The entries will be ordered first
     * by subspace and within a subspace by key. If {@code reverse} is {@code true}, the
     * items will be returned in descending order. Otherwise, they will be returned in ascending order.
     * </p>
     *
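     * <p>
     * A sketch of a scan spanning multiple transactions (assuming an open {@code Database db},
     * a {@code splitter} as above, and a hypothetical {@code process()} consumer; it also assumes
     * that {@code getContinuation()} returns {@code null} once the scan is exhausted):
     * </p>
     *
     * <pre>{@code
     *     byte[] continuation = null;
     *     do {
     *         final byte[] lastContinuation = continuation;
     *         continuation = db.read(tr -> {
     *             BunchedMapMultiIterator<Tuple, Tuple, Long> iterator =
     *                     map.scanMulti(tr, subspace, splitter, null, null, lastContinuation, 100, false);
     *             while (iterator.hasNext()) {
     *                 process(iterator.next());
     *             }
     *             return iterator.getContinuation();
     *         });
     *     } while (continuation != null);
     * }</pre>
     *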
     * @param tr transaction to use when scanning the maps
     * @param subspace subspace containing one or more maps
     * @param splitter object to determine which map a given key is in
     * @param subspaceStart inclusive starting suffix of {@code subspace} to start the scan
     * @param subspaceEnd exclusive ending suffix of {@code subspace} to end the scan
     * @param continuation continuation from a previous scan (or {@code null} to start from the beginning)
     * @param limit maximum number of entries to return
     * @param reverse {@code true} if the entries should be returned in descending order or {@code false} otherwise
     * @param <T> type of the tag of each map subspace
     * @return an iterator over the entries in multiple maps
     */
    @Nonnull
    public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace,
                                                          @Nonnull SubspaceSplitter<T> splitter,
                                                          @Nullable byte[] subspaceStart, @Nullable byte[] subspaceEnd,
                                                          @Nullable byte[] continuation, int limit, boolean reverse) {
        return scanMulti(tr, subspace, splitter, subspaceStart, subspaceEnd, continuation, limit, null, reverse);
    }

    @Nonnull
    public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace,
                                                          @Nonnull SubspaceSplitter<T> splitter,
                                                          @Nullable byte[] subspaceStart, @Nullable byte[] subspaceEnd,
                                                          @Nullable byte[] continuation, int limit,
                                                          @Nullable Consumer<KeyValue> postReadCallback, boolean reverse) {
        byte[] subspaceKey = subspace.getKey();
        byte[] startBytes = (subspaceStart == null ? subspaceKey : ByteArrayUtil.join(subspaceKey, subspaceStart));
        byte[] endBytes = (subspaceEnd == null ? ByteArrayUtil.strinc(subspaceKey) : ByteArrayUtil.join(subspaceKey, subspaceEnd));
        AsyncIterable<KeyValue> rangeReadIterable;
        if (continuation == null) {
            rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, reverse);
        } else {
            byte[] continuationEndpoint = ByteArrayUtil.join(subspaceKey, continuation);
            if (reverse) {
                if (ByteArrayUtil.compareUnsigned(continuationEndpoint, endBytes) < 0) {
                    rangeReadIterable = tr.getRange(startBytes, continuationEndpoint, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
                } else {
                    rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
                }
            } else {
                if (ByteArrayUtil.compareUnsigned(continuationEndpoint, startBytes) < 0) {
                    rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
                } else {
                    rangeReadIterable = tr.getRange(KeySelector.lastLessThan(continuationEndpoint),
                            KeySelector.firstGreaterOrEqual(endBytes),
                            ReadTransaction.ROW_LIMIT_UNLIMITED, false);
                }
            }
        }
        final AsyncPeekIterator<KeyValue> wrappedIterator;
        if (postReadCallback == null) {
            wrappedIterator = AsyncPeekIterator.wrap(rangeReadIterable.iterator());
        } else {
            wrappedIterator = AsyncPeekCallbackIterator.wrap(rangeReadIterable.iterator(), postReadCallback);
        }
        return new BunchedMapMultiIterator<>(
                wrappedIterator,
                tr,
                subspace,
                subspaceKey,
                splitter,
                serializer,
                keyComparator,
                continuation,
                limit,
                reverse
        );
    }
}



