/*
* BunchedMap.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2015-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.apple.foundationdb.map;
import com.apple.foundationdb.annotation.API;
import com.apple.foundationdb.KeySelector;
import com.apple.foundationdb.KeyValue;
import com.apple.foundationdb.MutationType;
import com.apple.foundationdb.ReadTransaction;
import com.apple.foundationdb.StreamingMode;
import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.TransactionContext;
import com.apple.foundationdb.async.AsyncIterable;
import com.apple.foundationdb.async.AsyncPeekCallbackIterator;
import com.apple.foundationdb.async.AsyncPeekIterator;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.subspace.Subspace;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.tuple.ByteArrayUtil2;
import com.apple.foundationdb.util.LogMessageKeys;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
/**
* An implementation of a FoundationDB-backed map that bunches close keys together to minimize the
* overhead of storing keys with a common prefix. The most straightforward way to store a map in
* FoundationDB is to store one key-value pair in some subspace of the database for each key
* and value of the map. However, this can lead to problems if there are either too many keys or
* if the subspace prefix is too large, as that prefix will be repeated many times (once for each key
* in the map).
*
* <p>
* This structure "bunches" adjacent keys together so that one key in the database is responsible
* for storing multiple entries in the map, which effectively amortizes the cost of the subspace prefix
* across multiple map entries. In particular, the map will choose "signpost" keys in the map. For each
* signpost, a key in the database is constructed that is the subspace prefix concatenated with the
* serialized key. This key is then responsible for storing every entry in the map for which the key
* is greater than or equal to the signpost key but less than the next signpost key. The signposts are
* chosen dynamically as keys are added to and removed from the map. In particular, there is a target
* "bunch size" that is a parameter to the map, and upon inserting, the map will see if there is a
* bunch that the given key can be placed in without exceeding the bunch size. If not, it will create
* one by adding a new signpost key.
*
* <p>
* The cost of bunching entries this way is that the client must perform additional database
* reads while inserting, so mutations have a higher latency than under the simpler scheme, and two clients
* attempting to modify the same map are likely to experience contention. It is also more expensive to read
* a single key from the map (as the read now also reads the data for keys in the same bunch as the desired
* key). A full scan of the map requires less data be transferred over the wire as the subspace prefix can
* be sent fewer times, so scan-heavy use-cases might not experience much of an overhead at all.
*
* <p>
* Most methods of this class take a subspace. For the most part, these methods assume that there is one
* durable instance of a <code>BunchedMap</code> within the bounds of the subspace provided. The exception
* to this is the
* {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
* family of methods. See the documentation on those methods for more information.
*
* <p>
* This class is not thread-safe in the general case. Assuming that the serializer and key-comparator are
* both thread-safe, this class is safe to use from multiple transactions at once or with multiple subspaces
* concurrently within a single transaction. However, it is unsafe to modify two keys within the same subspace
* in the same transaction from multiple threads.
*
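* <p>
* As a brief usage sketch (here {@code db} stands for an open {@code Database} and {@code serializer}
* for some {@link BunchedSerializer} over {@code Tuple} keys and values; both names are illustrative,
* not part of this class):
*
* <pre>{@code
*     BunchedMap<Tuple, Tuple> map = new BunchedMap<>(serializer, Comparator.naturalOrder(), 10);
*     Subspace subspace = new Subspace(Tuple.from("my-map"));
*     map.put(db, subspace, Tuple.from("foo"), Tuple.from(42L)).join();
*     Optional<Tuple> value = map.get(db, subspace, Tuple.from("foo")).join();
*     // value is Optional.of(Tuple.from(42L))
* }</pre>
*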
* @param <K> type of keys in the map
* @param <V> type of values in the map
*/
@API(API.Status.EXPERIMENTAL)
public class BunchedMap<K, V> {
private static final int MAX_VALUE_SIZE = 10_000; // The actual max value size is 100_000, but let's stay clear of that
private static final byte[] ZERO_ARRAY = new byte[]{0x00};
@Nonnull
private final Comparator<K> keyComparator;
@Nonnull
private final BunchedSerializer<K, V> serializer;
private final int bunchSize;
/**
* Create a bunched map with the given serializer, key comparator, and bunch size. The provided serializer
* is used to serialize keys and values when writing to the database and to deserialize them when
* reading. The comparator is used to maintain keys in a sorted order. The sorted order of keys, however,
* should be consistent with the byte order of serialized keys (when using unsigned lexicographic comparison),
* as that comparison method is used by the map when it is more efficient. The bunch size is the maximum number
* of map keys within any bunch of keys within the database. This value is not stored durably in the database,
* and it is safe to change this value over time (and to have different writers using different values for the
* bunch size concurrently), though one writer might undo the work of another writer or make different decisions
* when splitting up values or adding to bunches.
*
* @param serializer serialize to use when reading or writing data
* @param keyComparator comparator used to order keys
* @param bunchSize maximum size of bunch within the database
*/
public BunchedMap(@Nonnull BunchedSerializer<K, V> serializer, @Nonnull Comparator<K> keyComparator, int bunchSize) {
this.serializer = serializer;
this.keyComparator = keyComparator;
this.bunchSize = bunchSize;
}
private static <T> List<T> makeMutable(@Nonnull List<T> list) {
if (list instanceof ArrayList<?>) {
return list;
} else {
return new ArrayList<>(list);
}
}
private CompletableFuture<Optional<KeyValue>> entryForKey(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull K key) {
byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key));
tr.addReadConflictKey(keyBytes);
// We need to use a range read rather than a getKey with a single key selector
// because we need to return the key back as well as the value.
// In practice, this range request should always return a single element, but
// in rare cases, concurrent updates near and around the endpoints might
// result in additional elements being returned.
AsyncIterable<KeyValue> iterable = tr.snapshot().getRange(
KeySelector.lastLessOrEqual(keyBytes),
KeySelector.firstGreaterThan(keyBytes),
ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL
);
return iterable.asList().thenApply(keyValues -> {
if (keyValues.isEmpty()) {
// There aren't any entries before this key in the database.
return Optional.empty();
} else {
// The last (and probably only) result of the range read should be
// the greatest key that is less than or equal to keyBytes.
KeyValue kv = keyValues.get(keyValues.size() - 1);
if (ByteArrayUtil.compareUnsigned(kv.getKey(), keyBytes) > 0) {
throw new BunchedMapException("signpost key found for key is greater than original key")
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
.addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
.addLogInfo("signpostKey", ByteArrayUtil2.loggable(kv.getKey()));
}
if (ByteArrayUtil.startsWith(kv.getKey(), subspaceKey)) {
// The candidate key is in the correct subspace, so this is the signpost key for the given key
return Optional.of(kv);
} else {
// The candidate key is not in the correct subspace, so we must be looking for a
// key that is smaller than the smallest key currently in the map (which is
// vacuously the case if the map is empty).
return Optional.empty();
}
}
});
}
// Grand Theory of Conflict Ranges
//
// Because the map must do range scans that can potentially touch much larger ranges than
// necessary, all reads are done at snapshot isolation level, and conflict ranges are then
// added as needed to try to decrease contention. This logic is a little complicated, so here
// is an attempt to explain the reasoning behind it. The main goal is to enforce the following
// invariants:
//
// 1. For any given DB key, all map keys greater than or equal to that key but strictly less
// than the next DB key are in the DB value associated with that key.
// 2. Any map read that depends on the exact value of a map key being read that is changed
// by a concurrent transaction will trigger a conflict at commit time.
// 3. After a user has written a value to the map, subsequent operations should preserve that
// value.
//
// In some sense, condition 1 is that the integrity of the data structure should be preserved,
// condition 2 is somewhat analogous to serializability, and condition 3 is analogous to
// linearizability and durability. This leads to the following set of conflict ranges, specified
// here in an order that is supposed to reflect how straightforward each added conflict range is:
//
// a. When reading a map key, add a read conflict key to the corresponding key in the
// DB regardless of DB keys actually read. When modifying a map key, add a write
// conflict key to that same key. This gets us (2).
// b. When modifying a DB key, we will end up issuing a write that re-writes all
// values in its range, so add a read conflict range over those keys so that any
// modifications to those keys that happen between read time and commit time
// are not overwritten by the re-write. This gets us (3).
// c. When adding a map key to the end of a DB key's range or merging an existing
// DB key's range into a new key, a write conflict range must be added for the "gaps"
// that existed between the key ranges. This is necessary because without it,
// a concurrent modification can read the range that the DB key is now responsible
// for and write to it in a way that violates (1).
//
// There exist semi-formal proofs as to why these conflict ranges are sufficient to
// guarantee the three invariants proposed, but they are too large to fit into this
// comment. In addition to those proofs, a fair amount of randomized testing has gone
// into verifying that the conflict ranges work as intended.
private void addEntryListReadConflictRange(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes, @Nonnull List<Map.Entry<K, V>> entryList) {
byte[] end = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(entryList.size() - 1).getKey()), ZERO_ARRAY);
tr.addReadConflictRange(keyBytes, end);
}
private void insertAlone(@Nonnull Transaction tr, @Nonnull byte[] keyBytes, @Nonnull Map.Entry<K, V> entry) {
tr.addReadConflictKey(keyBytes);
tr.set(keyBytes, serializer.serializeEntries(Collections.singletonList(entry)));
}
private void writeEntryListWithoutChecking(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
@Nonnull byte[] oldKey, @Nonnull byte[] newKey, @Nonnull List<Map.Entry<K, V>> entryList,
@Nonnull byte[] serializedBytes) {
// The order of these operations is fairly important: as it turns out, adding an explicit
// read conflict range will skip over values that have already been written. This
// means that we will miss the value that is the actual key we are writing if we
// do these in the wrong order.
// TODO: Adding an explicit read conflict range skips the keys in write cache (https://github.com/apple/foundationdb/issues/126)
addEntryListReadConflictRange(tr, subspaceKey, newKey, entryList);
if (!Arrays.equals(oldKey, newKey)) {
tr.clear(oldKey);
}
tr.set(newKey, serializedBytes);
if (!Arrays.equals(keyBytes, newKey)) {
tr.addWriteConflictKey(keyBytes);
}
}
private void writeEntryList(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
@Nonnull byte[] oldKey, @Nonnull byte[] newKey, @Nonnull List<Map.Entry<K, V>> entryList,
@Nullable KeyValue kvAfter, boolean isFirst, boolean isLast) {
byte[] serializedBytes = serializer.serializeEntries(entryList);
if (serializedBytes.length > MAX_VALUE_SIZE) {
if (isFirst || entryList.size() == 1) {
insertAlone(tr, keyBytes, entryList.get(0));
} else if (isLast) {
insertAfter(tr, subspaceKey, keyBytes, kvAfter, entryList.get(entryList.size() - 1));
} else {
// Split the keys down the middle. In principle, this might result in values that exceed
// the maximum value size (if there are weird non-linearities in the serializer--for example,
// if it compresses). However, in practice, this will almost always produce two values that
// are under the maximum value size. If one or both of the two values exceed MAX_VALUE_SIZE but
// each is still less than the FDB maximum value size (which is likely given that
// MAX_VALUE_SIZE is much less than the actual FDB maximum value size), then everything will just
// work. If either one exceeds the FDB maximum value size, then the insertion fails and an error
// bubbles up to the user. This is worse than there not being an error, but it is safe.
int splitPoint = entryList.size() / 2;
List<Map.Entry<K, V>> firstEntries = entryList.subList(0, splitPoint);
byte[] firstSerialized = serializer.serializeEntries(firstEntries);
List<Map.Entry<K, V>> secondEntries = entryList.subList(splitPoint, entryList.size());
byte[] secondSerialized = serializer.serializeEntries(secondEntries);
writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, oldKey, newKey, firstEntries, firstSerialized);
byte[] secondKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(secondEntries.get(0).getKey()));
writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, secondKey, secondKey, secondEntries, secondSerialized);
}
} else {
if (serializer.canAppend() && isLast && entryList.size() > 1 && Arrays.equals(oldKey, newKey)) {
// Note: APPEND_IF_FITS will silently fail if the size of the value is greater than the maximum
// value size. It is therefore *very* important that we check what the size will be before
// calling this method to make sure that the total size is not too large. Otherwise, we might
// lose data.
addEntryListReadConflictRange(tr, subspaceKey, newKey, entryList);
tr.mutate(MutationType.APPEND_IF_FITS, newKey, serializer.serializeEntry(entryList.get(entryList.size() - 1)));
tr.addWriteConflictKey(keyBytes);
} else {
writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, oldKey, newKey, entryList, serializedBytes);
}
// When appending before the beginning or writing after the end, we are essentially asserting
// that this key will be responsible for an additional range of map keys. Concurrent transactions
// might also claim this section of the logical key range in incompatible ways if we do not
// declare write conflict ranges here.
if (isFirst && entryList.size() >= 2) {
tr.addWriteConflictRange(keyBytes, ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(1).getKey())));
}
if (isLast && entryList.size() >= 2) {
tr.addWriteConflictRange(ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(entryList.size() - 2).getKey())), keyBytes);
}
}
}
private void insertAfter(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
@Nullable KeyValue kvAfter, @Nonnull Map.Entry<K, V> entry) {
if (kvAfter == null) {
insertAlone(tr, keyBytes, entry);
} else {
K afterKey = serializer.deserializeKey(kvAfter.getKey(), subspaceKey.length);
List<Map.Entry<K, V>> afterEntryList = serializer.deserializeEntries(afterKey, kvAfter.getValue());
if (afterEntryList.size() >= bunchSize) {
// The next list of entries is too large. Write to a separate KV pair.
insertAlone(tr, keyBytes, entry);
} else {
// Bunch this entry with the next one.
List<Map.Entry<K, V>> newEntryList = new ArrayList<>(afterEntryList.size() + 1);
newEntryList.add(entry);
newEntryList.addAll(afterEntryList);
writeEntryList(tr, subspaceKey, keyBytes, kvAfter.getKey(), keyBytes, newEntryList, null, true, false);
}
}
}
@Nonnull
private Optional<V> insertEntry(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey, @Nonnull byte[] keyBytes,
@Nonnull K key, @Nonnull V value, @Nullable KeyValue kvBefore, @Nullable KeyValue kvAfter,
@Nonnull Map.Entry<K, V> entry) {
if (kvBefore == null) {
insertAfter(tr, subspaceKey, keyBytes, kvAfter, entry);
return Optional.empty();
} else {
K beforeKey = serializer.deserializeKey(kvBefore.getKey(), subspaceKey.length);
List<Map.Entry<K, V>> beforeEntryList = serializer.deserializeEntries(beforeKey, kvBefore.getValue());
int insertIndex = 0;
while (insertIndex < beforeEntryList.size() && keyComparator.compare(key, beforeEntryList.get(insertIndex).getKey()) > 0) {
insertIndex++;
}
if (insertIndex < beforeEntryList.size() && keyComparator.compare(key, beforeEntryList.get(insertIndex).getKey()) == 0) {
// This key is already in the map, so we are going to end up re-writing it iff the value is different.
Map.Entry<K, V> oldEntry = beforeEntryList.get(insertIndex);
V oldValue = oldEntry.getValue();
if (!oldEntry.getValue().equals(value)) {
beforeEntryList = makeMutable(beforeEntryList);
beforeEntryList.set(insertIndex, entry);
writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(),
beforeEntryList, kvAfter, false, false);
} else {
// We are choosing not to re-write the key because it
// already has the value we wanted anyway. Add a
// read conflict key to it so that if something else
// changes it, this transaction will need to be retried
// to set it back.
tr.addReadConflictKey(keyBytes);
}
return Optional.of(oldValue);
} else if (insertIndex < beforeEntryList.size()) {
// This key is going to be inserted somewhere in the middle
beforeEntryList = makeMutable(beforeEntryList);
beforeEntryList.add(insertIndex, entry);
if (beforeEntryList.size() <= bunchSize) {
// Insert the entry in the middle and serialize.
writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(),
beforeEntryList, kvAfter, false, false);
} else {
// Split this entry in half (roughly) and insert both halves
int splitPoint = beforeEntryList.size() / 2;
writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(),
beforeEntryList.subList(0, splitPoint), null, false, false);
List<Map.Entry<K, V>> secondEntries = beforeEntryList.subList(splitPoint, beforeEntryList.size());
byte[] secondKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(secondEntries.get(0).getKey()));
writeEntryList(tr, subspaceKey, keyBytes, secondKey, secondKey, secondEntries, kvAfter, false, false);
}
return Optional.empty();
} else {
// This key is going to be inserted after all of the keys in the before entry.
if (beforeEntryList.size() < bunchSize) {
// Append to the end of the current list.
List<Map.Entry<K, V>> newEntryList = new ArrayList<>(beforeEntryList.size() + 1);
newEntryList.addAll(beforeEntryList);
newEntryList.add(entry);
writeEntryList(tr, subspaceKey, keyBytes, kvBefore.getKey(), kvBefore.getKey(), newEntryList, kvAfter, false, true);
} else {
// This key would make the bunch too large. Insert it into the next one.
insertAfter(tr, subspaceKey, keyBytes, kvAfter, entry);
}
return Optional.empty();
}
}
}
/**
* Inserts or updates a key into a map with a new value. This will find an appropriate
* bunch to insert the key into (or create one if one doesn't exist or if all of the candidates
* are full). It will do work to make sure that the placement is locally optimal (that is, it
* will choose between the one or two bunches closest to the key when performing its bunching).
* It makes no attempt to fix suboptimal bunches elsewhere within the map. If the map already
* contains <code>key</code>, it will overwrite the existing key with the new value. This will
* return the old value if one is present.
*
* <p>
* Note that this method is not thread-safe if multiple threads call it with the same
* transaction and subspace. (Multiple calls with different transactions or subspaces are safe.)
*
* <p>
* Note that this call is asynchronous. It will return a {@link CompletableFuture} that will be
* completed when this task has completed.
*
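* <p>
* For example (a sketch, where {@code map}, {@code subspace}, and {@code db} are an instance of
* this class over {@code Tuple} keys and values, a map subspace, and an open {@code Database}):
*
* <pre>{@code
*     Optional<Tuple> previous = map.put(db, subspace, Tuple.from("color"), Tuple.from("red")).join();
*     // previous is Optional.empty() on first insertion
*     previous = map.put(db, subspace, Tuple.from("color"), Tuple.from("blue")).join();
*     // previous is now Optional.of(Tuple.from("red"))
* }</pre>
*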
* @param tcx database or transaction to use when performing the insertion
* @param subspace subspace within which the map's data are located
* @param key key of the map entry to insert
* @param value value of the map entry to insert
* @return a future that will complete with an optional that will either contain the previous value
* associated with the key or be empty if there was not a previous value
*/
@Nonnull
public CompletableFuture<Optional<V>> put(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key, @Nonnull V value) {
return tcx.runAsync(tr -> {
byte[] subspaceKey = subspace.pack();
byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key));
// We need to know the key (and value) that is less than or equal to
// the key we are trying to insert in our map as well as the key (and value)
// that is greater than our key in the map. Many insertions will not actually
// require both of these, but it is better to grab them both at once using
// a single range read that (with a very high likelihood) will hit a single
// storage server than to do the two reads separately.
//
// Note that we do this read at snapshot isolation level, so if we read too much,
// that won't be a problem in terms of conflict ranges (we will just add the
// correct conflict ranges later).
//
// In practice, the range read will almost always return at most 2 results. Because
// of how range reads with key selectors are implemented, there is a slight
// possibility that there will be more than two if, for example, additional
// keys are added (within this transaction) to the RYW cache.
return tr.snapshot().getRange(
KeySelector.lastLessOrEqual(keyBytes),
KeySelector.firstGreaterThan(keyBytes).add(1),
ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL
).asList().thenApply(keyValues -> {
KeyValue kvBefore = null;
KeyValue kvAfter = null;
for (KeyValue next : keyValues) {
if (ByteArrayUtil.startsWith(next.getKey(), subspaceKey)) {
if (ByteArrayUtil.compareUnsigned(keyBytes, next.getKey()) < 0) {
kvAfter = next;
break; // no need to continue after kvAfter is set
}
if (ByteArrayUtil.compareUnsigned(next.getKey(), keyBytes) <= 0) {
kvBefore = next;
}
}
}
// If either of these triggers, then it means that I screwed up the logic here in
// picking the correct keys and values.
if (kvBefore != null && (ByteArrayUtil.compareUnsigned(keyBytes, kvBefore.getKey()) < 0 || !ByteArrayUtil.startsWith(kvBefore.getKey(), subspaceKey))) {
throw new BunchedMapException("database key before map key compared incorrectly")
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
.addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
.addLogInfo("kvBefore", ByteArrayUtil2.loggable(kvBefore.getKey()));
}
if (kvAfter != null && (ByteArrayUtil.compareUnsigned(keyBytes, kvAfter.getKey()) >= 0 || !ByteArrayUtil.startsWith(kvAfter.getKey(), subspaceKey))) {
throw new BunchedMapException("database key after map key compared incorrectly")
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
.addLogInfo("key", ByteArrayUtil2.loggable(keyBytes))
.addLogInfo("kvAfter", ByteArrayUtil2.loggable(kvAfter.getKey()));
}
Map.Entry<K, V> newEntry = new AbstractMap.SimpleImmutableEntry<>(key, value);
return insertEntry(tr, subspaceKey, keyBytes, key, value, kvBefore, kvAfter, newEntry);
});
});
}
/**
* Determines whether a key is contained within the map. This method is safe to run concurrently
* with other map operations in other threads. However, if there are concurrent
* {@link #put(TransactionContext, Subspace, Object, Object) put}
* or {@link #remove(TransactionContext, Subspace, Object) remove}
* calls, then there are no guarantees as to whether this will return <code>true</code>
* or <code>false</code>.
*
* @param tcx database or transaction to use when performing reads
* @param subspace subspace within which the map's data are located
* @param key the key to check for membership within the map
* @return a future that will be completed to <code>true</code> if the map contains <code>key</code>
*     and <code>false</code> otherwise
*/
@Nonnull
public CompletableFuture<Boolean> containsKey(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
final byte[] subspaceKey = subspace.getKey();
return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key)
.thenApply(optionalEntry -> optionalEntry
.map(kv -> {
K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
return serializer.deserializeKeys(mapKey, kv.getValue()).contains(key);
})
.orElse(false)
)
);
}
/**
* Retrieves the value associated with a key from the map. This method is safe to run concurrently
* with other map operations. However, if there are concurrent
* {@link #put(TransactionContext, Subspace, Object, Object) put}
* or {@link #remove(TransactionContext, Subspace, Object) remove}
* operations, then there are no guarantees as to whether this operation will see the result of the
* concurrent operation or not.
*
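* <p>
* For example (a sketch, with {@code map}, {@code subspace}, and {@code db} as in the
* {@link #put(TransactionContext, Subspace, Object, Object) put()} example):
*
* <pre>{@code
*     Optional<Tuple> value = map.get(db, subspace, Tuple.from("color")).join();
*     Tuple colorOrDefault = value.orElse(Tuple.from("black"));
* }</pre>
*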
* @param tcx database or transaction to use when performing reads
* @param subspace subspace within which the map's data are located
* @param key the key within the map to retrieve the value of
* @return a future that will be completed with an optional that will be present with the value
* associated with the key in the database or empty if the key is not contained within the map
*/
@Nonnull
public CompletableFuture<Optional<V>> get(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
final byte[] subspaceKey = subspace.getKey();
return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key)
.thenApply(optionalEntry -> optionalEntry
.flatMap(kv -> {
K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
final List<Map.Entry<K, V>> entryList = serializer.deserializeEntries(mapKey, kv.getValue());
return entryList.stream()
.filter(entry -> entry.getKey().equals(key))
.findAny()
.map(Map.Entry::getValue);
})
)
);
}
/**
* Removes a key from the map. This returns a future that will contain an optional with the
* old value associated with the key within the map (prior to deletion) if one is present or
* will be empty if the key was not contained within the map.
*
* <p>
* Note that this method is not thread-safe if multiple threads call it with the same
* transaction and subspace. (Multiple calls with different transactions or subspaces are safe.)
*
* <p>
* Note that this call is asynchronous. It will return a {@link CompletableFuture} that will be completed
* when this task has completed.
*
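* <p>
* For example (a sketch, with {@code map}, {@code subspace}, and {@code db} as in the
* {@link #put(TransactionContext, Subspace, Object, Object) put()} example):
*
* <pre>{@code
*     Optional<Tuple> removed = map.remove(db, subspace, Tuple.from("color")).join();
*     // removed contains the old value if the key was present and is empty otherwise
* }</pre>
*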
* @param tcx database or transaction to use when removing the key
* @param subspace subspace within which the map's data are located
* @param key the key to remove from the map
* @return a future that will be completed with an optional that will be present with the value associated
* with the key in the database (prior to removal) or will be empty if the key was not present
*/
@Nonnull
public CompletableFuture<Optional<V>> remove(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace, @Nonnull K key) {
final byte[] subspaceKey = subspace.getKey();
return tcx.runAsync(tr -> entryForKey(tr, subspaceKey, key).thenApply(optionalEntry -> optionalEntry.flatMap((KeyValue kv) -> {
K mapKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
List<Map.Entry<K, V>> entryList = serializer.deserializeEntries(mapKey, kv.getValue());
int foundIndex = -1;
for (int i = 0; i < entryList.size(); i++) {
if (entryList.get(i).getKey().equals(key)) {
foundIndex = i;
break;
}
}
if (foundIndex != -1) {
final Map.Entry<K, V> oldEntry = entryList.get(foundIndex);
// The value that gets written is based on the contents of the entries read, so
// we need to add a read conflict range covering all of the entries we are re-writing.
addEntryListReadConflictRange(tr, subspaceKey, kv.getKey(), entryList);
tr.addWriteConflictKey(ByteArrayUtil.join(subspaceKey, serializer.serializeKey(key)));
if (entryList.size() == 1) {
// The only key that was in the range was the key that
// we are currently removing, so just remove it.
tr.clear(kv.getKey());
} else {
// We have other items in the entry. Remove the entry
// we actually care about and serialize the rest.
entryList = makeMutable(entryList);
entryList.remove(foundIndex);
if (foundIndex == 0) {
tr.clear(kv.getKey());
byte[] newKey = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entryList.get(0).getKey()));
tr.set(newKey, serializer.serializeEntries(entryList));
} else {
tr.set(kv.getKey(), serializer.serializeEntries(entryList));
}
}
return Optional.of(oldEntry.getValue());
} else {
return Optional.empty();
}
})));
}
/**
* Verify the integrity of the bunched map. This will read through all of the database keys associated
* with the map and verify that all of the keys are in order. If it encounters an error, the
* returned future will complete exceptionally. Otherwise, it will complete normally.
*
* @param tcx database or transaction to use when verifying the map
* @param subspace subspace within which the map's data are located
* @return a future that will complete when the integrity check has finished
*/
@Nonnull
public CompletableFuture<Void> verifyIntegrity(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace) {
return tcx.runAsync(tr -> {
AtomicReference<K> lastKey = new AtomicReference<>(null);
byte[] subspaceKey = subspace.getKey();
AsyncIterable<KeyValue> iterable = tr.getRange(subspace.range());
return AsyncUtil.forEach(iterable, kv -> {
K boundaryKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
if (lastKey.get() != null && keyComparator.compare(boundaryKey, lastKey.get()) < 0) {
throw new BunchedMapException("boundary key out of order")
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
.addLogInfo("lastKey", lastKey.get())
.addLogInfo("boundaryKey", boundaryKey);
}
lastKey.set(boundaryKey);
List<K> keys = serializer.deserializeKeys(boundaryKey, kv.getValue());
for (K key : keys) {
if (keyComparator.compare(key, lastKey.get()) < 0) {
throw new BunchedMapException("keys within bunch out of order")
.addLogInfo(LogMessageKeys.SUBSPACE, ByteArrayUtil2.loggable(subspaceKey))
.addLogInfo("lastKey", lastKey.get())
.addLogInfo("nextKey", key);
}
lastKey.set(key);
}
}, tr.getExecutor());
});
}
private void flushEntryList(@Nonnull Transaction tr, @Nonnull byte[] subspaceKey,
@Nonnull List<Map.Entry<K, V>> currentEntryList,
@Nonnull AtomicReference<K> lastKey) {
byte[] keyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(currentEntryList.get(0).getKey()));
writeEntryListWithoutChecking(tr, subspaceKey, keyBytes, keyBytes, keyBytes, currentEntryList,
serializer.serializeEntries(currentEntryList));
lastKey.set(currentEntryList.get(currentEntryList.size() - 1).getKey());
currentEntryList.clear();
}
/**
* Compact the values within the map into as few keys as possible. This will scan through and re-write
* the keys to be optimal. This feature is experimental at the moment, but it can be used to better
* pack entries if needed.
*
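* <p>
* For example, a sketch of driving a full compaction across several transactions (note that
* this method is {@code protected}, so this presumes subclass or same-package access; {@code db}
* is an open {@code Database}):
*
* <pre>{@code
*     byte[] continuation = null;
*     do {
*         continuation = map.compact(db, subspace, 100, continuation).join();
*     } while (continuation != null);
* }</pre>
*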
* @param tcx database or transaction to use when compacting data
* @param subspace subspace within which the map's data are located
* @param keyLimit maximum number of database keys to read in a single transaction
* @param continuation the continuation returned from a previous call or <code>null</code>
*     to start from the beginning of the subspace
* @return future that will complete with a continuation that can be used to complete
*     the compaction across multiple transactions (<code>null</code> if finished)
*/
@Nonnull
protected CompletableFuture<byte[]> compact(@Nonnull TransactionContext tcx, @Nonnull Subspace subspace,
int keyLimit, @Nullable byte[] continuation) {
return tcx.runAsync(tr -> {
byte[] subspaceKey = subspace.getKey();
byte[] begin = (continuation == null) ? subspaceKey : continuation;
byte[] end = subspace.range().end;
final AsyncIterable<KeyValue> iterable = tr.snapshot().getRange(begin, end, keyLimit);
List<Map.Entry<K, V>> currentEntryList = new ArrayList<>(bunchSize);
// The estimated size can be off (and will be off for many implementations of BunchedSerializer),
// but it is just a heuristic to know when to split, so that's fine (I claim).
AtomicInteger currentEntrySize = new AtomicInteger(0);
AtomicInteger readKeys = new AtomicInteger(0);
AtomicReference<byte[]> lastReadKeyBytes = new AtomicReference<>(null);
AtomicReference<K> lastKey = new AtomicReference<>(null);
return AsyncUtil.forEach(iterable, kv -> {
final K boundaryKey = serializer.deserializeKey(kv.getKey(), subspaceKey.length);
final List<Map.Entry<K, V>> entriesFromKey = serializer.deserializeEntries(boundaryKey, kv.getValue());
readKeys.incrementAndGet();
if (entriesFromKey.size() >= bunchSize && currentEntryList.isEmpty()) {
// Nothing can be done. Just move on.
lastReadKeyBytes.set(null);
return;
}
if (lastReadKeyBytes.get() == null) {
lastReadKeyBytes.set(kv.getKey());
}
final byte[] endKeyBytes = ByteArrayUtil.join(subspaceKey, serializer.serializeKey(entriesFromKey.get(entriesFromKey.size() - 1).getKey()), ZERO_ARRAY);
tr.addReadConflictRange(lastReadKeyBytes.get(), endKeyBytes);
tr.addWriteConflictRange(lastReadKeyBytes.get(), kv.getKey());
lastReadKeyBytes.set(endKeyBytes);
tr.clear(kv.getKey());
for (Map.Entry<K, V> entry : entriesFromKey) {
byte[] serializedEntry = serializer.serializeEntry(entry);
if (currentEntrySize.get() + serializedEntry.length > MAX_VALUE_SIZE && !currentEntryList.isEmpty()) {
flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
currentEntrySize.set(0);
}
currentEntryList.add(entry);
currentEntrySize.addAndGet(serializedEntry.length);
if (currentEntryList.size() == bunchSize) {
flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
currentEntrySize.set(0);
}
}
}, tr.getExecutor()).thenApply(vignore -> {
if (!currentEntryList.isEmpty()) {
flushEntryList(tr, subspaceKey, currentEntryList, lastKey);
}
// Return a valid continuation if there might be more keys
if (lastKey.get() != null && keyLimit != ReadTransaction.ROW_LIMIT_UNLIMITED && readKeys.get() == keyLimit) {
return ByteArrayUtil.join(subspaceKey, serializer.serializeKey(lastKey.get()), ZERO_ARRAY);
} else {
return null;
}
});
});
}
/**
* Scans the map and returns an iterator over all entries. This has the same behavior as the
* {@link #scan(ReadTransaction, Subspace, byte[], int, boolean) scan()} method that takes more
* parameters, but it will return an iterator that has no limit and always returns entries in ascending
* order by key.
*
* @param tr transaction to use when scanning the map
* @param subspace subspace in which the map's data are located
* @return an iterator over the entries in the map
*/
@Nonnull
public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace) {
return scan(tr, subspace, null, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
}
/**
* Scans the map and returns an iterator over all entries. This has the same behavior as the
* {@link #scan(ReadTransaction, Subspace, byte[], int, boolean) scan()} method that takes more
* parameters, but it will return an iterator that has no limit and always returns entries in ascending
* order by key.
*
* @param tr transaction to use when scanning the map
* @param subspace subspace in which the map's data are located
* @param continuation continuation from a previous scan (or <code>null</code> to start from the beginning)
* @return an iterator over the entries in the map
*/
@Nonnull
public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nullable byte[] continuation) {
return scan(tr, subspace, continuation, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
}
/**
* Scans the map and returns an iterator over all entries. All entries will be returned sorted by
* key. If <code>reverse</code> is <code>true</code>, it will return keys in descending order instead of in
* ascending order. It will return at most <code>limit</code> keys from the map. Note that because of
* bunching, this will probably require reading fewer keys from the database. Scans that require
* multiple transactions can provide a <code>continuation</code> from a previous scan. The returned iterator
* has a {@link BunchedMapIterator#getContinuation() getContinuation()} method that can be used to get an
* appropriate value for that parameter.
*
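* <p>
* For example, a sketch of paging through a map in batches of 100 entries, one transaction per
* batch ({@code db} is an open {@code Database} and {@code process()} is a stand-in for
* application logic):
*
* <pre>{@code
*     byte[] continuation = null;
*     do {
*         try (Transaction tr = db.createTransaction()) {
*             BunchedMapIterator<Tuple, Tuple> iterator = map.scan(tr, subspace, continuation, 100, false);
*             while (iterator.hasNext()) {
*                 process(iterator.next());
*             }
*             // getContinuation() is assumed to return null once the scan is exhausted
*             continuation = iterator.getContinuation();
*         }
*     } while (continuation != null);
* }</pre>
*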
* @param tr transaction to use when scanning the map
* @param subspace subspace in which the map's data are located
* @param continuation continuation from a previous scan (or <code>null</code> to start from the beginning)
* @param limit maximum number of keys to return or {@link ReadTransaction#ROW_LIMIT_UNLIMITED} if no limit
* @param reverse <code>true</code> if keys are wanted in descending instead of ascending order
* @return an iterator over the entries in the map
*/
@Nonnull
public BunchedMapIterator<K, V> scan(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nullable byte[] continuation, int limit, boolean reverse) {
byte[] subspaceKey = subspace.getKey();
AsyncIterable<KeyValue> rangeReadIterable;
K continuationKey;
if (continuation == null) {
continuationKey = null;
rangeReadIterable = tr.getRange(subspace.range(), ReadTransaction.ROW_LIMIT_UNLIMITED, reverse);
} else {
continuationKey = serializer.deserializeKey(continuation);
byte[] continuationKeyBytes = ByteArrayUtil.join(subspaceKey, continuation);
if (reverse) {
rangeReadIterable = tr.getRange(subspaceKey, continuationKeyBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
} else {
rangeReadIterable = tr.getRange(KeySelector.lastLessOrEqual(continuationKeyBytes), KeySelector.firstGreaterOrEqual(subspace.range().end), ReadTransaction.ROW_LIMIT_UNLIMITED, false);
}
}
return new BunchedMapIterator<>(
AsyncPeekIterator.wrap(rangeReadIterable.iterator()),
tr,
subspace,
subspace.getKey(),
serializer,
keyComparator,
continuationKey,
limit,
reverse
);
}
/**
* Scans multiple maps and returns an iterator over all of them. This behaves the same way as the
* {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
* method that takes additional <code>subspaceStart</code> and <code>subspaceEnd</code> parameters,
* but this will always scan from the beginning of <code>subspace</code> to the end, assumes that
* there is no limit to the number of entries to return, and always returns items in ascending order.
*
* @param tr transaction to use when scanning the maps
* @param subspace subspace containing one or more maps
* @param splitter object to determine which map a given key is in
* @param <T> type of the tag of each map subspace
* @return an iterator over the entries in multiple maps
*/
@Nonnull
public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nonnull SubspaceSplitter<T> splitter) {
return scanMulti(tr, subspace, splitter, null, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
}
/**
* Scans multiple maps and returns an iterator over all of them. This behaves the same way as the
* {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
* method that takes additional <code>subspaceStart</code> and <code>subspaceEnd</code> parameters,
* but this will always scan from the beginning of <code>subspace</code> to the end.
*
* @param tr transaction to use when scanning the maps
* @param subspace subspace containing one or more maps
* @param splitter object to determine which map a given key is in
* @param continuation continuation from previous scan (or <code>null</code> to start from the beginning)
* @param limit maximum number of entries to return
* @param reverse <code>true</code> if the entries should be returned in descending order or <code>false</code> otherwise
* @param <T> type of the tag of each map subspace
* @return an iterator over the entries in multiple maps
*/
@Nonnull
public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nonnull SubspaceSplitter<T> splitter,
@Nullable byte[] continuation, int limit, boolean reverse) {
return scanMulti(tr, subspace, splitter, null, null, continuation, limit, reverse);
}
/**
* Scans multiple maps and returns an iterator over all of them. The returned iterator
* will produce one item for each entry in each map. To do so, the maps must be backed by contiguous
* subspaces within <code>subspace</code>. The scan will start at the first subspace greater
* than or equal to the subspace formed by concatenating the raw prefix of <code>subspace</code>
* with <code>subspaceStart</code> (or with nothing if <code>subspaceStart</code> is
* <code>null</code>) and will end with the last subspace less than the subspace formed
* by concatenating the raw prefix of <code>subspace</code> with <code>subspaceEnd</code>
* (or with nothing if <code>subspaceEnd</code> is <code>null</code>). The provided
* {@link SubspaceSplitter} should be able to determine the subspace of the map that
* contains a given key and (optionally) provide some tag that can be used to associate
* that map with some value that might be used to distinguish one map from another (if
* the application needs it).
*
* <p>
* For example, suppose there are ten <code>BunchedMap</code>s that all begin with a raw prefix
* <code>prefix</code> followed by a {@link com.apple.foundationdb.tuple.Tuple Tuple}-encoded
* integer (0 through 9). If one wanted to scan the four maps starting with map 3 and ending
* with map 6, one would supply the following parameters:
*
* <ul>
*     <li><code>subspace</code> should be set to <code>new Subspace(prefix)</code></li>
*     <li><code>splitter</code> should be implemented so that, given a <code>key</code> within map <code>n</code>,
*         {@link SubspaceSplitter#subspaceOf(byte[]) subspaceOf(key)} returns <code>subspace.subspace(Tuple.from(n))</code>
*         and {@link SubspaceSplitter#subspaceTag(Subspace) subspaceTag(subspaceOf(key))} returns <code>n</code></li>
*     <li><code>subspaceStart</code> should be set to <code>Tuple.pack(3)</code></li>
*     <li><code>subspaceEnd</code> should be set to <code>Tuple.pack(7)</code> (or <code>Tuple.pack(6)</code> concatenated with a 0-byte)</li>
* </ul>
*
* <p>
* Furthermore, this method can be used across multiple calls or transactions by setting the
* <code>continuation</code> parameter to the result of {@link BunchedMapMultiIterator#getContinuation() getContinuation()}
* from a previous scan. (One should pass <code>null</code> to restart the scan from the beginning.)
* The iterator will return at most <code>limit</code> entries. The entries will be ordered first
* by subspace and within a subspace by key. If <code>reverse</code> is <code>true</code>, the
* items will be returned in descending order. Otherwise, they will be returned in ascending order.
*
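* <p>
* A sketch of a splitter matching the example above, assuming the maps' own keys are also
* Tuple-encoded (so a database key unpacked relative to {@code subspace} begins with the map's
* index; {@code tr} and {@code map} are illustrative):
*
* <pre>{@code
*     SubspaceSplitter<Long> splitter = new SubspaceSplitter<Long>() {
*         public Subspace subspaceOf(byte[] keyBytes) {
*             // Tuple encoding is self-delimiting, so the first element is the map's index
*             return subspace.subspace(Tuple.from(subspace.unpack(keyBytes).getLong(0)));
*         }
*         public Long subspaceTag(Subspace mapSubspace) {
*             return subspace.unpack(mapSubspace.getKey()).getLong(0);
*         }
*     };
*     BunchedMapMultiIterator<Tuple, Tuple, Long> iterator = map.scanMulti(tr, subspace, splitter,
*             Tuple.from(3).pack(), Tuple.from(7).pack(), null, 100, false);
* }</pre>
*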
* @param tr transaction to use when scanning the maps
* @param subspace subspace containing one or more maps
* @param splitter object to determine which map a given key is in
* @param subspaceStart inclusive starting suffix of <code>subspace</code> to start the scan
* @param subspaceEnd exclusive ending suffix of <code>subspace</code> to end the scan
* @param continuation continuation from previous scan (or <code>null</code> to start from the beginning)
* @param limit maximum number of entries to return
* @param reverse <code>true</code> if the entries should be returned in descending order or <code>false</code> otherwise
* @param <T> type of the tag of each map subspace
* @return an iterator over the entries in multiple maps
*/
@Nonnull
public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nonnull SubspaceSplitter<T> splitter,
@Nullable byte[] subspaceStart, @Nullable byte[] subspaceEnd,
@Nullable byte[] continuation, int limit, boolean reverse) {
return scanMulti(tr, subspace, splitter, subspaceStart, subspaceEnd, continuation, limit, null, reverse);
}
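/**
* Scans multiple maps and returns an iterator over all of them, like the
* {@link #scanMulti(ReadTransaction, Subspace, SubspaceSplitter, byte[], byte[], byte[], int, boolean) scanMulti()}
* variant without a callback, except that the underlying range-read iterator is wrapped with an
* {@link AsyncPeekCallbackIterator} that hands each raw database key-value pair to
* <code>postReadCallback</code> as it is read.
*
* @param tr transaction to use when scanning the maps
* @param subspace subspace containing one or more maps
* @param splitter object to determine which map a given key is in
* @param subspaceStart inclusive starting suffix of <code>subspace</code> to start the scan
* @param subspaceEnd exclusive ending suffix of <code>subspace</code> to end the scan
* @param continuation continuation from previous scan (or <code>null</code> to start from the beginning)
* @param limit maximum number of entries to return
* @param postReadCallback callback to invoke for each database key-value pair read (or <code>null</code> for none)
* @param reverse <code>true</code> if the entries should be returned in descending order or <code>false</code> otherwise
* @param <T> type of the tag of each map subspace
* @return an iterator over the entries in multiple maps
*/
@Nonnull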
public <T> BunchedMapMultiIterator<K, V, T> scanMulti(@Nonnull ReadTransaction tr, @Nonnull Subspace subspace, @Nonnull SubspaceSplitter<T> splitter,
@Nullable byte[] subspaceStart, @Nullable byte[] subspaceEnd,
@Nullable byte[] continuation, int limit, @Nullable Consumer<KeyValue> postReadCallback, boolean reverse) {
byte[] subspaceKey = subspace.getKey();
byte[] startBytes = (subspaceStart == null ? subspaceKey : ByteArrayUtil.join(subspaceKey, subspaceStart));
byte[] endBytes = (subspaceEnd == null ? ByteArrayUtil.strinc(subspaceKey) : ByteArrayUtil.join(subspaceKey, subspaceEnd));
AsyncIterable<KeyValue> rangeReadIterable;
if (continuation == null) {
rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, reverse);
} else {
byte[] continuationEndpoint = ByteArrayUtil.join(subspaceKey, continuation);
if (reverse) {
if (ByteArrayUtil.compareUnsigned(continuationEndpoint, endBytes) < 0) {
rangeReadIterable = tr.getRange(startBytes, continuationEndpoint, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
} else {
rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, true);
}
} else {
if (ByteArrayUtil.compareUnsigned(continuationEndpoint, startBytes) < 0) {
rangeReadIterable = tr.getRange(startBytes, endBytes, ReadTransaction.ROW_LIMIT_UNLIMITED, false);
} else {
rangeReadIterable = tr.getRange(KeySelector.lastLessThan(continuationEndpoint), KeySelector.firstGreaterOrEqual(endBytes), ReadTransaction.ROW_LIMIT_UNLIMITED, false);
}
}
}
final AsyncPeekIterator<KeyValue> wrappedIterator;
if (postReadCallback == null) {
wrappedIterator = AsyncPeekIterator.wrap(rangeReadIterable.iterator());
} else {
wrappedIterator = AsyncPeekCallbackIterator.wrap(rangeReadIterable.iterator(), postReadCallback);
}
return new BunchedMapMultiIterator<>(
wrappedIterator,
tr,
subspace,
subspaceKey,
splitter,
serializer,
keyComparator,
continuation,
limit,
reverse
);
}
}