
// org.apache.flink.runtime.state.heap.CopyOnWriteStateTable
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.heap;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.runtime.state.RegisteredKeyedBackendStateMetaInfo;
import org.apache.flink.runtime.state.StateTransformationFunction;
import org.apache.flink.util.MathUtils;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.TreeSet;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
* Implementation of Flink's in-memory state tables with copy-on-write support. This map does not support null values
* for key or namespace.
*
* {@link CopyOnWriteStateTable} sacrifices some peak performance and memory efficiency for features like incremental
* rehashing and asynchronous snapshots through copy-on-write. Copy-on-write tries to minimize the amount of copying by
* maintaining version meta data for both, the map structure and the state objects. However, we must often proactively
* copy state objects when we hand them to the user.
*
* As for any state backend, user should not keep references on state objects that they obtained from state backends
* outside the scope of the user function calls.
*
* Some brief maintenance notes:
*
* 1) Flattening the underlying data structure from nested maps (namespace) -> (key) -> (state) to one flat map
* (key, namespace) -> (state) brings certain performance trade-offs. In theory, the flat map has one less level of
* indirection compared to the nested map. However, the nested map naturally de-duplicates namespace objects for which
* #equals() is true. This leads to potentially a lot of redundant namespace objects for the flattened version. Those,
* in turn, can again introduce more cache misses because we need to follow the namespace object on all operations to
* ensure entry identities. Obviously, copy-on-write can also add memory overhead. So does the meta data to track
* copy-on-write requirement (state and entry versions on {@link StateTableEntry}).
*
* 2) A flat map structure is a lot easier when it comes to tracking copy-on-write of the map structure.
*
* 3) Nested structure had the (never used) advantage that we can easily drop and iterate whole namespaces. This could
* give locality advantages for certain access pattern, e.g. iterating a namespace.
*
* 4) Serialization format is changed from namespace-prefix compressed (as naturally provided from the old nested
* structure) to making all entries self contained as (key, namespace, state).
*
* 5) We got rid of having multiple nested tables, one for each key-group. Instead, we partition state into key-groups
* on-the-fly, during the asynchronous part of a snapshot.
*
* 6) Currently, a state table can only grow, but never shrinks on low load. We could easily add this if required.
*
* 7) Heap based state backends like this can easily cause a lot of GC activity. Besides using G1 as garbage collector,
* we should provide an additional state backend that operates on off-heap memory. This would sacrifice peak performance
* (due to de/serialization of objects) for a lower, but more constant throughput and potentially huge simplifications
* w.r.t. copy-on-write.
*
* 8) We could try a hybrid of a serialized and object based backends, where key and namespace of the entries are both
* serialized in one byte-array.
*
* 9) We could consider smaller types (e.g. short) for the version counting and think about some reset strategy before
* overflows, when there is no snapshot running. However, this would have to touch all entries in the map.
*
* This class was initially based on the {@link java.util.HashMap} implementation of the Android JDK, but is now heavily
* customized towards the use case of table for state entries.
*
* IMPORTANT: the contracts for this class rely on the user not holding any references to objects returned by this map
* beyond the life cycle of per-element operations. Or phrased differently, all get-update-put operations on a mapping
* should be within one call of processElement. Otherwise, the user must take care of taking deep copies, e.g. for
* caching purposes.
*
* @param <K> type of key.
* @param <N> type of namespace.
* @param <S> type of value.
*/
public class CopyOnWriteStateTable extends StateTable implements Iterable> {
/**
* The logger.
*/
private static final Logger LOG = LoggerFactory.getLogger(CopyOnWriteStateTable.class);
/**
* Min capacity (other than zero) for a {@link CopyOnWriteStateTable}. Must be a power of two
* greater than 1 (and less than 1 << 30).
*/
private static final int MINIMUM_CAPACITY = 4;
/**
* Max capacity for a {@link CopyOnWriteStateTable}. Must be a power of two >= MINIMUM_CAPACITY.
*/
private static final int MAXIMUM_CAPACITY = 1 << 30;
/**
* Minimum number of entries that one step of incremental rehashing migrates from the old to the new sub-table.
*/
private static final int MIN_TRANSFERRED_PER_INCREMENTAL_REHASH = 4;
/**
* An empty table shared by all zero-capacity maps (typically from default
* constructor). It is never written to, and replaced on first put. Its size
* is set to half the minimum, so that the first resize will create a
* minimum-sized table.
*/
private static final StateTableEntry, ?, ?>[] EMPTY_TABLE = new StateTableEntry[MINIMUM_CAPACITY >>> 1];
/**
* Empty entry that we use to bootstrap our {@link CopyOnWriteStateTable.StateEntryIterator}.
*/
private static final StateTableEntry, ?, ?> ITERATOR_BOOTSTRAP_ENTRY = new StateTableEntry<>();
/**
* Maintains an ordered set of version ids that are still in use by unreleased snapshots.
*/
private final TreeSet snapshotVersions;
/**
* This is the primary entry array (hash directory) of the state table. If no incremental rehash is ongoing, this
* is the only used table.
**/
private StateTableEntry[] primaryTable;
/**
* We maintain a secondary entry array while performing an incremental rehash. The purpose is to slowly migrate
* entries from the primary table to this resized table array. When all entries are migrated, this becomes the new
* primary table.
*/
private StateTableEntry[] incrementalRehashTable;
/**
* The current number of mappings in the primary table.
*/
private int primaryTableSize;
/**
* The current number of mappings in the rehash table.
*/
private int incrementalRehashTableSize;
/**
* The next index for a step of incremental rehashing in the primary table.
*/
private int rehashIndex;
/**
* The current version of this map. Used for copy-on-write mechanics.
*/
private int stateTableVersion;
/**
* The highest version of this map that is still required by any unreleased snapshot.
*/
private int highestRequiredSnapshotVersion;
/**
* The last namespace that was actually inserted. This is a small optimization to reduce duplicate namespace objects.
*/
private N lastNamespace;
/**
* The {@link CopyOnWriteStateTable} is rehashed when its size exceeds this threshold.
* The value of this field is generally .75 * capacity, except when
* the capacity is zero, as described in the EMPTY_TABLE declaration
* above.
*/
private int threshold;
/**
* Incremented by "structural modifications" to allow (best effort)
* detection of concurrent modification.
*/
private int modCount;
/**
 * Constructs a new {@code StateTable} with default capacity of 1024.
 *
 * @param keyContext the key context.
 * @param metaInfo the meta information, including the type serializer for state copy-on-write.
 */
CopyOnWriteStateTable(InternalKeyContext<K> keyContext, RegisteredKeyedBackendStateMetaInfo<N, S> metaInfo) {
	// delegate to the capacity constructor with the default capacity
	this(keyContext, metaInfo, 1024);
}
/**
 * Constructs a new {@code StateTable} instance with the specified capacity.
 *
 * @param keyContext the key context.
 * @param metaInfo the meta information, including the type serializer for state copy-on-write.
 * @param capacity the initial capacity of this hash map.
 * @throws IllegalArgumentException when the capacity is less than zero.
 */
@SuppressWarnings("unchecked")
private CopyOnWriteStateTable(InternalKeyContext<K> keyContext, RegisteredKeyedBackendStateMetaInfo<N, S> metaInfo, int capacity) {
	super(keyContext, metaInfo);

	// initialized tables to EMPTY_TABLE.
	this.primaryTable = (StateTableEntry<K, N, S>[]) EMPTY_TABLE;
	this.incrementalRehashTable = (StateTableEntry<K, N, S>[]) EMPTY_TABLE;

	// initialize sizes to 0.
	this.primaryTableSize = 0;
	this.incrementalRehashTableSize = 0;

	this.rehashIndex = 0;
	this.stateTableVersion = 0;
	this.highestRequiredSnapshotVersion = 0;
	this.snapshotVersions = new TreeSet<>();

	if (capacity < 0) {
		throw new IllegalArgumentException("Capacity: " + capacity);
	}

	// zero capacity keeps the shared EMPTY_TABLE; threshold -1 forces a resize on first put
	if (capacity == 0) {
		threshold = -1;
		return;
	}

	// clamp to [MINIMUM_CAPACITY, MAXIMUM_CAPACITY] and round up to a power of two
	if (capacity < MINIMUM_CAPACITY) {
		capacity = MINIMUM_CAPACITY;
	} else if (capacity > MAXIMUM_CAPACITY) {
		capacity = MAXIMUM_CAPACITY;
	} else {
		capacity = MathUtils.roundUpToPowerOfTwo(capacity);
	}

	primaryTable = makeTable(capacity);
}
// Public API from AbstractStateTable ------------------------------------------------------------------------------

/**
 * Returns the total number of entries in this {@link CopyOnWriteStateTable}. This is the sum of both sub-tables.
 *
 * @return the number of entries in this {@link CopyOnWriteStateTable}.
 */
@Override
public int size() {
	// during an incremental rehash, entries live in both sub-tables
	final int totalMappings = primaryTableSize + incrementalRehashTableSize;
	return totalMappings;
}
/**
 * Returns the state for the composite of active key and given namespace, performing copy-on-write for the state
 * object (and, if needed, its entry chain) when an unreleased snapshot still requires the old version.
 */
@Override
public S get(K key, N namespace) {

	final int hash = computeHashForOperationAndDoIncrementalRehash(key, namespace);
	final int requiredVersion = highestRequiredSnapshotVersion;
	final StateTableEntry<K, N, S>[] tab = selectActiveTable(hash);
	int index = hash & (tab.length - 1);

	for (StateTableEntry<K, N, S> e = tab[index]; e != null; e = e.next) {
		final K eKey = e.key;
		final N eNamespace = e.namespace;
		if ((e.hash == hash && key.equals(eKey) && namespace.equals(eNamespace))) {

			// copy-on-write check for state
			if (e.stateVersion < requiredVersion) {
				// copy-on-write check for entry
				if (e.entryVersion < requiredVersion) {
					e = handleChainedEntryCopyOnWrite(tab, hash & (tab.length - 1), e);
				}
				e.stateVersion = stateTableVersion;
				// hand out a copy so the snapshot keeps the old state object untouched
				e.state = getStateSerializer().copy(e.state);
			}

			return e.state;
		}
	}

	return null;
}
/**
 * Returns a stream of all keys that have state mapped under the given namespace.
 */
@Override
public Stream<K> getKeys(N namespace) {
	Iterable<StateEntry<K, N, S>> iterable = () -> iterator();
	return StreamSupport.stream(iterable.spliterator(), false)
		.filter(entry -> entry.getNamespace().equals(namespace))
		.map(entry -> entry.getKey());
}
// The methods below delegate to the (key, namespace) variants, supplying the current key from the key context.

@Override
public void put(K key, int keyGroup, N namespace, S state) {
	// the key group is not needed here; entries are partitioned into key groups lazily during snapshots
	put(key, namespace, state);
}

@Override
public S get(N namespace) {
	return get(keyContext.getCurrentKey(), namespace);
}

@Override
public boolean containsKey(N namespace) {
	return containsKey(keyContext.getCurrentKey(), namespace);
}

@Override
public void put(N namespace, S state) {
	put(keyContext.getCurrentKey(), namespace, state);
}

@Override
public S putAndGetOld(N namespace, S state) {
	return putAndGetOld(keyContext.getCurrentKey(), namespace, state);
}

@Override
public void remove(N namespace) {
	remove(keyContext.getCurrentKey(), namespace);
}

@Override
public S removeAndGetOld(N namespace) {
	return removeAndGetOld(keyContext.getCurrentKey(), namespace);
}

@Override
public <T> void transform(N namespace, T value, StateTransformationFunction<S, T> transformation) throws Exception {
	transform(keyContext.getCurrentKey(), namespace, value, transformation);
}
// Private implementation details of the API methods ---------------------------------------------------------------

/**
 * Returns whether this table contains the specified key/namespace composite key.
 *
 * @param key the key in the composite key to search for. Not null.
 * @param namespace the namespace in the composite key to search for. Not null.
 * @return {@code true} if this map contains the specified key/namespace composite key,
 * {@code false} otherwise.
 */
boolean containsKey(K key, N namespace) {

	final int hash = computeHashForOperationAndDoIncrementalRehash(key, namespace);
	final StateTableEntry<K, N, S>[] tab = selectActiveTable(hash);
	int index = hash & (tab.length - 1);

	for (StateTableEntry<K, N, S> e = tab[index]; e != null; e = e.next) {
		final K eKey = e.key;
		final N eNamespace = e.namespace;

		if ((e.hash == hash && key.equals(eKey) && namespace.equals(eNamespace))) {
			return true;
		}
	}
	return false;
}
/**
 * Maps the specified key/namespace composite key to the specified value. This method should be preferred
 * over {@link #putAndGetOld(Object, Object, Object)} when the caller is not interested
 * in the old value, because this can potentially reduce copy-on-write activity.
 *
 * @param key the key. Not null.
 * @param namespace the namespace. Not null.
 * @param value the value. Can be null.
 */
void put(K key, N namespace, S value) {
	final StateTableEntry<K, N, S> e = putEntry(key, namespace);

	// no need to copy the old state: it is simply overwritten and putEntry already secured the entry chain
	e.state = value;
	e.stateVersion = stateTableVersion;
}
/**
 * Maps the specified key/namespace composite key to the specified value. Returns the previous state that was
 * registered under the composite key.
 *
 * @param key the key. Not null.
 * @param namespace the namespace. Not null.
 * @param value the value. Can be null.
 * @return the value of any previous mapping with the specified key or
 * {@code null} if there was no such mapping.
 */
S putAndGetOld(K key, N namespace, S value) {

	final StateTableEntry<K, N, S> e = putEntry(key, namespace);

	// copy-on-write check for state
	S oldState = (e.stateVersion < highestRequiredSnapshotVersion) ?
			getStateSerializer().copy(e.state) :
			e.state;

	e.state = value;
	e.stateVersion = stateTableVersion;

	return oldState;
}
/**
 * Removes the mapping with the specified key/namespace composite key from this map. This method should be preferred
 * over {@link #removeAndGetOld(Object, Object)} when the caller is not interested in the old value, because this
 * can potentially reduce copy-on-write activity.
 *
 * @param key the key of the mapping to remove. Not null.
 * @param namespace the namespace of the mapping to remove. Not null.
 */
void remove(K key, N namespace) {
	// discard the returned entry; no copy-on-write of the old state is needed when nobody reads it
	removeEntry(key, namespace);
}
/**
 * Removes the mapping with the specified key/namespace composite key from this map, returning the state that was
 * found under the entry.
 *
 * @param key the key of the mapping to remove. Not null.
 * @param namespace the namespace of the mapping to remove. Not null.
 * @return the value of the removed mapping or {@code null} if no mapping
 * for the specified key was found.
 */
S removeAndGetOld(K key, N namespace) {

	final StateTableEntry<K, N, S> e = removeEntry(key, namespace);

	return e != null ?
			// copy-on-write check for state
			(e.stateVersion < highestRequiredSnapshotVersion ?
					getStateSerializer().copy(e.state) :
					e.state) :
			null;
}
/**
 * Applies the given {@link StateTransformationFunction} to the state mapped under the given composite key (1st
 * input argument), using the given value as 2nd input argument. The result of the transformation is stored as the
 * new state. This is basically an optimization for the get-update-put access pattern.
 *
 * @param key the key of the mapping to transform. Not null.
 * @param namespace the namespace of the mapping to transform. Not null.
 * @param value the value that is the second input for the transformation.
 * @param transformation the transformation function to apply on the old state and the given value.
 * @param <T> type of the value that is the second input to the {@link StateTransformationFunction}.
 * @throws Exception exception that happen on applying the function.
 * @see #transform(Object, Object, StateTransformationFunction).
 */
<T> void transform(
		K key,
		N namespace,
		T value,
		StateTransformationFunction<S, T> transformation) throws Exception {

	final StateTableEntry<K, N, S> entry = putEntry(key, namespace);

	// copy-on-write check for state
	entry.state = transformation.apply(
			(entry.stateVersion < highestRequiredSnapshotVersion) ?
					getStateSerializer().copy(entry.state) :
					entry.state,
			value);
	entry.stateVersion = stateTableVersion;
}
/**
 * Helper method that is the basis for operations that add mappings. Finds the entry for the composite key
 * (securing the chain for copy-on-write if needed) or creates a new one, resizing first when over threshold.
 */
private StateTableEntry<K, N, S> putEntry(K key, N namespace) {

	final int hash = computeHashForOperationAndDoIncrementalRehash(key, namespace);
	final StateTableEntry<K, N, S>[] tab = selectActiveTable(hash);
	int index = hash & (tab.length - 1);

	for (StateTableEntry<K, N, S> e = tab[index]; e != null; e = e.next) {
		if (e.hash == hash && key.equals(e.key) && namespace.equals(e.namespace)) {

			// copy-on-write check for entry
			if (e.entryVersion < highestRequiredSnapshotVersion) {
				e = handleChainedEntryCopyOnWrite(tab, index, e);
			}

			return e;
		}
	}

	++modCount;
	if (size() > threshold) {
		doubleCapacity();
	}

	return addNewStateTableEntry(tab, key, namespace, hash);
}
/**
 * Helper method that is the basis for operations that remove mappings. Unlinks the matching entry from its chain,
 * copying the predecessor first when an unreleased snapshot still requires the old chain.
 */
private StateTableEntry<K, N, S> removeEntry(K key, N namespace) {

	final int hash = computeHashForOperationAndDoIncrementalRehash(key, namespace);
	final StateTableEntry<K, N, S>[] tab = selectActiveTable(hash);
	int index = hash & (tab.length - 1);

	for (StateTableEntry<K, N, S> e = tab[index], prev = null; e != null; prev = e, e = e.next) {
		if (e.hash == hash && key.equals(e.key) && namespace.equals(e.namespace)) {
			if (prev == null) {
				// head of chain: simply re-point the bucket
				tab[index] = e.next;
			} else {
				// copy-on-write check for entry
				if (prev.entryVersion < highestRequiredSnapshotVersion) {
					prev = handleChainedEntryCopyOnWrite(tab, index, prev);
				}
				prev.next = e.next;
			}
			++modCount;
			if (tab == primaryTable) {
				--primaryTableSize;
			} else {
				--incrementalRehashTableSize;
			}
			return e;
		}
	}
	return null;
}
/**
 * Validates that both parts of the composite key are non-null before any table operation.
 */
private void checkKeyNamespacePreconditions(K key, N namespace) {
	if (key == null) {
		throw new NullPointerException("No key set. This method should not be called outside of a keyed context.");
	}
	if (namespace == null) {
		throw new NullPointerException("Provided namespace is null.");
	}
}
// Meta data setter / getter and toString --------------------------------------------------------------------------
@Override
public TypeSerializer getStateSerializer() {
return metaInfo.getStateSerializer();
}
@Override
public TypeSerializer getNamespaceSerializer() {
return metaInfo.getNamespaceSerializer();
}
@Override
public RegisteredKeyedBackendStateMetaInfo getMetaInfo() {
return metaInfo;
}
@Override
public void setMetaInfo(RegisteredKeyedBackendStateMetaInfo metaInfo) {
this.metaInfo = metaInfo;
}
// Iteration ------------------------------------------------------------------------------------------------------
@Override
public Iterator> iterator() {
return new StateEntryIterator();
}
// Private utility functions for StateTable management -------------------------------------------------------------

/**
 * Releases the snapshot with the given version id, recomputing the highest version still required.
 *
 * @see #releaseSnapshot(CopyOnWriteStateTableSnapshot)
 */
@VisibleForTesting
void releaseSnapshot(int snapshotVersion) {
	// we guard against concurrent modifications of highestRequiredSnapshotVersion between snapshot and release.
	// Only stale reads from the result of #releaseSnapshot calls are ok.
	synchronized (snapshotVersions) {
		Preconditions.checkState(snapshotVersions.remove(snapshotVersion), "Attempt to release unknown snapshot version");
		highestRequiredSnapshotVersion = snapshotVersions.isEmpty() ? 0 : snapshotVersions.last();
	}
}
/**
 * Creates (combined) copy of the table arrays for a snapshot. This method must be called by the same Thread that
 * does modifications to the {@link CopyOnWriteStateTable}.
 */
@VisibleForTesting
@SuppressWarnings("unchecked")
StateTableEntry<K, N, S>[] snapshotTableArrays() {

	// we guard against concurrent modifications of highestRequiredSnapshotVersion between snapshot and release.
	// Only stale reads from the result of #releaseSnapshot calls are ok. This is why we must call this method
	// from the same thread that does all the modifications to the table.
	synchronized (snapshotVersions) {

		// increase the table version for copy-on-write and register the snapshot
		if (++stateTableVersion < 0) {
			// this is just a safety net against overflows, but should never happen in practice (i.e., only after 2^31 snapshots)
			throw new IllegalStateException("Version count overflow in CopyOnWriteStateTable. Enforcing restart.");
		}

		highestRequiredSnapshotVersion = stateTableVersion;
		snapshotVersions.add(highestRequiredSnapshotVersion);
	}

	StateTableEntry<K, N, S>[] table = primaryTable;
	if (isRehashing()) {
		// consider both tables for the snapshot, the rehash index tells us which part of the two tables we need
		final int localRehashIndex = rehashIndex;
		final int localCopyLength = table.length - localRehashIndex;
		StateTableEntry<K, N, S>[] copy = new StateTableEntry[localRehashIndex + table.length];
		// for the primary table, take every index >= rhIdx.
		System.arraycopy(table, localRehashIndex, copy, 0, localCopyLength);

		// for the new table, we are sure that two regions contain all the entries:
		// [0, rhIdx[ AND [table.length / 2, table.length / 2 + rhIdx[
		table = incrementalRehashTable;
		System.arraycopy(table, 0, copy, localCopyLength, localRehashIndex);
		System.arraycopy(table, table.length >>> 1, copy, localCopyLength + localRehashIndex, localRehashIndex);

		return copy;
	} else {
		// we only need to copy the primary table
		return Arrays.copyOf(table, table.length);
	}
}
/**
 * Allocate a table of the given capacity and set the threshold accordingly.
 *
 * @param newCapacity must be a power of two
 */
private StateTableEntry<K, N, S>[] makeTable(int newCapacity) {

	if (MAXIMUM_CAPACITY == newCapacity) {
		LOG.warn("Maximum capacity of 2^30 in StateTable reached. Cannot increase hash table size. This can lead " +
				"to more collisions and lower performance. Please consider scaling-out your job or using a " +
				"different keyed state backend implementation!");
	}

	threshold = (newCapacity >> 1) + (newCapacity >> 2); // 3/4 capacity
	@SuppressWarnings("unchecked") StateTableEntry<K, N, S>[] newTable
			= (StateTableEntry<K, N, S>[]) new StateTableEntry[newCapacity];
	return newTable;
}
/**
 * Creates and inserts a new {@link StateTableEntry} at the head of its bucket chain.
 */
private StateTableEntry<K, N, S> addNewStateTableEntry(
		StateTableEntry<K, N, S>[] table,
		K key,
		N namespace,
		int hash) {

	// small optimization that aims to avoid holding references on duplicate namespace objects
	if (namespace.equals(lastNamespace)) {
		namespace = lastNamespace;
	} else {
		lastNamespace = namespace;
	}

	int index = hash & (table.length - 1);
	StateTableEntry<K, N, S> newEntry = new StateTableEntry<>(
			key,
			namespace,
			null,
			hash,
			table[index],
			stateTableVersion,
			stateTableVersion);
	table[index] = newEntry;

	if (table == primaryTable) {
		++primaryTableSize;
	} else {
		++incrementalRehashTableSize;
	}
	return newEntry;
}
/**
 * Select the sub-table which is responsible for entries with the given hash code.
 *
 * @param hashCode the hash code which we use to decide about the table that is responsible.
 * @return the sub-table that is responsible for the entry with the given hash code.
 */
private StateTableEntry<K, N, S>[] selectActiveTable(int hashCode) {
	// buckets before rehashIndex have already been migrated into the incremental rehash table
	return (hashCode & (primaryTable.length - 1)) >= rehashIndex ? primaryTable : incrementalRehashTable;
}
/**
 * Initiates doubling the capacity of the hash table by allocating the incremental rehash table with twice the
 * current capacity. The actual migration of entries happens lazily through incremental rehash steps. If the
 * current capacity is MAXIMUM_CAPACITY, this method is a no-op.
 */
private void doubleCapacity() {

	// There can only be one rehash in flight. From the amount of incremental rehash steps we take, this should always hold.
	Preconditions.checkState(!isRehashing(), "There is already a rehash in progress.");

	StateTableEntry<K, N, S>[] oldTable = primaryTable;

	int oldCapacity = oldTable.length;

	if (oldCapacity == MAXIMUM_CAPACITY) {
		return;
	}

	incrementalRehashTable = makeTable(oldCapacity * 2);
}
/**
 * Returns true, if an incremental rehash is in progress.
 */
@VisibleForTesting
boolean isRehashing() {
	// a rehash is ongoing exactly while the secondary table is not the shared empty table
	return incrementalRehashTable != EMPTY_TABLE;
}
/**
 * Computes the hash for the composite of key and namespace and performs some steps of incremental rehash if
 * incremental rehashing is in progress.
 */
private int computeHashForOperationAndDoIncrementalRehash(K key, N namespace) {

	checkKeyNamespacePreconditions(key, namespace);

	// piggy-back a few migration steps on every operation while a rehash is in flight
	if (isRehashing()) {
		incrementalRehash();
	}

	return compositeHash(key, namespace);
}
/**
 * Runs a number of steps for incremental rehashing, migrating at least
 * {@link #MIN_TRANSFERRED_PER_INCREMENTAL_REHASH} entries from the primary to the rehash table.
 */
@SuppressWarnings("unchecked")
private void incrementalRehash() {

	StateTableEntry<K, N, S>[] oldTable = primaryTable;
	StateTableEntry<K, N, S>[] newTable = incrementalRehashTable;

	int oldCapacity = oldTable.length;
	int newMask = newTable.length - 1;
	int requiredVersion = highestRequiredSnapshotVersion;
	int rhIdx = rehashIndex;
	int transferred = 0;

	// we migrate a certain minimum amount of entries from the old to the new table
	while (transferred < MIN_TRANSFERRED_PER_INCREMENTAL_REHASH) {

		StateTableEntry<K, N, S> e = oldTable[rhIdx];

		while (e != null) {
			// copy-on-write check for entry
			if (e.entryVersion < requiredVersion) {
				e = new StateTableEntry<>(e, stateTableVersion);
			}
			StateTableEntry<K, N, S> n = e.next;
			int pos = e.hash & newMask;
			e.next = newTable[pos];
			newTable[pos] = e;
			e = n;
			++transferred;
		}

		oldTable[rhIdx] = null;
		if (++rhIdx == oldCapacity) {
			//here, the rehash is complete and we release resources and reset fields
			primaryTable = newTable;
			incrementalRehashTable = (StateTableEntry<K, N, S>[]) EMPTY_TABLE;
			primaryTableSize += incrementalRehashTableSize;
			incrementalRehashTableSize = 0;
			rehashIndex = 0;
			return;
		}
	}

	// sync our local bookkeeping with the official bookkeeping fields
	primaryTableSize -= transferred;
	incrementalRehashTableSize += transferred;
	rehashIndex = rhIdx;
}
/**
 * Perform copy-on-write for entry chains. We iterate the (hopefully and probably) still cached chain, replace
 * all links up to the 'untilEntry', which we actually wanted to modify.
 */
private StateTableEntry<K, N, S> handleChainedEntryCopyOnWrite(
		StateTableEntry<K, N, S>[] tab,
		int tableIdx,
		StateTableEntry<K, N, S> untilEntry) {

	final int required = highestRequiredSnapshotVersion;

	StateTableEntry<K, N, S> current = tab[tableIdx];
	StateTableEntry<K, N, S> copy;

	if (current.entryVersion < required) {
		copy = new StateTableEntry<>(current, stateTableVersion);
		tab[tableIdx] = copy;
	} else {
		// nothing to do, just advance copy to current
		copy = current;
	}

	// we iterate the chain up to 'until entry'
	while (current != untilEntry) {

		//advance current
		current = current.next;

		if (current.entryVersion < required) {
			// copy and advance the current's copy
			copy.next = new StateTableEntry<>(current, stateTableVersion);
			copy = copy.next;
		} else {
			// nothing to do, just advance copy to current
			copy = current;
		}
	}

	return copy;
}
/** Returns the shared bootstrap entry, cast to the requested type parameters. Safe: the entry is all-null. */
@SuppressWarnings("unchecked")
private static <K, N, S> StateTableEntry<K, N, S> getBootstrapEntry() {
	return (StateTableEntry<K, N, S>) ITERATOR_BOOTSTRAP_ENTRY;
}
/**
 * Helper function that creates and scrambles a composite hash for key and namespace.
 */
private static int compositeHash(Object key, Object namespace) {
	// XOR both hash codes into one composite, then bit-mix to better distribute skewed key hashes
	final int composite = key.hashCode() ^ namespace.hashCode();
	return MathUtils.bitMix(composite);
}
// Snapshotting ----------------------------------------------------------------------------------------------------

/** Returns the current version of this table, used by copy-on-write mechanics and snapshots. */
int getStateTableVersion() {
	return stateTableVersion;
}

/**
 * Creates a snapshot of this {@link CopyOnWriteStateTable}, to be written in checkpointing. The snapshot integrity
 * is protected through copy-on-write from the {@link CopyOnWriteStateTable}. Users should call
 * {@link #releaseSnapshot(CopyOnWriteStateTableSnapshot)} after using the returned object.
 *
 * @return a snapshot from this {@link CopyOnWriteStateTable}, for checkpointing.
 */
@Override
public CopyOnWriteStateTableSnapshot<K, N, S> createSnapshot() {
	return new CopyOnWriteStateTableSnapshot<>(this);
}

/**
 * Releases a snapshot for this {@link CopyOnWriteStateTable}. This method should be called once a snapshot is no more needed,
 * so that the {@link CopyOnWriteStateTable} can stop considering this snapshot for copy-on-write, thus avoiding unnecessary
 * object creation.
 *
 * @param snapshotToRelease the snapshot to release, which was previously created by this state table.
 */
void releaseSnapshot(CopyOnWriteStateTableSnapshot<K, N, S> snapshotToRelease) {

	Preconditions.checkArgument(snapshotToRelease.isOwner(this),
			"Cannot release snapshot which is owned by a different state table.");

	releaseSnapshot(snapshotToRelease.getSnapshotVersion());
}
// StateTableEntry -------------------------------------------------------------------------------------------------
/**
* One entry in the {@link CopyOnWriteStateTable}. This is a triplet of key, namespace, and state. Thereby, key and
* namespace together serve as a composite key for the state. This class also contains some management meta data for
* copy-on-write, a pointer to link other {@link StateTableEntry}s to a list, and cached hash code.
*
* @param type of key.
* @param type of namespace.
* @param type of state.
*/
static class StateTableEntry implements StateEntry {
/**
* The key. Assumed to be immutable and not null.
*/
final K key;
/**
* The namespace. Assumed to be immutable and not null.
*/
final N namespace;
/**
* The state. This is not final to allow exchanging the object for copy-on-write. Can be null.
*/
S state;
/**
* Link to another {@link StateTableEntry}. This is used to resolve collisions in the
* {@link CopyOnWriteStateTable} through chaining.
*/
StateTableEntry next;
/**
* The version of this {@link StateTableEntry}. This is meta data for copy-on-write of the table structure.
*/
int entryVersion;
/**
* The version of the state object in this entry. This is meta data for copy-on-write of the state object itself.
*/
int stateVersion;
/**
* The computed secondary hash for the composite of key and namespace.
*/
final int hash;
StateTableEntry() {
this(null, null, null, 0, null, 0, 0);
}
StateTableEntry(StateTableEntry other, int entryVersion) {
this(other.key, other.namespace, other.state, other.hash, other.next, entryVersion, other.stateVersion);
}
StateTableEntry(
K key,
N namespace,
S state,
int hash,
StateTableEntry next,
int entryVersion,
int stateVersion) {
this.key = key;
this.namespace = namespace;
this.hash = hash;
this.next = next;
this.entryVersion = entryVersion;
this.state = state;
this.stateVersion = stateVersion;
}
public final void setState(S value, int mapVersion) {
// naturally, we can update the state version every time we replace the old state with a different object
if (value != state) {
this.state = value;
this.stateVersion = mapVersion;
}
}
@Override
public K getKey() {
return key;
}
@Override
public N getNamespace() {
return namespace;
}
@Override
public S getState() {
return state;
}
@Override
public final boolean equals(Object o) {
if (!(o instanceof CopyOnWriteStateTable.StateTableEntry)) {
return false;
}
StateEntry, ?, ?> e = (StateEntry, ?, ?>) o;
return e.getKey().equals(key)
&& e.getNamespace().equals(namespace)
&& Objects.equals(e.getState(), state);
}
@Override
public final int hashCode() {
return (key.hashCode() ^ namespace.hashCode()) ^ Objects.hashCode(state);
}
@Override
public final String toString() {
return "(" + key + "|" + namespace + ")=" + state;
}
}
// For testing ----------------------------------------------------------------------------------------------------

/** Counts the entries under the given namespace by full iteration; intended for tests only. */
@Override
public int sizeOfNamespace(Object namespace) {
	int count = 0;
	for (StateEntry<K, N, S> entry : this) {
		if (null != entry && namespace.equals(entry.getNamespace())) {
			++count;
		}
	}
	return count;
}
// StateEntryIterator ---------------------------------------------------------------------------------------------
/**
* Iterator over the entries in a {@link CopyOnWriteStateTable}.
*/
class StateEntryIterator implements Iterator> {
private StateTableEntry[] activeTable;
private int nextTablePosition;
private StateTableEntry nextEntry;
private int expectedModCount = modCount;
StateEntryIterator() {
this.activeTable = primaryTable;
this.nextTablePosition = 0;
this.expectedModCount = modCount;
this.nextEntry = getBootstrapEntry();
advanceIterator();
}
private StateTableEntry advanceIterator() {
StateTableEntry entryToReturn = nextEntry;
StateTableEntry next = entryToReturn.next;
// consider both sub-tables tables to cover the case of rehash
while (next == null) {
StateTableEntry[] tab = activeTable;
while (nextTablePosition < tab.length) {
next = tab[nextTablePosition++];
if (next != null) {
nextEntry = next;
return entryToReturn;
}
}
if (activeTable == incrementalRehashTable) {
break;
}
activeTable = incrementalRehashTable;
nextTablePosition = 0;
}
nextEntry = next;
return entryToReturn;
}
@Override
public boolean hasNext() {
return nextEntry != null;
}
@Override
public StateTableEntry next() {
if (modCount != expectedModCount) {
throw new ConcurrentModificationException();
}
if (nextEntry == null) {
throw new NoSuchElementException();
}
return advanceIterator();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Read-only iterator");
}
}
}