/*
* Copyright 2004-2018 H2 Group. Multiple-Licensed under the MPL 2.0,
* and the EPL 1.0 (http://h2database.com/html/license.html).
* Initial Developer: H2 Group
*/
package org.h2.mvstore;
import java.lang.Thread.UncaughtExceptionHandler;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.h2.compress.CompressDeflate;
import org.h2.compress.CompressLZF;
import org.h2.compress.Compressor;
import org.h2.mvstore.Page.PageChildren;
import org.h2.mvstore.cache.CacheLongKeyLIRS;
import org.h2.mvstore.type.StringDataType;
import org.h2.util.MathUtils;
import org.h2.util.New;
/*
TODO:
Documentation
- rolling docs review: at "Metadata Map"
- better document that writes are in background thread
- better document how to do non-unique indexes
- document pluggable store and OffHeapStore
TransactionStore:
- ability to disable the transaction log,
if there is only one connection
MVStore:
- better and clearer memory usage accounting rules
(heap memory versus disk memory), so that there is
never an out-of-memory error, even for a small heap,
and so that chunks are still relatively big on average
- make sure serialization / deserialization errors don't corrupt the file
- test and possibly improve compact operation (for large dbs)
- automated 'kill process' and 'power failure' test
- defragment (re-creating maps, especially those with small pages)
- store number of write operations per page (maybe defragment
if much different than count)
- r-tree: nearest neighbor search
- use a small object value cache (StringCache), test on Android
for default serialization
- MVStoreTool.dump should dump the data if possible;
possibly using a callback for serialization
- implement a sharded map (in one store, multiple stores)
to support concurrent updates and writes, and very large maps
- to save space when persisting very small transactions,
use a transaction log where only the deltas are stored
- serialization for lists, sets, sorted sets, maps, sorted maps
- maybe rename 'rollback' to 'revert' to distinguish from transactions
- support other compression algorithms (deflate, LZ4,...)
- remove features that are not really needed; simplify the code
possibly using a separate layer or tools
(retainVersion?)
- optional pluggable checksum mechanism (per page), which
requires that everything is a page (including headers)
- rename "store" to "save", as "store" is used in "storeVersion"
- rename setStoreVersion to setDataVersion, setSchemaVersion or similar
- temporary file storage
- simple rollback method (rollback to last committed version)
- MVMap to implement SortedMap, then NavigableMap
- storage that splits database into multiple files,
to speed up compact and allow using trim
(by truncating / deleting empty files)
- add new feature to the file system API to avoid copying data
(reads that return a ByteBuffer instead of writing into one)
for memory mapped files and off-heap storage
- support log structured merge style operations (blind writes)
using one map per level plus bloom filter
- have a strict call order MVStore -> MVMap -> Page -> FileStore
- autocommit commits, stores, and compacts from time to time;
the background thread should wait at least 90% of the
configured write delay to store changes
- compact* should also store uncommitted changes (if there are any)
- write a LSM-tree (log structured merge tree) utility on top of the MVStore
with blind writes and/or a bloom filter that
internally uses regular maps and merge sort
- chunk metadata: maybe split into static and variable,
or use a small page size for metadata
- data type "string": maybe use prefix compression for keys
- test chunk id rollover
- feature to auto-compact from time to time and on close
- compact very small chunks
- Page: to save memory, combine keys & values into one array
(also children & counts). Maybe remove some other
fields (childrenCount for example)
- Support SortedMap for MVMap
- compact: copy whole pages (without having to open all maps)
- maybe change the length code to have lower gaps
- test with very low limits (such as: short chunks, small pages)
- maybe allow to read beyond the retention time:
when compacting, move live pages in old chunks
to a map (possibly the metadata map) -
this requires a change in the compaction code, plus
a map lookup when reading old data; also, this
old data map needs to be cleaned up somehow;
maybe using an additional timeout
- rollback of removeMap should restore the data -
which has big consequences, as the metadata map
would probably need references to the root nodes of all maps
*/
/**
* A persistent storage for maps.
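*
* A minimal usage sketch (illustrative only; the file name, map name, and
* key / value types are examples, not part of this class):
* <pre>{@code
* MVStore s = MVStore.open("data.mv");
* MVMap<Integer, String> map = s.openMap("data");
* map.put(1, "Hello");
* s.commit();
* s.close();
* }</pre>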
*/
public final class MVStore {
/**
* Whether assertions are enabled.
*/
public static final boolean ASSERT = false;
/**
* The block size (physical sector size) of the disk. The store header is
* written twice, one copy in each block, to ensure it survives a crash.
*/
static final int BLOCK_SIZE = 4 * 1024;
private static final int FORMAT_WRITE = 1;
private static final int FORMAT_READ = 1;
/**
* Used to mark a chunk as free, when it was detected that live bookkeeping
* is incorrect.
*/
private static final int MARKED_FREE = 10_000_000;
/**
* The background thread, if any.
*/
volatile BackgroundWriterThread backgroundWriterThread;
private volatile boolean reuseSpace = true;
private volatile boolean closed;
private final FileStore fileStore;
private final boolean fileStoreIsProvided;
private final int pageSplitSize;
/**
* The page cache. The default size is 16 MB, and the average size is 2 KB.
* It is split in 16 segments. The stack move distance is 2% of the expected
* number of entries.
*/
private final CacheLongKeyLIRS<Page> cache;
/**
* The page chunk references cache. The default size is 4 MB, and the
* average size is 2 KB. It is split in 16 segments. The stack move distance
* is 2% of the expected number of entries.
*/
private final CacheLongKeyLIRS<PageChildren> cacheChunkRef;
/**
* The newest chunk. If nothing was stored yet, this field is not set.
*/
private Chunk lastChunk;
/**
* The map of chunks.
*/
private final ConcurrentHashMap<Integer, Chunk> chunks =
new ConcurrentHashMap<>();
/**
* The map of temporarily freed storage space caused by freed pages. The key
is the unsaved version, the value is the map of chunks. The maps contain
* the number of freed entries per chunk.
*
* Access is partially synchronized, hence the need for concurrent maps.
* Sometimes we hold the MVStore lock, sometimes the MVMap lock, and sometimes
* we even sync on the ConcurrentHashMap object.
*/
private final ConcurrentHashMap<Long, ConcurrentHashMap<Integer, Chunk>> freedPageSpace =
new ConcurrentHashMap<>();
/**
* The metadata map. Write access to this map needs to be synchronized on
* the store.
*/
private final MVMap<String, String> meta;
private final ConcurrentHashMap<Integer, MVMap<?, ?>> maps =
new ConcurrentHashMap<>();
private final HashMap<String, Object> storeHeader = new HashMap<>();
private WriteBuffer writeBuffer;
private int lastMapId;
private int versionsToKeep = 5;
/**
* The compression level for new pages (0 for disabled, 1 for fast, 2 for
* high). Even if disabled, the store may contain (old) compressed pages.
*/
private final int compressionLevel;
private Compressor compressorFast;
private Compressor compressorHigh;
private final UncaughtExceptionHandler backgroundExceptionHandler;
private volatile long currentVersion;
/**
* The version of the last stored chunk, or -1 if nothing was stored so far.
*/
private long lastStoredVersion;
/**
* The estimated memory used by unsaved pages. This number is not accurate,
* because it may be changed concurrently, and because temporary pages are
* also counted.
*/
private int unsavedMemory;
private final int autoCommitMemory;
private boolean saveNeeded;
/**
* The time the store was created, in milliseconds since 1970.
*/
private long creationTime;
/**
* How long to retain old, persisted chunks, in milliseconds. If the value is
* larger than or equal to zero, a chunk is never directly overwritten if it is
* unused; instead, its unused field is set. If the value is smaller than zero,
* chunks are directly overwritten if unused.
*/
private int retentionTime;
private long lastCommitTime;
/**
* The earliest chunk to retain, if any.
*/
private Chunk retainChunk;
/**
* The version of the current store operation (if any).
*/
private volatile long currentStoreVersion = -1;
private Thread currentStoreThread;
private volatile boolean metaChanged;
/**
* The delay in milliseconds to automatically commit and write changes.
*/
private int autoCommitDelay;
private final int autoCompactFillRate;
private long autoCompactLastFileOpCount;
private final Object compactSync = new Object();
private IllegalStateException panicException;
private long lastTimeAbsolute;
private long lastFreeUnusedChunks;
/**
* Create and open the store.
*
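* A configuration map as consumed below might look as follows (an
* illustrative sketch; the keys are the ones read in this constructor, the
* values are example choices):
* <pre>{@code
* Map<String, Object> config = new HashMap<>();
* config.put("fileName", "data.mv");   // file to open (omit for in-memory)
* config.put("cacheSize", 16);         // read cache size in MB
* config.put("compress", 1);           // 0 = disabled, 1 = fast, 2 = high
* config.put("autoCommitDelay", 1000); // auto-commit delay in milliseconds
* }</pre>
*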
* @param config the configuration to use
* @throws IllegalStateException if the file is corrupt, or an exception
* occurred while opening
* @throws IllegalArgumentException if the directory does not exist
*/
MVStore(Map<String, Object> config) {
this.compressionLevel = DataUtils.getConfigParam(config, "compress", 0);
String fileName = (String) config.get("fileName");
FileStore fileStore = (FileStore) config.get("fileStore");
fileStoreIsProvided = fileStore != null;
if(fileStore == null && fileName != null) {
fileStore = new FileStore();
}
this.fileStore = fileStore;
int pgSplitSize = 48; // for "mem:" case it is # of keys
CacheLongKeyLIRS.Config cc = null;
if (this.fileStore != null) {
int mb = DataUtils.getConfigParam(config, "cacheSize", 16);
if (mb > 0) {
cc = new CacheLongKeyLIRS.Config();
cc.maxMemory = mb * 1024L * 1024L;
Object o = config.get("cacheConcurrency");
if (o != null) {
cc.segmentCount = (Integer)o;
}
}
pgSplitSize = 16 * 1024;
}
if (cc != null) {
cache = new CacheLongKeyLIRS<>(cc);
cc.maxMemory /= 4;
cacheChunkRef = new CacheLongKeyLIRS<>(cc);
} else {
cache = null;
cacheChunkRef = null;
}
pgSplitSize = DataUtils.getConfigParam(config, "pageSplitSize", pgSplitSize);
// Make sure pages will fit into cache
if (cache != null && pgSplitSize > cache.getMaxItemSize()) {
pgSplitSize = (int)cache.getMaxItemSize();
}
pageSplitSize = pgSplitSize;
backgroundExceptionHandler =
(UncaughtExceptionHandler)config.get("backgroundExceptionHandler");
meta = new MVMap<>(StringDataType.INSTANCE,
StringDataType.INSTANCE);
meta.init(this, 0, currentVersion);
if (this.fileStore != null) {
retentionTime = this.fileStore.getDefaultRetentionTime();
int kb = DataUtils.getConfigParam(config, "autoCommitBufferSize", 1024);
// 19 KB memory is about 1 KB storage
autoCommitMemory = kb * 1024 * 19;
autoCompactFillRate = DataUtils.getConfigParam(config, "autoCompactFillRate", 40);
char[] encryptionKey = (char[]) config.get("encryptionKey");
try {
if (!fileStoreIsProvided) {
boolean readOnly = config.containsKey("readOnly");
this.fileStore.open(fileName, readOnly, encryptionKey);
}
if (this.fileStore.size() == 0) {
creationTime = getTimeAbsolute();
lastCommitTime = creationTime;
storeHeader.put("H", 2);
storeHeader.put("blockSize", BLOCK_SIZE);
storeHeader.put("format", FORMAT_WRITE);
storeHeader.put("created", creationTime);
writeStoreHeader();
} else {
readStoreHeader();
}
} catch (IllegalStateException e) {
panic(e);
} finally {
if (encryptionKey != null) {
Arrays.fill(encryptionKey, (char) 0);
}
}
lastCommitTime = getTimeSinceCreation();
// setAutoCommitDelay starts the thread, but only if
// the parameter is different from the old value
int delay = DataUtils.getConfigParam(config, "autoCommitDelay", 1000);
setAutoCommitDelay(delay);
} else {
autoCommitMemory = 0;
autoCompactFillRate = 0;
}
}
private void panic(IllegalStateException e) {
handleException(e);
panicException = e;
closeImmediately();
throw e;
}
/**
* Open a store in exclusive mode. For a file-based store, the parent
* directory must already exist.
*
* @param fileName the file name (null for in-memory)
* @return the store
*/
public static MVStore open(String fileName) {
HashMap<String, Object> config = new HashMap<>();
config.put("fileName", fileName);
return new MVStore(config);
}
/**
* Open an old, stored version of a map.
*
* @param version the version
* @param mapId the map id
* @param template the template map
* @return the read-only map
*/
@SuppressWarnings("unchecked")
<T extends MVMap<?, ?>> T openMapVersion(long version, int mapId,
MVMap<?, ?> template) {
MVMap<String, String> oldMeta = getMetaMap(version);
long rootPos = getRootPos(oldMeta, mapId);
MVMap<?, ?> m = template.openReadOnly();
m.setRootPos(rootPos, version);
return (T) m;
}
/**
* Open a map with the default settings. The map is automatically created if
* it does not yet exist. If a map with this name is already open, this map
* is returned.
*
* @param <K> the key type
* @param <V> the value type
* @param name the name of the map
* @return the map
*/
public <K, V> MVMap<K, V> openMap(String name) {
return openMap(name, new MVMap.Builder<K, V>());
}
/**
* Open a map with the given builder. The map is automatically created if it
* does not yet exist. If a map with this name is already open, this map is
* returned.
*
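* An illustrative sketch (map name and key / value types are examples):
* <pre>{@code
* MVMap<String, String> settings = store.openMap("settings",
*         new MVMap.Builder<String, String>());
* }</pre>
*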
* @param <K> the key type
* @param <V> the value type
* @param name the name of the map
* @param builder the map builder
* @return the map
*/
public synchronized <M extends MVMap<K, V>, K, V> M openMap(
String name, MVMap.MapBuilder<M, K, V> builder) {
checkOpen();
String x = meta.get("name." + name);
int id;
long root;
M map;
if (x != null) {
id = DataUtils.parseHexInt(x);
@SuppressWarnings("unchecked")
M old = (M) maps.get(id);
if (old != null) {
return old;
}
map = builder.create();
String config = meta.get(MVMap.getMapKey(id));
String v = DataUtils.getFromMap(config, "createVersion");
map.init(this, id, v != null ? DataUtils.parseHexLong(v): 0);
root = getRootPos(meta, id);
} else {
id = ++lastMapId;
map = builder.create();
map.init(this, id, currentVersion);
markMetaChanged();
x = Integer.toHexString(id);
meta.put(MVMap.getMapKey(id), map.asString(name));
meta.put("name." + name, x);
root = 0;
}
map.setRootPos(root, -1);
maps.put(id, map);
return map;
}
/**
* Get the set of all map names.
*
* @return the set of names
*/
public synchronized Set<String> getMapNames() {
HashSet<String> set = new HashSet<>();
checkOpen();
for (Iterator<String> it = meta.keyIterator("name."); it.hasNext();) {
String x = it.next();
if (!x.startsWith("name.")) {
break;
}
set.add(x.substring("name.".length()));
}
return set;
}
/**
* Get the metadata map. This data is for informational purposes only. The
* data is subject to change in future versions.
*
* The data in this map should not be modified (changing system data may
* corrupt the store). If modifications are needed, they need to be
* synchronized on the store.
*
* The metadata map contains the following entries:
* <pre>
* chunk.{chunkId} = {chunk metadata}
* name.{name} = {mapId}
* map.{mapId} = {map metadata}
* root.{mapId} = {root position}
* setting.storeVersion = {version}
* </pre>
*
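* For example, the root position of the map with id 1 could be read as
* follows (an illustrative sketch; map ids in the keys are hexadecimal):
* <pre>{@code
* String rootPos = store.getMetaMap().get("root.1");
* }</pre>
*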
* @return the metadata map
*/
public MVMap<String, String> getMetaMap() {
checkOpen();
return meta;
}
private MVMap<String, String> getMetaMap(long version) {
Chunk c = getChunkForVersion(version);
DataUtils.checkArgument(c != null, "Unknown version {0}", version);
c = readChunkHeader(c.block);
MVMap<String, String> oldMeta = meta.openReadOnly();
oldMeta.setRootPos(c.metaRootPos, version);
return oldMeta;
}
private Chunk getChunkForVersion(long version) {
Chunk newest = null;
for (Chunk c : chunks.values()) {
if (c.version <= version) {
if (newest == null || c.id > newest.id) {
newest = c;
}
}
}
return newest;
}
/**
* Check whether a given map exists.
*
* @param name the map name
* @return true if it exists
*/
public boolean hasMap(String name) {
return meta.containsKey("name." + name);
}
private void markMetaChanged() {
// changes in the metadata alone are usually not detected, as the meta
// map is changed after storing
metaChanged = true;
}
private synchronized void readStoreHeader() {
Chunk newest = null;
boolean validStoreHeader = false;
// find out which chunk and version are the newest
// read the first two blocks
ByteBuffer fileHeaderBlocks = fileStore.readFully(0, 2 * BLOCK_SIZE);
byte[] buff = new byte[BLOCK_SIZE];
for (int i = 0; i <= BLOCK_SIZE; i += BLOCK_SIZE) {
fileHeaderBlocks.get(buff);
// the following can fail for various reasons
try {
HashMap<String, String> m = DataUtils.parseChecksummedMap(buff);
if (m == null) {
continue;
}
int blockSize = DataUtils.readHexInt(
m, "blockSize", BLOCK_SIZE);
if (blockSize != BLOCK_SIZE) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_UNSUPPORTED_FORMAT,
"Block size {0} is currently not supported",
blockSize);
}
long version = DataUtils.readHexLong(m, "version", 0);
if (newest == null || version > newest.version) {
validStoreHeader = true;
storeHeader.putAll(m);
creationTime = DataUtils.readHexLong(m, "created", 0);
int chunkId = DataUtils.readHexInt(m, "chunk", 0);
long block = DataUtils.readHexLong(m, "block", 0);
Chunk test = readChunkHeaderAndFooter(block);
if (test != null && test.id == chunkId) {
newest = test;
}
}
} catch (Exception e) {
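// ignore a corrupt copy of the header and try the next block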
}
}
if (!validStoreHeader) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Store header is corrupt: {0}", fileStore);
}
long format = DataUtils.readHexLong(storeHeader, "format", 1);
if (format > FORMAT_WRITE && !fileStore.isReadOnly()) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_UNSUPPORTED_FORMAT,
"The write format {0} is larger " +
"than the supported format {1}, " +
"and the file was not opened in read-only mode",
format, FORMAT_WRITE);
}
format = DataUtils.readHexLong(storeHeader, "formatRead", format);
if (format > FORMAT_READ) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_UNSUPPORTED_FORMAT,
"The read format {0} is larger " +
"than the supported format {1}",
format, FORMAT_READ);
}
lastStoredVersion = -1;
chunks.clear();
long now = System.currentTimeMillis();
// calculate the year (doesn't have to be exact;
// we assume 365.25 days per year, * 4 = 1461)
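// (ms per year = 1000 * 60 * 60 * 24 * 365.25 = 1000 * 60 * 60 * 6 * 1461)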
int year = 1970 + (int) (now / (1000L * 60 * 60 * 6 * 1461));
if (year < 2014) {
// if the year is before 2014,
// we assume the system doesn't have a real-time clock,
// and we set the creationTime to the past, so that
// existing chunks are overwritten
creationTime = now - fileStore.getDefaultRetentionTime();
} else if (now < creationTime) {
// the system time was set to the past:
// we change the creation time
creationTime = now;
storeHeader.put("created", creationTime);
}
Chunk test = readChunkFooter(fileStore.size());
if (test != null) {
test = readChunkHeaderAndFooter(test.block);
if (test != null) {
if (newest == null || test.version > newest.version) {
newest = test;
}
}
}
if (newest == null) {
// no chunk
return;
}
// read the chunk header and footer,
// and follow the chain of next chunks
while (true) {
if (newest.next == 0 ||
newest.next >= fileStore.size() / BLOCK_SIZE) {
// no (valid) next
break;
}
test = readChunkHeaderAndFooter(newest.next);
if (test == null || test.id <= newest.id) {
break;
}
newest = test;
}
setLastChunk(newest);
loadChunkMeta();
// read all chunk headers and footers within the retention time,
// to detect unwritten data after a power failure
verifyLastChunks();
// build the free space list
for (Chunk c : chunks.values()) {
if (c.pageCountLive == 0) {
// remove this chunk in the next save operation
registerFreePage(currentVersion, c.id, 0, 0);
}
long start = c.block * BLOCK_SIZE;
int length = c.len * BLOCK_SIZE;
fileStore.markUsed(start, length);
}
}
private void loadChunkMeta() {
// load the chunk metadata: we can load in any order,
// because loading chunk metadata might recursively load another chunk
for (Iterator<String> it = meta.keyIterator("chunk."); it.hasNext();) {
String s = it.next();
if (!s.startsWith("chunk.")) {
break;
}
s = meta.get(s);
Chunk c = Chunk.fromString(s);
if (chunks.putIfAbsent(c.id, c) == null) {
if (c.block == Long.MAX_VALUE) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Chunk {0} is invalid", c.id);
}
}
}
}
private void setLastChunk(Chunk last) {
lastChunk = last;
if (last == null) {
// no valid chunk
lastMapId = 0;
currentVersion = 0;
meta.setRootPos(0, -1);
} else {
lastMapId = last.mapId;
currentVersion = last.version;
chunks.put(last.id, last);
meta.setRootPos(last.metaRootPos, -1);
}
setWriteVersion(currentVersion);
}
private void verifyLastChunks() {
long time = getTimeSinceCreation();
ArrayList<Integer> ids = new ArrayList<>(chunks.keySet());
Collections.sort(ids);
int newestValidChunk = -1;
Chunk old = null;
for (Integer chunkId : ids) {
Chunk c = chunks.get(chunkId);
if (old != null && c.time < old.time) {
// old chunk (maybe leftover from a previous crash)
break;
}
old = c;
if (c.time + retentionTime < time) {
// old chunk, no need to verify
newestValidChunk = c.id;
continue;
}
Chunk test = readChunkHeaderAndFooter(c.block);
if (test == null || test.id != c.id) {
break;
}
newestValidChunk = chunkId;
}
Chunk newest = chunks.get(newestValidChunk);
if (newest != lastChunk) {
// to avoid re-using newer chunks later on, we could clear
// the headers and footers of those, but we might not know about all
// of them, so that could be incomplete - but we check that newer
// chunks are written after older chunks, so we are safe
rollbackTo(newest == null ? 0 : newest.version);
}
}
/**
* Read a chunk header and footer, and verify the stored data is consistent.
*
* @param block the block
* @return the chunk, or null if the header or footer don't match or are not
* consistent
*/
private Chunk readChunkHeaderAndFooter(long block) {
Chunk header;
try {
header = readChunkHeader(block);
} catch (Exception e) {
// invalid chunk header: ignore, but stop
return null;
}
if (header == null) {
return null;
}
Chunk footer = readChunkFooter((block + header.len) * BLOCK_SIZE);
if (footer == null || footer.id != header.id) {
return null;
}
return header;
}
/**
* Try to read a chunk footer.
*
* @param end the end of the chunk
* @return the chunk, or null if not successful
*/
private Chunk readChunkFooter(long end) {
// the following can fail for various reasons
try {
// read the chunk footer of the last block of the file
ByteBuffer lastBlock = fileStore.readFully(
end - Chunk.FOOTER_LENGTH, Chunk.FOOTER_LENGTH);
byte[] buff = new byte[Chunk.FOOTER_LENGTH];
lastBlock.get(buff);
HashMap<String, String> m = DataUtils.parseChecksummedMap(buff);
if (m != null) {
int chunk = DataUtils.readHexInt(m, "chunk", 0);
Chunk c = new Chunk(chunk);
c.version = DataUtils.readHexLong(m, "version", 0);
c.block = DataUtils.readHexLong(m, "block", 0);
return c;
}
} catch (Exception e) {
// ignore
}
return null;
}
private void writeStoreHeader() {
StringBuilder buff = new StringBuilder(112);
if (lastChunk != null) {
storeHeader.put("block", lastChunk.block);
storeHeader.put("chunk", lastChunk.id);
storeHeader.put("version", lastChunk.version);
}
DataUtils.appendMap(buff, storeHeader);
byte[] bytes = buff.toString().getBytes(StandardCharsets.ISO_8859_1);
int checksum = DataUtils.getFletcher32(bytes, 0, bytes.length);
DataUtils.appendMap(buff, "fletcher", checksum);
buff.append('\n');
bytes = buff.toString().getBytes(StandardCharsets.ISO_8859_1);
ByteBuffer header = ByteBuffer.allocate(2 * BLOCK_SIZE);
header.put(bytes);
header.position(BLOCK_SIZE);
header.put(bytes);
header.rewind();
write(0, header);
}
private void write(long pos, ByteBuffer buffer) {
try {
fileStore.writeFully(pos, buffer);
} catch (IllegalStateException e) {
panic(e);
throw e;
}
}
/**
* Close the file and the store. Unsaved changes are written to disk first.
*/
public void close() {
if (closed) {
return;
}
FileStore f = fileStore;
if (f != null && !f.isReadOnly()) {
stopBackgroundThread();
if (hasUnsavedChanges()) {
commitAndSave();
}
}
closeStore(true);
}
/**
* Close the file and the store, without writing anything. This will stop
* the background thread. This method ignores all errors.
*/
public void closeImmediately() {
try {
closeStore(false);
} catch (Throwable e) {
handleException(e);
}
}
private void closeStore(boolean shrinkIfPossible) {
if (closed) {
return;
}
// can not synchronize on this yet, because
// the background thread also synchronizes on this, which
// could result in a deadlock
stopBackgroundThread();
closed = true;
synchronized (this) {
if (fileStore != null && shrinkIfPossible) {
shrinkFileIfPossible(0);
}
// release memory early - this is important when called
// because of out of memory
if (cache != null) {
cache.clear();
}
if (cacheChunkRef != null) {
cacheChunkRef.clear();
}
for (MVMap<?, ?> m : new ArrayList<>(maps.values())) {
m.close();
}
chunks.clear();
maps.clear();
if (fileStore != null && !fileStoreIsProvided) {
fileStore.close();
}
}
}
/**
* Get the chunk for the given position.
*
* @param pos the position
* @return the chunk
*/
private Chunk getChunk(long pos) {
Chunk c = getChunkIfFound(pos);
if (c == null) {
int chunkId = DataUtils.getPageChunkId(pos);
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Chunk {0} not found", chunkId);
}
return c;
}
private Chunk getChunkIfFound(long pos) {
int chunkId = DataUtils.getPageChunkId(pos);
Chunk c = chunks.get(chunkId);
if (c == null) {
checkOpen();
if (!Thread.holdsLock(this)) {
// it could also be unsynchronized metadata
// access (if synchronization on this was forgotten)
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_CHUNK_NOT_FOUND,
"Chunk {0} no longer exists",
chunkId);
}
String s = meta.get(Chunk.getMetaKey(chunkId));
if (s == null) {
return null;
}
c = Chunk.fromString(s);
if (c.block == Long.MAX_VALUE) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Chunk {0} is invalid", chunkId);
}
chunks.put(c.id, c);
}
return c;
}
private void setWriteVersion(long version) {
for (MVMap<?, ?> map : maps.values()) {
map.setWriteVersion(version);
}
MVMap<String, String> m = meta;
if (m == null) {
checkOpen();
}
m.setWriteVersion(version);
}
/**
* Commit the changes.
*
* For in-memory stores, this method increments the version.
*
* For persistent stores, it also writes changes to disk. It does nothing if
* there are no unsaved changes, and returns the old version. It is not
* necessary to call this method when auto-commit is enabled (the default
* setting), as in this case it is automatically called from time to time or
* when enough changes have accumulated. However, it may still be called to
* flush all changes to disk.
*
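* An illustrative sketch (assuming auto-commit was disabled with
* setAutoCommitDelay(0) and "map" is an open MVMap):
* <pre>{@code
* map.put(1, "Hello");
* long newVersion = store.commit();
* }</pre>
*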
* @return the new version
*/
public synchronized long commit() {
if (fileStore != null) {
return commitAndSave();
}
long v = ++currentVersion;
setWriteVersion(v);
return v;
}
/**
* Commit all changes and persist them to disk. This method does nothing if
* there are no unsaved changes, otherwise it increments the current version
* and stores the data (for file based stores).
*
* At most one store operation may run at any time.
*
* @return the new version (incremented if there were changes)
*/
private synchronized long commitAndSave() {
if (closed) {
return currentVersion;
}
if (fileStore == null) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_WRITING_FAILED,
"This is an in-memory store");
}
if (currentStoreVersion >= 0) {
// store is possibly called within store, if the meta map changed
return currentVersion;
}
if (!hasUnsavedChanges()) {
return currentVersion;
}
if (fileStore.isReadOnly()) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_WRITING_FAILED, "This store is read-only");
}
try {
currentStoreVersion = currentVersion;
currentStoreThread = Thread.currentThread();
return storeNow();
} finally {
// in any case reset the current store version,
// to allow closing the store
currentStoreVersion = -1;
currentStoreThread = null;
}
}
private long storeNow() {
try {
return storeNowTry();
} catch (IllegalStateException e) {
panic(e);
return -1;
}
}
private long storeNowTry() {
long time = getTimeSinceCreation();
freeUnusedIfNeeded(time);
int currentUnsavedPageCount = unsavedMemory;
long storeVersion = currentStoreVersion;
long version = ++currentVersion;
lastCommitTime = time;
retainChunk = null;
// the metadata of the last chunk was not stored so far, and needs to be
// set now (it's better not to update right after storing, because that
// would modify the meta map again)
int lastChunkId;
if (lastChunk == null) {
lastChunkId = 0;
} else {
lastChunkId = lastChunk.id;
meta.put(Chunk.getMetaKey(lastChunkId), lastChunk.asString());
// never go backward in time
time = Math.max(lastChunk.time, time);
}
int newChunkId = lastChunkId;
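// find a chunk id that is not in use yet (ids wrap around at Chunk.MAX_ID)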
while (true) {
newChunkId = (newChunkId + 1) % Chunk.MAX_ID;
Chunk old = chunks.get(newChunkId);
if (old == null) {
break;
}
if (old.block == Long.MAX_VALUE) {
IllegalStateException e = DataUtils.newIllegalStateException(
DataUtils.ERROR_INTERNAL,
"Last block not stored, possibly due to out-of-memory");
panic(e);
}
}
Chunk c = new Chunk(newChunkId);
c.pageCount = Integer.MAX_VALUE;
c.pageCountLive = Integer.MAX_VALUE;
c.maxLen = Long.MAX_VALUE;
c.maxLenLive = Long.MAX_VALUE;
c.metaRootPos = Long.MAX_VALUE;
c.block = Long.MAX_VALUE;
c.len = Integer.MAX_VALUE;
c.time = time;
c.version = version;
c.mapId = lastMapId;
c.next = Long.MAX_VALUE;
chunks.put(c.id, c);
// force a metadata update
meta.put(Chunk.getMetaKey(c.id), c.asString());
meta.remove(Chunk.getMetaKey(c.id));
ArrayList<MVMap<?, ?>> list = new ArrayList<>(maps.values());
ArrayList<MVMap<?, ?>> changed = New.arrayList();
for (MVMap<?, ?> m : list) {
m.setWriteVersion(version);
long v = m.getVersion();
if (m.getCreateVersion() > storeVersion) {
// the map was created after storing started
continue;
}
if (m.isVolatile()) {
continue;
}
if (v >= 0 && v >= lastStoredVersion) {
MVMap<?, ?> r = m.openVersion(storeVersion);
if (r.getRoot().getPos() == 0) {
changed.add(r);
}
}
}
applyFreedSpace(storeVersion);
WriteBuffer buff = getWriteBuffer();
// need to patch the header later
c.writeChunkHeader(buff, 0);
int headerLength = buff.position();
c.pageCount = 0;
c.pageCountLive = 0;
c.maxLen = 0;
c.maxLenLive = 0;
for (MVMap<?, ?> m : changed) {
Page p = m.getRoot();
String key = MVMap.getMapRootKey(m.getId());
if (p.getTotalCount() == 0) {
meta.put(key, "0");
} else {
p.writeUnsavedRecursive(c, buff);
long root = p.getPos();
meta.put(key, Long.toHexString(root));
}
}
meta.setWriteVersion(version);
Page metaRoot = meta.getRoot();
metaRoot.writeUnsavedRecursive(c, buff);
int chunkLength = buff.position();
// add the chunk footer and round to the next block
int length = MathUtils.roundUpInt(chunkLength +
Chunk.FOOTER_LENGTH, BLOCK_SIZE);
buff.limit(length);
// the length of the file that is still in use
// (not necessarily the end of the file)
long end = getFileLengthInUse();
long filePos;
if (reuseSpace) {
filePos = fileStore.allocate(length);
} else {
filePos = end;
}
// end is not necessarily the end of the file
boolean storeAtEndOfFile = filePos + length >= fileStore.size();
if (!reuseSpace) {
// we can not mark it earlier, because it
// might have been allocated by one of the
// removed chunks
fileStore.markUsed(end, length);
}
c.block = filePos / BLOCK_SIZE;
c.len = length / BLOCK_SIZE;
c.metaRootPos = metaRoot.getPos();
// calculate and set the likely next position
if (reuseSpace) {
int predictBlocks = c.len;
long predictedNextStart = fileStore.allocate(
predictBlocks * BLOCK_SIZE);
fileStore.free(predictedNextStart, predictBlocks * BLOCK_SIZE);
c.next = predictedNextStart / BLOCK_SIZE;
} else {
// just after this chunk
c.next = 0;
}
buff.position(0);
c.writeChunkHeader(buff, headerLength);
revertTemp(storeVersion);
buff.position(buff.limit() - Chunk.FOOTER_LENGTH);
buff.put(c.getFooterBytes());
buff.position(0);
write(filePos, buff.getBuffer());
releaseWriteBuffer(buff);
// whether we need to write the store header
boolean writeStoreHeader = false;
if (!storeAtEndOfFile) {
if (lastChunk == null) {
writeStoreHeader = true;
} else if (lastChunk.next != c.block) {
// the last prediction did not match
writeStoreHeader = true;
} else {
long headerVersion = DataUtils.readHexLong(
storeHeader, "version", 0);
if (lastChunk.version - headerVersion > 20) {
// we write after at least 20 entries
writeStoreHeader = true;
} else {
int chunkId = DataUtils.readHexInt(storeHeader, "chunk", 0);
while (true) {
Chunk old = chunks.get(chunkId);
if (old == null) {
// one of the chunks in between
// was removed
writeStoreHeader = true;
break;
}
if (chunkId == lastChunk.id) {
break;
}
chunkId++;
}
}
}
}
lastChunk = c;
if (writeStoreHeader) {
writeStoreHeader();
}
if (!storeAtEndOfFile) {
// may only shrink after the store header was written
shrinkFileIfPossible(1);
}
for (MVMap<?, ?> m : changed) {
Page p = m.getRoot();
if (p.getTotalCount() > 0) {
p.writeEnd();
}
}
metaRoot.writeEnd();
// some pages might have been changed in the meantime (in the newest
// version)
unsavedMemory = Math.max(0, unsavedMemory
- currentUnsavedPageCount);
metaChanged = false;
lastStoredVersion = storeVersion;
return version;
}
/**
* Try to free unused chunks. This method doesn't directly write, but can
* change the metadata, and therefore cause a background write.
*/
private void freeUnusedIfNeeded(long time) {
int freeDelay = retentionTime / 5;
if (time >= lastFreeUnusedChunks + freeDelay) {
// set early in case it fails (out of memory or so)
lastFreeUnusedChunks = time;
freeUnusedChunks();
// set it here as well, to avoid calling it often if it was slow
lastFreeUnusedChunks = getTimeSinceCreation();
}
}
private synchronized void freeUnusedChunks() {
if (lastChunk == null || !reuseSpace) {
return;
}
Set<Integer> referenced = collectReferencedChunks();
long time = getTimeSinceCreation();
for (Iterator<Chunk> it = chunks.values().iterator(); it.hasNext(); ) {
Chunk c = it.next();
if (!referenced.contains(c.id)) {
if (canOverwriteChunk(c, time)) {
it.remove();
markMetaChanged();
meta.remove(Chunk.getMetaKey(c.id));
long start = c.block * BLOCK_SIZE;
int length = c.len * BLOCK_SIZE;
fileStore.free(start, length);
} else {
if (c.unused == 0) {
c.unused = time;
meta.put(Chunk.getMetaKey(c.id), c.asString());
markMetaChanged();
}
}
}
}
}
private Set<Integer> collectReferencedChunks() {
long testVersion = lastChunk.version;
DataUtils.checkArgument(testVersion > 0, "Collect references on version 0");
long readCount = getFileStore().readCount.get();
Set<Integer> referenced = new HashSet<>();
for (Cursor<String, String> c = meta.cursor("root."); c.hasNext();) {
String key = c.next();
if (!key.startsWith("root.")) {
break;
}
long pos = DataUtils.parseHexLong(c.getValue());
if (pos == 0) {
continue;
}
int mapId = DataUtils.parseHexInt(key.substring("root.".length()));
collectReferencedChunks(referenced, mapId, pos, 0);
}
long pos = lastChunk.metaRootPos;
collectReferencedChunks(referenced, 0, pos, 0);
readCount = fileStore.readCount.get() - readCount;
return referenced;
}
private void collectReferencedChunks(Set<Integer> targetChunkSet,
int mapId, long pos, int level) {
int c = DataUtils.getPageChunkId(pos);
targetChunkSet.add(c);
if (DataUtils.getPageType(pos) == DataUtils.PAGE_TYPE_LEAF) {
return;
}
PageChildren refs = readPageChunkReferences(mapId, pos, -1);
if (!refs.chunkList) {
Set<Integer> target = new HashSet<>();
for (int i = 0; i < refs.children.length; i++) {
long p = refs.children[i];
collectReferencedChunks(target, mapId, p, level + 1);
}
// we don't need a reference to this chunk
target.remove(c);
long[] children = new long[target.size()];
int i = 0;
for (Integer p : target) {
children[i++] = DataUtils.getPagePos(p, 0, 0,
DataUtils.PAGE_TYPE_LEAF);
}
refs.children = children;
refs.chunkList = true;
if (cacheChunkRef != null) {
cacheChunkRef.put(refs.pos, refs, refs.getMemory());
}
}
for (long p : refs.children) {
targetChunkSet.add(DataUtils.getPageChunkId(p));
}
}
private PageChildren readPageChunkReferences(int mapId, long pos, int parentChunk) {
if (DataUtils.getPageType(pos) == DataUtils.PAGE_TYPE_LEAF) {
return null;
}
PageChildren r;
if (cacheChunkRef != null) {
r = cacheChunkRef.get(pos);
} else {
r = null;
}
if (r == null) {
// if possible, create it from the cached page
if (cache != null) {
Page p = cache.get(pos);
if (p != null) {
r = new PageChildren(p);
}
}
if (r == null) {
// page was not cached: read the data
Chunk c = getChunk(pos);
long filePos = c.block * BLOCK_SIZE;
filePos += DataUtils.getPageOffset(pos);
if (filePos < 0) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Negative position {0}; p={1}, c={2}", filePos, pos, c.toString());
}
long maxPos = (c.block + c.len) * BLOCK_SIZE;
r = PageChildren.read(fileStore, pos, mapId, filePos, maxPos);
}
r.removeDuplicateChunkReferences();
if (cacheChunkRef != null) {
cacheChunkRef.put(pos, r, r.getMemory());
}
}
if (r.children.length == 0) {
int chunk = DataUtils.getPageChunkId(pos);
if (chunk == parentChunk) {
return null;
}
}
return r;
}
/**
* Get a buffer for writing. The caller must synchronize on the store
* before calling the method and until after using the buffer.
*
* @return the buffer
*/
private WriteBuffer getWriteBuffer() {
WriteBuffer buff;
if (writeBuffer != null) {
buff = writeBuffer;
buff.clear();
} else {
buff = new WriteBuffer();
}
return buff;
}
/**
* Release a buffer for writing. The caller must synchronize on the store
* before calling the method and until after using the buffer.
*
* @param buff the buffer that can be re-used
*/
private void releaseWriteBuffer(WriteBuffer buff) {
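// keep the buffer for re-use only if it is not larger than 4 MB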
if (buff.capacity() <= 4 * 1024 * 1024) {
writeBuffer = buff;
}
}
private boolean canOverwriteChunk(Chunk c, long time) {
if (retentionTime >= 0) {
if (c.time + retentionTime > time) {
return false;
}
if (c.unused == 0 || c.unused + retentionTime / 2 > time) {
return false;
}
}
Chunk r = retainChunk;
if (r != null && c.version > r.version) {
return false;
}
return true;
}
private long getTimeSinceCreation() {
return Math.max(0, getTimeAbsolute() - creationTime);
}
private long getTimeAbsolute() {
long now = System.currentTimeMillis();
if (lastTimeAbsolute != 0 && now < lastTimeAbsolute) {
// time seems to have run backwards - this can happen
// when the system time is adjusted, for example
// on a leap second
now = lastTimeAbsolute;
} else {
lastTimeAbsolute = now;
}
return now;
}
/**
* Apply the freed space to the chunk metadata. The metadata is updated, but
* completely free chunks are not removed from the set of chunks, and the
* disk space is not yet marked as free.
*
* @param storeVersion apply up to the given version
*/
private void applyFreedSpace(long storeVersion) {
while (true) {
ArrayList<Chunk> modified = New.arrayList();
Iterator<Entry<Long, ConcurrentHashMap<Integer, Chunk>>> it;
it = freedPageSpace.entrySet().iterator();
while (it.hasNext()) {
Entry<Long, ConcurrentHashMap<Integer, Chunk>> e = it.next();
long v = e.getKey();
if (v > storeVersion) {
continue;
}
ConcurrentHashMap<Integer, Chunk> freed = e.getValue();
for (Chunk f : freed.values()) {
Chunk c = chunks.get(f.id);
if (c == null) {
// already removed
continue;
}
// no need to synchronize, as old entries
// are not concurrently modified
c.maxLenLive += f.maxLenLive;
c.pageCountLive += f.pageCountLive;
if (c.pageCountLive < 0 && c.pageCountLive > -MARKED_FREE) {
// can happen after a rollback
c.pageCountLive = 0;
}
if (c.maxLenLive < 0 && c.maxLenLive > -MARKED_FREE) {
// can happen after a rollback
c.maxLenLive = 0;
}
modified.add(c);
}
it.remove();
}
for (Chunk c : modified) {
meta.put(Chunk.getMetaKey(c.id), c.asString());
}
if (modified.isEmpty()) {
break;
}
}
}
/**
* Shrink the file if possible, and if at least a given percentage can be
* saved.
*
* @param minPercent the minimum percentage to save
*/
private void shrinkFileIfPossible(int minPercent) {
if (fileStore.isReadOnly()) {
return;
}
long end = getFileLengthInUse();
long fileSize = fileStore.size();
if (end >= fileSize) {
return;
}
if (minPercent > 0 && fileSize - end < BLOCK_SIZE) {
return;
}
int savedPercent = (int) (100 - (end * 100 / fileSize));
if (savedPercent < minPercent) {
return;
}
if (!closed) {
sync();
}
fileStore.truncate(end);
}
/**
* Get the position right after the last used byte.
*
* @return the position
*/
private long getFileLengthInUse() {
long result = fileStore.getFileLengthInUse();
assert result == measureFileLengthInUse() : result + " != " + measureFileLengthInUse();
return result;
}
private long measureFileLengthInUse() {
long size = 2;
for (Chunk c : chunks.values()) {
if (c.len != Integer.MAX_VALUE) {
size = Math.max(size, c.block + c.len);
}
}
return size * BLOCK_SIZE;
}
/**
* Check whether there are any unsaved changes.
*
* @return if there are any changes
*/
public boolean hasUnsavedChanges() {
checkOpen();
if (metaChanged) {
return true;
}
for (MVMap<?, ?> m : maps.values()) {
if (!m.isClosed()) {
long v = m.getVersion();
if (v >= 0 && v > lastStoredVersion) {
return true;
}
}
}
return false;
}
private Chunk readChunkHeader(long block) {
long p = block * BLOCK_SIZE;
ByteBuffer buff = fileStore.readFully(p, Chunk.MAX_HEADER_LENGTH);
return Chunk.readChunkHeader(buff, p);
}
/**
* Compact the store by moving all live pages to new chunks.
*
* @return if anything was written
*/
public synchronized boolean compactRewriteFully() {
checkOpen();
if (lastChunk == null) {
// nothing to do
return false;
}
for (MVMap<?, ?> m : maps.values()) {
@SuppressWarnings("unchecked")
MVMap