/*
* Copyright (c) 2010-2017, KolibriFX AS. Licensed under the Apache License, version 2.0.
*/
package com.kolibrifx.plovercrest.server.internal.engine;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import com.kolibrifx.plovercrest.client.PlovercrestException;
import com.kolibrifx.plovercrest.client.TableClosedException;
import com.kolibrifx.plovercrest.client.TableFrozenException;
import com.kolibrifx.plovercrest.client.TableLockedException;
import com.kolibrifx.plovercrest.client.TableWriteError;
import com.kolibrifx.plovercrest.server.ReadObserver;
import com.kolibrifx.plovercrest.server.Table;
import com.kolibrifx.plovercrest.server.internal.AtomicTableInfo;
import com.kolibrifx.plovercrest.server.internal.MappedBigFile;
public class TableMapper<T> {
// all sizes in bytes
static final long ENTRY_SIZE_SIZE = 4; // size of the entry size, how meta
static final long CHECKSUM_SIZE = ENTRY_SIZE_SIZE;
static final long TIMESTAMP_SIZE = 8;
static final long MINIMUM_ENTRY_DISTANCE = CHECKSUM_SIZE + ENTRY_SIZE_SIZE + TIMESTAMP_SIZE;
// Markers: all are used instead of entrySize and must thus be <= 0
private static final int END_OF_DATA_MARKER = 0;
private static final int FROZEN_MARKER = -1;
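// On-disk entry layout (as written by write() and checked by read()):
//   [entrySize: 4-byte int][timestamp: 8-byte long][payload: entrySize bytes][checksum: 4-byte int]
// The trailing checksum simply repeats entrySize; a mismatch indicates a torn or
// corrupt entry. A marker value (<= 0) in place of entrySize terminates the data.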
private static final Logger log = Logger.getLogger(TableMapper.class);
private final RandomAccessFile outputStream;
private final File dataFile;
private final int fanOutDistance;
private final MappedBigFile bigFile;
private final MappedBigFile.Writer writeIterator;
private final FanOutTable fanOut;
private final FileLock fileLock;
private final String tableName;
private final AtomicTableInfo atomicTableInfo = new AtomicTableInfo();
private boolean isFrozen = false;
private final AtomicLong lastValidTimestamp;
private final AtomicBoolean closed = new AtomicBoolean(false);
public class ReadIterator {
private ByteBuffer tempBuffer;
private final MappedBigFile.Reader reader;
private long currentEntryIndex = 0;
ReadIterator(final long entryIndex) {
this.reader = bigFile.createReader(0);
this.tempBuffer = ByteBuffer.allocate(8);
if (entryIndex > 0) {
seekToEntryIndex(entryIndex);
}
}
public ByteBuffer getTempBuffer(final int minimumCapacity) {
if (tempBuffer.capacity() < minimumCapacity) {
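// Grow to twice the requested capacity, rounded up to a multiple of 4 bytes.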
tempBuffer = ByteBuffer.allocate(((minimumCapacity * 2) + 3) & ~3);
}
return tempBuffer;
}
public long offset() {
return reader.tell();
}
public int readInt() {
return reader.readInt();
}
public long readLong() {
return reader.readLong();
}
public void readBuffer(final ByteBuffer bytes) {
reader.readBuffer(bytes);
}
private long peekCurrentTimestamp() {
final long offset = reader.tell();
if (offset >= atomicTableInfo.getValidDataLength()) {
return -1;
}
final int size = reader.readInt();
if (size == 0) {
throw new PlovercrestException("Table is corrupt (unexpected zero entry length)");
}
final long result = reader.readLong();
reader.seek(offset);
return result;
}
private void seekUsingFanoutOrCurrentPosition(final long desiredTimestamp) {
final long currentTimestamp = peekCurrentTimestamp();
// Search for N-1 in case there are duplicate timestamps exactly at N.
final int foundIndex = fanOut.seekTakePrevious(desiredTimestamp - 1);
final FanOutTable.Entry foundEntry = fanOut.entryAt(foundIndex);
if (currentTimestamp < 0) {
// current position is at the end, must use the fanout entry
jumpToFanOutEntry(foundIndex);
return;
}
// Optimization: if the current position is better than the found
// fanout position, do nothing.
// This has a big performance effect on some use cases like
// resampling.
if (foundEntry != null) {
if (foundEntry.timestamp <= currentTimestamp && currentTimestamp < desiredTimestamp) {
return;
}
} else {
if (currentTimestamp < desiredTimestamp) {
return;
}
}
jumpToFanOutEntry(foundIndex);
}
private long peekCurrentTimestampIfNeeded(final long timestamp) {
if (timestamp < 0) {
return peekCurrentTimestamp();
} else {
return timestamp;
}
}
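/**
 * Seeks forward to the first entry whose timestamp is >= desiredTimestamp,
 * using the fan-out table to skip ahead where possible. If no such entry
 * exists, the iterator is left at the end of the valid data. Returns the
 * timestamp of the last entry examined, or peeks at the current position
 * when nothing was read.
 */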
public long seekTakeNext(final long desiredTimestamp) {
seekUsingFanoutOrCurrentPosition(desiredTimestamp);
long curTimestamp = -1;
long entrySize = 0;
long curOffset;
do {
curOffset = reader.tell();
if (curOffset + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
break;
}
entrySize = reader.readInt();
if (entrySize <= 0) {
break;
}
curTimestamp = reader.readLong();
reader.seek(reader.tell() + entrySize + CHECKSUM_SIZE);
currentEntryIndex++;
}
while (curTimestamp < desiredTimestamp);
reader.seek(curOffset);
currentEntryIndex--;
assert reader.assertInvariants();
return peekCurrentTimestampIfNeeded(curTimestamp);
}
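/**
 * Seeks to the last entry whose timestamp is <= desiredTimestamp (an exact
 * match wins), using the fan-out table to skip ahead where possible, and
 * returns that entry's timestamp. Like seekTakeNext(), the result falls back
 * to peeking at the current position when no entry was examined.
 */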
public long seekTakePrevious(final long desiredTimestamp) {
seekUsingFanoutOrCurrentPosition(desiredTimestamp);
long prevOffset = reader.tell();
long curTimestamp = -1;
long entrySize = 0;
long prevTimestamp = curTimestamp;
long prevEntryIndex = currentEntryIndex;
for (;;) {
final long startOffset = reader.tell();
if (startOffset + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
currentEntryIndex = prevEntryIndex;
reader.seek(prevOffset);
assert reader.assertInvariants();
return peekCurrentTimestampIfNeeded(prevTimestamp);
}
entrySize = reader.readInt();
if (entrySize <= 0) {
currentEntryIndex = prevEntryIndex;
reader.seek(prevOffset);
assert reader.assertInvariants();
return peekCurrentTimestampIfNeeded(prevTimestamp);
}
curTimestamp = reader.readLong();
if (curTimestamp == desiredTimestamp) {
reader.seek(startOffset);
assert reader.assertInvariants();
return peekCurrentTimestampIfNeeded(curTimestamp);
}
if (curTimestamp > desiredTimestamp) {
currentEntryIndex = prevEntryIndex;
reader.seek(prevOffset);
assert reader.assertInvariants();
return peekCurrentTimestampIfNeeded(prevTimestamp);
}
reader.seek(reader.tell() + entrySize + CHECKSUM_SIZE);
prevEntryIndex = currentEntryIndex++;
prevOffset = startOffset;
prevTimestamp = curTimestamp;
}
}
void jumpToFanOutEntry(final int fanoutIndex) {
if (fanoutIndex < 0) {
reader.seek(0);
currentEntryIndex = 0;
} else {
reader.seek(fanOut.entryAt(fanoutIndex).filePosition);
currentEntryIndex = fanoutIndex * (long) fanOutDistance;
}
}
/**
 * This method has several side effects: it moves the iterator to the end, adds
 * missing fan-out entries, and updates the entryCount and validDataLength
 * variables.
 */
void seekFromFanOutEntryToEnd(final int fanoutIndex) {
jumpToFanOutEntry(fanoutIndex);
long entryCount;
if (fanoutIndex < 0) {
entryCount = 0;
} else {
entryCount = (long) fanOutDistance * fanoutIndex;
}
long startOffset;
long lastTimestamp = getLastTimestamp();
for (;;) {
startOffset = reader.tell();
if (!reader.isValidOffset(startOffset + MINIMUM_ENTRY_DISTANCE)) {
log.warn("Not enough data for a complete entry (startOffset=" + startOffset + ", fileSize="
+ reader.getFileSize() + ")");
break;
}
final int entrySize = reader.readInt();
if (entrySize == END_OF_DATA_MARKER) {
break;
} else if (entrySize == FROZEN_MARKER) {
isFrozen = true;
break;
} else if (entrySize <= 0) {
log.warn("Negative entry size, and not a known marker value " + entrySize);
break;
}
if (!reader.isValidOffset(startOffset + TIMESTAMP_SIZE + entrySize + CHECKSUM_SIZE)) {
log.warn("Entry larger than remaining file size. (startOffset=" + startOffset + ", entrySize="
+ entrySize + ", fileSize=" + reader.getFileSize() + ")");
break;
}
final long timestamp = reader.readLong(); // the timestamp is not part of entrySize
if (timestamp > lastTimestamp) {
lastTimestamp = timestamp;
}
if (timestamp > fanOut.lastTimestamp() && (entryCount % fanOutDistance) == 0) {
try {
fanOut.appendAndWrite(startOffset, timestamp);
} catch (final IOException e) {
log.error("Failed to write fanout table: " + e.getMessage(), e);
}
}
entryCount++;
reader.seek(reader.tell() + entrySize);
final int checksum = reader.readInt();
if (checksum != entrySize) {
log.warn("Checksum failed! Expected " + entrySize + ", got " + checksum);
break;
}
}
reader.seek(startOffset);
atomicTableInfo.updateAtomically(lastTimestamp, startOffset, getFirstTimestamp(), entryCount);
assert reader.assertInvariants();
}
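/**
 * Positions the iterator at the entry with the given index by jumping to the
 * nearest preceding fan-out entry and then skipping forward one entry at a
 * time. For example (illustrative, assuming fanOutDistance = 64): for
 * entryIndex = 200 this jumps to fan-out entry 3 (entry 192) and then skips
 * 8 entries. Returns the index actually reached, which is smaller than
 * entryIndex if the end of the table was hit first.
 */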
public long seekToEntryIndex(final long entryIndex) {
if (currentEntryIndex == entryIndex) {
// the simplest optimization
return entryIndex;
}
// TODO: optimize by avoiding the jump to a fan-out entry if we can
final List<FanOutTable.Entry> fanoutEntries = fanOut.getEntries();
if (fanoutEntries.isEmpty()) {
reader.seek(0);
currentEntryIndex = 0;
} else {
final long longFanoutIndex = entryIndex / fanOutDistance; // rounding down
int realFanoutIndex;
if (longFanoutIndex >= fanoutEntries.size()) {
realFanoutIndex = fanoutEntries.size() - 1;
} else {
realFanoutIndex = (int) longFanoutIndex;
}
currentEntryIndex = realFanoutIndex * (long) fanOutDistance;
jumpToFanOutEntry(realFanoutIndex);
}
// We are at the closest fan-out entry before the given index; now skip (entryIndex - currentEntryIndex)
// entries, or seek to the end of the table, whichever happens first.
while (currentEntryIndex < entryIndex) {
if (reader.tell() + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
break;
}
final long entrySize = reader.readInt();
if (entrySize <= 0) {
// reached end
break;
}
// skip one entry
reader.seek(reader.tell() + entrySize + TIMESTAMP_SIZE + CHECKSUM_SIZE);
currentEntryIndex++;
}
return currentEntryIndex;
}
public boolean hasMore() {
final int entrySize = reader.peekInt();
return entrySize != 0;
}
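/**
 * Reads the entry at the current position, hands it to the observer, and
 * advances past it. Returns the observer's result, or null if the iterator
 * is at the end of the valid data.
 */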
public T next(final ReadObserver<T> observer) {
try {
final long offset = reader.tell();
final T res = read(this, observer);
if (reader.tell() > offset) {
currentEntryIndex++;
}
return res;
} catch (final IOException e) {
throw new PlovercrestException("Failed to read from table", e);
}
}
public long getEntryIndex() {
return currentEntryIndex;
}
}
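/**
 * Opens the table's data file with an exclusive file lock, maps it, rebuilds
 * and verifies the fan-out table, scans from the last fan-out entry to find
 * the end of valid data, and positions the writer there (writing an
 * END_OF_DATA_MARKER unless the table is frozen).
 */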
public TableMapper(final Table table, final AtomicLong lastValidTimestamp) {
this.lastValidTimestamp = lastValidTimestamp;
RandomAccessFile closeMeOnException = null;
try {
tableName = table.getName();
dataFile = table.dataFile();
outputStream = new RandomAccessFile(dataFile, "rwd");
closeMeOnException = outputStream;
fileLock = outputStream.getChannel().tryLock();
if (fileLock == null) {
throw new TableLockedException(dataFile);
}
bigFile = new MappedBigFile(tableName, outputStream);
closeMeOnException = null; // success, do not close
} catch (final OverlappingFileLockException e) {
log.debug("Caught OverlappingFileLockException");
throw new TableLockedException(table.dataFile());
} catch (final IOException e) {
throw new PlovercrestException("Failed to initialize table writer for " + table.getName(), e);
} finally {
try {
if (closeMeOnException != null) {
closeMeOnException.close();
}
} catch (final IOException e) {
log.error("Failed to free resources before throwing exception", e);
}
}
fanOutDistance = table.getInfo().getFanOutDistance();
fanOut = new FanOutTable(table.fanOutFile());
bigFile.expandFile(Math.max(dataFile.length(), 1));
initializeFanOut();
final ReadIterator findEnd = new ReadIterator(0);
findEnd.seekFromFanOutEntryToEnd(fanOut.lastIndex());
final ReadIterator findFirst = new ReadIterator(0);
atomicTableInfo.setFirstTimestamp(findFirst.peekCurrentTimestamp());
writeIterator = bigFile.createWriter(findEnd.offset());
// mark as end of file
if (!isFrozen) {
writeIterator.writeInt(END_OF_DATA_MARKER);
writeIterator.seek(findEnd.offset());
}
}
private boolean isFanOutEntryOk(final FanOutTable.Entry entry, final ReadIterator iterator) {
if (!iterator.reader.isValidOffset(entry.filePosition)) {
return false;
}
iterator.jumpToFanOutEntry(fanOut.indexOf(entry));
final int entrySize = iterator.readInt();
if (entrySize <= 0) {
return false;
}
final long timestamp = iterator.readLong();
return timestamp == entry.timestamp;
}
void initializeFanOut() {
try {
fanOut.readFromFile();
} catch (final IOException e) {
// Could recreate fanout table on-demand, but for now consider this
// an error
log.error("", e);
throw new PlovercrestException("I/O error reading fan-out table: " + e.getMessage());
}
// Verifying all entries can be (very) slow, so just check the last
// entry.
// If it doesn't match, remove it and repeat until the last entry is OK.
final ReadIterator iterator = new ReadIterator(0);
while (true) {
final FanOutTable.Entry entry = fanOut.lastEntry();
if (entry == null || isFanOutEntryOk(entry, iterator)) {
break;
}
try {
log.warn(String.format("Warning: removing fan-out entry %s, which doesn't match table data", entry));
fanOut.removeLastEntry();
} catch (final IOException e) {
throw new PlovercrestException("I/O error while attempting to fix corrupt fan-out table: "
+ e.getMessage());
}
}
}
private void throwWriteError(final String message) {
throw new TableWriteError("Table '" + tableName + "' write error: " + message);
}
// Multiple levels of synchronization:
// * synchronization of write() prevents write conflicts from different
// threads
// * writing entrySize last prevents readers from accessing a
// partially-written entry
// * entrySizeLock prevents readers from accessing a partially-written
// entrySize
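/**
 * Appends a single entry. Timestamps must be non-negative and non-decreasing,
 * and the payload must be non-empty; violations throw TableWriteError, and
 * writing to a frozen table throws TableFrozenException. Every
 * fanOutDistance-th entry is also recorded in the fan-out table.
 */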
public synchronized void write(final long timestamp, final ByteBuffer buffer) throws IOException {
if (isFrozen) {
throw new TableFrozenException(tableName);
}
if (timestamp < 0) {
throwWriteError("Timestamps must be non-negative: " + timestamp);
}
final long lastTimestampBeforeWrite = getLastTimestamp();
if (timestamp < lastTimestampBeforeWrite) {
final String msg =
String.format("Timestamps must be in order, got %s, but last timestamp is %s (diff = %s)",
timestamp, lastTimestampBeforeWrite, (lastTimestampBeforeWrite - timestamp));
throwWriteError(msg);
}
if (timestamp <= lastValidTimestamp.get()) {
final String msg =
String.format("Cannot write timestamp (%s) <= lastValidTimestamp (%s)", timestamp,
lastValidTimestamp);
throwWriteError(msg);
}
final int length = buffer.remaining();
if (length == 0) {
throwWriteError("Payload cannot be empty.");
}
assert getDataLength() == writeIterator.tell();
final long lastOffset = writeIterator.tell();
writeIterator.writeInt(length);
writeIterator.writeLong(timestamp);
writeIterator.writeBuffer(buffer);
writeIterator.writeInt(length); // checksum
final long entryCount = getEntryCount();
if ((entryCount % fanOutDistance) == 0) {
try {
fanOut.appendAndWrite(lastOffset, timestamp);
} catch (final IOException e) {
log.error("Failed to write fan-out table: " + e.getMessage(), e);
}
}
// Update values atomically to make sure readers don't get inconsistent views
final long first = getFirstTimestamp();
atomicTableInfo.updateAtomically(timestamp, writeIterator.tell(), first < 0 ? timestamp : first, entryCount + 1);
}
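/**
 * Marks the table as frozen by writing a FROZEN_MARKER at the current end of
 * data, without advancing the write position (so unfreeze() can overwrite it).
 * Returns false if the table was already frozen.
 */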
public synchronized boolean markAsFrozen() {
if (isFrozen) {
return false;
}
isFrozen = true;
final long offset = writeIterator.tell();
writeIterator.writeInt(FROZEN_MARKER);
writeIterator.seek(offset); // in case of unfreeze()
return true;
}
public synchronized boolean isFrozen() {
return isFrozen;
}
public synchronized void unfreeze() {
if (isFrozen) {
final long offset = writeIterator.tell();
writeIterator.writeInt(END_OF_DATA_MARKER);
writeIterator.seek(offset);
isFrozen = false;
}
}
T read(final ReadIterator iterator, final ReadObserver<T> observer) throws IOException {
// avoid subtle race conditions by querying these before checking validDataLength
if (closed.get()) {
throw new TableClosedException(tableName);
}
final long lastBeforeRead = getLastTimestamp();
final long validDataLength = getDataLength();
final long lastValid = Math.max(getLastTimestamp(), lastValidTimestamp.get());
if (iterator.offset() >= validDataLength) {
observer.observeEnd(lastValid);
// Make sure all elements have been observed before we trigger observeFrozen().
// "lastBeforeRead == getLastTimestamp()" should be enough to ensure this.
if (isFrozen() && lastBeforeRead == getLastTimestamp()) {
observer.observeFrozen();
}
return null;
}
final int entrySize = iterator.readInt();
if (entrySize <= 0) {
throw new PlovercrestException(
String.format("Table is corrupt (entrySize %s at offset %s, valid data length %s)",
entrySize, iterator.offset(), validDataLength));
}
final long timestamp = iterator.readLong();
final ByteBuffer tempBuffer = iterator.getTempBuffer(entrySize);
tempBuffer.clear();
tempBuffer.limit(entrySize);
iterator.readBuffer(tempBuffer);
tempBuffer.flip();
final int checksum = iterator.readInt();
if (checksum != entrySize) {
throw new PlovercrestException(String.format("Table is corrupt (entrySize %s vs checksum %s)", entrySize,
checksum));
}
return observer.observe(timestamp, tempBuffer);
}
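/**
 * Creates a new read iterator positioned at the given entry index.
 *
 * Illustrative usage sketch (the ReadObserver implementation and the
 * "observer" and "process" names here are hypothetical, not part of this
 * class):
 *
 *   TableMapper<MyRecord>.ReadIterator it = mapper.getReadIterator(0);
 *   while (it.hasMore()) {
 *       MyRecord record = it.next(observer); // invokes observer.observe(...)
 *       process(record);
 *   }
 */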
public ReadIterator getReadIterator(final long entryIndex) {
return new ReadIterator(entryIndex);
}
public void close(final boolean flush) {
if (!closed.compareAndSet(false, true)) {
return;
}
bigFile.close(flush);
try {
fileLock.release();
fanOut.close();
outputStream.close();
} catch (final IOException e) {
log.error("", e);
throw new PlovercrestException("Exception during close: " + e.getMessage(), e);
}
}
public synchronized void force() {
bigFile.force();
}
public long getFirstTimestamp() {
return atomicTableInfo.getFirstTimestamp();
}
public long getLastTimestamp() {
return atomicTableInfo.getLastTimestamp();
}
public long getDataLength() {
return atomicTableInfo.getValidDataLength();
}
public long getEntryCount() {
return atomicTableInfo.getEntryCount();
}
}