// All downloads are free. Search and download functionality uses the official Maven repository.
//
// com.kolibrifx.plovercrest.server.internal.engine.TableMapper: Maven / Gradle / Ivy
//
// The newest version!
/*
 * Copyright (c) 2010-2017, KolibriFX AS. Licensed under the Apache License, version 2.0.
 */

package com.kolibrifx.plovercrest.server.internal.engine;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import com.kolibrifx.plovercrest.client.PlovercrestException;
import com.kolibrifx.plovercrest.client.TableClosedException;
import com.kolibrifx.plovercrest.client.TableFrozenException;
import com.kolibrifx.plovercrest.client.TableLockedException;
import com.kolibrifx.plovercrest.client.TableWriteError;
import com.kolibrifx.plovercrest.server.ReadObserver;
import com.kolibrifx.plovercrest.server.Table;
import com.kolibrifx.plovercrest.server.internal.AtomicTableInfo;
import com.kolibrifx.plovercrest.server.internal.MappedBigFile;

public class TableMapper {
    // all sizes in bytes
    static final long ENTRY_SIZE_SIZE = 4; // size of the entry size, how meta
    static final long CHECKSUM_SIZE = ENTRY_SIZE_SIZE;
    static final long TIMESTAMP_SIZE = 8;
    static final long MINIMUM_ENTRY_DISTANCE = CHECKSUM_SIZE + ENTRY_SIZE_SIZE + TIMESTAMP_SIZE;

    // Markers: all are used instead of entrySize and must thus be <= 0
    private static final int END_OF_DATA_MARKER = 0;
    private static final int FROZEN_MARKER = -1;

    private static final Logger log = Logger.getLogger(TableMapper.class);

    private final RandomAccessFile outputStream;
    private final File dataFile;
    private final int fanOutDistance;
    private final MappedBigFile bigFile;
    private final MappedBigFile.Writer writeIterator;
    private final FanOutTable fanOut;
    private final FileLock fileLock;
    private final String tableName;

    private final AtomicTableInfo atomicTableInfo = new AtomicTableInfo();
    private boolean isFrozen = false;
    private final AtomicLong lastValidTimestamp;
    private final AtomicBoolean closed = new AtomicBoolean(false);

    public class ReadIterator {
        private ByteBuffer tempBuffer;
        private final MappedBigFile.Reader reader;
        private long currentEntryIndex = 0;

        ReadIterator(final long entryIndex) {
            this.reader = bigFile.createReader(0);
            this.tempBuffer = ByteBuffer.allocate(8);
            if (entryIndex > 0) {
                seekToEntryIndex(entryIndex);
            }
        }

        public ByteBuffer getTempBuffer(final int minimumCapacity) {
            if (tempBuffer.capacity() < minimumCapacity) {
                tempBuffer = ByteBuffer.allocate(((minimumCapacity * 2) + 3) & ~3);
            }
            return tempBuffer;
        }

        public long offset() {
            return reader.tell();
        }

        public int readInt() {
            return reader.readInt();
        }

        public long readLong() {
            return reader.readLong();
        }

        public void readBuffer(final ByteBuffer bytes) {
            reader.readBuffer(bytes);
        }

        private long peekCurrentTimestamp() {
            final long offset = reader.tell();
            if (offset >= atomicTableInfo.getValidDataLength()) {
                return -1;
            }
            final int size = reader.readInt();
            if (size == 0) {
                throw new PlovercrestException("Table is corrupt (unexpected zero entry length)");
            }
            final long result = reader.readLong();
            reader.seek(offset);
            return result;
        }

        private void seekUsingFanoutOrCurrentPosition(final long desiredTimestamp) {
            final long currentTimestamp = peekCurrentTimestamp();
            // Search for N-1 in case there are duplicate timestamps exactly at N.
            final int foundIndex = fanOut.seekTakePrevious(desiredTimestamp - 1);
            final FanOutTable.Entry foundEntry = fanOut.entryAt(foundIndex);
            if (currentTimestamp < 0) {
                // current position is at the end, must use the fanout entry
                jumpToFanOutEntry(foundIndex);
                return;
            }

            // Optimization: if the current position is better than the found
            // fanout position, do nothing.
            // This has a big performance effect on some use cases like
            // resampling.
            if (foundEntry != null) {
                if (foundEntry.timestamp <= currentTimestamp && currentTimestamp < desiredTimestamp) {
                    return;
                }
            } else {
                if (currentTimestamp < desiredTimestamp) {
                    return;
                }
            }
            jumpToFanOutEntry(foundIndex);
        }

        private long peekCurrentTimestampIfNeeded(final long timestamp) {
            if (timestamp < 0) {
                return peekCurrentTimestamp();
            } else {
                return timestamp;
            }
        }

        public long seekTakeNext(final long desiredTimestamp) {
            seekUsingFanoutOrCurrentPosition(desiredTimestamp);

            long curTimestamp = -1;
            long entrySize = 0;
            long curOffset;
            do {
                curOffset = reader.tell();
                if (curOffset + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
                    break;
                }

                entrySize = reader.readInt();
                if (entrySize <= 0) {
                    break;
                }

                curTimestamp = reader.readLong();
                reader.seek(reader.tell() + entrySize + CHECKSUM_SIZE);
                currentEntryIndex++;
            }
            while (curTimestamp < desiredTimestamp);

            reader.seek(curOffset);
            currentEntryIndex--;
            assert reader.assertInvariants();
            return peekCurrentTimestampIfNeeded(curTimestamp);
        }

        public long seekTakePrevious(final long desiredTimestamp) {
            seekUsingFanoutOrCurrentPosition(desiredTimestamp);

            long prevOffset = reader.tell();
            long curTimestamp = -1;
            long entrySize = 0;
            long prevTimestamp = curTimestamp;
            long prevEntryIndex = currentEntryIndex;

            for (;;) {
                final long startOffset = reader.tell();
                if (startOffset + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
                    currentEntryIndex = prevEntryIndex;
                    reader.seek(prevOffset);
                    assert reader.assertInvariants();
                    return peekCurrentTimestampIfNeeded(prevTimestamp);
                }

                entrySize = reader.readInt();
                if (entrySize <= 0) {
                    currentEntryIndex = prevEntryIndex;
                    reader.seek(prevOffset);
                    assert reader.assertInvariants();
                    return peekCurrentTimestampIfNeeded(prevTimestamp);
                }

                curTimestamp = reader.readLong();

                if (curTimestamp == desiredTimestamp) {
                    reader.seek(startOffset);
                    assert reader.assertInvariants();
                    return peekCurrentTimestampIfNeeded(curTimestamp);
                }

                if (curTimestamp > desiredTimestamp) {
                    currentEntryIndex = prevEntryIndex;
                    reader.seek(prevOffset);
                    assert reader.assertInvariants();
                    return peekCurrentTimestampIfNeeded(prevTimestamp);
                }

                reader.seek(reader.tell() + entrySize + CHECKSUM_SIZE);

                prevEntryIndex = currentEntryIndex++;
                prevOffset = startOffset;
                prevTimestamp = curTimestamp;
            }
        }

        void jumpToFanOutEntry(final int fanoutIndex) {
            if (fanoutIndex < 0) {
                reader.seek(0);
                currentEntryIndex = 0;
            } else {
                reader.seek(fanOut.entryAt(fanoutIndex).filePosition);
                currentEntryIndex = fanoutIndex * (long) fanOutDistance;
            }
        }

        /**
         * This function has several side effects: move the iterator to the end, and add missing
         * fan-out entries. Also, update entryCount and validDataLength variables.
         */
        void seekFromFanOutEntryToEnd(final int fanoutIndex) {
            jumpToFanOutEntry(fanoutIndex);
            long entryCount;
            if (fanoutIndex < 0) {
                entryCount = 0;
            } else {
                entryCount = fanOutDistance * fanoutIndex;
            }

            long startOffset;
            long lastTimestamp = getLastTimestamp();

            for (;;) {
                startOffset = reader.tell();

                if (!reader.isValidOffset(startOffset + MINIMUM_ENTRY_DISTANCE)) {
                    log.warn("Not enough data for a complete entry (startOffset=" + startOffset + ", fileSize="
                            + reader.getFileSize() + ")");
                    break;
                }

                final int entrySize = reader.readInt();
                if (entrySize == END_OF_DATA_MARKER) {
                    break;
                } else if (entrySize == FROZEN_MARKER) {
                    isFrozen = true;
                    break;
                } else if (entrySize <= 0) {
                    log.warn("Negative entry size, and not a known marker value " + entrySize);
                    break;
                }

                if (!reader.isValidOffset(startOffset + TIMESTAMP_SIZE + entrySize + CHECKSUM_SIZE)) {
                    log.warn("Entry larger than remaining file size. (startOffset=" + startOffset + ", entrySize="
                            + entrySize + ", fileSize=" + reader.getFileSize() + ")");
                    break;
                }

                final long timestamp = reader.readLong(); // timestamp not part
                                                          // of
                // entrySize
                if (timestamp > lastTimestamp) {
                    lastTimestamp = timestamp;
                }
                if (timestamp > fanOut.lastTimestamp() && (entryCount % fanOutDistance) == 0) {
                    try {
                        fanOut.appendAndWrite(startOffset, timestamp);
                    } catch (final IOException e) {
                        log.error("Failed to write fanout table: " + e.getMessage(), e);
                    }
                }
                entryCount++;
                reader.seek(reader.tell() + entrySize);

                final int checksum = reader.readInt();
                if (checksum != entrySize) {
                    log.warn("Checksum failed! Expected " + entrySize + ", got " + checksum);
                    break;
                }
            }
            reader.seek(startOffset);
            atomicTableInfo.updateAtomically(lastTimestamp, startOffset, getFirstTimestamp(), entryCount);
            assert reader.assertInvariants();
        }

        public long seekToEntryIndex(final long entryIndex) {
            if (currentEntryIndex == entryIndex) {
                // the simplest optimization
                return entryIndex;
            }
            // TODO: optimize by avoiding jump to fanout entry if the can
            final List fanoutEntries = fanOut.getEntries();
            if (fanoutEntries.isEmpty()) {
                reader.seek(0);
                currentEntryIndex = 0;
            } else {
                final long longFanoutIndex = entryIndex / fanOutDistance; // rounding down
                int realFanoutIndex;
                if (longFanoutIndex >= fanoutEntries.size()) {
                    realFanoutIndex = fanoutEntries.size() - 1;
                } else {
                    realFanoutIndex = (int) longFanoutIndex;
                }
                currentEntryIndex = realFanoutIndex * fanOutDistance;
                jumpToFanOutEntry(realFanoutIndex);
            }
            // We are at the closes fanout entry before the given index, now skip (entryIndex-currentEntryIndex) entries,
            // or seek to the end of the table, whichever happens first.
            while (currentEntryIndex < entryIndex) {
                if (reader.tell() + MINIMUM_ENTRY_DISTANCE > getDataLength()) {
                    break;
                }
                final long entrySize = reader.readInt();
                if (entrySize <= 0) {
                    // reached end
                    break;
                }
                // skip one entry
                reader.seek(reader.tell() + entrySize + TIMESTAMP_SIZE + CHECKSUM_SIZE);
                currentEntryIndex++;
            }
            return currentEntryIndex;
        }

        public boolean hasMore() {
            final int entrySize = reader.peekInt();
            return entrySize != 0;
        }

        public  T next(final ReadObserver observer) {
            try {
                final long offset = reader.tell();
                final T res = read(this, observer);
                if (reader.tell() > offset) {
                    currentEntryIndex++;
                }
                return res;
            } catch (final IOException e) {
                throw new PlovercrestException("Failed to read from table", e);
            }
        }

        public long getEntryIndex() {
            return currentEntryIndex;
        }
    }

    public TableMapper(final Table table, final AtomicLong lastValidTimestamp) {
        this.lastValidTimestamp = lastValidTimestamp;
        RandomAccessFile closeMeOnException = null;
        try {
            tableName = table.getName();
            dataFile = table.dataFile();
            outputStream = new RandomAccessFile(dataFile, "rwd");
            closeMeOnException = outputStream;
            fileLock = outputStream.getChannel().tryLock();
            if (fileLock == null) {
                throw new TableLockedException(dataFile);
            }
            bigFile = new MappedBigFile(tableName, outputStream);
            closeMeOnException = null; // success, do not close
        } catch (final OverlappingFileLockException e) {
            log.debug("Caught OverlappingFileLockException");
            throw new TableLockedException(table.dataFile());
        } catch (final IOException e) {
            throw new PlovercrestException("Failed to initialize table writer for " + table.getName(), e);
        } finally {
            try {
                if (closeMeOnException != null) {
                    closeMeOnException.close();
                }
            } catch (final IOException e) {
                log.error("Failed to free resources before throwing exception", e);
            }
        }
        fanOutDistance = table.getInfo().getFanOutDistance();
        fanOut = new FanOutTable(table.fanOutFile());

        bigFile.expandFile(Math.max(dataFile.length(), 1));
        initializeFanOut();
        final ReadIterator findEnd = new ReadIterator(0);
        findEnd.seekFromFanOutEntryToEnd(fanOut.lastIndex());

        final ReadIterator findFirst = new ReadIterator(0);
        atomicTableInfo.setFirstTimestamp(findFirst.peekCurrentTimestamp());

        writeIterator = bigFile.createWriter(findEnd.offset());
        // mark as end of file
        if (!isFrozen) {
            writeIterator.writeInt(END_OF_DATA_MARKER);
            writeIterator.seek(findEnd.offset());
        }
    }

    private boolean isFanOutEntryOk(final FanOutTable.Entry entry, final ReadIterator iterator) {
        if (!iterator.reader.isValidOffset(entry.filePosition)) {
            return false;
        }
        iterator.jumpToFanOutEntry(fanOut.indexOf(entry));
        final int entrySize = iterator.readInt();
        if (entrySize <= 0) {
            return false;
        }
        final long timestamp = iterator.readLong();
        return timestamp == entry.timestamp;
    }

    void initializeFanOut() {
        try {
            fanOut.readFromFile();
        } catch (final IOException e) {
            // Could recreate fanout table on-demand, but for now consider this
            // an error
            log.error("", e);
            throw new PlovercrestException("I/O error reading fan-out table: " + e.getMessage());
        }

        // Verifying all entries can be (very) slow, so just check the last
        // entry.
        // If it doesn't match, remove it and repeat until the last entry is OK.
        final ReadIterator iterator = new ReadIterator(0);
        while (true) {
            final FanOutTable.Entry entry = fanOut.lastEntry();
            if (entry == null || isFanOutEntryOk(entry, iterator)) {
                break;
            }
            try {
                log.warn(String.format("Warning: removing fan-out entry %s, which doesn't match table data", entry));
                fanOut.removeLastEntry();
            } catch (final IOException e) {
                throw new PlovercrestException("I/O error while attempting to fix corrupt fan-out table: "
                        + e.getMessage());
            }
        }
    }

    private void throwWriteError(final String message) {
        throw new TableWriteError("Table '" + tableName + "' write error: " + message);
    }

    // Multiple levels of synchronization:
    // * synchronization of write() prevents write conflicts from different
    // threads
    // * writing entrySize last prevents readers from accessing a
    // partially-written entry
    // * entrySizeLock prevents readers from accessing a partially-written
    // entrySize
    public synchronized void write(final long timestamp, final ByteBuffer buffer) throws IOException {
        if (isFrozen) {
            throw new TableFrozenException(tableName);
        }
        if (timestamp < 0) {
            throwWriteError("Timestamps must be non-negative: " + timestamp);
        }

        final long lastTimestampBeforeWrite = getLastTimestamp();
        if (timestamp < lastTimestampBeforeWrite) {
            final String msg =
                    String.format("Timestamps must be in order, got %s, but last timestamp is %s (diff = %s)",
                                  timestamp, lastTimestampBeforeWrite, (lastTimestampBeforeWrite - timestamp));
            throwWriteError(msg);
        }

        if (timestamp <= lastValidTimestamp.get()) {
            final String msg =
                    String.format("Cannot write timestamp (%s) <= lastValidTimestamp (%s)", timestamp,
                                  lastValidTimestamp);
            throwWriteError(msg);
        }

        final int length = buffer.remaining();

        if (length == 0) {
            throwWriteError("Payload cannot be empty.");
        }

        assert getDataLength() == writeIterator.tell();
        final long lastOffset = writeIterator.tell();
        writeIterator.writeInt(length);
        writeIterator.writeLong(timestamp);
        writeIterator.writeBuffer(buffer);
        writeIterator.writeInt(length); // checksum

        final long entryCount = getEntryCount();
        if ((entryCount % fanOutDistance) == 0) {
            try {
                fanOut.appendAndWrite(lastOffset, timestamp);
            } catch (final IOException e) {
                log.error("Failed to write fan-out table: " + e.getMessage(), e);
            }
        }

        // Update values atomically to make sure readers don't get inconsistent views
        final long first = getFirstTimestamp();
        atomicTableInfo.updateAtomically(timestamp, writeIterator.tell(), first < 0 ? timestamp : first, entryCount + 1);
    }

    public synchronized boolean markAsFrozen() {
        if (isFrozen) {
            return false;
        }
        isFrozen = true;
        final long offset = writeIterator.tell();
        writeIterator.writeInt(FROZEN_MARKER);
        writeIterator.seek(offset); // in case of unfreeze()
        return true;
    }

    public synchronized boolean isFrozen() {
        return isFrozen;
    }

    public synchronized void unfreeze() {
        if (isFrozen) {
            final long offset = writeIterator.tell();
            writeIterator.writeInt(END_OF_DATA_MARKER);
            writeIterator.seek(offset);
            isFrozen = false;
        }
    }

     T read(final ReadIterator iterator, final ReadObserver observer) throws IOException {
        // avoid subtle race conditions by querying these before checking validDataLength
        if (closed.get()) {
            throw new TableClosedException(tableName);
        }
        final long lastBeforeRead = getLastTimestamp();
        final long validDataLength = getDataLength();
        final long lastValid = Math.max(getLastTimestamp(), lastValidTimestamp.get());

        if (iterator.offset() >= validDataLength) {
            observer.observeEnd(lastValid);
            // Make sure all elements have been observed before we trigger observeFrozen().
            // "lastBeforeRead == getLastTimestamp()" should be enough to ensure this.
            if (isFrozen() && lastBeforeRead == getLastTimestamp()) {
                observer.observeFrozen();
            }
            return null;
        }
        final int entrySize = iterator.readInt();
        if (entrySize <= 0) {
            throw new PlovercrestException(
                                           String.format("Table is corrupt (entrySize %s at offset %s, valid data length %s)",
                                                         entrySize, iterator.offset(), validDataLength));
        }

        final long timestamp = iterator.readLong();

        final ByteBuffer tempBuffer = iterator.getTempBuffer(entrySize);
        tempBuffer.clear();
        tempBuffer.limit(entrySize);
        iterator.readBuffer(tempBuffer);
        tempBuffer.flip();

        final int checksum = iterator.readInt();
        if (checksum != entrySize) {
            throw new PlovercrestException(String.format("Table is corrupt (entrySize %s vs checksum %s)", entrySize,
                                                         checksum));
        }
        return observer.observe(timestamp, tempBuffer);
    }

    public ReadIterator getReadIterator(final long entryIndex) {
        return new ReadIterator(entryIndex);
    }

    public void close(final boolean flush) {
        if (!closed.compareAndSet(false, true)) {
            return;
        }
        bigFile.close(flush);
        try {
            fileLock.release();
            fanOut.close();
            outputStream.close();
        } catch (final IOException e) {
            log.error("", e);
            throw new PlovercrestException("Exception during close: " + e.getMessage(), e);
        }
    }

    public synchronized void force() {
        bigFile.force();
    }

    public long getFirstTimestamp() {
        return atomicTableInfo.getFirstTimestamp();
    }

    public long getLastTimestamp() {
        return atomicTableInfo.getLastTimestamp();
    }

    public long getDataLength() {
        return atomicTableInfo.getValidDataLength();
    }

    public long getEntryCount() {
        return atomicTableInfo.getEntryCount();
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy