package org.yamcs.replication;

import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.time.Instant;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.zip.CRC32;

import org.yamcs.logging.Log;
import org.yamcs.utils.StringConverter;

import io.netty.util.internal.PlatformDependent;

/**
 * Stores transactions in a memory mapped file. The data is split into pages, each page has a fixed number of
 * transactions.
 * <p>
 * An index gives a pointer to the beginning of each page, to allow jumping faster to a given transaction number.
 * <p>
 * The metadata transactions form a linked list in order to allow sending them all when a client connects.
 * <p>
 * Header:
 *
 * <pre>
 * 12 bytes magic "YAMCS_STREAM"
 *  1 byte version
 *  3 bytes spare
 *  8 bytes first_id = first transaction in the file = file_id - used for consistency check (if someone renames the file)
 *  4 bytes page_size - number of transactions per page
 *  4 bytes max_pages - max number of pages (and the size of the index)
 *
 *  8 bytes last_mod = last modification time
 *  4 bytes n = number of full pages. If n = max_pages, the file is full and cannot be written to
 *  4 bytes m = number of transactions on page n
 *  4 bytes firstMetadataPos - position of the first metadata transaction
 *  (max_pages+1) x 4 bytes idx - transaction index
 *      idx[i] (i=0..max_pages) - offset in the file where the transaction with id first_id + i*page_size starts
 *      idx[i] = 0 -> no such transaction. this means num_tx < i*page_size
 *      idx[max_pages] -> pointer to the end of the file.
 * </pre>
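 * <p>
 * As a worked example (derived from the layout above, not part of the original comment): the fixed part of the
 * header is 32 bytes, the second part is 20 bytes, and the index takes 4*(max_pages+1) bytes, which matches
 * {@code headerSize(pageSize, maxPages) = 52 + 4 * (maxPages + 1)}; for max_pages = 1000 the transaction data
 * therefore starts at offset 4056.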
 * <p>
 * Transaction data:
 *
 * <pre>
 * 1 byte type - the type can be DATA or STREAM_INFO, with the constants defined in {@link Message}
 * 3 bytes - size of the data that follows, including the CRC (or alternatively: including the first 4 bytes type+size but excluding the CRC)
 * 4 bytes instance_id
 * 8 bytes transaction_id
 * n bytes data
 * 4 bytes CRC32 calculated over the data including the type and length
 *
 * for metadata the first 4 bytes of the data are the position of the next metadata record
 * </pre>
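 * <p>
 * For illustration only (this sketch is not part of the original comment), a reader positioned at the start of a
 * record could decode the framing like this, where {@code bb} is a {@link ByteBuffer} over the file:
 *
 * <pre>{@code
 * int typeSize = bb.getInt();
 * byte type = (byte) (typeSize >> 24);   // DATA or STREAM_INFO
 * int size = typeSize & 0xFFFFFF;        // bytes following the type+size word, CRC included
 * int instanceId = bb.getInt();
 * long txId = bb.getLong();
 * byte[] data = new byte[size - 16];     // 4 (instance_id) + 8 (transaction_id) + 4 (CRC)
 * bb.get(data);
 * int crc = bb.getInt();                 // CRC32 over everything from the type byte up to here
 * }</pre>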
 * <p>
 * The methods of this class throw {@link UncheckedIOException} instead of {@link IOException}. When working with
 * memory mapped files in Java, an IO error will cause an unspecified unchecked exception or even a crash of Java
 * (because file data is accessed using memory reads/writes). Therefore we prefer not to give a false sense of
 * security by throwing IOException only in some limited situations, and we convert all of these to
 * {@link UncheckedIOException}.
 * <p>
 * The one occasion when Java may crash while no hardware failure is present is when the disk is full.
 * <p>
 * TODO: add a checker and stop writing data if the disk usage is above a threshold.
 */
public class ReplicationFile implements Closeable {
    static final String RPL_FILENAME_PREFIX = "RPL";
    final static byte[] MAGIC = { 'Y', 'A', 'M', 'C', 'S', '_', 'S', 'T', 'R', 'E', 'A', 'M' };

    // the position inside the record where the metadata position pointer sits
    // it is after size, instanceId, txid
    final static int METADATA_POS_OFFSET = 16;
    final static int MIN_RECORD_SIZE = 20; // size, instanceId, txId, crc
    final Log log;
    ReadWriteLock rwlock = new ReentrantReadWriteLock();
    private MappedByteBuffer buf;
    private int lastMetadataTxStart;
    private FileChannel fc;
    final private boolean readOnly;
    final private Header1 hdr1;
    final private Header2 hdr2;
    private boolean fileFull = false;
    final Path path;
    CRC32 crc32 = new CRC32();
    private boolean syncRequired;

    class Header1 { // this is the first part - fixed - of the header
        final static byte VERSION = 0;
        final static int LENGTH = 32;
        final long firstId; // first transaction id
        final int pageSize, maxPages;

        // new file
        Header1(long firstId, int pageSize, int maxPages) {
            this.firstId = firstId;
            this.pageSize = pageSize;
            this.maxPages = maxPages;
            buf.put(MAGIC);
            buf.putInt(VERSION << 24);
            buf.putLong(firstId);
            buf.putInt(pageSize);
            buf.putInt(maxPages);
        }

        // open existing file
        Header1(long firstTxId) {
            checkHdr1(firstTxId);
            firstId = firstTxId;
            pageSize = buf.getInt();
            maxPages = buf.getInt();
        }

        private void checkHdr1(long firstTxId) {
            byte[] magic = new byte[MAGIC.length];
            buf.get(magic);
            if (!Arrays.equals(magic, MAGIC)) {
                throw new CorruptedFileException(path, "bad file, magic entry does not match: "
                        + StringConverter.arrayToHexString(magic) + ". Expected "
                        + StringConverter.arrayToHexString(MAGIC));
            }
            int version = buf.getInt() >> 24;
            if (version != VERSION) {
                throw new CorruptedFileException(path, "bad version: " + version + ". Expected " + VERSION);
            }
            long id = buf.getLong();
            if (id != firstTxId) {
                throw new CorruptedFileException(path, "bad firstId " + id + " expected " + firstTxId);
            }
        }

        @Override
        public String toString() {
            return "Header1 [firstId=" + firstId + ", pageSize=" + pageSize + ", maxPages=" + maxPages + "]";
        }
    }

    class Header2 {
        final static int HDR_IDX_OFFSET = Header1.LENGTH + 20;
        int numFullPages; // number of full pages
        int lastPageNumTx; // number of transactions on the last page
        long lastMod; // last modification

        Header2(boolean newFile) {
            if (newFile) {
                this.numFullPages = 0;
                this.lastPageNumTx = 0;
                this.lastMod = System.currentTimeMillis();
                write();
                buf.position(HDR_IDX_OFFSET - 4);
                buf.putInt(0); // first metadata pointer
                writeIndex(0, endOffset());
                buf.putInt(endOffset()); // position of the transaction 0
                for (int i = 1; i <= hdr1.maxPages; i++) {
                    writeIndex(i, 0);
                }
            } else {
                buf.position(Header1.LENGTH);
                lastMod = buf.getLong();
                numFullPages = buf.getInt();
                lastPageNumTx = buf.getInt();
            }
        }

        void write() {
            buf.putLong(Header1.LENGTH, lastMod);
            buf.putInt(Header1.LENGTH + 8, numFullPages);
            buf.putInt(Header1.LENGTH + 12, lastPageNumTx);
        }

        public int firstMetadataPointer() {
            return buf.getInt(HDR_IDX_OFFSET - 4);
        }

        /**
         * returns the offset of the end of hdr2 - where data begins
         */
        public int endOffset() {
            return HDR_IDX_OFFSET + 4 * (hdr1.maxPages + 1);
        }

        public int getIndex(int n) {
            return buf.getInt(HDR_IDX_OFFSET + n * 4);
        }

        void writeIndex(int n, int txPos) {
            buf.putInt(HDR_IDX_OFFSET + n * 4, txPos);
        }

        void incrNumTx() {
            hdr2.lastPageNumTx++;
            if (hdr2.lastPageNumTx == hdr1.pageSize) {
                hdr2.numFullPages++;
                hdr2.lastPageNumTx = 0;
                hdr2.writeIndex(hdr2.numFullPages, buf.position());
            }
        }

        int numTx() {
            return hdr1.pageSize * hdr2.numFullPages + hdr2.lastPageNumTx;
        }

        @Override
        public String toString() {
            return "Header2 [numFullPages=" + numFullPages + ", lastPageNumTx=" + lastPageNumTx + ", lastMod="
                    + Instant.ofEpochMilli(lastMod) + "]";
        }
    }

    public long getFirstId() {
        return hdr1.firstId;
    }

    /**
     * Creates a new empty file
     *
     * @param yamcsInstance
     * @param path
     * @param id
     * @param pageSize
     * @param maxPages
     * @param maxFileSize
     */
    private ReplicationFile(String yamcsInstance, Path path, long id, int pageSize, int maxPages, int maxFileSize) {
        log = new Log(this.getClass(), yamcsInstance);
        this.path = path;
        if (Files.exists(path)) {
            throw new IllegalArgumentException("File " + path + " exists. Refusing to overwrite");
        }
        try {
            fc = FileChannel.open(path, StandardOpenOption.CREATE, StandardOpenOption.WRITE,
                    StandardOpenOption.READ);
            buf = fc.map(MapMode.READ_WRITE, 0, maxFileSize);
            hdr1 = new Header1(id, pageSize, maxPages);
            hdr2 = new Header2(true);
            this.readOnly = false;
            this.lastMetadataTxStart = Header2.HDR_IDX_OFFSET - METADATA_POS_OFFSET - 4;
            buf.position(hdr2.endOffset());
            log.info("Created new replication file {} pageSize: {}, maxPages: {}, maxFileSize: {}", path,
                    hdr1.pageSize, hdr1.maxPages, maxFileSize);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Open an existing file for append
     */
    private ReplicationFile(String yamcsInstance, Path path, long firstTxId, int maxFileSize) {
        log = new Log(this.getClass(), yamcsInstance);
        this.path = path;
        this.readOnly = false;
        try {
            fc = FileChannel.open(path, StandardOpenOption.READ, StandardOpenOption.WRITE);
            buf = fc.map(MapMode.READ_WRITE, 0, maxFileSize);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        hdr1 = new Header1(firstTxId);
        hdr2 = new Header2(false);
        log.debug("{}, {}", hdr1, hdr2);

        // recover any non indexed good transactions at the end of the file and set the position at the end
        recover();
        long endOffset = buf.position();

        // find the offset where the next metadata has to be written
        lastMetadataTxStart = Header2.HDR_IDX_OFFSET - METADATA_POS_OFFSET - 4;
        while (true) {
            int nextMetadataTxStart = buf.getInt(lastMetadataTxStart + METADATA_POS_OFFSET);
            if (nextMetadataTxStart == 0 || nextMetadataTxStart + METADATA_POS_OFFSET > endOffset) {
                break;
            }
            if (nextMetadataTxStart <= lastMetadataTxStart) {
                throw new UncheckedIOException(
                        new IOException("Corrupted file " + path + " at position " + lastMetadataTxStart
                                + " the metadata pointer points in the past"));
            }
            lastMetadataTxStart = nextMetadataTxStart;
        }
        log.info("Opened for append {} pageSize: {}, maxPages: {}, num_tx: {}", path, hdr1.pageSize, hdr1.maxPages,
                hdr2.numTx());
    }

    /**
     * Open an existing file read only
     */
    private ReplicationFile(String yamcsInstance, Path path, long firstTxId) {
        log = new Log(this.getClass(), yamcsInstance);
        this.readOnly = true;
        this.path = path;
        try {
            fc = FileChannel.open(path, StandardOpenOption.READ);
            buf = fc.map(MapMode.READ_ONLY, 0, fc.size());
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        hdr1 = new Header1(firstTxId);
        hdr2 = new Header2(false);
        log.debug("hdr1: {}, hdr2: {}", hdr1, hdr2);
        this.lastMetadataTxStart = Header2.HDR_IDX_OFFSET - 4;
        // set the fileFull flag
        this.fileFull = true;
        recover();
        log.info("Opened read-only {} pageSize: {}, maxPages: {}, num_tx: {}", path, hdr1.pageSize, hdr1.maxPages,
                hdr2.numTx());
    }

    public static ReplicationFile newFile(String yamcsInstance, Path path, long firstTxId, int pageSize,
            int maxPages, int maxFileSize) {
        checkSize(pageSize, maxPages, maxFileSize);
        return new ReplicationFile(yamcsInstance, path, firstTxId, pageSize, maxPages, maxFileSize);
    }

    private static void checkSize(int pageSize, int maxPages, int maxFileSize) {
        int minSize = headerSize(pageSize, maxPages) + MIN_RECORD_SIZE;
        if (maxFileSize < minSize) {
            throw new IllegalArgumentException("maxFileSize=" + maxFileSize + " too small; " + minSize
                    + " bytes required for storing an empty transaction");
        }
    }

    public static ReplicationFile openReadOnly(String yamcsInstance, Path path, long firstTxId) {
        return new ReplicationFile(yamcsInstance, path, firstTxId);
    }

    public static ReplicationFile openReadWrite(String yamcsInstance, Path path, long firstTxId, int maxFileSize) {
        return new ReplicationFile(yamcsInstance, path, firstTxId, maxFileSize);
    }
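    /*
     * Illustrative usage of the factory methods above (a sketch, not part of the original file; the instance name,
     * path and sizes are made up):
     *
     *   ReplicationFile rf = ReplicationFile.newFile("myInstance", dir.resolve("RPL_00000000.dat"), 0L,
     *           500, 1000, 100 * 1024 * 1024);
     *   // ... write transactions ...
     *   rf.close();
     *
     *   ReplicationFile ro = ReplicationFile.openReadOnly("myInstance", dir.resolve("RPL_00000000.dat"), 0L);
     */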

    /**
     * Writes the transaction to the file and returns the transaction id.
     * <p>
     * Returns -1 if the transaction could not be written because the file is full.
     *
     * @param tx the transaction to write
     * @return the id of the written transaction or -1 if the file is full
     */
    public long writeData(Transaction tx) {
        if (readOnly) {
            throw new IllegalStateException("Read only file");
        } else if (!fc.isOpen()) {
            // this may happen if the thread writing to the replication file is interrupted when writing
            log.warn("Attempting to write to a closed file");
            return -1;
        }
        rwlock.writeLock().lock();
        final int txStartPos = buf.position();
        try {
            if (fileFull) {
                return -1;
            } else if (hdr2.numFullPages == hdr1.maxPages) {
                return abortWriteFileFull(txStartPos);
            } else if (buf.remaining() < MIN_RECORD_SIZE) {
                return abortWriteFileFull(txStartPos);
            }
            long txid = hdr1.firstId + hdr2.numTx();
            log.trace("Writing transaction {} at position {}", txid, buf.position());

            buf.putInt(0); // this is where the type and size is written below
            buf.putInt(tx.getInstanceId());
            buf.putLong(txid);
            byte type = tx.getType();
            if (Transaction.isMetadata(type)) {
                buf.putInt(0); // next metadata position
            }
            try {
                tx.marshall(buf);
            } catch (BufferOverflowException | IndexOutOfBoundsException e) { // end of file
                return abortWriteFileFull(txStartPos);
            }
            if (buf.remaining() < 4) { // no space left for CRC
                return abortWriteFileFull(txStartPos);
            }
            int size = buf.position() - txStartPos;
            buf.putInt(txStartPos, (type << 24) | (size));
            int crc = compute_crc(buf, txStartPos);
            buf.putInt(crc);

            if (Transaction.isMetadata(type)) {
                buf.putInt(lastMetadataTxStart + METADATA_POS_OFFSET, txStartPos);
                // update crc of the modified metadata record
                if (lastMetadataTxStart >= hdr2.endOffset()) {
                    updateCrc(lastMetadataTxStart);
                }
                if (log.isTraceEnabled()) {
                    log.trace("Wrote at offset {} the pointer to the next metadata at {}",
                            lastMetadataTxStart + METADATA_POS_OFFSET, txStartPos);
                }
                lastMetadataTxStart = txStartPos;
            }
            hdr2.lastMod = System.currentTimeMillis();
            log.trace("Wrote transaction {} of type {} at position {}, total size: {}", txid, type, txStartPos,
                    size + 4);
            hdr2.incrNumTx();
            return txid;
        } catch (Throwable e) {
            buf.position(txStartPos);
            log.error("Caught exception when writing the replication file ", e);
            throw e;
        } finally {
            rwlock.writeLock().unlock();
        }
    }

    // starts from the latest known transaction (according to the header) and checks for new ones
    private void recover() {
        int n = hdr2.numTx();
        int startTxPos = getPosition(n);
        buf.position(startTxPos);
        int k = 0;
        while (buf.remaining() > MIN_RECORD_SIZE) {
            n = hdr2.numTx();
            startTxPos = buf.position();
            int size = buf.getInt() & 0xFFFFFF;
            if (size > buf.remaining() || size < 12) {
                break;
            }
            buf.getInt(); // serverId
            long txId = buf.getLong();
            if (txId != hdr1.firstId + n) {
                break;
            }
            buf.position(startTxPos + size);
            int crc = compute_crc(buf, startTxPos);
            if (crc != buf.getInt()) {
                log.debug("Trying to recover TX{}: CRC does not match", txId);
                break;
            }
            log.debug("Recovered TX{}", txId);
            hdr2.incrNumTx();
            k++;
        }
        log.debug("Found {} transactions more than indicated in the header", k);
        buf.position(startTxPos);
    }

    private int abortWriteFileFull(int txStartPos) {
        fileFull = true;
        buf.position(txStartPos);
        log.debug("File {} full, numTx: {}", path, hdr2.numTx());
        return -1;
    }

    // update the CRC of the record starting at the given position
    private void updateCrc(int pos) {
        ByteBuffer buf1 = buf.duplicate();
        buf1.position(pos);
        int size = buf1.getInt() & 0xFFFFFF;
        buf1.position(pos);
        buf1.limit(pos + size);
        crc32.reset();
        crc32.update(buf1);
        buf1.limit(buf1.limit() + 4);
        buf1.putInt((int) crc32.getValue());
    }

    // compute checksum from start to the current position
    private int compute_crc(ByteBuffer buf, int start) {
        int prevLimit = buf.limit();
        buf.limit(buf.position());
        buf.position(start);
        crc32.reset();
        crc32.update(buf);
        buf.limit(prevLimit);
        return (int) crc32.getValue();
    }
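    /*
     * Illustrative caller-side handling of a full file (a sketch, not part of the original file; rollOver() is a
     * hypothetical callback):
     *
     *   long txId = rf.writeData(tx);
     *   if (txId == -1) {
     *       // the file is full: close it and continue in a new file starting at rf.getNextTxId()
     *       rf.close();
     *       rollOver(rf.getNextTxId());
     *   }
     */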

    /**
     * Returns a {@link ReplicationTail} containing a read only {@link ByteBuffer} having the position on the given
     * txId and with the limit set to the current end of tx data.
     * <p>
     * The tail can be sent back in {@link #getNewData(ReplicationTail)} to obtain more data if available.
     * <p>
     * {@link ReplicationTail#eof} = true means the file is full, so no more data will be available in the future.
     * <p>
     * If the txId is smaller than the first transaction of this file, an {@link IllegalArgumentException} is thrown.
     * <p>
     * If the txId is greater than the highest transaction in this file plus 1, null is returned.
     * <p>
     * If the txId is the highest transaction in this file plus one, a tail with 0 transactions (i.e. position=limit
     * in the buffer) is returned; it can be used later to get more data.
     *
     * @param txId
     * @return
     */
    public ReplicationTail tail(long txId) {
        int txNum = (int) (txId - hdr1.firstId);
        if (txNum < 0) {
            throw new IllegalArgumentException(txId + " is smaller than " + hdr1.firstId);
        }
        rwlock.readLock().lock();
        try {
            int pos = getPosition(txNum);
            if (pos < 0) {
                return null;
            }
            ByteBuffer buf1 = buf.duplicate().asReadOnlyBuffer();
            buf1.position(pos);
            buf1.limit(buf.position());
            ReplicationTail rfe = new ReplicationTail();
            rfe.buf = buf1;
            rfe.nextTxId = getNextTxId();
            if (fileFull) {
                rfe.eof = true;
            }
            return rfe;
        } finally {
            rwlock.readLock().unlock();
        }
    }

    /**
     * Change the limit inside the file tail to the current position in the file buffer. Update the nextTxId.
     * <p>
     * Also update the eof flag if the file filled up since the last call.
     *
     * @param rfe
     */
    public void getNewData(ReplicationTail rfe) {
        rwlock.readLock().lock();
        try {
            rfe.buf.limit(buf.position());
            if (fileFull) {
                rfe.eof = true;
            }
            rfe.nextTxId = getNextTxId();
        } finally {
            rwlock.readLock().unlock();
        }
    }
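    /*
     * Illustrative consumer loop for following the live end of the file (a sketch, not part of the original file;
     * send() is a hypothetical consumer of the [position, limit) range):
     *
     *   ReplicationTail t = rf.tail(startTxId);
     *   while (true) {
     *       send(t.buf);          // consume the data between position and limit
     *       if (t.eof) {
     *           break;            // the file is full; no more data will ever be appended
     *       }
     *       rf.getNewData(t);     // extend the limit over newly written transactions
     *   }
     */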

    /**
     * Get the position of the txNum'th transaction in the file according to the index.
     * <p>
     * Returns -1 if the transaction is beyond the end of the file.
     */
    private int getPosition(int txNum) {
        // number of full pages
        int nfp = txNum / hdr1.pageSize;
        int m1 = txNum - nfp * hdr1.pageSize;
        if ((nfp > hdr2.numFullPages) || (nfp == hdr2.numFullPages && m1 > hdr2.lastPageNumTx)) {
            return -1;
        }
        // first jump to the right page
        int pos = hdr2.getIndex(nfp);
        long expectedTxId = hdr1.firstId + hdr1.pageSize * nfp;
        // then skip m1 transactions
        for (int i = 0; i < m1; i++) {
            pos = skipTransaction(pos, expectedTxId++);
        }
        return pos;
    }

    private int skipTransaction(int pos, long expectedTxId) {
        int typeSize = buf.getInt(pos);
        long txId = buf.getLong(pos + 8);
        if (txId != expectedTxId) { // consistency check
            throw new CorruptedFileException(path, "at offset " + pos + " expected txId " + expectedTxId
                    + " but found " + txId + " instead");
        }
        return pos + 4 + (typeSize & 0xFFFFFF);
    }

    public boolean isFull() {
        return fileFull;
    }

    /**
     * Iterate through the metadata
     */
    public Iterator<ByteBuffer> metadataIterator() {
        return new MetadataIterator();
    }

    /**
     * Iterate through the data
     */
    public Iterator<ByteBuffer> iterator() {
        return new TxIterator();
    }

    public void close() {
        try {
            if (!readOnly) {
                hdr2.write();
                // Required to unmap on Windows, else truncate fails
                PlatformDependent.freeDirectBuffer(buf);
                fc.truncate(buf.position());
            } else {
                // Required on Windows
                PlatformDependent.freeDirectBuffer(buf);
            }
            fc.close();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Force writing the content on disk.
     * <p>
     * The method will first call {@link FileChannel#force(boolean)}, then write the number of transactions to the
     * header and then call {@link FileChannel#force(boolean)} again to force the header on the disk as well.
     * <p>
     * This should guarantee that the transaction data is written on the disk before the header.
     */
    public void sync() throws IOException {
        if (!readOnly) {
            rwlock.readLock().lock();
            try {
                fc.force(true);
                hdr2.write();
                fc.force(true);
            } finally {
                rwlock.readLock().unlock();
            }
        }
    }

    public static int headerSize(int pageSize, int maxPages) {
        return Header2.HDR_IDX_OFFSET + 4 * (maxPages + 1);
    }

    public int numTx() {
        return hdr2.numTx();
    }

    public boolean isSyncRequired() {
        return syncRequired;
    }

    /**
     * Set the sync required flag such that the file is synchronized by the ReplicationMaster
     *
     * @param syncRequired
     */
    public void setSyncRequired(boolean syncRequired) {
        this.syncRequired = syncRequired;
    }

    /**
     * Returns the last tx id from this file + 1.
     * <p>
     * If there is no transaction in this file, returns the id of the first transaction that will be written (the
     * file id).
     */
    public long getNextTxId() {
        return hdr1.firstId + hdr2.numTx();
    }

    class MetadataIterator implements Iterator<ByteBuffer> {
        int nextPos;

        MetadataIterator() {
            nextPos = hdr2.firstMetadataPointer();
        }

        @Override
        public boolean hasNext() {
            return nextPos > 0;
        }

        /**
         * Returns a ByteBuffer with the position set to where the record begins and the limit set to where it ends.
         * <p>
         * The structure of the metadata record is
         *
         * <pre>
         *  1 byte type
         *  3 bytes size - size of the data that follows (i.e. without the type+size word) = n + 20
         *  4 bytes serverId
         *  8 bytes txId
         *  4 bytes next metadata position (to be ignored, only relevant inside the file)
         *  n bytes data
         *  4 bytes crc
         * </pre>
         */
        @Override
        public ByteBuffer next() {
            ByteBuffer buf1 = buf.asReadOnlyBuffer();
            buf1.position(nextPos);
            int typesize = buf1.getInt(nextPos);
            int limit = nextPos + 4 + (typesize & 0xFFFFFF);
            buf1.limit(limit);
            nextPos = buf1.getInt(nextPos + METADATA_POS_OFFSET);
            return buf1;
        }
    }
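    /*
     * Illustrative use of the iterators above (a sketch, not part of the original file): walk the metadata linked
     * list and copy each record out of the mapped buffer.
     *
     *   Iterator<ByteBuffer> it = rf.metadataIterator();
     *   while (it.hasNext()) {
     *       ByteBuffer rec = it.next();          // position = record start, limit = record end
     *       byte[] copy = new byte[rec.remaining()];
     *       rec.get(copy);
     *   }
     */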

    class TxIterator implements Iterator<ByteBuffer> {
        int nextPos;

        TxIterator() {
            nextPos = hdr2.endOffset();
        }

        @Override
        public boolean hasNext() {
            return nextPos > 0;
        }

        /**
         * Returns a ByteBuffer with the position set to where the record begins and the limit set to where it ends.
         * <p>
         * The structure of the record is
         *
         * <pre>
         *  1 byte type
         *  3 bytes size - size of the data that follows (i.e. without the type+size word) = n + 20
         *  4 bytes serverId
         *  8 bytes txId
         *  4 bytes next metadata position (only present in metadata records; to be ignored, only relevant inside the file)
         *  n bytes data
         *  4 bytes crc
         * </pre>
         */
        @Override
        public ByteBuffer next() {
            if (nextPos < 0) {
                throw new NoSuchElementException();
            }
            ByteBuffer buf1 = buf.asReadOnlyBuffer();
            buf1.position(nextPos);
            int typesize = buf1.getInt(nextPos);
            nextPos += 4 + (typesize & 0xFFFFFF);
            if (nextPos >= buf.limit()) {
                nextPos = -1;
            } else {
                buf1.limit(nextPos);
            }
            return buf1;
        }
    }
}