/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Feb 10, 2010
*/
package com.bigdata.io.writecache;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.log4j.Logger;
import com.bigdata.btree.IndexSegmentBuilder;
import com.bigdata.counters.CounterSet;
import com.bigdata.ha.msg.HAWriteMessage;
import com.bigdata.ha.msg.IHAWriteMessage;
import com.bigdata.io.ChecksumUtility;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.FileChannelUtility;
import com.bigdata.io.IBufferAccess;
import com.bigdata.io.IReopenChannel;
import com.bigdata.io.compression.CompressorRegistry;
import com.bigdata.io.compression.IRecordCompressor;
import com.bigdata.journal.AbstractBufferStrategy;
import com.bigdata.journal.StoreTypeEnum;
import com.bigdata.journal.WORMStrategy;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rwstore.RWStore;
import com.bigdata.util.Bytes;
import com.bigdata.util.ChecksumError;
/**
* This class provides a write cache with read-through for NIO writes on a
* {@link FileChannel} (and potentially on a remote service). This class is
* designed to maximize the opportunity for efficient NIO by combining many
* writes onto a single direct {@link ByteBuffer} and then efficiently
* transferring those writes onto the backing channel in a channel dependent
* manner. In general, there are three use cases for a {@link WriteCache}:
*
* - Gathered writes. This case is used by the {@link RWStore}.
* - Pure append of sequentially allocated records. This case is used by the
* {@link WORMStrategy} (WORM) and by the {@link IndexSegmentBuilder}.
* - Write of a single large buffer owned by the caller. This case may be used
* when the caller wants to manage the buffers or when the caller's buffer is
* larger than the write cache.
*
* The caller is responsible for managing which buffers are being written on and
* read on, when they are flushed, and when they are reset. It is perfectly
* reasonable to have more than one {@link WriteCache} and to read through on
* any {@link WriteCache} until it has been recycled. A {@link WriteCache} must
* be reset before it is put into play again for new writes.
*
* Note: For an append-only model (WORM), the caller MUST serialize writes onto
* the {@link IRawStore} and the {@link WriteCache}. This is required in order
* to ensure that the records are laid out in a dense linear fashion on the
* {@link WriteCache} and permits the backing buffer to be transferred in a
* single IO to the backing file.
*
* Note: For a {@link RWStore}, the caller must take more responsibility for
* managing the {@link WriteCache}(s) which are in play and scheduling their
* eviction onto the backing store. The caller can track the space remaining in
* each {@link WriteCache} and decide when to flush a {@link WriteCache} based
* on that information.
*
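* A hedged lifecycle sketch (illustrative only; <code>fileOffset</code>,
* <code>data</code> and <code>chk</code> are assumed to be supplied by the
* caller):
*
* <pre>
* final WriteCache cache = ...; // e.g., a FileChannelWriteCache
* if (!cache.write(fileOffset, data, chk)) {
*     // Cache is full: evict it to the backing channel, then recycle it.
*     cache.flush(false); // force := false
*     cache.reset();
*     cache.write(fileOffset, data, chk);
* }
* </pre>
*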
* @author Bryan Thompson
* @version $Id$
*/
abstract public class WriteCache implements IWriteCache {
protected static final Logger log = Logger.getLogger(WriteCache.class);
/**
* <code>true</code> iff per-record checksums are being maintained.
*/
private final boolean useChecksum;
/**
* <code>true</code> iff the buffer contents directly contain the recordMap data.
*/
private final boolean prefixWrites;
/**
* The size of the header for a prefix write.
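*
* A sketch of the per-record header layout when prefix writes are enabled
* (16 bytes, matching the constant below, followed by the record payload
* when the record length is positive):
*
* <pre>
* [fileOffset : long (8)] [recordLength : int (4)] [latchedAddr : int (4)] [payload ...]
* </pre>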
*/
static final int SIZEOF_PREFIX_WRITE_METADATA = 8/* offset */+ 4/* size */+ 4/* latchedAddr */;
static final int PREFIX_OFFSET_POS = 0;
static final int PREFIX_SIZE_POS = 8;
/**
* The buffer used to absorb writes that are destined for some channel.
*
* Note: This is an {@link AtomicReference} since we want to clear this
* field in {@link #close()}.
*/
final private AtomicReference<IBufferAccess> buf;
/**
* The read lock allows concurrent {@link #acquire()}s and permits both
* reads and writes on the acquired buffer, while the write lock prevents
* {@link #acquire()} during critical sections such as
* {@link #flush(boolean, long, TimeUnit)}, {@link #reset()},
* {@link #close()}.
*/
final private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
/**
* Return the backing {@link ByteBuffer}. The caller may read or write on
* the buffer, but MUST NOT have a side effect on the
* {@link ByteBuffer#position()} without first synchronizing on the
* {@link ByteBuffer}. Once they are done, the caller MUST call
* {@link #release()}.
*
* Note: This uses the read lock to allow concurrent read/write operations
* on the backing buffer.
*
* Note: At most one write operation may execute concurrently in
* order to avoid side effects on the buffers position when copying data
* onto the buffer. This constraint must be imposed by the caller using a
* <code>synchronized(buf){}</code> block during the critical sections where
* the buffer position will be updated by a write.
*
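* A hedged sketch of the caller protocol (illustrative only):
*
* <pre>
* final ByteBuffer tmp = acquire(); // takes the read lock
* try {
*     synchronized (tmp) {
*         // ... operations with a side effect on position() ...
*     }
* } finally {
*     release(); // releases the read lock
* }
* </pre>
*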
* @return The {@link ByteBuffer}.
*
* @throws InterruptedException
* @throws IllegalStateException
* if the {@link WriteCache} is closed.
*/
private ByteBuffer acquire() throws InterruptedException, IllegalStateException {
final Lock readLock = lock.readLock();
readLock.lockInterruptibly();
try {
// latch.inc();
final IBufferAccess tmp = buf.get();
if (tmp == null) {
// latch.dec();
throw new IllegalStateException();
}
// Note: The ReadLock is still held!
return tmp.buffer();
} catch (Throwable t) {
// Release the lock only on the error path.
readLock.unlock();
if (t instanceof InterruptedException)
throw (InterruptedException) t;
if (t instanceof IllegalStateException)
throw (IllegalStateException) t;
throw new RuntimeException(t);
}
}
/**
* Release the read lock on an acquired {@link ByteBuffer}.
*/
private void release() {
lock.readLock().unlock();
// latch.dec();
}
/**
* Return a read-only view of the backing {@link ByteBuffer}.
*
* @return The read-only view -or- <code>null</code> if the
* {@link WriteCache} has been closed.
*/
ByteBuffer peek() {
final IBufferAccess tmp = buf.get();
final ByteBuffer b = tmp == null ? null : tmp.buffer();
return b == null ? null : b.asReadOnlyBuffer();
}
// /**
// * Return the buffer. No other thread will have access to the buffer. No
// * latch is established and there is no protocol for releasing the buffer
// * back. Instead, the buffer will become available again if the caller
// * releases the write lock.
// *
// * @throws IllegalMonitorStateException
// * unless the caller is holding the write lock.
// * @throws IllegalStateException
// * if the buffer reference has been cleared.
// */
// protected ByteBuffer getExclusiveBuffer() {
//
// if (!lock.writeLock().isHeldByCurrentThread())
// throw new IllegalMonitorStateException();
//
// final ByteBuffer tmp = buf.get();
//
// if (tmp == null)
// throw new IllegalStateException();
//
// return tmp;
//
// }
/**
* Lock used to make
* {@link #transferTo(WriteCache, WriteCache, ConcurrentMap)} mutex with
* {@link WriteCacheService#clearWrite(long, int)} for a specific
* {@link WriteCache} instance.
*/
// Note: Exposed to WriteCacheService.clearWrite().
final /*private*/ ReentrantLock transferLock = new ReentrantLock();
/**
* The metadata associated with a record in the {@link WriteCache}.
*/
public static class RecordMetadata {
/**
* The offset of the record in the file. The offset may be relative to a
* base offset known to the writeOnChannel() implementation.
*/
public final long fileOffset;
/**
* The offset within the {@link WriteCache}'s backing {@link ByteBuffer}
* of the start of the record.
*/
public final int bufferOffset;
/**
* The length of the record in bytes as it will be written on the
* channel. If checksums are being written, then the length of the
* record has already been incorporated into this value.
*/
public final int recordLength;
/**
* The RWStore latched address for the record. This can be used to
* recover the FixedAllocator. This field is only required for the
* RWStore and then only for HA.
*/
public final int latchedAddr;
/**
* Set <code>true</code> when the record is deleted.
*
* Note: The {@link RecordMetadata} is removed from the
* {@link WriteCache#recordMap} when the record is deleted. This flag is
* only visible if the {@link RecordMetadata} was entered onto the
* {@link WriteCache#orderedRecords} list.
*/
private volatile boolean deleted;
/**
* When a record is used as a read cache entry, the hit count is
* maintained as a metric of its access. This could be used to
* determine eviction/compaction.
*
* Note: volatile to guarantee visibility of updates. Might do better
* with synchronized(this), synchronized(cache), or CAS.
*/
private volatile int hitCount;
public RecordMetadata(final long fileOffset, final int bufferOffset,
final int recordLength, final int latchedAddr) {
this.fileOffset = fileOffset;
this.bufferOffset = bufferOffset;
this.recordLength = recordLength;
this.latchedAddr = latchedAddr;
this.deleted = false;
}
public String toString() {
return getClass().getSimpleName() + "{fileOffset=" + fileOffset
+ ",bufferOffset=" + bufferOffset + ",len=" + recordLength
+ ",delete=" + deleted + "}";
}
final int getHitCount() {
return hitCount;
}
} // class RecordMetadata
/**
* An index into the write cache used for read through on the cache. The
* keys are the file offsets that would be used to read the corresponding
* record. The values describe the position in buffer where that record is
* found and the length of the record.
*
* Note: Exposed to inner classes.
*/
final protected ConcurrentMap<Long, RecordMetadata> recordMap;
/**
* An ordered list of the {@link RecordMetadata} in the order in which those
* records were created. This is maintained only for HA. It is used to
* communicate the allocations and deletes to a downstream RWS HA follower.
* The RWS follower relies on the ordered presentation of the addresses to
* infer the order in which the allocators were created, the size of the
* regions managed by those allocators, and the order in which the
* allocators appear in the allocator list (this is the same as the order of
* the creation of those allocators).
*
* Note: The RWS must have the actual order in which the addresses are
* created. The actual address allocations are serialized by the RWStore
* using its allocationLock. Therefore the calls to WriteCache.write() must
* also be serialized. However, it might be possible for a clear of an
* address to be concurrent with an allocation (I need to check this with
* Martyn) - for example, when releasing an allocation context. In any case,
* it is wise to guard updates to {@link #orderedRecords} both to ensure
* that the allocation order is maintained and to ensure that the data
* structure remains consistent (since it can be updated by multiple
* threads).
*
* Note: This data structure is guarded by the object monitor for the
* {@link ByteBuffer}. (This is the same thing that is used to serialize the
* writes on the {@link ByteBuffer}). Make sure that you are using the
* {@link ByteBuffer} and not a dup() of that {@link ByteBuffer}.
*/
final private List<RecordMetadata> orderedRecords;
/**
* The offset of the first record written onto the {@link WriteCache}. This
* information is used when {@link #appendOnly} is <code>true</code> as it
* gives the starting offset at which the entire {@link ByteBuffer} may be
* written in a single IO. When {@link #appendOnly} is <code>false</code>
* this is basically meaningless. This is initialized to <code>-1L</code> as
* a clear indicator that there is no valid record written yet onto the
* cache.
*/
final private AtomicLong firstOffset = new AtomicLong(-1L);
/**
* Exposed to the WORM for HA support.
*
* @param firstOffset
* The first offset (from the HA message).
*/
protected void setFirstOffset(final long firstOffset) {
this.firstOffset.set(firstOffset);
}
/**
* The capacity of the backing buffer.
*/
final private int capacity;
/**
* When <code>true</code>, {@link #close()} will release the
* {@link ByteBuffer} back to the {@link DirectBufferPool}.
*/
final private boolean releaseBuffer;
/**
* A private instance used to compute the checksum of all data in the
* current {@link #buf}. This is enabled for the high availability write
* replication pipeline. The checksum over the entire {@link #buf} is
* necessary in this context to ensure that the receiver can verify the
* contents of the {@link #buf}. The per-record checksums CAN NOT be used
* for this purpose since large records may be broken across write cache
* buffer boundaries.
*/
final private ChecksumHelper checker;
/**
* The then current extent of the backing file as of the last record written
* onto the cache before it was written onto the write replication pipeline.
* The receiver is responsible for adjusting its local file size to match.
*
* @see WriteCacheService#setExtent(long)
*/
private final AtomicLong fileExtent = new AtomicLong();
/**
* m_closedForWrites is set when the buffer is about to be flushed and ensures that
* nothing will be appended to the buffer until it is reset for reuse. This
* fixes a problem in the HA Pipeline where deletes could append to the buffer resulting
* in a reported buffer length in the HAMessage greater than the data sent.
*/
private volatile boolean m_closedForWrites = false;
// /**
// * The sequence must be set when the cache is ready to be flushed. In HA this
// * is sent down the pipeline to ensure correct synchronization when processing
// * logged messages.
// */
// private long sequence = -1;
//
// /**
// * The sequence #of this {@link WriteCache} block within the current write
// * set (origin ZERO(0)). This must be set when the cache is ready to be
// * flushed. In HA this is sent down the pipeline to ensure correct
// * synchronization when processing logged messages. This also winds up in
// * the {@link IRootBlockView} as a summary of the #of {@link WriteCache}
// * blocks transmitted during the write set for a specific commit point.
// */
// void setSequence(final long i) {
// sequence = i;
// }
//
// /**
// * The sequence #of this {@link WriteCache} block within the current write
// * set (origin ZERO(0)).
// */
// long getSequence() {
// return sequence;
// }
/**
* Create a {@link WriteCache} from either a caller supplied buffer or a
* direct {@link ByteBuffer} allocated from the {@link DirectBufferPool}.
*
* Note: The application MUST ensure that it {@link #close()}s the
* {@link WriteCache} or it can leak direct {@link ByteBuffer}s!
*
* Note: NIO operations are performed using a direct {@link ByteBuffer}
* (that is, one whose backing bytes are allocated on the C heap). When the
* caller supplies a {@link ByteBuffer} that is allocated on the Java heap
* as opposed to in native memory, a temporary direct {@link ByteBuffer}
* will be allocated for the IO operation by Java. The JVM can fail to
* release this temporary direct {@link ByteBuffer}, resulting in a memory
* leak. For this reason, the {@link WriteCache} SHOULD use a direct
* {@link ByteBuffer}.
*
* @see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6210541
*
* @param buf
* A {@link ByteBuffer} to be used as the write cache (optional).
* When <code>null</code> a buffer will be allocated for you from
* the {@link DirectBufferPool}. Buffers allocated on your behalf
* will be automatically released by {@link #close()}.
* @param prefixWrites
* <code>true</code> iff the implementation uses scattered
* writes. The RW store uses scattered writes since its updates
* are written to different parts of the backing file. The WORM
* store does not since all updates are written to the end of the
* user extent in the backing file.
* @param useChecksum
* <code>true</code> iff the write cache will store the caller's
* checksum for a record and validate it on read.
* @param isHighlyAvailable
* when <code>true</code> the whole record checksum is maintained
* for use when replicating the write cache along the write
* pipeline. This needs to be <code>true</code> for HA1 as well
* since we need to write the HALog.
* @param bufferHasData
* when <code>true</code> the caller asserts that the buffer has
* data (from a replicated write), in which case the position
* should be the start of the data in the buffer and the limit
* the #of bytes with valid data. When <code>false</code>, the
* caller's buffer will be cleared. The code presumes that the
* {@link WriteCache} instance will be used to lay down a single
* buffer worth of data onto the backing file.
* @param fileExtent
* The then current extent of the backing file.
*
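* A hedged construction sketch (illustrative only; <code>opener</code> and
* <code>fileExtent</code> are assumed to exist in the caller's scope):
*
* <pre>
* // WORM-style cache drawing its buffer from the DirectBufferPool. The
* // arguments are baseOffset, buf, useChecksum, isHighlyAvailable,
* // bufferHasData, opener and fileExtent.
* final WriteCache cache = new FileChannelWriteCache(0L, null, true,
*         false, false, opener, fileExtent);
* </pre>
*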
* @throws InterruptedException
*/
public WriteCache(IBufferAccess buf, final boolean prefixWrites,
final boolean useChecksum, final boolean isHighlyAvailable,
final boolean bufferHasData, final long fileExtent)
throws InterruptedException {
if (bufferHasData && buf == null)
throw new IllegalArgumentException();
if (buf == null) {
buf = DirectBufferPool.INSTANCE.acquire();
this.releaseBuffer = true;
} else {
this.releaseBuffer = false;
}
// if (quorumManager == null)
// throw new IllegalArgumentException();
// this.quorumManager = quorumManager;
this.useChecksum = useChecksum;
this.prefixWrites = prefixWrites;
if (isHighlyAvailable && !bufferHasData) {
// Note: No checker if buffer has data.
checker = new ChecksumHelper();
} else {
checker = null;
}
// save reference to the write cache.
this.buf = new AtomicReference<IBufferAccess>(buf);
// the capacity of the buffer in bytes.
this.capacity = buf.buffer().capacity();
// apply the then current file extent.
this.fileExtent.set(fileExtent);
/*
* Discard anything in the buffer, resetting the position to zero, the
* mark to zero, and the limit to the capacity.
*/
if (!bufferHasData) {
buf.buffer().clear();
}
/*
* An estimate of the #of records that might fit within the write cache.
* This is based on an assumption that the "average" record is 1k. This
* is used solely to assign the initial capacity of this map.
*/
final int indexDefaultCapacity = capacity / (1 * Bytes.kilobyte32);
/*
* allocate and initialize the write cache index.
*
* For scattered writes we choose to use a sorted map so that we can
* easily flush writes to the file channel in order. This may not be
* important depending on the caching strategy of the underlying system
* but it cannot be a bad thing.
*
* If we do not need to support scattered writes then we have the option
* to use the ConcurrentHashMap which has the advantage of constant
* access time for read through support.
*
* Note: some literature indicates the ConcurrentSkipListMap scales
* better with concurrency, so we should benchmark this option for
* non-scattered writes as well.
*/
if (prefixWrites) {
recordMap = new ConcurrentSkipListMap<Long, RecordMetadata>();
} else {
recordMap = new ConcurrentHashMap<Long, RecordMetadata>(indexDefaultCapacity);
}
if (isHighlyAvailable && !bufferHasData) {
/*
* Only in HA mode, and not when we are processing a raw write cache
* buffer replicated from the leader.
*/
orderedRecords = new LinkedList<RecordMetadata>();
} else {
orderedRecords = null;
}
if (bufferHasData) {
/*
* Populate the record map from the record.
*/
resetRecordMapFromBuffer();
}
}
/**
* Adds some debugging information.
*/
public String toString() {
return super.toString()//
+ "{recordCount=" + recordMap.size()//
+ ",firstOffset=" + firstOffset//
+ ",releaseBuffer=" + releaseBuffer//
+ ",prefixWrites=" + prefixWrites//
+ ",useChecksum=" + useChecksum//
+ ",bytesWritten=" + bytesWritten()//
+ ",bytesRemaining=" + remaining()//
+ ",bytesRemoved=" + m_removed//
+ "}";
}
/**
* The offset of the first record written onto the {@link WriteCache}. This
* information is used when {@link #appendOnly} is <code>true</code> as it
* gives the starting offset at which the entire {@link ByteBuffer} may be
* written in a single IO. When {@link #appendOnly} is <code>false</code>
* this is basically meaningless.
*
* Note: This has been raised into the
* {@link #writeOnChannel(ByteBuffer, long, Map, long)} method signature. It
* has been reduced to a package private method so it will remain visible to
* the unit tests, otherwise it could become private.
*
* @return The first offset written into the {@link WriteCache} since it was
* last {@link #reset()} and <code>-1L</code> if nothing has been
* written since the {@link WriteCache} was created or was last
* {@link #reset()}.
*/
final long getFirstOffset() {
return firstOffset.get();
}
/**
* The maximum length of a record which could be inserted into the buffer.
*
* Note: When checksums are enabled, this is 4 bytes less than the actual
* capacity of the underlying buffer since each record requires an
* additional four bytes for the checksum field. When prefix writes are
* enabled, a further {@link #SIZEOF_PREFIX_WRITE_METADATA} bytes are
* reserved for the per-record header.
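*
* For example, assuming a 1 MiB (1,048,576 byte) buffer with both
* checksums and prefix writes enabled, the maximum record length is
* 1,048,576 - 4 - 16 = 1,048,556 bytes.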
*/
final public int capacity() {
return capacity - (useChecksum ? 4 : 0) - (prefixWrites ? SIZEOF_PREFIX_WRITE_METADATA : 0);
}
/**
* Return the #of bytes remaining in the buffer.
*
* Note: in order to rely on this value the caller MUST have exclusive
* access to the buffer. This API does not provide the means for acquiring
* that exclusive access. This is something that the caller has to arrange
* for themselves, which is why this is a package private method.
*/
final int remaining() {
final int remaining = capacity - bytesWritten();//buf.get().buffer().position();
return remaining;
}
/**
* The #of bytes written on the backing buffer.
*
* Note: in order to rely on this value the caller MUST have exclusive
* access to the buffer. This API does not provide the means for acquiring
* that exclusive access. This is something that the caller has to arrange
* for themselves, which is why this is a package private method.
*/
public final int bytesWritten() {
return buf.get().buffer().position();
}
/**
* Return <code>true</code> if there are no records buffered on the cache.
*
* Note: The caller MUST be holding a lock for this to be valid. Probably
* the write lock.
*
* @todo This currently tests the {@link #recordMap}. In fact, for at least
* the {@link RWStore} the record map COULD be empty with cleared
* writes on the backing {@link ByteBuffer}. Therefore this tests
* whether the {@link WriteCache} has data to be written but does not
* clearly report whether or not some data has been written onto the
* buffer (and hence it has fewer bytes remaining than might otherwise
* be expected).
*/
final boolean isEmpty() {
return recordMap.isEmpty();
}
/**
* Set the current extent of the backing file on the {@link WriteCache}
* object. When used as part of an HA write pipeline, the receiver is
* responsible for adjusting its local file size to match the file extent in
* each {@link WriteCache} message.
*
* @param fileExtent
* The current extent of the file.
*
* @throws IllegalArgumentException
* if the file extent is negative.
*
* @see WriteCacheService#setExtent(long)
*/
public void setFileExtent(final long fileExtent) {
if (fileExtent < 0L)
throw new IllegalArgumentException();
this.fileExtent.set(fileExtent);
}
public long getFileExtent() {
return fileExtent.get();
}
/**
* Return the checksum of all data written into the backing buffer for this
* {@link WriteCache} instance since it was last {@link #reset()}.
*
* @return The running checksum of the data written into the backing buffer.
*
* @throws UnsupportedOperationException
* if the {@link WriteCache} is not maintaining this checksum
* (i.e., if <code>isHighlyAvailable := false</code> was
* specified to the constructor).
*/
// package private : exposed to WriteTask.call().
// int getWholeBufferChecksum(final ByteBuffer checksumBuffer) {
//
// final ByteBuffer src = peek().duplicate();
// // flip(limit=pos;pos=0)
// src.flip();
//
// return getWholeBufferChecksum(checksumBuffer, src, false);
//
// }
int getWholeBufferChecksum(final ByteBuffer checksumBuffer, final ByteBuffer src, final boolean isCompressed) {
if (checker == null)
throw new UnsupportedOperationException();
if (isCompressed || prefixWrites) {
/*
* Recalculate whole buffer checksum.
*
* Note: When using prefix writes, we mark deleted records by
* flipping the sign on the fileOffset in the pre-record header.
* This means that we can not use an incrementally computed
* checksum.
*
* Note: With the introduction of HALog compression (compress /
* expand), the target ByteBuffer may be sized for the message
* rather than drawn from a pool. Therefore, the assert has been
* modified such to ensure that the buffer has sufficient capacity
* for the transfer - as defined by limit().
*/
assert checksumBuffer.capacity() >= src.limit() : "b.limit="
+ src.limit() + ", checksumBuffer.capacity="
+ checksumBuffer.capacity();
// assert checksumBuffer.capacity() == src.capacity() : "b.capacity="
// + src.capacity() + ", checksumBuffer.capacity="
// + checksumBuffer.capacity();
// checksumBuffer.limit(checksumBuffer.capacity());
checksumBuffer.limit(src.limit());
checksumBuffer.position(0);
checksumBuffer.put(src);
checksumBuffer.flip();
checker.reset();
checker.checksum(checksumBuffer);
}
return checker.getChecksum();
}
/**
* {@inheritDoc}
*
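* A hedged usage sketch (illustrative only; <code>chk</code> is assumed to
* be the caller's record checksum):
*
* <pre>
* if (!cache.write(fileOffset, data, chk)) {
*     // Not enough room left: flush/rotate this cache and retry, or
*     // write the (large) record directly onto the backing channel.
* }
* </pre>
*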
* @throws IllegalStateException
* If the buffer is closed.
* @throws IllegalArgumentException
* If the caller's record is larger than the maximum capacity of
* cache (the record could not fit within the cache). The caller
* should check for this and provide special handling for such
* large records. For example, they can be written directly onto
* the backing channel.
*/
public boolean write(final long offset, final ByteBuffer data, final int chk) throws InterruptedException {
return write(offset, data, chk, true/* writeChecksum */,0/*latchedAddr*/);
}
/**
*
* @param offset
* @param data
* @param chk
* @param writeChecksum
* The checksum is appended to the record IFF this argument is
* <code>true</code> and checksums are in use.
* @return <code>true</code> iff the record was accepted, and
* <code>false</code> iff there is not enough room left in the
* cache for that record.
* @throws InterruptedException
*/
boolean write(final long offset, final ByteBuffer data, final int chk, boolean writeChecksum, final int latchedAddr)
throws InterruptedException {
// Note: The offset MAY be zero. This allows for stores without any
// header block.
assert !m_closedForWrites;
if (m_written) { // should be clean, NO WAY should this be written to!
log.error("Writing to CLEAN cache: " + hashCode());
throw new IllegalStateException("Writing to CLEAN cache: " + hashCode());
}
if (data == null)
throw new IllegalArgumentException(AbstractBufferStrategy.ERR_BUFFER_NULL);
final WriteCacheCounters counters = this.counters.get();
final ByteBuffer tmp = acquire();
try {
final int remaining = data.remaining();
// The #of bytes to transfer into the write cache.
final int datalen = remaining + (writeChecksum && useChecksum ? 4 : 0);
final int nwrite = datalen + (prefixWrites ? SIZEOF_PREFIX_WRITE_METADATA : 0);
if (nwrite > capacity) {
// This is more bytes than the total capacity of the buffer.
throw new IllegalArgumentException(AbstractBufferStrategy.ERR_BUFFER_OVERRUN);
}
if (remaining == 0)
throw new IllegalArgumentException(AbstractBufferStrategy.ERR_BUFFER_EMPTY);
/*
* Note: We need to be synchronized on the ByteBuffer here since
* this operation relies on the position() being stable.
*
* Note: Also see clearAddrMap(long) which is synchronized on the
* acquired ByteBuffer in the same manner to protect it during
* critical sections which have a side effect on the buffer
* position.
*/
final int pos;
synchronized (tmp) {
// the position() at which the record is cached in the buffer.
final int spos = tmp.position();
if (spos + nwrite > capacity) {
/*
* There is not enough room left in the write cache for this
* record.
*/
return false;
}
// add prefix data if required and set data position in buffer
if (prefixWrites) {
tmp.putLong(offset);
tmp.putInt(datalen);
tmp.putInt(latchedAddr);
pos = spos + SIZEOF_PREFIX_WRITE_METADATA;
} else {
pos = spos;
}
tmp.put(data);
/*
* Copy the record into the cache, updating position() as we go.
*
* Note that the checker must be invalidated if a RWCache
* "deletes" an entry by zeroing an address. Hence, the code no
* longer updates the checksum when [prefixWrites:=true].
*/
if (checker != null && !prefixWrites) {
// update the checksum (no side-effects on [data])
final ByteBuffer chkBuf = tmp.asReadOnlyBuffer();
chkBuf.position(spos);
chkBuf.limit(tmp.position());
checker.update(chkBuf);
}
// write checksum - if any
if (writeChecksum && useChecksum) {
tmp.putInt(chk);
if (checker != null && !prefixWrites) {
// update the running checksum to include this too.
checker.update(chk);
}
}
// set while synchronized since no contention.
firstOffset.compareAndSet(-1L/* expect */, offset/* update */);
// update counters while holding the lock.
counters.naccept++;
counters.bytesAccepted += nwrite;
/*
* Add metadata for the record so it can be read back from the
* cache.
*/
final RecordMetadata md = new RecordMetadata(offset, pos,
datalen, latchedAddr);
if (recordMap.put(Long.valueOf(offset), md) != null) {
/*
* Note: This exception indicates that the abort protocol
* did not reset() the current write cache before new writes
* were laid down onto the buffer.
*/
throw new AssertionError(
"Record exists for offset in cache: offset="
+ offset);
}
if (orderedRecords != null) {
/*
* Note: insert into this collection is guarded by the
* object monitor for the ByteBuffer. This ensures that the
* LinkedList data structure remains coherent when it is
* updated by multiple threads. It also ensures that the
* order of this list is the same as the ordinal position
* order assigned within the ByteBuffer
*
* Note: The real necessary ordering is the allocation
* ordering - any address allocation before another address
* MUST appear in the list before that other address. Since
* some addresses are recycled while others are newly
* allocated the latchedAddr values are not strictly
* ascending.
*/
orderedRecords.add(md);
}
} // synchronized(tmp)
if (log.isTraceEnabled()) { // @todo rather than hashCode() set a
// buffer# on each WriteCache instance.
log.trace("offset=" + offset + ", pos=" + pos + ", nwrite=" + nwrite + ", writeChecksum="
+ writeChecksum + ", useChecksum=" + useChecksum + ", nrecords=" + recordMap.size()
+ ", hashCode=" + hashCode());
}
return true;
} finally {
release();
}
}
/**
* {@inheritDoc}
*
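* A hedged read-through sketch (illustrative only): a <code>null</code>
* return means the record is not in this cache and the caller should fall
* through to the backing channel.
*
* <pre>
* final ByteBuffer rec = cache.read(fileOffset, nbytes);
* if (rec == null) {
*     // miss: read the record from the backing file instead.
* }
* </pre>
*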
* @throws IllegalStateException
* If the buffer is closed.
*/
public ByteBuffer read(final long offset, final int nbytes) throws InterruptedException, ChecksumError {
final WriteCacheCounters counters = this.counters.get();
// takes readLock returning buffer
final ByteBuffer tmp = acquire();
try {
// Look up the metadata for that record in the cache.
final RecordMetadata md;
if ((md = recordMap.get(offset)) == null) {
// The record is not in this write cache.
counters.nmiss.increment();
return null;
}
// length of the record w/o checksum field.
final int reclen = md.recordLength - (useChecksum ? 4 : 0);
// the start of the record in writeCache.
final int pos = md.bufferOffset;
// create a view with same offset, limit and position.
final ByteBuffer view = tmp.duplicate();
// adjust the view to just the record of interest.
view.limit(pos + reclen);
view.position(pos);
// System.out.println("WriteCache, addr: " + offset + ", from: " +
// pos + ", " + md.recordLength + ", thread: " +
// Thread.currentThread().getId());
/*
* Copy the data into a newly allocated buffer. This is necessary
* because our hold on the backing ByteBuffer for the WriteCache is
* only momentary. As soon as we release() the buffer the data in
* the buffer could be changed.
*/
final byte[] b = new byte[reclen];
final ByteBuffer dst = ByteBuffer.wrap(b);
// copy the data into [dst] (and the backing byte[]).
dst.put(view);
// flip buffer for reading.
dst.flip();
if (useChecksum && !(this instanceof ReadCache)) { // don't check if HIRS
final int chk = tmp.getInt(pos + reclen);
if (chk != ChecksumUtility.threadChk.get().checksum(b, 0/* offset */, reclen)) {
// Note: [offset] is a (possibly relative) file offset.
throw new ChecksumError(checkdata());
}
}
counters.nhit.increment();
if (log.isTraceEnabled()) {
log.trace(show(dst, "read bytes"));
}
// Increment cache read count
final int nhits = ++md.hitCount;
if (log.isTraceEnabled()) {
if (nhits > 2) {
log.trace("Cache read ");
}
}
return dst;
} finally {
release();
}
}
/**
* Dump some metadata and leading bytes from the buffer onto a
* {@link String}.
*
* @param buf
* The buffer.
* @param prefix
* A prefix for the dump.
*
* @return The {@link String}.
*/
private String show(final ByteBuffer buf, final String prefix) {
final StringBuffer str = new StringBuffer();
int tpos = buf.position();
if (tpos == 0) {
tpos = buf.limit();
}
str.append(prefix + ", length: " + tpos + " : ");
for (int tb = 0; tb < tpos && tb < 20; tb++) {
str.append(Integer.toString(buf.get(tb)) + ",");
}
// log.trace(str.toString());
return str.toString();
}
// private String show(final byte[] buf, int len, final String prefix) {
// final StringBuffer str = new StringBuffer();
// str.append(prefix + ": ");
// int tpos = len;
// str.append(prefix + ", length: " + tpos + " : ");
// for (int tb = 0; tb < tpos && tb < 20; tb++) {
// str.append(Integer.toString(buf[tb]) + ",");
// }
// // log.trace(str.toString());
// return str.toString();
// }
/**
* Flush the writes to the backing channel but DOES NOT sync the channel and
* DOES NOT {@link #reset()} the {@link WriteCache}. {@link #reset()} is a
* separate operation because a common use is to retain recently flushed
* instances for read-back.
*
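* A hedged sketch (illustrative only): a flushed cache may be retained for
* read-back before it is recycled.
*
* <pre>
* cache.flush(false); // evict buffered writes (force := false)
* // ... the cache continues to service read() hits ...
* cache.reset(); // recycle the cache for new writes
* </pre>
*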
* @param force
* When <code>true</code>, the data will be forced to stable
* media.
*
* @throws IOException
* @throws InterruptedException
*/
public void flush(final boolean force) throws IOException, InterruptedException {
try {
if (!flush(force, Long.MAX_VALUE, TimeUnit.NANOSECONDS)) {
throw new RuntimeException();
}
} catch (TimeoutException e) {
throw new RuntimeException(e);
}
}
/**
* Flush the writes to the backing channel but DOES NOT sync the channel and
* DOES NOT {@link #reset()} the {@link WriteCache}. {@link #reset()} is a
* separate operation because a common use is to retain recently flushed
* instances for read-back.
*
* @param force
* When <code>true</code>, the data will be forced to stable
* media.
*
* @throws IOException
* @throws TimeoutException
* @throws InterruptedException
*/
public boolean flush(final boolean force, final long timeout,
final TimeUnit unit) throws IOException, TimeoutException,
InterruptedException {
if (!m_closedForWrites)
closeForWrites();
// start time
final long begin = System.nanoTime();
// total nanoseconds to wait.
final long nanos = unit.toNanos(timeout);
// remaining nanoseconds to wait.
long remaining = nanos;
// final WriteCacheCounters counters = this.counters.get();
final Lock writeLock = lock.writeLock();
if (!writeLock.tryLock(remaining, TimeUnit.NANOSECONDS)) {
return false;
}
try {
final ByteBuffer tmp = this.buf.get().buffer();
if (tmp == null)
throw new IllegalStateException();
// #of bytes to write on the disk.
final int nbytes = tmp.position();
if (log.isTraceEnabled())
log.trace("nbytes=" + nbytes + ", firstOffset="
+ getFirstOffset());// + ", nflush=" + counters.nflush);
if (nbytes == 0) {
// NOP.
return true;
}
/*
* Create a view with same offset, limit and position.
*
* Note: The writeOnChannel method is given the view. This prevents
* it from adjusting the position() on the backing buffer.
*/
{
final ByteBuffer view = tmp.duplicate();
// adjust the view to just the dirty record.
view.limit(nbytes);
view.position(0);
// remaining := (total - elapsed).
remaining = nanos - (System.nanoTime() - begin);
// write the data on the disk file.
final boolean ret = writeOnChannel(view, getFirstOffset(),
Collections.unmodifiableMap(recordMap), remaining);
if (!ret) {
throw new TimeoutException("Unable to flush WriteCache");
}
// counters.nflush++;
return ret;
}
} finally {
writeLock.unlock();
}
}
/**
* Debug routine logs @ ERROR additional information when a checksum error
* has been encountered.
*
* @return An informative error message.
*
* @throws InterruptedException
* @throws IllegalStateException
*/
private String checkdata() throws IllegalStateException, InterruptedException {
if (!useChecksum) {
return "Unable to check since checksums are not enabled";
}
ByteBuffer tmp = acquire();
try {
int nerrors = 0;
int nrecords = recordMap.size();
for (Entry<Long, RecordMetadata> ent : recordMap.entrySet()) {
final RecordMetadata md = ent.getValue();
// length of the record w/o checksum field.
final int reclen = md.recordLength - 4;
// the start of the record in writeCache.
final int pos = md.bufferOffset;
final int chk = tmp.getInt(pos + reclen);
// create a view with same offset, limit and position.
final ByteBuffer view = tmp.duplicate();
// adjust the view to just the record of interest.
view.limit(pos + reclen);
view.position(pos);
final byte[] b = new byte[reclen];
final ByteBuffer dst = ByteBuffer.wrap(b);
// copy the data into [dst] (and the backing byte[]).
dst.put(view);
if (chk != ChecksumUtility.threadChk.get().checksum(b, 0/* offset */, reclen)) {
log.error("Bad data for address: " + ent.getKey());
nerrors++;
}
}
return "WriteCache checkdata - records: " + nrecords + ", errors: " + nerrors;
} finally {
release();
}
}
/**
* Write the data from the buffer onto the channel. This method provides a
* uniform means to request that the buffer write itself onto the backing
* channel, regardless of whether the channel is backed by a file, a socket,
* etc.
*
* Implementations of this method MAY support gathered writes, depending on
* the channel. The necessary information to perform a gathered write is
* present in the recordMap. On the other hand, the implementation
* MAY require that the records in the cache are laid out for a WORM, in
* which case {@link #getFirstOffset()} provides the starting offset for the
* data to be written. The application MUST coordinate the requirements for
* a R/W or WORM store with the use of the {@link WriteCache} and the means
* to write on the backing channel.
*
* @param buf
* The data to be written. Only the dirty bytes are visible in
* this view. The implementation should write all bytes from the
* current position to the limit.
* @param firstOffset
* The offset of the first record in the recordMap into the file
* (may be relative to a base offset within the file). This is
* provided as an optimization for the WORM which writes its
* records contiguously on the backing store.
* @param recordMap
* The mapping of record offsets onto metadata about those
* records.
* @param nanos
* The timeout for the operation in nanoseconds.
*
* @return <code>true</code> if the operation was completed successfully
* within the time allotted.
*
* @throws InterruptedException
* if the thread was interrupted.
* @throws IOException
* if there was an IO problem.
*/
abstract protected boolean writeOnChannel(final ByteBuffer buf, final long firstOffset,
final Map<Long, RecordMetadata> recordMap, final long nanos) throws InterruptedException, TimeoutException,
IOException;
/**
* {@inheritDoc}.
*
* This implementation clears the buffer, the record map, and other internal
* metadata such that the {@link WriteCache} is prepared to receive new
* writes.
*
* @throws IllegalStateException
* if the write cache is closed.
*/
public void reset() throws InterruptedException {
final Lock writeLock = lock.writeLock();
writeLock.lockInterruptibly();
try {
// // wait until there are no readers using the buffer.
// latch.await();
final ByteBuffer tmp = buf.get().buffer();
if (tmp == null) {
// Already closed.
throw new IllegalStateException();
}
// reset all state.
_resetState(tmp);
} finally {
writeLock.unlock();
}
}
/**
* Permanently take the {@link WriteCache} instance out of service. If the
* buffer was allocated by the {@link WriteCache} then it is released back
* to the {@link DirectBufferPool}. After this method is called, records can
* no longer be read from nor written onto the {@link WriteCache}. It is
* safe to invoke this method more than once.
*
* Concurrent {@link #read(long, int)} requests will be serviced if they
* already hold the read lock, but requests will fail once the buffer
* reference has been cleared.
*
* @throws InterruptedException
*/
public void close() throws InterruptedException {
final Lock writeLock = lock.writeLock();
writeLock.lockInterruptibly();
try {
// // wait until there are no readers using the buffer.
// latch.await();
/*
* Note: This method is thread safe. Only one thread will manage to
* clear the AtomicReference and it will do the rest of the work as
* well.
*/
// position := 0; limit := capacity.
final IBufferAccess tmp = buf.get();
if (tmp == null) {
// Already closed.
return;
}
if (buf.compareAndSet(tmp/* expected */, null/* update */)) {
try {
_resetState(tmp.buffer());
} finally {
if (releaseBuffer) {
tmp.release();
}
}
}
} finally {
writeLock.unlock();
}
}
/**
* Reset the internal state of the {@link WriteCache} in preparation to
* reuse it to receive more writes.
*
* Note: Keep private unless strong need for override since you can not call
* this method without holding the write lock
*
* @param tmp
*/
private void _resetState(final ByteBuffer tmp) {
if (tmp == null)
throw new IllegalArgumentException();
if (!lock.writeLock().isHeldByCurrentThread()) {
// The caller must be holding the write lock.
throw new IllegalMonitorStateException();
}
// clear the index since all records were flushed to disk.
if (!recordMap.isEmpty())
recordMap.clear();
if (orderedRecords != null) {
synchronized (tmp) {
orderedRecords.clear();
}
}
// clear to well known invalid offset.
firstOffset.set(-1L);
// position := 0; limit := capacity.
tmp.clear();
if (checker != null) {
// reset the running checksum of the data written onto the backing
// buffer.
checker.reset();
}
// Martyn: I moved your debug flag here so it is always cleared by
// reset().
m_written = false;
m_closedForWrites = false;
m_removed = 0;
// leave to WCS to manage referenceCount for cache
m_referenceCount.set(0);
}
// /**
// * Return the RMI message object that will accompany the payload from the
// * {@link WriteCache} when it is replicated along the write pipeline.
// *
// * @return cache A {@link WriteCache} to be replicated.
// */
// final IHAWriteMessage newHAWriteMessage(//
// final UUID storeUUID,
// final long quorumToken,
// final long lastCommitCounter,//
// final long lastCommitTime,//
// final long sequence,
// final ByteBuffer tmp
// ) {
//
// return new HAWriteMessage(//
// storeUUID,//
// lastCommitCounter,//
// lastCommitTime,//
// sequence, //
// bytesWritten(), getWholeBufferChecksum(tmp),
// prefixWrites ? StoreTypeEnum.RW : StoreTypeEnum.WORM,
// quorumToken, fileExtent.get(), firstOffset.get());
//
// }
/**
* Used to retrieve the {@link HAWriteMessage} AND the associated
* {@link ByteBuffer}.
*
* This allows the {@link WriteCache} to compress the data and create the
* correct {@link HAWriteMessage}.
*/
static public class HAPackage {
/**
* The message as it will be sent.
*/
private final IHAWriteMessage m_msg;
/**
* The data as it will be sent, with compression already applied if
* compression will be used.
*/
private final ByteBuffer m_data;
/**
*
* @param msg
* The message as it will be sent.
* @param data
* The data as it will be sent, with compression already
* applied if compression will be used.
*/
HAPackage(final IHAWriteMessage msg, final ByteBuffer data) {
m_msg = msg;
m_data = data;
}
public IHAWriteMessage getMessage() {
return m_msg;
}
public ByteBuffer getData() {
return m_data;
}
}
/**
* Return the optional key for the {@link CompressorRegistry} which
* identifies the {@link IRecordCompressor} to be applied.
*/
protected String getCompressorKey() {
// Default is NO compression.
return null;
}
/**
* Return the RMI message object plus the payload (the payload has been
* optionally compressed, depending on the configuration).
*/
final HAPackage newHAPackage(//
final UUID storeUUID,//
final long quorumToken,//
final long lastCommitCounter,//
final long lastCommitTime,//
final long sequence,//
final int replicationFactor,//
final ByteBuffer checksumBuffer
) {
final ByteBuffer b = peek().duplicate();
b.flip();
final ByteBuffer send;
final String compressorKey = getCompressorKey();
final IRecordCompressor compressor = CompressorRegistry.getInstance()
.get(compressorKey);
if (compressor != null) {
// Compress current buffer
send = compressor.compress(b);
} else {
send = b;
}
final int chksum = getWholeBufferChecksum(checksumBuffer, send.duplicate(), b != send /*isCompressed*/);
final HAWriteMessage msg = new HAWriteMessage(//
storeUUID,//
lastCommitCounter,//
lastCommitTime,//
sequence, //
send.limit(), chksum,
prefixWrites ? StoreTypeEnum.RW : StoreTypeEnum.WORM,
quorumToken, replicationFactor,
fileExtent.get(), firstOffset.get(),
compressorKey);
if (log.isTraceEnabled()) {
log.trace("Original buffer: " + b.limit() + ", final buffer: " + send.limit() + ", compressorKey: " + compressorKey + ", checksum: " + chksum);
}
return new HAPackage(msg, send);
}
/**
* The current performance counters.
*/
protected final AtomicReference<WriteCacheCounters> counters = new AtomicReference<WriteCacheCounters>(
new WriteCacheCounters());
/**
* Stores the number of bytes removed from this {@link WriteCache}.
*
* This can be used to determine whether the {@link WriteCache} should be
* flushed to disk or compacted to an aggregation buffer, avoiding writes
* and maximizing the chance of a read cache hit.
*
* Note: <code>volatile</code> since not guarded by any lock.
*/
private volatile int m_removed;
/**
* Sets the performance counters to be used by the write cache. A service
* should do this if you want to aggregate the performance counters across
* multiple {@link WriteCache} instances.
*
* @param newVal
* The shared performance counters.
*
* @throws IllegalArgumentException
* if the argument is <code>null</code>.
*/
void setCounters(final WriteCacheCounters newVal) {
if (newVal == null)
throw new IllegalArgumentException();
this.counters.set(newVal);
}
/**
* Return the performance counters for the write cache.
*/
public CounterSet getCounters() {
return counters.get().getCounters();
}
/**
* A {@link WriteCache} implementation suitable for an append-only file such
* as the {@link WORMStrategy} or the output file of the
* {@link IndexSegmentBuilder}.
*
* @author Bryan Thompson
*/
public static class FileChannelWriteCache extends WriteCache {
/**
* An offset which will be applied to each record written onto the
* backing {@link FileChannel}. The offset is generally the size of the
* root blocks for a journal or the checkpoint record for an index
* segment. It can be zero if you do not have anything at the head of
* the file.
*
* Note: This implies that writing the root blocks is done separately in
* the protocol since you can't write below this offset otherwise.
*/
final protected long baseOffset;
/**
* Used to re-open the {@link FileChannel} in this class.
*/
public final IReopenChannel<FileChannel> opener;
/**
* @param baseOffset
* The offset applied to each record written onto the backing
* {@link FileChannel}.
* @param buf
* @param opener
*
* @throws InterruptedException
*/
public FileChannelWriteCache(final long baseOffset,
final IBufferAccess buf, final boolean useChecksum,
final boolean isHighlyAvailable, final boolean bufferHasData,
final IReopenChannel<FileChannel> opener,
final long fileExtent)
throws InterruptedException {
super(buf, false/* scatteredWrites */, useChecksum,
isHighlyAvailable, bufferHasData, fileExtent);
if (baseOffset < 0)
throw new IllegalArgumentException();
if (opener == null)
throw new IllegalArgumentException();
this.baseOffset = baseOffset;
this.opener = opener;
}
@Override
protected boolean writeOnChannel(final ByteBuffer data,
final long firstOffset,
final Map<Long, RecordMetadata> recordMap, final long nanos)
throws InterruptedException, IOException {
final long begin = System.nanoTime();
final int nbytes = data.remaining();
/*
* The position in the file at which the record will be written.
*/
final long pos = baseOffset + firstOffset;
/*
* Write bytes in [data] from position to limit onto the channel.
*
* @todo This ignores the timeout.
*/
final int nwrites = FileChannelUtility.writeAll(opener, data, pos);
final WriteCacheCounters counters = this.counters.get();
counters.nchannelWrite += nwrites;
counters.bytesWritten += nbytes;
counters.elapsedWriteNanos += (System.nanoTime() - begin);
return true;
}
}
/**
* The scattered write cache is used by the {@link RWStore} since the writes
* can be made to any part of the file assigned for data allocation.
*
* The writeOnChannel must therefore utilize the {@link RecordMetadata} to
* write each update separately.
*
* To support HA, we prefix each write with the file position and buffer
* length in the cache. This enables the cache buffer to be sent as a single
* stream and the RecordMap rebuilt downstream.
*/
public static class FileChannelScatteredWriteCache extends WriteCache {
/**
* Used to re-open the {@link FileChannel} in this class.
*/
private final IReopenChannel<FileChannel> opener;
private final BufferedWrite m_bufferedWrite;
/**
* @param buf
* @param opener
*
* @throws InterruptedException
*/
public FileChannelScatteredWriteCache(final IBufferAccess buf,
final boolean useChecksum, final boolean isHighlyAvailable,
final boolean bufferHasData,
final IReopenChannel<FileChannel> opener,
final long fileExtent, final BufferedWrite bufferedWrite)
throws InterruptedException {
super(buf, true/* scatteredWrites */, useChecksum,
isHighlyAvailable, bufferHasData, fileExtent);
if (opener == null)
throw new IllegalArgumentException();
this.opener = opener;
m_bufferedWrite = bufferedWrite;
}
/**
* Called by WriteCacheService to process a direct write for large
* blocks and also to flush data from dirty caches.
*
* TODO The [nanos] parameter is ignored.
*/
@Override
protected boolean writeOnChannel(final ByteBuffer data,
final long firstOffsetIgnored,
final Map<Long, RecordMetadata> recordMap, final long nanos)
throws InterruptedException, IOException {
final long begin = System.nanoTime();
final int nbytes = data.remaining();
if (m_written) {
log.warn("DUPLICATE writeOnChannel for : " + this.hashCode());
} else {
// Can be empty if reset!
// assert !this.isEmpty();
m_written = true;
}
/*
* Retrieve the sorted write iterator and write each block to the
* file.
*
* If there is a BufferedWrite then ensure it is reset.
*/
if (m_bufferedWrite != null) {
m_bufferedWrite.reset();
}
int nwrites = 0;
final Iterator<Entry<Long, RecordMetadata>> entries = recordMap.entrySet().iterator();
while (entries.hasNext()) {
final Entry<Long, RecordMetadata> entry = entries.next();
final RecordMetadata md = entry.getValue();
// create a view on record of interest.
final ByteBuffer view = data.duplicate();
final int pos = md.bufferOffset;
view.limit(pos + md.recordLength);
view.position(pos);
final long offset = entry.getKey(); // offset in file to update
if (m_bufferedWrite == null) {
nwrites += FileChannelUtility.writeAll(opener, view, offset);
} else {
nwrites += m_bufferedWrite.write(offset, view, opener);
}
// if (log.isInfoEnabled())
// log.info("writing to: " + offset);
registerWriteStatus(offset, md.recordLength, 'W');
}
if (m_bufferedWrite != null) {
nwrites += m_bufferedWrite.flush(opener);
if (log.isTraceEnabled())
log.trace(m_bufferedWrite.getStats(null, true));
}
final WriteCacheCounters counters = this.counters.get();
counters.nchannelWrite += nwrites;
counters.bytesWritten += nbytes;
counters.elapsedWriteNanos += (System.nanoTime() - begin);
if (log.isTraceEnabled())
log.trace("WRITTEN ON CHANNEL");
return true;
}
/**
* Hook to rebuild {@link RecordMetadata} after buffer has been
* transferred. For the {@link FileChannelScatteredWriteCache} this
* means hopping through the buffer, marking offsets and data sizes into
* the {@link RecordMetadata} map, and ignoring any zero address entries
* that indicate a "freed" allocation.
*
* Update: This has now been changed to avoid problems with incremental
* checksums by indicating removal by appending a new prefix where the
* data length is zero.
*
* @throws InterruptedException
*/
@Override
public void resetRecordMapFromBuffer(final ByteBuffer buf,
final Map<Long, RecordMetadata> recordMap) {
recordMap.clear();
final int limit = buf.limit(); // end position.
int pos = buf.position(); // start position
// log.trace("position: " + pos + ", limit: " + limit);
while (pos < limit) {
buf.position(pos);
// 8 bytes (negative iff record is deleted)
final long fileOffset = buf.getLong();
assert fileOffset != 0L;
// 4 bytes (negative iff no data follows)
final int recordLength = buf.getInt();
assert recordLength != 0;
// 4 bytes
final int latchedAddr = buf.getInt();
// log.trace("Record fileOffset: " + fileOffset + ", length: " + recordLength + ", latchedAddr: " + latchedAddr);
// if (sze == 0 /* old style deleted */) {
// /*
// * Should only happen if a previous write was already made
// * to the buffer but the allocation has since been freed.
// */
// recordMap.remove(addr);
// removeAddress(latchedAddr);
if (fileOffset < 0 /* new style deleted */) {
if (recordMap.get(fileOffset) != null) {
// Should have been removed already.
throw new AssertionError();
}
/*
* Make sure that the address is declared. This covers the
* case where a record is allocated and then recycled before
* the WriteCache in which it was recorded is evicted from
* the dirtyList. This can happen when we are not
* compacting, as well as when we are compacting.
*
* Note: RWS will interpret a -recordLength as notification
* of the existence of an allocator for that address but
* will not create an actual allocation for that address at
* this time.
*/
// Ensure allocator exists (allocation may or may not be
// created).
addAddress(latchedAddr, recordLength);
if (recordLength > 0) {
// Delete allocation.
removeAddress(latchedAddr);
}
} else {
/*
* Note: Do not enter things into [orderedRecords] on the
* follower.
*/
if (recordLength < 0) {
/*
* Notice of allocation.
*
* Note: recordLength is always negative for this code
* path. The RWS will interpret the -recordLength as
* notification of the existence of an allocator for
* that address but will not create an actual allocation
* for that address at this time.
*/
addAddress(latchedAddr, recordLength);
} else {
/*
* Actual allocation with data.
*/
final RecordMetadata md = new RecordMetadata(
fileOffset, pos + SIZEOF_PREFIX_WRITE_METADATA,
recordLength, latchedAddr);
recordMap.put(fileOffset, md);
addAddress(latchedAddr, recordLength);
}
}
// skip header (addr + sze + latchedAddr) and data (if any)
pos += (SIZEOF_PREFIX_WRITE_METADATA + (recordLength > 0 ? recordLength
: 0));
}
}
/**
* A record add has been decoded.
*
* @param latchedAddr
* The latched address.
* @param size
* The size of the allocation in bytes.
*/
protected void addAddress(int latchedAddr, int size) {}
/**
* A record delete has been decoded.
*
* @param latchedAddr
* The latched address.
*/
protected void removeAddress(int latchedAddr) {}
} // class FileChannelScatteredWriteCache
public static class ReadCache extends WriteCache {
public ReadCache(IBufferAccess buf) throws InterruptedException {
super(buf, false/* prefixWrites */, true/* useChecksum */,
false/* isHighlyAvailable */, false/* bufferHasData */, 0/* fileExtent */);
}
@Override
protected boolean writeOnChannel(ByteBuffer buf, long firstOffset,
Map<Long, RecordMetadata> recordMap, long nanos)
throws InterruptedException, TimeoutException, IOException {
throw new UnsupportedOperationException();
}
/**
* Override clearAddrMap for read cache to always remove from the record map.
*/
@Override
/* public */boolean clearAddrMap(final long addr, final int latchedAddr)
throws IllegalStateException, InterruptedException {
// Remove record from this cache.
final RecordMetadata removed = recordMap.remove(addr);
// might be null if concurrent transfer has taken place
return removed != null;
}
/**
* ReadCache is always closedForWrites
*/
@Override
public boolean isClosedForWrites() {
return true;
}
@Override
public void closeForWrites() {
throw new UnsupportedOperationException();
}
@Override
boolean write(final long offset, final ByteBuffer data, final int chk,
boolean writeChecksum, final int latchedAddr)
throws InterruptedException {
throw new UnsupportedOperationException();
}
/**
* Called from WCS when moving from hotList to ReadList.
*
* The hit counts must be reset or the full cache will always
* be copied.
*
* @return this ReadCache
*/
ReadCache resetHitCounts() {
final Iterator<RecordMetadata> mds = recordMap.values().iterator();
while (mds.hasNext()) {
mds.next().hitCount = 0;
}
return this;
}
}
/**
* To support deletion we remove any entries for the provided address.
* This is just to yank something out of the cache which was created and
* then immediately deleted on the RW store before it could be written
* through to the disk. It does not reclaim any space in the write cache,
* since allocations are strictly sequential within the cache, and it can
* only be used with the RW store. The RW store uses write prefixes in the
* cache buffer, so we must zero the long address element as well to
* indicate that the record was removed from the buffer.
*
* This approach has now been refined to avoid problems with incremental
* checksums which would otherwise invalidate the buffer checksum to
* date. Rather than zeroing the address of the deleted block, a new
* zero-length prefix is written that, when processed, will ensure any
* current recordMap entry is removed.
*
* TODO: An issue to be worked through is whether there remains a problem
* with a full buffer where there is not room for the dummy "remove"
* prefix. Whilst we could of course ensure that a buffer with less than
* the space required for prefix writes is moved immediately to the
* dirtyList, there would still exist the possibility that the clear
* could be requested on a buffer already on the dirtyList. It looks like
* this should not matter, since each buffer update can be considered as
* an atomic update even if the set of writes are individually not atomic
* (the updates from a previous buffer will always have been completed
* before the next buffer is processed).
*
* In that case it appears we could ignore the situation where there is no
* room for the dummy "remove" prefix, since there will be no room for a new
* write also and the buffer will be flushed either on commit or a
* subsequent write.
*
* A problem previously existed with unsynchronized access to the
* ByteBuffer, resulting in conflicts over the position() and buffer
* corruption.
*
* If the WriteCache is closed then it must not be modified at all otherwise
* any HA replication will not be binary compatible.
*
* @param addr
* The address of a cache entry.
*
* @throws InterruptedException
* @throws IllegalStateException
*/
/* public */boolean clearAddrMap(final long addr, final int latchedAddr)
throws IllegalStateException, InterruptedException {
// Note: Invoked from unit test w/o lock.
// if (!transferLock.isHeldByCurrentThread())
// throw new IllegalMonitorStateException();
/*
* Note: acquire() is mutex with the writeLock. clearAddrMap() will take
* the writeLock in order to ensure that this operation is atomic with
* respect to closeForWrites().
*/
final ByteBuffer tmp = acquire();
try {
if (m_closedForWrites) {
/*
* Neither the buffer nor the record map may be modified. The
* WriteCacheService is in the process of writing this buffer to
* the disk and replicating it to the downstream nodes (HA).
*
* The record exists, but we can not remove it. Return false.
*/
return false;
}
// Remove record from this cache.
final RecordMetadata removed = recordMap.remove(addr);
if (removed == null) {
/*
* The record must be present.
*
* The buffer is not closed for writes, but the record has moved.
* Perhaps it was compacted into another cache?
*/
throw new AssertionError();
}
removed.deleted = true;
if (!prefixWrites) {
/*
* We will not record a deleted record. We are not in HA mode.
*/
m_removed += removed.recordLength;
return true;
}
// overwrite buffer, changing file offset to negative
final int addr_offset = removed.bufferOffset
- SIZEOF_PREFIX_WRITE_METADATA;
tmp.putLong(addr_offset, -removed.fileOffset); // abs write.
/*
* Fix up the debug flag when last address is cleared.
*/
if (m_written && recordMap.isEmpty()) {
m_written = false;
}
m_removed += removed.recordLength;
return true;
} finally {
release();
}
}
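/*
 * Illustrative sketch (not part of the production code path): in
 * prefixWrites mode each record is preceded by a {fileOffset:long,
 * recordLength:int, latchedAddr:int} header, and clearAddrMap() marks a
 * delete by negating the fileOffset in the record's prefix (the "abs
 * write" above); the decode side treats a negative offset as a delete
 * marker. The buffer and values below are hypothetical.
 */
private static void examplePrefixDeleteEncoding() {
final ByteBuffer b = ByteBuffer.allocate(SIZEOF_PREFIX_WRITE_METADATA);
b.putLong(0, 1024L); // fileOffset of a live record.
b.putInt(8, 100); // recordLength (includes the checksum suffix).
b.putInt(12, 7); // latchedAddr.
// Mark the record as deleted: overwrite only the fileOffset with its
// negation, leaving recordLength and latchedAddr untouched.
b.putLong(0, -b.getLong(0));
assert b.getLong(0) < 0; // the decode side treats offset<0 as a delete.
}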
protected void registerWriteStatus(long offset, int length, char action) {
// NOP to be overridden for debug if required
}
boolean m_written = false;
private long lastOffset;
/**
* Called to clear the WriteCacheService map of references to this
* WriteCache.
*
* @param serviceRecordMap
* the map of the WriteCacheService that associates an address
* with a WriteCache
* @throws InterruptedException
*/
// * @param fileExtent
// * the current extent of the backing file.
void resetWith(final ConcurrentMap<Long, WriteCache> serviceRecordMap
// final long fileExtentIsIgnored
) throws InterruptedException {
final Iterator<Long> entries = recordMap.keySet().iterator();
if (serviceRecordMap != null && entries.hasNext()) {
transferLock.lock();
try {
if (log.isInfoEnabled())
log.info("resetting existing WriteCache: nrecords=" + recordMap.size() + ", hashCode=" + hashCode());
while (entries.hasNext()) {
final Long fileOffset = entries.next();
/*
* We need to guard against the possibility that the entry in
* the service record map has been updated concurrently such
* that it now points to a different WriteCache instance. This
* is possible (for the RWStore) if a recently freed record has
* been subsequently reallocated on a different WriteCache.
* Using the conditional remove on ConcurrentMap guards against
* this.
*/
final boolean removed = serviceRecordMap.remove(fileOffset, this);
registerWriteStatus(fileOffset, 0, removed ? 'R' : 'L');
}
} finally {
transferLock.unlock();
}
} else {
if (log.isInfoEnabled()) {
// debug to see recycling
log.info("clean WriteCache: hashCode=" + hashCode());
}
// cache is written but also transferred to the readCache
// if (m_written) {
// log.warn("Written WriteCache but with no records");
// }
}
reset(); // must ensure reset state even if cache already empty
// setFileExtent(fileExtent);
}
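/*
 * Minimal sketch of the conditional-remove guard used by resetWith()
 * (the names here are hypothetical). ConcurrentMap.remove(key, value)
 * removes the entry only while it still maps to the expected
 * WriteCache, so an offset concurrently re-pointed at a different
 * WriteCache by a re-allocation is left untouched.
 */
private static void exampleConditionalRemove(
final ConcurrentMap<Long, WriteCache> map, final WriteCache self,
final Long fileOffset) {
final boolean removed = map.remove(fileOffset, self);
// removed == false iff the entry was absent or now maps to another
// WriteCache; either way [self] must not clear it.
assert removed || map.get(fileOffset) != self;
}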
public void setRecordMap(Collection<RecordMetadata> map) {
throw new RuntimeException("setRecordMap NotImplemented");
}
/**
* Checksum helper computes the running checksum from a series of
* {@link ByteBuffer}s and <code>int</code> checksum values as written
* onto the backing byte buffer for a {@link WriteCache} instance.
*/
private static class ChecksumHelper extends ChecksumUtility {
// /**
// * Private helper object.
// */
// private final Adler32 chk = new Adler32();
/**
* A private buffer used to format the per-record checksums when they
* need to be combined with the records written onto the write cache for
* a total checksum over the write cache contents.
*/
final private ByteBuffer chkbuf = ByteBuffer.allocate(4);
/**
* Update the running checksum to reflect the 4 byte integer.
*
* @param v
* The integer.
*/
public void update(final int v) {
chkbuf.clear();
chkbuf.putInt(v);
chk.update(chkbuf.array(), 0/* off */, 4/* len */);
}
public int getChecksum() {
return super.getChecksum();
}
public void reset() {
super.reset();
}
public void update(final ByteBuffer buf) {
super.update(buf);
}
// /**
// * Update the {@link Adler32} checksum from the data in the buffer.
// The
// * position, mark, and limit are unchanged by this operation. The
// * operation is optimized when the buffer is backed by an array.
// *
// * @param buf
// * The buffer.
// *
// * @return The checksum.
// */
// public void update(final ByteBuffer buf) {
// assert buf != null;
//
// final int pos = buf.position();
// final int limit = buf.limit();
//
// assert pos >= 0;
// assert limit > pos;
//
// if (buf.hasArray()) {
//
// /*
// * Optimized when the buffer is backed by an array.
// */
//
// final byte[] bytes = buf.array();
//
// final int len = limit - pos;
//
// if (pos > bytes.length - len) {
//
// throw new BufferUnderflowException();
//
// }
//
// chk.update(bytes, pos + buf.arrayOffset(), len);
//
// } else {
//
// for (int i = pos; i < limit; i++) {
//
// chk.update(buf.get(i));
//
// }
//
// }
//
// }
}
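/*
 * Illustrative use of the ChecksumHelper (the record and checksum are
 * hypothetical): the running checksum folds in each record's bytes and
 * then its 4-byte int checksum, mirroring the order in which those
 * bytes are laid down on the backing buffer.
 */
private static int exampleRunningChecksum(final ByteBuffer record,
final int recordChecksum) {
final ChecksumHelper helper = new ChecksumHelper();
helper.update(record); // the record's bytes.
helper.update(recordChecksum); // then its 4-byte checksum suffix.
return helper.getChecksum();
}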
/**
* Used by the {@link HAWriteMessage} to retrieve the nextOffset as
* implied by the recordMap.
*
* @return the last offset value.
*/
public long getLastOffset() {
return lastOffset;
}
/**
* Hook to rebuild RecordMetadata after buffer has been transferred. For the
* default {@link WriteCache} this is a single entry using firstOffset and
* current position. For scattered writes, it uses a map with the addr,
* size, and data inlined.
*
* @see FileChannelScatteredWriteCache
*
* @throws InterruptedException
*/
public void resetRecordMapFromBuffer() throws InterruptedException {
final Lock writeLock = lock.writeLock();
writeLock.lockInterruptibly();
try {
resetRecordMapFromBuffer(buf.get().buffer().duplicate(), recordMap);
} finally {
writeLock.unlock();
}
}
/**
* Low-level routine copies the data from the caller's buffer into this
* buffer.
*
* @param bin
* The caller's buffer.
*
* @throws InterruptedException
*/
void copyRawBuffer(final ByteBuffer bin) throws InterruptedException {
final Lock writeLock = lock.writeLock();
writeLock.lockInterruptibly();
try {
final ByteBuffer buf = this.buf.get().buffer();
/*
* Copy the data from the caller's buffer into our own.
*
* Note: We receive the buffer with pos=0, limit=#of bytes written. As
* a post-condition, pos will be advanced to the limit.
*/
buf.limit(bin.limit());
buf.position(0);
buf.put(bin); // copy the caller's buffer.
/*
* Rebuild the record map
*
* Note: rebuild reads from position to limit, advancing the
* position.
*
* Note: flush() expects pos=limit, so we are good to go after we
* rebuild the record map.
*/
buf.position(0); // reset the position.
resetRecordMapFromBuffer(buf, recordMap);
buf.position(buf.limit());
} finally {
writeLock.unlock();
}
}
/**
* Transfers records from the source {@link WriteCache} to the
* destination {@link WriteCache}, updating the record map in the
* {@link WriteCacheService} as it goes.
*
* This method handles prefixWrites and useChecksum to transfer the
* correct bytes for the associated {@link RecordMetadata}.
*
* @param src
*            The source {@link WriteCache}.
* @param dst
*            The destination {@link WriteCache} into which the records
*            are transferred.
* @param serviceRecordMap
*            The {@link WriteCacheService}'s record map.
* @param threshold
*            The hitCount at or above which a record is transferred.
*
* @return true if the transfer is complete, or false if the
*         destination runs out of room.
*
* @throws InterruptedException
*/
// package private
static boolean transferTo(final WriteCache src, final WriteCache dst,
final ConcurrentMap<Long, WriteCache> serviceRecordMap, final int threshold)
throws InterruptedException {
if (src == null)
throw new IllegalArgumentException();
if (dst == null)
throw new IllegalArgumentException();
if (src == dst)
throw new IllegalArgumentException();
// FIXME: check assumptions for transferTo, closedForWrites vs ReadCache etc
// if (src.m_closedForWrites) {
// // the source buffer must not be closed for writes.
// throw new IllegalStateException();
// }
// if (dst.m_closedForWrites) {
// // the dst buffer must not be closed for writes.
// throw new IllegalStateException();
// }
/*
* Note: This method is only invoked during critical code in
* WriteTask.call(). No other thread can write on [src] (because it is
* closed for writes) and no other thread can write on [dst] (because it
* is not visible to any other thread). Therefore we DO NOT need a lock
* here to prevent concurrent writers on [src] or [dst].
*
* However, there is a problem where concurrent clears of an addr are
* permitted. Those clears are not able to affect [src] since it is
* closed for writes. But they are also not being applied to the [dst]
* buffer since the serviceRecordMap is incrementally updated as we copy
* the records from [src] to [dst].
*
* Therefore, WriteCacheService.clearWrite() must be MUTEX with this
* method for the scope of the [src] buffer or we can lose that
* clearWrite(). I suggest that we add a distinct lock for this purpose
* so the only contention for WriteCacheService.clearWrite() is with
* transferTo() for the buffer that is currently being compacted (the
* [src] buffer).
*
* Note: For HA, replicated WriteCache buffers MUST set closeForWrites()
* to ensure that compaction DOES NOT take place on the followers!!!
*/
final Lock lock = src.transferLock;
lock.lock();
try {
/*
* Use a lock to ensure that the current state of the ByteBuffer is
* visible to this thread. This can be the readLock because (a) we
* are only reading on [src]; and (b) this method is invoked from a
* critical section in WriteTask.call(), writes are closed on [src],
* and nobody can modify [src] (it could also be the writeLock, but
* it does not really matter which we use here as far as I can tell
* since both closedForWrites() and the critical section are
* guaranteeing that no writes will be applied to [src]).
*/
final ByteBuffer bb = src.acquire().duplicate();
ByteBuffer dd = null;
// final int srcSize = src.recordMap.size();
// int notTransferred = 0;
// int transferred = 0;
try {
// Setup destination
dd = dst.acquire();
// Note: md.recordLength includes the checksum (suffix)
// check *destination* for prefixWrites - which will be zero for ReadCache
final int prefixlen = dst.prefixWrites ? SIZEOF_PREFIX_WRITE_METADATA : 0;
final Set<Entry<Long, RecordMetadata>> es = src.recordMap.entrySet();
final Iterator<Entry<Long, RecordMetadata>> entries = es.iterator();
while (entries.hasNext()) {
final Entry<Long, RecordMetadata> entry = entries.next();
final long fileOffset = entry.getKey(); // file offset.
final RecordMetadata md = entry.getValue();
if (serviceRecordMap != null) {
final WriteCache tmp = serviceRecordMap.get(fileOffset);
if (tmp != src) {
assert !(tmp instanceof ReadCache);
entries.remove();
continue;
// throw new AssertionError(
// "Record not owned by this cache: src="
// + src + ", owner=" + tmp
// + ", offset=" + fileOffset + ", md="
// + md);
}
}
assert !md.deleted; // not deleted (deleted entries should not be in the recordMap).
// Only copy records whose hitCount is >= the threshold.
if (md.hitCount < threshold) {
// notTransferred++;
// Guard: [serviceRecordMap] may be null (e.g., unit tests).
if (serviceRecordMap != null)
serviceRecordMap.remove(fileOffset);
} else {
final int len = prefixlen + md.recordLength;
final int dstremaining = dst.remaining();
if (len > dstremaining) {
// Not enough room in destination for this record.
if (dstremaining >= 512) {
// Destination still has room, keep looking.
// notTransferred++;
continue;
}
// Destination is full (or full enough).
return false;
}
// final ByteBuffer dup = bb;//bb.duplicate(); (dup'd
// above).
final int pos = md.bufferOffset - prefixlen;// include
// prefix
final int limit = pos + len; // and any postfix
final int dstoff; // offset in the destination buffer.
synchronized (bb) {
bb.limit(limit);
bb.position(pos);
// dst.writeRaw(fileOffset, dup, md.latchedAddr);
// Copy to destination.
synchronized (dd) {
dstoff = dd.position() + prefixlen;
dd.put(bb);
// transferred++;
assert dst.remaining() == (dstremaining - len) : "dst.remaining(): "
+ dst.remaining()
+ " expected: "
+ dstremaining;
}
}
/*
* Insert record into destination.
*
* Note: The [orderedList] on the target buffer is not
* updated because we handle the propagation of the
* address allocation/clear notices separately and
* synchronously using prepareAddressMetadataForHA().
*/
{
final RecordMetadata old = dst.recordMap.put(Long
.valueOf(fileOffset), new RecordMetadata(
fileOffset, dstoff/* bufferOffset */,
md.recordLength, md.latchedAddr));
assert old == null : "Write already found: " + old;
}
if (serviceRecordMap != null) {
/*
* Note: As soon as we update the service record map
* it is possible that
* WriteCacheService.clearWrite() will clear the
* record from [dst]. We can not rely on the record
* remaining in [dst] after this method call!
*/
final WriteCache tmp = serviceRecordMap.put(
fileOffset, dst);
assert src == tmp : "tmp=" + tmp + ",src=" + src
+ ", offset=" + fileOffset + ", md=" + md;
}
}
// Clear entry from src recordMap.
entries.remove();
}
// true iff all records were transferred out.
final boolean isEmpty = src.recordMap.isEmpty();
return isEmpty;
} finally {
try {
// FIXME: check assumptions re closedForWrites and ReadCache
// if (src.m_closedForWrites) {
// // the source buffer must not be closed for writes.
// throw new IllegalStateException();
// }
// if (dst.m_closedForWrites) {
// // the dst buffer must not be closed for writes.
// throw new IllegalStateException();
// }
} finally {
if (dd != null)
dst.release();
src.release();
}
}
} finally {
lock.unlock();
}
}
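/*
 * Sketch of a compaction step in the style of the WriteCacheService
 * (all names other than transferTo() are hypothetical): records whose
 * hitCount is at or above the threshold are copied into [reserve]. A
 * false return means [reserve] filled before [src] was drained and the
 * caller must obtain another destination buffer.
 */
private static boolean exampleCompact(final WriteCache src,
final WriteCache reserve,
final ConcurrentMap<Long, WriteCache> serviceRecordMap)
throws InterruptedException {
return transferTo(src, reserve, serviceRecordMap,
0/* threshold: transfer every record */);
}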
/**
* Apply the {@link #orderedRecords} to create a dense {@link WriteCache}
* buffer that presents the addresses from the {@link #recordMap} along with
* enough metadata to decide whether this is a delete or merely an address
* declaration. Address declarations are modeled by setting the record size
* to a negative value. Address deletes are modeled by setting the
* fileOffset to a negative value. Actual address writes are not
* communicated through this method, but their data will eventually make it
* to the follower if the address is not recycled before the
* {@link WriteCache} holding that data is communicated to the follower (in
* which case the follower will eventually see the delete marker for the
* address instead of the application data for the address).
*
* @return true unless there is nothing in the {@link WriteCache}.
*
* @throws InterruptedException
* @throws IllegalStateException
*/
boolean prepareAddressMetadataForHA() throws IllegalStateException,
InterruptedException {
if (!prefixWrites)
throw new IllegalStateException();
if (orderedRecords == null)
throw new IllegalStateException();
final ByteBuffer tmp = acquire();
try {
/*
* Note: We need to be synchronized on the ByteBuffer here
* since this operation relies on the position() being
* stable.
*
* Note: Also see clearAddrMap(long) which is synchronized
* on the acquired ByteBuffer in the same manner to protect
* it during critical sections which have a side effect on
* the buffer position.
*/
synchronized (tmp) {
// Note: guarded by synchronized(tmp)!
if (orderedRecords.isEmpty()) {
return false;
}
tmp.position(0);
tmp.limit(tmp.capacity());
for (RecordMetadata md : orderedRecords) {
if (md.deleted) {
/*
* Entry is address of deleted record. No application
* data follows the entry (the next thing in the buffer
* will be another entry).
*/
tmp.putLong(-md.fileOffset);
tmp.putInt(-md.recordLength);
} else {
/*
* Entry is notice of non-deleted address. No
* application data follows the entry (the next thing in
* the buffer will be another entry).
*/
tmp.putLong(md.fileOffset);
tmp.putInt(-md.recordLength);
}
tmp.putInt(md.latchedAddr);
} // next RecordMetadata
// Note: Guarded by synchronized(tmp)
orderedRecords.clear();
} // synchronized(tmp)
return true;
} finally {
release();
}
}
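/*
 * Illustrative decoder for the entries written by
 * prepareAddressMetadataForHA() (this method is hypothetical; the
 * follower's actual decode path is
 * FileChannelScatteredWriteCache#resetRecordMapFromBuffer). Each entry
 * is {fileOffset:long, -recordLength:int, latchedAddr:int} with no
 * application data following it.
 */
private static void exampleDecodeAddressMetadata(final ByteBuffer tmp) {
while (tmp.remaining() >= SIZEOF_PREFIX_WRITE_METADATA) {
final long fileOffset = tmp.getLong();
final int recordLength = tmp.getInt();
final int latchedAddr = tmp.getInt();
assert recordLength <= 0; // only declarations and deletes are encoded.
if (fileOffset < 0) {
// Delete marker: drop any allocation for [latchedAddr].
} else {
// Declaration: an allocator exists for [latchedAddr]; no data yet.
}
}
}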
/**
* Overridden by
* {@link FileChannelScatteredWriteCache#resetRecordMapFromBuffer(ByteBuffer, Map)}
* .
*
* @param buf
* @param recordMap
*/
protected void resetRecordMapFromBuffer(final ByteBuffer buf,
final Map<Long, RecordMetadata> recordMap) {
recordMap.clear();
// Put a single entry covering the whole buffer into the map.
recordMap.put(firstOffset.get(), new RecordMetadata(firstOffset.get(),
0, buf.limit(), 0/* latchedAddr */));
}
/**
* Called from {@link WriteCacheService} to lock buffer content immediately
* prior to flushing and HA pipeline replication. Neither the internal buffer
* state nor the {@link #recordMap} may be changed once the
* {@link WriteCache} has been closed for writes. This is necessary to
* provide 100% binary replication. Otherwise the stores can differ in the
* data in freed allocation slots.
*
* @throws InterruptedException
* @throws IllegalStateException
*/
/*
* Note: exposed to IHAJournalStrategy.writeRawBuffer(). Implementations of
* that method must close the mock WriteCache against writes to prevent
* compaction of replicated WriteCache buffers on the receiver (HA).
*/
public void closeForWrites() throws IllegalStateException, InterruptedException {
/*
* Note: clearAddrMap() uses acquire() to operate on the recordMap and
* the buffer. This method must be mutex with clearAddrMap(), so we take
* the writeLock.
*/
final Lock lock = this.lock.writeLock();
lock.lockInterruptibly();
try {
if (m_closedForWrites)
throw new AssertionError();
m_closedForWrites = true;
} finally {
lock.unlock();
}
}
public boolean isClosedForWrites() {
return m_closedForWrites;
}
/**
* Return the percentage of space that has been removed through the
* application of {@link #clearAddrMap(long, int)} and hence could be
* recovered through compaction of the {@link WriteCache}.
*
* @return The percentage of recoverable space in [0:100].
*/
final int potentialCompaction() {
final int percentEmpty = (m_removed * 100) / bytesWritten();
assert percentEmpty >= 0 && percentEmpty <= 100;
return percentEmpty;
}
/*
* Managing reference counts for the memoizer pattern for the ReadCache.
*/
/**
* Allocate space for a record of the given length on this
* {@link WriteCache}.
*
* @param nbytes
* The size of the record.
*
* @return A view of the allocation on the {@link WriteCache} -or-
* <code>null</code> if there is not enough room in this
* {@link WriteCache} for the allocation.
*
* @throws IllegalStateException
* if the {@link WriteCache} has been {@link #close() closed}.
* @throws InterruptedException
* if the lock could not be acquired.
*/
public ByteBuffer allocate(final int nbytes) throws IllegalStateException,
InterruptedException {
final ByteBuffer tmp = acquire();
try {
synchronized (tmp) {
if (remaining() > nbytes) {
// [pos] the position of the new allocation.
final int pos = tmp.position();
// Advance position() beyond the new allocation.
tmp.position(pos + nbytes);
// Dup the buffer for independent pos and limit.
final ByteBuffer ret = tmp.duplicate();
// Setup view onto new allocation.
ret.position(pos);
ret.limit(pos + nbytes);
// Return view.
return ret;
} else {
return null;
}
}
} finally {
release();
}
}
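/*
 * Sketch of the allocate-then-commit pattern (the caller shown is
 * hypothetical; the WriteCacheService drives this for read cache
 * installs): the returned view addresses only the new allocation, and
 * commitToMap() then makes the record discoverable via the recordMap.
 */
private void exampleAllocateAndCommit(final long fileOffset,
final ByteBuffer data) throws InterruptedException {
final int nbytes = data.remaining();
final ByteBuffer view = allocate(nbytes);
if (view == null)
return; // not enough room: caller must use another WriteCache.
final int pos = view.position(); // buffer-relative start of the record.
view.put(data); // copy the record into the view.
commitToMap(fileOffset, pos, nbytes);
}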
void commitToMap(final long offset, final int position, final int nbytes) {
final RecordMetadata md = new RecordMetadata(offset, position,
nbytes, -1/*latchedAddr*/);
if (recordMap.put(offset, md) != null) {
log.warn("Record already in cache");
}
}
/**
* The referenceCount is used to protect against early resetting to the
* clean list. It is incremented by the WCS when used as a readCache and
* thereafter by the memoizer when the cache is used for an installation.
* When decremented to zero, the cache should be returned to the clean
* list.
*/
final AtomicInteger m_referenceCount = new AtomicInteger(0);
public int getReferenceCount() {
return m_referenceCount.get();
}
/**
* Called when a new reference is acquired.
*
* @return current reference count
*/
public int incrementReferenceCount() {
// Note: Used from critical sections. Nothing interruptible here!
return m_referenceCount.incrementAndGet();
}
/**
* Although public, this method is designed to be used by the
* WriteCacheService with a memoizer pattern to support concurrent reads
* against read cache buffers.
*
* Called when a reference is released. The return value should be
* tested and, if zero, the cache should be returned to the clean list.
*
* @return current reference count
*/
public int decrementReferenceCount() {
return m_referenceCount.decrementAndGet();
}
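/*
 * Sketch of the intended pin/unpin discipline (the caller shown is
 * hypothetical): whoever drops the last reference is responsible for
 * returning the cache to the clean list.
 */
private static void exampleReferenceDiscipline(final ReadCache cache) {
cache.incrementReferenceCount(); // pin while reading.
try {
// ... read against the pinned cache ...
} finally {
if (cache.decrementReferenceCount() == 0) {
// Last reference: return [cache] to the clean list (done by
// the WriteCacheService in production).
}
}
}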
/**
* Checks whether the cache recordMap contains an entry for the given
* file offset.
*
* @param offset
*            The file offset of the record.
*
* @return true iff the recordMap contains an entry for the offset.
*/
public boolean contains(final long offset) {
return recordMap.containsKey(offset);
}
}