/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.rwstore;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import com.bigdata.cache.ConcurrentWeakValueCache;
import com.bigdata.io.ChecksumUtility;
import com.bigdata.journal.AbstractJournal.ISnapshotData;
import com.bigdata.journal.ICommitter;
import com.bigdata.rawstore.IAllocationContext;
import com.bigdata.rwstore.RWStore.AllocationStats;
import com.bigdata.rwstore.StorageStats.Bucket;
import com.bigdata.util.BytesUtil;
/**
* FixedAllocator
*
* Maintains the list of {@link AllocBlock}s for this allocator's fixed slot size.
*/
public class FixedAllocator implements Allocator {
private static final Logger log = Logger.getLogger(FixedAllocator.class);
// Profiling for BLZG-1667 indicated that checking logging level is more expensive than expected
private static final boolean s_islogDebug = log.isDebugEnabled();
private static final boolean s_islogTrace = log.isTraceEnabled();
private final int cModAllocation = 1 << RWStore.ALLOCATION_SCALEUP;
private final int cMinAllocation = cModAllocation * 1; // must be multiple of cModAllocation
volatile int m_freeBits;
volatile private int m_freeTransients;
FixedAllocator m_prevCommit;
FixedAllocator m_nextCommit;
/**
* Address of the {@link FixedAllocator} within the meta allocation space on
* the disk of the last committed version of this {@link FixedAllocator}.
* This is a bit that can be decoded by {@link RWStore#metaBit2Addr(int)}.
* The value is initially ZERO (0) which is never a legal bit.
*/
volatile private int m_diskAddr;
volatile private int m_index;
Bucket m_statsBucket = null;
/**
* If an allocator is selected in a smallSlotHighWaste scenario, then the sparseness test
* for allocation must be relaxed or there is a risk that no allocation would be made
* from a "free" allocator.
*/
boolean m_smallSlotHighWaste = false;
public void setIndex(final int index) {
final AllocBlock fb = (AllocBlock) m_allocBlocks.get(0);
if (s_islogDebug)
log.debug("Restored index " + index + " with " + getStartAddr()
+ "[" + fb.m_live[0] + "] from " + m_diskAddr);
m_index = index;
}
public long getStartAddr() {
// if (m_startAddr == 0) {
// log.warn("zero m_startAddr, setting to " + m_allocBlocks.get(0).m_addr);
//
// m_startAddr = m_allocBlocks.get(0).m_addr;
// }
return RWStore.convertAddr(m_startAddr);
}
/*
* Note: Object#equals() is fine with this compareTo() implementation. It is
* only used to sort the allocators.
*/
public int compareTo(final Object o) {
final Allocator other = (Allocator) o;
if (other.getStartAddr() == 0) {
return -1;
} else {
final long val = getStartAddr() - other.getStartAddr();
if (val == 0) {
throw new Error("Two allocators at same address");
}
return val < 0 ? -1 : 1;
}
}
public boolean equals(final Object o) {
return this == o;
}
/**
* Return the bit in metabits for the last persisted version of this
* allocator. This bit can be translated into the actual byte offset on the
* file by {@link RWStore#metaBit2Addr(int)}.
*/
public int getDiskAddr() {
return m_diskAddr;
}
/**
* Set the bit in metabits for this allocator.
*
* @param addr
* A bit obtained from {@link RWStore#metaAlloc()}.
*/
public void setDiskAddr(final int addr) {
if (m_index == -1) {
throw new IllegalStateException("Attempt to set a storage addr for an invalid FixedAllocator");
}
m_diskAddr = addr;
}
/**
* Return the byte offset on the file corresponding to a bit index into this
* {@link FixedAllocator}.
*
* The tweak of 3 to the offset ensures (1) that no address is zero and
* (2) that the values 1 and 2 can be special cased (this aspect is now
* historical).
*
* @param offset
* The bit index into the {@link FixedAllocator}.
*
* @return The byte offset on the backing file.
*/
public long getPhysicalAddress(int offset, final boolean nocheck) {
offset -= 3;
// The AllocBlock that manages that bit.
final AllocBlock block = (AllocBlock) m_allocBlocks.get(offset
/ allocBlockRange);
// The bit offset into the AllocBlock.
final int bit = offset % allocBlockRange;
// if (RWStore.tstBit(block.m_live, bit)
// || (m_sessionActive && RWStore.tstBit(block.m_transients, bit))) {
/*
* Compute the offset into the region managed by that AllocBlock and
* then add it to the byte offset of the AllocBlock on the backing file.
* This gives us the total offset on the backing file associated with
* that bit.
*/
final long paddr = RWStore.convertAddr(block.m_addr) + ((long) m_size * bit);
/*
* Just check the transient bits, since there are cases (e.g. the
* CommitRecordIndex) where committed data is accessed even if it has been
* marked as ready to be recycled after the next commit.
*/
if (nocheck || RWStore.tstBit(block.m_transients, bit)) {
return paddr;
} else {
if (RWStore.tstBit(block.m_commit, bit)) {
throw new IllegalStateException(
"Address committed but not set in transients");
}
m_store.showWriteCacheDebug(paddr);
log.warn("Physical address " + paddr + " not accessible for Allocator of size " + m_size);
return 0L;
}
}
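/*
* A worked example of the arithmetic above (illustrative only; the numbers
* assume a 64-byte allocator with m_bitSize == 32, so allocBlockRange == 1024):
*
* int offset = 1027 - 3; // strip the +3 tweak -> 1024
* int blockIdx = offset / allocBlockRange; // 1024 / 1024 == 1, i.e. the second AllocBlock
* int bit = offset % allocBlockRange; // 1024 % 1024 == 0, i.e. the first slot in that block
* long paddr = RWStore.convertAddr(block.m_addr) + ((long) m_size * bit); // byte offset on the file
*/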
public int getPhysicalSize(final int offset) {
return m_size;
}
public int getBlockSize() {
return m_size;
}
/**
* The free list for the allocation slot size serviced by this allocator.
* This is a reference back into the corresponding free list as managed by
* the RWStore.
*
* @see #setFreeList(ArrayList)
*/
private ArrayList m_freeList;
public void setFreeList(final ArrayList list) {
setFreeList(list, false);
}
/**
* The force parameter is set to true when the allocator is being moved from one
* free list to another.
*/
public void setFreeList(final ArrayList list, boolean force) {
if (m_freeList != list) {
m_freeList = list;
m_freeWaiting = true;
}
if (m_pendingContextCommit || !hasFree()) {
if (force) {
throw new IllegalStateException("The allocator cannot be added to the free list, pendingContextCommit: " + m_pendingContextCommit + ", hasFree: " + hasFree());
}
return;
}
if (force || meetsSmallSlotThreshold()) {
addToFreeList();
}
}
/**
* To support postHACommit an allocator can be removed from the current freelist
*/
void removeFromFreeList() {
if (m_freeList != null) {
// log.warn("Removing allocator " + m_index + " from free list");
m_freeList.remove(this);
m_freeWaiting = true;
}
}
volatile private IAllocationContext m_context;
/**
* @return whether the allocator is not assigned to an {@link IAllocationContext}
*/
boolean isUnlocked() {
return m_context == null;
}
/**
* Indicates whether session protection has been used to prevent the
* store from re-allocating allocations that may still be reachable from
* read-only requests and concurrent transactions.
*/
private boolean m_sessionActive;
boolean m_pendingContextCommit = false; // accessible from RWStore
public void setAllocationContext(final IAllocationContext context) {
if (m_pendingContextCommit) {
throw new IllegalStateException("Already pending commit");
}
if (s_islogDebug)
checkBits();
if (context == null && m_context != null) {
// restore commit bits in AllocBlocks
for (AllocBlock allocBlock : m_allocBlocks) {
allocBlock.deshadow();
}
// return to dirty list
m_store.addToCommit(this);
m_pendingContextCommit = true;
} else if (context != null && m_context == null) {
// restore commit bits in AllocBlocks
for (AllocBlock allocBlock : m_allocBlocks) {
allocBlock.shadow();
}
// remove from dirty list if present!
// NO! m_store.removeFromCommit(this);
}
m_context = context;
if (s_islogDebug)
checkBits();
}
/**
* Unwinds the allocations made within the context and clears the write
* cache of any associated data
* @param writeCacheService
*/
public void abortAllocationContext(final IAllocationContext context, RWWriteCacheService writeCacheService) {
if (m_pendingContextCommit) {
throw new IllegalStateException("Already pending commit");
}
if (m_context != null) {
// restore commit bits in AllocBlocks
for (AllocBlock allocBlock : m_allocBlocks) {
allocBlock.abortshadow(writeCacheService);
}
// Reset freebits
m_freeBits = calcFreeBits();
m_context = context;
} else {
throw new IllegalArgumentException();
}
}
/**
* write is called on commit, so this is the point when "transient frees" -
* the freeing of previously committed memory - can be made available, since
* we are creating a new commit point; the relevant condition being that
* m_freeBits was zero while m_freeTransients was not.
*/
public byte[] write() {
try {
final AllocBlock fb = m_allocBlocks.get(0);
if (s_islogTrace)
log.trace("writing allocator " + m_index + " for " + getStartAddr() + " with " + fb.m_live[0]);
final byte[] buf = new byte[1024];
final DataOutputStream str = new DataOutputStream(new FixedOutputStream(buf));
final boolean protectTransients = m_sessionActive || m_store.isSessionProtected();
try {
str.writeInt(m_size);
assert m_sessionActive || m_freeTransients == transientbits();
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
str.writeInt(block.m_addr);
for (int i = 0; i < m_bitSize; i++) {
str.writeInt(block.m_live[i]);
}
if (!protectTransients) {
/**
* This assert will trip if any address was freed under
* session protection and therefore remained accessible
* until released.
* The value returned by releaseSession should be zero
* since all "frees" should already have removed any
* writes to the writeCacheService
*/
assert m_sessionFrees.intValue() == 0;
// assert block.releaseSession(m_store.m_writeCache) == 0;
// clear out writes - FIXME is releaseSession okay
block.releaseCommitWrites(m_store.getWriteCacheService());
// Moved to postCommit()
// block.m_transients = block.m_live.clone();
}
}
// add checksum
final int chk = ChecksumUtility.getCHK().checksum(buf,
str.size());
str.writeInt(chk);
} finally {
str.close();
}
if (s_islogDebug)
checkBits();
return buf;
} catch (IOException e) {
throw new StorageTerminalError("Error on write", e);
}
}
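/*
* For reference, the 1K record produced above (and by commitData()) is written
* as big-endian ints via DataOutputStream:
*
* int m_size // slot size in bytes
* per AllocBlock: int m_addr // block start address
* int[m_bitSize] // live bits (commit bits for commitData())
* int checksum // over the bytes written so far
*
* One int holds the slot size and one the checksum, which is why the
* constructor sizes the allocator as 254 / (m_bitSize + 1) AllocBlocks.
*/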
private int calcFreeBits() {
int freeBits = 0;
final Iterator iter = m_allocBlocks.iterator();
// final int blockSize = m_bitSize * 32 * m_size;
while (iter.hasNext()) {
final AllocBlock block = iter.next();
for (int i = 0; i < m_bitSize; i++) {
freeBits += 32 - Integer.bitCount(block.m_transients[i]);
}
}
return freeBits;
}
private int calcLiveFreeBits() {
int freeBits = 0;
final Iterator iter = m_allocBlocks.iterator();
// final int blockSize = m_bitSize * 32 * m_size;
while (iter.hasNext()) {
final AllocBlock block = iter.next();
for (int i = 0; i < m_bitSize; i++) {
freeBits += 32 - Integer.bitCount(block.m_live[i]);
}
}
return freeBits;
}
private boolean checkBits() {
final int calcFree = calcFreeBits();
final int calcLiveFree = calcLiveFreeBits();
final boolean ret = m_freeBits == calcFree
&& (m_freeBits + m_freeTransients) == calcLiveFree;
if (!ret)
throw new AssertionError("m_free: " + m_freeBits + ", calcFree: " + calcFree);
return ret;
}
// read does not read in m_size since that value is read separately to determine
// the class of allocator
public void read(final DataInputStream str) {
try {
m_freeBits = 0;
final Iterator iter = m_allocBlocks.iterator();
final int blockSize = m_bitSize * 32 * m_size;
while (iter.hasNext()) {
final AllocBlock block = iter.next();
block.m_addr = str.readInt();
for (int i = 0; i < m_bitSize; i++) {
block.m_live[i] = str.readInt();
/**
* Need to calculate how many free bits are available; minor
* optimization by checking against either empty or full to
* avoid scanning every bit unnecessarily.
**/
if (block.m_live[i] == 0) { // empty
m_freeBits += 32;
} else if (block.m_live[i] != 0xFFFFFFFF) { // not full
final int anInt = block.m_live[i];
// for (int bit = 0; bit < 32; bit++) {
// if ((anInt & (1 << bit)) == 0) {
// m_freeBits++;
// }
// }
m_freeBits += 32 - Integer.bitCount(anInt);
} // else full so no freebits
}
block.m_transients = (int[]) block.m_live.clone();
block.m_commit = (int[]) block.m_live.clone();
if (m_startAddr == 0) {
m_startAddr = block.m_addr;
}
// int endAddr = block.m_addr + blockSize;
if (block.m_addr > 0) {
m_endAddr = block.m_addr + blockSize;
}
}
} catch (IOException e) {
throw new StorageTerminalError("Error on read", e);
}
}
/** The size of the allocation slots in bytes. */
final int m_size;
private int m_startAddr = 0;
private int m_endAddr = 0;
/**
* For "small slot" allocators the allocation search is
* always from bit areas with less than a maximum density to
* ensure that writes have better locality.
*/
int m_allocIndex = -1;
/**
* The #of int32 values in a single {@link AllocBlock} region. The
* {@link FixedAllocator} can manage many {@link AllocBlock}s.
*/
private final int m_bitSize;
/**
* The #of bits in an {@link AllocBlock} (the #of slots managed by that
* {@link AllocBlock}). Each slot managed by the {@link AllocBlock} is
* {@link #m_size} bytes.
*/
private final int allocBlockRange;
private final ArrayList m_allocBlocks;
final RWStore m_store;
/**
* Calculating the number of ints (m_bitSize) cannot rely on the slot size being a
* power of 2. Previously that assumption was sufficient to guarantee rounding on to
* a 64K boundary. Now, since nints * 32 * 64 = 64K for the smallest (64 byte) slots,
* we need a multiple of 32 ints.
*
* So, whatever the multiple of 64 in the slot size, if we allocate a multiple of 32
* ints we are guaranteed to be on a 64K boundary.
*
* This does mean that for the largest slots of ~256K we are allocating 256MB of space.
*
* @param store The owning {@link RWStore}.
* @param size The size of the allocation slots in bytes.
*/
FixedAllocator(final RWStore store, final int size) {//, final RWWriteCacheService cache) {
// Note: ZERO (0) is never a valid metabits bit.
m_diskAddr = 0;
m_store = store;
m_size = size;
// By default, disk-based allocators should optimize for density
m_bitSize = calcBitSize(true /* optDensity */, size, cMinAllocation, cModAllocation);
// The #of bits in an AllocBlock.
allocBlockRange = 32 * m_bitSize;
// number of blocks in this allocator, bitSize plus 1 for start address
// The 1K allocator is 256 ints, one is used to record the slot size and
// another for the checksum; leaving 254 to be used to store the
// AllocBlocks.
final int numBlocks = 254 / (m_bitSize + 1);
/*
* Create AllocBlocks for this FixedAllocator, but do not allocate
* either the AllocBlocks or their managed allocation slots on the
* persistent heap yet.
*/
m_allocBlocks = new ArrayList(numBlocks);
for (int i = 0; i < numBlocks; i++) {
m_allocBlocks.add(new AllocBlock(0, m_bitSize, this));//, cache));
}
m_freeTransients = 0;
m_freeBits = 32 * m_bitSize * numBlocks;
}
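/*
* A sizing sketch (illustrative, and assuming RWStore.ALLOCATION_SCALEUP == 16,
* i.e. 64K minimum reservations): for 64-byte slots calcBitSize() yields 32 ints
* per AllocBlock, so each AllocBlock manages 32 * 32 == 1024 slots (a 64K region),
* numBlocks == 254 / 33 == 7, and the allocator starts with
* m_freeBits == 32 * 32 * 7 == 7168.
*/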
/**
* Find the allocation index of the first "sparsely committed" AllocBlock.
*
* Checks the committed bits of all the AllocBlocks until one is found with
* > 50% free (or less than 50% allocated) of the committed bits.
*/
void resetAllocIndex() {
resetAllocIndex(0);
}
void resetAllocIndex(final int start) {
m_allocIndex = start;
if (m_size <= m_store.cSmallSlot) {
for (int a = m_allocIndex/m_bitSize; a < m_allocBlocks.size(); a++) {
final AllocBlock ab = m_allocBlocks.get(a);
checkBlock(ab);
for (int i = (m_allocIndex%m_bitSize); i < m_bitSize; i++) {
// first check if transients are already full
if (ab.m_transients[i] != 0xFFFFFFFF) {
/*
* If small slots are in a high waste scenario, then do not check for extra
* locality in uncommitted state
*/
if (m_smallSlotHighWaste || Integer.bitCount(ab.m_commit[i]) < 16) {
final AllocBlock abr = m_allocBlocks.get(m_allocIndex/m_bitSize);
assert abr == ab;
return;
}
}
m_allocIndex++;
}
}
// must remove from free list if we cannot set the alloc Index for a small slot
if (start == 0) {
removeFromFreeList();
} else {
resetAllocIndex(0);
}
}
}
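/*
* Note on the sparseness test above: Integer.bitCount(ab.m_commit[i]) < 16
* accepts an int of committed bits only when fewer than half of its 32 bits are
* set - the "> 50% free of the committed bits" rule described in the javadoc -
* unless m_smallSlotHighWaste relaxes the check.
*/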
/**
* This determines the size of the reservation required in terms of
* the number of ints each holding bits for 32 slots.
*
* The minimum return value will be 1, for a single int holding 32 bits.
*
* The maximum value will be the number of ints required to fill the minimum
* reservation.
*
* The minimum reservation will be some multiple of the
* address multiplier that allows allocation blocks to address large addresses
* with an INT32. For example, by setting a minimum reservation of 128K, the
* allocation block's INT32 start address may be multiplied by 128K to provide
* a physical address.
*
* The minReserve must be a power of 2, e.g. 1K, 2K, 4K, etc.
*
* A standard minReserve of 16K is plenty big enough, enabling 32TB of
* addressable store. The logical maximum used store is calculated as the
* maximum fixed allocation size * MAX_INT. So a store with a maximum
* fixed slot size of 4K could only allocate 8TB.
*
* Since the allocation size must be an exact multiple of the minReserve, the
* lower the minReserve the smaller the allocation that may be required for
* larger slot sizes.
*
* Another consideration is file locality. In this case the emphasis is
* on larger contiguous areas to improve the likely locality of allocations
* made by a FixedAllocator. Here the addressability implied by the reserve
* is not an issue, and larger reserves are chosen to improve locality. The
* downside is a potential for more wasted space, but this
* reduces as the store size grows and in large stores (> 10GB) becomes
* insignificant.
*
* Therefore, if a FixedAllocator is to be used in a large store and
* locality needs to be optimised for SATA disk access then the minReserve
* should be high, say 128K, while if the allocator is tuned to ByteBuffer
* allocation, a minimum allocation of 8 to 16K is more suitable.
*
* A final consideration is allocator reference efficiency, in the sense of
* maximising the number of allocations that can be made. By this I mean
* just how close we can get to MAX_INT allocations. For example, if we
* allow for up to 8192 allocations from a single allocator, but in
* practice average closer to 4096, then the maximum number of allocations
* comes down from MAX_INT to MAX_INT/2. This is also a consideration when
* choosing the maximum fixed allocator size, since if we require a large number
* of Blobs this reduces the number of "virtual" allocations by at least
* a factor of three for each blob (at least 2 fixed allocations for
* content and 1 more for the header). A variation on the current Blob
* implementation could include the header in the first allocation, thus
* reducing the minimum Blob allocations from 3 to 2, but the point still
* holds that too small a max fixed allocation could dramatically reduce the
* number of allocations that could be made.
*
* @param optDensity if true, optimise for density by starting from a minimum of 8 ints
* @param alloc the slot size to be managed
* @param minReserve the minimum reservation in bytes
* @param modAllocation the modulus (in bytes) of which the total reservation must be a multiple
* @return the size of the int array
*/
public static int calcBitSize(final boolean optDensity, final int alloc, final int minReserve, final int modAllocation) {
final int intAllocation = 32 * alloc; // min 32 bits
// we need to find smallest number of ints * the intAllocation
// such that totalAllocation % minReserve is 0
// example 6K intAllocation would need 8 ints for 48K for 16K min
// likewise a 24K intAllocation would require 2 ints
// if optimising for density set min ints to 8
int nints = optDensity ? 8 : 1;
while ((nints * intAllocation) < minReserve) nints++;
while ((nints * intAllocation) % modAllocation != 0) nints++;
return nints;
}
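/*
* Illustrative calls (assuming a 64K minReserve and modAllocation):
*
* calcBitSize(true, 64, 64 * 1024, 64 * 1024) == 32 // 32 ints * 32 bits * 64 bytes == 64K
* calcBitSize(true, 1024, 64 * 1024, 64 * 1024) == 8 // 8 ints * 32 bits * 1K == 256K, a multiple of 64K
*/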
public String getStats(final AtomicLong counter) {
final StringBuilder sb = new StringBuilder(getSummaryStats());
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
if (block.m_addr == 0) {
break;
}
sb.append(block.getStats(null) + "\r\n");
if (counter != null)
counter.addAndGet(block.getAllocBits() * (long) m_size);
}
return sb.toString();
}
public String getSummaryStats() {
return "Block size : " + m_size
+ " start : " + getStartAddr() + " free : " + m_freeBits
+ "\r\n";
}
public boolean verify(int addr) {
if (addr >= m_startAddr && addr < m_endAddr) {
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
if (block.verify(addr, m_size)) {
return true;
}
}
}
return false;
}
public boolean addressInRange(int addr) {
if (addr >= m_startAddr && addr < m_endAddr) {
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
if (block.addressInRange(addr, m_size)) {
return true;
}
}
}
return false;
}
private boolean m_freeWaiting = true;
// track number of frees to be cleared on session releases
private AtomicInteger m_sessionFrees = new AtomicInteger(0);
public boolean free(final int addr, final int size) {
return free(addr, size, false);
}
/**
* Need to check if address to be freed was 'live' for any shadowed allocator to
* determine if we need to adjust the 'savedLive' data. This is critical since
* otherwise we will not be able to reset any unisolated alloc/frees.
*/
public boolean free(final int addr, final int size, final boolean overideSession) {
if (addr < 0) {
final int offset = ((-addr) & RWStore.OFFSET_BITS_MASK) - 3; // bit adjust
final int nbits = 32 * m_bitSize;
final int block = offset/nbits;
/**
* When a session is released any m_sessionActive FixedAllocators
* should be atomically released.
* However, if any state allowed a call to free once the store
* is no longer session protected, this must NOT overwrite m_sessionActive
* if it is already set, since a commit would reset the transient bits
* without first clearing those addresses from the writeCacheService.
*/
final boolean tmp = m_sessionActive;
m_sessionActive = tmp || m_store.isSessionProtected();
if (tmp && !m_sessionActive) throw new AssertionError();
try {
if (s_islogDebug)
checkBits();
if (((AllocBlock) m_allocBlocks.get(block))
.freeBit(offset % nbits, m_sessionActive && !overideSession)) { // bit adjust
m_freeBits++;
checkFreeList();
} else {
m_freeTransients++;
if (m_sessionActive) {
assert checkSessionFrees();
}
}
if (m_statsBucket != null) {
m_statsBucket.delete(size);
}
} catch (IllegalArgumentException iae) {
// catch and rethrow with more information
throw new IllegalArgumentException("IAE with address: " + addr + ", size: " + size + ", context: " + (m_context == null ? -1 : m_context.hashCode()), iae);
}
if (s_islogDebug)
checkBits();
return true;
} else if (addr >= m_startAddr && addr < m_endAddr) {
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
if (block.free(addr, m_size)) {
m_freeTransients++;
if (s_islogDebug)
checkBits();
return true;
}
}
}
if (s_islogDebug)
checkBits();
return false;
}
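/*
* Decoding sketch for the negative "latched" addresses handled above
* (illustrative; it assumes RWStore.OFFSET_BITS_MASK strips the allocator index
* that alloc() folded into the high bits):
*
* int offset = ((-addr) & RWStore.OFFSET_BITS_MASK) - 3; // undo the +3 tweak
* int nbits = 32 * m_bitSize; // bits per AllocBlock
* int block = offset / nbits; // which AllocBlock
* int bit = offset % nbits; // which slot bit within that block
*/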
private boolean checkSessionFrees() {
final int sessionFrees = m_sessionFrees.incrementAndGet();
int sessionBits = 0;
for (AllocBlock ab : m_allocBlocks) {
sessionBits += ab.sessionBits();
}
return sessionFrees <= sessionBits;
}
private void checkFreeList() {
if (m_freeWaiting && !m_pendingContextCommit) {
if (meetsSmallSlotThreshold()) {
addToFreeList();
resetAllocIndex(0);
}
}
}
void addToFreeList() {
assert m_freeWaiting;
m_freeWaiting = false;
m_freeList.add(this);
m_allocIndex = -1;
if (s_islogDebug)
log.debug("Returning Allocator to FreeList - " + m_size);
}
private boolean meetsSmallSlotThreshold() {
// check threshold for all slots
if (m_freeBits < m_store.cDefaultFreeBitsThreshold) {
return false;
}
// then check for small slots
if (m_size <= m_store.cSmallSlot) { // it's a small slot
final boolean ret = m_freeBits > m_store.cSmallSlotThreshold;
return ret;
} else {
return true;
}
}
/**
* The introduction of IAllocationContexts has added some complexity to
* the older concept of a free list. With AllocationContexts it is
* possible for an allocator to have free space available while that space
* is restricted to a specific AllocationContext.
*
* In addition to the standard free allocation search we want to add a
* "density" restriction for small slots to encourage the aggregation
* of writes (by increasing the likelihood of sibling slot allocation).
*
* There is some "Do What I mean" complexity here, with difficulty in
* determining a good rule to identify an initial allocation point. There
* is a danger of significantly reducing the allocation efficiency of
* short transactions if we too naively check committed bit density. We
* should only do this when identifying the initial allocation, and when
* the allocIndex is incremented.
*/
public int alloc(final RWStore store, final int size,
final IAllocationContext context) {
try {
if (size <= 0)
throw new IllegalArgumentException(
"Allocate requires positive size, got: " + size);
if (size > m_size)
throw new IllegalArgumentException(
"FixedAllocator with slots of " + m_size
+ " bytes requested allocation for " + size
+ " bytes");
if (m_freeBits == 0) {
throw new IllegalStateException("Request to allocate from " + m_size + "byte slot FixedAllocator with zero bits free - should not be on the Free List");
}
int addr = -1;
// Special allocation for small slots
if (m_size <= m_store.cSmallSlot) {
return allocFromIndex(size);
}
final Iterator iter = m_allocBlocks.iterator();
int count = -1;
while (addr == -1 && iter.hasNext()) {
count++;
final AllocBlock block = iter.next();
checkBlock(block);
addr = block.alloc(m_size);
}
if (addr != -1) {
addr += 3; // Tweak to ensure non-zero address for offset 0
if (--m_freeBits == 0) {
if (s_islogTrace)
log.trace("Remove from free list");
removeFromFreeList();
// Should have been first on list, now check for first
if (m_freeList.size() > 0) {
if (s_islogDebug) {
final FixedAllocator nxt = (FixedAllocator) m_freeList
.get(0);
log.debug("Freelist head: " + nxt.getSummaryStats());
}
}
}
addr += (count * 32 * m_bitSize);
final int value = -((m_index << RWStore.OFFSET_BITS) + addr);
if (m_statsBucket != null) {
m_statsBucket.allocate(size);
}
return value;
} else {
StringBuilder sb = new StringBuilder();
sb.append("FixedAllocator returning null address, with freeBits: "
+ m_freeBits + "\n");
for (AllocBlock ab : m_allocBlocks) {
sb.append(ab.show() + "\n");
}
log.error(sb);
return 0;
}
} finally {
if (s_islogDebug)
checkBits();
}
}
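/*
* Encoding sketch for the latched address returned above: the allocator index
* occupies the high bits, the (tweaked) bit offset the low OFFSET_BITS, and the
* whole value is negated so that free() can distinguish latched addresses from
* physical ones:
*
* int latched = -((m_index << RWStore.OFFSET_BITS)
* + (count * 32 * m_bitSize) // bits in the preceding AllocBlocks
* + bitInBlock + 3); // bit within this block plus the +3 tweak
*/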
boolean checkBlock0() {
return checkBlock(m_allocBlocks.get(0));
}
boolean checkBlock(final AllocBlock block) {
if (block.m_addr == 0) {
int blockSize = 32 * m_bitSize;
if (m_statsBucket != null) {
m_statsBucket.addSlots(blockSize);
}
blockSize *= m_size;
blockSize >>= RWStore.ALLOCATION_SCALEUP;
block.m_addr = grabAllocation(m_store, blockSize);
if (s_islogDebug)
log.debug("Allocation block at " + block.m_addr
+ " of " + (blockSize << 16) + " bytes");
if (m_startAddr == 0) {
m_startAddr = block.m_addr;
}
m_endAddr = block.m_addr - blockSize;
return true; // commit required
} else {
return false;
}
}
int allocFromIndex(final int size) {
if (m_allocIndex == -1) {
resetAllocIndex();
if (m_allocIndex == -1) {
throw new AssertionError("Unable to set AllocIndex with m_freeBits: " + m_freeBits);
}
}
if (s_islogDebug)
checkBits();
if (s_islogDebug) { // calcFreeBits is relatively expensive, so only enable in DEBUG
if (m_freeBits != calcFreeBits()) {
final int calc = calcFreeBits();
throw new AssertionError("m_freeBits != calcFreeBits() : " + m_freeBits + "!=" + calc);
}
}
assert m_freeBits == calcFreeBits();
// there MUST be bits free in the m_allocIndex block
final AllocBlock ab = m_allocBlocks.get(m_allocIndex/m_bitSize);
if (ab.m_addr == 0) {
throw new AssertionError("No allocation for AllocBlock with m_allocIndex: " + m_allocIndex);
}
final int abblock = m_allocIndex % m_bitSize;
assert ab.m_transients[abblock] != 0xFFFFFFFF; // not all set
final int bit = RWStore.fndBit(ab.m_transients[abblock]);
assert bit >= 0;
m_freeBits--;
final int abit = (abblock*32) + bit;
RWStore.setBit(ab.m_live, abit);
RWStore.setBit(ab.m_transients, abit);
// Note +3 address tweak for special low order bits
final int addr = -((m_index << RWStore.OFFSET_BITS) + (m_allocIndex*32) + (bit + 3));
// Now check current index
if (ab.m_transients[abblock] == 0xFFFFFFFF) {
// find next allocIndex
resetAllocIndex(m_allocIndex+1);
}
if (s_islogDebug) { // calcFreeBits is relatively expensive, so only enable in DEBUG
final int calc = calcFreeBits();
if (m_freeBits != calc) {
throw new AssertionError("m_freeBits != calcFreeBits() : " + m_freeBits + "!=" + calc);
}
}
// assert m_freeBits == calcFreeBits();
if (m_statsBucket != null) {
m_statsBucket.allocate(size);
}
return addr;
}
protected int grabAllocation(RWStore store, int blockSize) {
final int ret = store.allocBlock(blockSize);
return ret;
}
public boolean hasFree() {
return m_freeBits > 0;
}
public void addAddresses(final ArrayList addrs) {
final Iterator blocks = m_allocBlocks.iterator();
// FIXME int baseAddr = -((m_index << 16) + 4); // bit adjust
int baseAddr = -(m_index << 16); // bit adjust??
while (blocks.hasNext()) {
final AllocBlock block = (AllocBlock) blocks.next();
block.addAddresses(addrs, baseAddr);
baseAddr -= 32 * m_bitSize;
}
}
/**
* Returns the raw start address.
*/
public int getRawStartAddr() {
return m_startAddr;
}
public int getIndex() {
return m_index;
}
public void appendShortStats(final StringBuilder str,
final AllocationStats[] stats) {
int si = -1;
if (stats == null) {
str.append("Index: " + m_index + ", " + m_size);
} else {
for (int i = 0; i < stats.length; i++) {
if (m_size == stats[i].m_blockSize) {
si = i;
break;
}
}
}
final Iterator blocks = m_allocBlocks.iterator();
while (blocks.hasNext()) {
final AllocBlock block = blocks.next();
if (block.m_addr != 0) {
str.append(block.getStats(si == -1 ? null : stats[si]));
} else {
break;
}
}
str.append("\n");
}
public int getAllocatedBlocks() {
int allocated = 0;
final Iterator blocks = m_allocBlocks.iterator();
while (blocks.hasNext()) {
if (blocks.next().m_addr != 0) {
allocated++;
} else {
break;
}
}
return allocated;
}
/**
* @return the amount of heap storage assigned to this allocator over
* all reserved allocation blocks.
*/
public long getFileStorage() {
final long blockSize = 32L * m_bitSize * m_size;
long allocated = getAllocatedBlocks();
allocated *= blockSize;
return allocated;
}
/**
* Computes the amount of storage allocated using the freeBits count.
*
* @return the amount of storage allotted to slots in the allocation blocks
*/
public long getAllocatedSlots() {
final int allocBlocks = getAllocatedBlocks();
int xtraFree = m_allocBlocks.size() - allocBlocks;
xtraFree *= 32 * m_bitSize;
final int freeBits = m_freeBits - xtraFree;
final long alloted = (allocBlocks * 32 * m_bitSize) - freeBits;
return alloted * m_size;
}
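/*
* Worked example with illustrative numbers: m_bitSize == 32, 7 AllocBlocks of
* which 2 are reserved, and m_freeBits == 5300:
*
* xtraFree = 5 * 32 * 32 == 5120 // free bits counted for the 5 unreserved blocks
* freeBits = 5300 - 5120 == 180 // free bits within the 2 reserved blocks
* alloted = 2 * 32 * 32 - 180 == 1868 // slots currently allotted
* return 1868 * m_size // bytes allotted to slots
*/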
public boolean isAllocated(int offset) {
offset -= 3;
final int allocBlockRange = 32 * m_bitSize;
final AllocBlock block = (AllocBlock) m_allocBlocks.get(offset / allocBlockRange);
final int bit = offset % allocBlockRange;
return RWStore.tstBit(block.m_live, bit);
}
public boolean isCommitted(int offset) {
offset -= 3;
final int allocBlockRange = 32 * m_bitSize;
final AllocBlock block = (AllocBlock) m_allocBlocks.get(offset / allocBlockRange);
final int bit = offset % allocBlockRange;
return RWStore.tstBit(block.m_commit, bit);
}
protected final AllocBlock getBlockFromLocalOffset(int offset) {
offset -= 3;
final int allocBlockRange = 32 * m_bitSize;
return (AllocBlock) m_allocBlocks.get(offset / allocBlockRange);
}
/**
* If the context is this allocator's context AND the address is not in the
* commit bits then we can immediately free.
*/
public boolean canImmediatelyFree(final int addr, final int size,
final IAllocationContext context) {
final int offset = ((-addr) & RWStore.OFFSET_BITS_MASK); // bit adjust
final boolean committed = isCommitted(offset);
if (!m_pendingContextCommit && ((context == m_context) || (m_context == null && !context.isIsolated()))) {
return !committed;
} else if (m_context != null) {
// This must *not* be an address transiently allocated by the associated Allocator
if (!committed)
throw new IllegalStateException("Attempt to free address with invalid context");
return false;
} else {
return false;
}
}
public void setBucketStats(Bucket b) {
m_statsBucket = b;
}
/**
* The semantics of reset are to ditch all unisolated modifications
* since the last commit point. Note that this includes unisolated frees
* as well as allocations.
*
* @param cache
* @param nextAllocation
*/
boolean reset(RWWriteCacheService cache, final int nextAllocation) {
boolean isolatedWrites = false;
for (AllocBlock ab : m_allocBlocks) {
if (ab.m_addr == 0)
break;
ab.reset(cache);
isolatedWrites = isolatedWrites || ab.m_saveCommit != null;
if (ab.m_addr <= nextAllocation && ab.m_saveCommit == null) { // only free if no isolated writes
// this should mean that all allocations were made since last commit
// in which case...
assert ab.freeBits() == ab.totalBits();
ab.m_addr = 0;
}
}
m_freeTransients = transientbits();
m_freeBits = calcFreeBits();
// Ensure allocIndex is reset
m_allocIndex = -1;
assert calcSessionFrees();
if (s_islogDebug)
checkBits();
return isolatedWrites;
}
private boolean calcSessionFrees() {
int sessionBits = 0;
for (AllocBlock ab : m_allocBlocks) {
sessionBits += ab.sessionBits();
}
m_sessionFrees.set(sessionBits);
return true;
}
void releaseSession(RWWriteCacheService cache) {
if (m_context != null) {
throw new IllegalStateException("Calling releaseSession on shadowed allocator");
}
if (this.m_sessionActive) {
final int start = m_sessionFrees.intValue();
// try {
if (s_islogTrace)
log.trace("Allocator: #" + m_index + " releasing session protection");
int releasedAllocations = 0;
for (AllocBlock ab : m_allocBlocks) {
releasedAllocations += ab.releaseSession(cache);
}
assert !m_store.isSessionProtected() : "releaseSession called with isSessionProtected: true";
m_sessionActive = false; // should only need indicate that it contains no cached writes
m_freeBits = freebits();
final int freebits = freebits();
if (m_freeBits > freebits)
log.error("m_freeBits too high: " + m_freeBits + " > (calc): " + freebits);
m_freeTransients = transientbits();
checkFreeList();
// assert m_sessionFrees == releasedAllocations : "Allocator: " + hashCode() + " m_sessionFrees: " + m_sessionFrees + " != released: " + releasedAllocations;
if (start > releasedAllocations) {
log.error("BAD! Allocator: " + hashCode() + ", size: " + m_size + " m_sessionFrees: " + m_sessionFrees.intValue() + " > released: " + releasedAllocations);
} else {
// log.error("GOOD! Allocator: " + hashCode() + ", size: " + m_size + " m_sessionFrees: " + m_sessionFrees.intValue() + " <= released: " + releasedAllocations);
}
// } finally {
final int end = m_sessionFrees.getAndSet(0);
assert start == end : "SessionFrees concurrent modification: " + start + " != " + end;
// }
} else {
assert m_sessionFrees.intValue() == 0 : "Session Inactive with sessionFrees: " + m_sessionFrees.intValue();
}
}
private int freebits() {
int freeBits = 0;
for (AllocBlock ab : m_allocBlocks) {
freeBits += ab.freeBits();
}
return freeBits;
}
private int transientbits() {
int freeBits = 0;
for (AllocBlock ab : m_allocBlocks) {
freeBits += ab.transientBits();
}
return freeBits;
}
public long getPhysicalAddress(final int offset) {
return getPhysicalAddress(offset, false); // do NOT override address checks
}
void setAddressExternal(final int latchedAddr) {
final int offset = ((-latchedAddr) & RWStore.OFFSET_BITS_MASK) - 3; // bit adjust
final int nbits = 32 * m_bitSize;
final int block = offset/nbits;
final int bit = offset % nbits;
final AllocBlock ab = m_allocBlocks.get(block);
if (ab.m_addr == 0) {
// fixup offset
int blockSize = 32 * m_bitSize;
blockSize *= m_size;
blockSize >>= RWStore.ALLOCATION_SCALEUP;
ab.m_addr = grabAllocation(m_store, blockSize);
if (block == 0)
m_startAddr = ab.m_addr;
}
ab.setBitExternal(bit);
m_freeBits--;
}
public int getSlotSize() {
return m_size;
}
/**
* Add the committed allocated slot contents to the digest
*
* FIXME: First version is correct rather than optimal, need to
* consider if there is any benefit to
*
* @param snapshot
* @param digest
*/
public void computeDigest(final Object snapshot, final MessageDigest digest) {
// create buffer of slot size
final ByteBuffer bb = ByteBuffer.allocate(m_size);
final byte[] ba = m_index == 0 && s_islogDebug ? bb.array() : null;
for (AllocBlock b : m_allocBlocks) {
final int bits = b.m_commit.length * 32;
final long startAddr = RWStore.convertAddr(b.m_addr);
for (int i = 0; i < bits; i++) {
if (RWStore.tstBit(b.m_commit, i)) {
final long paddr = startAddr + (m_size * i);
bb.position(0);
m_store.readRaw(paddr, bb);
digest.update(bb);
if (ba != null) {
log.debug(BytesUtil.toHexString(ba));
}
}
}
}
{
final byte[] data = digest.digest();
final StringBuffer sb = new StringBuffer();
for (byte b : data) {
if (sb.length() > 0)
sb.append(",");
sb.append(b);
}
log.warn("ALLOCATOR[" + m_index + ":" + m_size + "] freeBits: " + freebits() + ", DIGEST:" + sb.toString());
}
}
/**
* Update the historical commit bits only once confirmed
*/
public void postCommit() {
final boolean protectTransients = m_sessionActive || m_store.isSessionProtected();
for (AllocBlock b : m_allocBlocks) {
b.m_commit = b.m_live.clone();
if (!protectTransients)
b.m_transients = b.m_live.clone();
/**
* If this allocator is shadowed then copy the new committed
* state to m_saveCommit
*/
if (m_context != null) {
// do not copy live bits to committed bits, leave to context.release()
// throw new IllegalStateException("Must not commit shadowed FixedAllocator!");
// } else if (m_store.isSessionPreserved()) {
// block.m_commit = block.m_transients.clone();
} else {
b.m_commit = b.m_live.clone();
// if m_saveCommit is set then it must be m_pendingContextCommit
if (b.m_saveCommit != null) {
if (!m_pendingContextCommit)
throw new IllegalStateException("Unexpected m_saveCommit when no pending commit");
b.m_saveCommit = null;
b.m_isoFrees = null;
}
}
}
if (m_pendingContextCommit) {
m_pendingContextCommit = false;
if (m_freeWaiting && meetsSmallSlotThreshold()) {
addToFreeList();
}
}
if (!protectTransients /*!this.m_sessionActive*/) {
m_freeBits += m_freeTransients;
// Handle re-addition to free list once transient frees are
// added back
if (m_freeWaiting && meetsSmallSlotThreshold()) {
addToFreeList();
}
m_freeTransients = 0;
}
if (s_islogDebug)
checkBits();
}
/*
* Checks for allocations committed in xfa that are free in this allocator
* and should be removed from the historical external cache.
*/
public int removeFreedWrites(final FixedAllocator xfa,
final ConcurrentWeakValueCache externalCache) {
// Compare the committed bits in each AllocBlock
int count = 0;
for (int i = 0; i < m_allocBlocks.size(); i++) {
final AllocBlock ab = m_allocBlocks.get(i);
final AllocBlock xab = xfa.m_allocBlocks.get(i);
// NOTE that absolute bit offsets are bumped by 3 for historical reasons
final int blockBitOffset = 3 + (i * xab.m_commit.length * 32);
for (int b = 0; b < xab.m_commit.length; b++) {
if (xab.m_commit[b] != ab.m_commit[b]) { // some difference
// compute those set in xfa not set in ab (removed)
final int removed = xab.m_commit[b] & ~ab.m_commit[b];
if (removed != 0) { // something to do
// need to test each of 32 bits
for (int bit = 0; bit < 32; bit++) {
if ((removed & (1 << bit)) != 0) {
// Test bit calculation
final int tstBit = blockBitOffset + (b * 32) + bit;
if (!(xfa.isCommitted(tstBit) && !isCommitted(tstBit))) {
log.error("Bit problem: " + tstBit);
}
final long paddr = xfa.getPhysicalAddress(tstBit);
if (s_islogTrace) {
log.trace("Checking address for removal: " + paddr);
}
count++;
externalCache.remove(paddr);
}
}
}
}
}
}
if (s_islogTrace)
log.trace("FA index: " + m_index + ", freed: " + count);
return count;
}
/**
* Determines if the provided physical address is within an allocated slot.
* @param addr the physical address to check
* @return true if the address falls within one of this allocator's reserved regions
*/
public boolean verifyAllocatedAddress(long addr) {
if (s_islogTrace)
log.trace("Checking Allocator " + m_index + ", size: " + m_size);
final Iterator blocks = m_allocBlocks.iterator();
final long range = m_size * m_bitSize * 32;
while (blocks.hasNext()) {
final int startAddr = blocks.next().m_addr;
if (startAddr != 0) {
final long start = RWStore.convertAddr(startAddr);
final long end = start + range;
if (s_islogTrace)
log.trace("Checking " + addr + " between " + start + " - " + end);
if (addr >= start && addr < end)
return true;
} else {
break;
}
}
return false;
}
/**
* Add a copy of the currently committed allocation data to the snapshot. This is used by the snapshot
* mechanism to ensure that a file copy, taken over the course of multiple commits, will contain the
* correct allocation data from the time the snapshot was taken.
*/
void snapshot(final ISnapshotData tm) {
if (m_diskAddr > 0)
tm.put(m_store.metaBit2Addr(m_diskAddr), commitData());
}
/**
* Returns the 1K committed allocation data by writing the commit data for each allocation block.
*/
byte[] commitData() {
try {
final byte[] buf = new byte[1024];
final DataOutputStream str = new DataOutputStream(new FixedOutputStream(buf));
try {
str.writeInt(m_size);
final Iterator iter = m_allocBlocks.iterator();
while (iter.hasNext()) {
final AllocBlock block = iter.next();
str.writeInt(block.m_addr);
for (int i = 0; i < m_bitSize; i++) {
str.writeInt(block.m_commit[i]);
}
}
// add checksum
final int chk = ChecksumUtility.getCHK().checksum(buf,
str.size());
str.writeInt(chk);
} finally {
str.close();
}
return buf;
} catch (IOException e) {
throw new StorageTerminalError("Error on write", e);
}
}
public void addToRegionMap(HashMap map) {
for (AllocBlock ab : m_allocBlocks) {
if (ab.m_addr != 0) {
final FixedAllocator pa = map.put(ab.m_addr, this);
if (pa != null) {
throw new IllegalStateException("Duplicate mapping Allocators, " + pa.m_index + ", " + m_index);
}
}
}
}
}