/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 24, 2008
*/
package com.bigdata.resources;
import java.io.File;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.log4j.Logger;
import com.bigdata.bfs.BigdataFileSystem;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.cache.HardReferenceQueue;
import com.bigdata.concurrent.NamedLock;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.AbstractLocalTransactionManager;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.CommitRecordIndex;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.DiskOnlyStrategy;
import com.bigdata.journal.IBufferStrategy;
import com.bigdata.journal.ICommitRecord;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.ILocalTransactionManager;
import com.bigdata.journal.IResourceLockService;
import com.bigdata.journal.IResourceManager;
import com.bigdata.journal.IRootBlockView;
import com.bigdata.journal.ITransactionService;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.TemporaryStore;
import com.bigdata.journal.WORMStrategy;
import com.bigdata.journal.WORMStrategy.StoreCounters;
import com.bigdata.journal.WriteExecutorService;
import com.bigdata.mdi.IPartitionMetadata;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.IndexPartitionCause;
import com.bigdata.mdi.JournalMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.EventType;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.ManagedResourceService;
import com.bigdata.service.MetadataService;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.Bytes;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.config.NicUtil;
/**
* Class encapsulates logic for managing the store files (journals and index
* segments), including the logic to compute the effective release time for the
* managed resources and to release those resources by deleting them from the
* file system.
*
* @todo There is no "CREATE_TEMP_DIR" option, and "DELETE_ON_CLOSE" does not
* remove all directories created during setup. One of the consequences is
* that you have to explicitly clean up after a unit test using a
* {@link ResourceManager} or it will leave its files around.
*
* @todo {@link BufferMode#Temporary} is not supported (verify whether the
* Transient mode is supported).
*
* @todo If we approach the limit on free space for the {@link #dataDir} then we
* need to shed index partitions to other data services or potentially
* become more aggressive in releasing old resources. See
* {@link #getDataDirFreeSpace(File)}
*
* @author Bryan Thompson
*/
abstract public class StoreManager extends ResourceEvents implements
IResourceManager {
/**
* Logger.
*/
private static final Logger log = Logger.getLogger(StoreManager.class);
/**
* Options for the {@link StoreManager}.
*
* @author Bryan Thompson
*/
public static interface Options extends com.bigdata.journal.Options {
/**
* The property whose value is the name of the directory in which the
* store files will be created (no default). This property is required
* unless the instance is transient. If you specify
* {@link com.bigdata.journal.Options#BUFFER_MODE} as
* {@link BufferMode#Transient} then journals will NOT be stored in the
* file system and {@link ResourceManager#overflow()} will be disabled.
*
* The files are created within subdirectories as follows: The
* "journals" subdirectory contains the journal files. The "segments"
* directory contains subdirectories corresponding to the index UUID for
* each scale-out index. Within those index-specific directories, the
* index segment files are assigned to files using the temporary file
* mechanisms using the munged index name as the file prefix and
* {@link Options#SEG} as the file suffix. If the index is partitioned
* then the partition identifier appears as part of the file prefix.
*
* Note: While files are stored per the scheme described above, the
* entire {@link #DATA_DIR} will be scanned recursively to identify all
* journal files and index segments during startup. Files will be used
* where ever they are found but the {@link IResourceMetadata#getFile()}
* read from a given resource MUST correspond to its relative location
* within the {@link #DATA_DIR}.
*
* Note: Each {@link DataService} or {@link MetadataService} MUST have
* its own {@link #DATA_DIR}.
*/
String DATA_DIR = StoreManager.class.getName()+".dataDir";
/**
* The capacity of the LRU cache of open {@link IRawStore}s. The
* capacity of this cache indirectly controls how many stores will be
* held open. The main reason for keeping a store open is to reuse its
* buffers if another request arrives "soon" which would read on that
* store. Note that "stores" includes both {@link ManagedJournal}s and
* {@link IndexSegmentStore}s.
*
* The effect of this parameter is indirect owing to the semantics of
* weak references and the control of the JVM over when they are
* cleared. Once an index becomes weakly reachable, the JVM will
* eventually GC the index object, thereby releasing its object graph.
* Since stores which are strongly reachable never have their weak
* reference cleared this provides our guarantee that stores are never
* closed if they are in use.
*
* Stores have non-transient resources and MUST explicitly be closed.
* Since we are not notified before the weak reference is cleared, our
* only remaining option is {@link AbstractJournal#finalize()} and
* {@link IndexSegmentStore#finalize()}, both of which close the store
* if it is still open.
*
* @see #DEFAULT_STORE_CACHE_CAPACITY
*/
String STORE_CACHE_CAPACITY = StoreManager.class.getName()
+ ".storeCacheCapacity";
/**
* The default for the {@link #STORE_CACHE_CAPACITY} option.
*/
String DEFAULT_STORE_CACHE_CAPACITY = "20";
/**
* The time in milliseconds before an entry in the store cache will be
* cleared from the backing {@link HardReferenceQueue} (default
* {@value #DEFAULT_STORE_CACHE_TIMEOUT}). This property controls how
* long the store cache will retain an {@link IRawStore} which has not
* been recently used. This is in contrast to the cache capacity.
*/
String STORE_CACHE_TIMEOUT = StoreManager.class.getName()
+ ".storeCacheTimeout";
String DEFAULT_STORE_CACHE_TIMEOUT = "" + (60 * 1000); // One minute.
/**
* A boolean property whose value determines whether or not startup will
* complete successfully if bad files are identified during the startup
* scan (default {@value #DEFAULT_IGNORE_BAD_FILES}). When
* false, the {@link StoreManager} will refuse to start
* if it finds bad files. When true, the {@link StoreManager}
* will start anyway, but some index views may not be available.
* Regardless, bad files will be logged as they are identified and all
* files will be scanned before the {@link StoreManager} aborts.
*/
String IGNORE_BAD_FILES = StoreManager.class.getName()+".ignoreBadFiles";
String DEFAULT_IGNORE_BAD_FILES = "false";
/**
* Option may be used to disable the purge of old resources during
* startup.
*/
String PURGE_OLD_RESOURCES_DURING_STARTUP = StoreManager.class.getName()
+ ".purgeOldResourcesDuringStartup";
String DEFAULT_PURGE_OLD_RESOURCES_DURING_STARTUP = "true";
/**
* Option specifies the #of bytes under management below which we will
* accelerate the overflow of the live journal by reducing its maximum
* extent below the nominal configured maximum extent. The purpose of
* this option is to promote rapid overflow of a new data service (where
* new is measured by the #of bytes under management). This helps to
* increase the rate at which index partitions are split (and moved if
* there is more than one new data service starting). When ZERO (0)
* the feature is disabled.
*/
String ACCELERATE_OVERFLOW_THRESHOLD = StoreManager.class.getName()
+ ".accelerateOverflowThreshold";
String DEFAULT_ACCELERATE_OVERFLOW_THRESHOLD = ""+(Bytes.gigabyte);
}
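/*
 * Illustrative configuration sketch (not part of this class): a concrete
 * StoreManager subclass is configured through these Options via Properties.
 * The directory path and values below are hypothetical examples.
 *
 *     Properties p = new Properties();
 *     p.setProperty(StoreManager.Options.DATA_DIR, "/var/bigdata/ds1");
 *     p.setProperty(StoreManager.Options.STORE_CACHE_CAPACITY, "20");
 *     p.setProperty(StoreManager.Options.IGNORE_BAD_FILES, "false");
 *     p.setProperty(StoreManager.Options.PURGE_OLD_RESOURCES_DURING_STARTUP, "true");
 */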
/**
* Performance counters for the {@link StoreManager}.
*
* @author Bryan Thompson
*/
public static interface IStoreManagerCounters {
/**
* The configured data directory.
*/
String DataDir = "DataDir";
/**
* The configured tmp directory.
*/
String TmpDir = "TmpDir";
/**
* true iff {@link StoreManager#isOpen()}
*/
String IsOpen = "isOpen";
/**
* true iff {@link StoreManager#isStarting()}
*/
String IsStarting = "isStarting";
/**
* true iff {@link StoreManager#isRunning()}
*/
String IsRunning = "isRunning";
String StoreCacheCapacity = "Store Cache Capacity";
String StoreCacheSize = "Store Cache Size";
/**
* #of journals currently under management.
*/
String ManagedJournalCount = "Managed Journal Count";
/**
* #of index segments currently under management.
*/
String ManagedSegmentStoreCount = "Managed Segment Store Count";
String JournalReopenCount = "Journal (Re-)open Count";
String SegmentStoreReopenCount = "Segment Store (Re-)open Count";
/**
* #of journals which have been deleted.
*/
String JournalDeleteCount = "Journal Delete Count";
/**
* #of index segments which have been deleted.
*/
String SegmentStoreDeleteCount = "Segment Store Delete Count";
/**
* The #of bytes currently under management by the {@link StoreManager}.
*/
String BytesUnderManagement = "Bytes Under Management";
/**
* The #of bytes in journals currently under management by the
* {@link StoreManager}.
*/
String JournalBytesUnderManagement = "Journal Bytes Under Management";
/**
* The #of bytes in index segments currently under management by the
* {@link StoreManager}.
*/
String SegmentBytesUnderManagement = "Segment Bytes Under Management";
/**
* The #of bytes in resources that have been deleted by the
* {@link StoreManager} after they were released.
*/
String BytesDeleted = "Bytes Deleted";
/**
* The #of bytes available on the disk volume on which the data
* directory is located.
*/
String DataDirBytesAvailable = "Data Volume Bytes Available";
/**
* The #of bytes available on the disk volume on which the temporary
* directory is located.
*/
String TmpDirBytesAvailable = "Temp Volume Bytes Available";
/**
* The maximum extent of any journal managed by this service as of the
* time when it was closed out by synchronous overflow processing.
*/
String MaximumJournalSizeAtOverflow = "Maximum Journal Size At Overflow";
/**
* The elapsed milliseconds to date required to purge old resources from
* the file system.
*
* @see StoreManager#purgeOldResources()
*/
String PurgeResourcesMillis = "Purge Resources Millis";
/**
* The current release time for the {@link StoreManager}.
*
* @see StoreManager#getReleaseTime()
*/
String ReleaseTime = "Release Time";
/**
* The timestamp associated with the last synchronous overflow event.
*/
String LastOverflowTime = "Last Overflow Time";
/**
* The most recent commit time preserved when resources were last purged
* from the {@link StoreManager}.
*
* @see StoreManager#purgeResources
*/
String LastCommitTimePreserved = "Last Commit Time Preserved";
/**
* The most recent commit time.
*/
String LastCommitTime = "Last Commit Time";
}
/**
* The directory in which the data files reside.
*
* Note: It is a hard requirement that each resource is located by the
* {@link IResourceMetadata#getFile() path} relative to the {@link #dataDir}.
*
* @see Options#DATA_DIR
* @see IResourceMetadata#getFile()
*/
protected final File dataDir;
/** Directory containing the journal resources. */
protected final File journalsDir;
/** Directory containing the index segment resources. */
protected final File segmentsDir;
/**
* The directory in which the temporary files will reside.
*
* @see Options#TMP_DIR
*/
protected final File tmpDir;
/**
* The performance counters for the {@link IBufferStrategy} backing the live
* journal and any historical journals which are concurrently open with the
* live journal. A single instance of this object is used, and a hard
* reference to that instance is held here, so that we can track the
* cumulative performance counters across the live cycles of all journal
* instances used by the data service over time. The performance counters
* are not themselves persistent and do not survive a restart of the
* {@link StoreManager}.
*/
private final StoreCounters storeCounters = new StoreCounters();
/**
* The performance counters for the {@link IBufferStrategy} backing the live
* journal and any historical journals which are concurrently open with the
* live journal. A single instance of this object is used, and a hard
* reference to that instance is held here, so that we can track the
* cumulative performance counters across the live cycles of all journal
* instances used by the data service over time. The performance counters
* are not themselves persistent and do not survive a restart of the
* {@link StoreManager}.
*/
public final StoreCounters getStoreCounters() {
return storeCounters;
}
/**
* A map over the journal histories. The map is transient and is
* re-populated from a scan of the file system during startup.
*
* The keys are the timestamp at which the journal was put into service. The
* values are the journal resource descriptions. Given the timestamp of some
* historical state of an index, this map is used to locate the journal on
* which that historical state of the index would be found.
*/
final private JournalIndex journalIndex;
/**
* A map over the index segments by ascending createTime and UUID. The map
* is transient and is re-populated from a scan of the file system during
* startup.
*
* The keys are the createTime of the index segment followed by the index
* segment UUID (to break ties). The values are the
* {@link IResourceMetadata} object describing that index segment. This map
* is used to provide some basic reporting but is primarily used to delete
* index segment resources once they are no longer required.
*
* @todo Is this strictly necessary? Do we have all the necessary
* information in the journals? Review the logic and decide.
*/
final private IndexSegmentIndex segmentIndex;
/**
* A non-thread-safe collection of {@link UUID}s for {@link IndexSegment}s
* which have been newly built but not yet incorporated in a re-start safe
* manner into an index partition view. {@link UUID}s in this collection
* are excluded from release by {@link #purgeOldResources()}.
*
* @see #purgeOldResources()
* @see IndexManager#buildIndexSegment(String,
* com.bigdata.btree.ILocalBTreeView, boolean, long, byte[], byte[],
* Event)
*/
final private Set<UUID> retentionSet = new HashSet<UUID>();
/**
* Add an {@link IndexSegment} to the set of {@link IndexSegment}s which
* have been generated but not yet incorporated into an index partition view
* and hence we must take special cautions to prevent their release.
*
* @param uuid The {@link UUID} of the {@link IndexSegmentStore}.
*
* @see #retentionSetRemove(UUID)
* @see #retentionSet
*/
protected void retentionSetAdd(final UUID uuid) {
if (uuid == null)
throw new IllegalArgumentException();
synchronized (retentionSet) {
if (!retentionSet.add(uuid)) {
// that UUID is already in this collection.
throw new IllegalStateException("Already in set: " + uuid);
}
}
}
/**
* Remove an {@link IndexSegment} from the {@link #retentionSet}. DO NOT
* invoke this until the {@link IndexSegment} has been incorporated in a
* restart safe manner into an index partition view (that is, post-commit
* rather than during the task that incorporates it into the view) or is
* known to be no longer required (post MOVE, task failed, etc).
*
* @param uuid
* The {@link UUID} of the {@link IndexSegmentStore}.
*
* @see #retentionSetAdd(UUID)
* @see #retentionSet
*/
protected void retentionSetRemove(final UUID uuid) {
if (uuid == null)
throw new IllegalArgumentException();
synchronized (retentionSet) {
if (!retentionSet.remove(uuid)) {
/*
* Note: Only a warning since invoked during error handling when
* the resource might have not made it into the retentionSet in
* the first place.
*/
log.warn("Not in retentionSet: " + uuid);
}
}
}
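/*
 * Illustrative calling pattern (hypothetical caller code): a newly built
 * IndexSegment is protected from purgeOldResources() until it has been
 * incorporated into an index partition view in a restart-safe manner, or
 * is known to be no longer required.
 *
 *     retentionSetAdd(segmentUUID);
 *     try {
 *         // ... build the segment and commit the task that incorporates it ...
 *     } finally {
 *         // post-commit, or after a failed build/MOVE, allow release again.
 *         retentionSetRemove(segmentUUID);
 *     }
 */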
/**
* A cache that is used to automatically close out unused
* {@link IndexSegmentStore}s. An {@link IndexSegment} that is no longer
* used will have its reference cleared when it is swept by the garbage
* collector and will automatically release all of its buffers (node and
* leaf cache, etc). However, at that point the {@link IndexSegmentStore} is
* still open, and it can buffer a significant amount of data in addition to
* the file handle.
*
* When the weak reference is cleared we know that there are no longer any
* hard references to the {@link IndexSegment} and hence the corresponding
* {@link IndexSegmentStore} should be closed. In fact, we can immediately
* remove the {@link IndexSegmentStore} from the cache of open stores and
* then close the store. At this point if the store is re-opened it will be
* a new object. This is easy enough to do since the {@link UUID} of the
* {@link IndexSegmentStore} is the key in our map!
*
* @see Options#STORE_CACHE_CAPACITY
* @see Options#STORE_CACHE_TIMEOUT
*/
// final protected WeakValueCache storeCache;
final protected ConcurrentWeakValueCacheWithTimeout<UUID, IRawStore> storeCache;
/**
* Provides locks on a per-{resourceUUID} basis for higher concurrency.
*/
private final transient NamedLock<UUID> namedLock = new NamedLock<UUID>();
/**
* The #of entries in the hard reference cache for {@link IRawStore}s,
* including both {@link ManagedJournal}s and {@link IndexSegmentStore}s. There MAY be
* more {@link IRawStore}s open than are reported by this method if there
* are hard references held by the application to those {@link IRawStore}s.
* {@link IRawStore}s that are not fixed by a hard reference will be
* quickly finalized by the JVM.
*/
public int getStoreCacheSize() {
return storeCache.size();
}
/**
* true iff {@link BufferMode#Transient} was indicated.
*/
private final boolean isTransient;
// /**
// * A direct {@link ByteBuffer} that will be used as the write cache for the
// * live journal and which will be handed off from live journal to live
// * journal during overflow processing which is allocated iff
// * {@link BufferMode#Disk} is chosen.
// *
// * Note: This design is motivated by JVM bug
// * 6210541 which describes a failure by
// * releaseTemporaryDirectBuffer() to release temporary direct
// * {@link ByteBuffer}s that are allocated for channel IO.
// *
// * @see com.bigdata.journal.Options#WRITE_CACHE_CAPACITY
// * @see DiskOnlyStrategy
// */
// private ByteBuffer writeCache;
/**
* An atomic hard reference to the live journal.
*/
final protected AtomicReference<ManagedJournal> liveJournalRef = new AtomicReference<ManagedJournal>(null);
/**
* true initially and remains true until the
* {@link ResourceManager} is shutdown.
*
* @see #isOpen()
*/
private final AtomicBoolean open = new AtomicBoolean(true);
/**
* true initially and until {@link #start()} completes
* successfully, this is used to disambiguate the startup transient state
* from the shutdown state.
*
* @see #isStarting()
*/
private final AtomicBoolean starting = new AtomicBoolean(true);
/**
* The service used to send files to other data services and to exchange NIO
* {@link ByteBuffer} in support of distributed query processing.
*/
private ManagedResourceService resourceService;
/**
* The service used to send files to other data services and to exchange NIO
* {@link ByteBuffer} in support of distributed query processing.
*/
public ManagedResourceService getResourceService() {
assertRunning();
return resourceService;
}
// /**
// * The port at which you can connect to the {@link ResourceService}. This
// * service provides remote access to resources hosted by the owning
// * {@link DataService}. This is used for moving resources to other data
// * services in the federation, including supporting service failover.
// *
// * @return The port used to connect to that service.
// *
// * @todo this could also be used for remote backup. however, note that you
// * can not read the live journal using this object.
// */
// public int getResourceServicePort() {
//
// assertRunning();
//
// return resourceService.port;
//
// }
/**
* @see Options#IGNORE_BAD_FILES
*/
private final boolean ignoreBadFiles;
/**
* @see Options#PURGE_OLD_RESOURCES_DURING_STARTUP
*/
private final boolean purgeOldResourcesDuringStartup;
/**
* @see Options#ACCELERATE_OVERFLOW_THRESHOLD
*/
protected final long accelerateOverflowThreshold;
/**
* Used to run the {@link Startup}. @todo defer {@link Startup} until init() outside of ctor.
*/
private final ExecutorService startupService = Executors
.newSingleThreadExecutor(new DaemonThreadFactory
(getClass().getName()+".startupService"));
/**
* Succeeds if the {@link StoreManager} {@link #isOpen()} and is NOT
* {@link #isStarting()} (the test itself is NOT atomic).
*
* @throws IllegalStateException
* unless open and not starting.
*/
protected void assertRunning() {
if (!isOpen())
throw new IllegalStateException("Not open");
if (isStarting())
throw new IllegalStateException("Starting up");
}
/**
* Return true iff the {@link StoreManager} is open and
* startup processing has been completed.
*/
public boolean isRunning() {
return isOpen() && !isStarting();
}
/**
* @throws IllegalStateException
* unless open.
*/
protected void assertOpen() {
if (!isOpen())
throw new IllegalStateException();
}
/**
* @throws IllegalStateException
* if open.
*/
protected void assertNotOpen() {
if (isOpen())
throw new IllegalStateException();
}
/**
* Return true iff the {@link StoreManager} is running. If
* the {@link StoreManager} is currently starting up, then this will await
* the completion of the {@link Startup} task.
*
* @return true if the {@link StoreManager} is running and
* false if it is shutdown.
*/
public boolean awaitRunning() {
while (isOpen() && isStarting()) {
try {
if (log.isInfoEnabled())
log.info("Waiting on startup : " + dataDir + " ...");
Thread.sleep(1000/* ms */);
} catch (InterruptedException ex) {
throw new RuntimeException("Interrupted awaiting startup: "
+ ex);
}
}
return isRunning();
}
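/*
 * Illustrative startup handshake (hypothetical caller code): tasks must not
 * be submitted while the asynchronous Startup task is still running.
 *
 *     if (!storeManager.awaitRunning()) {
 *         // the StoreManager was shutdown before startup completed.
 *         throw new IllegalStateException("StoreManager is not running");
 *     }
 *     // safe to open stores and submit tasks from here on.
 */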
/**
* A map from the resource UUID to the absolute {@link File} for that
* resource.
*
* Note: The {@link IResourceMetadata} reported by an
* {@link AbstractJournal} or {@link IndexSegmentStore} generally reflects
* the name of the file as specified to the ctor for that class, so it may
* be relative to some arbitrary directory or absolute within the file
* system.
*/
private final Map<UUID, File> resourceFiles = new HashMap<UUID, File>();
/**
* The properties given to the ctor.
*/
private final Properties properties;
/**
* Release time is zero (0L) until notified otherwise - 0L is ignored.
*
* @see #setReleaseTime(long)
*/
private long releaseTime = 0L;
/**
* The elapsed #of milliseconds in {@link #purgeOldResources()}
*/
protected long purgeResourcesMillis = 0L;
/**
* The last value computed by {@link #getEffectiveReleaseTime()} and ZERO(0)
* until a value has been calculated.
*/
protected long lastCommitTimePreserved = 0L;
/**
* The last commit time corresponding to the last synchronous overflow event
* and ZERO (0L) until there has been a synchronous overflow event.
*/
protected long lastOverflowTime = 0L;
/**
* The observed maximum size of a journal (its length in bytes) as measured
* at each synchronous overflow event.
*/
protected long maximumJournalSizeAtOverflow = 0L;
/**
* The #of {@link ManagedJournal}s that have been (re-)opened to date.
*/
final protected AtomicLong journalReopenCount = new AtomicLong();
/**
* The #of {@link IndexSegmentStore}s that have been (re-)opened to date.
*/
final protected AtomicLong segmentStoreReopenCount = new AtomicLong();
/**
* The #of {@link ManagedJournal}s that have been deleted to date.
*/
final protected AtomicLong journalDeleteCount = new AtomicLong();
/**
* The #of {@link IndexSegmentStore}s that have been deleted to date.
*/
final protected AtomicLong segmentStoreDeleteCount = new AtomicLong();
/**
* The #of bytes currently under management EXCEPT those on the live
* journal. This is incremented each time a new resource is added using
* {@link #addResource(IResourceMetadata, File)} and decremented each
* time a resource is deleted.
*/
final protected AtomicLong bytesUnderManagement = new AtomicLong();
final protected AtomicLong journalBytesUnderManagement = new AtomicLong();
final protected AtomicLong segmentBytesUnderManagement = new AtomicLong();
/**
* The #of bytes that have been deleted since startup.
*/
final protected AtomicLong bytesDeleted = new AtomicLong();
/**
* The #of bytes currently under management, including those written on the
* live journal.
*
* @throws IllegalStateException
* during startup or if the {@link StoreManager} is closed.
*/
public long getBytesUnderManagement() {
assertRunning();
return bytesUnderManagement.get()
+ getLiveJournal().getBufferStrategy().getExtent();
}
/**
* The #of bytes in {@link ManagedJournal}s, including those written on the
* live journal.
*
* @throws IllegalStateException
* during startup or if the {@link StoreManager} is closed.
*/
public long getJournalBytesUnderManagement() {
assertRunning();
return journalBytesUnderManagement.get()
+ getLiveJournal().getBufferStrategy().getExtent();
}
/**
* The #of bytes in managed {@link IndexSegmentStore}s.
*
* @throws IllegalStateException
* during startup or if the {@link StoreManager} is closed.
*/
public long getSegmentBytesUnderManagement() {
assertRunning();
return segmentBytesUnderManagement.get();
}
/**
* The #of bytes of free space remaining on the volume hosting the
* {@link #dataDir}.
*
* @return The #of bytes of free space remaining -or- -1L if
* the free space could not be determined.
*/
public long getDataDirFreeSpace() {
return getFreeSpace(dataDir);
}
/**
* The #of bytes of free space remaining on the volume hosting the
* {@link #tmpDir}.
*
* @return The #of bytes of free space remaining -or- -1L if
* the free space could not be determined.
*/
public long getTempDirFreeSpace() {
return getFreeSpace(tmpDir);
}
/**
* Return the free space in bytes on the volume hosting some directory.
*
* @param dir
* A directory hosted on some volume.
*
* @return The #of bytes of free space remaining for the volume hosting the
* directory -or- -1L if the free space could not be
* determined.
*/
/*
* Note: This was written using Apache FileSystemUtil originally. That would
* shell out "df" under un*x. Unfortunately, shelling out a child process
* requires a commitment from the OS to support a process with as much
* process space as the parent. For the data service, that is a lot of RAM.
* In general, the O/S allows "over commitment" of the available swap
* space, but you can run out of swap and then you have a problem. If the
* host was configured with scanty swap, then this problem could be
* triggered very easily and would show up as "Could not allocate memory".
*
* See http://forums.sun.com/thread.jspa?messageID=9834041#9834041
*/
private long getFreeSpace(final File dir) {
try {
if(!dir.exists()) {
return -1;
}
/*
* Note: This returns 0L if there is no free space or if the File
* does not "name" a partition in the file system semantics. That
* is why we check dir.exists() above.
*/
return dir.getUsableSpace();
} catch(Throwable t) {
log.error("Could not get free space: dir=" + dir + " : "
+ t, t);
// the error is logged and ignored.
return -1L;
}
}
// /**
// * Return the free space in bytes on the volume hosting some directory.
// *
// * Note: This uses the apache IO commons {@link FileSystemUtils} to report
// * the free space on the volume hosting the directory and then converts kb
// * to bytes.
// *
// * @param dir
// * A directory hosted on some volume.
// *
// * @return The #of bytes of free space remaining for the volume hosting the
// * directory -or- -1L if the free space could not be
// * determined.
// *
// * @see http://commons.apache.org/io/api-release/org/apache/commons/io/FileSystemUtils.html
// */
// private long getFreeSpace(final File dir) {
//
// try {
//
// return FileSystemUtils.freeSpaceKb(dir.toString())
// * Bytes.kilobyte;
//
// } catch(Throwable t) {
//
// log.error("Could not get free space: dir=" + dir + " : "
// + t, t);
//
// // the error is logged and ignored.
// return -1L;
//
// }
//
// }
/**
* An object wrapping the {@link Properties} given to the ctor.
*/
public Properties getProperties() {
return new Properties(this.properties);
}
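/*
 * Note (illustrative, hypothetical caller code): the returned object wraps
 * the ctor properties as defaults, so lookups fall through to them while
 * local modifications do not affect the StoreManager's own configuration.
 *
 *     Properties p = storeManager.getProperties();
 *     p.setProperty(StoreManager.Options.READ_ONLY, "true");     // local override only.
 *     String dir = p.getProperty(StoreManager.Options.DATA_DIR); // falls through to the defaults.
 */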
/**
* Return true iff data cannot be made restart-safe.
*/
public boolean isTransient() {
return isTransient;
}
/**
* Note: This constructor starts an asynchronous thread that scans the data
* directory for journals and index segments and creates the initial journal
* if no store files are found.
*
* Note: The store files are NOT accessible until the asynchronous startup
* is finished. Callers MUST verify that {@link StoreManager#isOpen()}
* AND NOT submit tasks until {@link StoreManager#isStarting()} returns
* false.
*
* @param properties
* See {@link Options}.
*
* @see Startup
*/
protected StoreManager(final Properties properties) {
if (properties == null)
throw new IllegalArgumentException();
this.properties = properties;
// ignoreBadFiles
{
ignoreBadFiles = Boolean
.parseBoolean(properties.getProperty(
Options.IGNORE_BAD_FILES,
Options.DEFAULT_IGNORE_BAD_FILES));
if (log.isInfoEnabled())
log.info(Options.IGNORE_BAD_FILES + "=" + ignoreBadFiles);
}
// purgeOldResourcesDuringStartup
{
purgeOldResourcesDuringStartup = Boolean
.parseBoolean(properties.getProperty(
Options.PURGE_OLD_RESOURCES_DURING_STARTUP,
Options.DEFAULT_PURGE_OLD_RESOURCES_DURING_STARTUP));
if (log.isInfoEnabled())
log.info(Options.PURGE_OLD_RESOURCES_DURING_STARTUP + "="
+ purgeOldResourcesDuringStartup);
}
// accelerateOverflowThreshold
{
accelerateOverflowThreshold = Long.parseLong(properties
.getProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD,
Options.DEFAULT_ACCELERATE_OVERFLOW_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.ACCELERATE_OVERFLOW_THRESHOLD + "="
+ accelerateOverflowThreshold);
if (accelerateOverflowThreshold < 0) {
throw new RuntimeException(
Options.ACCELERATE_OVERFLOW_THRESHOLD
+ " must be non-negative");
}
}
/*
* storeCacheCapacity
*/
{
final int storeCacheCapacity = Integer.parseInt(properties
.getProperty(Options.STORE_CACHE_CAPACITY,
Options.DEFAULT_STORE_CACHE_CAPACITY));
if (log.isInfoEnabled())
log.info(Options.STORE_CACHE_CAPACITY + "="
+ storeCacheCapacity);
if (storeCacheCapacity <= 0)
throw new RuntimeException(Options.STORE_CACHE_CAPACITY
+ " must be positive");
final long storeCacheTimeout = Long.parseLong(properties
.getProperty(Options.STORE_CACHE_TIMEOUT,
Options.DEFAULT_STORE_CACHE_TIMEOUT));
if (log.isInfoEnabled())
log.info(Options.STORE_CACHE_TIMEOUT + "=" + storeCacheTimeout);
if (storeCacheTimeout < 0)
throw new RuntimeException(Options.STORE_CACHE_TIMEOUT
+ " must be non-negative");
storeCache = new ConcurrentWeakValueCacheWithTimeout<UUID, IRawStore>(
storeCacheCapacity, TimeUnit.MILLISECONDS
.toNanos(storeCacheTimeout));
// storeCache = new WeakValueCache(
// new LRUCache(storeCacheCapacity));
}
// /*
// * Allocate an optional write cache that will be passed from live
// * journal to live journal during overflow.
// */
// {
//
// writeCache = AbstractJournal.getWriteCache(properties);
//
// }
/*
* Create the _transient_ index in which we will store the mapping from
* the commit times of the journals to their resource descriptions.
*/
journalIndex = JournalIndex.createTransient();//tmpStore);
segmentIndex = IndexSegmentIndex.createTransient();//(tmpStore);
if (log.isInfoEnabled())
log.info("Current working directory: "
+ new File(".").getAbsolutePath());
// true iff transient journals is requested.
isTransient = BufferMode.valueOf(properties.getProperty(
Options.BUFFER_MODE, Options.DEFAULT_BUFFER_MODE.toString())) == BufferMode.Transient;
/*
* data directory.
*/
if (isTransient) {
/*
* Transient.
*/
dataDir = null;
journalsDir = null;
segmentsDir = null;
} else {
/*
* Persistent.
*/
// Note: dataDir is _canonical_
final File dataDir;
try {
final String val = properties.getProperty(Options.DATA_DIR);
if (val == null) {
throw new RuntimeException("Required property: "
+ Options.DATA_DIR);
}
// Note: stored in canonical form.
dataDir = new File(val).getCanonicalFile();
if (log.isInfoEnabled())
log.info(Options.DATA_DIR + "=" + dataDir);
journalsDir = new File(dataDir, "journals").getCanonicalFile();
segmentsDir = new File(dataDir, "segments").getCanonicalFile();
} catch (IOException ex) {
throw new RuntimeException(ex);
}
if (!dataDir.exists()) {
if (log.isInfoEnabled())
log.info("Creating: " + dataDir);
if (!dataDir.mkdirs()) {
throw new RuntimeException("Could not create directory: "
+ dataDir.getAbsolutePath());
}
}
if (!journalsDir.exists()) {
if(log.isInfoEnabled())
log.info("Creating: " + journalsDir);
if (!journalsDir.mkdirs()) {
throw new RuntimeException("Could not create directory: "
+ journalsDir.getAbsolutePath());
}
}
if (!segmentsDir.exists()) {
if(log.isInfoEnabled())
log.info("Creating: " + segmentsDir);
if (!segmentsDir.mkdirs()) {
throw new RuntimeException("Could not create directory: "
+ segmentsDir.getAbsolutePath());
}
}
// verify all are directories vs regular files.
if (!dataDir.isDirectory()) {
throw new RuntimeException("Not a directory: "
+ dataDir.getAbsolutePath());
}
if (!journalsDir.isDirectory()) {
throw new RuntimeException("Not a directory: "
+ journalsDir.getAbsolutePath());
}
if (!segmentsDir.isDirectory()) {
throw new RuntimeException("Not a directory: "
+ segmentsDir.getAbsolutePath());
}
this.dataDir = dataDir;
}
// temp directory.
{
// Note: tmpDir is _canonical_
final File tmpDir;
try {
tmpDir = new File(properties.getProperty(Options.TMP_DIR,
System.getProperty("java.io.tmpdir")))
.getCanonicalFile();
} catch (IOException ex) {
throw new RuntimeException(ex);
}
if(log.isInfoEnabled())
log.info(Options.TMP_DIR + "=" + tmpDir);
if (!tmpDir.exists()) {
if(log.isInfoEnabled())
log.info("Creating temp directory: " + tmpDir);
if (!tmpDir.mkdirs()) {
throw new RuntimeException("Could not create directory: "
+ tmpDir.getAbsolutePath());
}
}
if (!tmpDir.isDirectory()) {
throw new RuntimeException("Not a directory: "
+ tmpDir.getAbsolutePath());
}
this.tmpDir = tmpDir;
}
/*
* Asynchronous startup processing.
*/
startupService.submit(new Startup());
}
/**
* Runs a startup scan of the data directory and creates the initial journal
* if none was found. If the {@link Startup} task fails or is interrupted
* then the {@link StoreManager} will be {@link StoreManager#shutdownNow()}.
*
* @author Bryan Thompson
*/
private class Startup implements Runnable {
@Override
public void run() {
try {
try {
start();
// successful startup
starting.set(false);
// Purge any resources that we no longer require.
if(purgeOldResourcesDuringStartup)
purgeOldResources();
} catch (Throwable ex) {
// avoid possibility that isRunning() could become true.
open.set(false);
log.error("Problem during startup? : " + ex, ex);
shutdownNow();
// terminate Startup task.
throw new RuntimeException(ex);
}
} finally {
/*
* Whether or not startup was successful, we make sure that this
* flag is turned off.
*/
starting.set(false);
if (log.isInfoEnabled())
log.info("Startup "
+ (isOpen() ? "successful" : "failed")
+ " : "
+ (isTransient ? "transient" : Options.DATA_DIR
+ "=" + dataDir));
}
}
/**
* Starts up the {@link StoreManager}.
*
* Note: Implementations of this method MUST be
* synchronized.
*
* @throws InterruptedException
*
* @throws IllegalStateException
* if the {@link IConcurrencyManager} has not been set
* (after a timeout).
*
* @throws IllegalStateException
* if the {@link ResourceManager} is already running.
*
* @throws InterruptedException
* if the startup scan is interrupted.
*
* @throws RuntimeException
* if bad files are encountered, etc.
*/
final private void start() throws InterruptedException {
if (!isStarting()) {
throw new IllegalStateException();
}
/*
* Verify that the concurrency manager has been set and wait a while
* if it is not available yet.
*/
{
int nwaits = 0;
while (true) {
try {
getConcurrencyManager();
break;
} catch (IllegalStateException ex) {
Thread.sleep(100/* ms */);
if (++nwaits % 50 == 0)
log.warn("Waiting for concurrency manager");
}
}
}
try {
final IBigdataFederation<?> fed = getFederation();
if (fed == null) {
/*
* Some of the unit tests do not start the txs until after
* the DataService. For those unit tests getFederation()
* will return null during startup() of the DataService. To
* have a common code path, we throw the exception here
* which is caught below.
*/
throw new UnsupportedOperationException();
}
while (true) {
if (fed.getTransactionService() != null) {
break;
}
log.warn("Waiting for transaction service discovery");
}
} catch (UnsupportedOperationException ex) {
log.warn("Federation not available - running in test case?");
}
/*
* Look for pre-existing data files.
*/
if (!isTransient) {
if (log.isInfoEnabled())
log.info("Starting scan of data directory: " + dataDir);
final Stats stats = new Stats();
scanDataDirectory(dataDir, stats);
final int nbad = stats.badFiles.size();
if(log.isInfoEnabled())
log.info("Scan results: " + stats);
if (!stats.badFiles.isEmpty()) {
if (ignoreBadFiles) {
log.warn("The following "
+ nbad
+ " file(s) had problems and are being ignored: "
+ stats.badFiles);
} else {
/*
* Note: This exception will be thrown if we could not
* get a lock on a journal file (see FileMetadata - the
* lock error is not reported until we try to read the
* magic field) or if there is a problem with the data
* in the file. You have to examine the stack trace to
* see what the root cause is.
*/
final String msg = "Could not open " + nbad
+ " files - will not start : problem files="
+ stats.badFiles;
log.fatal(msg);
throw new RuntimeException(msg);
}
}
assert journalIndex.getEntryCount() == stats.njournals;
assert segmentIndex.getEntryCount() == stats.nsegments;
assert resourceFiles.size() + nbad == stats.nfiles : "#resourceFiles="
+ resourceFiles.size()
+ ", #nbad="
+ nbad
+ ", nfiles=" + stats.nfiles;
}
/*
* Open the live journal.
*/
openLiveJournal();
// /*
// * Purge any index partition moves which did not complete before
// * shutdown.
// */
// purgeIncompleteMoves();
/*
* Notify the transaction service of the last commit time for the
* live journal for this data service. This will be zero (0L) iff
* this is a new journal on a new data service.
*
* Note: This notification is not required unless the commit time
* log for the transaction service is lost. In that case it provides
* a backup allowing new transactions to read from the last global
* commit point (once all data services have joined).
*/
final long lastCommitTime = liveJournalRef.get()
.getLastCommitTime();
if (lastCommitTime != 0L) {
getConcurrencyManager().getTransactionManager().notifyCommit(
lastCommitTime);
}
try {
resourceService = new ManagedResourceService(
new InetSocketAddress(
InetAddress
.getByName(NicUtil
.getIpAddress(
"default.nic"/* systemPropertyName */,
"default"/* defaultNicName */,
false/* loopbackOk */)),
0/* port */), 0/* requestServicePoolSize */) {
@Override
protected File getResource(final UUID uuid) throws Exception {
if (!isRunning()) {
throw new Exception("Not running.");
}
return resourceFiles.get(uuid);
}
};
} catch (IOException ex) {
throw new RuntimeException("Could not start: "
+ resourceService, ex);
}
}
/**
* Open the "live" journal.
*/
private void openLiveJournal() throws InterruptedException {
if (log.isInfoEnabled())
log.info("Creating/opening the live journal: dataDir="
+ dataDir);
if (Thread.interrupted())
throw new InterruptedException();
final Properties p = getProperties();
final File file;
final boolean newJournal;
if (journalIndex.getEntryCount() == 0) {
/*
* There are no existing journal files. Create new journal using
* a unique filename in the appropriate subdirectory of the data
* directory. Since the file is empty, it will be initialized
* as a new Journal.
*/
if (log.isInfoEnabled())
log.info("Creating initial journal: dataDir=" + dataDir);
// unique file name for new journal.
if (isTransient) {
file = null;
} else {
try {
file = File.createTempFile("journal", // prefix
Options.JNL,// suffix
journalsDir // directory
).getCanonicalFile();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/*
* Set the createTime on the new journal resource.
*/
p.setProperty(Options.CREATE_TIME, Long
.toString(nextTimestamp()));
overrideJournalExtent(p);
newJournal = true;
} else {
/*
* There is at least one pre-existing journal file, so we open
* the one with the largest timestamp - this will be the most
* current journal and the one that will receive writes until it
* overflows.
*/
// resource metadata for journal with the largest
// timestamp.
final IResourceMetadata resource = journalIndex
.find(Long.MAX_VALUE);
if (log.isInfoEnabled())
log.info("Will open as live journal: " + resource);
assert resource != null : "No resource? : timestamp="
+ Long.MAX_VALUE;
// lookup absolute file for that resource.
file = resourceFiles.get(resource.getUUID());
if (file == null) {
throw new NoSuchStoreException(resource.getUUID());
}
if (log.isInfoEnabled())
log.info("Opening most recent journal: " + file
+ ", resource=" + resource);
newJournal = false;
}
if (!isTransient) {
assert file.isAbsolute() : "Path must be absolute: " + file;
p.setProperty(Options.FILE, file.toString());
}
if (log.isInfoEnabled())
log.info("Open/create of live journal: newJournal="
+ newJournal + ", file=" + file);
// Create/open journal.
{
if (Thread.interrupted())
throw new InterruptedException();
final ManagedJournal tmp = new ManagedJournal(p);
if (newJournal) {
// add to the set of managed resources.
addResource(tmp.getResourceMetadata(), tmp.getFile());
}
/*
* Add to set of open stores.
*
* Note: single-threaded during startup.
*/
storeCache.put(tmp.getRootBlockView().getUUID(), tmp);
// storeCache.put(tmp.getRootBlockView().getUUID(), tmp, false/*
// dirty */);
if (Thread.interrupted())
throw new InterruptedException();
liveJournalRef.set(tmp);
/*
* Subtract out the #of bytes in the live journal.
*/
final long extent = -tmp.getBufferStrategy().getExtent();
bytesUnderManagement.addAndGet(extent);
journalBytesUnderManagement.addAndGet(extent);
}
}
// /**
// * Purge any index partition moves which did not complete successfully
// * on restart. These index partitions are identified by scanning the
// * indices registered on the live journal. If an index has
// * sourcePartitionId != -1 in its
// * {@link LocalPartitionMetadata} then the index was being moved onto
// * this {@link IDataService} when the service was shutdown. The index
// * (together with any {@link IndexSegment} resources that are identified
// * in its {@link LocalPartitionMetadata}) is deleted.
// *
// * @todo write a unit test for this feature.
// *
// * @todo test MDS to verify that the index partition flagged as an
// * incomplete move is not registered as part of scale-out index?
// *
// * @deprecated This is no longer necessary. The new MOVE does not use
// * {@link LocalPartitionMetadata#getSourcePartitionId()}
// * field. Index segments are cleaned up during a failed
// * receive. If the index segment for some reason is NOT
// * cleaned up, then it will be released eventually (unless
// * an immortal database is being used) since it will not be
// * incorporated into any index partition view.
// */
// private void purgeIncompleteMoves() {
//
// final boolean reallyDelete = true;
//
// final ManagedJournal liveJournal = liveJournalRef.get();
//
// // using read-committed view of Name2Addr
// final ITupleIterator itr = liveJournal.getName2Addr()
// .rangeIterator();
//
// // the list of indices that will be dropped.
// final List toDrop = new LinkedList();
//
// while (itr.hasNext()) {
//
// final ITuple tuple = itr.next();
//
// final Entry entry = EntrySerializer.INSTANCE
// .deserialize(new DataInputBuffer(tuple.getValue()));
//
// /*
// * Open the mutable btree on the journal (not the full view of
// * that index).
// */
// final BTree btree = (BTree) liveJournal.getIndex(entry.checkpointAddr);
//
// final String name = btree.getIndexMetadata().getName();
//
// final LocalPartitionMetadata pmd = btree.getIndexMetadata().getPartitionMetadata();
//
// if (pmd != null) {
//
//// System.err.println("\nname=" + name + "\npmd=" + pmd);
//
// if (pmd.getSourcePartitionId() != -1) {
//
// log.warn("Incomplete index partition move: name="
// + name + ", pmd=" + pmd);
//
// for (IResourceMetadata resource : pmd.getResources()) {
//
// if (resource.isIndexSegment()) {
//
// final File file = resourceFiles.get(resource.getUUID());
//
//// final File file = new File(segmentsDir,
//// resource.getFile());
//
// log.warn("Deleting index segment: " + file);
//
// if (file.exists()) {
//
// if (reallyDelete) {
//
// deleteResource(resource.getUUID(),
// false/* isJournal */);
//
// }
//
// } else {
//
// log.warn("Could not locate file: " + file);
//
// }
//
// }
//
// }
//
// }
//
// }
//
// if (!toDrop.isEmpty() && reallyDelete) {
//
// for (String s : toDrop) {
//
// liveJournal.dropIndex(s);
//
// }
//
// liveJournal.commit();
//
// }
//
// }
//
// } // purgeIncompleteMoves()
} // class Startup
/**
* true initially and until {@link #start()} completes
* successfully.
*/
public boolean isStarting() {
return starting.get();
}
/**
* false initially and remains false until
* {@link #start()} completes successfully. Once true, this
* remains true until either {@link #shutdown()} or
* {@link #shutdownNow()} is invoked.
*/
@Override
public boolean isOpen() {
return open.get();
}
// /**
// * Clears any stale entries in the LRU backing the {@link #storeCache}
// */
// public void clearStaleCacheEntries() {
//
// storeCache.clearStaleRefs();
//
// }
@Override
synchronized public void shutdown() {
if (log.isInfoEnabled())
log.info("");
final boolean wasOpen = this.open.get();
/*
* Note: clear before we clear [starting] or the
* StoreManager#isRunning() could report true.
*/
this.open.set(false);
// Note: if startup is running, then cancel immediately.
startupService.shutdownNow();
// failsafe clear : note that [open] is already false.
starting.set(false);
if (!wasOpen)
return;
try {
closeStores();
} catch (Exception ex) {
log.warn(ex.getMessage(), ex);
}
if (resourceService != null) {
resourceService.shutdown();
resourceService = null;
}
// try {
// tmpStore.destroy();
// } catch (Exception ex) {
// log.warn(ex.getMessage(), ex);
// }
// // release the write cache.
// writeCache = null;
}
@Override
synchronized public void shutdownNow() {
if(log.isInfoEnabled())
log.info("");
final boolean wasOpen = this.open.get();
/*
* Note: clear before we clear [starting] or the
* StoreManager#isRunning() could report true.
*/
this.open.set(false);
startupService.shutdownNow();
// failsafe clear : note that [open] is already false.
starting.set(false);
if (!wasOpen)
return;
try {
closeStores();
} catch (Exception ex) {
log.warn(ex.getMessage(), ex);
}
if (resourceService != null) {
resourceService.shutdownNow();
resourceService = null;
}
// try {
// tmpStore.destroy();
// } catch (Exception ex) {
// log.warn(ex.getMessage(), ex);
// }
// // release the write cache.
// writeCache = null;
}
/**
* Helper class gathers statistics about files during a scan.
*
* @author Bryan Thompson
*/
private static class Stats {
/**
* #of files scanned.
*/
public int nfiles;
/**
* #of journal files scanned.
*/
public int njournals;
/**
* #of index segment files found.
*/
public int nsegments;
/**
* A list of all bad files found during the scan.
*/
public Collection<String> badFiles = Collections
.synchronizedCollection(new TreeSet<String>());
/**
* total #of bytes of user data found in those files.
*/
public long nbytes;
public String toString() {
return "Stats{nfiles=" + nfiles + ", njournals=" + njournals
+ ", nsegments=" + nsegments + ", nbad=" + badFiles.size()
+ ", nbytes=" + nbytes + ", badFiles=" + badFiles + "}";
}
};
/**
* Recursively scan a directory structure identifying all journal and index
* segment resources and populating the internal {@link #resourceFiles} map.
* In addition, all journal files are listed in the {@link #journalIndex} map so
* that we can find the relevant journal quickly for a given timestamp.
*
* Note: This requires that we open each resource in order to extract its
* {@link IResourceMetadata} description. We only open the {@link IRawStore}
* for the resource, not its indices. The stores are closed again
* immediately.
*
* @param dir
* A directory to scan.
*
* @throws InterruptedException
*/
private void scanDataDirectory(File dir, Stats stats)
throws InterruptedException {
if (dir == null)
throw new IllegalArgumentException();
if (!dir.isDirectory())
throw new IllegalArgumentException();
if (Thread.interrupted())
throw new InterruptedException();
final File[] files = dir.listFiles(newFileFilter());
for (final File file : files) {
if (file.isDirectory()) {
scanDataDirectory(file, stats);
} else {
scanFile(file, stats);
}
}
}
private void scanFile(final File file, final Stats stats)
throws InterruptedException {
if (Thread.interrupted())
throw new InterruptedException();
if (log.isInfoEnabled())
log.info("Scanning file: " + file + ", stats=" + stats);
final IResourceMetadata resource;
// name of the file.
final String name = file.getName();
// #of bytes in the file as reported by the OS.
final long len = file.length();
if (len > 0 && name.endsWith(Options.JNL)) {
final Properties properties = getProperties();
properties.setProperty(Options.FILE, file.getAbsolutePath());
// Note: no writes allowed during startup.
// Note: disables the write cache among other things.
properties.setProperty(Options.READ_ONLY, "true");
final AbstractJournal tmp;
try {
tmp = new ManagedJournal(properties);
} catch (Exception ex) {
log.error("Problem opening journal: file="
+ file.getAbsolutePath(), ex);
stats.nfiles++;
stats.badFiles.add(file.getAbsolutePath());
return;
}
try {
resource = tmp.getResourceMetadata();
stats.nfiles++;
stats.njournals++;
stats.nbytes += len;
} finally {
tmp.close();
}
} else if (len > 0 && name.endsWith(Options.SEG)) {
/*
* Attempt to open the index segment.
*/
final IndexSegmentStore segStore;
try {
segStore = new IndexSegmentStore(file);
} catch (Exception ex) {
log.error("Problem opening segment: file="
+ file.getAbsolutePath(), ex);
stats.nfiles++;
stats.badFiles.add(file.getAbsolutePath());
return;
}
try {
resource = segStore.getResourceMetadata();
stats.nfiles++;
stats.nsegments++;
stats.nbytes += len;
} finally {
if(segStore.isOpen()) {
/*
* Note: opening the segment with [load == false] does not
* really open anything so you do not need to close the
* segment afterwards. I've put the conditional logic here
* just in case that changes.
*/
segStore.close();
}
}
} else {
if (len == 0L
&& (name.endsWith(Options.JNL) || name
.endsWith(Options.SEG))) {
log.warn("Ignoring empty file: " + file);
} else {
/*
* This file is not relevant to the resource manager.
*/
log.warn("Ignoring file: " + file);
}
return;
}
if (log.isInfoEnabled())
log.info("Found " + resource + " in " + file);
// if (!file.getName().equals(new File(resource.getFile()).getName())) {
//
// /*
// * The base name and extension of the file does not agree with that
// * metadata reported by the store (unlikely since the store reports
// * its metadata based on the file that it opened).
// */
//
// log.error("Wrong filename: actual=" + file + ", expected="
// + file);
//
// }
// addResource(resource, file.getAbsoluteFile());
addResource(resource, file);
}
@Override
public File getTmpDir() {
return tmpDir;
}
/**
* Note: The returned {@link File} is in canonical form.
*/
@Override
public File getDataDir() {
return dataDir;
}
/**
* Closes ALL open store files.
*
* Note: This is invoked by {@link #shutdown()} and {@link #shutdownNow()}.
*/
private void closeStores() {
// final Iterator itr = storeCache.iterator();
final Iterator<WeakReference<IRawStore>> itr = storeCache.iterator();
while (itr.hasNext()) {
// final IRawStore store = itr.next();
final IRawStore store = itr.next().get();
if (store == null) {
// weak reference has been cleared.
continue;
}
try {
store.close();
} catch (Exception ex) {
log.warn(ex.getMessage(), ex);
}
itr.remove();
}
}
/**
* The #of journals on hand.
*/
synchronized public long getManagedJournalCount() {
assertOpen();
return journalIndex.getEntryCount();
}
/**
* The #of index segments on hand.
*/
synchronized public long getManagedSegmentCount() {
assertOpen();
return segmentIndex.getEntryCount();
}
/**
* Notify the resource manager of a new resource. The resource is added to
* {@link #resourceFiles} and to either {@link #journalIndex} or
* {@link #segmentIndex} as appropriate. As a post-condition, you can use
* {@link #openStore(UUID)} to open the resource using the {@link UUID}
* specified by {@link IResourceMetadata#getUUID()}.
*
* Note: This also adds the size of the store in bytes as reported by the OS
* to {@link #bytesUnderManagement}.
*
* Note: Adding a resource to the store manager has no persistent effect
* other than the presumed presence of the specified file in the file
* system. However, error handling routines SHOULD invoke
* {@link #deleteResource(UUID, boolean)} in order to remove a resource that
* was not built correctly or not incorporated into the view. Otherwise the
* mapping from the {@link UUID} to the {@link File} will be maintained in
* memory and the {@link StoreManager} will overreport the #of bytes under
* management.
*
* @param resourceMetadata
* The metadata describing that resource.
* @param file
* The file in the local file system which is the resource.
*
* @throws RuntimeException
* if the file does not exist.
* @throws RuntimeException
* if there is already a resource registered with the same UUID
* as reported by {@link IResourceMetadata#getUUID()}
* @throws RuntimeException
* if the {@link #journalIndex} or {@link #segmentIndex} already
* know about that resource.
* @throws RuntimeException
* if {@link #openStore(UUID)} already knows about that
* resource.
* @throws IllegalArgumentException
* if the resourceMetadata is null
.
* @throws IllegalArgumentException
* if the file is null
and
* {@link #isTransient} is false
.
*
* @see #deleteResource(UUID, boolean)
* @see #retentionSetAdd(UUID)
* @see #retentionSetRemove(UUID)
*/
synchronized protected void addResource(
final IResourceMetadata resourceMetadata,
File file
) {
if (resourceMetadata == null)
throw new IllegalArgumentException();
if (file == null && !isTransient)
throw new IllegalArgumentException();
assertOpen();
final UUID uuid = resourceMetadata.getUUID();
if (log.isInfoEnabled())
log.info("file=" + file + ", uuid=" + uuid);
if (file != null) {
file = file.getAbsoluteFile();
}
// synchronized (storeCache) {
if (storeCache.get(uuid) != null) {
throw new RuntimeException("Resource already open?: "
+ resourceMetadata);
}
// }
final long extent;
if (!isTransient) {
if (!file.exists()) {
throw new RuntimeException("File not found: " + file);
}
// check for existing entry under that UUID.
final File tmp = resourceFiles.get(uuid);
if (tmp != null) {
throw new RuntimeException("Resource already registered: uuid="
+ uuid + " as file=" + tmp + " (given file=" + file
+ ")");
}
// add new entry.
resourceFiles.put(uuid, file);
// size of the file.
extent = file.length();
} else {
// transient resource - no extent.
extent = 0L;
}
if (resourceMetadata.isJournal()) {
journalIndex.add((JournalMetadata)resourceMetadata);
journalBytesUnderManagement.addAndGet(extent);
} else {
segmentIndex.add((SegmentMetadata)resourceMetadata);
segmentBytesUnderManagement.addAndGet(extent);
}
/*
* Track the #of bytes under management.
*/
bytesUnderManagement.addAndGet(extent);
}
/**
* Returns a filter that is used to recognize files that are managed by this
* class. The {@link ResourceManager} will log warnings if it sees an
* unexpected file and will NOT {@link #deleteResources()} files that it
* does not recognize.
*
* @see ResourceFileFilter
*
* @todo perhaps define setFileFilter and getFileFilter instead since
* subclassing this method is a bit difficult. The
* {@link ResourceFileFilter} would have to be a static class and we
* would have to pass in the {@link IResourceManager} so that it could
* get the {@link #dataDir}.
*/
protected ResourceFileFilter newFileFilter() {
return new ResourceFileFilter(this);
}
/**
* The object used to control access to the index resources.
*
* @throws IllegalStateException
* if the object has not been set yet using
* {@link #setConcurrencyManager(IConcurrencyManager)}.
*/
public abstract IConcurrencyManager getConcurrencyManager();
public abstract void setConcurrencyManager(IConcurrencyManager concurrencyManager);
/**
* The {@link ManagedJournal} provides the backing store used to absorb
* writes and retain history for the scale-out architecture.
*
* Note: This implementation is designed to use a shared
* {@link ConcurrencyManager} across all open journal instances for a
* {@link DataService}.
*
* @author Bryan Thompson
*/
public class ManagedJournal extends AbstractJournal {
// /**
// * Note: Each instance of the {@link ManagedJournal} reuses the SAME
// * {@link StoreManager#writeCache}. Therefore you MUST close out writes
// * on the old journal BEFORE you may allocate a new journal.
// *
// * @param properties
// *
// * @see AbstractJournal#closeForWrites(long)
// */
protected ManagedJournal(final Properties properties) {
super(properties);//, writeCache);
/*
* Set the performance counters on the new store so that we have a
* cumulative track of all activity on both the "live" journals and
* the "historical" journals managed by this data service.
*
* FIXME Must also roll the counters forward for the other journal
* buffer strategies! (The implementation class is different for the
* WORMStrategy, which is causing complications right now.)
*/
if (getBufferStrategy() instanceof DiskOnlyStrategy) {
((DiskOnlyStrategy) getBufferStrategy())
.setStoreCounters(getStoreCounters());
} else if (getBufferStrategy() instanceof WORMStrategy) {
((WORMStrategy) getBufferStrategy())
.setStoreCounters(getStoreCounters());
}
}
@Override
public String toString() {
/*
* Note: Should not depend on any state that might be unreachable,
* e.g., because the store is not open, etc.
*/
final IRootBlockView rootBlock = getRootBlockView();
return getClass().getName()
+ "{file="
+ getFile()
+ ", open="
+ ManagedJournal.this.isOpen()
+ (rootBlock != null ? ", uuid="
+ getRootBlockView().getUUID() : "") + "}";
}
/**
* Note: Exposed for the {@link DataService} which needs this for its
* 2-phase commit protocol.
*/
@Override
public long commitNow(final long commitTime) {
return super.commitNow(commitTime);
}
/**
* Exposed for {@link StoreManager#getResourcesForTimestamp(long)} which
* requires access to the {@link CommitRecordIndex} for the
* lastCommitTime on the historical journals.
*
* Note: This always returns a distinct index object. The code relies on
* this fact to avoid contention with the live {@link CommitRecordIndex}
* for the live journal.
*/
@Override
public CommitRecordIndex getCommitRecordIndex(final long addr,
final boolean readOnly) {
return super.getCommitRecordIndex(addr, readOnly);
}
@Override
public AbstractLocalTransactionManager getLocalTransactionManager() {
return (AbstractLocalTransactionManager) getConcurrencyManager()
.getTransactionManager();
}
// public DataServiceTransactionManager getLocalTransactionManager() {
//
// return (DataServiceTransactionManager) getConcurrencyManager()
// .getTransactionManager();
//
// }
/**
* The data services support group commit.
*/
@Override
public boolean isGroupCommit() {
return true;
}
@Override
public SparseRowStore getGlobalRowStore() {
return getFederation().getGlobalRowStore();
}
@Override
public SparseRowStore getGlobalRowStore(final long timestamp) {
return getFederation().getGlobalRowStore(timestamp);
}
@Override
public BigdataFileSystem getGlobalFileSystem() {
return getFederation().getGlobalFileSystem();
}
@Override
public DefaultResourceLocator getResourceLocator() {
return (DefaultResourceLocator) getFederation()
.getResourceLocator();
}
@Override
public ExecutorService getExecutorService() {
return getFederation().getExecutorService();
}
@Override
public IResourceLockService getResourceLockService() {
return getFederation().getResourceLockService();
}
@Override
public TemporaryStore getTempStore() {
return getFederation().getTempStore();
}
/**
* Extended to set the {@link IResourceMetadata} to this journal if it
* is null since a remote caller can not have the correct
* metadata on hand when they formulate the request.
*/
@Override
protected void validateIndexMetadata(final String name,
final IndexMetadata metadata) {
super.validateIndexMetadata(name, metadata);
final LocalPartitionMetadata pmd = metadata.getPartitionMetadata();
if(pmd == null) {
/*
* Note: This case permits unpartitioned indices for the MDS.
*/
return;
}
if (pmd.getResources() == null) {
/*
* A [null] for the resources field is a specific indication
* that we need to specify the resource metadata for the live
* journal at the time that the index partition is registered.
* This indicator is used when the metadata service registers an
* index partition remotely on a data service since it does not
* (and can not) have access to the resource metadata for the
* live journal as of the time that the index partition actually
* gets registered on the data service.
*
* The index partition split and join tasks do not have this
* problem since they are run locally. However, an index
* partition move operation also needs to do this.
*/
final ResourceManager resourceManager = ((ResourceManager) (StoreManager.this));
metadata.setPartitionMetadata(//
new LocalPartitionMetadata(//
pmd.getPartitionId(),//
pmd.getSourcePartitionId(),//
pmd.getLeftSeparatorKey(),//
pmd.getRightSeparatorKey(),//
new IResourceMetadata[] {//
// The live journal.
getResourceMetadata() //
},
// cause
IndexPartitionCause.register(resourceManager)
// /*
// * Note: Retains whatever history given by the
// * caller.
// */
// , pmd.getHistory() + "register(name=" + name
// + ",partitionId="
// + pmd.getPartitionId() + ") "
));
} else {
if (pmd.getResources().length == 0) {
throw new RuntimeException(
"Missing resource description: name=" + name
+ ", pmd=" + pmd);
}
if (!pmd.getResources()[0].isJournal()) {
throw new RuntimeException(
"Expecting resources[0] to be journal: name="
+ name + ", pmd=" + pmd);
}
if (!pmd.getResources()[0].getUUID().equals(
getRootBlockView().getUUID())) {
throw new RuntimeException(
"Expecting resources[0] to be this journal but has wrong UUID: name="
+ name + ", pmd=" + pmd);
}
}
}
@Override
public ScheduledFuture<?> addScheduledTask(Runnable task,
long initialDelay, long delay, TimeUnit unit) {
return getFederation().addScheduledTask(task, initialDelay, delay,
unit);
}
@Override
public boolean getCollectPlatformStatistics() {
return getFederation().getCollectPlatformStatistics();
}
@Override
public boolean getCollectQueueStatistics() {
return getFederation().getCollectQueueStatistics();
}
@Override
public int getHttpdPort() {
return getFederation().getHttpdPort();
}
@Override
public boolean isHAJournal() {
return false;
}
} // class ManagedJournal
/**
* The journal on which writes are made.
*
* @throws IllegalStateException
* if the {@link StoreManager} is not open.
* @throws IllegalStateException
* if the {@link StoreManager} is still starting up.
*/
@Override
public ManagedJournal getLiveJournal() {
assertRunning();
final ManagedJournal tmp = liveJournalRef.get();
assert tmp != null : "open=" + isOpen() + ", starting="
+ isStarting() + ", dataDir=" + dataDir;
assert tmp.isOpen();
/*
* Note: There is a brief period when we close out writes on the live
* journal before we cut over to the new live journal. Therefore this
* assertion can not be made since it is violated during that brief
* period.
*
* Note: Concurrent readers are always allowed, even during that brief
* period.
*/
// assert !liveJournal.isReadOnly();
return tmp;
}
// /**
// * This lock is used to prevent asynchronous processes such as
// * {@link ConcurrencyManager#getIndexCounters()} from acquiring the live
// * journal during the period between when we close out the old journal
// * against future writes and when the new live journal is in place.
// *
// * Note: {@link AbstractJournal#closeForWrites(long)} does not disturb
// * concurrent readers.
// */
// protected final ReentrantLock liveJournalLock = new ReentrantLock();
/**
* @throws IllegalStateException
* if the {@link StoreManager} is not open.
* @throws IllegalStateException
* if the {@link StoreManager} is still starting up.
*
* @todo write tests for unisolated and read-committed. make sure that there
* is no fencepost for read committed immediately after an overflow
* (there should not be since we do a commit when we register the
* indices on the new store).
*/
@Override
public AbstractJournal getJournal(final long timestamp) {
assertRunning();
if (timestamp == ITx.UNISOLATED || timestamp == ITx.READ_COMMITTED) {
/*
* This is a request for the live journal.
*
* Note: The live journal remains open except during overflow, when
* it is changed to a new journal and the old live journal is
* closed. Therefore we NEVER cause the live journal to be opened
* from the disk in this method.
*/
return getLiveJournal();
}
final IResourceMetadata resource;
synchronized (journalIndex) {
/*
* @todo add a weak reference cache in front of this by timestamp?
* (The MDI had a hotspot for a similar pattern of use, but I have
* not verified yet whether there is such a hotspot here).
*/
resource = journalIndex.find(Math.abs(timestamp));
}
if (resource == null) {
log.warn("No such journal: timestamp=" + timestamp);
return null;
}
return (AbstractJournal) openStore(resource.getUUID());
}
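/*
* Example (illustrative sketch, not part of the original source): resolving a
* journal for a historical read versus the live journal. The [storeManager]
* reference and the [someCommitTime] value are assumptions for the example.
*
* // the live journal (also used for ITx.READ_COMMITTED requests).
* final AbstractJournal live = storeManager.getJournal(ITx.UNISOLATED);
*
* // the journal spanning a historical commit time; may be null if the
* // resources for that timestamp have already been released.
* final AbstractJournal hist = storeManager.getJournal(someCommitTime);
* if (hist == null) {
* // no journal on hand for that timestamp.
* }
*/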
/**
* Opens an {@link IRawStore}.
*
* @param uuid
* The UUID identifying that store file.
*
* @return The open {@link IRawStore}.
*
* @throws IllegalStateException
* if the {@link StoreManager} is not open.
* @throws IllegalStateException
* if the {@link StoreManager} is still starting up.
* @throws IllegalArgumentException
* if uuid is null.
* @throws NoSuchStoreException
* if the {@link UUID} is not recognized.
* @throws NoSuchStoreException
* if the resource for that {@link UUID} could not be found.
* @throws RuntimeException
* if something else goes wrong.
*
* @todo it seems that we always have the {@link IResourceMetadata} on hand
* when we need to (re-)open a store so it might be nice to pass that
* in as it would make for more informative error messages when
* something goes wrong (except that I was planning to drop the file
* name from that interface).
*/
@Override
public IRawStore openStore(final UUID uuid) {
assertRunning();
if (uuid == null) {
throw new IllegalArgumentException();
}
/*
* Note: These operations can have modest latency, especially if we open
* a fully buffered index segment. Therefore we use a per-store
* (actually, per-resource UUID, which is the same thing) lock to avoid
* imposing latency on threads requiring access to different stores.
*/
final Lock lock = namedLock.acquireLock(uuid);
try {
/*
* Check to see if the given resource is already open.
*/
IRawStore store;
// synchronized(storeCache) {
store = storeCache.get(uuid);
// }
if (store != null) {
if (!store.isOpen()) {
if (store instanceof IndexSegmentStore) {
/*
* We can simply re-open an index segment's store file.
*/
// // Note: relative to the data directory!
// final File file = resourceFiles.get(uuid);
//
// if (file == null) {
//
// throw new NoSuchStoreException(uuid);
//
// }
//
// if (!file.exists()) {
//
// throw new RuntimeException(
// "Resource file missing? uuid=" + uuid
// + ", file=" + file);
//
// }
// re-open the store file. it will complain if the file is gone.
((IndexSegmentStore) store).reopen();
// re-opening the store.
segmentStoreReopenCount.incrementAndGet();
// done.
return store;
} else {
/*
* Note: Journals should not be closed without also
* removing them from the list of open resources. The
* live journal SHOULD NOT be closed except during
* shutdown or overflow (when it is replaced by a new
* live journal).
*/
throw new AssertionError();
}
}
return store;
}
if (store == null) {
/*
* Attempt to open the resource.
*/
// Lookup filename by resource UUID.
final File file = resourceFiles.get(uuid);
if (file == null) {
/*
* Note: Non-transactional read-historical operations DO NOT
* declare read locks and therefore are unable to prevent
* resources from being released, which can lead to this
* exception.
*/
throw new NoSuchStoreException(uuid);
}
if (!file.exists()) {
throw new NoSuchStoreException("Resource file missing? uuid="
+ uuid + ", file=" + file);
}
final UUID actualUUID;
if (file.getName().endsWith(Options.JNL)) {
/*
* Open a historical journal.
*
* Note: The live journal is never opened by this code path.
* It is opened when the resource manager is instantiated
* and it will remain open except during shutdown and
* overflow (when it is replaced by a new live journal).
*/
final Properties properties = getProperties();
properties.setProperty(Options.FILE, file.toString());
// All historical journals are read-only!
// Note: disables the write cache among other things.
properties.setProperty(Options.READ_ONLY, "true");
final AbstractJournal journal = new ManagedJournal(
properties);
final long closeTime = journal.getRootBlockView()
.getCloseTime();
// verify journal was closed for writes.
assert closeTime != 0 : "Journal not closed for writes? "
+ " : file=" + file + ", uuid=" + uuid
+ ", closeTime=" + closeTime;
assert journal.isReadOnly();
actualUUID = journal.getRootBlockView().getUUID();
store = journal;
// opened another journal.
journalReopenCount.incrementAndGet();
} else {
/*
* FIXME Make sure that the segStore either makes it into
* the cache or is closed even for spurious exceptions.
* E.g.,
*
* try {segStore=...; store=segStore;} catch()
* {if(store!=null)store.close();}
*
* But not if it was already open and not after it makes
* it into the cache.
*/
final IndexSegmentStore segStore = new IndexSegmentStore(file);
actualUUID = segStore.getCheckpoint().segmentUUID;
store = segStore;
// opened another index segment store.
segmentStoreReopenCount.incrementAndGet();
}
/*
* Verify the resource UUID.
*/
if (!actualUUID.equals(uuid)) {
// close the resource.
store.close();
throw new RuntimeException("Wrong UUID: file=" + file
+ ", expecting=" + uuid + ", actual=" + actualUUID);
}
assert store != null;
assert store.isOpen();
assert store.isStable();
}
// cache the reference.
// synchronized(storeCache) {
storeCache.put(uuid, store);//, false/* dirty */);
// storeCache.put(uuid, store, false/* dirty */);
// }
// return the reference to the open store.
return store;
} finally {
lock.unlock();
}
}
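/*
* Example (illustrative sketch, not part of the original source): opening a
* store by its resource UUID and handling the case where the resource has
* already been released. The [storeManager] and [uuid] names are assumptions
* for the example.
*
* try {
* final IRawStore store = storeManager.openStore(uuid);
* // ... read on the store ...
* } catch (NoSuchStoreException ex) {
* // the UUID is unknown or the backing file has been released.
* }
*/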
/**
* Report the next timestamp assigned by the {@link ITransactionService}.
*/
protected long nextTimestamp() {
final ILocalTransactionManager transactionManager = getConcurrencyManager()
.getTransactionManager();
return transactionManager.nextTimestamp();
}
@Override
public void deleteResources() {
assertNotOpen();
// NOP if transient.
if (isTransient())
return;
if (log.isInfoEnabled())
log.info("Deleting all resources: " + dataDir);
recursiveDelete(dataDir);
// approx. #of bytes deleted.
bytesDeleted.addAndGet(bytesUnderManagement.get());
// nothing left under management.
bytesUnderManagement.set(0L);
journalBytesUnderManagement.set(0L);
segmentBytesUnderManagement.set(0L);
}
/**
* Recursively removes any files and subdirectories and then removes the
* file (or directory) itself.
*
* Note: Files that are not recognized will be logged by the
* {@link ResourceFileFilter}.
*
* @param f
* A file or directory.
*/
private void recursiveDelete(final File f) {
if (f.isDirectory()) {
final File[] children = f.listFiles(newFileFilter());
if (children == null) {
// No such file or directory exists.
return;
}
for (int i = 0; i < children.length; i++) {
recursiveDelete(children[i]);
}
}
if (log.isInfoEnabled())
log.info("Removing: " + f);
if (f.exists() && !f.delete()) {
log.warn("Could not remove: " + f);
}
}
/**
* Updates the {@link #releaseTime}.
*
* Data services MAY release data for views whose timestamp is less than or
* equal to the specified release time IFF that action would be in keeping
* with their local history retention policy (minReleaseAge) AND if the data
* is not required for the most current committed state (data for the most
* current committed state is not releasable regardless of the release time
* or the minReleaseAge).
*
* @see #purgeOldResources(), which is responsible for actually deleting the
* old resources.
*/
public void setReleaseTime(final long releaseTime) {
assertOpen();
if (releaseTime < 0L) {
throw new IllegalArgumentException();
}
this.releaseTime = releaseTime;
}
/**
* Return the last value set with {@link #setReleaseTime(long)}.
*/
public long getReleaseTime() {
return releaseTime;
}
/**
* @see IndexManager#getIndexRetentionTime()
*/
abstract protected long getIndexRetentionTime();
/**
* In order to have atomic semantics and prevent a read-historical operation
* from starting concurrently that would have access to a view that is being
* purged, {@link IndexManager#getIndex(String, long)} and
* {@link StoreManager#purgeOldResources()} MUST contend for a shared lock.
* This is a {@link ReentrantReadWriteLock} since concurrent getIndex()
* requests can proceed as long as {@link StoreManager#purgeOldResources()}
* is not running. Also note that contention is not required for
* {@link ITx#UNISOLATED} index views.
*/
protected final ReentrantReadWriteLock indexCacheLock = new ReentrantReadWriteLock();
/**
* Identify and delete resources no longer required by the index views from
* the current releaseTime up to the lastCommitTime.
*
* Note: The ability to read from a historical commit point requires the
* existence of the journals back until the one covering that historical
* commit point. This is because the distinct historical commit points for
* the indices are ONLY defined on the journals. The index segments carry
* forward the committed state of a specific index as of the commitTime of
* the index from which the segment was built. This means that you can
* substitute the index segment for the historical index state on older
* journals, but the index segment carries forward only a single commit
* point for the index so it can not be used to read from arbitrary
* historical commit points.
*
* The caller MUST hold the exclusive lock on the
* {@link WriteExecutorService}.
*
* @return A summary of the work done -or- null if the
* preconditions for the purge operation were not satisfied.
*
* @see src/architecture/purgeResourceDecisionsMatrix.xls
*
* @see #purgeOldResources(long, boolean)
*/
final protected PurgeResult purgeOldResources() {
final long beginPurgeTime = System.currentTimeMillis();
/*
* The last commit time on record in the live journal.
*
* Note: This used to be invoked during synchronous overflow so the
* [lastCommitTime] was in fact the last commit time on the OLD journal.
* However, this is now invoked at arbitrary times (as long as there is
* a lock on the write service) so we really need to use the
* [lastOverflowTime] here to have the same semantics.
*/
final long lastCommitTime = getLiveJournal().getRootBlockView().getLastCommitTime();
if (lastCommitTime == 0L) {
if (log.isInfoEnabled())
log.info("Nothing committed yet.");
return null;
}
/*
* Make sure that we have the current release time. It is periodically
* pushed by the transaction manager, but we pull it here since we are
* about to make a decision based on the releaseTime concerning which
* resources to release.
*/
{
final IBigdataFederation fed;
try {
fed = getFederation();
} catch (UnsupportedOperationException ex) {
log.warn("Federation not available: Running in test harness?");
return null;
}
try {
final ITransactionService txService = fed
.getTransactionService();
if (txService != null) {
this.releaseTime = txService.getReleaseTime();
} else {
log
.warn("Could not discover txService - Proceeding with current release time.");
}
} catch (IOException ex) {
/*
* Since the releaseTime is monotonically increasing, if there
* is an RMI problem then we use the last release time that was
* pushed to us by the txService.
*/
log.warn("Proceeding with current release time: " + ex);
}
}
if (this.releaseTime == 0L) {
/*
* Note: The [releaseTime] is advanced by the transaction service
* when it decides that a commit point will no longer be reachable
* by new transactions and no running transaction is reading from
* that commit point.
*
* Note: We do not release anything until the releaseTime has been
* set by the transaction service. This centralizes decisions
* concerning how long to preserve history while distributing the
* actions taken based on those decisions.
*/
log.warn("releaseTime not set.");
return null;
}
// // debugging - writes out stores and indices in their respective
// caches.
// if(false) {// @todo remove code.
// int nstores = 0, nindices = 0;
// {
// Iterator<WeakReference<IRawStore>> itr = storeCache.iterator();
// while (itr.hasNext()) {
// IRawStore store = itr.next().get();
// if (store != null) {
// log.warn("Store: " + store);
// nstores++;
// }
// }
// }
// {
// Iterator<WeakReference<IIndex>> itr2 = ((IndexManager) this).indexCache
// .iterator();
// while (itr2.hasNext()) {
// IIndex ndx = itr2.next().get();
// if (ndx != null) {
// log.warn("Index: " + ndx);
// nindices++;
// }
// }
// }
// log.warn("nstores=" + nstores + ", nindices=" + nindices);
// }
final Event e = new Event(getFederation(), new EventResource(),
EventType.PurgeResources).start();
/*
* Prevent concurrent access to the index cache.
*/
indexCacheLock.writeLock().lock();
try {
/*
* The earliest timestamp that MUST be retained for the
* read-historical indices in the cache.
*
* FIXME There is a cycle here which makes it impossible to release
* an index view sooner than the timeout on the index cache when the
* index cache capacity is larger than the current minimum
* requirements (review store cache and index segment as well).
*
* The problem is that the backing hard reference queue for the
* index cache does not distinguish between actively used indices
* and those that are just being held open in case they might be
* used again "soon" so we are not able to figure out which
* indices can be closed and are therefore required to accept a
* release time which is MUCH earlier than the release time given by
* the transaction service.
*
* There are a few ways to approach this. One is to use local
* read-historical transactions for flyweight read-only operations.
* That will give us a real measure of the #of operations reading on
* any given timestamp [a fair amount of work and requires
* duplicating many of the facilities of the distributed transaction
* manager so that we can track the earliest local tx]. Another is
* to reduce the index cache capacity and timeout and then use a
* fully buffered journal so it does not matter as much if we close
* out an index [a partial fix].
*/
final long indexRetentionTime = getIndexRetentionTime();
/*
* Choose whichever timestamp would preserve more history (that is,
* choose the earlier timestamp). Note that the index retention time
* is -1 if there are no indices in the cache.
*/
final long choosenReleaseTime = indexRetentionTime == -1L ? this.releaseTime
: Math.min(indexRetentionTime, this.releaseTime);
// final long releaseTime = Math.min(indexRetentionTime, Math.min(
// maxReleaseTime, this.releaseTime));
/*
* This is the age of the selected release time as computed from the
* last commit time on the live journal.
*/
final long releaseAge = (lastCommitTime - choosenReleaseTime);
if (log.isInfoEnabled())
log.info("Choosen releaseTime=" + choosenReleaseTime
+ ": given releaseTime=" + this.releaseTime
+ ", indexRetentionTime=" + indexRetentionTime
+ " (this is "
+ TimeUnit.MILLISECONDS.toSeconds(releaseAge)
+ " seconds before/after the lastCommitTime="+lastCommitTime+")");
/*
* The earliest commit time on record in any journal available to
* the StoreManager.
*/
final long firstCommitTime;
{
// the earliest journal available to the store manager.
final IResourceMetadata resource = journalIndex.findNext(0L);
// open that journal.
final AbstractJournal j0 = (AbstractJournal) openStore(resource
.getUUID());
// the first commit time on the earliest journal available.
firstCommitTime = j0.getRootBlockView().getFirstCommitTime();
}
/*
* Find the commitTime that we are going to preserve.
*/
final long commitTimeToPreserve;
if (choosenReleaseTime < firstCommitTime) {
/*
* If the computed [releaseTime] is before the first commit
* record on the earliest available journal then there was
* nothing that could be deleted and we just return immediately.
*/
if (log.isInfoEnabled())
log.info("Release time is earlier than any commit time.");
// Nothing to do.
return null;
} else if (choosenReleaseTime >= lastCommitTime) {
/*
* If the computed [releaseTime] GTE the last commit point then
* we choose the [lastCommitTime] instead.
*
* Note: If there have been no writes on this data service but
* there have been writes on other data services then the
* txService will eventually advance the releaseTime beyond the
* lastCommitTime on this data service. Since we never release
* the last commit point we set the commitTimeToPreserve to the
* lastCommitTime on the local data service.
*/
commitTimeToPreserve = lastCommitTime;
if (log.isInfoEnabled())
log.info("commitTimeToPreserve := " + commitTimeToPreserve
+ " (this is the lastCommitTime)");
} else {
/*
* Find the timestamp for the commit record that is strictly
* greater than the release time.
*/
commitTimeToPreserve = getCommitTimeStrictlyGreaterThan(choosenReleaseTime);
if (log.isInfoEnabled())
log
.info("commitTimeToPreserve := "
+ commitTimeToPreserve
+ " (this is the first commitTime GT the releaseTime="
+ choosenReleaseTime + ")");
}
/*
* Make a note for reporting purposes.
*/
this.lastCommitTimePreserved = commitTimeToPreserve;
/*
* Find resources that were in use as of that commitTime.
*/
final Set<UUID> resourcesInUse;
final long elapsedScanCommitIndicesTime;
{
final long begin = System.currentTimeMillis();
resourcesInUse = getResourcesForTimestamp(commitTimeToPreserve);
synchronized(retentionSet) {
resourcesInUse.addAll(retentionSet);
}
elapsedScanCommitIndicesTime = System.currentTimeMillis()
- begin;
}
if (log.isInfoEnabled()) {
/* Log the in use resources (resources that MUST NOT be
* deleted).
*/
for (UUID uuid : resourcesInUse) {
log.info("In use: file=" + resourceFiles.get(uuid)
+ ", uuid=" + uuid);
}
}
final long journalBeforeCount = getManagedJournalCount();
final long segmentBeforeCount = getManagedSegmentCount();
final long bytesBeforeCount = getBytesUnderManagement();
/*
* Delete anything that is: ( NOT in use )
*
* AND ( createTime < commitTimeToPreserve )
*/
final long elapsedDeleteResourcesTime;
{
final long begin = System.currentTimeMillis();
deleteUnusedResources(commitTimeToPreserve, resourcesInUse);
elapsedDeleteResourcesTime = System.currentTimeMillis() - begin;
}
final long journalAfterCount = getManagedJournalCount();
final long segmentAfterCount = getManagedSegmentCount();
final long bytesAfterCount = getBytesUnderManagement();
final long elapsedPurgeResourcesTime = System.currentTimeMillis() - beginPurgeTime;
purgeResourcesMillis += elapsedPurgeResourcesTime;
final PurgeResult result = new PurgeResult(firstCommitTime, lastCommitTime,
this.releaseTime, indexRetentionTime, choosenReleaseTime,
commitTimeToPreserve, resourcesInUse.size(),
journalBeforeCount, journalAfterCount, segmentBeforeCount,
segmentAfterCount, bytesBeforeCount, bytesAfterCount,
elapsedScanCommitIndicesTime, elapsedDeleteResourcesTime,
elapsedPurgeResourcesTime);
e.addDetails(result.getParams());
return result;
} finally {
indexCacheLock.writeLock().unlock();
e.end();
}
}
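/*
* Worked example (assumed numbers, not from the original source) of the purge
* decision made above:
*
* releaseTime (from the txService) = 1000
* indexRetentionTime (index cache) = 800
* choosenReleaseTime = min(800, 1000) = 800
*
* With firstCommitTime = 500 and lastCommitTime = 1200, the chosen release
* time falls between them, so commitTimeToPreserve becomes the first commit
* time strictly greater than 800. Everything required by an index view as of
* that commit point (plus the retentionSet) is retained; older journals and
* index segments which are not in use become eligible for deletion.
*/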
/**
* Delete unused resources given a set of resources that are still in use.
* The unused resources are identified by scanning the {@link #journalIndex}
* and the {@link #segmentIndex}. For each resource found in either of
* those indices which is NOT found in resourcesInUse and whose
* createTime is LT the specified timestamp, we take the following steps:
*
* - close iff open
* - remove from lists of known resources
* - clear the associated {@link ILRUCache}
* - delete in the file system
*
* Note: {@link IndexSegment}s pose a special case. Their create time is
* the timestamp associated with their source view. During asynchronous
* overflow processing we generate {@link IndexSegment}s from the
* lastCommitTime of the old journal. Therefore their createTime timestamp
* is often LT the commitTimeToPreserve. In order to prevent these
* {@link IndexSegment}s from being released before they are put to use (by
* incorporating them into an index partition view) we DO NOT add them to
* the {@link #segmentIndex} until they are part of an index partition view.
*
* @param commitTimeToPreserve
* Resources created as of or later than this timestamp WILL NOT
* be deleted.
* @param resourcesInUse
* The set of resources required by views as of the
* commitTimeToPreserve. These resources have create
* times LTE to commitTimeToPreserve but are in use by
* at least one view as of that commit time and therefore MUST
* NOT be deleted.
*
* @see IndexManager#buildIndexSegment(String,
* com.bigdata.btree.ILocalBTreeView, boolean, long, byte[], byte[],
* Event)
*/
private void deleteUnusedResources(final long commitTimeToPreserve,
final Set<UUID> resourcesInUse) {
/*
* Delete old journals.
*/
// #of journals deleted.
int njournals = 0;
{
/*
* Note: This iterator supports traversal with concurrent
* modification (by a single thread). If we decide to delete a
* journal resource, then deleteResource() will be tasked to delete
* it from the [journalIndex] as well.
*/
final ITupleIterator itr = journalIndex.rangeIterator(
null/* fromKey */, null/* toKey */, 0/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.CURSOR, null/*filter*/);
while (itr.hasNext()) {
final ITuple tuple = itr.next();
final IResourceMetadata resourceMetadata = (IResourceMetadata) SerializerUtil
.deserialize(tuple.getValue());
// the create timestamp for that resource.
final long createTime = resourceMetadata.getCreateTime();
if (createTime >= commitTimeToPreserve) {
/*
* Do NOT delete any resources whose createTime is GTE the
* given commit time.
*/
if (log.isInfoEnabled())
log
.info("Stopping at resource GTE commitTime to preserve: createTime="
+ createTime
+ ", file="
+ resourceMetadata.getFile());
break;
}
final UUID uuid = resourceMetadata.getUUID();
if (resourcesInUse.contains(uuid)) {
// still required as of that timestamp.
continue;
}
try {
deleteUnusedResource(resourceMetadata);
} catch (Throwable t) {
// log error and keep going.
log.error("Could not delete journal: "
+ resourceMetadata.getFile(), t);
}
// remove from the [journalIndex].
itr.remove();
njournals++;
}
}
/*
* Delete old index segments.
*/
// #of segments deleted.
int nsegments = 0;
{
/*
* Note: This iterator supports traversal with concurrent
* modification (by a single thread). If we decide to delete an
* indexSegment resource, then deleteResource() will be tasked to
* delete it from the [segmentIndex] as well.
*/
final ITupleIterator itr = segmentIndex.rangeIterator(
null/* fromKey */, null/* toKey */, 0/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.CURSOR, null/* filter */);
while (itr.hasNext()) {
final ITuple tuple = itr.next();
final IResourceMetadata resourceMetadata = (IResourceMetadata) SerializerUtil
.deserialize(tuple.getValue());
// the create timestamp for that resource.
final long createTime = resourceMetadata.getCreateTime();
if (createTime >= commitTimeToPreserve) {
/*
* Do NOT delete any resources whose createTime is GTE the
* given commit time.
*/
if (log.isInfoEnabled())
log
.info("Stopping at resource GTE commitTime to preserve: createTime="
+ createTime
+ ", file="
+ resourceMetadata.getFile());
break;
}
final UUID uuid = resourceMetadata.getUUID();
if (resourcesInUse.contains(uuid)) {
// still required as of that timestamp.
continue;
}
try {
// delete the backing file.
deleteUnusedResource(resourceMetadata);
} catch (Throwable t) {
// log error and keep going.
log.error("Could not delete segment - continuing: "
+ resourceMetadata.getFile(), t);
}
// remove from the [segmentIndex]
itr.remove();
nsegments++;
}
}
if (log.isInfoEnabled())
log.info("Given " + resourcesInUse.size()
+ " resources that are in use as of timestamp="
+ commitTimeToPreserve + ", deleted " + njournals
+ " journals and " + nsegments + " segments");
}
/**
* Delete the resource in the file system and remove it from the
* {@link #storeCache} and {@link #resourceFiles} and either
* {@link #journalIndex} or {@link #segmentIndex} as appropriate.
*
*
* DO NOT delete resources that are in use!
*
* A resource that has not yet been incorporated into a view may be deleted
* without further concern. However, once a resource has been incorporated
* into a view then you MUST arrange for appropriate synchronization before
* the resource may be deleted. For example, {@link #purgeOldResources()}
* imposes the constraint that the caller is responsible for
* synchronization and is generally invoked during synchronous overflow
* since we know that there are no active writers at that time.
*
* Pre-conditions:
*
* - The resource identified by that {@link UUID} exists and is not the
* live journal.
* - The resource is not in use (not checked).
* - The resource is found in {@link #resourceFiles}.
*
* Post-conditions:
*
* - The resource is closed if it was open and is no longer found in the
* {@link #storeCache}.
* - The resource is no longer found in {@link #resourceFiles}.
* - The backing file for the resource has been deleted (the backing file
* is obtained from {@link #resourceFiles}).
* - Various counters maintained by the {@link StoreManager} have been
* updated (bytes deleted, bytes under management, etc).
* - The file has been removed from either the {@link #journalIndex} or
* the {@link #segmentIndex} as appropriate.
*
*
* @param uuid
* The {@link UUID} which identifies the resource.
* @param isJournal
* true if the resource is a journal.
*/
protected void deleteResource(final UUID uuid, final boolean isJournal)
throws NoSuchStoreException {
if (log.isInfoEnabled())
log.info("deleteResource: uuid=" + uuid + ", isJournal="
+ isJournal);
if (uuid == null)
throw new IllegalArgumentException();
if (uuid.equals(liveJournalRef.get().getRootBlockView().getUUID())) {
/*
* Can't close out the live journal!
*
* Note: using the reference directly since invoked during startup
* to delete index segments left lying around if there is an
* incomplete move.
*/
throw new IllegalArgumentException();
}
synchronized (retentionSet) {
if (retentionSet.contains(uuid)) {
throw new IllegalStateException("Resource in retentionSet: "
+ uuid);
}
}
/*
* Close out store iff open.
*/
{
final IRawStore store = storeCache.remove(uuid);
if (store != null) {
final File file = store.getFile();
if(isJournal) {
assert store instanceof AbstractJournal;
} else {
assert store instanceof IndexSegmentStore;
}
try {
if (store.isOpen()) {
// make sure the store is closed.
store.close();
}
} catch (IllegalStateException t) {
/*
* There should not be closed journals in the cache since
* they are only closed by the finalizer.
*
* However, an IndexSegmentStore will be closed if the
* IndexSegment is closed and it can still be in the cache
* until its reference is cleared when it gets finalized.
*
* Note: if there is a concurrent close then that might be
* interesting and should at least be explored further.
*/
if (isJournal)
// probably a problem.
log.error(file, t);
else
// probably NOT a problem.
log.warn(file, t);
}
}
}
/*
* delete the backing file.
*/
{
final File file = resourceFiles.remove(uuid);
if (log.isInfoEnabled())
log.info("DELETE: file=" + file + ", uuid=" + uuid + ", isJournal="
+ isJournal);
if (file == null) {
/*
* Note: This can happen if you confuse the indexUUID and the
* indexSegment's UUID in the code. The former is on the
* IndexMetadata while the latter (the one that you want) is on
* the SegmentMetadata.
*/
throw new NoSuchStoreException(uuid);
}
if (!file.exists()) {
throw new RuntimeException("Not found: " + file);
}
final long length = file.length();
if (!file.delete()) {
throw new RuntimeException("Could not delete: " + file);
}
// track #of bytes deleted since startup.
bytesDeleted.addAndGet(length);
// track #of bytes still under management.
bytesUnderManagement.addAndGet(-length);
if(isJournal) {
journalBytesUnderManagement.addAndGet(-length);
journalDeleteCount.incrementAndGet();
} else {
segmentBytesUnderManagement.addAndGet(-length);
segmentStoreDeleteCount.incrementAndGet();
}
}
/*
* Remove the resource from either journalIndex or segmentIndex as
* appropriate.
*/
{
boolean found = false;
if (isJournal) {
synchronized (journalIndex) {
@SuppressWarnings("unchecked")
final ITupleIterator<JournalMetadata> itr = journalIndex
.rangeIterator(null/* fromKey */,
null/* toKey */, 0/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.CURSOR,
null/* filter */);
while(itr.hasNext()) {
final IResourceMetadata md = itr.next().getObject();
if(md.getUUID().equals(uuid)) {
itr.remove();
found = true;
break;
}
}
}
} else {
synchronized (segmentIndex) {
@SuppressWarnings("unchecked")
final ITupleIterator<SegmentMetadata> itr = segmentIndex
.rangeIterator(null/* fromKey */,
null/* toKey */, 0/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.CURSOR,
null/* filter */);
while (itr.hasNext()) {
final IResourceMetadata md = itr.next().getObject();
if (md.getUUID().equals(uuid)) {
itr.remove();
found = true;
break;
}
}
}
}
if (!found)
throw new NoSuchStoreException(uuid);
}
}
/**
* Variant used by {@link #deleteUnusedResources(long, Set)}, which is in
* turn invoked by {@link #purgeOldResources()}. This implementation is
* different in that we have the {@link IResourceMetadata} on hand when we
* need to delete the resource. I judge it worth the redundancy in the code
* to have a variant specific to this use case so that the DELETE log
* messages report the {@link IResourceMetadata#getCreateTime() create time}
* which can be used as a cross-check on {@link #purgeOldResources()}.
* Pre-conditions:
*
* - The resource described by the {@link IResourceMetadata} exists and
* is not the live journal.
* - The resource is not in use (not checked).
* - The resource is found in {@link #resourceFiles}.
*
* Post-conditions:
*
* - The resource is closed if it was open and is no longer found in the
* {@link #storeCache}.
* - The resource is no longer found in {@link #resourceFiles}.
* - The {@link ILRUCache} for that resource has been cleared.
* - The backing file for the resource has been deleted (the backing file
* is obtained from {@link #resourceFiles}).
* - Various counters maintained by the {@link StoreManager} have been
* updated (bytes deleted, bytes under management, etc).
*
*
* Note: The caller MUST remove the entry for the resource from either
* {@link #journalIndex} or the {@link #segmentIndex} as appropriate. For
* this use case, the caller can handle that efficiently since they are
* already traversing an iterator on the appropriate {@link BTree} and can
* use {@link Iterator#remove()} to delete the corresponding entry from the
* {@link BTree}.
*
* @param resourceMetadata
* The metadata describing the resource to be deleted.
*/
private void deleteUnusedResource(final IResourceMetadata resourceMetadata) {
if (log.isInfoEnabled())
log.info("deleteResource: " + resourceMetadata);
if (resourceMetadata == null)
throw new IllegalArgumentException();
final UUID uuid = resourceMetadata.getUUID();
if (uuid.equals(liveJournalRef.get().getRootBlockView().getUUID())) {
/*
* Can't close out the live journal!
*
* Note: using the reference directly since invoked during startup
* to delete index segments left lying around if there is an
* incomplete move.
*/
throw new IllegalArgumentException();
}
synchronized (retentionSet) {
if (retentionSet.contains(uuid)) {
throw new IllegalStateException("Resource in retentionSet: "
+ uuid);
}
}
/*
* Close out store iff open.
*/
{
final IRawStore store = storeCache.remove(uuid);
if (store != null) {
final File file = store.getFile();
if(resourceMetadata.isJournal()) {
assert store instanceof AbstractJournal;
} else {
assert store instanceof IndexSegmentStore;
}
try {
if (store.isOpen()) {
// make sure the store is closed.
store.close();
}
} catch (IllegalStateException t) {
/*
* There should not be closed journals in the cache since
* they are only closed by the finalizer.
*
* However, an IndexSegmentStore will be closed if the
* IndexSegment is closed and it can still be in the cache
* until its reference is cleared when it gets finalized.
*
* Note: if there is a concurrent close then that might be
* interesting and should at least be explored further.
*/
if (resourceMetadata.isJournal())
// probably a problem.
log.error(file, t);
else
// probably NOT a problem.
log.warn(file, t);
}
}
}
// @see BLZG-1501 (remove LRUNexus)
// /*
// * Clear record for that store from the LRUNexus and remove the entry
// * for the store itself from the LRUNexus.
// */
// if (LRUNexus.INSTANCE != null) {
//
// LRUNexus.INSTANCE.deleteCache(uuid);
//
// }
/*
* delete the backing file.
*/
{
final File file = resourceFiles.remove(uuid);
/*
* Note: This logs the file as reported by [resourceFiles] as well
* as the file in IResourceMetadata in case any discrepancy arises.
*/
// if (log.isInfoEnabled())
// log.info("DELETE: " + resourceMetadata + " : " + file);
log.warn("DELETE: " + resourceMetadata + " : " + file);
if (file == null) {
/*
* Note: This can happen if you confuse the indexUUID and the
* indexSegment's UUID in the code. The former is on the
* IndexMetadata while the latter (the one that you want) is on
* the SegmentMetadata.
*/
throw new NoSuchStoreException(uuid);
}
if (!file.exists()) {
throw new RuntimeException("Not found: " + file);
}
final long length = file.length();
if (!file.delete()) {
throw new RuntimeException("Could not delete: " + file);
}
// track #of bytes deleted since startup.
bytesDeleted.addAndGet(length);
// track #of bytes still under management.
bytesUnderManagement.addAndGet(-length);
if(resourceMetadata.isJournal()) {
journalBytesUnderManagement.addAndGet(-length);
journalDeleteCount.incrementAndGet();
} else {
segmentBytesUnderManagement.addAndGet(-length);
segmentStoreDeleteCount.incrementAndGet();
}
}
}
/**
* Finds the journal spanning the first {@link ICommitRecord} that is
* strictly greater than the specified timestamp and returns the timestamp
* of that {@link ICommitRecord}.
*
* @param releaseTime
* A release time as set by {@link #setReleaseTime(long)}. Any
* resource as of this timestamp is available for release.
*
* @return The timestamp of the first commit record whose timestamp is
* strictly greater than releaseTime.
*
* @throws IllegalArgumentException
* If there is no commit point that is strictly greater than the
* releaseTime. This implies that the release time is either in
* the future or, if the releaseTime is equal to the last
* commitTime, that you are trying to release everything in the
* database.
*/
protected long getCommitTimeStrictlyGreaterThan(final long releaseTime) {
final ManagedJournal journal = (ManagedJournal) getJournal(releaseTime);
if (journal == null) {
throw new IllegalArgumentException("No data for releaseTime="
+ releaseTime);
}
final IRootBlockView rootBlockView = journal.getRootBlockView();
final ICommitRecord commitRecord = journal
.getCommitRecordStrictlyGreaterThan(releaseTime);
if (commitRecord == null) {
final long closeTime = rootBlockView.getCloseTime();
if (closeTime == 0L) {
/*
* Since this journal is not closed then we know that the next
* commit would be on this journal, but there is no commit for
* that release time.
*/
throw new IllegalArgumentException("No data for releaseTime="
+ releaseTime);
}
/*
* Otherwise this journal was closed as of this timestamp.
* getJournal(timestamp) returns the journal having data for the
* timestamp. However, since we are interested in the _next_ commit
* point, we need to recursively invoke ourselves with the close
* time of this journal.
*/
log.warn("Examining prior journal (fence post): closeTime="
+ closeTime + ", releaseTime=" + releaseTime);
return getCommitTimeStrictlyGreaterThan(closeTime);
}
/*
* This is the timestamp associated with the commit point that is the
* first commit point strictly greater than the given release time.
*/
final long commitTime = commitRecord.getTimestamp();
log.warn("Chose commitTime=" + commitTime + " given releaseTime="
+ releaseTime);
assert commitTime > releaseTime;
return commitTime;
}
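/*
* Example (assumed timestamps, not from the original source): suppose the
* journal spanning releaseTime=900 made its last commit at 890 and was closed
* for writes at closeTime=950. There is no commit record on that journal
* strictly greater than 900, so the method recurses with the closeTime (950)
* and returns the first commit time GT 950 found on the next journal, e.g.
* 1000.
*/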
/**
* Finds all resources used by any registered index as of the
* commitTimeToPreserve up to and including the lastCommitTime for
* the live journal.
*
* Note: We include all dependencies for all commit points subsequent to the
* probe in order to ensure that we do not accidentally release dependencies
* required for more current views of the index.
*
* Note: This method solely considers the index views as defined at each
* commit point starting with the given commit point. It DOES NOT pay
* attention to the release time or to any other aspect of the state of the
* system.
*
* @param commitTimeToPreserve
* The commit time corresponding to the first commit point which
* must be preserved.
*
* @return The set of resource {@link UUID}s required by at least one index
* for any commit time GTE the specified commit time.
*/
protected Set<UUID> getResourcesForTimestamp(final long commitTimeToPreserve) {
if (log.isDebugEnabled())
log.debug("commitTimeToPreserve=" + commitTimeToPreserve
+ ", lastCommitTime="
+ getLiveJournal().getRootBlockView().getLastCommitTime());
// must be a commitTime.
if (commitTimeToPreserve <= 0)
throw new IllegalArgumentException();
final Set<UUID> uuids = new LinkedHashSet<UUID>(512);
/*
* The live journal is always a dependency, even if there are no indices
* declared.
*/
uuids.add(getLiveJournal().getRootBlockView().getUUID());
/*
* Scan all journals having data for commit points GTE the given
* [commitTime].
*
* Note: We have to scan ALL journals since they are organized by their
* createTime in the [journalIndex] not their [lastCommitTime].
*/
synchronized(journalIndex) {
@SuppressWarnings("unchecked")
final ITupleIterator<JournalMetadata> itr = journalIndex.rangeIterator();
while(itr.hasNext()) {
final ITuple<JournalMetadata> tuple = itr.next();
final JournalMetadata journalMetadata = tuple.getObject();
final UUID uuid = journalMetadata.getUUID();
final ManagedJournal journal = (ManagedJournal) openStore(uuid);
// the last commit point on that journal.
final long lastCommitTime = journal.getRootBlockView()
.getLastCommitTime();
if (lastCommitTime < commitTimeToPreserve) {
/*
* Ignore this journal since last commit point is strictly
* LT our starting [commitTime].
*
* Note: Since the index partition views are re-defined on
* the new journal by each synchronous overflow operation,
* we do not need to consider older journals in order to
* discover the resources used by all index partition views
* defined as of the start of any given journal.
*/
continue;
}
/*
* Scan commit points on that journal.
*/
{
if (log.isDebugEnabled())
log.debug("Examining journal: file="
+ journal.getFile() + ", lastCommitTime="
+ lastCommitTime + ", uuid="
+ journal.getRootBlockView().getUUID());
/*
* The index of commit points for the journal, loaded from
* the last commit point on the journal. This is Ok since we
* always want to read up to the lastCommitPoint on each
* journal, including on the live journal.
*
* Note: This is NOT the live CommitRecordIndex. The live
* CommitRecordIndex is NOT protected for use by concurrent
* threads.
*/
final CommitRecordIndex commitRecordIndex = journal
.getCommitRecordIndex(journal.getRootBlockView()
.getCommitRecordIndexAddr(),true/*readOnly*/);
/*
* A per-journal hash set of the [checkpointAddr] for the
* BTree's that we have examined so that we can skip over
* any BTree whose state has not been changed since the last
* commit point (if it has the same checkpointAddr in two
* different commit point then its state has not changed
* between those commit points).
*/
final Set<Long> addrs = new HashSet<Long>(512);
/*
* In order to scan timestamps from [commitTime] through to
* the end. For each tuple, fetch the corresponding
* [commitRecord]. For each commitRecord, fetch the
* Name2Addr index and visit its Entries.
*/
@SuppressWarnings("unchecked")
final ITupleIterator itr2 = commitRecordIndex
.rangeIterator(commitTimeToPreserve/* fromKey */, null/* toKey */);
while(itr2.hasNext()) {
final ITuple tuple2 = itr2.next();
final CommitRecordIndex.Entry entry2 = (CommitRecordIndex.Entry) tuple2
.getObject();
/*
* For each distinct checkpoint, load the BTree and
* fetch its local partition metadata which specifies
* its resource dependencies. For each resource, add it
* to the set of resources that we are collecting. All
* of those resources MUST be retained.
*/
final ICommitRecord commitRecord = commitRecordIndex
.fetchCommitRecord(entry2);
final Name2Addr name2addr = (Name2Addr) Name2Addr
.load(
journal,
commitRecord
.getRootAddr(AbstractJournal.ROOT_NAME2ADDR),
true/* readOnly */);
@SuppressWarnings("unchecked")
final ITupleIterator<Name2Addr.Entry> itr3 = name2addr.rangeIterator();
while(itr3.hasNext()) {
final ITuple<Name2Addr.Entry> tuple3 = itr3.next();
final Name2Addr.Entry entry3 = tuple3.getObject();
final long checkpointAddr = entry3.checkpointAddr;
if(addrs.add(checkpointAddr)) {
/*
* New checkpoint address.
*/
if (log.isDebugEnabled())
log.debug("index: name=" + entry3.name);
// load checkpoint record from the store.
final Checkpoint checkpoint = Checkpoint.load(journal, entry3.checkpointAddr);
// read the index metadata object for that checkpoint.
final IndexMetadata indexMetadata = IndexMetadata.read(journal, checkpoint.getMetadataAddr());
// this is where the definition of the view is stored.
final LocalPartitionMetadata pmd = indexMetadata.getPartitionMetadata();
if (pmd == null) {
/*
* For scale-out, all indices should be
* index partitions and should define the
* resources required by their view.
*
* Note: However, the metadata service is
* not currently partitioned so you will see
* unpartitioned indices there.
*/
continue;
}
for(IResourceMetadata t : pmd.getResources()) {
if (uuids.add(t.getUUID())) {
if (log.isInfoEnabled())
log.info("Dependency: file="
+ t.getFile() + ", uuid="
+ t.getUUID() + ", view="
+ pmd);
}
} // next resource in view
} // end if
} // next Name2Addr.Entry
} // next CommitRecordIndex.Entry
} // block
} // while(journalIndex.rangeIterator.hasNext())
} // synchronized( journalIndex )
if (log.isInfoEnabled())
log.info("commitTime=" + commitTimeToPreserve + ", #used=" + uuids.size());
return uuids;
}
/**
* Munge an index name so that it is suitable for use in a filesystem. In
* particular, any non-word characters are converted to an underscore
* character ("_"). This gets rid of all punctuation characters and
* whitespace in the index name itself, but will not translate unicode
* characters.
*
* @param s
* The name of the scale-out index.
*
* @return A string suitable for inclusion in a filename.
*/
static public String munge(final String s) {
return s.replaceAll("[\\W]", "_");
}
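/*
* Examples (illustrative, not from the original source):
*
* munge("U1.spo.POS") => "U1_spo_POS"
* munge("my index #12") => "my_index__12"
*
* Distinct scale-out index names can munge onto the same string; see
* #getIndexSegmentFile(String, UUID, int) below for how the per-index UUID
* subdirectory keeps such files separated.
*/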
@Override
public File getIndexSegmentFile(final IndexMetadata indexMetadata) {
if (indexMetadata == null)
throw new IllegalArgumentException();
final IPartitionMetadata pmd = indexMetadata.getPartitionMetadata();
return getIndexSegmentFile(indexMetadata.getName(), indexMetadata
.getIndexUUID(), pmd == null ? -1 : pmd.getPartitionId());
}
/**
* Return the file on which a new {@link IndexSegment} should be written.
* The file will exist but will have zero length. The file is created using
* the {@link File#createTempFile(String, String, File)} mechanism within
* the configured {@link #dataDir} in the subdirectory for the specified
* scale-out index.
*
* Note: The index name appears in the file path above the {@link UUID} of
* the scale-out index. Therefore it is not possible to have collisions
* arise in the file system when given indices whose scale-out names differ
* only in characters that are munged onto the same character since the
* files will always be stored in a directory specific to the scale-out
* index.
*
* @param scaleOutIndexName
* The name of the scale-out index.
* @param indexUUID
* The UUID of the scale-out index.
* @param partitionId
* The index partition identifier -or- -1 if the
* index is not partitioned (handles the MDS which does not use
* partitioned indices at this time).
*
* @return The {@link File} on which a {@link IndexSegmentStore} for that
* index partition may be written. The file will be unique and
* empty.
*
* @throws IllegalArgumentException
* if any argument is null
* @throws IllegalArgumentException
* if the partitionId is negative and not -1
*
* @todo should the filename be relative or absolute?
*/
public File getIndexSegmentFile(final String scaleOutIndexName,
final UUID indexUUID, final int partitionId) {
assertOpen();
if (scaleOutIndexName == null)
throw new IllegalArgumentException();
if (indexUUID == null)
throw new IllegalArgumentException();
if (partitionId < -1)
throw new IllegalArgumentException();
// munge index name to fit the file system.
final String mungedName = munge(scaleOutIndexName);
// subdirectory into which the individual index segs will be placed.
final File indexDir = new File(segmentsDir, mungedName + File.separator
+ indexUUID.toString());
// make sure that directory exists.
indexDir.mkdirs();
final String partitionStr = (partitionId == -1 ? "" : "_shardId"
+ leadingZeros.format(partitionId));
final String prefix = mungedName + "" + partitionStr + "_";
final File file;
try {
file = File.createTempFile(prefix, Options.SEG, indexDir);
} catch (IOException e) {
throw new RuntimeException(e);
}
if (log.isInfoEnabled())
log.info("Created file: " + file);
return file;
}
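/*
* Example (illustrative path, not from the original source): for
* scaleOutIndexName="U1.spo.POS" and partitionId=12, the file created above
* would look roughly like
*
* <segmentsDir>/U1_spo_POS/<indexUUID>/U1_spo_POS_shardId<zero-padded 12><random>
*
* followed by the Options.SEG suffix, where the zero padding comes from
* [leadingZeros] and the random component is supplied by
* File.createTempFile().
*/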
/**
* This attempts to obtain the exclusive lock for the
* {@link WriteExecutorService}. If successful, it purges any resources that
* are no longer required based on
* {@link StoreManager.Options#MIN_RELEASE_AGE} and optionally truncates the
* live journal such that no free space remains in the journal.
*
* Note: If there is heavy write activity on the service then the timeout
* may well expire before the exclusive write lock becomes available.
* Further, the acquisition of the exclusive write lock will throttle
* concurrent write activity and negatively impact write performance if the
* system is heavily loaded by write tasks.
*
* @param timeout
* The timeout (in milliseconds) that the method will await the
* pause of the write service.
* @param truncateJournal
* When true, the live journal will be truncated to
* its minimum extent (all writes will be preserved but there
* will be no free space left in the journal). If writes are
* directed to the live journal after it has been truncated then
* it will be transparently re-extended. This may be used to
* force the {@link DataService} to its minimum possible
* footprint for the configured history retention policy.
*
* @return true if successful and false if the
* write service could not be paused after the specified timeout.
*
* @throws IOException
* @throws InterruptedException
* @throws IllegalStateException
* if the {@link StoreManager} is not running.
*/
public boolean purgeOldResources(final long timeout,
final boolean truncateJournal) throws InterruptedException {
final WriteExecutorService writeService = getConcurrencyManager()
.getWriteService();
if (writeService.tryLock(timeout, TimeUnit.MILLISECONDS)) {
assertRunning();
try {
final Event event = new Event(getFederation(),
new EventResource(), EventType.PurgeResources).start();
try {
final PurgeResult purgeResult = purgeOldResources();
if (purgeResult != null) {
log.warn(purgeResult.toString());
event.addDetails(purgeResult.getParams());
}
if (truncateJournal) {
assertRunning();
getLiveJournal().truncate();
}
} finally {
event.end();
}
return true;
} finally {
// release the lock.
writeService.unlock();
}
} else {
log.warn("Purge resources did not run: timeout=" + timeout);
return false;
}
}
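/*
* Example (illustrative sketch, not part of the original source): forcing the
* service to its minimum footprint for the configured history retention
* policy. The 5 second timeout (and truncateJournal=true) are assumptions for
* the example.
*
* try {
* // timeout=5000ms, truncateJournal=true
* if (!storeManager.purgeOldResources(5000L, true)) {
* // the exclusive write lock was not granted within the timeout.
* }
* } catch (InterruptedException ex) {
* Thread.currentThread().interrupt();
* }
*/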
/**
* When the {@link StoreManager} is relatively new (as measured by the #of
* bytes under management) we discount the journal extent in order to
* trigger overflow earlier. Together with the discount applied to the split
* handler by the {@link AsynchronousOverflowTask}, this helps to break
* down new index partitions allocated on the new data service and
* re-distribute those index partitions (if there are other data services
* which have even less utilization).
*
* @param p
* The properties (modified as side-effect).
*/
protected void overrideJournalExtent(final Properties p) {
final long bytesUnderManagement = this.bytesUnderManagement.get();
if (accelerateOverflowThreshold == 0
|| bytesUnderManagement >= accelerateOverflowThreshold) {
/*
* Crossed the threshold where we no longer accelerate overflow.
*/
return;
}
final double d = (double) bytesUnderManagement
/ accelerateOverflowThreshold;
final long initialExtent = Long.parseLong(p.getProperty(
Options.INITIAL_EXTENT, Options.DEFAULT_INITIAL_EXTENT));
final long maximumExtent = Long.parseLong(p.getProperty(
Options.MAXIMUM_EXTENT, Options.DEFAULT_MAXIMUM_EXTENT));
/*
* Don't allow a journal w/ less than 10M or the minimum specified by
* Options.
*/
final long minimumExtent = Math.max(Options.minimumInitialExtent,
Bytes.megabyte * 10);
/*
* Use the same value for initial and maximum extents since we plan to
* overflow rapidly. We choose the value as a discount on the maximum
* extent. This prevents numerous extensions until we get near to the
* maximum extent.
*/
final long adjustedExtent = Math.max(minimumExtent,
(long) (maximumExtent * d));
p.setProperty(Options.INITIAL_EXTENT, Long.toString(adjustedExtent));
p.setProperty(Options.MAXIMUM_EXTENT, Long.toString(adjustedExtent));
if (log.isInfoEnabled())
log.info("discount=" + d //
+ ", bytesUnderManagement=" + bytesUnderManagement //
+ ", threshold=" + accelerateOverflowThreshold//
+ ", minimimInitialExtent=" + minimumExtent//
+ ", initialExtent=" + initialExtent //
+ ", maximumExtent=" + maximumExtent //
+ ", adjustedExtent=" + adjustedExtent);
return;
}
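/*
* Worked example (assumed values, not from the original source) for the
* extent discount above: with accelerateOverflowThreshold = 20GB,
* bytesUnderManagement = 2GB, maximumExtent = 200MB, and assuming
* Options.minimumInitialExtent is no larger than 10MB:
*
* d = 2GB / 20GB = 0.1
* adjustedExtent = max(10MB, 200MB * 0.1) = 20MB
*
* so the new journal is created with a 20MB initial and maximum extent and
* overflows much sooner than it would at the configured 200MB maximum.
*/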
}