/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 25, 2008
*/
package com.bigdata.resources;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.log4j.Logger;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.ISimpleSplitHandler;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.ICounter;
import com.bigdata.counters.ICounterSet;
import com.bigdata.counters.IRequiredHostCounters;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.IResourceManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.journal.WriteExecutorService;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.resources.ResourceManager.IResourceManagerCounters;
import com.bigdata.service.AbstractFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.EventType;
import com.bigdata.service.IDataService;
import com.bigdata.service.IServiceShutdown;
import com.bigdata.service.DataService.IDataServiceCounters;
import com.bigdata.util.Bytes;
import com.bigdata.util.DaemonThreadFactory;
/**
* Class encapsulates logic for handling journal overflow events. Overflow is
* triggered automatically when the user data extent on the journal nears a
* configured threshold. Once the preconditions for overflow are satisfied,
* the {@link WriteExecutorService}s for the journal are paused and all running
* tasks on those services are allowed to complete and commit. Once no writers
* are running, the {@link WriteExecutorService} triggers synchronous overflow.
* Synchronous overflow is a low-latency process which creates a new journal to
* absorb future writes, re-defines the views for all index partitions found on
* the old journal to include the new journal as their first source, and
* initiates a background thread performing asynchronous overflow
* post-processing.
*
* Asynchronous overflow post-processing is responsible for identifying index
* partition overflow (resulting in a split into two or more index partitions),
* index partition underflow (resulting in the join of the under-capacity index
* partition with its rightSibling), index partition moves (the index partition
* is moved to a different {@link DataService}), and index partition builds (an
* {@link IndexSegment} is created from the current view in what is effectively
* a compacting merge). Overflow processing is suspended during asynchronous
* post-processing, but is automatically re-enabled once post-processing
* completes.
*
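* <p>
* A schematic sketch of the control flow (illustrative only; the test is
* actually performed during group commit while holding an exclusive lock
* on the {@link WriteExecutorService}):
*
* <pre>
* if (resourceManager.shouldOverflow()) {
*
*     // Synchronous overflow; asynchronous post-processing then runs in
*     // the background on the overflowService.
*     resourceManager.overflow();
*
* }
* </pre>
*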
* @author Bryan Thompson
* @version $Id$
*/
abstract public class OverflowManager extends IndexManager {
/**
* Logger.
*/
protected static final Logger log = Logger.getLogger(OverflowManager.class);
/**
* FIXME This is a temporary flag used to (dis|en)able the logic for
* executing various index partition operations as after actions for a
* compacting merge.
*/
final protected boolean compactingMergeWithAfterAction = true;
/**
* @see Options#COPY_INDEX_THRESHOLD
*/
final protected int copyIndexThreshold;
/**
* @see Options#ACCELERATE_SPLIT_THRESHOLD
*/
final protected int accelerateSplitThreshold;
/**
* @see Options#PERCENT_OF_SPLIT_THRESHOLD
*/
final protected double percentOfSplitThreshold;
/**
* FIXME configuration option.
*/
final protected double percentOfJoinThreshold = 0.4;
/**
* @see Options#TAIL_SPLIT_THRESHOLD
*/
final protected double tailSplitThreshold;
// /**
// * @see Options#HOT_SPLIT_THRESHOLD
// */
// final protected double hotSplitThreshold;
/**
* @see Options#SCATTER_SPLIT_ENABLED
*/
final protected boolean scatterSplitEnabled;
/**
* @see Options#JOINS_ENABLED
*/
final protected boolean joinsEnabled;
/**
* @see Options#MINIMUM_ACTIVE_INDEX_PARTITIONS
*/
protected final int minimumActiveIndexPartitions;
/**
* @see Options#MAXIMUM_MOVES
*
* @deprecated Moves are now decided on a case by case basis. An alternative
* parameter might be introduced in the future to restrict the
* rate at which a DS can shed shards by moving them to other
* nodes.
*/
protected final int maximumMoves;
/**
* @see Options#MAXIMUM_MOVES_PER_TARGET
*
* @deprecated Moves are now decided on a case by case basis. An alternative
* parameter might be introduced in the future to restrict the
* rate at which a DS can shed shards by moving them to other
* nodes.
*
* Note: This is also used to disable moves by some of the unit
* tests so we need a way to replace that functionality before
* this can be taken out.
*/
protected final int maximumMovesPerTarget;
/**
* @see Options#MAXIMUM_MOVE_PERCENT_OF_SPLIT
*/
protected final double maximumMovePercentOfSplit;
/**
* @see Options#MOVE_PERCENT_CPU_TIME_THRESHOLD
*/
protected final double movePercentCpuTimeThreshold;
/**
* The maximum #of optional compacting merge operations that will be
* performed during a single overflow event.
*
* @see Options#MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW
*
* @deprecated merges are now performed in priority order while time remains
* in a given asynchronous overflow cycle.
*/
protected final int maximumOptionalMergesPerOverflow;
/**
* @see Options#MAXIMUM_JOURNALS_PER_VIEW
*
* @deprecated merges are now performed in priority order while time remains
* in a given asynchronous overflow cycle.
*/
protected final int maximumJournalsPerView;
/**
* @see Options#MAXIMUM_SEGMENTS_PER_VIEW
*
* @deprecated merges are now performed in priority order while time remains
* in a given asynchronous overflow cycle.
*/
protected final int maximumSegmentsPerView;
/**
* @see Options#MAXIMUM_BUILD_SEGMENT_BYTES
*/
final protected long maximumBuildSegmentBytes;
/**
* The timeout for {@link #shutdown()} -or- ZERO (0L) to wait forever.
*
* @see IServiceShutdown#SHUTDOWN_TIMEOUT
*/
final private long shutdownTimeout;
/**
* The service that runs the asynchronous overflow
* {@link AsynchronousOverflowTask}.
*/
private final ExecutorService overflowService;
/**
* The #of threads which will execute index partition build operations.
*
* @see Options#BUILD_SERVICE_CORE_POOL_SIZE
*/
protected final int buildServiceCorePoolSize;
/**
* The #of threads which will execute index partition merge operations.
*
* @see Options#MERGE_SERVICE_CORE_POOL_SIZE
*/
protected final int mergeServiceCorePoolSize;
/**
* The name of the service (if available). This is used to help label
* thread pools and the like.
*/
protected final String serviceName;
/**
* @see Options#OVERFLOW_ENABLED
*/
private final boolean overflowEnabled;
/**
* @see Options#OVERFLOW_MAX_COUNT
*
* @deprecated This is no longer used, even for testing.
*/
private final int overflowMaxCount;
/**
* @see Options#OVERFLOW_THRESHOLD
*/
protected final double overflowThreshold;
/**
* A flag used to disable overflow of the live journal until asynchronous
* post-processing of the old journal has been completed.
*
* @see AsynchronousOverflowTask
*/
protected final AtomicBoolean overflowAllowed = new AtomicBoolean(true);
/**
* A flag used to disable the asynchronous overflow processing for some unit
* tests.
*/
protected final AtomicBoolean asyncOverflowEnabled = new AtomicBoolean(true);
/**
* Flag may be set to force overflow processing during the next group
* commit. The flag is cleared by {@link #overflow()}.
*
* @see DataService#forceOverflow(boolean, boolean)
*/
public final AtomicBoolean forceOverflow = new AtomicBoolean(false);
/**
* A flag that may be set to force the next asynchronous overflow to perform
* a compacting merge for all indices that are not simply copied over to the
* new journal. This flag significantly raises the time required for
* asynchronous overflow processing, since all shard views must be made
* compact, and SHOULD NOT be used for deployed federations.
* The state of the flag is cleared each time asynchronous overflow
* processing begins.
*
* @see DataService#forceOverflow(boolean, boolean)
*/
public final AtomicBoolean compactingMerge = new AtomicBoolean(false);
/**
* The "live" overflow counters which are maintained by the service.
*/
protected final OverflowCounters overflowCounters = new OverflowCounters();
/**
* Return a copy of the {@link OverflowCounters}.
*/
public OverflowCounters getOverflowCounters() {
return overflowCounters.clone();
}
/**
* #of synchronous overflows that have taken place. This counter is
* incremented each time the synchronous overflow operation completes.
*
* @see #getOverflowCounters()
*/
public long getSynchronousOverflowCount() {
return overflowCounters.synchronousOverflowCounter.get();
}
/**
* #of asynchronous overflows that have taken place. This counter is
* incremented each time the entire overflow operation is complete,
* including any post-processing of the old journal.
*
* @see #getOverflowCounters()
*/
public long getAsynchronousOverflowCount() {
return overflowCounters.asynchronousOverflowCounter.get();
}
/**
* The timeout for asynchronous overflow processing.
*
* @see Options#OVERFLOW_TIMEOUT
*/
protected final long overflowTimeout;
/**
* @see Options#OVERFLOW_TASKS_CONCURRENT
*
* @deprecated by {@link #mergeServiceCorePoolSize} and
* {@link #buildServiceCorePoolSize}
*/
protected final int overflowTasksConcurrent;
/**
* @see Options#OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL
*/
protected final boolean overflowCancelledWhenJournalFull;
// /**
// * @see Options#PURGE_RESOURCES_TIMEOUT
// */
// private final long purgeResourcesTimeout;
// /**
// * The timeout in milliseconds that we will await an exclusive write lock on
// * the {@link WriteExecutorService} in order to purge unused resources.
// *
// * @see Options#PURGE_RESOURCES_TIMEOUT
// */
// public long getPurgeResourcesTimeout() {
//
// return purgeResourcesTimeout;
//
// }
/**
* Index partitions are split when they approach this size on the disk.
*
* @see Options#NOMINAL_SHARD_SIZE
*
* @todo Encapsulate with split accelerator factor when this is the first
* index partition for some scale-out index.
*/
public final long nominalShardSize;
/**
* If an index partition refuses to split, it will be disabled once its size
* on disk (for a compact view) exceeds this multiple of the nominal shard
* size. The most
* common cause for this is a bad {@link ISimpleSplitHandler} implementation
* provided by the application when it registered the index. By disallowing
* further writes on the shard we prevent it from dragging down performance
* for the entire data service and push the problem back on the application.
* In order to remedy this issue on a pre-existing index you must fix the
* split handler, register the new split handler on the MDS and on each
* shard on the index, and then re-enable writes for the index.
*
* @todo configuration option?
*/
public final double shardOverextensionLimit = 2d;
/**
* <code>true</code> if overflow processing is enabled and
* <code>false</code> if overflow processing was disabled as a
* configuration option or if a maximum overflow count was configured and
* has been satisfied, in which case the live journal will NOT overflow.
*
* @see Options#OVERFLOW_ENABLED
* @see Options#OVERFLOW_MAX_COUNT
*/
public boolean isOverflowEnabled() {
return overflowEnabled
&& (overflowMaxCount == 0 || overflowCounters.synchronousOverflowCounter
.get() < overflowMaxCount);
}
/**
* <code>true</code> unless an overflow event is currently being
* processed.
*/
public boolean isOverflowAllowed() {
return overflowAllowed.get();
}
/**
* Options understood by the {@link OverflowManager}.
*
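* For example, options may be supplied via the {@link Properties} object
* given to the {@link OverflowManager} constructor (an illustrative
* sketch; the values shown are the documented defaults):
*
* <pre>
* final Properties properties = new Properties();
* properties.setProperty(Options.OVERFLOW_ENABLED, "true");
* properties.setProperty(Options.OVERFLOW_THRESHOLD, ".9");
* properties.setProperty(Options.COPY_INDEX_THRESHOLD, "1000");
* properties.setProperty(Options.NOMINAL_SHARD_SIZE, "" + (200 * Bytes.megabyte));
* </pre>
*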
* @author Bryan Thompson
* @version $Id$
*/
public static interface Options extends IndexManager.Options, IServiceShutdown.Options {
/**
* Boolean property determines whether or not
* {@link IResourceManager#overflow()} processing is enabled (default
* {@value #DEFAULT_OVERFLOW_ENABLED}). When disabled the journal will
* grow without bounds, {@link IndexSegment}s will never be generated
* and index partitions will not be split, joined nor moved away from
* this {@link ResourceManager}.
*/
String OVERFLOW_ENABLED = OverflowManager.class.getName()+".overflowEnabled";
String DEFAULT_OVERFLOW_ENABLED = "true";
/**
* Option may be used to permit a fixed number of synchronous overflow
* operations after which overflow is disabled (default
* {@value #DEFAULT_OVERFLOW_MAX_COUNT}). When ZERO (0) there is no
* limit on the #of synchronous overflow operations. This option is
* mainly used for testing, but it can be enabled if you want higher
* throughput (for a while) and you know that the data will be well
* distributed on the federation after N overflows. Once synchronous
* overflow is disabled, all future writes will be buffered by the live
* journal and index partition builds, merges, splits, joins, and moves
* will no longer be executed. Eventually the live journal extent will
* grow large enough that throughput will drop (due to IOWAIT on random
* seeks against the journal) and it is possible that the maximum
* possible journal extent can be exceeded unless you also configure
* {@link com.bigdata.journal.Options#OFFSET_BITS} for scale-up.
*
* @deprecated This is no longer used, even for testing.
*/
String OVERFLOW_MAX_COUNT = OverflowManager.class.getName()
+ ".overflowMaxCount";
String DEFAULT_OVERFLOW_MAX_COUNT = "0";
/**
* Floating point property specifying the percentage of the maximum
* extent at which synchronous overflow processing will be triggered
* (default {@link #DEFAULT_OVERFLOW_THRESHOLD}). The value is
* multiplied into the configured
* {@link com.bigdata.journal.Options#MAXIMUM_EXTENT}. If the current
* extent of the live journal is GTE the result, then synchronous overflow
* processing will be triggered. However, note that synchronous overflow
* processing can not be triggered until asynchronous overflow
* processing for the last journal is complete. Therefore if
* asynchronous overflow processing takes a long time, the overflow
* threshold might not be checked until after it has already been
* exceeded.
*
* The main purpose of this property is to trigger overflow processing
* before the maximum extent is exceeded. The trigger needs to lead the
* maximum extent somewhat since overflow processing can not proceed
* until there is an exclusive lock on the write service, and tasks
* already running will continue to write on the live journal.
* Overflowing the maximum extent is not a problem as long as the
* {@link BufferMode} supports transparent extension of the journal.
* However, some {@link BufferMode}s do not and therefore they can not
* be used reliably with the overflow manager.
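*
* The trigger reduces to the following comparison (a sketch mirroring the
* test in {@link OverflowManager#shouldOverflow()}, where the next offset
* is the #of bytes written on the live journal):
*
* <pre>
* shouldOverflow = nextOffset &gt; overflowThreshold * journal.getMaximumExtent();
* </pre>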
*/
String OVERFLOW_THRESHOLD = OverflowManager.class.getName()
+ ".overflowThreshold";
String DEFAULT_OVERFLOW_THRESHOLD = ".9";
/**
* Index partitions having no more than this many entries as reported by
* a range count will be copied to the new journal during synchronous
* overflow processing rather than building a new index segment from the
* buffered writes (default {@value #DEFAULT_COPY_INDEX_THRESHOLD}).
* When ZERO (0), index partitions will not be copied during overflow
* processing (unless they are empty). While it is important to keep
* down the latency of synchronous overflow processing, small indices
* can be copied so quickly that it is worth it to avoid the heavier
* index segment build operation.
*
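* The decision reduces approximately to the following predicate (an
* illustrative sketch; note that synchronous overflow will also refuse to
* copy a view having too many sources, see
* {@link #MAXIMUM_JOURNALS_PER_VIEW}):
*
* <pre>
* copy = entryCount == 0
*         || (copyIndexThreshold &gt; 0 &amp;&amp; entryCount &lt;= copyIndexThreshold);
* </pre>
*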
* @see #DEFAULT_COPY_INDEX_THRESHOLD
*/
String COPY_INDEX_THRESHOLD = OverflowManager.class.getName()
+ ".copyIndexThreshold";
String DEFAULT_COPY_INDEX_THRESHOLD = "1000";
/**
* The #of index partitions below which we will accelerate the decision
* to split an index partition (default
* {@value #DEFAULT_ACCELERATE_SPLIT_THRESHOLD}). When a new scale-out
* index is created there is by default only a single index partition on
* a single {@link IDataService}. Since each index (partition) is
* single threaded for writes, we can increase the potential concurrency
* if we split the initial index partition. We accelerate decisions to
* split index partitions by reducing the minimum and target #of tuples
* per index partition for an index with fewer than the #of index
* partitions specified by this parameter. When ZERO (0) this feature is
* disabled and we do not count the #of index partitions.
*/
String ACCELERATE_SPLIT_THRESHOLD = OverflowManager.class.getName()
+ ".accelerateSplitThreshold";
String DEFAULT_ACCELERATE_SPLIT_THRESHOLD = "20";
/**
* The minimum percentage (where <code>1.0</code> corresponds to 100
* percent) that an index partition must constitute of a nominal index
* partition before a head or tail split will be considered (default
* {@value #DEFAULT_PERCENT_OF_SPLIT_THRESHOLD}). Values near to and
* greater than <code>1.0</code> are permissible and imply that the
* post-split leftSibling index partition will be approximately a
* nominal index partition. However, the maximum percentage may not be
* greater than <code>2.0</code> (200 percent).
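*
* For example (an illustrative calculation), with the default
* {@link #NOMINAL_SHARD_SIZE} of 200MB and the default threshold of .9, a
* head or tail split will not be considered until the index partition
* reaches approximately .9 * 200MB = 180MB on the disk.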
*/
String PERCENT_OF_SPLIT_THRESHOLD = OverflowManager.class.getName()
+ ".percentOfSplitThreshold";
String DEFAULT_PERCENT_OF_SPLIT_THRESHOLD = ".9";
/**
* The minimum percentage (in [0:1]) of leaf splits which must be in the
* tail of the index partition before a tail split of an index partition
* will be considered (default {@value #DEFAULT_TAIL_SPLIT_THRESHOLD}).
*/
String TAIL_SPLIT_THRESHOLD = OverflowManager.class.getName()
+ ".tailSplitThreshold";
String DEFAULT_TAIL_SPLIT_THRESHOLD = ".4";
/**
* The minimum percentage (in [0:2]) of a nominal split before an index
* partition will be "hot split" (default
* {@value #DEFAULT_HOT_SPLIT_THRESHOLD}). Hot splits are taken by hosts
* which are more heavily utilized than their peers but not heavily
* utilized in terms of their own resources. This is basically an
* acceleration factor for index partition splits when a host has a
* relatively higher workload than its peers. The purpose of a "hot
* split" is to increase the potential concurrency by breaking an active
* index partition into two index partitions. If the writes on the index
* partition are evenly distributed, then this can double the
* concurrency if the host has spare cycles. Reasonable values are on
* the order of [.25:.75]. Hot splits may be effectively disabled by
* raising the percent of split to GTE
* {@value #PERCENT_OF_SPLIT_THRESHOLD}.
*
* @deprecated Hot splits are not implemented and this option does not
* do anything. It will be going away soon.
*/
String HOT_SPLIT_THRESHOLD = OverflowManager.class.getName()
+ ".hotSplitThreshold";
String DEFAULT_HOT_SPLIT_THRESHOLD = "2.0"; // was .4
/**
* Boolean option indicates whether or not scatter splits are allowed
* (default {@value #DEFAULT_SCATTER_SPLIT_ENABLED}) on this service.
*
* @see IndexMetadata.Options#SCATTER_SPLIT_ENABLED
*/
String SCATTER_SPLIT_ENABLED = OverflowManager.class.getName()
+ ".scatterSplitEnabled";
String DEFAULT_SCATTER_SPLIT_ENABLED = "true";
/**
* Option may be used to disable index partition joins.
*
* FIXME Joins are being triggered by the scatter split and/or
* {@link #ACCELERATE_SPLIT_THRESHOLD} behaviors since the target for
* the split size increases as a function of the #of index partitions.
* For example, a scatter split can cause the adjusted nominal size of a
* shard to jump to its configured setting, which will cause the shards
* to be "undercapacity" and hence drive JOINs. In order to fix this we
* have to somehow discount joins, either by requiring deletes on the
* index partition or by waiting some #of overflows since the split,
* etc. Alternatively, joins could be ignored unless there are more
* partitions of a given index than were (or would be) produced by a
* scatter split. For the moment joins are disabled by default.
*/
String JOINS_ENABLED = OverflowManager.class.getName()
+ ".joinsEnabled";
String DEFAULT_JOINS_ENABLED = "false";
/**
* The minimum #of active index partitions on a data service before the
* resource manager will consider moving an index partition to another
* service (default {@value #DEFAULT_MINIMUM_ACTIVE_INDEX_PARTITIONS}).
*
* Note: This makes sure that we don't do a move if there are only a few
* active index partitions on this service. This value is also used to
* place an upper bound on the #of index partitions that can be moved
* away from this service - if we move too many (or too many at once)
* then this service stands a good chance of becoming under-utilized and
* index partitions will just bounce around which is very inefficient.
*
* Note: Even when only a single index partition for a new scale-out
* index is initially allocated on this service, if it is active and
* growing it will eventually split into enough index partitions that we
* will begin to re-distribute those index partitions across the
* federation.
*
* Note: Index partitions are considered to be "active" iff
* {@link ITx#UNISOLATED} or {@link ITx#READ_COMMITTED} operations are
* run against the index partition during the life cycle of the live
* journal. There may be many other index partitions on the same service
* that either are never read or are subject only to historical reads.
* However, since only the current state of the index partition is
* moved, not its history, moving index partitions which are only the
* target for historical reads will not reduce the load on the service.
* Instead, read burdens are reduced using replication.
*
* @see #DEFAULT_MINIMUM_ACTIVE_INDEX_PARTITIONS
*/
String MINIMUM_ACTIVE_INDEX_PARTITIONS = OverflowManager.class
.getName()
+ ".minimumActiveIndexPartitions";
String DEFAULT_MINIMUM_ACTIVE_INDEX_PARTITIONS = "1";
/**
* This is the maximum #of index partitions that the resource manager is
* willing to move in a given overflow operation across all of the
* identified under-utilized services (default
* {@value #DEFAULT_MAXIMUM_MOVES}).
*
* Note: Index partition moves MAY be disabled by setting this property
* to ZERO (0).
*
* @see #DEFAULT_MAXIMUM_MOVES
*
* @deprecated Moves are now decided on a case by case basis. An
* alternative parameter might be introduced in the future
* to restrict the rate at which a DS can shed shards by
* moving them to other nodes.
*/
String MAXIMUM_MOVES = OverflowManager.class.getName()
+ ".maximumMoves";
String DEFAULT_MAXIMUM_MOVES = "3";
/**
* This is the maximum #of index partitions that the resource manager is
* willing to move in a given overflow operation onto each identified
* under-utilized service (default
* {@value #DEFAULT_MAXIMUM_MOVES_PER_TARGET}).
*
* Note: Index partitions are moved to the identified under-utilized
* services using a round-robin approach which aids in distributing the
* load across the federation.
*
* Note: Index partition moves MAY be disabled by setting this property
* to ZERO (0).
*
* @see #DEFAULT_MAXIMUM_MOVES_PER_TARGET
*
* @deprecated Moves are now decided on a case by case basis. An
* alternative parameter might be introduced in the future
* to restrict the rate at which a DS can shed shards by
* moving them to other nodes.
*
* Note: This is also used to disable moves by some of the
* unit tests so we need a way to replace that functionality
* before this can be taken out.
*/
String MAXIMUM_MOVES_PER_TARGET = OverflowManager.class.getName()
+ ".maximumMovesPerTarget";
String DEFAULT_MAXIMUM_MOVES_PER_TARGET = "2";
/**
* This is the maximum percentage (in [0:2]) of a full index partition
* which will be considered for a move (default
* {@value #DEFAULT_MAXIMUM_MOVE_PERCENT_OF_SPLIT}).
*
* @see #DEFAULT_MAXIMUM_MOVE_PERCENT_OF_SPLIT
*/
String MAXIMUM_MOVE_PERCENT_OF_SPLIT = OverflowManager.class.getName()
+ ".maximumMovePercentOfSplit";
String DEFAULT_MAXIMUM_MOVE_PERCENT_OF_SPLIT = ".8";
/**
* The threshold for a service to consider itself sufficiently loaded
* that it will consider moving an index partition (default
* {@value #DEFAULT_MOVE_PERCENT_CPU_TIME_THRESHOLD}). This threshold
* IS NOT considered for scatter splits, since the goal there is to
* distribute the data evenly across the federation.
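*
* As an illustrative sketch, the service considers itself sufficiently
* loaded to shed a shard when:
*
* <pre>
* considerMove = percentCpuTime &gt;= movePercentCpuTimeThreshold;
* </pre>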
*/
String MOVE_PERCENT_CPU_TIME_THRESHOLD = OverflowManager.class
.getName()
+ ".movePercentCpuTimeThreshold";
String DEFAULT_MOVE_PERCENT_CPU_TIME_THRESHOLD = ".7";
/**
* The maximum #of optional compacting merge operations that will be
* performed during a single overflow event (default
* {@value #DEFAULT_OPTIONAL_COMPACTING_MERGES_PER_OVERFLOW}).
*
* Once this #of optional compacting merge tasks have been identified
* for a given overflow event, the remainder of the index partitions
* that are neither split, joined, moved, nor copied will use
* incremental builds. An incremental build is generally cheaper since
* it only copies the data on the mutable {@link BTree} for the
* lastCommitTime rather than the fused view. A compacting merge permits
* the older index segments to be released and results in a simpler view
* with fewer {@link IndexSegment}s. Either a compacting merge or an
* incremental build will permit old journals to be released once the
* commit points on those journals are no longer required.
*
* Note: Mandatory compacting merges are identified based on
* {@link #MAXIMUM_JOURNALS_PER_VIEW} and
* {@link #MAXIMUM_SEGMENTS_PER_VIEW}. There is NO limit on the #of
* mandatory compacting merges that will be performed during an
* asynchronous overflow event. However, each mandatory compacting merge
* does count towards the maximum #of optional merges. Therefore if the
* #of mandatory compacting merges is greater than this parameter then
* NO optional compacting merges will be selected in a given overflow
* cycle.
*
* @deprecated merges are now performed in priority order while time
* remains in a given asynchronous overflow cycle.
*/
String MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW = OverflowManager.class
.getName()
+ ".maximumOptionalMergesPerOverflow";
String DEFAULT_OPTIONAL_COMPACTING_MERGES_PER_OVERFLOW = "2";
// /**
// * The maximum #of sources for an index partition view before a
// * compacting merge of the index partition will be triggered in
// * preference to an incremental build (default
// * {@value #DEFAULT_MAXIMUM_SOURCES_PER_VIEW}). The minimum value is
// * ONE (1) since the source view must always include the mutable
// * {@link BTree}. When ONE (1), a compacting merge is always indicated.
// *
// * Note: An index partition view is comprised of a mutable {@link BTree}
// * on the live journal, zero or more mutable {@link BTree}s from
// * historical journals, and zero or more {@link IndexSegment}s. An
// * incremental build replaces the {@link BTree} from the old journal (as
// * of the lastCommitTime for that journal) with an {@link IndexSegment}
// * having the same data. A compacting merge replaces the view
// * as of the lastCommitTime of the old journal and results in a mutable
// * {@link BTree} on the live journal and a single {@link IndexSegment}.
// * Split and move operations have the same effect as a compacting merge
// * since their output will contain at most one {@link IndexSegment}.
// *
// * @deprecated should be redundant with
// * {@link #MAXIMUM_JOURNALS_PER_VIEW} and
// * {@link #MAXIMUM_SEGMENTS_PER_VIEW}.
// */
// String MAXIMUM_SOURCES_PER_VIEW = OverflowManager.class.getName()
// + ".maximumSourcesPerView";
//
// String DEFAULT_MAXIMUM_SOURCES_PER_VIEW = "5";
/**
* A compacting merge will be triggered when the #of journals in an
* index partition view is GTE to this value (default
* {@value #DEFAULT_MAXIMUM_JOURNALS_PER_VIEW}). The minimum value is
* TWO (2) since there will be two journals in a view when an index
* partition overflows and {@link OverflowActionEnum#Copy} is not
* selected. As long as index partition splits, builds or merges are
* performed the #of journals in the view WILL NOT exceed 2 and will
* always be ONE (1) after an asynchronous overflow in which a split,
* build or merge was performed.
*
* It is extremely important to perform compacting merges in order to
* release dependencies on old resources (both journals and index
* segments) and keep down the #of sources in a view. This is especially
* true when those sources are journals. Journals are organized by write
* access, not read access. Once the backing buffer for a journal is
* released there will be large spikes in IOWAIT when reading on an old
* journal as reads are more or less random.
*
* Note: The {@link #MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW} will be
* ignored if a compacting merge is recommended for an index partition
* based on this parameter.
*
* Note: Synchronous overflow will refuse to copy tuples for an index
* partition whose mutable {@link BTree} otherwise satisfies the
* {@link #COPY_INDEX_THRESHOLD} if the #of sources in the view exceeds
* thresholds which demand a compacting merge.
*
* @deprecated merges are now performed in priority order while time
* remains in a given asynchronous overflow cycle.
*/
String MAXIMUM_JOURNALS_PER_VIEW = OverflowManager.class.getName()
+ ".maximumJournalsPerView";
String DEFAULT_MAXIMUM_JOURNALS_PER_VIEW = "3";
/**
* A compacting merge will be triggered when the #of index segments in
* an index partition view is GTE to this value (default
* {@value #DEFAULT_MAXIMUM_SEGMENTS_PER_VIEW}).
*
* It is extremely important to perform compacting merges in order to
* release dependencies on old resources (both journals and index
* segments) and keep down the #of sources in a view. However, this is
* less important when those resources are {@link IndexSegment}s since
* they are very efficient for read operations. In this case the main
* driver is to reduce the complexity of the view, to require fewer open
* index segments (and associated resources) in order to materialize the
* view, and to make it possible to release index segments and thus have
* less of a footprint on the disk.
*
* Note: The {@link #MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW} will be
* ignored if a compacting merge is recommended for an index partition
* based on this parameter.
*
* Note: Synchronous overflow will refuse to copy tuples for an index
* partition whose mutable {@link BTree} otherwise satisfies the
* {@link #COPY_INDEX_THRESHOLD} if the #of sources in the view exceeds
* thresholds which demand a compacting merge.
*
* @deprecated merges are now performed in priority order while time
* remains in a given asynchronous overflow cycle.
*/
String MAXIMUM_SEGMENTS_PER_VIEW = OverflowManager.class.getName()
+ ".maximumSegmentsPerView";
String DEFAULT_MAXIMUM_SEGMENTS_PER_VIEW = "6";
/**
* Option limits the #of {@link IndexSegmentStore} bytes that an
* {@link OverflowActionEnum#Build} operation will process (default
* {@value #DEFAULT_MAXIMUM_BUILD_SEGMENTS_BYTES}). Given that the
* nominal size of an index partition is 200M, a reasonable value for
* this might be 1/10th to 1/5th of that, so 20-40M. The key is to keep
* the builds fast so they should not do too much work while reducing
* the frequency with which we must do a compacting merge. This option
* only affects the #of {@link IndexSegment}s that will be incorporated
* into an {@link OverflowActionEnum#Build} operation. When ZERO (0L),
* {@link OverflowActionEnum#Build} operations will only include the
* data from the historical journal.
*
* @todo Configure as a percentage of the nominal shard size (ignoring
* any acceleration factor).
*/
String MAXIMUM_BUILD_SEGMENT_BYTES = OverflowManager.class.getName()
+ ".maximumBuildSegmentsBytes";
String DEFAULT_MAXIMUM_BUILD_SEGMENTS_BYTES = ""
+ (Bytes.megabyte * 20);
/**
* The timeout in milliseconds for asynchronous overflow processing to
* complete (default {@link #DEFAULT_OVERFLOW_TIMEOUT}). Any overflow
* task that does not complete within this timeout will be canceled.
*
* Asynchronous overflow processing is responsible for splitting,
* moving, and joining index partitions. The asynchronous overflow tasks
* are written to fail "safe". Also, each task may succeed or fail on
* its own. Iff the task succeeds, then its effect is made restart safe.
* Otherwise clients continue to use the old view of the index
* partition.
*
* If asynchronous overflow processing DOES NOT complete each time then
* we run several very serious and non-sustainable risks, including: (a)
* the #of sources in a view can increase without limit; and (b) the #of
* journals that must be retained can increase without limit.
*
* @deprecated Asynchronous overflow processing should run to completion
* with a minimum goal of an incremental build for each
* index partition having data on the previous journal.
*/
String OVERFLOW_TIMEOUT = OverflowManager.class.getName() + ".timeout";
/**
* The default timeout in milliseconds for asynchronous overflow
* processing (equivalent to 10 minutes).
*/
String DEFAULT_OVERFLOW_TIMEOUT = "" + (10 * 1000 * 60L); // 10 minutes.
/**
* The #of threads used to execute the asynchronous overflow tasks in
* parallel, ZERO (0) to execute ALL asynchronous overflow tasks in
* parallel, or ONE (1) to execute the asynchronous overflow tasks
* sequentially (default {@value #DEFAULT_OVERFLOW_TASKS_CONCURRENT}).
*
* @deprecated by {@link #MERGE_SERVICE_CORE_POOL_SIZE} and
* {@link #BUILD_SERVICE_CORE_POOL_SIZE}.
*/
String OVERFLOW_TASKS_CONCURRENT = OverflowManager.class.getName()
+ ".overflowTasksConcurrent";
String DEFAULT_OVERFLOW_TASKS_CONCURRENT = "0";
/**
* Cancel an existing asynchronous overflow process (interrupting any
* running tasks) if the live journal is again approaching its maximum
* extent (default
* {@value #DEFAULT_OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL}).
*
* @deprecated Asynchronous overflow processing should run to completion
* with a minimum goal of an incremental build for each
* index partition having data on the previous journal.
*/
String OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL = OverflowManager.class
.getName()
+ ".overflowCancelledWhenJournalFull";
String DEFAULT_OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL = "true";
// /**
// * The timeout in milliseconds that we will await an exclusive lock on
// * the {@link WriteExecutorService} in order to release unused resources
// * (journals and segment files).
// */
// String PURGE_RESOURCES_TIMEOUT = OverflowManager.class.getName() + "purgeResourcesTimeout";
//
// String DEFAULT_PURGE_RESOURCES_TIMEOUT = "" + (1000 * 60L);
/**
* The #of threads in the pool handling index segment builds from the
* old journal.
*/
String BUILD_SERVICE_CORE_POOL_SIZE = OverflowManager.class.getName()
+ ".buildService.corePoolSize";
// @todo or (ncores/2)-1?
String DEFAULT_BUILD_SERVICE_CORE_POOL_SIZE = "3";
/**
* The #of threads in the pool handling index partition merges.
*/
String MERGE_SERVICE_CORE_POOL_SIZE = OverflowManager.class.getName()
+ ".mergeService.corePoolSize";
String DEFAULT_MERGE_SERVICE_CORE_POOL_SIZE = "1";
/**
* The nominal size on the disk of a full index partition (~200MB).
* Index partitions are split once they reach or exceed this size. The
* space on the journal is not considered when making this decision
* since it can not readily be attributed to any given index partition.
*
* Note: If you modify this, you may also need to modify the size of the
* buffers in the {@link DirectBufferPool} used to fully buffer the
* nodes region of the index segment file.
*/
String NOMINAL_SHARD_SIZE = OverflowManager.class.getName()
+ ".nominalShardSize";
String DEFAULT_NOMINAL_SHARD_SIZE = "" + (200 * Bytes.megabyte);
}
/**
* Performance counters for the {@link OverflowManager}.
*
* @author Bryan Thompson
* @version $Id$
*/
public static interface IOverflowManagerCounters {
/**
* <code>true</code> iff overflow processing is enabled as a
* configuration option.
*/
String OverflowEnabled = "Overflow Enabled";
/**
* <code>true</code> iff overflow processing is currently permitted.
*/
String OverflowAllowed = "Overflow Allowed";
/**
* <code>true</code> iff synchronous overflow should be initiated
* based on an examination of the state of the live journal and whether
* or not overflow processing is enabled and currently allowed.
*/
String ShouldOverflow = "Should Overflow";
/**
* The #of synchronous overflow events that have taken place. This
* counter is incremented each time the synchronous overflow operation
* is complete.
*/
String SynchronousOverflowCount = "Synchronous Overflow Count";
/**
* The elapsed time for synchronous overflow processing to date.
*/
String SynchronousOverflowMillis = "Synchronous Overflow Millis";
/**
* The elapsed time for asynchronous overflow processing to date.
*/
String AsynchronousOverflowMillis = "Asynchronous Overflow Millis";
/**
* The #of asynchronous overflow events that have taken place. This
* counter is incremented each time the entire overflow operation is
* complete, including any post-processing of the old journal.
*/
String AsynchronousOverflowCount = "Asynchronous Overflow Count";
/**
* The #of asynchronous overflow operations which have failed.
*/
String AsynchronousOverflowFailedCount = "Asynchronous Overflow Failed Count";
/**
* The #of asynchronous overflow tasks (split, join, merge, etc) which
* have failed.
*/
String AsynchronousOverflowTaskFailedCount = "Asynchronous Overflow Task Failed Count";
/**
* The #of asynchronous overflow tasks (split, join, merge, etc) that
* were canceled due to timeout.
*/
String AsynchronousOverflowTaskCancelledCount = "Asynchronous Overflow Task Cancelled Count";
}
/**
* Performance counters for the index partition tasks.
*
* @author Bryan Thompson
* @version $Id$
*/
public static interface IIndexPartitionTaskCounters {
/**
* The #of index partition build operations which have completed
* successfully.
*/
String BuildCount = "Build Count";
/**
* The #of index partition merge (compacting merge) operations which
* have completed successfully.
*/
String MergeCount = "Merge Count";
/**
* The #of index partition split operations which have completed
* successfully.
*/
String SplitCount = "Split Count";
/**
* The #of index partition tail split operations which have completed
* successfully.
*/
String TailSplitCount = "Tail Split Count";
/**
* The #of index partition join operations which have completed
* successfully.
*/
String JoinCount = "Join Count";
/**
* The #of index partition move operations which have completed
* successfully.
*/
String MoveCount = "Move Count";
/**
* The #of index partitions received by this data service in response to
* an index partition move from another data service.
*/
String ReceiveCount = "Receive Count";
/**
* The #of index partition build tasks that are executing concurrently
* on this data service.
*/
String ConcurrentBuildCount = "Concurrent Build Count";
/**
* The #of index partition merge tasks that are executing concurrently
* on this data service.
*/
String ConcurrentMergeCount = "Concurrent Merge Count";
/**
* The running index partition builds for this service. The vast
* majority of the effort for any of the index partition tasks (split,
* move, join, etc.) lies in the index segment build operations.
* Therefore the tasks reported here account for the majority of the
* effort of asynchronous overflow operations.
*/
String RunningBuilds = "Active Builds";
}
/**
* @param properties
*/
public OverflowManager(final Properties properties) {
super(properties);
// overflowEnabled
{
overflowEnabled = Boolean
.parseBoolean(properties.getProperty(
Options.OVERFLOW_ENABLED,
Options.DEFAULT_OVERFLOW_ENABLED));
if (log.isInfoEnabled())
log.info(Options.OVERFLOW_ENABLED + "=" + overflowEnabled);
}
// overflowMaxCount
{
overflowMaxCount = Integer.parseInt(properties.getProperty(
Options.OVERFLOW_MAX_COUNT,
Options.DEFAULT_OVERFLOW_MAX_COUNT));
if (log.isInfoEnabled())
log.info(Options.OVERFLOW_MAX_COUNT + "=" + overflowMaxCount);
}
// overflowThreshold
{
overflowThreshold = Double
.parseDouble(properties.getProperty(
Options.OVERFLOW_THRESHOLD,
Options.DEFAULT_OVERFLOW_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.OVERFLOW_THRESHOLD + "=" + overflowThreshold);
}
// overflowTimeout
{
overflowTimeout = Long
.parseLong(properties.getProperty(
Options.OVERFLOW_TIMEOUT,
Options.DEFAULT_OVERFLOW_TIMEOUT));
if(log.isInfoEnabled())
log.info(Options.OVERFLOW_TIMEOUT + "=" + overflowTimeout);
}
// overflowTasksConcurrent
{
overflowTasksConcurrent = Integer.parseInt(properties
.getProperty(Options.OVERFLOW_TASKS_CONCURRENT,
Options.DEFAULT_OVERFLOW_TASKS_CONCURRENT));
if (log.isInfoEnabled())
log.info(Options.OVERFLOW_TASKS_CONCURRENT + "="
+ overflowTasksConcurrent);
if (overflowTasksConcurrent < 0) {
throw new IllegalArgumentException(
Options.OVERFLOW_TASKS_CONCURRENT
+ " : must be non-negative.");
}
}
// overflowCancelledWhenJournalFull
{
overflowCancelledWhenJournalFull = Boolean
.parseBoolean(properties
.getProperty(
Options.OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL,
Options.DEFAULT_OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL));
if (log.isInfoEnabled())
log.info(Options.OVERFLOW_CANCELLED_WHEN_JOURNAL_FULL + "="
+ overflowCancelledWhenJournalFull);
}
// // purgeResourcesTimeout
// {
//
// purgeResourcesTimeout = Long
// .parseLong(properties.getProperty(
// Options.PURGE_RESOURCES_TIMEOUT,
// Options.DEFAULT_PURGE_RESOURCES_TIMEOUT));
//
// if(log.isInfoEnabled())
// log.info(Options.PURGE_RESOURCES_TIMEOUT + "=" + purgeResourcesTimeout);
//
// }
// copyIndexThreshold
{
copyIndexThreshold = Integer.parseInt(properties
.getProperty(Options.COPY_INDEX_THRESHOLD,
Options.DEFAULT_COPY_INDEX_THRESHOLD));
if(log.isInfoEnabled())
log.info(Options.COPY_INDEX_THRESHOLD + "="
+ copyIndexThreshold);
if (copyIndexThreshold < 0) {
throw new RuntimeException(
Options.COPY_INDEX_THRESHOLD
+ " must be non-negative");
}
}
// accelerateSplitThreshold
{
accelerateSplitThreshold = Integer.parseInt(properties.getProperty(
Options.ACCELERATE_SPLIT_THRESHOLD,
Options.DEFAULT_ACCELERATE_SPLIT_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.ACCELERATE_SPLIT_THRESHOLD + "="
+ accelerateSplitThreshold);
if (accelerateSplitThreshold < 0) {
throw new RuntimeException(Options.ACCELERATE_SPLIT_THRESHOLD
+ " must be non-negative");
}
}
// percentOfSplitThreshold
{
percentOfSplitThreshold = Double.parseDouble(properties.getProperty(
Options.PERCENT_OF_SPLIT_THRESHOLD,
Options.DEFAULT_PERCENT_OF_SPLIT_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.PERCENT_OF_SPLIT_THRESHOLD + "="
+ percentOfSplitThreshold);
if (percentOfSplitThreshold < 0 || percentOfSplitThreshold > 2) {
throw new RuntimeException(Options.PERCENT_OF_SPLIT_THRESHOLD
+ " must be in [0:2]");
}
}
// tailSplitThreshold
{
tailSplitThreshold = Double.parseDouble(properties.getProperty(
Options.TAIL_SPLIT_THRESHOLD,
Options.DEFAULT_TAIL_SPLIT_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.TAIL_SPLIT_THRESHOLD + "="
+ tailSplitThreshold);
if (tailSplitThreshold < 0 || tailSplitThreshold > 1) {
throw new RuntimeException(Options.TAIL_SPLIT_THRESHOLD
+ " must be in [0:1]");
}
}
// // hotSplitThreshold
// {
//
// hotSplitThreshold = Double.parseDouble(properties.getProperty(
// Options.HOT_SPLIT_THRESHOLD,
// Options.DEFAULT_HOT_SPLIT_THRESHOLD));
//
// if (log.isInfoEnabled())
// log.info(Options.HOT_SPLIT_THRESHOLD + "="
// + hotSplitThreshold);
//
// if (hotSplitThreshold < 0 || hotSplitThreshold > 2) {
//
// throw new RuntimeException(Options.HOT_SPLIT_THRESHOLD
// + " must be in [0:2]");
//
// }
//
// }
// scatterSplitEnabled
{
scatterSplitEnabled = Boolean.parseBoolean(properties.getProperty(
Options.SCATTER_SPLIT_ENABLED,
Options.DEFAULT_SCATTER_SPLIT_ENABLED));
if (log.isInfoEnabled())
log.info(Options.SCATTER_SPLIT_ENABLED + "="
+ scatterSplitEnabled);
}
// // scatterSplitPercentOfSplitThreshold
// {
//
// scatterSplitPercentOfSplitThreshold = Double
// .parseDouble(properties
// .getProperty(
// Options.SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD,
// Options.DEFAULT_SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD));
//
// if (log.isInfoEnabled())
// log.info(Options.SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD + "="
// + scatterSplitPercentOfSplitThreshold);
//
// if (scatterSplitPercentOfSplitThreshold < 0.1
// || scatterSplitPercentOfSplitThreshold > 1.0) {
//
// throw new RuntimeException(
// Options.SCATTER_SPLIT_PERCENT_OF_SPLIT_THRESHOLD
// + " must be in [0.1:1.0]");
//
// }
//
// }
//
// // scatterSplitDataServicesCount
// {
//
// scatterSplitDataServicesCount = Integer.parseInt(properties
// .getProperty(Options.SCATTER_SPLIT_DATA_SERVICES_COUNT,
// Options.DEFAULT_SCATTER_SPLIT_DATA_SERVICES_COUNT));
//
// if (log.isInfoEnabled())
// log.info(Options.SCATTER_SPLIT_DATA_SERVICES_COUNT + "="
// + scatterSplitDataServicesCount);
//
// if (scatterSplitDataServicesCount < 0) {
//
// throw new RuntimeException(
// Options.SCATTER_SPLIT_DATA_SERVICES_COUNT
// + " must be non-negative");
//
// }
//
// }
//
// // scatterSplitIndexPartitionsCount
// {
//
// scatterSplitIndexPartitionsCount = Integer
// .parseInt(properties
// .getProperty(
// Options.SCATTER_SPLIT_INDEX_PARTITIONS_COUNT,
// Options.DEFAULT_SCATTER_SPLIT_INDEX_PARTITIONS_COUNT));
//
// if (log.isInfoEnabled())
// log.info(Options.SCATTER_SPLIT_INDEX_PARTITIONS_COUNT + "="
// + scatterSplitIndexPartitionsCount);
//
// if (scatterSplitIndexPartitionsCount < 0) {
//
// throw new RuntimeException(
// Options.SCATTER_SPLIT_INDEX_PARTITIONS_COUNT
// + " must be non-negative");
//
// }
//
// }
// joinsEnabled
{
joinsEnabled = Boolean.parseBoolean(properties.getProperty(
Options.JOINS_ENABLED, Options.DEFAULT_JOINS_ENABLED));
if (log.isInfoEnabled())
log.info(Options.JOINS_ENABLED + "=" + joinsEnabled);
}
// minimumActiveIndexPartitions
{
minimumActiveIndexPartitions = Integer.parseInt(properties
.getProperty(Options.MINIMUM_ACTIVE_INDEX_PARTITIONS,
Options.DEFAULT_MINIMUM_ACTIVE_INDEX_PARTITIONS));
if(log.isInfoEnabled())
log.info(Options.MINIMUM_ACTIVE_INDEX_PARTITIONS + "="
+ minimumActiveIndexPartitions);
if (minimumActiveIndexPartitions <= 0) {
throw new RuntimeException(
Options.MINIMUM_ACTIVE_INDEX_PARTITIONS
+ " must be positive");
}
}
// maximum moves
{
maximumMoves = Integer.parseInt(properties.getProperty(
Options.MAXIMUM_MOVES, Options.DEFAULT_MAXIMUM_MOVES));
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_MOVES + "=" + maximumMoves);
if (maximumMoves < 0) {
throw new RuntimeException(Options.MAXIMUM_MOVES
+ " must be non-negative");
}
}
// maximum moves per target
{
maximumMovesPerTarget = Integer.parseInt(properties.getProperty(
Options.MAXIMUM_MOVES_PER_TARGET,
Options.DEFAULT_MAXIMUM_MOVES_PER_TARGET));
if(log.isInfoEnabled())
log.info(Options.MAXIMUM_MOVES_PER_TARGET + "="
+ maximumMovesPerTarget);
if (maximumMovesPerTarget < 0) {
throw new RuntimeException(Options.MAXIMUM_MOVES_PER_TARGET
+ " must be non-negative");
}
if (maximumMovesPerTarget > maximumMoves) {
throw new RuntimeException(Options.MAXIMUM_MOVES_PER_TARGET
+ " must be less than " + Options.MAXIMUM_MOVES);
}
}
// movePercentOfSplitThreshold
{
maximumMovePercentOfSplit = Double.parseDouble(properties.getProperty(
Options.MAXIMUM_MOVE_PERCENT_OF_SPLIT,
Options.DEFAULT_MAXIMUM_MOVE_PERCENT_OF_SPLIT));
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_MOVE_PERCENT_OF_SPLIT + "="
+ maximumMovePercentOfSplit);
if (maximumMovePercentOfSplit < 0 || maximumMovePercentOfSplit > 2) {
throw new RuntimeException(Options.MAXIMUM_MOVE_PERCENT_OF_SPLIT
+ " must be in [0:2]");
}
}
// movePercentCpuTimeThreshold
{
movePercentCpuTimeThreshold = Double.parseDouble(properties
.getProperty(Options.MOVE_PERCENT_CPU_TIME_THRESHOLD,
Options.DEFAULT_MOVE_PERCENT_CPU_TIME_THRESHOLD));
if (log.isInfoEnabled())
log.info(Options.MOVE_PERCENT_CPU_TIME_THRESHOLD + "="
+ movePercentCpuTimeThreshold);
if (movePercentCpuTimeThreshold < .0
|| movePercentCpuTimeThreshold > 1.) {
throw new RuntimeException(
Options.MOVE_PERCENT_CPU_TIME_THRESHOLD
+ " must be in [0.0:1.0] ");
}
}
// {
// maximumSourcesPerView = Integer.parseInt(properties.getProperty(
// Options.MAXIMUM_SOURCES_PER_VIEW,
// Options.DEFAULT_MAXIMUM_SOURCES_PER_VIEW));
//
// if(log.isInfoEnabled())
// log.info(Options.MAXIMUM_SOURCES_PER_VIEW+ "="
// + maximumSourcesPerView);
//
// if (maximumSourcesPerView < 1) {
//
// throw new RuntimeException(
// Options.MAXIMUM_SOURCES_PER_VIEW
// + " must be GT ONE (1)");
//
// }
//
// }
{
maximumOptionalMergesPerOverflow = Integer.parseInt(properties.getProperty(
Options.MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW,
Options.DEFAULT_OPTIONAL_COMPACTING_MERGES_PER_OVERFLOW));
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW + "="
+ maximumOptionalMergesPerOverflow);
if (maximumOptionalMergesPerOverflow < 0) {
throw new RuntimeException(
Options.MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW
+ " must be non-negative");
}
}
{
maximumJournalsPerView = Integer.parseInt(properties.getProperty(
Options.MAXIMUM_JOURNALS_PER_VIEW,
Options.DEFAULT_MAXIMUM_JOURNALS_PER_VIEW));
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_JOURNALS_PER_VIEW + "="
+ maximumJournalsPerView);
if (maximumJournalsPerView < 2) {
throw new RuntimeException(Options.MAXIMUM_JOURNALS_PER_VIEW
+ " must be GTE 2");
}
}
{
maximumSegmentsPerView = Integer.parseInt(properties.getProperty(
Options.MAXIMUM_SEGMENTS_PER_VIEW,
Options.DEFAULT_MAXIMUM_SEGMENTS_PER_VIEW));
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_SEGMENTS_PER_VIEW + "="
+ maximumSegmentsPerView);
if (maximumSegmentsPerView < 1) {
throw new RuntimeException(Options.MAXIMUM_SEGMENTS_PER_VIEW
+ " must be GTE 1");
}
}
// maximumBuildSegmentBytes
{
maximumBuildSegmentBytes = Long.parseLong(properties.getProperty(
Options.MAXIMUM_BUILD_SEGMENT_BYTES,
Options.DEFAULT_MAXIMUM_BUILD_SEGMENTS_BYTES));
if (maximumBuildSegmentBytes < 0) {
throw new RuntimeException("The '" + Options.SHUTDOWN_TIMEOUT
+ "' must be non-negative.");
}
if (log.isInfoEnabled())
log.info(Options.MAXIMUM_BUILD_SEGMENT_BYTES + "="
+ maximumBuildSegmentBytes);
}
// shutdownTimeout
{
shutdownTimeout = Long
.parseLong(properties.getProperty(Options.SHUTDOWN_TIMEOUT,
Options.DEFAULT_SHUTDOWN_TIMEOUT));
if (shutdownTimeout < 0) {
throw new RuntimeException("The '" + Options.SHUTDOWN_TIMEOUT
+ "' must be non-negative.");
}
if(log.isInfoEnabled())
log.info(Options.SHUTDOWN_TIMEOUT + "=" + shutdownTimeout);
}
// nominalShardSize
{
nominalShardSize = Long.parseLong(properties.getProperty(
Options.NOMINAL_SHARD_SIZE,
Options.DEFAULT_NOMINAL_SHARD_SIZE));
/*
* Note: When debugging some unit tests it may be necessary to
* override [minShardSize] in order to test against smaller shards.
* The correct value is [Bytes.megabyte].
*
* @see com.bigdata.resources.TestSplitTask
*
* @see com.bigdata.services.TestSplitJoin
*
* @see com.bigdata.services.StressTestConcurrent
*/
final long minShardSize = Bytes.kilobyte;
if (nominalShardSize < minShardSize) {
throw new RuntimeException("The '" + Options.NOMINAL_SHARD_SIZE
+ "' must be GTE " + minShardSize);
}
if (log.isInfoEnabled())
log.info(Options.NOMINAL_SHARD_SIZE + "=" + nominalShardSize);
}
/*
* Obtain the service name so that we can include it in the
* overflowService thread name (if possible).
*/
{
String serviceName = null;
try {
serviceName = getDataService().getServiceName();
} catch (UnsupportedOperationException ex) {
// ignore.
} catch (Throwable t) {
log.warn(t.getMessage(), t);
}
this.serviceName = serviceName;
}
if(overflowEnabled) {
// @todo defer allocation until init() outside of ctor.
overflowService = Executors.newFixedThreadPool(1,
new DaemonThreadFactory((serviceName == null ? ""
: serviceName + "-")
+ "overflowService"));
/*
* Note: The core thread is pre-started so that the MDC logging
* information does not get inherited from whatever thread was
* running the AbstractTask that wound up doing the groupCommit
* during which overflow processing was initiated - this just cleans
* up the log which is otherwise (even more) confusing.
*/
((ThreadPoolExecutor) overflowService).prestartCoreThread();
// buildService
{
buildServiceCorePoolSize = Integer.parseInt(properties
.getProperty(Options.BUILD_SERVICE_CORE_POOL_SIZE,
Options.DEFAULT_BUILD_SERVICE_CORE_POOL_SIZE));
if (log.isInfoEnabled())
log.info(Options.BUILD_SERVICE_CORE_POOL_SIZE + "="
+ buildServiceCorePoolSize);
}
// mergeService
{
mergeServiceCorePoolSize = Integer.parseInt(properties
.getProperty(Options.MERGE_SERVICE_CORE_POOL_SIZE,
Options.DEFAULT_MERGE_SERVICE_CORE_POOL_SIZE));
if (log.isInfoEnabled())
log.info(Options.MERGE_SERVICE_CORE_POOL_SIZE + "="
+ mergeServiceCorePoolSize);
}
} else {
overflowService = null;
buildServiceCorePoolSize = 0;
mergeServiceCorePoolSize = 0;
}
}
synchronized public void shutdown() {
if(!isOpen()) return;
final long begin = System.currentTimeMillis();
if(log.isInfoEnabled())
log.info("Begin");
/*
* overflowService shutdown
*
* Note: This uses immediate termination even during shutdown since
* asynchronous overflow processing does not need to complete and will
* remain coherent regardless of when it is interrupted.
*/
if (overflowService != null)
overflowService.shutdownNow();
// {
//
// /*
// * Note: when the timeout is zero we approximate "forever" using
// * Long.MAX_VALUE.
// */
//
// final long shutdownTimeout = this.shutdownTimeout == 0L ? Long.MAX_VALUE
// : this.shutdownTimeout;
//
// final TimeUnit unit = TimeUnit.MILLISECONDS;
//
// overflowService.shutdown();
//
// try {
//
// log.info("Awaiting service termination");
//
// long elapsed = System.currentTimeMillis() - begin;
//
// if (!overflowService.awaitTermination(shutdownTimeout - elapsed, unit)) {
//
// log.warn("Service termination: timeout");
//
// }
//
// } catch (InterruptedException ex) {
//
// log.warn("Interrupted awaiting service termination.", ex);
//
// }
//
// }
super.shutdown();
final long elapsed = System.currentTimeMillis() - begin;
if(log.isInfoEnabled())
log.info("Done: elapsed="+elapsed+"ms");
}
synchronized public void shutdownNow() {
if(!isOpen()) return;
final long begin = System.currentTimeMillis();
if (log.isInfoEnabled())
log.info("Begin");
if(overflowService!=null)
overflowService.shutdownNow();
super.shutdownNow();
if(log.isInfoEnabled()) {
final long elapsed = System.currentTimeMillis() - begin;
log.info("Done: elapsed=" + elapsed + "ms");
}
}
/**
* An overflow condition is recognized when the journal is within some
* declared percentage of {@link Options#MAXIMUM_EXTENT}. However, this
* method will return <code>false</code> if overflow has been disabled
* or if there is an asynchronous overflow operation in progress.
*/
public boolean shouldOverflow() {
if(forceOverflow.get()) {
/*
* Note: forceOverflow trumps everything else.
*/
if (log.isInfoEnabled())
log.info("Forcing overflow.");
return true;
}
if (isTransient()) {
/*
* Note: This is disabled in part because we can not close out and
* then re-open a transient journal.
*/
if (log.isDebugEnabled())
log.debug("Overflow processing not allowed for transient journals");
return false;
}
if (!isOverflowEnabled()) {
if (log.isDebugEnabled())
log.debug("Overflow processing is disabled");
return false;
}
if(!overflowAllowed.get()) {
/*
* Note: overflow is disabled until we are done processing the old
* journal.
*
* @todo show elapsed time since disabled in log message.
*/
if (log.isInfoEnabled())
log.info("Asynchronous overflow still active");
return false;
}
/*
* Look for overflow condition on the "live" journal.
*/
final AbstractJournal journal = getLiveJournal();
// true iff the journal meets the pre-conditions for overflow.
final boolean shouldOverflow;
// #of bytes written on the journal.
final long nextOffset;
{
nextOffset = journal.getRootBlockView().getNextOffset();
if (nextOffset > overflowThreshold * journal.getMaximumExtent()) {
shouldOverflow = true;
} else {
shouldOverflow = false;
}
if (!shouldOverflow && log.isDebugEnabled()) {
log.debug("should not overflow" + ": nextOffset=" + nextOffset
+ ", maximumExtent=" + journal.getMaximumExtent());
} else if (shouldOverflow && log.isInfoEnabled()) {
log.debug("shouldOverflow" + ": nextOffset=" + nextOffset
+ ", maximumExtent=" + journal.getMaximumExtent());
}
}
return shouldOverflow;
}
/**
* Core method for overflow with post-processing.
*
* Note: This method does not test preconditions based on the extent of the
* journal.
*
* Note: The caller is responsible for ensuring that this method is invoked
* with an exclusive lock on the write service.
*
* Preconditions:
*
* - Exclusive lock on the {@link WriteExecutorService}
* - {@link #isOverflowAllowed()}
*
*
* Post-conditions:
*
* - Overflowed onto new journal
* - {@link PostProcessOldJournal} task was submitted.
* - {@link #isOverflowAllowed()} was set <code>false</code> and will
* remain <code>false</code> until {@link PostProcessOldJournal} completes.
*
*
* @todo write unit test for an overflow edge case in which we attempt to
* perform a read-committed task on a pre-existing index immediately
* after an {@link #overflow()} and verify that a commit record exists
* on the new journal and that the read-committed task can read from
* the fused view of the new (empty) index on the new journal and the
* old index on the old journal.
*/
public Future