/*
 * NOTE: The lines below are residue from the web page this file was scraped
 * from — they are not part of the original source. They are preserved inside
 * this comment so the file remains valid Java:
 *
 *   Please wait. This can take some minutes ...
 *   Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
 *   Project price only 1 $
 *   You can buy this project and download/modify it how often you want.
 *   com.gemstone.gemfire.internal.cache.DiskStoreImpl Maven / Gradle / Ivy
 */
/*
* Copyright (c) 2010-2015 Pivotal Software, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License. See accompanying
* LICENSE file.
*/
package com.gemstone.gemfire.internal.cache;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.PrintStream;
import java.net.InetAddress;
import java.nio.channels.ClosedByInterruptException;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import com.gemstone.gemfire.CancelCriterion;
import com.gemstone.gemfire.CancelException;
import com.gemstone.gemfire.StatisticsFactory;
import com.gemstone.gemfire.SystemFailure;
import com.gemstone.gemfire.cache.Cache;
import com.gemstone.gemfire.cache.CacheClosedException;
import com.gemstone.gemfire.cache.DiskAccessException;
import com.gemstone.gemfire.cache.DiskStore;
import com.gemstone.gemfire.cache.DiskStoreFactory;
import com.gemstone.gemfire.cache.RegionDestroyedException;
import com.gemstone.gemfire.cache.persistence.PersistentID;
import com.gemstone.gemfire.cache.query.IndexMaintenanceException;
import com.gemstone.gemfire.distributed.DistributedSystem;
import com.gemstone.gemfire.distributed.internal.InternalDistributedSystem;
import com.gemstone.gemfire.distributed.internal.membership.InternalDistributedMember;
import com.gemstone.gemfire.i18n.LogWriterI18n;
import com.gemstone.gemfire.internal.ByteArrayDataInput;
import com.gemstone.gemfire.internal.FileUtil;
import com.gemstone.gemfire.internal.InsufficientDiskSpaceException;
import com.gemstone.gemfire.internal.LogWriterImpl;
import com.gemstone.gemfire.internal.NanoTimer;
import com.gemstone.gemfire.internal.cache.GemFireCacheImpl.StaticSystemCallbacks;
import com.gemstone.gemfire.internal.cache.Oplog.DiskRegionInfo;
import com.gemstone.gemfire.internal.cache.Oplog.KRFEntry;
import com.gemstone.gemfire.internal.cache.control.InternalResourceManager;
import com.gemstone.gemfire.internal.cache.control.InternalResourceManager.ResourceType;
import com.gemstone.gemfire.internal.cache.control.MemoryEvent;
import com.gemstone.gemfire.internal.cache.control.MemoryThresholds.MemoryState;
import com.gemstone.gemfire.internal.cache.control.ResourceListener;
import com.gemstone.gemfire.internal.cache.lru.LRUAlgorithm;
import com.gemstone.gemfire.internal.cache.lru.LRUStatistics;
import com.gemstone.gemfire.internal.cache.persistence.BackupInspector;
import com.gemstone.gemfire.internal.cache.persistence.BackupManager;
import com.gemstone.gemfire.internal.cache.persistence.BytesAndBits;
import com.gemstone.gemfire.internal.cache.persistence.DiskExceptionHandler;
import com.gemstone.gemfire.internal.cache.persistence.DiskRecoveryStore;
import com.gemstone.gemfire.internal.cache.persistence.DiskRegionView;
import com.gemstone.gemfire.internal.cache.persistence.DiskStoreFilter;
import com.gemstone.gemfire.internal.cache.persistence.DiskStoreID;
import com.gemstone.gemfire.internal.cache.persistence.OplogType;
import com.gemstone.gemfire.internal.cache.persistence.PRPersistentConfig;
import com.gemstone.gemfire.internal.cache.persistence.PersistentMemberID;
import com.gemstone.gemfire.internal.cache.persistence.PersistentMemberPattern;
import com.gemstone.gemfire.internal.cache.persistence.RestoreScript;
import com.gemstone.gemfire.internal.cache.snapshot.GFSnapshot;
import com.gemstone.gemfire.internal.cache.snapshot.GFSnapshot.SnapshotWriter;
import com.gemstone.gemfire.internal.cache.snapshot.SnapshotPacket.SnapshotRecord;
import com.gemstone.gemfire.internal.cache.versions.RegionVersionVector;
import com.gemstone.gemfire.internal.cache.versions.VersionSource;
import com.gemstone.gemfire.internal.cache.versions.VersionStamp;
import com.gemstone.gemfire.internal.cache.versions.VersionTag;
import com.gemstone.gemfire.internal.concurrent.ConcurrentHashSet;
import com.gemstone.gemfire.internal.i18n.LocalizedStrings;
import com.gemstone.gemfire.internal.offheap.OffHeapHelper;
import com.gemstone.gemfire.internal.offheap.annotations.Released;
import com.gemstone.gemfire.internal.offheap.annotations.Retained;
import com.gemstone.gemfire.internal.shared.SystemProperties;
import com.gemstone.gemfire.internal.shared.Version;
import com.gemstone.gnu.trove.THashMap;
import com.gemstone.gnu.trove.THashSet;
/**
* Represents a (disk-based) persistent store for region data. Used for both
* persistent recoverable regions and overflow-only regions.
*
* @author David Whitlock
* @author Darrel Schneider
* @author Mitul Bid
* @author Asif
*
* @since 3.2
*/
@SuppressWarnings("synthetic-access")
public class DiskStoreImpl implements DiskStore, ResourceListener {
// Prefix used for per-directory subdirectories created during online backup.
private static final String BACKUP_DIR_PREFIX = "dir";
// Product wrapper over system properties; adds the product name prefix on lookup.
private static final SystemProperties sysProps = SystemProperties
.getServerInstance();
// Debug/trace switches, fixed at class-load time from system properties.
public static final boolean TRACE_RECOVERY = sysProps.getBoolean(
"disk.TRACE_RECOVERY", false);
public static final boolean TRACE_WRITES = sysProps.getBoolean(
"disk.TRACE_WRITES", false);
public static final boolean KRF_DEBUG = sysProps.getBoolean(
"disk.KRF_DEBUG", false);
// Cap on the number of inactive oplogs kept open (bounds file-handle usage).
public static final int MAX_OPEN_INACTIVE_OPLOGS = sysProps.getInteger(
"MAX_OPEN_INACTIVE_OPLOGS", 7);
/*
* If less than 20MB (default - configurable through this property) of the
* available space is left for logging and other misc stuff then it
* is better to bail out.
*/
public static final int MIN_DISK_SPACE_FOR_LOGS = sysProps.getInteger(
"MIN_DISK_SPACE_FOR_LOGS", 20);
// Index-load debug flags form a hierarchy: FINER implies DEBUG implies PERF.
public static boolean INDEX_LOAD_DEBUG_FINER = sysProps.getBoolean(
"IndexLoadDebugFiner", false);
public static boolean INDEX_LOAD_DEBUG = INDEX_LOAD_DEBUG_FINER
|| sysProps.getBoolean("IndexLoadDebug", false);
public static boolean INDEX_LOAD_PERF_DEBUG = INDEX_LOAD_DEBUG
|| sysProps.getBoolean("IndexLoadPerfDebug", false);
/** Represents an invalid id of a key/value on disk */
public static final long INVALID_ID = 0L; // must be zero
private static final String COMPLETE_COMPACTION_BEFORE_TERMINATION_PROPERTY_BASE_NAME =
"disk.completeCompactionBeforeTermination";
public static final String COMPLETE_COMPACTION_BEFORE_TERMINATION_PROPERTY_NAME =
sysProps.getSystemPropertyNamePrefix()
+ COMPLETE_COMPACTION_BEFORE_TERMINATION_PROPERTY_BASE_NAME;
// Smallest allowed per-directory size, in megabytes.
static final int MINIMUM_DIR_SIZE = 1024;
/**
* The static field delays the joining of the close/clear/destroy & forceFlush
* operation, with the compactor thread. This joining occurs after the
* compactor thread is notified to exit. This was added to reproduce deadlock
* caused by concurrent destroy & clear operation where clear operation is
* restarting the compactor thread ( a new thread object different from the
* one for which destroy operation issued notification for release). The delay
* occurs iff the flag used for enabling callbacks to CacheObserver is enabled
* true
*/
static volatile long DEBUG_DELAY_JOINING_WITH_COMPACTOR = 500;
/**
* Kept for backwards compat. Should use allowForceCompaction api/dtd instead.
*/
private final static boolean ENABLE_NOTIFY_TO_ROLL = sysProps.getBoolean(
"ENABLE_NOTIFY_TO_ROLL", false);
private static final String RECOVER_VALUE_PROPERTY_BASE_NAME =
"disk.recoverValues";
public static final String RECOVER_VALUE_PROPERTY_NAME = sysProps
.getSystemPropertyNamePrefix() + RECOVER_VALUE_PROPERTY_BASE_NAME;
private static final String RECOVER_VALUES_SYNC_PROPERTY_BASE_NAME =
"disk.recoverValuesSync";
public static final String RECOVER_VALUES_SYNC_PROPERTY_NAME = sysProps
.getSystemPropertyNamePrefix() + RECOVER_VALUES_SYNC_PROPERTY_BASE_NAME;
/***
* Flag to determine if KRF recovery is to be done during data extraction.
*/
protected boolean dataExtractionKrfRecovery = false;
/*****
* Flag to determine if the offline disk-store is used for data extraction
*/
protected boolean dataExtraction = false;
// Value recovery switches (non-final so tests can flip them per instance).
boolean RECOVER_VALUES = sysProps.getBoolean(
RECOVER_VALUE_PROPERTY_BASE_NAME, true);
boolean RECOVER_VALUES_SYNC = sysProps.getBoolean(
RECOVER_VALUES_SYNC_PROPERTY_BASE_NAME, false);
boolean FORCE_KRF_RECOVERY = sysProps.getBoolean(
"disk.FORCE_KRF_RECOVERY", false);
public static final int MAX_SOPLOGS_PER_LEVEL = sysProps.getInteger(
"disk.MAX_SOPLOGS_PER_LEVEL", 4);
//TODO soplogs - need to be able to default this to an unlimited number of levels
//The SizeTieredCompactor currently creates all levels up front
public static final int MAX_SOPLOG_LEVELS = sysProps.getInteger(
"disk.MAX_SOPLOG_LEVELS", 10);
// Disk-region id space: ids 1..8 are reserved; user regions start at MIN_DRID.
public static final long MIN_RESERVED_DRID = 1;
public static final long MAX_RESERVED_DRID = 8;
static final long MIN_DRID = MAX_RESERVED_DRID + 1;
/**
* Estimated number of bytes written to disk for each new disk id.
*/
static final int BYTES_PER_ID = 8;
/**
* Maximum number of oplogs to compact per compaction operations. Defaults to
* 1 to allows oplogs to be deleted quickly, to reduce amount of memory used
* during a compaction and to be fair to other regions waiting for a compactor
* thread from the pool. Ignored if set to <= 0. Made non static so tests can
* set it.
*/
private final int MAX_OPLOGS_PER_COMPACTION = sysProps.getInteger(
"MAX_OPLOGS_PER_COMPACTION",
sysProps.getInteger("MAX_OPLOGS_PER_ROLL", 1));
// "MAX_CONCURRENT_ROLLS" is the legacy name for the same setting.
public static final int MAX_CONCURRENT_COMPACTIONS = sysProps.getInteger(
"MAX_CONCURRENT_COMPACTIONS",
sysProps.getInteger("MAX_CONCURRENT_ROLLS", 1));
/**
* This system property indicates that maximum number of delayed write
* tasks that can be pending before submitting the tasks start blocking.
* These tasks are things like unpreblow oplogs, delete oplogs, etc.
*/
public static final int MAX_PENDING_TASKS = sysProps.getInteger("disk.MAX_PENDING_TASKS", 6);
/**
* This system property indicates that IF should also be preallocated. This property
* will be used in conjunction with the PREALLOCATE_OPLOGS property. If PREALLOCATE_OPLOGS
* is ON the below will by default be ON but in order to switch it off you need to explicitly
*/
static final boolean PREALLOCATE_IF = sysProps.getBoolean(
"preAllocateIF", true);
/**
* This system property indicates that Oplogs should be preallocated till the
* maxOplogSize as specified for the disk store.
*/
static final boolean PREALLOCATE_OPLOGS = sysProps.getBoolean(
"preAllocateDisk", true);
/** For some testing purposes we would not consider top property if this flag is set to true **/
public static boolean SET_IGNORE_PREALLOCATE = false;
/**
* This system property turns on synchronous writes just the the init file.
*/
static final boolean SYNC_IF_WRITES = sysProps.getBoolean(
"syncMetaDataWrites", false);
/**
* Property to disable fsync behavior to speed up precheckin runs.
*/
public static boolean DISABLE_SYNC_WRITES_FOR_TESTS = sysProps
.getBoolean("DISABLE_SYNC_WRITES_FOR_TESTS", false);
// /** delay for slowing down recovery, for testing purposes only */
// public static volatile int recoverDelay = 0;
// //////////////////// Instance Fields ///////////////////////
// The cache that owns this disk store.
private final GemFireCacheImpl cache;
/** The stats for this store */
private final DiskStoreStats stats;
final LogWriterI18n logger;
/**
* Asif:Added as stop gap arrangement to fix bug 39380. It is not a clean fix
* as keeping track of the threads acquiring read lock, etc is not a good idea
* to solve the issue
*/
private final AtomicInteger entryOpsCount = new AtomicInteger(0);
/**
* Do not want to take chance with any object like DiskRegion etc as lock
*/
private final Object closeRegionGuard = new Object();
/** Number of dirs* */
final int dirLength;
/** Disk directory holders* */
DirectoryHolder[] directories;
/** max of all the dir sizes given stored in bytes* */
private final long maxDirSize;
/** disk dir to be used by info file * */
private int infoFileDirIndex;
// Percentage threshold that triggers compaction; see calcCompactionThreshold.
private final int compactionThreshold;
/**
* The limit of how many items can be in the async queue before async starts
* blocking and a flush is forced. If this value is 0 then no limit.
*/
private final int maxAsyncItems;
private final AtomicInteger forceFlushCount;
private final Object asyncMonitor;
// complex vars
/** Compactor task which does the compaction. Null if compaction not possible. */
private final OplogCompactor oplogCompactor;
private DiskInitFile initFile = null;
private volatile DiskStoreBackup diskStoreBackup = null;
// Read/write lock guarding compactor lifecycle transitions.
private final ReentrantReadWriteLock compactorLock = new ReentrantReadWriteLock();
private final WriteLock compactorWriteLock = compactorLock.writeLock();
private final ReadLock compactorReadLock = compactorLock.readLock();
/**
* Set if we have encountered a disk exception causing us to shutdown this
* disk store. This is currently used only to prevent trying to shutdown the
* disk store from multiple threads, but I think at some point we should use
* this to prevent any other ops from completing during the close operation.
*/
private final AtomicReference diskException = new AtomicReference();
private boolean isForInternalUse;
// Oplog sets: persistent for backup regions, overflow for overflow-only ones.
PersistentOplogSet persistentOplogs = new PersistentOplogSet(this);
OverflowOplogSet overflowOplogs = new OverflowOplogSet(this);
/** For testing purpose **/
public THashMap TEST_INDEX_ACCOUNTING_MAP;
public static boolean TEST_NEW_CONTAINER = false;
public List TEST_NEW_CONTAINER_LIST;
// index recovery related flags
// State machine UNINIT -> INIT -> DONE; current state kept in indexRecoveryState[0].
private static final int INDEXRECOVERY_UNINIT = 1;
private static final int INDEXRECOVERY_INIT = 2;
private static final int INDEXRECOVERY_DONE = 3;
private final int[] indexRecoveryState;
private final AtomicReference indexRecoveryFailure;
// private boolean isThreadWaitingForSpace = false;
/**
* Get the next available dir
*/
// /**
// * Max timed wait for disk space to become available for an entry operation
// ,
// * in milliseconds. This will be the maximum time for which a
// * create/modify/remove operation will wait so as to allow switch over & get
// a
// * new Oplog for writing. If no space is available in that time,
// * DiskAccessException will be thrown. The default wait will be for 120
// * seconds
// */
// private static final long MAX_WAIT_FOR_SPACE = SystemProperties.getInteger(
// "MAX_WAIT_FOR_SPACE", 20) * 1000;
// Generator for new disk region ids; starts above the reserved id range.
private final AtomicLong regionIdCtr = new AtomicLong(MIN_DRID);
/**
* Only contains backup DiskRegions. The Value could be a RecoveredDiskRegion
* or a DiskRegion
*/
private final ConcurrentMap drMap = new ConcurrentHashMap();
/**
* A set of overflow only regions that are using this disk store.
*/
private final Set overflowMap = new ConcurrentHashSet();
/**
* Contains all of the disk recovery stores for which we are recovering values
* asnynchronously.
*/
private final Map currentAsyncValueRecoveryMap = new HashMap();
private final Object asyncValueRecoveryLock = new Object();
/**
* The unique id for this disk store.
*
* Either set during recovery of an existing disk store when the
* IFREC_DISKSTORE_ID record is read or when a new init file is created.
*
*/
private DiskStoreID diskStoreID;
// Pools for background disk-store tasks and delayed writes (delete/unpreblow).
private final ThreadPoolExecutor diskStoreTaskPool;
private final ThreadPoolExecutor delayedWritePool;
private volatile Future lastDelayedWrite;
// ///////////////////// Constructors /////////////////////////
/**
 * Computes the effective compaction threshold percentage, honoring the
 * legacy "OVERFLOW_ROLL_PERCENTAGE" system property for backwards
 * compatibility when the caller left the threshold at its default.
 *
 * @param ct the configured compaction threshold
 * @return the threshold to use; if {@code ct} is the factory default and the
 *         legacy roll-percentage property is set, that percentage wins
 */
private static int calcCompactionThreshold(int ct) {
  if (ct == DiskStoreFactory.DEFAULT_COMPACTION_THRESHOLD) {
    // allow the old sys prop for backwards compat.
    // BUG FIX: the original checked "OVERFLOW_ROLL_PERCENTAGE" but then read
    // back "gemfire.OVERFLOW_ROLL_PERCENTAGE" through the same prefixing
    // wrapper, a different key, so the configured value was ignored and the
    // hard-coded 0.50 default always applied. Read the property once instead.
    String rollPercentage = sysProps.getString("OVERFLOW_ROLL_PERCENTAGE", null);
    if (rollPercentage != null) {
      ct = (int) (Double.parseDouble(rollPercentage) * 100.0);
    }
  }
  return ct;
}
/**
 * Creates a new DiskStoreImpl that accesses disk on behalf of the given
 * cache, with all offline/validating/compacting flags defaulted to false.
 */
DiskStoreImpl(Cache cache, DiskStoreAttributes props) {
this(cache, props, false, null);
}
// Delegates to the full constructor: online (not offline), not upgrading,
// not validating, not offline-compacting, with oplogs enabled.
DiskStoreImpl(Cache cache, DiskStoreAttributes props, boolean ownedByRegion,
InternalRegionArguments internalRegionArgs) {
this(cache, props.getName(), props, ownedByRegion, internalRegionArgs,
false, false/* upgradeVersionOnly */, false, false, true);
}
// Delegates to the master constructor with dataExtraction defaulted to false.
DiskStoreImpl(Cache cache, String name, DiskStoreAttributes props,
boolean ownedByRegion, InternalRegionArguments internalRegionArgs,
boolean offline, boolean upgradeVersionOnly, boolean offlineValidating,
boolean offlineCompacting, boolean needsOplogs) {
this(cache, name, props, ownedByRegion, internalRegionArgs, offline, upgradeVersionOnly, offlineValidating, offlineCompacting, needsOplogs, false);
}
/**
 * Master constructor. Copies the (validated) attributes, builds the per-directory
 * holders and async flush queue, loads the init/oplog files from disk, recovers
 * index metadata, and starts the compactor and background task pools.
 *
 * Note: initialization order matters here (e.g. properties are validated before
 * being read, and loadFiles runs before index-id recovery); do not reorder.
 */
DiskStoreImpl(Cache cache, String name, DiskStoreAttributes props,
boolean ownedByRegion, InternalRegionArguments internalRegionArgs,
boolean offline, boolean upgradeVersionOnly, boolean offlineValidating,
boolean offlineCompacting, boolean needsOplogs, boolean dataExtraction) {
// Mode flags first; several accessors below (isOffline etc.) depend on them.
this.dataExtraction = dataExtraction;
this.offline = offline;
this.upgradeVersionOnly = upgradeVersionOnly;
this.validating = offlineValidating;
this.offlineCompacting = offlineCompacting;
assert internalRegionArgs == null || ownedByRegion : "internalRegionArgs "
+ "should be non-null only if the DiskStore is owned by region";
this.ownedByRegion = ownedByRegion;
this.internalRegionArgs = internalRegionArgs;
// validate properties before reading from it
props.validateAndAdjust();
this.name = name;
this.autoCompact = props.getAutoCompact();
this.allowForceCompaction = props.getAllowForceCompaction();
this.compactionThreshold = calcCompactionThreshold(props
.getCompactionThreshold());
this.maxOplogSizeInBytes = props.getMaxOplogSizeInBytes();
this.timeInterval = props.getTimeInterval();
this.queueSize = props.getQueueSize();
this.writeBufferSize = props.getWriteBufferSize();
this.diskDirs = props.getDiskDirs();
this.diskDirSizes = props.getDiskDirSizes();
// DISABLE_SYNC_WRITES_FOR_TESTS globally overrides the configured sync mode.
this.syncWrites = props.getSyncWrites() && !DISABLE_SYNC_WRITES_FOR_TESTS;
this.cache = (GemFireCacheImpl) cache;
logger = cache.getLoggerI18n();
StatisticsFactory factory = cache.getDistributedSystem();
this.stats = new DiskStoreStats(factory, getName());
// start simple init
this.isCompactionPossible = isOfflineCompacting()
|| (!isOffline() && (getAutoCompact() || getAllowForceCompaction() || ENABLE_NOTIFY_TO_ROLL));
this.maxAsyncItems = getQueueSize();
this.forceFlushCount = new AtomicInteger();
this.asyncMonitor = new Object();
// always use LinkedBlockingQueue to work around bug 41470
// if (this.maxAsyncItems > 0 && this.maxAsyncItems < 1000000) {
// // we compare to 1,000,000 so that very large maxItems will
// // not cause us to consume too much memory in our queue.
// // Removed the +13 since it made the queue bigger than was configured.
// // The +13 is to give us a bit of headroom during the drain.
// this.asyncQueue = new
// ArrayBlockingQueue(this.maxAsyncItems/*+13*/);
// } else {
if (this.maxAsyncItems > 0) {
this.asyncQueue = new ForceableLinkedBlockingQueue(
this.maxAsyncItems); // fix for bug 41310
} else {
this.asyncQueue = new ForceableLinkedBlockingQueue();
}
// Offline validation/compaction runs single-threaded; no flusher needed.
if (!isValidating() && !isOfflineCompacting()) {
startAsyncFlusher();
}
File[] dirs = getDiskDirs();
int[] dirSizes = getDiskDirSizes();
int length = dirs.length;
this.directories = new DirectoryHolder[length];
long tempMaxDirSize = 0;
for (int i = 0; i < length; i++) {
directories[i] = new DirectoryHolder(getName() + "_DIR#" + i, factory,
dirs[i], dirSizes[i], i);
// logger.info(LocalizedStrings.DEBUG, "DEBUG ds=" + name + " dir#" + i +
// "=" + directories[i]);
if (tempMaxDirSize < dirSizes[i]) {
tempMaxDirSize = dirSizes[i];
}
}
// stored in bytes
this.maxDirSize = tempMaxDirSize * 1024 * 1024;
this.infoFileDirIndex = 0;
// Now that we no longer have db files, use all directories for oplogs
/**
* The infoFileDir contains the lock file and the init file. It will be
* directories[0] on a brand new disk store. On an existing disk store it
* will be the directory the init file is found in.
*/
this.dirLength = length;
loadFiles(needsOplogs);
// Store all the ddl ids which this vm has already seen
final GemFireCacheImpl.StaticSystemCallbacks sysCb = GemFireCacheImpl
.getInternalProductCallbacks();
if (sysCb != null) {
this.persistIndexes = sysCb.persistIndexes(this);
}
else {
this.persistIndexes = false;
}
// Recovered index ids = created minus deleted, as recorded in the init file.
this.recoveredIndexIds = new HashSet(getDiskInitFile()
.getCreatedIndexIds());
Set deletedIndexIds = getDiskInitFile().getDeletedIndexIds();
if (!deletedIndexIds.isEmpty()) {
this.recoveredIndexIds.removeAll(deletedIndexIds);
}
this.indexRecoveryState = new int[] { INDEXRECOVERY_UNINIT };
this.indexRecoveryFailure = new AtomicReference(null);
// setFirstChild(getSortedOplogs());
// complex init
if (isCompactionPossible() && !isOfflineCompacting()) {
this.oplogCompactor = new OplogCompactor();
this.oplogCompactor.startCompactor();
} else {
this.oplogCompactor = null;
}
int MAXT = DiskStoreImpl.MAX_CONCURRENT_COMPACTIONS;
final ThreadGroup compactThreadGroup = LogWriterImpl.createThreadGroup("Oplog Compactor Thread Group", this.logger);
final ThreadFactory compactThreadFactory = GemfireCacheHelper.CreateThreadFactory(compactThreadGroup, "Idle OplogCompactor");
this.diskStoreTaskPool = new ThreadPoolExecutor(MAXT, MAXT, 10, TimeUnit.SECONDS,
new LinkedBlockingQueue(),
compactThreadFactory);
this.diskStoreTaskPool.allowCoreThreadTimeOut(true);
final ThreadGroup deleteThreadGroup = LogWriterImpl.createThreadGroup("Oplog Delete Thread Group", this.logger);
final ThreadFactory deleteThreadFactory = GemfireCacheHelper.CreateThreadFactory(deleteThreadGroup, "Oplog Delete Task");
// Bounded queue + CallerRunsPolicy: submitters block (by running the task
// themselves) once MAX_PENDING_TASKS delayed writes are pending.
this.delayedWritePool = new ThreadPoolExecutor(1, 1, 10, TimeUnit.SECONDS,
new LinkedBlockingQueue(MAX_PENDING_TASKS),
deleteThreadFactory, new ThreadPoolExecutor.CallerRunsPolicy());
this.delayedWritePool.allowCoreThreadTimeOut(true);
// register with ResourceManager to adjust async queue size
InternalResourceManager irm = this.cache.getResourceManager();
if (irm != null) {
irm.addResourceListener(ResourceType.HEAP_MEMORY, this);
}
}
////////////////////// Instance Methods //////////////////////
/**
 * Marks this disk store as one created for internal product use rather than
 * by user configuration.
 */
public void setUsedForInternalUse() {
  isForInternalUse = true;
}
/**
 * @return true if this disk store was flagged for internal product use via
 *         {@link #setUsedForInternalUse()}
 */
public boolean isUsedForInternalUse() {
  return isForInternalUse;
}
/**
 * @return true if indexes are persisted to this disk store (decided at
 *         construction time from the product callbacks)
 */
public final boolean isPersistIndexes() {
  return persistIndexes;
}
/**
 * Pins the async queue's capacity at whatever its current size is, for the
 * case where no capacity was explicitly configured.
 */
public final void setAsyncQueueCapacityToCurrent() {
  asyncQueue.setCurrentSizeAsCapacity();
}
/**
 * Restores the async queue's original capacity, undoing a prior
 * {@link #setAsyncQueueCapacityToCurrent()}.
 */
public final void resetAsyncQueueCapacity() {
  asyncQueue.resetCapacity();
}
/**
 * Compares this disk store's live configuration against the given
 * attributes. Every mismatching attribute is logged at DEBUG level (all of
 * them, not just the first) before the combined result is returned.
 *
 * @param props the attributes to compare against
 * @return true only if every compared attribute matches
 */
public boolean sameAs(DiskStoreAttributes props) {
  // Non-short-circuiting &= so every mismatch is logged, as before.
  boolean same = true;
  same &= compareAttr("allowForceCompaction",
      getAllowForceCompaction() == props.getAllowForceCompaction(),
      getAllowForceCompaction(), props.getAllowForceCompaction());
  same &= compareAttr("AutoCompact",
      getAutoCompact() == props.getAutoCompact(),
      getAutoCompact(), props.getAutoCompact());
  same &= compareAttr("CompactionThreshold",
      getCompactionThreshold() == props.getCompactionThreshold(),
      getCompactionThreshold(), props.getCompactionThreshold());
  same &= compareAttr("MaxOplogSizeInBytes",
      getMaxOplogSizeInBytes() == props.getMaxOplogSizeInBytes(),
      getMaxOplogSizeInBytes(), props.getMaxOplogSizeInBytes());
  same &= compareAttr("Name", getName().equals(props.getName()),
      getName(), props.getName());
  same &= compareAttr("QueueSize", getQueueSize() == props.getQueueSize(),
      getQueueSize(), props.getQueueSize());
  same &= compareAttr("TimeInterval",
      getTimeInterval() == props.getTimeInterval(),
      getTimeInterval(), props.getTimeInterval());
  same &= compareAttr("WriteBufferSize",
      getWriteBufferSize() == props.getWriteBufferSize(),
      getWriteBufferSize(), props.getWriteBufferSize());
  same &= compareAttr("DiskDirs",
      Arrays.equals(getDiskDirs(), props.getDiskDirs()),
      Arrays.toString(getDiskDirs()), Arrays.toString(props.getDiskDirs()));
  same &= compareAttr("DiskDirSizes",
      Arrays.equals(getDiskDirSizes(), props.getDiskDirSizes()),
      Arrays.toString(getDiskDirSizes()),
      Arrays.toString(props.getDiskDirSizes()));
  return same;
}

/**
 * Logs a DEBUG-level mismatch message for a single attribute comparison and
 * passes the comparison result through. Extracted so each attribute is
 * evaluated and logged exactly once (the original duplicated every
 * comparison: once for logging, once in the returned conjunction).
 */
private boolean compareAttr(String what, boolean equal, Object mine,
    Object theirs) {
  if (!equal) {
    this.logger.info(LocalizedStrings.DEBUG,
        "DEBUG " + what + " " + mine + "!=" + theirs);
  }
  return equal;
}
/**
 * Returns the {@link DiskStoreStats} that track activity on this store.
 */
public DiskStoreStats getStats() {
  return stats;
}
/**
 * Collects every disk region known to this store: the live entries in the
 * region map plus those recorded in the init file.
 */
public Map getAllDiskRegions() {
  Map all = new HashMap(drMap);
  all.putAll(initFile.getDRMap());
  return all;
}
/**
 * Queues the given recovery store with the persistent oplog set so its data
 * will be recovered.
 */
void scheduleForRecovery(DiskRecoveryStore drs) {
  final DiskRegionView view = drs.getDiskRegionView();
  getPersistentOplogSet(view).scheduleForRecovery(drs);
}
/**
 * Initializes the contents of any regions on this DiskStore that have been
 * registered but are not yet initialized.
 *
 * Only persistent, recreated regions need work here: either the recovered
 * entry map is handed over to the new owner under the recovery lock, or a
 * fresh recovery is scheduled and run.
 */
final void initializeOwner(LocalRegion lr,
InternalRegionArguments internalRegionArgs) {
DiskRegion dr = lr.getDiskRegion();
//We don't need to do recovery for overflow regions.
if(!lr.getDataPolicy().withPersistence() || !dr.isRecreated()) {
return;
}
DiskRegionView drv = lr.getDiskRegionView();
// Lock prevents async value recovery from touching the entry map while
// ownership is transferred to the new LocalRegion.
synchronized (currentAsyncValueRecoveryMap) {
dr.changeOwnerForExistingRegionMap(lr, internalRegionArgs);
if (drv.getRecoveredEntryMap() != null) {
PersistentOplogSet oplogSet = getPersistentOplogSet(drv);
// prevent async recovery from recovering a value
// while we are copying the entry map.
drv.copyExistingRegionMap(lr, internalRegionArgs);
getStats().incUncreatedRecoveredRegions(-1);
for (Oplog oplog : oplogSet.getAllOplogs()) {
if (oplog != null) {
oplog.updateDiskRegion(lr.getDiskRegionView());
}
}
// Repoint any in-flight async value recovery at the new owner.
if (currentAsyncValueRecoveryMap.containsKey(drv.getId())) {
currentAsyncValueRecoveryMap.put(drv.getId(), lr);
}
return;
}
}
// No recovered entry map yet: schedule and run recovery now.
scheduleForRecovery(lr);
// boolean gotLock = false;
try {
// acquireReadLock(dr);
// gotLock = true;
recoverRegionsThatAreReady(false);
} catch (DiskAccessException dae) {
// Asif:Just rethrow t
throw dae;
} catch (RuntimeException re) {
// @todo: if re is caused by a RegionDestroyedException
// (or CacheClosed...) then don't we want to throw that instead
// of a DiskAccessException?
// Asif :wrap it in DiskAccessException
// IOException is alerady wrappped by DiskRegion correctly.
// Howvever EntryEventImpl .deserialize is converting IOException
// into IllegalArgumentExcepption, so handle only run time exception
// here
throw new DiskAccessException(
"RuntimeException in initializing the disk store from the disk", re,
this);
}
// finally {
// if(gotLock) {
// releaseReadLock(dr);
// }
// }
}
/**
 * Chooses the oplog set for the given region view: the persistent oplogs
 * for backup (persistent) regions, the overflow oplogs otherwise.
 */
public final OplogSet getOplogSet(DiskRegionView drv) {
  return drv.isBackup() ? persistentOplogs : overflowOplogs;
}
/**
 * Returns the persistent oplog set. A non-null view must be a backup
 * (persistent) region; this is asserted rather than enforced.
 */
public final PersistentOplogSet getPersistentOplogSet(DiskRegionView drv) {
  assert drv == null || drv.isBackup();
  return persistentOplogs;
}
/**
 * Stores a key/value pair from a region entry on disk. Updates all of the
 * necessary {@linkplain DiskRegionStats statistics} and invokes
 * {@link Oplog#create} or {@link Oplog#modify}.
 *
 * @param region the region the entry belongs to
 * @param entry the entry which is going to be written to disk
 * @param value the bytes to write
 * @param isSerializedObject do the bytes in {@code value} contain a
 *          serialized object (as opposed to a plain byte array)?
 * @param async true when called from the async flusher (no read lock or
 *          write stats are taken in that case)
 * @throws RegionClearedException if a clear operation completed before the
 *           put operation completed successfully, causing the put to abort
 * @throws IllegalArgumentException if the entry's disk id is less than zero
 */
final void put(LocalRegion region, DiskEntry entry, byte[] value,
boolean isSerializedObject, boolean async) throws RegionClearedException {
DiskRegion dr = region.getDiskRegion();
DiskId id = entry.getDiskId();
if (dr.isBackup() && id.getKeyId() < 0) {
throw new IllegalArgumentException(
LocalizedStrings.DiskRegion_CANT_PUT_A_KEYVALUE_PAIR_WITH_ID_0
.toLocalizedString(id));
}
// Flush stats for async writes, write stats for synchronous ones.
long start = async ? this.stats.startFlush() : this.stats.startWrite();
if (!async) {
dr.getStats().startWrite();
}
try {
if (!async) {
acquireReadLock(dr);
}
try {
if (dr.isRegionClosed()) {
region.getCancelCriterion().checkCancelInProgress(null);
throw new RegionDestroyedException(
LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
.toLocalizedString(), dr.getName());
}
// Asif TODO: Should the htree reference in
// DiskRegion/DiskRegion be made
// volatile.Will theacquireReadLock ensure variable update?
boolean doingCreate = false;
if (dr.isBackup() && id.getKeyId() == INVALID_ID) {
doingCreate = true;
// the call to newOplogEntryId moved down into Oplog.basicCreate
}
boolean goahead = true;
if (dr.didClearCountChange()) {
// mbid: if the reference has changed (by a clear)
// after a put has been made in the region
// then we need to confirm if this key still exists in the region
// before writing to disk
goahead = region.basicGetEntry(entry.getKey()) == entry;
}
if (goahead) {
// in overflow only mode, no need to write the key and the
// extra data, hence if it is overflow only mode then use
// modify and not create
OplogSet oplogSet = getOplogSet(dr);
if (doingCreate) {
oplogSet.create(region, entry, value, isSerializedObject, async);
} else {
oplogSet.modify(region, entry, value, isSerializedObject, async);
}
} else {
throw new RegionClearedException(
LocalizedStrings.DiskRegion_CLEAR_OPERATION_ABORTING_THE_ONGOING_ENTRY_0_OPERATION_FOR_ENTRY_WITH_DISKID_1
.toLocalizedString(new Object[] {
((doingCreate) ? "creation" : "modification"), id }));
}
} finally {
if (!async) {
releaseReadLock(dr);
}
}
} finally {
if (async) {
this.stats.endFlush(start);
} else {
dr.getStats().endWrite(start, this.stats.endWrite(start));
dr.getStats().incWrittenBytes(id.getValueLength());
}
}
}
/**
 * Persists a conflicting version tag (without a key/value) to the current
 * backup oplog, unless the on-disk region version vector already contains
 * it. Takes the disk-region read lock for synchronous calls only.
 */
final void putVersionTagOnly(LocalRegion region, VersionTag tag, boolean async) {
DiskRegion dr = region.getDiskRegion();
// this method will only be called by backup oplog
assert dr.isBackup();
if (!async) {
acquireReadLock(dr);
}
try {
if (dr.isRegionClosed()) {
region.getCancelCriterion().checkCancelInProgress(null);
throw new RegionDestroyedException(
LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
.toLocalizedString(), dr.getName());
}
if (dr.getRegionVersionVector().contains(tag.getMemberID(),
tag.getRegionVersion())) {
// No need to write the conflicting tag to disk if the disk RVV already
// contains this tag.
return;
}
PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
oplogSet.getChild().saveConflictVersionTag(region, tag, async);
} finally {
if (!async) {
releaseReadLock(dr);
}
}
}
/**
 * Returns the value of the key/value pair with the given diskId. Updates all
 * of the necessary {@linkplain DiskRegionStats statistics}.
 *
 * <p>Retries up to 3 times on IllegalArgumentException (observed when an
 * oplog is corrupted during HA), then rethrows the last failure.
 *
 * @return the recovered value, or Token.REMOVED_PHASE1 if a concurrent
 *         region clear removed the entry
 */
final Object get(DiskRegion dr, DiskId id) {
  acquireReadLock(dr);
  try {
    int count = 0;
    RuntimeException ex = null;
    while (count < 3) {
      // retry at most 3 times
      BytesAndBits bb = null;
      try {
        bb = getBytesAndBitsWithoutLock(dr, id, true/* fault -in */, false /*
             * Get
             * only
             * the
             * userbit
             */);
        if (bb == CLEAR_BB) {
          return Token.REMOVED_PHASE1;
        }
        return convertBytesAndBitsIntoObject(bb);
      } catch (IllegalArgumentException e) {
        count++;
        // FIX: bb is still null when getBytesAndBitsWithoutLock itself threw,
        // so guard the log message against a secondary NullPointerException.
        this.logger.info(LocalizedStrings.DEBUG,
            "DiskRegion: Tried " + count
                + ", getBytesAndBitsWithoutLock returns wrong byte array: "
                + (bb != null ? Arrays.toString(bb.getBytes()) : "null"));
        ex = e;
      }
    } // while
    this.logger
        .info(
            LocalizedStrings.DEBUG,
            "Retried 3 times, getting entry from DiskRegion still failed. It must be Oplog file corruption due to HA");
    throw ex;
  } finally {
    releaseReadLock(dr);
  }
}
// private static String baToString(byte[] ba) {
// StringBuffer sb = new StringBuffer();
// for (int i=0; i < ba.length; i++) {
// sb.append(ba[i]).append(", ");
// }
// return sb.toString();
// }
/**
 * This method was added to fix bug 40192. It behaves like getBytesAndBits
 * except that it returns Token.REMOVED_PHASE1 when the htree reference has
 * changed (meaning a clear was done).
 *
 * @return an instance of BytesAndBits or Token.REMOVED_PHASE1
 */
final Object getRaw(DiskRegionView dr, DiskId id) {
  final BytesAndBits result = dr.getDiskStore().getBytesAndBitsWithoutLock(
      dr, id, true /* fault -in */, false /* Get only the userbit */);
  return (result == CLEAR_BB) ? Token.REMOVED_PHASE1 : result;
}
/**
 * Given a BytesAndBits object convert it to the relevant Object (deserialize
 * if necessary) and return the object.
 *
 * @param bb the raw bytes plus user bits read from disk
 * @return the converted object (a Token for invalid/tombstone states)
 */
static Object convertBytesAndBitsIntoObject(BytesAndBits bb) {
  final byte[] bytes = bb.getBytes();
  final byte bits = bb.getBits();
  // The bit checks below mirror convertBytesAndBitsToSerializedForm and must
  // stay in this order.
  if (EntryBits.isInvalid(bits)) {
    return Token.INVALID;
  }
  if (EntryBits.isSerialized(bits)) {
    // true => deserialize into the live object form
    return DiskEntry.Helper.readSerializedValue(bytes, bb.getVersion(), null,
        true);
  }
  if (EntryBits.isLocalInvalid(bits)) {
    return Token.LOCAL_INVALID;
  }
  if (EntryBits.isTombstone(bits)) {
    return Token.TOMBSTONE;
  }
  return DiskEntry.Helper.readRawValue(bytes, bb.getVersion(), null);
}
/**
 * Given a BytesAndBits object get the serialized blob.
 *
 * @param bb the raw bytes plus user bits read from disk
 * @return the value in serialized (blob) form, or a Token for
 *         invalid/tombstone states
 */
static Object convertBytesAndBitsToSerializedForm(BytesAndBits bb) {
  final byte[] bytes = bb.getBytes();
  final byte bits = bb.getBits();
  // Same dispatch order as convertBytesAndBitsIntoObject; the only difference
  // is that serialized values are kept in blob form (last arg false).
  if (EntryBits.isInvalid(bits)) {
    return Token.INVALID;
  }
  if (EntryBits.isSerialized(bits)) {
    return DiskEntry.Helper.readSerializedValue(bytes, bb.getVersion(), null,
        false);
  }
  if (EntryBits.isLocalInvalid(bits)) {
    return Token.LOCAL_INVALID;
  }
  if (EntryBits.isTombstone(bits)) {
    return Token.TOMBSTONE;
  }
  return DiskEntry.Helper.readRawValue(bytes, bb.getVersion(), null);
}
// CLEAR_BB was added in reaction to bug 41306.
// Sentinel returned by getBytesAndBitsWithoutLock when a concurrent region
// clear is detected; callers compare against it by identity (==).
private final BytesAndBits CLEAR_BB = new BytesAndBits(null, (byte) 0);
/**
 * Gets the Object from the OpLog . It can be invoked from OpLog , if by the
 * time a get operation reaches the OpLog, the entry gets compacted or if we
 * allow concurrent put & get operations. It will also minimize the synch lock
 * on DiskId
 *
 * @param id
 *          DiskId object for the entry
 * @return value of the entry or CLEAR_BB if it is detected that the entry was
 *         removed by a concurrent region clear.
 * @throws RegionDestroyedException if the disk region is closed
 * @throws DiskAccessException if the oplog holding the value no longer exists
 *         and no clear is in progress
 */
final BytesAndBits getBytesAndBitsWithoutLock(DiskRegionView dr, DiskId id,
    boolean faultIn, boolean bitOnly) {
  if (dr.isRegionClosed()) {
    throw new RegionDestroyedException(
        LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
            .toLocalizedString(), dr.getName());
  }
  // First clear check: bail out early before touching the oplog.
  if (dr.didClearCountChange()) {
    return CLEAR_BB;
  }
  long oplogId = id.getOplogId();
  OplogSet oplogSet = getOplogSet(dr);
  CompactableOplog oplog = oplogSet.getChild(oplogId);
  if (oplog == null) {
    // Second clear check: a clear may have deleted the oplog between the
    // first check and the lookup; only then is a missing oplog expected.
    if (dr.didClearCountChange()) {
      return CLEAR_BB;
    }
    throw new DiskAccessException(
        LocalizedStrings.DiskRegion_DATA_FOR_DISKENTRY_HAVING_DISKID_AS_0_COULD_NOT_BE_OBTAINED_FROM_DISK_A_CLEAR_OPERATION_MAY_HAVE_DELETED_THE_OPLOGS
            .toLocalizedString(id), dr.getName());
  }
  return oplog.getBytesAndBits(dr, id, faultIn, bitOnly);
}
/**
 * Reads the value bytes and user bits for the given disk id while holding the
 * region read lock. Unlike getRaw, a concurrent clear is surfaced as a
 * DiskAccessException rather than a token.
 *
 * @param faultingIn true to fault the value into memory
 * @throws DiskAccessException if a concurrent region clear removed the entry
 */
final BytesAndBits getBytesAndBits(DiskRegion dr, DiskId id,
    boolean faultingIn) {
  acquireReadLock(dr);
  try {
    BytesAndBits bb = getBytesAndBitsWithoutLock(dr, id, faultingIn, false /*
         * Get
         * only
         * user
         * bit
         */);
    if (bb == CLEAR_BB) {
      throw new DiskAccessException(
          LocalizedStrings.DiskRegion_ENTRY_HAS_BEEN_CLEARED_AND_IS_NOT_PRESENT_ON_DISK
              .toLocalizedString(), dr.getName());
    }
    return bb;
  } finally {
    releaseReadLock(dr);
  }
}
/**
 * Returns only the user bits stored on disk for the given disk id; a
 * concurrent clear is reported as the INVALID bit.
 *
 * @since 3.2.1
 */
final byte getBits(DiskRegion dr, DiskId id) {
  acquireReadLock(dr);
  try {
    // TODO:Asif : Fault In?
    BytesAndBits bb = getBytesAndBitsWithoutLock(dr, id, true, true /*
         * Get
         * only
         * user
         * bit
         */);
    if (bb == CLEAR_BB) {
      // entry cleared concurrently: report it as invalid
      return EntryBits.setInvalid((byte) 0, true);
    }
    return bb.getBits();
  } finally {
    releaseReadLock(dr);
  }
}
/**
 * Asif: THIS SHOULD ONLY BE USED FOR TESTING PURPOSES AS IT IS NOT THREAD
 * SAFE
 *
 * Returns the object stored on disk with the given id. This method is used
 * for testing purposes only. As such, it bypasses the buffer and goes
 * directly to the disk. This is not a thread safe function , in the sense, it
 * is possible that by the time the OpLog is queried , data might move HTree
 * with the oplog being destroyed
 *
 * @return null if entry has nothing stored on disk (id == INVALID_ID)
 * @throws IllegalArgumentException
 *           If id is less than zero, no action is taken.
 */
public final Object getNoBuffer(DiskRegion dr, DiskId id) {
  BytesAndBits bb = null;
  acquireReadLock(dr);
  try {
    long opId = id.getOplogId();
    if (opId != -1) {
      // -1 means nothing has been written to disk for this id
      OplogSet oplogSet = getOplogSet(dr);
      bb = oplogSet.getChild(opId).getNoBuffer(dr, id);
      return convertBytesAndBitsIntoObject(bb);
    } else {
      return null;
    }
  } finally {
    releaseReadLock(dr);
  }
}
// Test hook: delegates to the overflow oplog set; closes all overflow
// file channels.
void testHookCloseAllOverflowChannels() {
  overflowOplogs.testHookCloseAllOverflowChannels();
}

// Test hook: returns the overflow oplog set's current list of oplogs.
ArrayList testHookGetAllOverflowOplogs() {
  return overflowOplogs.testHookGetAllOverflowOplogs();
}

// Test hook: delegates to the overflow oplog set; closes all overflow oplogs.
void testHookCloseAllOverflowOplogs() {
  overflowOplogs.testHookCloseAllOverflowOplogs();
}
/**
 * Removes the key/value pair with the given id on disk.
 *
 * @param async
 *          true if called by the async flusher thread
 * @param isClear
 *          true if this removal is part of a region clear
 *
 * @throws RegionClearedException
 *           If a clear operation completed before the put operation completed
 *           successfully, resulting in the put operation to abort.
 * @throws IllegalArgumentException
 *           If id is {@linkplain #INVALID_ID invalid} or is less than zero,
 *           no action is taken.
 * @throws RegionDestroyedException if the disk region has been closed
 */
final void remove(LocalRegion region, DiskEntry entry, boolean async,
    boolean isClear) throws RegionClearedException {
  DiskRegion dr = region.getDiskRegion();
  if (!async) {
    acquireReadLock(dr);
  }
  try {
    if (dr.isRegionClosed()) {
      throw new RegionDestroyedException(
          LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
              .toLocalizedString(), dr.getName());
    }
    // mbid: if reference has changed (only clear
    // can change the reference) then we should not try to remove again.
    // Entry will not be found in diskRegion.
    // So if reference has changed, do nothing.
    if (!dr.didClearCountChange()) {
      long start = this.stats.startRemove();
      OplogSet oplogSet = getOplogSet(dr);
      oplogSet.remove(region, entry, async, isClear);
      dr.getStats().endRemove(start, this.stats.endRemove(start));
    } else {
      throw new RegionClearedException(
          LocalizedStrings.DiskRegion_CLEAR_OPERATION_ABORTING_THE_ONGOING_ENTRY_DESTRUCTION_OPERATION_FOR_ENTRY_WITH_DISKID_0
              .toLocalizedString(entry.getDiskId()));
    }
  } finally {
    if (!async) {
      releaseReadLock(dr);
    }
  }
}
// Outstanding pauser queued by pauseFlusherForTesting; null when not paused.
private FlushPauser fp = null;

/**
 * After tests call this method they must call flushForTesting.
 */
public void pauseFlusherForTesting() {
  assert this.fp == null;
  this.fp = new FlushPauser();
  try {
    // force the pauser onto the queue; the flusher will block in its doFlush
    addAsyncItem(this.fp, true);
  } catch (InterruptedException ex) {
    Thread.currentThread().interrupt();
    throw new IllegalStateException("unexpected interrupt in test code", ex);
  }
}

/**
 * Releases a pauser installed by pauseFlusherForTesting (if any) and then
 * forces a full synchronous flush.
 */
public void flushForTesting() {
  if (this.fp != null) {
    this.fp.unpause();
    this.fp = null;
  }
  forceFlush();
}
// //////////////////// Implementation Methods //////////////////////
/**
 * This function is having a default visiblity as it is used in the
 * OplogJUnitTest for a bug verification of Bug # 35012
 *
 * All callers must have {@link #releaseWriteLock(DiskRegion)} in a matching
 * finally block.
 *
 * Note that this is no longer implemented by getting a write lock but instead
 * locks the same lock that acquireReadLock does.
 *
 * @since 5.1
 */
private void acquireWriteLock(DiskRegion dr) {
  // @todo darrel: this is no longer a write lock need to change method name
  dr.acquireWriteLock();
}

/**
 * Counterpart of {@link #acquireWriteLock(DiskRegion)}.
 *
 * This function is having a default visiblity as it is used in the
 * OplogJUnitTest for a bug verification of Bug # 35012
 *
 * @since 5.1
 */
private void releaseWriteLock(DiskRegion dr) {
  // @todo darrel: this is no longer a write lock need to change method name
  dr.releaseWriteLock();
}
/**
 * All callers must have {@link #releaseReadLock(DiskRegion)} in a matching
 * finally block. Note that this is no longer implemented by getting a read
 * lock but instead locks the same lock that acquireWriteLock does.
 *
 * @throws RegionDestroyedException if the region is closed; the read lock is
 *         released before throwing
 * @since 5.1
 */
void acquireReadLock(DiskRegion dr) {
  dr.basicAcquireReadLock();
  synchronized (this.closeRegionGuard) {
    // Track in-progress entry ops under closeRegionGuard so region close can
    // wait for them to drain (see releaseReadLock).
    entryOpsCount.incrementAndGet();
    if (dr.isRegionClosed()) {
      // NOTE(review): this releases via dr.releaseReadLock() (not
      // basicReleaseReadLock) and does not decrement entryOpsCount before
      // throwing — confirm against DiskRegion whether that is intentional.
      dr.releaseReadLock();
      throw new RegionDestroyedException(
          "The DiskRegion has been closed or destroyed", dr.getName());
    }
  }
}
/**
 * Releases the per-region entry lock and, when the last in-progress entry
 * operation finishes on a closed region, notifies the thread waiting on
 * closeRegionGuard.
 *
 * @since 5.1
 */
void releaseReadLock(DiskRegion dr) {
  dr.basicReleaseReadLock();
  int currentOpsInProgress = entryOpsCount.decrementAndGet();
  // Potential candiate for notifying in case of disconnect
  if (currentOpsInProgress == 0) {
    synchronized (this.closeRegionGuard) {
      // re-check under the guard: another op may have started meanwhile
      if (dr.isRegionClosed() && entryOpsCount.get() == 0) {
        this.closeRegionGuard.notify();
      }
    }
  }
}
// Forces the persistent oplog set to roll to a new child oplog.
public void forceRoll() {
  persistentOplogs.forceRoll(null);
}

// Forces the current child oplog (if any) to roll; blocks until the roll
// completes when blocking is true.
public void forceRoll(boolean blocking) {
  Oplog child = persistentOplogs.getChild();
  if (child != null) {
    child.forceRolling(null, blocking);
  }
}
/**
 * Forces an oplog roll for the given disk region. No-op for non-backup
 * (overflow-only) regions. For async regions with no item/time trigger a
 * flush is forced first so queued writes land before the roll.
 *
 * @since 5.1
 */
public void forceRolling(DiskRegion dr) {
  if (!dr.isBackup())
    return;
  if (!dr.isSync() && this.maxAsyncItems == 0 && getTimeInterval() == 0) {
    forceFlush();
  }
  acquireReadLock(dr);
  try {
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    oplogSet.forceRoll(dr);
  } finally {
    releaseReadLock(dr);
  }
}
// Forces a compaction across the whole disk store.
public boolean forceCompaction() {
  return basicForceCompaction(null);
}

// Forces a compaction for a single backup region; returns false for
// overflow-only regions.
public boolean forceCompaction(DiskRegion dr) {
  if (!dr.isBackup())
    return false;
  acquireReadLock(dr);
  try {
    return basicForceCompaction(dr);
  } finally {
    releaseReadLock(dr);
  }
}
/**
 * Get serialized form of data off the disk
 *
 * @param id disk id of the entry to read
 * @since gemfire5.7_hotfix
 */
public Object getSerializedData(DiskRegion dr, DiskId id) {
  return convertBytesAndBitsToSerializedForm(getBytesAndBits(dr, id, true));
}

/**
 * Same as {@link #getSerializedData} but without taking the region read
 * lock; callers are responsible for any required synchronization.
 */
public Object getSerializedDataWithoutLock(DiskRegionView dr, DiskId id,
    boolean faultIn) {
  return convertBytesAndBitsToSerializedForm(getBytesAndBitsWithoutLock(dr,
      id, faultIn, false));
}
/**
 * Fails fast when the async flusher thread has already terminated: during a
 * close this surfaces as a CacheClosedException (bug 41305), otherwise as a
 * DiskAccessException. No-op while the flusher is alive.
 */
private void checkForFlusherThreadTermination() {
  if (!this.flusherThreadTerminated) {
    return;
  }
  final String message = "Could not schedule asynchronous write because the flusher thread had been terminated.";
  if (this.isClosing()) {
    // for bug 41305
    throw this.cache.getCacheClosedException(message, null);
  }
  throw new DiskAccessException(message, this);
}
/**
 * Handles an item that could not be queued because the async queue is full
 * by performing the write synchronously on the calling thread (prevents
 * deadlock with the flusher).
 */
private void handleFullAsyncQueue(Object o) {
  AsyncDiskEntry ade = (AsyncDiskEntry) o;
  LocalRegion region = ade.region;
  try {
    VersionTag tag = ade.tag;
    if (ade.versionOnly) {
      // version-only items carry no DiskEntry; just flush the tag
      if (tag != null) {
        DiskEntry.Helper.doAsyncFlush(tag, region);
      }
    } else {
      DiskEntry entry = ade.de;
      DiskEntry.Helper.handleFullAsyncQueue(entry, region, tag);
    }
  } catch (RegionDestroyedException ex) {
    // Normally we flush before closing or destroying a region
    // but in some cases it is closed w/o flushing.
    // So just ignore it; see bug 41305.
  }
}
/**
 * Queues the given region on the async queue so the flusher will persist its
 * RVV state (LocalRegion items are handled specially by FlusherThread).
 * Best-effort: an interrupted enqueue is tolerated.
 */
public void addDiskRegionToQueue(LocalRegion lr) {
  try {
    addAsyncItem(lr, true);
  } catch (InterruptedException ignore) {
    // Restore the interrupt status (matches the rest of this class) instead
    // of swallowing it entirely. The enqueue failure itself is tolerable:
    // it means the RVVTrusted is not written, and the region will
    // automatically do full-GII.
    Thread.currentThread().interrupt();
  }
}
/**
 * Adds an item (AsyncDiskEntry, LocalRegion, or FlushNotifier) to the async
 * queue. With forceAsync the put blocks until space is available; otherwise
 * a full queue causes a synchronous write on the caller's thread. Wakes the
 * flusher when the item limit is reached.
 *
 * @throws InterruptedException if blocked while force-putting
 * @throws RegionDestroyedException if the item's region was destroyed
 */
private void addAsyncItem(Object item, boolean forceAsync)
    throws InterruptedException {
  synchronized (this.lock) { // fix for bug 41390
    // 43312: since this thread has gained dsi.lock, dsi.clear() should have
    // finished. We check if clear() has happened after ARM.putEntryIfAbsent()
    if (item instanceof AsyncDiskEntry) {
      AsyncDiskEntry ade = (AsyncDiskEntry) item;
      DiskRegion dr = ade.region.getDiskRegion();
      if (dr.didClearCountChange() && !ade.versionOnly) {
        // a clear raced this op; the entry no longer needs to be written
        return;
      }
      if (ade.region.isDestroyed) {
        throw new RegionDestroyedException(ade.region.toString(), ade.region.getFullPath());
      }
    }
    checkForFlusherThreadTermination();
    if (forceAsync) {
      this.asyncQueue.forcePut(item);
    } else {
      if (!this.asyncQueue.offer(item)) {
        // queue is full so do a sync write to prevent deadlock
        handleFullAsyncQueue(item);
        // return early since we didn't add it to the queue
        return;
      }
    }
    this.stats.incQueueSize(1);
  }
  // this.logger.info(LocalizedStrings.DEBUG, "DEBUG addAsyncItem=" + item);
  if (this.maxAsyncItems > 0) {
    // wake the flusher once enough items have accumulated
    if (checkAsyncItemLimit()) {
      synchronized (this.asyncMonitor) {
        this.asyncMonitor.notifyAll();
      }
    }
  }
}
/**
 * Removes a previously queued item from the async queue, keeping the queue
 * size statistic in step.
 */
private void rmAsyncItem(Object item) {
  final boolean removed = this.asyncQueue.remove(item);
  if (removed) {
    this.stats.incQueueSize(-1);
  }
}
/**
 * Marks the start of an async write: rejects the write if the flusher is
 * stopping (CancelException when the store is closed, DiskAccessException
 * otherwise — typically no space left on device), then bumps the pending
 * enqueue count and the write statistics.
 *
 * @return the start timestamp to pass to endAsyncWrite
 */
private long startAsyncWrite(DiskRegion dr) {
  if (this.stoppingFlusher) {
    if (isClosed()) {
      throw (new Stopper()).generateCancelledException(null); // fix for bug
      // 41141
    } else {
      throw new DiskAccessException(
          "The disk store is still open, but flusher is stopped, probably no space left on device",
          this);
    }
  } else {
    this.pendingAsyncEnqueue.incrementAndGet();
  }
  // logger.info(LocalizedStrings.DEBUG, "DEBUG startAsyncWrite");
  dr.getStats().startWrite();
  return this.stats.startWrite();
}
/**
 * Marks the end of an async write started by startAsyncWrite: drops the
 * pending enqueue count and records write statistics (bytes written only for
 * items that carry an actual DiskEntry).
 */
private void endAsyncWrite(AsyncDiskEntry ade, DiskRegion dr, long start) {
  this.pendingAsyncEnqueue.decrementAndGet();
  dr.getStats().endWrite(start, this.stats.endWrite(start));
  if (ade.versionOnly) {
    // for versionOnly = true ade.de is null and no value bytes were written
    return;
  }
  final long valueBytes = ade.de.getDiskId().getValueLength();
  dr.getStats().incWrittenBytes(valueBytes);
}
/**
 * Schedules an async disk entry on the flusher's queue, wrapping the enqueue
 * with start/end write accounting. An interrupted enqueue clears the entry's
 * pending-async bit so it is not considered queued.
 *
 * @since prPersistSprint1
 */
public void scheduleAsyncWrite(AsyncDiskEntry ade) {
  DiskRegion dr = ade.region.getDiskRegion();
  long start = startAsyncWrite(dr);
  try {
    try {
      addAsyncItem(ade, false);
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
      ade.region.getCancelCriterion().checkCancelInProgress(ie);
      // @todo: I'm not sure we need an error here
      if (!ade.versionOnly)
        ade.de.getDiskId().setPendingAsync(false);
    }
  } finally {
    endAsyncWrite(ade, dr, start);
  }
}
/**
 * Marks the given disk id as no longer pending an async write. The item is
 * left in the async buffer; currently we just wait for the flusher to
 * discover the cleared bit and drop it.
 *
 * @since prPersistSprint1
 */
public void unscheduleAsyncWrite(DiskId did) {
  if (did == null) {
    return;
  }
  did.setPendingAsync(false);
}
/**
 * This queue can contain DiskEntry or FlushNotifier items (and LocalRegions
 * queued via addDiskRegionToQueue).
 */
private final ForceableLinkedBlockingQueue asyncQueue;
// Guards drainList: fillDrainList and clearDrainList synchronize on it.
private final Object drainSync = new Object();
// Items most recently drained from asyncQueue by the flusher.
private ArrayList drainList = null;

// Drains the async queue into a fresh drainList; returns how many items
// were drained.
private int fillDrainList() {
  synchronized (this.drainSync) {
    this.drainList = new ArrayList(asyncQueue.size());
    int drainCount = asyncQueue.drainTo(this.drainList);
    return drainCount;
  }
}

// Returns the flusher's current drain list (deliberately unsynchronized;
// see clearDrainList for how concurrent clears are handled).
private ArrayList getDrainList() {
  return this.drainList;
}
/**
 * To fix bug 41770 clear the list in a way that will not break a concurrent
 * iterator that is not synced on drainSync. Only clear from it entries on the
 * given region. Currently we do this by clearing the isPendingAsync bit on
 * each entry in this list.
 *
 * @param rvv when non-null, only entries whose version is contained in this
 *        RVV are cleared (see shouldClear)
 */
void clearDrainList(LocalRegion r, RegionVersionVector rvv) {
  synchronized (this.drainSync) {
    if (this.drainList == null)
      return;
    Iterator it = this.drainList.iterator();
    while (it.hasNext()) {
      Object o = it.next();
      if (o instanceof AsyncDiskEntry) {
        AsyncDiskEntry ade = (AsyncDiskEntry) o;
        // note: items are not removed from the list, only unmarked, so a
        // concurrent iterator stays valid
        if (shouldClear(r, rvv, ade) && ade.de != null) {
          unsetPendingAsync(ade);
        }
      }
    }
  }
}
/**
 * Decides whether a queued async item belongs to the region being cleared
 * and, when an RVV is supplied, whether its version falls inside that RVV.
 */
private boolean shouldClear(LocalRegion r, RegionVersionVector rvv,
    AsyncDiskEntry ade) {
  if (ade.region != r) {
    return false;
  }
  if (rvv == null) {
    // No RVV: remove all of the async items for this region.
    return true;
  }
  // Clearing based on an RVV: only remove entries contained in the RVV.
  if (ade.versionOnly) {
    return rvv.contains(ade.tag.getMemberID(), ade.tag.getRegionVersion());
  }
  final VersionStamp stamp = ade.de.getVersionStamp();
  VersionSource member = stamp.getMemberID();
  if (member == null) {
    // For overflow only regions, the version member may be null because that
    // represents the local internal distributed member.
    member = r.getVersionMember();
  }
  return rvv.contains(member, stamp.getRegionVersion());
}
/**
 * Clear the pending async bit on a disk entry.
 */
private void unsetPendingAsync(AsyncDiskEntry ade) {
  final DiskId did = ade.de.getDiskId();
  if (did == null || !did.isPendingAsync()) {
    return;
  }
  synchronized (did) {
    did.setPendingAsync(false);
  }
}
// The async flusher thread; created by startAsyncFlusher.
private Thread flusherThread;
/**
 * How many threads are waiting to do a put on asyncQueue?
 */
private final AtomicInteger pendingAsyncEnqueue = new AtomicInteger();
// Set when shutdown begins: new writes are rejected (see startAsyncWrite).
private volatile boolean stoppingFlusher;
// Tells the flusher loop to exit.
private volatile boolean stopFlusher;
// Set by the flusher thread itself on exit.
private volatile boolean flusherThreadTerminated;
/**
 * Creates and starts the daemon flusher thread that drains the async queue
 * (see FlusherThread).
 */
private void startAsyncFlusher() {
  final String thName = LocalizedStrings.DiskRegion_ASYNCHRONOUS_DISK_WRITER_0
      .toLocalizedString(new Object[] { getName() });
  this.flusherThread = new Thread(LogWriterImpl.createThreadGroup(
      LocalizedStrings.DiskRegion_DISK_WRITERS.toLocalizedString(),
      getCache().getDistributedSystem().getLogWriter()
          .convertToLogWriterI18n()), new FlusherThread(), thName);
  this.flusherThread.setDaemon(true);
  this.flusherThread.start();
}
/**
 * Stops the async flusher: rejects new enqueues, drains pending ones, signals
 * the flusher loop to exit, and joins the flusher thread until it reports
 * termination.
 */
protected void stopAsyncFlusher() {
  this.stoppingFlusher = true;
  do {
    // Need to keep looping as long as we have more threads
    // that are already pending a put on the asyncQueue.
    // New threads will fail because stoppingFlusher has been set.
    // See bug 41141.
    forceFlush();
  } while (this.pendingAsyncEnqueue.get() > 0);
  synchronized (asyncMonitor) {
    this.stopFlusher = true;
    this.asyncMonitor.notifyAll();
  }
  // join in short slices so an interrupt is noticed and re-asserted while we
  // keep waiting for the flusher to finish
  while (!this.flusherThreadTerminated) {
    try {
      this.flusherThread.join(100);
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
      getCache().getCancelCriterion().checkCancelInProgress(ie);
    }
  }
}
/**
 * Test helper: waits up to waitMs for the flusher thread to finish.
 *
 * @return true if the join completed without interruption, false if the
 *         calling thread was interrupted (interrupt status is restored)
 */
public boolean testWaitForAsyncFlusherThread(int waitMs) {
  try {
    this.flusherThread.join(waitMs);
  } catch (InterruptedException ie) {
    Thread.currentThread().interrupt();
    return false;
  }
  return true;
}
/**
 * force a flush but do it async (don't wait for the flush to complete).
 */
public void asynchForceFlush() {
  try {
    flushFlusher(true);
  } catch (InterruptedException ignore) {
    // Restore the interrupt status rather than discarding it, matching
    // forceFlush(); the flush request itself remains best-effort.
    Thread.currentThread().interrupt();
  }
}
// Returns the cache this disk store belongs to.
public GemFireCacheImpl getCache() {
  return this.cache;
}

// Public flush API: synonym for forceFlush().
public void flush() {
  forceFlush();
}
/**
 * Flush all async queue data, and fsync all oplogs to disk.
 * Holds the compactor write lock so the oplog list is stable while syncing.
 */
public final void flushAndSync() {
  forceFlush();
  acquireCompactorWriteLock();
  try {
    for (Oplog oplog : getPersistentOplogSet(null).getAllOplogs()) {
      oplog.flushAllAndSync();
    }
  } finally {
    releaseCompactorWriteLock();
  }
}
/**
 * Variant of {@link #flushAndSync()} that does not take the compactor write
 * lock.
 *
 * NOTE(review): the noCompactorLock parameter is never consulted — this
 * overload always skips the lock and always passes true to flushAllAndSync.
 * Confirm whether that is intentional.
 */
public final void flushAndSync(boolean noCompactorLock) {
  forceFlush();
  for (Oplog oplog : getPersistentOplogSet(null).getAllOplogs()) {
    oplog.flushAllAndSync(true);
  }
}
/**
 * Synchronously flushes the async queue: queues a FlushNotifier and waits
 * for the flusher to process it. An interrupt is re-asserted and translated
 * via the cancel criterion.
 */
public void forceFlush() {
  try {
    flushFlusher(false);
  } catch (InterruptedException ie) {
    Thread.currentThread().interrupt();
    getCache().getCancelCriterion().checkCancelInProgress(ie);
  }
}
/**
 * Returns true when the flusher has stopped, has been asked to stop, was
 * never started, or its thread is no longer alive.
 */
private boolean isFlusherTerminated() {
  if (this.stopFlusher || this.flusherThreadTerminated) {
    return true;
  }
  final Thread t = this.flusherThread;
  return t == null || !t.isAlive();
}
/**
 * Queues a FlushNotifier and (unless async) waits for the flusher to reach
 * it. If the flusher terminated after the notifier was queued, the notifier
 * is removed again instead of waiting.
 *
 * @param async true to return without waiting for the flush to complete
 */
private void flushFlusher(boolean async) throws InterruptedException {
  if (!isFlusherTerminated()) {
    FlushNotifier fn = new FlushNotifier();
    addAsyncItem(fn, true);
    // re-check: the flusher may have died while we were enqueueing
    if (isFlusherTerminated()) {
      rmAsyncItem(fn);
    } else {
      incForceFlush();
      if (!async) {
        fn.waitForFlush();
      }
    }
  }
}
/**
 * Registers a force-flush request and wakes the flusher. The increment is
 * done inside the monitor so the flusher cannot miss it (bug 41654).
 */
private void incForceFlush() {
  synchronized (this.asyncMonitor) {
    this.forceFlushCount.incrementAndGet(); // moved inside sync to fix bug
    // 41654
    this.asyncMonitor.notifyAll();
  }
}
/**
 * Return true if a non-zero value is found and the decrement was done.
 * Always returns true once the flusher has been told to stop.
 */
private boolean checkAndClearForceFlush() {
  if (stopFlusher) {
    return true;
  }
  for (;;) {
    final int pending = this.forceFlushCount.get();
    if (pending <= 0) {
      return false;
    }
    if (this.forceFlushCount.compareAndSet(pending, 0)) {
      return true;
    }
    // CAS lost a race with a concurrent incForceFlush; re-read and retry.
  }
}
/**
 * Test-only notifier that, when the flusher dequeues it, makes the flusher
 * block inside doFlush until unpause() is called (see
 * pauseFlusherForTesting / flushForTesting).
 */
private class FlushPauser extends FlushNotifier {
  @Override
  public synchronized void doFlush() {
    // this is called by flusher thread so have it wait
    try {
      super.waitForFlush();
    } catch (InterruptedException ignore) {
      Thread.currentThread().interrupt();
    }
  }

  // Releases the flusher thread blocked in doFlush above.
  public synchronized void unpause() {
    super.doFlush();
  }

  @Override
  protected boolean isStoppingFlusher() {
    // also unblock when the disk store starts shutting the flusher down
    return stoppingFlusher;
  }
}
/**
 * Marker item placed on the async queue so a caller can wait until the
 * flusher has drained everything queued ahead of it; the flusher invokes
 * doFlush() when it reaches the notifier.
 */
private class FlushNotifier {
  // set once the flusher has processed this notifier
  private boolean flushed;

  protected boolean isStoppingFlusher() {
    return false;
  }

  public synchronized void waitForFlush() throws InterruptedException {
    // poll with a timeout so flusher termination/stop is noticed even if
    // doFlush is never called
    while (!flushed && !isFlusherTerminated() && !isStoppingFlusher()) {
      wait(333);
    }
  }

  public synchronized void doFlush() {
    this.flushed = true;
    notifyAll();
  }
}
/**
 * Return true if we have enough async items to do a flush
 * (only meaningful when maxAsyncItems &gt; 0; callers check that).
 */
private boolean checkAsyncItemLimit() {
  return this.asyncQueue.size() >= this.maxAsyncItems;
}
/**
 * The async flusher: repeatedly waits until a flush is due (item limit,
 * time interval, or an explicit force-flush), drains the async queue, and
 * writes the drained items to the oplogs. A fatal DiskAccessException from
 * the loop is forwarded to handleDiskAccessException on exit.
 */
private class FlusherThread implements Runnable {
  /**
   * Blocks until a flush should run. Returns false when the flusher has been
   * told to stop. The wait strategy depends on configuration:
   * item-limit and/or time-interval triggered, or force-flush only.
   */
  private boolean waitUntilFlushIsReady() throws InterruptedException {
    if (maxAsyncItems > 0) {
      final long time = getTimeInterval();
      synchronized (asyncMonitor) {
        if (time > 0) {
          // wake on force-flush, item limit, or interval expiry
          long nanosRemaining = TimeUnit.MILLISECONDS.toNanos(time);
          final long endTime = System.nanoTime() + nanosRemaining;
          boolean done = checkAndClearForceFlush() || checkAsyncItemLimit();
          while (!done && nanosRemaining > 0) {
            TimeUnit.NANOSECONDS.timedWait(asyncMonitor, nanosRemaining);
            done = checkAndClearForceFlush() || checkAsyncItemLimit();
            if (!done) {
              nanosRemaining = endTime - System.nanoTime();
            }
          }
        } else {
          // no interval: wake on force-flush or item limit only
          boolean done = checkAndClearForceFlush() || checkAsyncItemLimit();
          while (!done) {
            asyncMonitor.wait();
            done = checkAndClearForceFlush() || checkAsyncItemLimit();
          }
        }
      }
    } else {
      long time = getTimeInterval();
      if (time > 0) {
        // interval only: wake on force-flush or interval expiry
        long nanosRemaining = TimeUnit.MILLISECONDS.toNanos(time);
        final long endTime = System.nanoTime() + nanosRemaining;
        synchronized (asyncMonitor) {
          boolean done = checkAndClearForceFlush();
          while (!done && nanosRemaining > 0) {
            TimeUnit.NANOSECONDS.timedWait(asyncMonitor, nanosRemaining);
            done = checkAndClearForceFlush();
            if (!done) {
              nanosRemaining = endTime - System.nanoTime();
            }
          }
        }
      } else {
        // wait for a forceFlush
        synchronized (asyncMonitor) {
          boolean done = checkAndClearForceFlush();
          while (!done) {
            asyncMonitor.wait();
            done = checkAndClearForceFlush();
          }
        }
      }
    }
    return !stopFlusher;
  }

  // Flushes the current child oplog's buffered writes.
  private void flushChild() {
    persistentOplogs.flushChild();
  }

  public void run() {
    DiskAccessException fatalDae = null;
    if (logger.fineEnabled()) {
      logger.fine("Async writer thread started");
    }
    boolean doingFlush = false;
    try {
      while (waitUntilFlushIsReady()) {
        int drainCount = fillDrainList();
        if (drainCount > 0) {
          stats.incQueueSize(-drainCount);
          Iterator it = getDrainList().iterator();
          while (it.hasNext()) {
            Object o = it.next();
            if (o instanceof FlushNotifier) {
              // a waiter wants everything before this point on disk
              flushChild();
              if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
                if (!it.hasNext()) {
                  doingFlush = false;
                  CacheObserverHolder.getInstance().afterWritingBytes();
                }
              }
              ((FlushNotifier) o).doFlush();
            } else {
              try {
                if (o!=null && o instanceof LocalRegion) {
                  // region queued by addDiskRegionToQueue: persist its RVVs
                  LocalRegion lr = (LocalRegion)o;
                  lr.getDiskRegion().writeRVV(null, true);
                  lr.getDiskRegion().writeRVVGC(lr);
                } else {
                  AsyncDiskEntry ade = (AsyncDiskEntry) o;
                  LocalRegion region = ade.region;
                  VersionTag tag = ade.tag;
                  if (ade.versionOnly) {
                    DiskEntry.Helper.doAsyncFlush(tag, region);
                  } else {
                    DiskEntry entry = ade.de;
                    // We check isPendingAsync
                    if (entry.getDiskId().isPendingAsync()) {
                      if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
                        if (!doingFlush) {
                          doingFlush = true;
                          CacheObserverHolder.getInstance().goingToFlush();
                        }
                      }
                      DiskEntry.Helper.doAsyncFlush(entry, region, tag);
                    } else {
                      // If it is no longer pending someone called
                      // unscheduleAsyncWrite
                      // so we don't need to write the entry, but
                      // if we have a version tag we need to record the
                      // operation
                      // to update the RVV
                      if (tag != null) {
                        DiskEntry.Helper.doAsyncFlush(tag, region);
                      }
                    }
                  }
                } // else
              } catch (RegionDestroyedException ex) {
                // Normally we flush before closing or destroying a region
                // but in some cases it is closed w/o flushing.
                // So just ignore it; see bug 41305.
              }
            }
          }
          flushChild();
          if (doingFlush) {
            doingFlush = false;
            if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
              CacheObserverHolder.getInstance().afterWritingBytes();
            }
          }
        }
      }
    } catch (InterruptedException ie) {
      flushChild();
      Thread.currentThread().interrupt();
      getCache().getCancelCriterion().checkCancelInProgress(ie);
      throw new IllegalStateException(
          "Async writer thread stopping due to unexpected interrupt");
    } catch (DiskAccessException dae) {
      // a ClosedByInterruptException cause during shutdown is expected
      boolean okToIgnore = dae.getCause() instanceof ClosedByInterruptException;
      if (!okToIgnore || !stopFlusher) {
        fatalDae = dae;
      }
    } catch (CancelException ignore) {
      // the above checkCancelInProgress will throw a CancelException
      // when we are being shutdown
    } catch(Throwable t) {
      logger.severe(LocalizedStrings.DiskStoreImpl_FATAL_ERROR_ON_FLUSH, t);
      fatalDae = new DiskAccessException(LocalizedStrings.DiskStoreImpl_FATAL_ERROR_ON_FLUSH.toLocalizedString(), t, DiskStoreImpl.this);
    } finally {
      if (logger.fineEnabled()) {
        logger.fine("Async writer thread stopped. Pending opcount="
            + asyncQueue.size());
      }
      flusherThreadTerminated = true;
      stopFlusher = true; // set this before calling handleDiskAccessException
      // or it will hang
      if (fatalDae != null) {
        handleDiskAccessException(fatalDae, true);
      }
    }
  }
}
// simple code
/** Extension of the oplog lock file * */
private static final String LOCK_FILE_EXT = ".lk";
// Exclusive OS-level lock held on lockFile while this store is open.
private FileLock fl;
// The "DRLK_IF<name>.lk" file created by createLockFile.
private File lockFile;
/**
 * Creates and exclusively locks this disk store's lock file
 * ("DRLK_IF&lt;name&gt;.lk") in the info file directory, so no other process
 * can open the same disk store concurrently.
 *
 * @throws DiskAccessException if the file could not be created or locked
 *         after all retries
 */
private void createLockFile(String name) throws DiskAccessException {
  File f = new File(getInfoFileDir().getDir(), "DRLK_IF" + name
      + LOCK_FILE_EXT);
  if (logger.fineEnabled()) {
    logger.fine("Creating lock file " + f.getAbsolutePath()/*, new RuntimeException("STACK")*/);
  }
  FileOutputStream fs = null;
  // 41734: A known NFS issue on Redhat. The thread created the directory,
  // but when it tries to lock, it can fail with permission denied or
  // input/output error. To work around it, retry up to 100 times with a
  // 50ms pause between attempts.
  int cnt = 0;
  DiskAccessException dae = null;
  do {
    try {
      fs = new FileOutputStream(f);
      this.lockFile = f;
      this.fl = fs.getChannel().tryLock();
      if (fl == null) {
        // another process already holds the lock; surface as IOException so
        // the retry/translation logic below applies
        try {
          fs.close();
        } catch (IOException ignore) {
        }
        throw new IOException(
            LocalizedStrings.Oplog_THE_FILE_0_IS_BEING_USED_BY_ANOTHER_PROCESS
                .toLocalizedString(f));
      }
      f.deleteOnExit();
      dae = null;
      break;
    } catch (IOException ex) {
      if (fs != null) {
        try {
          fs.close();
        } catch (IOException ignore) {
        }
      }
      dae = new DiskAccessException(
          LocalizedStrings.Oplog_COULD_NOT_LOCK_0.toLocalizedString(f
              .getPath()), ex, this);
    } catch (IllegalStateException ex2) {
      // OverlappingFileLockException (an IllegalStateException) needs to be
      // caught here; see bug 41290
      if (fs != null) {
        try {
          fs.close();
        } catch (IOException ignore) {
        }
      }
      dae = new DiskAccessException(
          LocalizedStrings.Oplog_COULD_NOT_LOCK_0.toLocalizedString(f
              .getPath()), ex2, this);
    }
    cnt++;
    try {
      Thread.sleep(50);
    } catch (InterruptedException e) {
      // FIX: restore the interrupt status instead of printing a stack trace
      // and discarding it; the retry loop still terminates via the counter.
      Thread.currentThread().interrupt();
    }
  } while (cnt < 100);
  if (dae != null) {
    throw dae;
  }
  // logged to help debug 41734
  logger.info(LocalizedStrings.DEBUG, "Locked disk store " + name
      + " for exclusive access in directory: " + getInfoFileDir().getDir());
}
/**
 * Releases the exclusive file lock taken by createLockFile and deletes the
 * lock file. Safe to call when no lock is held.
 */
void closeLockFile() {
  FileLock myfl = this.fl;
  if (myfl != null) {
    try {
      FileChannel fc = myfl.channel();
      if (myfl.isValid()) {
        myfl.release();
      }
      fc.close();
    } catch (IOException ignore) {
    }
    this.fl = null;
  }
  File f = this.lockFile;
  if (f != null) {
    if (f.delete()) {
      if (logger.fineEnabled()) {
        logger.fine("Deleted lock file " + f);
      }
    } else if (f.exists()) {
      if (logger.fineEnabled()) {
        logger.fine("Could not delete lock file " + f);
      }
    }
  }
  // logged to help debug 41734
  logger.info(LocalizedStrings.DEBUG, "Unlocked disk store " + name);
}
/**
 * Returns a printable name for the product version recovered from disk;
 * files with no recorded version are reported as "GFE pre-7.0".
 */
private String getRecoveredGFVersionName() {
  final Version version = getRecoveredGFVersion();
  if (version == null) {
    return "GFE pre-7.0";
  }
  return version.toString();
}
/**
 * Searches the given disk dirs for the files and creates the Oplog objects
 * wrapping those files. Also creates the lock file and the init file, and
 * rejects stores that need (or already had) a version conversion.
 *
 * @param needsOplogs passed through to the persistent oplog set when
 *          creating oplogs from the discovered files
 */
private void loadFiles(boolean needsOplogs) {
  String partialFileName = getName();
  boolean foundIfFile = false;
  {
    // Figure out what directory the init file is in (if we even have one).
    // Also detect multiple if files and fail (see bug 41883).
    int ifDirIdx = 0;
    int idx = 0;
    String ifName = "BACKUP" + name + DiskInitFile.IF_FILE_EXT;
    for (DirectoryHolder dh : this.directories) {
      File f = new File(dh.getDir(), ifName);
      if (f.exists()) {
        if (foundIfFile) {
          throw new IllegalStateException(
              "Detected multiple disk store initialization files named \""
                  + ifName
                  + "\". This disk store directories must only contain one initialization file.");
        } else {
          foundIfFile = true;
          ifDirIdx = idx;
        }
      }
      idx++;
    }
    this.infoFileDirIndex = ifDirIdx;
  }
  // get a high level lock file first; if we can't get this then
  // this disk store is already open by someone else
  createLockFile(partialFileName);
  boolean finished = false;
  try {
    Map persistentBackupFiles = persistentOplogs
        .findFiles(partialFileName);
    {
      boolean backupFilesExist = !persistentBackupFiles.isEmpty();
      boolean ifRequired = backupFilesExist || isOffline();
      // If this offline disk-store is used by the data extractor tool, we
      // still need the cache around to process other diskstores, so we must
      // not close the cache on an IllegalStateException caused by a
      // corrupted IF file.
      this.initFile = new DiskInitFile(partialFileName, this, ifRequired,
          persistentBackupFiles.keySet());
      if (this.upgradeVersionOnly) {
        // upgrade-only mode: fail if the store is already at (or beyond)
        // the current product version
        if (Version.CURRENT.compareTo(getRecoveredGFVersion()) <= 0 && !dataExtraction) {
          if (getCache() != null) {
            getCache().close();
          }
          throw new IllegalStateException(
              LocalizedStrings.DiskStoreAlreadyInVersion_0
                  .toLocalizedString(getRecoveredGFVersionName()));
        }
      } else {
        if (Version.GFE_70.compareTo(getRecoveredGFVersion()) > 0) {
          // TODO: In each new version, need to modify the highest version
          // that needs conversion.
          if (getCache() != null && !dataExtraction) {
            getCache().close();
          }
          throw new IllegalStateException(
              LocalizedStrings.DiskStoreStillAtVersion_0
                  .toLocalizedString(getRecoveredGFVersionName()));
        }
      }
    }
    {
      // overflow data is worthless after a restart, so remove all of it
      FilenameFilter overflowFileFilter = new DiskStoreFilter(OplogType.OVERFLOW, true,
          partialFileName);
      for (DirectoryHolder dh : this.directories) {
        File dir = dh.getDir();
        // delete all overflow files
        File[] files = FileUtil.listFiles(dir, overflowFileFilter);
        for (File file : files) {
          boolean deleted = file.delete();
          if (!deleted && file.exists()) {
            if (logger.fineEnabled()) {
              logger.fine("Could not delete file " + file);
            }
          }
        }
      }
    }
    persistentOplogs.createOplogs(needsOplogs, persistentBackupFiles);
    finished = true;
    // Log a message with the disk store id, indicating whether we recovered
    // or created this disk store.
    if (foundIfFile) {
      logger.info(
          LocalizedStrings.DiskStoreImpl_RecoveredDiskStore_0_With_Id_1,
          new Object[] { getName(), getDiskStoreID() });
    } else {
      logger.info(
          LocalizedStrings.DiskStoreImpl_CreatedDiskStore_0_With_Id_1,
          new Object[] { getName(), getDiskStoreID() });
    }
  } finally {
    // on any failure release the lock file and close the init file so the
    // store can be opened again later
    if (!finished) {
      closeLockFile();
      if (getDiskInitFile() != null) {
        getDiskInitFile().close();
      }
    }
  }
}
/**
 * Closes this disk store's statistics and every directory holder. The disk
 * stats are kept at PR level, so for a bucket region the stats object is
 * not closed elsewhere; this is the store-wide teardown.
 */
private void statsClose() {
  this.stats.close();
  if (this.directories != null) {
    for (DirectoryHolder holder : this.directories) {
      holder.close();
    }
  }
}
/** Runs region recovery unless the oplogs have already been recovered once. */
void initializeIfNeeded(boolean initialRecovery) {
  if (persistentOplogs.alreadyRecoveredOnce.get()) {
    return; // recovery already happened; nothing to do
  }
  recoverRegionsThatAreReady(initialRecovery);
}
/** Performs the first-time recovery pass for this disk store. */
void doInitialRecovery() {
  initializeIfNeeded(true);
}
/**
 * Reads the oplog files and loads their contents into every region that is
 * ready to be recovered; delegates to the persistent oplog set.
 */
public final void recoverRegionsThatAreReady(boolean initialRecovery) {
  persistentOplogs.recoverRegionsThatAreReady(initialRecovery);
}
/** Schedules index recovery over the given oplogs, at most once per store. */
void scheduleIndexRecovery(Set allOplogs) {
  // only ever schedule index recovery once
  if (!markIndexRecoveryScheduled()) {
    return;
  }
  executeDiskStoreTask(new IndexRecoveryTask(allOplogs));
}
/**
 * Registers the recovered stores for async value recovery and schedules the
 * recovery task on the disk store task pool.
 */
void scheduleValueRecovery(Set oplogsNeedingValueRecovery,
    Map recoveredStores) {
  final ValueRecoveryTask recoveryTask = new ValueRecoveryTask(
      oplogsNeedingValueRecovery, recoveredStores);
  synchronized (currentAsyncValueRecoveryMap) {
    this.currentAsyncValueRecoveryMap.putAll(recoveredStores);
  }
  executeDiskStoreTask(recoveryTask);
}
/**
 * get the directory which has the info file
 *
 * @return directory holder which has the info file
 */
DirectoryHolder getInfoFileDir() {
  return this.directories[this.infoFileDirIndex];
}
/** For Testing * */
// void addToOplogSet(int oplogID, File opFile, DirectoryHolder dirHolder) {
// Oplog oplog = new Oplog(oplogID, this);
// oplog.addRecoveredFile(opFile, dirHolder);
// // @todo check callers to see if they need drf support
// this.oplogSet.add(oplog);
// }
/** For Testing * */
/**
 * Returns the size of the biggest directory available to the region.
 */
public long getMaxDirSize() {
  return this.maxDirSize;
}
/**
 * @return boolean indicating whether automatic disk region compaction is on
 */
boolean isCompactionEnabled() {
  return getAutoCompact();
}

/** Returns the configured compaction threshold. */
public int getCompactionThreshold() {
  return this.compactionThreshold;
}
// true when this store's configuration allows compaction at all; final, so
// assigned once at construction (NOTE(review): the assignment is outside
// this chunk — confirm in the constructor).
private final boolean isCompactionPossible;

final boolean isCompactionPossible() {
  return this.isCompactionPossible;
}
/**
 * Schedules a compaction run when auto-compaction is enabled and we are not
 * doing an offline compaction.
 */
void scheduleCompaction() {
  if (!isCompactionEnabled() || isOfflineCompacting()) {
    return;
  }
  this.oplogCompactor.scheduleIfNeeded(getOplogToBeCompacted());
}
/**
 * All the oplogs except the current one are destroyed.
 *
 * @param rvv
 *          if not null, clear the region using a version vector. Clearing
 *          with a version vector only removes entries less than the version
 *          vector, which allows for a consistent clear across members.
 */
private void basicClear(LocalRegion region, DiskRegion dr,
    RegionVersionVector rvv) {
  if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
    CacheObserverHolder.getInstance().beforeDiskClear();
  }
  if (region != null) {
    clearAsyncQueue(region, false, rvv);
    // To fix bug 41770 we need to wait for the async flusher thread to
    // finish any work it is currently doing, since it might be operating on
    // this region. Calling forceFlush here could wait forever because we
    // hold the writelock, which prevents the async flush from finishing.
    // Bug 41770 is caused by the regionMap.clear at the end of this method:
    // any entry mod for this region that the async flusher still holds a
    // ref to would be written as a create, and then written again as a
    // create because the first one is not in the actual region map.
    clearDrainList(region, rvv);
  }
  if (rvv == null) {
    // if we have an RVV, the stats are updated by AbstractRegionMap.clear
    // removing each entry.
    dr.statsClear(region);
  }
  if (dr.isBackup()) {
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    oplogSet.clear(dr, rvv);
  } else if (rvv == null) {
    // For an RVV based clear on an overflow region, freeing entries is
    // handled in AbstractRegionMap.clear
    dr.freeAllEntriesOnDisk(region);
  }
}
/**
 * Removes anything found in the async queue for the given region,
 * optionally restricted by the supplied version vector.
 *
 * @param rvv passed to shouldClear to decide which entries to remove
 */
private void clearAsyncQueue(LocalRegion region, boolean needsWriteLock,
    RegionVersionVector rvv) {
  DiskRegion dr = region.getDiskRegion();
  if (needsWriteLock) {
    acquireWriteLock(dr);
  }
  try {
    // While holding the write lock, drop queued items for this region.
    for (Object item : this.asyncQueue) {
      if (item instanceof AsyncDiskEntry
          && shouldClear(region, rvv, (AsyncDiskEntry) item)) {
        rmAsyncItem(item);
      }
    }
  } finally {
    if (needsWriteLock) {
      releaseWriteLock(dr);
    }
  }
}
/**
 * Obtained and held by clear/destroyRegion/close. Also obtained when adding
 * to the async queue, serializing those operations against each other.
 */
private final Object lock = new Object();
/**
 * Clears the region's on-disk state. Lock order: compactor write lock, then
 * the region's sizeGuard (taken first to avoid the deadlock from bug
 * #46133), then this.lock, then the disk region's write lock.
 *
 * @param rvv
 *          if not null, clear the region using the version vector; in that
 *          case the in-memory entries are cleared after all locks are
 *          released (see the tail of this method).
 */
void clear(LocalRegion region, DiskRegion dr, RegionVersionVector rvv) {
  acquireCompactorWriteLock();
  // get lock on sizeGuard first to avoid deadlock that occurred in bug #46133
  final ReentrantLock regionLock = region != null ? region.getSizeGuard()
      : null;
  if (regionLock != null) {
    regionLock.lock();
  }
  try {
    synchronized (this.lock) {
      acquireWriteLock(dr);
      try {
        if (dr.isRegionClosed()) {
          throw new RegionDestroyedException(
              LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
                  .toLocalizedString(), dr.getName());
        }
        basicClear(region, dr, rvv);
        if (rvv == null && region != null) {
          // If we have no RVV, clear the region under lock
          region.txClearRegion();
          region.clearEntries(null);
          dr.incClearCount();
        }
      } finally {
        releaseWriteLock(dr);
      }
    }
  } finally {
    if (regionLock != null) {
      regionLock.unlock();
    }
    releaseCompactorWriteLock();
  }
  if (rvv != null && region != null) {
    // If we have an RVV, we need to clear the region
    // without holding a lock.
    region.txClearRegion();
    region.clearEntries(rvv);
    // Note, do not increment the clear count in this case.
  }
}
/** Releases the compactor write lock. */
private void releaseCompactorWriteLock() {
  compactorWriteLock.unlock();
}

/** Acquires the compactor write lock; blocks until it is available. */
private void acquireCompactorWriteLock() {
  compactorWriteLock.lock();
}

/** Releases the compactor read lock. */
public void releaseCompactorReadLock() {
  compactorReadLock.unlock();
}

/** Acquires the compactor read lock; blocks until it is available. */
public void acquireCompactorReadLock() {
  compactorReadLock.lock();
}
// set when close() begins; checked by compaction/flush paths so they can
// bail out early. NOTE(review): not volatile — other threads may observe a
// stale value; confirm whether that is acceptable here.
private boolean closing = false;
// set in close()'s finally block once shutdown has completed
private boolean closed = false;

boolean isClosing() {
  return this.closing;
}

boolean isClosed() {
  return this.closed;
}
/** Closes this disk store without destroying its data. */
void close() {
  close(false);
}
/**
 * Closes this disk store: stops the compactor and async flusher, closes
 * overflow and persistent oplogs (or destroys them when {@code destroy} is
 * true and no live regions remain), closes stats, and releases the lock
 * file. The FIRST RuntimeException raised by any step is remembered and
 * rethrown after cleanup; later failures are intentionally dropped so the
 * original cause is not masked.
 *
 * BUG FIX: the secondary catch blocks previously used
 * {@code if (rte != null) rte = e;}, which swallowed an exception when no
 * earlier one had occurred and overwrote the first exception when one had.
 * They now use {@code rte == null} to preserve the first failure.
 *
 * @param destroy when true, destroy all oplogs instead of closing them
 */
void close(boolean destroy) {
  this.closing = true;
  RuntimeException rte = null;
  try {
    // at this point all regions should already be closed
    try {
      closeCompactor(false);
    } catch (RuntimeException e) {
      rte = e;
    }
    if (!isOffline()) {
      try {
        // do this before write lock
        stopAsyncFlusher();
      } catch (RuntimeException e) {
        if (rte == null) { // keep the first failure
          rte = e;
        }
      }
    }
    // Wakeup any threads waiting for the async disk store recovery.
    synchronized (currentAsyncValueRecoveryMap) {
      currentAsyncValueRecoveryMap.notifyAll();
    }
    // don't block the shutdown hook
    if (Thread.currentThread() != InternalDistributedSystem.shutdownHook) {
      waitForBackgroundTasks();
    }
    try {
      overflowOplogs.closeOverflow();
    } catch (RuntimeException e) {
      if (rte == null) {
        rte = e;
      }
    }
    if ((!destroy && getDiskInitFile().hasLiveRegions()) || isValidating()) {
      RuntimeException exception = persistentOplogs.close();
      if (exception != null && rte == null) {
        rte = exception;
      }
      getDiskInitFile().close();
    } else {
      try {
        destroyAllOplogs();
      } catch (RuntimeException e) {
        if (rte == null) {
          rte = e;
        }
      }
      getDiskInitFile().close();
    }
    try {
      statsClose();
    } catch (RuntimeException e) {
      if (rte == null) {
        rte = e;
      }
    }
    closeLockFile();
    if (rte != null) {
      throw rte;
    }
    stopDiskStoreTaskPool();
  } finally {
    this.closed = true;
  }
}
/**
 * Returns true if krf files may still be created. The compactor might have
 * been stopped by a cache close; in that case no krf should be written.
 */
boolean allowKrfCreation() {
  final OplogCompactor compactor = this.oplogCompactor;
  return compactor == null || compactor.keepCompactorRunning();
}
/**
 * Stops the oplog compactor, if one exists. When {@code isPrepare} is true
 * the compactor write lock is held across the stop to fence out concurrent
 * clear/close operations.
 */
void closeCompactor(boolean isPrepare) {
  if (this.oplogCompactor == null) {
    return;
  }
  if (isPrepare) {
    acquireCompactorWriteLock();
  }
  try {
    synchronized (this.lock) {
      try {
        // to fix bug 40473 don't wait for the compactor to complete.
        this.oplogCompactor.stopCompactor();
      } catch (CancelException ignore) {
        // To fix bug 39380, ignore the cache-closed exception here so the
        // caller can still close the oplogs. stopCompactor is not expected
        // to throw this, but don't take the chance.
      } catch (RuntimeException e) {
        if (logger.warningEnabled()) {
          logger
              .warning(
                  LocalizedStrings.DiskRegion_COMPLEXDISKREGION_CLOSE_EXCEPTION_IN_STOPPING_COMPACTOR,
                  e);
        }
        throw e;
      }
    }
  } finally {
    if (isPrepare) {
      releaseCompactorWriteLock();
    }
  }
}
/**
 * Closes the given disk region's on-disk state. Backup regions close their
 * init-file entry (unless closeDataOnly) and their oplogs; overflow-only
 * regions drain any queued writes, free their disk entries, and are removed
 * from the overflow map.
 */
private void basicClose(LocalRegion region, DiskRegion dr, boolean closeDataOnly) {
  if (dr.isBackup()) {
    if (region != null) {
      region.closeEntries();
    }
    if (!closeDataOnly) {
      getDiskInitFile().closeRegion(dr);
    }
    // call close(dr) on each oplog
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    oplogSet.basicClose(dr);
  } else {
    if (region != null) {
      // OVERFLOW ONLY
      clearAsyncQueue(region, true, null); // no need to try to write these to
                                           // disk any longer
      dr.freeAllEntriesOnDisk(region);
      region.closeEntries();
      this.overflowMap.remove(dr);
    }
  }
}
/**
 * Called before LocalRegion clears the contents of its entries map. For a
 * backup region any pending async ops on it must reach disk first; the
 * easiest way to guarantee that is to flush the entire async queue.
 */
void prepareForClose(LocalRegion region, DiskRegion dr) {
  if (!dr.isBackup()) {
    return; // overflow-only regions have nothing pending to persist
  }
  forceFlush();
}
/** Flushes pending writes and quiesces oplogs and compactor ahead of a close. */
public void prepareForClose() {
  forceFlush();
  persistentOplogs.prepareForClose();
  closeCompactor(true);
}
/** Deregisters this store from the resource manager and from the cache. */
private void onClose() {
  final InternalResourceManager resourceManager = this.cache.getResourceManager(false);
  if (resourceManager != null) {
    resourceManager.removeResourceListener(ResourceType.HEAP_MEMORY, this);
  }
  this.cache.removeDiskStore(this);
}
/**
 * Closes the given disk region, flushing pending async writes and waiting
 * for in-flight entry operations first. If this store is owned by regions
 * and this was the last owner, the whole disk store is closed afterwards
 * (outside the compactor write lock, per bug 44538).
 *
 * @param closeDataOnly when true, only close the data files: the region
 *          stays registered and is not marked closed
 */
void close(LocalRegion region, DiskRegion dr, boolean closeDataOnly) {
  if (logger.fineEnabled()) {
    logger
        .fine("DiskRegion::close:Attempting to close DiskRegion. Region name ="
            + dr.getName());
  }
  boolean closeDiskStore = false;
  acquireCompactorWriteLock();
  // Fix for 46284 - we must obtain the size guard lock before getting the
  // disk store lock
  final ReentrantLock regionLock = region != null ? region.getSizeGuard()
      : null;
  if (regionLock != null) {
    regionLock.lock();
  }
  try {
    synchronized (this.lock) {
      // Fix 45104, wait here for addAsyncItem to finish adding into queue.
      // prepareForClose() must be outside synchronized (this.lock) to avoid
      // deadlock.
      if (dr.isRegionClosed()) {
        return;
      }
    }
    prepareForClose(region, dr);
    synchronized (this.lock) {
      boolean gotLock = false;
      try {
        acquireWriteLock(dr);
        if (!closeDataOnly) {
          dr.setRegionClosed(true);
        }
        gotLock = true;
      } catch (CancelException e) {
        // cache is closing: mark the region closed anyway, then wait for
        // outstanding entry operations to drain before proceeding
        synchronized (this.closeRegionGuard) {
          if (!dr.isRegionClosed()) {
            if (!closeDataOnly) {
              dr.setRegionClosed(true);
            }
            // Any acquireReadLock thread holding the lock is bound to see
            // isRegionClosed as true and release it (dropping the count to
            // zero) before releasing closeRegionGuard — but wait anyway,
            // to be safe.
            while (this.entryOpsCount.get() > 0) {
              try {
                this.closeRegionGuard.wait(20000);
              } catch (InterruptedException ie) {
                // Exit without closing the region; we do not know what else
                // can be done
                Thread.currentThread().interrupt();
                dr.setRegionClosed(false);
                return;
              }
            }
          } else {
            return;
          }
        }
      }
      try {
        if (logger.fineEnabled()) {
          logger
              .fine("DiskRegion::close:Before invoking basic Close. Region name ="
                  + dr.getName());
        }
        basicClose(region, dr, closeDataOnly);
      } finally {
        if (gotLock) {
          releaseWriteLock(dr);
        }
      }
    }
    if (getOwnedByRegion() && !closeDataOnly) {
      // last owning region gone -> close the disk store itself
      if (this.ownCount.decrementAndGet() <= 0) {
        closeDiskStore = true;
      }
    }
  } finally {
    if (regionLock != null) {
      regionLock.unlock();
    }
    releaseCompactorWriteLock();
  }
  // Fix for 44538 - close the disk store without holding
  // the compactor write lock.
  if (closeDiskStore) {
    onClose();
    close();
  }
}
/**
 * First phase of region destruction: for backup (persistent) regions the
 * pending destroy is recorded in the init file. Overflow-only regions need
 * no on-disk bookkeeping here.
 *
 * @param dr the disk region being destroyed
 */
void beginDestroyRegion(LocalRegion region, DiskRegion dr) {
  if (!dr.isBackup()) {
    return;
  }
  getDiskInitFile().beginDestroyRegion(dr);
}
// count of in-flight background disk tasks (compaction, krf creation, ...);
// waitForBackgroundTasks() blocks until this drains to zero
private final AtomicInteger backgroundTasks = new AtomicInteger();

// Records the start of a background disk task; pairs with decBackgroundTasks().
int incBackgroundTasks() {
  getCache().getCachePerfStats().incDiskTasksWaiting();
  int v = this.backgroundTasks.incrementAndGet();
  return v;
}
// Records completion of a background disk task; wakes any threads blocked in
// waitForBackgroundTasks() when the count reaches zero.
void decBackgroundTasks() {
  int v = this.backgroundTasks.decrementAndGet();
  if (v == 0) {
    synchronized (this.backgroundTasks) {
      this.backgroundTasks.notifyAll();
    }
  }
  getCache().getCachePerfStats().decDiskTasksWaiting();
}
/**
 * Blocks until all background disk tasks have completed. Returns
 * immediately when called from a background task thread itself (fixes bug
 * 42775). Preserves the caller's interrupt status.
 */
public void waitForBackgroundTasks() {
  if (isBackgroundTaskThread()) {
    return; // fixes bug 42775
  }
  if (this.backgroundTasks.get() > 0) {
    boolean interrupted = Thread.interrupted();
    try {
      synchronized (this.backgroundTasks) {
        // standard guarded wait loop; decBackgroundTasks() notifies on
        // zero, and the 500ms timeout guards against missed notifications
        while (this.backgroundTasks.get() > 0) {
          try {
            this.backgroundTasks.wait(500L);
          } catch (InterruptedException ex) {
            interrupted = true;
          }
        }
      }
    } finally {
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
  }
}
/**
 * Synchronously compacts whatever is currently compactable: first forces a
 * roll of the active oplog if it qualifies, then schedules all compactable
 * oplogs and waits for the run to complete.
 *
 * @return true if a compaction run was performed; false when nothing was
 *         compactable or the run could not be scheduled
 */
boolean basicForceCompaction(DiskRegion dr) {
  PersistentOplogSet oplogSet = persistentOplogs;
  // see if the current active oplog is compactable; if so, roll it so it
  // becomes eligible
  {
    Oplog active = oplogSet.getChild();
    if (active != null) {
      if (active.hadLiveEntries() && active.needsCompaction()) {
        active.forceRolling(dr, false);
      }
    }
  }
  // Compact the oplogs
  CompactableOplog[] oplogs = getOplogsToBeCompacted(true/* fixes 41143 */);
  // schedule a compaction if at this point there are oplogs to be compacted
  if (oplogs != null) {
    if (this.oplogCompactor != null) {
      if (this.oplogCompactor.scheduleIfNeeded(oplogs)) {
        this.oplogCompactor.waitForRunToComplete();
      } else {
        oplogs = null;
        // @todo darrel: still need to schedule oplogs and wait for them to
        // compact.
      }
    }
  }
  return oplogs != null;
}
/**
 * Destroys the given region's on-disk artifacts. Backup regions delegate to
 * the persistent oplog set; overflow-only regions simply free their entries
 * on disk.
 */
private void basicDestroy(LocalRegion region, DiskRegion dr) {
  if (dr.isBackup()) {
    if (region != null) {
      region.closeEntries();
    }
    getPersistentOplogSet(dr).basicDestroy(dr);
  } else {
    dr.freeAllEntriesOnDisk(region);
    if (region != null) {
      region.closeEntries();
    }
  }
}
/**
 * Destroys all oplogs belonging to this disk store, including overflow and
 * backup files that logically belong to it even if they were never in use.
 */
private void destroyAllOplogs() {
  persistentOplogs.destroyAllOplogs();
  // Also remove files owned by this DiskStore that were never opened by it.
  deleteFiles(new DiskStoreFilter(OplogType.OVERFLOW, true, getName()));
  deleteFiles(new DiskStoreFilter(OplogType.BACKUP, true, getName()));
}
/**
 * Deletes, from every disk-store directory, each file accepted by the given
 * filter. Failed deletions are logged at fine level and otherwise ignored.
 */
private void deleteFiles(FilenameFilter filter) {
  for (DirectoryHolder holder : this.directories) {
    final File[] matches = FileUtil.listFiles(holder.getDir(), filter);
    for (File candidate : matches) {
      final boolean removed = candidate.delete();
      if (!removed && candidate.exists() && logger.fineEnabled()) {
        logger.fine("Could not delete file " + candidate);
      }
    }
  }
}
/**
 * Destroys this disk store and all of its data.
 *
 * @throws IllegalStateException if any region (persistent or overflow)
 *           still uses this store
 */
public void destroy() {
  Set inUse = new TreeSet();
  for (AbstractDiskRegion region : getDiskRegions()) {
    inUse.add(region.getName());
  }
  for (AbstractDiskRegion region : overflowMap) {
    inUse.add(region.getName());
  }
  if (!inUse.isEmpty()) {
    throw new IllegalStateException("Disk store is currently in use by these regions " + inUse);
  }
  close(true);
  getDiskInitFile().destroy();
  onClose();
}
/**
 * gets the available oplogs to be compacted from the LinkedHashMap,
 * applying the per-run cap (unlike the test hook).
 *
 * @return the array of oplogs to be compacted if present, else null
 */
CompactableOplog[] getOplogToBeCompacted() {
  return getOplogsToBeCompacted(false);
}
/**
 * Test hook: the number of oplogs currently eligible for compaction.
 */
public int numCompactableOplogs() {
  final CompactableOplog[] candidates = getOplogsToBeCompacted(true);
  return candidates == null ? 0 : candidates.length;
}
/**
 * Collects the oplogs eligible for compaction — persistent first, then
 * overflow.
 *
 * NOTE(review): the extracted source had a raw {@code ArrayList} (generic
 * parameters stripped), which makes {@code toArray(new CompactableOplog[0])}
 * untypable; restored as {@code ArrayList<CompactableOplog>}.
 *
 * @param all when true, ignore the MAX_OPLOGS_PER_COMPACTION cap
 * @return the eligible oplogs, or null when there are none
 */
private CompactableOplog[] getOplogsToBeCompacted(boolean all) {
  // cap the number of oplogs per compaction run unless 'all' was requested
  int max = Integer.MAX_VALUE;
  if (!all && max > MAX_OPLOGS_PER_COMPACTION
      && MAX_OPLOGS_PER_COMPACTION > 0) {
    max = MAX_OPLOGS_PER_COMPACTION;
  }
  ArrayList<CompactableOplog> l = new ArrayList<CompactableOplog>();
  persistentOplogs.getCompactableOplogs(l, max);
  // Note this always puts overflow oplogs on the end of the list.
  // They may get starved.
  overflowOplogs.getCompactableOplogs(l, max);
  if (l.isEmpty()) {
    return null;
  }
  return l.toArray(new CompactableOplog[0]);
}
/**
 * Returns the dir name used to back up this DiskStore's directories under:
 * the store name (or the default name) joined with the disk store id.
 */
public String getBackupDirName() {
  String dirName = getName();
  if (dirName == null) {
    dirName = GemFireCacheImpl.DEFAULT_DS_NAME;
  }
  return dirName + "_" + getDiskStoreID().toString();
}
/**
* Filters and returns the current set of oplogs that aren't already in the
* baseline for incremental backup
*
* @param baselineInspector
* the inspector for the previous backup.
* @param baselineCopyMap
* this will be populated with baseline oplogs Files that will be
* used in the restore script.
* @return an map of Oplogs to be copied for an incremental backup. The map is from
* the oplog to the set of files that still need to be backed up for that oplog
* @throws IOException
*/
private Map> filterBaselineOplogs(BackupInspector baselineInspector,
Map baselineCopyMap) throws IOException {
File baselineDir = new File(baselineInspector.getBackupDir(),
BackupManager.DATA_STORES);
baselineDir = new File(baselineDir, getBackupDirName());
// Find all of the member's diskstore oplogs in the member's baseline
// diskstore directory structure (*.crf,*.krf,*.drf)
List baselineOplogFiles = FileUtil.findAll(baselineDir,
".*\\.(idx)?[kdc]rf$");
// Our list of oplogs to copy (those not already in the baseline)
Map> oplogList = new LinkedHashMap>();
// Total list of member oplogs
Map> allOplogs = getAllOplogsForBackup();
/*
* Loop through operation logs and see if they are already part of the
* baseline backup.
*/
for (Map.Entry> entry: allOplogs.entrySet()) {
Oplog log = entry.getKey();
Set filesNeedingBackup = entry.getValue();
// See if they are backed up in the current baseline
Map oplogMap = log.mapBaseline(baselineOplogFiles, filesNeedingBackup);
// No? Then see if they were backed up in previous baselines
if (!filesNeedingBackup.isEmpty() && baselineInspector.isIncremental()) {
Set matchingOplogs = log
.gatherMatchingOplogFiles(baselineInspector
.getIncrementalOplogFileNames(), filesNeedingBackup);
if (!matchingOplogs.isEmpty()) {
for (String matchingOplog : matchingOplogs) {
oplogMap.put(
new File(baselineInspector
.getCopyFromForOplogFile(matchingOplog)), new File(
baselineInspector.getCopyToForOplogFile(matchingOplog)));
}
}
}
if (!filesNeedingBackup.isEmpty()) {
/*
* These are fresh operation log files so lets back them up.
*/
oplogList.put(log, filesNeedingBackup);
}
/*
* These have been backed up before so lets just add their entries from
* the previous backup or restore script into the current one.
*/
baselineCopyMap.putAll(oplogMap);
}
return oplogList;
}
/**
* Get all of the oplogs
*/
private Map> getAllOplogsForBackup() {
Oplog[] oplogs = persistentOplogs.getAllOplogs();
Map> results = new LinkedHashMap>();
for(Oplog oplog: oplogs) {
results.put(oplog, oplog.getAllFiles());
}
return results;
}
// @todo perhaps a better thing for the tests would be to give them a listener
// hook that notifies them every time an oplog is created.
/**
 * Waits, before an async disk task (e.g. compaction, krf creation) runs,
 * for basic system initialization to complete.
 *
 * @return true if {@link #endAsyncDiskTask()} should be invoked afterwards;
 *         false when offline, when no system callbacks are installed, or
 *         when the store started closing while waiting
 */
final boolean waitBeforeAsyncDiskTask() {
  if (isOffline()) {
    return false;
  }
  final GemFireCacheImpl.StaticSystemCallbacks sysCb = GemFireCacheImpl
      .getInternalProductCallbacks();
  if (sysCb == null) {
    return false;
  }
  final long waitMillis = 500L;
  while (!sysCb.waitBeforeAsyncDiskTask(waitMillis, this)) {
    if (isClosing()) {
      // break early if disk store is closing
      return false;
    }
  }
  if (logger != null && logger.fineEnabled()) {
    logger.fine("Proceeding after waiting for basic "
        + "system initialization to complete");
  }
  return true;
}
/**
 * Signals the system callbacks that an async disk task has finished.
 * A no-op for offline stores or when no callbacks are installed.
 */
final void endAsyncDiskTask() {
  if (isOffline()) {
    return;
  }
  final GemFireCacheImpl.StaticSystemCallbacks callbacks = GemFireCacheImpl
      .getInternalProductCallbacks();
  if (callbacks != null) {
    callbacks.endAsyncDiskTask(this);
  }
}
// @todo perhaps a better thing for the tests would be to give them a listener
// hook that notifies them every time an oplog is created.
/**
 * Used by tests to confirm stat size. NOTE(review): appears to accumulate
 * the size of oplogs not yet deleted — confirm against its updaters in Oplog.
 */
final AtomicLong undeletedOplogSize = new AtomicLong();
/**
* Compacts oplogs
*
* @author Mitul Bid
* @author Asif
* @since 5.1
*
*/
class OplogCompactor implements Runnable {
/** boolean for the thread to continue compaction* */
private volatile boolean compactorEnabled;
private volatile boolean scheduled;
private CompactableOplog[] scheduledOplogs;
/**
* used to keep track of the Thread currently invoking run on this compactor
*/
private volatile Thread me;
// private LogWriterI18n logger = null;
// Boolean which decides if the compactor can terminate early i.e midway
// between compaction.
// If this boolean is true ,( default is false), then the compactor thread
// if entered the
// compaction phase will exit only after it has compacted the oplogs & also
// deleted the compacted
// oplogs
private final boolean compactionCompletionRequired;
OplogCompactor() {
this.compactionCompletionRequired = sysProps.getBoolean(
COMPLETE_COMPACTION_BEFORE_TERMINATION_PROPERTY_BASE_NAME, false);
}
/**
* Creates a new compactor and starts a new thread
*
* private OplogCompactor() { logger =
* DiskRegion.this.owner.getCache().getLogger(); }
*/
/** Creates a new thread and starts the thread* */
private void startCompactor() {
this.compactorEnabled = true;
}
/**
* Stops the thread from compaction and the compactor thread joins with the
* calling thread
*/
private void stopCompactor() {
synchronized(this) {
if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
CacheObserverHolder.getInstance().beforeStoppingCompactor();
}
this.compactorEnabled = false;
if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
CacheObserverHolder.getInstance().afterSignallingCompactor();
}
}
if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
CacheObserverHolder.getInstance().afterStoppingCompactor();
}
}
/**
* @return true if compaction done; false if it was not
*/
private synchronized boolean scheduleIfNeeded(CompactableOplog[] opLogs) {
if (!this.scheduled) {
return schedule(opLogs);
} else {
return false;
}
}
/**
* @return true if compaction done; false if it was not
*/
private synchronized boolean schedule(CompactableOplog[] opLogs) {
assert !this.scheduled;
if (!this.compactorEnabled)
return false;
if (opLogs != null) {
for (int i = 0; i < opLogs.length; i++) {
// logger.info(LocalizedStrings.DEBUG,
// "schedule oplog#" + opLogs[i].getOplogId(),
// new RuntimeException("STACK"));
opLogs[i].prepareForCompact();
}
this.scheduled = true;
this.scheduledOplogs = opLogs;
boolean result = executeDiskStoreTask(this);
if (!result) {
reschedule(false, new CompactableOplog[0]);
return false;
} else {
return true;
}
} else {
return false;
}
}
/**
* A non-backup just needs values that are written to one of the oplogs
* being compacted that are still alive (have not been deleted or modified
* in a future oplog) to be copied forward to the current active oplog
*/
private boolean compact() {
int totalCount = 0;
long compactionStart = 0;
long start = 0;
// wait for basic GemFireXD initialization to complete first
boolean signalEnd = waitBeforeAsyncDiskTask();
try {
// return if diskstore is closing
if (DiskStoreImpl.this.isClosing()) {
// pretend success; higher level will check for this anyway
// and end at some point
return true;
}
CompactableOplog[] oplogs = this.scheduledOplogs;
// continue if nothing to be compacted
if (oplogs.length == 0) {
return true;
}
compactionStart = getStats().startCompaction();
start = NanoTimer.getTime();
// logger.info(LocalizedStrings.DEBUG, "DEBUG keepCompactorRunning="
// + keepCompactorRunning());
for (int i = 0; i < oplogs.length && keepCompactorRunning() /*
* @todo &&
* !owner.
* isDestroyed
*/; i++) {
int compacted = oplogs[i].compact(this);
totalCount += compacted;
if (DiskStoreImpl.this.testoplogcompact != null) {
if (compacted > 0 && (oplogs[i] == DiskStoreImpl.this.testoplogcompact)) {
DiskStoreImpl.this.testOplogCompacted = true;
}
}
}
// TODO:Asif : DiskRegion: How do we tackle
} finally {
if (compactionStart != 0) {
getStats().endCompaction(compactionStart);
}
if (signalEnd) {
endAsyncDiskTask();
}
}
long endTime = NanoTimer.getTime();
logger.info(LocalizedStrings.DiskRegion_COMPACTION_SUMMARY, new Object[] {
totalCount, ((endTime - start) / 1000000) });
return true /* @todo !owner.isDestroyed */;
}
private boolean isClosing() {
if (getCache().isClosed()) {
return true;
}
CancelCriterion stopper = getCache().getCancelCriterion();
if (stopper.cancelInProgress() != null) {
return true;
}
return false;
}
/**
* Just do compaction and then check to see if another needs to be done and
* if so schedule it. Asif:The compactor thread checks for an oplog in the
* LinkedHasMap in a synchronization on the oplogIdToOplog object. This will
* ensure that an addition of an Oplog to the Map does not get missed.
* Notifications need not be sent if the thread is already compaction
*/
public void run() {
getCache().getCachePerfStats().decDiskTasksWaiting();
if (!this.scheduled)
return;
boolean compactedSuccessfully = false;
try {
SystemFailure.checkFailure();
if (isClosing()) {
return;
}
if (!this.compactorEnabled)
return;
final CompactableOplog[] oplogs = this.scheduledOplogs;
this.me = Thread.currentThread();
try {
// set our thread's name
String tName = "OplogCompactor " + getName() + " for oplog "
+ oplogs[0].toString();
Thread.currentThread().setName(tName);
StringBuilder buffer = new StringBuilder();
for (int j = 0; j < oplogs.length; ++j) {
buffer.append(oplogs[j].toString());
if (j + 1 < oplogs.length) {
buffer.append(", ");
}
}
String ids = buffer.toString();
logger.info(LocalizedStrings.DiskRegion_COMPACTION_OPLOGIDS,
new Object[] { getName(), ids });
if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
CacheObserverHolder.getInstance().beforeGoingToCompact();
}
compactedSuccessfully = compact();
if (compactedSuccessfully) {
// logger.info(LocalizedStrings.DiskRegion_COMPACTION_SUCCESS,
// new Object[] {getName(), ids});
if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
CacheObserverHolder.getInstance().afterHavingCompacted();
}
} else {
logger.warning(LocalizedStrings.DiskRegion_COMPACTION_FAILURE,
new Object[] { getName(), ids });
}
} catch (DiskAccessException dae) {
handleDiskAccessException(dae, true);
throw dae;
} catch (KillCompactorException ex) {
if (logger.fineEnabled()) {
logger.fine("compactor thread terminated by test");
}
throw ex;
} finally {
if (compactedSuccessfully) {
this.me.setName("Idle OplogCompactor");
}
this.me = null;
}
} catch (CancelException ignore) {
// if cache is closed, just about the compaction
}
finally {
reschedule(compactedSuccessfully, scheduledOplogs);
}
}
synchronized void waitForRunToComplete() {
if (this.me == Thread.currentThread()) {
// no need to wait since we are the compactor to fix bug 40630
return;
}
while (this.scheduled) {
try {
wait();
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
}
}
}
/**
 * Called at the end of a compaction run to clear the "scheduled" state and,
 * when the run succeeded and compaction is still enabled, immediately
 * schedule the next batch of compactable oplogs.
 *
 * @param success whether the just-finished run compacted successfully
 * @param previousList the oplogs that were just processed; used to detect
 *        and prevent an endless compaction loop over the same set
 */
private synchronized void reschedule(boolean success, CompactableOplog[] previousList) {
  this.scheduled = false;
  this.scheduledOplogs = null;
  // wake anyone blocked in waitForRunToComplete()
  notifyAll();
  if (!success)
    return;
  if (!this.compactorEnabled)
    return;
  if (isClosing())
    return;
  SystemFailure.checkFailure();
  //TODO griddb - is sync this necessary? For what?
  //synchronized (DiskStoreImpl.this.oplogIdToOplog) {
  if (this.compactorEnabled) {
    if (isCompactionEnabled()) {
      CompactableOplog[] newList = getOplogToBeCompacted();
      if(Arrays.equals(newList, previousList)) {
        //If the list of oplogs to be compacted didn't change,
        //don't loop and compact again.
        logger.warning(LocalizedStrings.DiskStoreImpl_PREVENTING_COMPACTION_LOOP, Arrays.asList(newList));
        return;
      }
      schedule(newList);
    }
  }
  //}
}
/**
 * Returns true while the compactor thread should stay alive: either
 * compaction is still enabled, or a full completion pass was requested.
 */
boolean keepCompactorRunning() {
  return this.compactionCompletionRequired || this.compactorEnabled;
}
}
// for test. test needs to wait after successful compaction happened
// for the given oplog set via this setter method below.
private Oplog testoplogcompact = null;

/** Test hook: registers the oplog whose compaction the test is waiting on. */
public void TEST_oplogCompact(Oplog oplog) {
  this.testoplogcompact = oplog;
}

// set once the registered test oplog has been compacted
private boolean testOplogCompacted;

/** Test hook: whether the oplog registered via {@link #TEST_oplogCompact} has been compacted. */
public boolean isTestOplogCompacted() {
  return this.testOplogCompacted;
}
/**
 * Used by unit tests to kill the compactor operation.
 */
public static class KillCompactorException extends RuntimeException {
  // RuntimeException is Serializable; declare an explicit version id
  private static final long serialVersionUID = 1L;
}
/** Returns the init file (".if") that records this store's persistent metadata. */
public DiskInitFile getDiskInitFile() {
  return this.initFile;
}
/** Records in the init file that the given member went offline for this region. */
public void memberOffline(DiskRegionView dr, PersistentMemberID persistentID) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.addOfflinePMID(dr, persistentID);
  }
}

/** Records that the member is offline but its data is equal to ours. */
public void memberOfflineAndEqual(DiskRegionView dr,
    PersistentMemberID persistentID) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.addOfflineAndEqualPMID(dr, persistentID);
  }
}

/** Records in the init file that the given member came online for this region. */
public void memberOnline(DiskRegionView dr, PersistentMemberID persistentID) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.addOnlinePMID(dr, persistentID);
  }
}

/** Removes the given member's id from the init file for this region. */
public void memberRemoved(DiskRegionView dr, PersistentMemberID persistentID) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.rmPMID(dr, persistentID);
  }
}

/** Records a revoked member pattern in the init file. */
public void memberRevoked(PersistentMemberPattern revokedPattern) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.revokeMember(revokedPattern);
  }
}

/** Records our own id as "initializing" for the given region. */
public void setInitializing(DiskRegionView dr, PersistentMemberID newId) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.addMyInitializingPMID(dr, newId);
  }
}

/** Marks the given region's initialization as complete in the init file. */
public void setInitialized(DiskRegionView dr) {
  final DiskInitFile dif = this.initFile;
  if (dif != null) {
    dif.markInitialized(dr);
  }
}

/** Returns the revoked member patterns, or an empty set when there is no init file. */
public Set getRevokedMembers() {
  final DiskInitFile dif = this.initFile;
  return dif == null ? Collections.emptySet() : dif.getRevokedIDs();
}
/**
 * Completes destruction of the given disk region: marks it closed, performs
 * the basic destroy, updates the init file (or the in-memory maps for
 * overflow-only regions), and self-destructs the disk store when it is
 * region-owned and this was its last region.
 *
 * Lock ordering: the region's size-guard lock must be taken BEFORE this
 * store's lock (fix for 46284), and a write lock on the disk region is taken
 * inside that; a CancelException while acquiring the write lock falls back to
 * the closeRegionGuard workaround for bug 39380.
 */
public void endDestroyRegion(LocalRegion region, DiskRegion dr) {
  // logger.info(LocalizedStrings.DEBUG, "DEBUG endDestroyRegion dr=" +
  // dr.getName());
  // CancelCriterion stopper = dr.getOwner().getCancelCriterion();
  // Fix for 46284 - we must obtain the size guard lock before getting the
  // disk store lock
  final ReentrantLock regionLock = region != null ? region.getSizeGuard()
      : null;
  if (regionLock != null) {
    regionLock.lock();
  }
  try {
    synchronized (this.lock) {
      if (dr.isRegionClosed()) {
        // already destroyed/closed by another thread
        return;
      }
      // // Stop the compactor if running, without taking lock.
      // if (this.oplogCompactor != null) {
      // try {
      // this.oplogCompactor.stopCompactor();
      // }
      // catch (CancelException ignore) {
      // // Asif:To fix Bug 39380 , ignore the cache closed exception here.
      // // allow it to call super .close so that it would be able to close
      // the
      // // oplogs
      // // Though I do not think this exception will be thrown by
      // // the stopCompactor. Still not taking chance and ignoring it
      // }
      // }
      // // if (!isSync()) {
      // stopAsyncFlusher(true); // do this before writeLock
      // // }
      boolean gotLock = false;
      try {
        try {
          acquireWriteLock(dr);
          gotLock = true;
        } catch (CancelException e) {
          // see workaround below.
        }
        if (!gotLock) { // workaround for bug39380
          // Allow only one thread to proceed
          synchronized (this.closeRegionGuard) {
            if (dr.isRegionClosed()) {
              return;
            }
            dr.setRegionClosed(true);
            // Asif: I am quite sure that it should also be Ok if instead
            // while it is a If Check below. Because if acquireReadLock thread
            // has acquired the lock, it is bound to see the isRegionClose as
            // true
            // and so will release the lock causing decrement to zeo , before
            // releasing the closeRegionGuard. But still...not to take any
            // chance
            // Bounded wait (loopCount seconds) for in-flight entry ops to drain.
            final int loopCount = 10;
            for (int i = 0; i < loopCount; i++) {
              if (this.entryOpsCount.get() == 0) {
                break;
              }
              boolean interrupted = Thread.interrupted();
              try {
                this.closeRegionGuard.wait(1000);
              } catch (InterruptedException ie) {
                interrupted = true;
              } finally {
                if (interrupted) {
                  Thread.currentThread().interrupt();
                }
              }
            } // for
            if (this.entryOpsCount.get() > 0) {
              // still not drained: warn, then wait unbounded
              logger
                  .warning(
                      LocalizedStrings.DisKRegion_OUTSTANDING_OPS_REMAIN_AFTER_0_SECONDS_FOR_DISK_REGION_1,
                      new Object[] { Integer.valueOf(loopCount), dr.getName() });
              for (;;) {
                if (this.entryOpsCount.get() == 0) {
                  break;
                }
                boolean interrupted = Thread.interrupted();
                try {
                  this.closeRegionGuard.wait(1000);
                } catch (InterruptedException ie) {
                  interrupted = true;
                } finally {
                  if (interrupted) {
                    Thread.currentThread().interrupt();
                  }
                }
              } // for
              logger
                  .info(
                      LocalizedStrings.DisKRegion_OUTSTANDING_OPS_CLEARED_FOR_DISK_REGION_0,
                      dr.getName());
            }
          } // synchronized
        }
        dr.setRegionClosed(true);
        basicDestroy(region, dr);
      } finally {
        if (gotLock) {
          releaseWriteLock(dr);
        }
      }
    }
  } finally {
    if (regionLock != null) {
      regionLock.unlock();
    }
  }
  // persistent (backup) regions record the destroy in the init file;
  // overflow-only regions are just dropped from the in-memory maps
  if (this.initFile != null && dr.isBackup()) {
    this.initFile.endDestroyRegion(dr);
  } else {
    rmById(dr.getId());
    this.overflowMap.remove(dr);
  }
  if (getOwnedByRegion()) {
    // logger.info(LocalizedStrings.DEBUG, "DEBUG: ds=" + getName()
    // + "destroy ownCount=" + getOwnCount());
    if (this.ownCount.decrementAndGet() <= 0) {
      // last owning region gone: destroy the store itself
      destroy();
    }
  }
}
/** Records in the init file that destruction of the region's data (not the region itself) is starting. */
public void beginDestroyDataStorage(DiskRegion dr) {
  if (this.initFile != null && dr.isBackup()/* fixes bug 41389 */) {
    this.initFile.beginDestroyDataStorage(dr);
  }
}

/**
 * Completes destruction of the region's data: clears its entries, resets and
 * persists an empty untrusted RVV, and records completion in the init file.
 */
public void endDestroyDataStorage(LocalRegion region, DiskRegion dr) {
  // logger.info(LocalizedStrings.DEBUG, "DEBUG endPartialDestroyRegion dr=" +
  // dr.getName());
  try {
    clear(region, dr, null);
    dr.resetRVV();
    dr.setRVVTrusted(false);
    dr.writeRVV(null, null); // just persist the empty rvv with trust=false
  } catch (RegionDestroyedException rde) {
    // ignore a RegionDestroyedException at this stage
  }
  if (this.initFile != null && dr.isBackup()) {
    this.initFile.endDestroyDataStorage(dr);
  }
}
/**
 * Generates a fresh PersistentMemberID for this member/disk store, stamped
 * with the current wall-clock time.
 */
public PersistentMemberID generatePersistentID(DiskRegionView dr) {
  File firstDir = getInfoFileDir().getDir();
  InternalDistributedSystem ids = getCache().getDistributedSystem();
  InternalDistributedMember memberId = ids.getDistributionManager()
      .getDistributionManagerId();
  //NOTE - do NOT use DM.cacheTimeMillis here. See bug #49920
  long timestamp = System.currentTimeMillis();
  PersistentMemberID id = new PersistentMemberID(getDiskStoreID(), memberId.getIpAddress(),
      firstDir.getAbsolutePath(), memberId.getName(),
      timestamp, (short) 0);
  return id;
}

/** Returns this store's identity as a PersistentMemberPattern (host, first dir, UUID). */
public PersistentID getPersistentID() {
  InetAddress host = cache.getDistributedSystem().getDistributedMember()
      .getIpAddress();
  String dir = getDiskDirs()[0].getAbsolutePath();
  return new PersistentMemberPattern(host, dir, this.diskStoreID.toUUID(), 0);
}

// test hook
/** Forces compaction of the init file (test hook). */
public void forceIFCompaction() {
  if (this.initFile != null) {
    this.initFile.forceCompaction();
  }
}
// @todo DiskStore it
/**
 * Cancellation criterion that trips only when this disk store itself is
 * closed. We deliberately do not use the LocalRegion's stopper: the async
 * writer must still be able to finish flushing during a cache close.
 */
private class Stopper extends CancelCriterion {
  @Override
  public String cancelInProgress() {
    // a non-null reason string signals cancellation
    return isClosed() ? "The disk store is closed." : null;
  }

  @Override
  public RuntimeException generateCancelledException(Throwable e) {
    if (!isClosed()) {
      return null;
    }
    return new CacheClosedException("The disk store is closed", e);
  }
}
// single shared stopper instance for this disk store
private final CancelCriterion stopper = new Stopper();

/** Returns the cancellation criterion that trips when this store closes. */
public CancelCriterion getCancelCriterion() {
  return this.stopper;
}
/**
 * Called when we are doing recovery and we find a new id. Advances the
 * region id counter to at least {@code drId + 1} so a newly created region
 * cannot collide with a recovered id (fixes bug 41421).
 */
void recoverRegionId(long drId) {
  long newVal = drId + 1;
  // Advance atomically: the original get()+set() pair could lose a racing
  // update between the check and the set. CAS-loop until the counter is at
  // least newVal or someone else pushed it past us.
  long cur;
  while ((cur = this.regionIdCtr.get()) < newVal) {
    if (this.regionIdCtr.compareAndSet(cur, newVal)) {
      break;
    }
  }
}
/**
 * Called when creating a new disk region (not a recovered one).
 * Returns the next region id, skipping over the reserved id range
 * [MIN_RESERVED_DRID, MAX_RESERVED_DRID].
 */
long generateRegionId() {
  long result;
  do {
    result = this.regionIdCtr.getAndIncrement();
  } while (result <= MAX_RESERVED_DRID && result >= MIN_RESERVED_DRID);
  return result;
}
/**
 * Returns a read-only, live view of the disk regions using this disk store
 * (its contents change as regions are added to or removed from the store).
 */
public Collection getDiskRegions() {
  Collection live = this.drMap.values();
  return Collections.unmodifiableCollection(live);
}
/**
 * Linear scan for the disk region with the given name; returns null when no
 * match exists. Slow — O(number of regions) — and should be optimized if it
 * is ever used for anything important (currently only diagnostic use).
 */
public DiskRegion getByName(String name) {
  for (DiskRegion candidate : getDiskRegions()) {
    if (candidate.getName().equals(name)) {
      return candidate;
    }
  }
  return null;
}
/**
 * Registers a disk region with this store: persistent (backup) regions go
 * into the id map and the init file, overflow-only regions into the overflow
 * set. Also bumps the owner count for region-owned stores.
 *
 * @throws IllegalStateException if a region with the same id already exists
 */
void addDiskRegion(DiskRegion dr) {
  if (dr.isBackup()) {
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    // offline stores must not create new oplog children
    if(!isOffline()) {
      oplogSet.initChild();
    }
    DiskRegion old = this.drMap.putIfAbsent(dr.getId(), dr);
    if (old != null) {
      throw new IllegalStateException("DiskRegion already exists with id "
          + dr.getId() + " and name " + old.getName());
    }
    getDiskInitFile().createRegion(dr);
  } else {
    this.overflowMap.add(dr);
  }
  if (getOwnedByRegion()) {
    this.ownCount.incrementAndGet();
    // logger.info(LocalizedStrings.DEBUG, "DEBUG: ds=" + getName()
    // + "addDiskRegion ownCount=" + getOwnCount(), new
    // RuntimeException("STACK"));
  }
}
/** Records a persistent partitioned-region configuration in the init file. */
void addPersistentPR(String name, PRPersistentConfig config) {
  getDiskInitFile().createPersistentPR(name, config);
}

/** Removes a persistent partitioned-region configuration from the init file. */
void removePersistentPR(String name) {
  if(isClosed() && getOwnedByRegion()) {
    //A region owned disk store will destroy
    //itself when all buckets are removed, resulting
    //in an exception when this method is called.
    //Do nothing if the disk store is already
    //closed
    return;
  }
  getDiskInitFile().destroyPersistentPR(name);
}

/** Returns the persisted configuration for the named partitioned region, if any. */
PRPersistentConfig getPersistentPRConfig(String name) {
  return getDiskInitFile().getPersistentPR(name);
}

/** Returns all persisted partitioned-region configurations known to the init file. */
Map getAllPRs() {
  return getDiskInitFile().getAllPRs();
}

/** Looks up a disk region by id; null when unknown. */
DiskRegion getById(Long regionId) {
  return this.drMap.get(regionId);
}

/** Removes a disk region from the id map. */
void rmById(Long regionId) {
  this.drMap.remove(regionId);
}
/**
 * Handles a fatal DiskAccessException: asks every region using this store to
 * handle it, optionally stops bridge/network servers to induce client
 * failover, then closes this disk store. The cleanup runs on a separate
 * thread to avoid deadlocking on locks the calling thread may hold; only the
 * first DAE wins (later ones are ignored via the CAS below).
 *
 * @param dae the disk exception that occurred
 * @param stopBridgeServers whether to also stop bridge/network servers
 */
void handleDiskAccessException(final DiskAccessException dae,
    final boolean stopBridgeServers) {
  boolean causedByRDE = LocalRegion.causedByRDE(dae);
  // @todo is it ok for flusher and compactor to call this method if RDE?
  // I think they need to keep working (for other regions) in this case.
  if (causedByRDE) {
    return;
  }
  // If another thread has already hit a DAE and is cleaning up, do nothing
  if (!diskException.compareAndSet(null, dae)) {
    return;
  }
  final ThreadGroup exceptionHandlingGroup = LogWriterImpl.createThreadGroup(
      "Disk Store Exception Handling Group", cache.getLoggerI18n());
  // Shutdown the regions and bridge servers in another thread, to make sure
  // that we don't cause a deadlock because this thread is holding some lock
  Thread thread = new Thread(exceptionHandlingGroup,
      "Disk store exception handler") {
    @Override
    public void run() {
      // first ask each region to handle the exception.
      for (DiskRegion dr : DiskStoreImpl.this.drMap.values()) {
        DiskExceptionHandler lr = dr.getExceptionHandler();
        lr.handleDiskAccessException(dae, false);
      }
      // then stop the bridge server if needed
      if (stopBridgeServers) {
        LogWriterI18n logger = getCache().getLoggerI18n();
        logger
            .info(LocalizedStrings.LocalRegion_ATTEMPTING_TO_CLOSE_THE_BRIDGESERVERS_TO_INDUCE_FAILOVER_OF_THE_CLIENTS);
        try {
          getCache().stopServers();
          // also close GemFireXD network servers to induce failover (#45651)
          final StaticSystemCallbacks sysCb =
              GemFireCacheImpl.FactoryStatics.systemCallbacks;
          if (sysCb != null) {
            sysCb.stopNetworkServers();
          }
          logger.info(LocalizedStrings
              .LocalRegion_BRIDGESERVERS_STOPPED_SUCCESSFULLY);
        } catch (Exception e) {
          logger.error(LocalizedStrings
              .LocalRegion_THE_WAS_A_PROBLEM_IN_STOPPING_BRIDGESERVERS_FAILOVER_OF_CLIENTS_IS_SUSPECT, e);
        }
      }
      logger.error(LocalizedStrings
          .LocalRegion_A_DISKACCESSEXCEPTION_HAS_OCCURED_WHILE_WRITING_TO_THE_DISK_FOR_DISKSTORE_0_THE_DISKSTORE_WILL_BE_CLOSED,
          DiskStoreImpl.this.getName(), dae);
      // then close this disk store
      onClose();
      close();
    }
  };
  thread.start();
}
// immutable configuration attributes, fixed at construction time
private final String name;
private final boolean autoCompact;
private final boolean allowForceCompaction;
private final long maxOplogSizeInBytes;
private final long timeInterval;
private final int queueSize;
private final int writeBufferSize;
private final File[] diskDirs;
private final int[] diskDirSizes;
private final boolean syncWrites;

// DiskStore interface methods
/** Returns this disk store's name. */
public String getName() {
  return this.name;
}

/** Whether oplogs are compacted automatically. */
public boolean getAutoCompact() {
  return this.autoCompact;
}

/** Whether manual (forced) compaction is permitted. */
public boolean getAllowForceCompaction() {
  return this.allowForceCompaction;
}

/** Maximum oplog size in megabytes (the byte value divided by 1024*1024). */
public long getMaxOplogSize() {
  return this.maxOplogSizeInBytes / (1024 * 1024);
}

/** Maximum oplog size in bytes. */
public long getMaxOplogSizeInBytes() {
  return this.maxOplogSizeInBytes;
}

/** Async-flush time interval. */
public long getTimeInterval() {
  return this.timeInterval;
}

/** Async queue size. */
public int getQueueSize() {
  return this.queueSize;
}

/** Write buffer size in bytes. */
public int getWriteBufferSize() {
  return this.writeBufferSize;
}

/** The directories this store writes files to. NOTE: returns the internal array. */
public File[] getDiskDirs() {
  return this.diskDirs;
}

/** Per-directory size limits. NOTE: returns the internal array. */
public int[] getDiskDirSizes() {
  return this.diskDirSizes;
}

/**
 * {@inheritDoc}
 */
@Override
public final boolean getSyncWrites() {
  return this.syncWrites;
}
// public String toString() {
// StringBuffer sb = new StringBuffer();
// sb.append("<");
// sb.append(getName());
// if (getOwnedByRegion()) {
// sb.append(" OWNED_BY_REGION");
// }
// sb.append(">");
// return sb.toString();
// }
/**
 * A unit of work queued to the async disk flusher: either a full entry
 * (region + DiskEntry + version tag) or a version-only record carrying just
 * the tag for a region.
 */
public static class AsyncDiskEntry {
  public final LocalRegion region;
  public final DiskEntry de;
  public final boolean versionOnly;
  public final VersionTag tag;

  /** Full entry to be flushed for the given region. */
  public AsyncDiskEntry(LocalRegion region, DiskEntry de, VersionTag tag) {
    this.region = region;
    this.de = de;
    this.tag = tag;
    this.versionOnly = false;
  }

  /** Version-only record: no DiskEntry payload, just the tag. */
  public AsyncDiskEntry(LocalRegion region, VersionTag tag) {
    this.region = region;
    this.de = null;
    this.tag = tag;
    this.versionOnly = true;
    // if versionOnly, only de.getDiskId() is used for synchronize
  }

  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder("dr=");
    buf.append(region.getDiskRegion().getId());
    buf.append(" versionOnly=").append(this.versionOnly);
    if (this.versionOnly) {
      buf.append(" versionTag=").append(this.tag);
    }
    if (de != null) {
      buf.append(" key=").append(de.getKeyCopy());
    } else {
      buf.append(' ');
    }
    return buf.toString();
  }
}
/**
 * Set of OplogEntryIds (longs). Memory is optimized by keeping ids that fit
 * in the unsigned-int range in an int-based set; only larger (or negative)
 * ids pay for a full long entry.
 */
public static class OplogEntryIdSet {
  private final TStatelessIntHashSet ints = new TStatelessIntHashSet(
      (int) INVALID_ID);
  private final TStatelessLongHashSet longs = new TStatelessLongHashSet(
      INVALID_ID);

  /** true when the id can be stored losslessly as an unsigned 32-bit value */
  private static boolean isUnsignedIntRange(long id) {
    return id >= 0 && id <= 0x00000000FFFFFFFFL;
  }

  public void add(long id) {
    if (isUnsignedIntRange(id)) {
      this.ints.add((int) id);
    } else {
      this.longs.add(id);
    }
  }

  public boolean contains(long id) {
    return isUnsignedIntRange(id) ? this.ints.contains((int) id)
        : this.longs.contains(id);
  }

  public int size() {
    return this.longs.size() + this.ints.size();
  }

  public void addAll(OplogEntryIdSet toAdd) {
    this.ints.addAll(toAdd.ints.toArray());
    this.longs.addAll(toAdd.longs.toArray());
  }
}
/**
 * Set to true if this diskStore is owned by a single region. This only
 * happens in backwardsCompat mode.
 */
private final boolean ownedByRegion;

/**
 * Set to the region's {@link InternalRegionArguments} when the diskStore is
 * owned by a single region in backwardsCompat mode ({@link #ownedByRegion}
 * must be true).
 */
private final InternalRegionArguments internalRegionArgs;

/**
 * Number of current owners. Only valid if ownedByRegion is true.
 */
private final AtomicInteger ownCount = new AtomicInteger();

/** Whether this store is owned by a single region (backwardsCompat mode). */
public boolean getOwnedByRegion() {
  return this.ownedByRegion;
}

/** The owning region's arguments; only meaningful when {@link #getOwnedByRegion()} is true. */
public InternalRegionArguments getInternalRegionArguments() {
  return this.internalRegionArgs;
}

/** Current owner count; only meaningful when {@link #getOwnedByRegion()} is true. */
public int getOwnCount() {
  return this.ownCount.get();
}

// true when this store was opened only to validate its contents
private final boolean validating;

boolean isValidating() {
  return this.validating;
}

// true when this store was opened offline (no distributed system)
private final boolean offline;

public boolean isOffline() {
  return this.offline;
}

// true when this offline store should only upgrade the on-disk version
public final boolean upgradeVersionOnly;

/** Upgrade-only mode applies only when the recovered data predates GFE 7.0. */
boolean isUpgradeVersionOnly() {
  return this.upgradeVersionOnly
      && Version.GFE_70.compareTo(this.getRecoveredGFVersion()) > 0;
}

/** Same as {@link #isUpgradeVersionOnly()} but reads the version from the given init file. */
boolean isUpgradeVersionOnly(DiskInitFile initFile) {
  return this.upgradeVersionOnly
      && Version.GFE_70.compareTo(this.getRecoveredGFVersion(initFile)) > 0;
}

// true when this offline store was opened to compact its oplogs
private final boolean offlineCompacting;

boolean isOfflineCompacting() {
  return this.offlineCompacting;
}
/**
 * Destroy a region which has not been created.
 *
 * @param regName
 *          the name of the region to destroy
 * @param throwIfNotExists throw an {@link IllegalArgumentException} if the
 *        given region does not exist, else return false
 * @return true when a region (plain or PR) was found and destroyed
 */
public boolean destroyRegion(String regName, boolean throwIfNotExists) {
  DiskRegionView drv = getDiskInitFile().getDiskRegionByName(regName);
  if (drv != null) {
    // plain region: complete its destroy in the init file
    getDiskInitFile().endDestroyRegion(drv);
    return true;
  }
  drv = getDiskInitFile().getDiskRegionByPrName(regName);
  if (drv != null) {
    // partitioned region: destroy via the PR path
    getDiskInitFile().destroyPRRegion(regName);
    return true;
  }
  if (throwIfNotExists) {
    throw new IllegalArgumentException(
        "The disk store does not contain a region named: " + regName);
  }
  return false;
}
/**
 * Offline-mode modification of a persisted region's attributes (eviction,
 * capacity, compressor, statistics). Delegates to the init file; tries the
 * plain-region name first, then the partitioned-region name.
 *
 * @return a printable description of the modification result
 * @throws IllegalArgumentException if no region with the given name exists
 */
public String modifyRegion(String regName, String lruOption,
    String lruActionOption, String lruLimitOption,
    String concurrencyLevelOption, String initialCapacityOption,
    String loadFactorOption, String compressorClassNameOption,
    String statisticsEnabledOption, boolean printToConsole) {
  assert isOffline();
  DiskRegionView drv = getDiskInitFile().getDiskRegionByName(regName);
  if (drv == null) {
    drv = getDiskInitFile().getDiskRegionByPrName(regName);
    if (drv == null) {
      throw new IllegalArgumentException(
          "The disk store does not contain a region named: " + regName);
    } else {
      return getDiskInitFile().modifyPRRegion(regName, lruOption,
          lruActionOption, lruLimitOption, concurrencyLevelOption,
          initialCapacityOption, loadFactorOption, compressorClassNameOption,
          statisticsEnabledOption, printToConsole);
    }
  } else {
    return getDiskInitFile().modifyRegion(drv, lruOption, lruActionOption,
        lruLimitOption, concurrencyLevelOption, initialCapacityOption,
        loadFactorOption, compressorClassNameOption,
        statisticsEnabledOption, printToConsole);
  }
}
/** Offline-mode dump of a region's recorded info to the given stream. */
private void dumpInfo(PrintStream printStream, String regName) {
  assert isOffline();
  getDiskInitFile().dumpRegionInfo(printStream, regName);
}

/** Offline-mode dump of all region metadata; optionally includes buckets. */
private void dumpMetadata(boolean showBuckets) {
  assert isOffline();
  getDiskInitFile().dumpRegionMetadata(showBuckets);
}
private void exportSnapshot(String name, File out) throws IOException {
// Since we are recovering a disk store, the cast from DiskRegionView -->
// PlaceHolderDiskRegion
// and from RegionEntry --> DiskEntry should be ok.
// In offline mode, we need to schedule the regions to be recovered
// explicitly.
for (DiskRegionView drv : getKnown()) {
scheduleForRecovery((PlaceHolderDiskRegion) drv);
}
recoverRegionsThatAreReady(false);
// coelesce disk regions so that partitioned buckets from a member end up in
// the same file
Map> regions = new HashMap>();
for (DiskRegionView drv : getKnown()) {
PlaceHolderDiskRegion ph = (PlaceHolderDiskRegion) drv;
String regionName = (drv.isBucket() ? ph.getPrName() : drv.getName());
List views = regions.get(regionName);
if (views == null) {
views = new ArrayList();
regions.put(regionName, views);
}
views.add(ph);
}
final ByteArrayDataInput in = new ByteArrayDataInput();
for (Map.Entry> entry : regions
.entrySet()) {
String fname = entry.getKey().substring(1).replace('/', '-');
File f = new File(out, "snapshot-" + name + "-" + fname);
SnapshotWriter writer = GFSnapshot.create(f, entry.getKey());
try {
for (DiskRegionView drv : entry.getValue()) {
// skip regions that have no entries
if (drv.getRecoveredEntryCount() == 0) {
continue;
}
// TODO: [sumedh] for best efficiency this should use equivalent of
// DiskSavyIterator or Oplog.getSortedLiveEntries for recovered
// entry map else random reads will kill performance
Collection entries = drv.getRecoveredEntryMap()
.regionEntries();
for (RegionEntry re : entries) {
Object key = re.getKeyCopy();
// TODO:KIRK:OK Rusty's code was value = de.getValueWithContext(drv);
@Retained @Released Object value = re._getValueRetain(drv, true); // OFFHEAP: passed to SnapshotRecord who copies into byte[]; so for now copy to heap CD
if (Token.isRemoved(value)) {
continue;
}
// some entries may have overflowed to disk
if (value == null && re instanceof DiskEntry) {
DiskEntry de = (DiskEntry) re;
DiskEntry.Helper.recoverValue(de, de.getDiskId().getOplogId(),
((DiskRecoveryStore) drv), in);
// TODO:KIRK:OK Rusty's code was value = de.getValueWithContext(drv);
value = de._getValueRetain(drv, true); // OFFHEAP: passed to SnapshotRecord who copies into byte[]; so for now copy to heap CD
}
try {
writer.snapshotEntry(new SnapshotRecord(key, value));
} finally {
OffHeapHelper.release(value);
}
}
}
} finally {
writer.snapshotComplete();
}
}
}
/**
 * Offline validation: recovers every known region through a
 * ValidatingDiskRegion (values are not recovered, to save memory) and prints
 * the live-entry and compactable-record counts to stdout.
 */
private void validate() {
  assert isValidating();
  this.RECOVER_VALUES = false; // save memory @todo should Oplog make sure
  // value is deserializable?
  this.liveEntryCount = 0;
  this.deadRecordCount = 0;
  for (DiskRegionView drv : getKnown()) {
    scheduleForRecovery(ValidatingDiskRegion.create(this, drv));
  }
  recoverRegionsThatAreReady(false);
  if (getDeadRecordCount() > 0) {
    System.out.println("Disk store contains " + getDeadRecordCount()
        + " compactable records.");
  }
  System.out.println("Total number of region entries in this disk store is: "
      + getLiveEntryCount());
}
// number of live entries seen during offline validation/compaction
private int liveEntryCount;

void incLiveEntryCount(int count) {
  this.liveEntryCount += count;
}

public int getLiveEntryCount() {
  return this.liveEntryCount;
}

// number of dead (compactable) records seen during offline validation/compaction
private int deadRecordCount;

void incDeadRecordCount(int count) {
  this.deadRecordCount += count;
}

public int getDeadRecordCount() {
  return this.deadRecordCount;
}
/**
 * Offline compaction: recovers all known regions through
 * OfflineCompactionDiskRegion instances, compacts the oplogs and the init
 * file, and reports the result (or the upgrade result in upgrade-only mode)
 * to stdout.
 */
private void offlineCompact() {
  assert isOfflineCompacting();
  this.RECOVER_VALUES = false;
  this.deadRecordCount = 0;
  for (DiskRegionView drv : getKnown()) {
    scheduleForRecovery(OfflineCompactionDiskRegion.create(this, drv));
  }
  persistentOplogs.recoverRegionsThatAreReady(false);
  persistentOplogs.offlineCompact();
  getDiskInitFile().forceCompaction();
  if (this.upgradeVersionOnly) {
    System.out.println("Upgrade disk store " + this.name + " to version "
        + getRecoveredGFVersionName() + " finished.");
  } else {
    if (getDeadRecordCount() == 0) {
      System.out
          .println("Offline compaction did not find anything to compact.");
    } else {
      System.out.println("Offline compaction removed " + getDeadRecordCount()
          + " records.");
    }
    // If we have more than one oplog then the liveEntryCount may not be the
    // total
    // number of live entries in the disk store. So do not log the live entry
    // count
  }
}
// PR name -> shared LRUStatistics for all of that PR's recovered buckets;
// guarded by synchronizing on the map itself
private final HashMap prlruStatMap = new HashMap();

/**
 * Returns the LRU statistics shared by all buckets of the given placeholder
 * region's PR, creating (and caching) them on first use.
 */
LRUStatistics getOrCreatePRLRUStats(PlaceHolderDiskRegion dr) {
  String prName = dr.getPrName();
  LRUStatistics result = null;
  synchronized (this.prlruStatMap) {
    result = this.prlruStatMap.get(prName);
    if (result == null) {
      EvictionAttributesImpl ea = dr.getEvictionAttributes();
      LRUAlgorithm ec = ea.createEvictionController(null, dr.getEnableOffHeapMemory());
      StatisticsFactory sf = cache.getDistributedSystem();
      result = ec.getLRUHelper().initStats(dr, sf);
      this.prlruStatMap.put(prName, result);
    }
  }
  return result;
}
/**
 * If we have recovered a bucket earlier for the given pr then we will have an
 * LRUStatistics to return for it. Otherwise return null.
 */
LRUStatistics getPRLRUStats(PartitionedRegion pr) {
  final String prName = pr.getFullPath();
  synchronized (this.prlruStatMap) {
    return this.prlruStatMap.get(prName);
  }
}
/**
 * Lock the disk store to prevent updates. This is the first step of the
 * backup process. Once all disk stores on all members are locked, we still
 * move on to startBackup.
 */
public void lockStoreBeforeBackup() {
  // This will prevent any region level operations like
  // create/destroy region, and region view changes.
  // We might want to consider preventing any entry level
  // operations as well. We should at least prevent transactions
  // when we support persistent transactions.
  //
  // When we do start caring about blocking entry
  // level operations, we will need to be careful
  // to block them *before* they are put in the async
  // queue
  getDiskInitFile().lockForBackup();
}

/**
 * Release the lock that is preventing operations on this disk store during
 * the backup process.
 */
public void releaseBackupLock() {
  getDiskInitFile().unlockForBackup();
}
/**
 * Start the backup process. This is the second step of the backup process.
 * Defines the data being backed up by copying the init file and rolling to
 * the next oplog. After this method returns, operations proceed as normal
 * except that oplogs in the captured set are not removed.
 *
 * Fixes relative to the original: restored the stripped generic type on the
 * oplog map ({@code Map<Oplog, Set<File>>} matches the iteration in
 * finishBackup), removed the dead {@code childOplog == null} ternary (the
 * loop already breaks above when the child is null) and its stray double
 * semicolon, and corrected the "snaphotting" log message.
 *
 * @param targetDir directory that receives the backup
 * @param baselineInspector non-null for an incremental backup; used to skip
 *        oplogs that a previous backup already captured
 * @param restoreScript accumulates the file mappings for the restore script
 * @throws IOException if a backup directory cannot be created
 */
public void startBackup(File targetDir, BackupInspector baselineInspector,
    RestoreScript restoreScript) throws IOException {
  getDiskInitFile().setBackupThread(Thread.currentThread());
  boolean done = false;
  try {
    for (;;) {
      Oplog childOplog = persistentOplogs.getChild();
      if (childOplog == null) {
        // no oplogs yet: nothing to capture beyond the init file
        this.diskStoreBackup = new DiskStoreBackup(Collections.EMPTY_MAP, targetDir);
        break;
      }
      // childOplog is known non-null here, so lock on it directly.
      Object childLock = childOplog.lock;
      // TODO - We really should move this lock into the disk store, but
      // until then we need to do this magic to make sure we're actually
      // locking the latest child for both types of oplogs
      //This ensures that all writing to disk is blocked while we are
      //creating the snapshot
      synchronized (childLock) {
        if (persistentOplogs.getChild() != childOplog) {
          // the child rolled while we were acquiring the lock; retry
          continue;
        }
        logger.fine("snapshotting oplogs for disk store " + getName());
        // Create the directories for this disk store
        for (int i = 0; i < directories.length; i++) {
          File dir = getBackupDir(targetDir, i);
          if (!FileUtil.mkdirs(dir)) {
            throw new IOException("Could not create directory " + dir);
          }
          restoreScript.addFile(directories[i].getDir(), dir);
        }
        restoreScript.addExistenceTest(this.initFile.getIFFile());
        // Contains all oplogs that will be backed up
        Map<Oplog, Set<File>> allOplogs = null;
        // Incremental backup so filter out oplogs that have already been
        // backed up
        if (null != baselineInspector) {
          Map baselineCopyMap = new LinkedHashMap();
          allOplogs = filterBaselineOplogs(baselineInspector, baselineCopyMap);
          restoreScript.addBaselineFiles(baselineCopyMap);
        } else {
          allOplogs = getAllOplogsForBackup();
        }
        // mark all oplogs as being backed up. This will
        // prevent the oplogs from being deleted
        this.diskStoreBackup = new DiskStoreBackup(allOplogs, targetDir);
        // copy the init file
        File firstDir = getBackupDir(targetDir, infoFileDirIndex);
        initFile.copyTo(firstDir);
        persistentOplogs.forceRoll(null);
        logger.fine("done snapshotting for disk store " + getName());
        break;
      }
    }
    done = true;
  } finally {
    if (!done) {
      // failed part-way: release the captured-oplog set
      clearBackup();
    }
  }
}
/** Returns the per-disk-dir backup subdirectory (BACKUP_DIR_PREFIX + index) under targetDir. */
private File getBackupDir(File targetDir, int index) {
  return new File(targetDir, BACKUP_DIR_PREFIX + index);
}
/**
* Copy the oplogs to the backup directory. This is the final step of the
* backup process. The oplogs we copy are defined in the startBackup method.
*
* @param backupManager
* @throws IOException
*/
public void finishBackup(BackupManager backupManager) throws IOException {
if (diskStoreBackup == null) {
return;
}
try {
//Wait for oplogs to be unpreblown before backing them up.
waitForDelayedWrites();
//Backup all of the oplogs
for (Map.Entry> entry: this.diskStoreBackup.getPendingBackup().entrySet()) {
if (backupManager.isCancelled()) {
break;
}
Oplog oplog = entry.getKey();
Set filesToBackup = entry.getValue();
// Copy theoplog to the destination directory
int index = oplog.getDirectoryHolder().getArrayIndex();
File backupDir = getBackupDir(this.diskStoreBackup.getTargetDir(),
index);
//Backup just the set of files we previously captured
for(File file : filesToBackup) {
FileUtil.copy(file, backupDir);
}
// Allow the oplog to be deleted, and process any pending delete
this.diskStoreBackup.backupFinished(oplog);
}
} finally {
clearBackup();
}
}
/**
 * Returns the array index of the directory holder whose directory equals
 * searchDir, or 0 when no holder matches.
 */
private int getArrayIndexOfDirectory(File searchDir) {
  for (DirectoryHolder dh : directories) {
    if (dh.getDir().equals(searchDir)) {
      return dh.getArrayIndex();
    }
  }
  return 0;
}
/** Returns the directory holders backing this store. NOTE: returns the internal array. */
public DirectoryHolder[] getDirectoryHolders(){
  return this.directories;
}

/** Drops the in-progress backup state (if any) and cleans up its resources. */
private void clearBackup() {
  DiskStoreBackup backup = this.diskStoreBackup;
  if (backup != null) {
    this.diskStoreBackup = null;
    backup.cleanup();
  }
}

/** Returns the in-progress backup, or null when no backup is running. */
public DiskStoreBackup getInProgressBackup() {
  return diskStoreBackup;
}

// intentionally empty placeholder; see the comments for the intended behavior
private void createRegionsForValidation(String name) {
  // first create any parent regions as non-persistent with local scope.
  // then create the last region itself as PERSISTENT_REPLICATE with local
  // scope
}

/** Returns all disk region views recorded in the init file. */
protected Collection getKnown() {
  return this.initFile.getKnown();
}
/** Convenience overload: offline store with oplogs, no compaction/validation. */
private static DiskStoreImpl createForOffline(String dsName, File[] dsDirs)
    throws Exception {
  return createForOffline(dsName, dsDirs, false, false,
      false/* upgradeVersionOnly */, 0, true);
}

/** Convenience overload: offline store, caller chooses whether oplogs are needed. */
private static DiskStoreImpl createForOffline(String dsName, File[] dsDirs,
    boolean needsOplogs) throws Exception {
  return createForOffline(dsName, dsDirs, false, false,
      false/* upgradeVersionOnly */, 0, needsOplogs);
}

/** Convenience overload: offline store opened in validation mode. */
private static DiskStoreImpl createForOfflineValidate(String dsName,
    File[] dsDirs) throws Exception {
  return createForOffline(dsName, dsDirs, false, true,
      false/* upgradeVersionOnly */, 0, true);
}
// loner cache/DS created for offline operations; torn down by cleanupOffline()
protected static Cache offlineCache = null;
protected static DistributedSystem offlineDS = null;

// whether indexes are persisted for this store
private final boolean persistIndexes;
// ids of indexes recovered from disk
private final HashSet recoveredIndexIds;

/** Closes and clears the loner cache and distributed system used for offline mode. */
private static void cleanupOffline() {
  if (offlineCache != null) {
    offlineCache.close();
    offlineCache = null;
  }
  if (offlineDS != null) {
    offlineDS.disconnect();
    offlineDS = null;
  }
}
/**
 * Creates a DiskStoreImpl for offline operations (validate, compact, dump,
 * export) by standing up a loner distributed system and cache. The created
 * cache/DS are stored in {@link #offlineCache}/{@link #offlineDS} and must be
 * torn down with {@link #cleanupOffline()}.
 *
 * @param dsName the disk store name
 * @param dsDirs directories to read; defaults to the current directory when null
 * @param offlineCompacting open in offline-compaction mode
 * @param offlineValidate open in validation mode
 * @param upgradeVersionOnly only upgrade the on-disk version
 * @param maxOplogSize max oplog size used during offline compaction; -1 keeps the default
 * @param needsOplogs whether the operation needs the oplogs (not just the init file)
 */
private static DiskStoreImpl createForOffline(String dsName, File[] dsDirs,
    boolean offlineCompacting, boolean offlineValidate,
    boolean upgradeVersionOnly, long maxOplogSize, boolean needsOplogs)
    throws Exception {
  if (dsDirs == null) {
    dsDirs = new File[] { new File("") };
  }
  // need a cache so create a loner ds
  Properties props = new Properties();
  props.setProperty("locators", "");
  props.setProperty("mcast-port", "0");
  props.setProperty("cache-xml-file", "");
  if (!TRACE_RECOVERY) {
    props.setProperty("log-level", "warning");
  }
  DistributedSystem ds = DistributedSystem.connect(props);
  offlineDS = ds;
  Cache c = com.gemstone.gemfire.cache.CacheFactory.create(ds);
  offlineCache = c;
  com.gemstone.gemfire.cache.DiskStoreFactory dsf = c
      .createDiskStoreFactory();
  dsf.setDiskDirs(dsDirs);
  if (offlineCompacting && maxOplogSize != -1L) {
    dsf.setMaxOplogSize(maxOplogSize);
  }
  DiskStoreImpl dsi = new DiskStoreImpl(c, dsName,
      ((DiskStoreFactoryImpl) dsf).getDiskStoreAttributes(), false, null,
      true, upgradeVersionOnly, offlineValidate, offlineCompacting,
      needsOplogs);
  ((GemFireCacheImpl) c).addDiskStore(dsi);
  return dsi;
}
/**
* Use this method to destroy a region in an offline disk store.
*
* @param dsName
* the name of the disk store
* @param dsDirs
* the directories that that the disk store wrote files to
* @param regName
* the name of the region to destroy
*/
public static void destroyRegion(String dsName, File[] dsDirs, String regName)
throws Exception {
DiskStoreImpl dsi = createForOffline(dsName, dsDirs);
try {
dsi.destroyRegion(regName, true);
} finally {
cleanupOffline();
}
}
public static String modifyRegion(String dsName, File[] dsDirs,
String regName, String lruOption, String lruActionOption,
String lruLimitOption, String concurrencyLevelOption,
String initialCapacityOption, String loadFactorOption,
String compressorClassNameOption, String statisticsEnabledOption,
boolean printToConsole) throws Exception {
DiskStoreImpl dsi = createForOffline(dsName, dsDirs);
try {
return dsi.modifyRegion(regName, lruOption, lruActionOption,
lruLimitOption, concurrencyLevelOption, initialCapacityOption,
loadFactorOption, compressorClassNameOption,
statisticsEnabledOption, printToConsole);
} finally {
cleanupOffline();
}
}
/**
 * Print information about a region of an offline disk store to the
 * supplied stream. The store is opened without compaction support.
 */
public static void dumpInfo(PrintStream printStream, String dsName,
    File[] dsDirs, String regName) throws Exception {
  DiskStoreImpl store = createForOffline(dsName, dsDirs, false);
  try {
    store.dumpInfo(printStream, regName);
  } finally {
    cleanupOffline();
  }
}
/**
 * Dump the metadata of an offline disk store, optionally including
 * bucket information.
 */
public static void dumpMetadata(String dsName, File[] dsDirs,
    boolean showBuckets) throws Exception {
  DiskStoreImpl store = createForOffline(dsName, dsDirs, false);
  try {
    store.dumpMetadata(showBuckets);
  } finally {
    cleanupOffline();
  }
}
/**
 * Export a snapshot of an offline disk store's contents to the given
 * output file.
 */
public static void exportOfflineSnapshot(String dsName, File[] dsDirs,
    File out) throws Exception {
  DiskStoreImpl store = createForOffline(dsName, dsDirs);
  try {
    store.exportSnapshot(dsName, out);
  } finally {
    cleanupOffline();
  }
}
/**
 * Run offline validation of the named disk store located in the given
 * directories.
 */
public static void validate(String name, File[] dirs) throws Exception {
  DiskStoreImpl store = createForOfflineValidate(name, dirs);
  try {
    store.validate();
  } finally {
    cleanupOffline();
  }
}
/**
 * Compact an offline disk store in place, then close it.
 *
 * @param upgradeVersionOnly when true only the version is upgraded
 * @param maxOplogSize max oplog size to use while compacting, or -1 to
 *          keep the store's configured size
 * @return the (already closed) DiskStoreImpl that performed the compaction
 */
public static DiskStoreImpl offlineCompact(String name, File[] dirs,
    boolean upgradeVersionOnly, long maxOplogSize) throws Exception {
  try {
    // give the embedding product a chance to set up offline statics
    final GemFireCacheImpl.StaticSystemCallbacks sysCb =
        GemFireCacheImpl.FactoryStatics.systemCallbacks;
    if (sysCb != null) {
      sysCb.initializeForOffline();
    }
    final DiskStoreImpl store = createForOffline(name, dirs, true, false,
        upgradeVersionOnly, maxOplogSize, true);
    store.offlineCompact();
    store.close();
    return store;
  } finally {
    cleanupOffline();
  }
}
/**
 * Command line entry point: the first argument is the disk store name,
 * any remaining arguments are the directories containing its files.
 * Runs an offline compaction with a 1024 max oplog size.
 */
public static void main(String args[]) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: diskStoreName [dirs]");
    return;
  }
  final String dsName = args[0];
  File[] dirs = null;
  if (args.length > 1) {
    dirs = new File[args.length - 1];
    for (int i = 0; i < dirs.length; i++) {
      dirs[i] = new File(args[i + 1]);
    }
  }
  offlineCompact(dsName, dirs, false, 1024);
}
/** Returns true if this disk store has written at least one oplog. */
public boolean hasPersistedData() {
  return (persistentOplogs.getChild() != null);
}

/** Returns the unique id of this disk store as a JDK UUID. */
public UUID getDiskStoreUUID() {
  return this.diskStoreID.toUUID();
}

/** Returns the unique id of this disk store. */
public DiskStoreID getDiskStoreID() {
  return this.diskStoreID;
}

void setDiskStoreID(DiskStoreID diskStoreID) {
  this.diskStoreID = diskStoreID;
}

/** Returns the file backing this disk store's init file. */
File getInitFile() {
  return getDiskInitFile().getIFFile();
}

/** Returns true if the linked-list form of live entries is required. */
public boolean needsLinkedList() {
  // note: keep evaluation order (compaction check first)
  return isCompactionPossible() || couldHaveKrf();
}

/**
 * @return true if KRF files are used on this disk store's oplogs
 */
boolean couldHaveKrf() {
  return !isOffline();
}

@Override
public String toString() {
  return "DiskStore[" + this.name + "]";
}
/**
 * Background task that rebuilds/loads this disk store's sorted indexes from
 * its oplogs (using existing irf index files where valid, otherwise from a
 * full value scan) and then marks index recovery as done. Any failure is
 * recorded in {@code indexRecoveryFailure} for
 * {@code checkIndexRecoveryFailure()} to rethrow in waiting threads.
 */
private class IndexRecoveryTask implements Runnable {

  // the oplogs whose entries are scanned for index recovery
  private final Set allOplogs;

  public IndexRecoveryTask(Set allOplogs) {
    this.allOplogs = allOplogs;
  }

  @Override
  public void run() {
    // clear any failure recorded by a previous attempt
    indexRecoveryFailure.set(null);
    final GemFireCacheImpl.StaticSystemCallbacks cb = GemFireCacheImpl
        .getInternalProductCallbacks();
    // wait for async recovery if required
    Set indexes = null;
    final DiskStoreImpl dsi = DiskStoreImpl.this;
    if (cb != null) {
      cb.waitForAsyncIndexRecovery(dsi);
      indexes = cb.getAllLocalIndexes(dsi);
    }
    // need to recover indexes if index recovery map is non-null
    if (indexes != null && !indexes.isEmpty()) {
      try {
        // if there are newly created indexes then do populate all indexes
        // from full values since we have to read full values in any case
        @SuppressWarnings("unchecked")
        final Map allIndexes =
            new THashMap();
        @SuppressWarnings("unchecked")
        final Set newIndexes = new THashSet(4);
        for (SortedIndexContainer index : indexes) {
          // an index whose id was never persisted is new and must be
          // freshly populated
          if (!recoveredIndexIds.contains(index.getUUID())) {
            if (TEST_NEW_CONTAINER) {
              if (TEST_NEW_CONTAINER_LIST == null) {
                TEST_NEW_CONTAINER_LIST = new ArrayList();
              }
              TEST_NEW_CONTAINER_LIST.add(index);
            }
            newIndexes.add(index);
          }
          // one parallel recovery job per index
          allIndexes.put(index, new SortedIndexRecoveryJob(dsi.getCache(),
              dsi, dsi.getCancelCriterion(), index));
        }
        for (Oplog oplog : this.allOplogs) {
          // check if we have started closing
          getCancelCriterion().checkCancelInProgress(null);
          // recover for indexes if there was no krf (hence value
          // recovery already done inline)
          // fallback to full recovery if failed to recover from *irf
          File indexFile = oplog.getIndexFileIfValid();
          boolean hasKrf = !oplog.needsKrf();
          if (!hasKrf || !newIndexes.isEmpty() || indexFile == null) {
            // for missing krf case, the irf will be created in createKrf
            if (persistIndexes && hasKrf) {
              Collection targetRegions = oplog
                  .getTargetRegionsForIndexes(indexes);
              List sortedLiveEntries = oplog
                  .getSortedLiveEntries(targetRegions);
              if (indexFile == null) {
                // create the full irf files
                oplog.writeIRF(sortedLiveEntries, null, indexes, allIndexes);
                // check if we have started closing
                getCancelCriterion().checkCancelInProgress(null);
                getDiskInitFile().irfCreate(oplog.oplogId);
              }
              else {
                // append to IRF for new indexes only but load all indexes
                oplog.writeIRF(sortedLiveEntries, null, newIndexes,
                    allIndexes);
              }
            }
            else {
              oplog.recoverIndexes(allIndexes);
            }
          }
          else {
            // krf present and irf valid: load indexes from the irf
            oplog.getOplogIndex().recoverIndexes(allIndexes);
          }
        }
        // submit last jobs for all in parallel and then wait for all
        Collection allJobs = allIndexes.values();
        for (SortedIndexRecoveryJob indexRecoveryJob : allJobs) {
          indexRecoveryJob.submitLastJob();
        }
        for (SortedIndexRecoveryJob indexRecoveryJob : allJobs) {
          indexRecoveryJob.waitForJobs(0);
        }
        if (!newIndexes.isEmpty()) {
          // persist the ids of freshly populated indexes so they are not
          // treated as new on the next recovery
          for (SortedIndexContainer index : newIndexes) {
            writeIndexCreate(index.getUUID());
          }
        }
      } catch (IOException ioe) {
        // record only the first failure; waiters rethrow it
        indexRecoveryFailure
            .compareAndSet(null, new DiskAccessException(ioe));
      } catch (RuntimeException re) {
        indexRecoveryFailure.compareAndSet(null, re);
      } catch (Error err) {
        if (SystemFailure.isJVMFailureError(err)) {
          SystemFailure.initiateFailure(err);
          // If this ever returns, rethrow the error. We're poisoned
          // now, so don't let this thread continue.
          throw err;
        }
        // Whenever you catch Error or Throwable, you must also
        // check for fatal JVM error (see above). However, there is
        // _still_ a possibility that you are dealing with a cascading
        // error condition, so you also need to check to see if the JVM
        // is still usable:
        SystemFailure.checkFailure();
        indexRecoveryFailure.compareAndSet(null, err);
      } finally {
        // always release per-oplog recovery maps and wake any waiters
        for (Oplog oplog : this.allOplogs) {
          oplog.clearInitRecoveryMap();
        }
        markIndexRecoveryDone();
      }
    }
    else {
      // nothing to recover; still release recovery maps and wake waiters
      for (Oplog oplog : this.allOplogs) {
        oplog.clearInitRecoveryMap();
      }
      markIndexRecoveryDone();
    }
  }
}
/**
 * Background task that recovers entry values from the given oplogs for the
 * stores it was handed, after index recovery completes. Regions flagged for
 * deferral during the first pass are retried in a second pass (used by
 * GemFireXD to wait for DDL replay first). Progress is published through
 * {@code currentAsyncValueRecoveryMap}, whose monitor is used to notify
 * {@code waitForAsyncRecovery()} callers.
 */
private class ValueRecoveryTask implements Runnable {

  // the oplogs to read values from
  private final Set oplogSet;

  // defensive snapshot of the stores this task is responsible for
  private final Map recoveredStores;

  public ValueRecoveryTask(Set oplogSet,
      Map recoveredStores) {
    this.oplogSet = oplogSet;
    this.recoveredStores = new HashMap(
        recoveredStores);
  }

  public void run() {
    // store any regions whose initializations have to be deferred
    final HashMap deferredRegions =
        new HashMap();
    synchronized (asyncValueRecoveryLock) {
      try {
        // wait for index recovery to complete first to avoid interference
        waitForIndexRecoveryEnd(-1);
        DiskStoreObserver.startAsyncValueRecovery(DiskStoreImpl.this);
        // defer regions marked in first pass
        for (Oplog oplog : oplogSet) {
          oplog.recoverValuesIfNeeded(currentAsyncValueRecoveryMap,
              deferredRegions, currentAsyncValueRecoveryMap);
        }
      } catch (CancelException ignore) {
        // do nothing
      } finally {
        synchronized (currentAsyncValueRecoveryMap) {
          // the stores handled by this task are no longer pending
          currentAsyncValueRecoveryMap.keySet().removeAll(
              recoveredStores.keySet());
          if (deferredRegions.size() > 0) {
            // put deferred regions back so waiters keep waiting for them
            currentAsyncValueRecoveryMap.putAll(deferredRegions);
            if (logger.fineEnabled()) {
              logger.fine("DiskStoreImpl: deferred recovery stores: "
                  + currentAsyncValueRecoveryMap.values());
            }
          }
          else {
            DiskStoreObserver.endAsyncValueRecovery(DiskStoreImpl.this);
          }
          // wake up waitForAsyncRecovery() callers
          currentAsyncValueRecoveryMap.notifyAll();
        }
      }
      if (deferredRegions.size() > 0) {
        // second round to recover the deferred regions/TXStates, but first
        // do any initialization (used by GemFireXD to wait for DDL replay);
        // break the wait if new tasks have been added to the recovery
        // list for recovery to avoid blocking them (#43048)
        try {
          final GemFireCacheImpl.StaticSystemCallbacks cb = GemFireCacheImpl
              .getInternalProductCallbacks();
          final long waitMillis = 100L;
          if (cb != null) {
            while (!cb.initializeForDeferredRegionsRecovery(waitMillis)) {
              synchronized (currentAsyncValueRecoveryMap) {
                // newer recovery requests arrived; bail out so they are
                // not blocked behind this deferred pass (#43048)
                if (currentAsyncValueRecoveryMap.size() > deferredRegions
                    .size()) {
                  DiskStoreObserver.endAsyncValueRecovery(DiskStoreImpl.this);
                  return;
                }
              }
            }
          }
          for (Oplog oplog : oplogSet) {
            oplog.recoverValuesIfNeeded(deferredRegions, null,
                currentAsyncValueRecoveryMap);
          }
        } catch (CancelException ignore) {
          // do nothing
        } finally {
          synchronized (currentAsyncValueRecoveryMap) {
            currentAsyncValueRecoveryMap.keySet().removeAll(
                recoveredStores.keySet());
            currentAsyncValueRecoveryMap.notifyAll();
          }
          DiskStoreObserver.endAsyncValueRecovery(DiskStoreImpl.this);
        }
      }
    }
  }
}
/**
 * Block until async value recovery for the given disk region has completed
 * (its id is removed from currentAsyncValueRecoveryMap) or this disk store
 * starts closing. A pending interrupt is remembered and re-asserted on
 * return so the caller can observe it.
 */
public void waitForAsyncRecovery(DiskRegion diskRegion) {
  synchronized (currentAsyncValueRecoveryMap) {
    boolean interrupted = false;
    // bounded wait so isClosing() is rechecked periodically even if a
    // notifyAll is missed
    while (!isClosing()
        && currentAsyncValueRecoveryMap.containsKey(diskRegion.getId())) {
      try {
        currentAsyncValueRecoveryMap.wait(500);
      } catch (InterruptedException e) {
        interrupted = true;
      }
    }
    if (interrupted) {
      // restore the interrupt status for the caller
      Thread.currentThread().interrupt();
    }
  }
}
/**
 * Set to Boolean.TRUE in threads that are executing a background disk
 * store task (see executeDiskStoreTask, bug 42775). Typed as
 * ThreadLocal&lt;Boolean&gt; (was a raw ThreadLocal) so no casts are needed.
 */
private static final ThreadLocal<Boolean> backgroundTaskThread = new ThreadLocal<Boolean>();

/**
 * Returns true if the calling thread has been marked as a background disk
 * store task thread; an unset thread-local (null) counts as false.
 */
private static boolean isBackgroundTaskThread() {
  return Boolean.TRUE.equals(backgroundTaskThread.get());
}

/** Marks the calling thread as a background disk store task thread. */
private static void markBackgroundTaskThread() {
  backgroundTaskThread.set(Boolean.TRUE);
}
/**
 * Execute a task which must be performed asynchronously, but has no
 * requirement for timely execution. This task pool is used for compactions,
 * creating KRFs, etc. So some of the queued tasks may take a while.
 *
 * @return true if the task was accepted by the disk store task pool
 */
public boolean executeDiskStoreTask(final Runnable runnable) {
  return executeDiskStoreTask(runnable, this.diskStoreTaskPool) != null;
}
/**
 * Execute a task asynchronously, or in the calling thread if the bound
 * is reached. This pool is used for write operations which can be delayed,
 * but we have a limit on how many write operations we delay so that
 * we don't run out of disk space. Used for deletes, unpreblow, RAF close, etc.
 *
 * @return true if a Future was obtained for the task
 */
public boolean executeDelayedExpensiveWrite(Runnable task) {
  Future f = executeDiskStoreTask(task, this.delayedWritePool);
  // remember the latest submission so waitForDelayedWrites() can wait on
  // it; NOTE(review): this write is not synchronized -- presumably callers
  // tolerate reading a slightly stale Future -- confirm
  lastDelayedWrite = f;
  return f != null;
}
/**
 * Wait for any current operations in the delayed write pool. Completion
 * of this method ensures that the writes have completed or the pool was
 * shutdown.
 */
protected void waitForDelayedWrites() {
  Future lastWriteTask = lastDelayedWrite;
  if (lastWriteTask != null) {
    try {
      lastWriteTask.get();
    } catch (InterruptedException e) {
      // stop waiting but preserve the interrupt status for the caller
      Thread.currentThread().interrupt();
    } catch (Exception e) {
      // do nothing, an exception from the write task was already logged.
    }
  }
}
/**
 * Submit the runnable to the given pool wrapped in a DiskStoreTask that
 * maintains the background-task count and marks its executing thread as a
 * background task thread.
 *
 * @return the Future for the submitted task, or null if the pool rejected
 *         it (the background-task count is rolled back in that case)
 */
private Future executeDiskStoreTask(final Runnable runnable, ThreadPoolExecutor executor) {
  // schedule another thread to do it
  incBackgroundTasks();
  Future result = executeDiskStoreTask(new DiskStoreTask() {
    public void run() {
      try {
        markBackgroundTaskThread(); // for bug 42775
        //getCache().getCachePerfStats().decDiskTasksWaiting();
        runnable.run();
      } finally {
        // always balance incBackgroundTasks() above
        decBackgroundTasks();
      }
    }
    public void taskCancelled() {
      // task never ran; still balance the count
      decBackgroundTasks();
    }
  }, executor);
  if (result == null) {
    // submission rejected: neither run() nor taskCancelled() will fire
    decBackgroundTasks();
  }
  return result;
}
/**
 * Submit the task to the given pool, translating a
 * RejectedExecutionException (pool shutting down) into a null return.
 *
 * @return the Future for the submitted task, or null if it was rejected
 */
private Future executeDiskStoreTask(DiskStoreTask r, ThreadPoolExecutor executor) {
  try {
    return executor.submit(r);
  } catch (RejectedExecutionException ex) {
    if (this.logger.fineEnabled()) {
      this.logger.fine("Ignored compact schedule during shutdown", ex);
    }
  }
  return null;
}
/**
 * Stop both task pools: the disk store task pool is shut down immediately
 * via shutdownPool(), while the delayed write pool is given up to one
 * second to drain before this method returns.
 */
private void stopDiskStoreTaskPool() {
  if (this.logger.infoEnabled()) {
    this.logger.convertToLogWriter().info("Stopping DiskStoreTaskPool");
  }
  shutdownPool(diskStoreTaskPool);
  //Allow the delayed writes to complete
  delayedWritePool.shutdown();
  try {
    delayedWritePool.awaitTermination(1, TimeUnit.SECONDS);
  } catch (InterruptedException e) {
    // give up waiting but preserve the interrupt status for the caller
    Thread.currentThread().interrupt();
  }
}
/**
 * Shut the given pool down immediately and invoke
 * {@code DiskStoreTask.taskCancelled()} on any queued DiskStoreTask that
 * never got to run, so its cleanup (e.g. background-task accounting) still
 * happens.
 */
private void shutdownPool(ThreadPoolExecutor pool) {
  // All the regions have already been closed
  // so this pool shouldn't be doing anything.
  List<Runnable> tasks = pool.shutdownNow();
  for (Runnable runnable : tasks) {
    // BUGFIX: the original tested/cast the List itself ("l") instead of
    // each queued task, so taskCancelled() was never invoked
    if (runnable instanceof DiskStoreTask) {
      ((DiskStoreTask) runnable).taskCancelled();
    }
  }
}
/**
 * Persist the garbage-collection RegionVersionVector of the given region to
 * the disk region's current oplog, recording for each member only GC
 * versions already dominated by what has been persisted. No-op when
 * concurrency checks are disabled.
 *
 * NOTE(review): unlike writeRVV(), a null region is not tolerated here --
 * region.getVersionVector() below would NPE; presumably callers always pass
 * a non-null region -- confirm.
 */
public void writeRVVGC(DiskRegion dr, LocalRegion region) {
  if (region != null && !region.getConcurrencyChecksEnabled()) {
    return;
  }
  acquireReadLock(dr);
  try {
    if (dr.isRegionClosed()) {
      dr.getCancelCriterion().checkCancelInProgress(null);
      throw new RegionDestroyedException(
          LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
              .toLocalizedString(), dr.getName());
    }
    // Update on the on disk region version vector.
    // TODO - RVV - For async regions, it's possible that
    // the on disk RVV is actually less than the GC RVV we're trying record
    // it might make sense to push the RVV through the async queue?
    // What we're doing here is only recording the GC RVV if it is dominated
    // by the RVV of what we have persisted.
    RegionVersionVector inMemoryRVV = region.getVersionVector();
    RegionVersionVector diskRVV = dr.getRegionVersionVector();
    // Update the GC version for each member in our on disk version map
    updateDiskGCRVV(diskRVV, inMemoryRVV, diskRVV.getOwnerId());
    for (VersionSource member : (Collection) inMemoryRVV
        .getMemberToGCVersion().keySet()) {
      updateDiskGCRVV(diskRVV, inMemoryRVV, member);
    }
    // Remove any exceptions from the disk RVV that are are dominated
    // by the GC RVV.
    diskRVV.pruneOldExceptions();
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    // persist the new GC RVV information for this region to the DRF
    oplogSet.getChild().writeGCRVV(dr);
  } finally {
    releaseReadLock(dr);
  }
}
/**
 * Persist the RegionVersionVector of the given region to the disk region's
 * current oplog. No-op when concurrency checks are disabled. A null region
 * is tolerated (a null in-memory RVV is then written).
 *
 * @param isRVVTrusted whether the written RVV should be marked trusted;
 *          per the comment below, the current dr.rvvTrust is used --
 *          presumably when this is null -- confirm
 */
public void writeRVV(DiskRegion dr, LocalRegion region, Boolean isRVVTrusted) {
  if (region != null && !region.getConcurrencyChecksEnabled()) {
    return;
  }
  acquireReadLock(dr);
  try {
    if (dr.isRegionClosed()) {
      dr.getCancelCriterion().checkCancelInProgress(null);
      throw new RegionDestroyedException(
          LocalizedStrings.DiskRegion_THE_DISKREGION_HAS_BEEN_CLOSED_OR_DESTROYED
              .toLocalizedString(), dr.getName());
    }
    RegionVersionVector inMemoryRVV = (region==null)?null:region.getVersionVector();
    // persist the new GC RVV information for this region to the CRF
    PersistentOplogSet oplogSet = getPersistentOplogSet(dr);
    // use current dr.rvvTrust
    oplogSet.getChild().writeRVV(dr, inMemoryRVV, isRVVTrusted);
  } finally {
    releaseReadLock(dr);
  }
}
/**
 * Update the on disk GC version for the given member, only if the disk has
 * actually recorded all of the updates including that member.
 *
 * @param diskRVV
 *          the RVV for what has been persisted
 * @param inMemoryRVV
 *          the RVV of what is in memory
 * @param member
 *          The member we're trying to update
 */
private void updateDiskGCRVV(RegionVersionVector diskRVV,
    RegionVersionVector inMemoryRVV, VersionSource member) {
  final long persistedVersion = diskRVV.getVersionForMember(member);
  final long gcVersion = inMemoryRVV.getGCVersion(member);
  // skip recording when the GC version runs ahead of what has actually
  // been persisted for this member
  if (gcVersion > persistedVersion) {
    return;
  }
  diskRVV.recordGCVersion(member, gcVersion);
}
/** Returns the product version recovered from this store's init file. */
public final Version getRecoveredGFVersion() {
  return getRecoveredGFVersion(this.initFile);
}

/** Returns the product version recovered from the given init file. */
final Version getRecoveredGFVersion(DiskInitFile initFile) {
  return initFile.currentRecoveredGFVersion();
}
/** Returns the directory holders this disk store writes to. */
public DirectoryHolder[] getDirectories() {
  return this.directories;
}

/** Forward an update of the given disk region to its owning oplog set. */
public void updateDiskRegion(AbstractDiskRegion dr) {
  getPersistentOplogSet(dr).updateDiskRegion(dr);
}
/** Record creation of the index with the given id in the init file. */
public void writeIndexCreate(String indexId) {
  this.initFile.indexCreate(indexId);
}

/** Record deletion of the index with the given id in the init file. */
public void writeIndexDelete(String indexId) {
  this.initFile.indexDelete(indexId);
}
/**
 * Atomically move index recovery from UNINIT to INIT.
 *
 * @return true if this call performed the transition, false if recovery
 *         was already scheduled or done
 */
public boolean markIndexRecoveryScheduled() {
  synchronized (this.indexRecoveryState) {
    if (this.indexRecoveryState[0] != INDEXRECOVERY_UNINIT) {
      return false;
    }
    markIndexRecovery(INDEXRECOVERY_INIT);
    return true;
  }
}
/** Mark index recovery as finished and wake up any waiters. */
public void markIndexRecoveryDone() {
  synchronized (this.indexRecoveryState) {
    markIndexRecovery(INDEXRECOVERY_DONE);
  }
}

/** should be invoked under synchronized (this.indexRecoveryState) */
private void markIndexRecovery(int state) {
  assert Thread.holdsLock(this.indexRecoveryState);
  if (logger.fineEnabled()) {
    logger.fine("DSI: marking indexRecovery=" + state + " for: "
        + toString());
  }
  // publish the new state and wake every thread blocked in
  // waitForIndexRecovery()
  this.indexRecoveryState[0] = state;
  this.indexRecoveryState.notifyAll();
}
/**
 * Wait until index recovery reaches the DONE state.
 *
 * @param waitMillis maximum time to wait, or negative to wait forever
 * @return true if recovery finished (or the store is closing)
 */
public boolean waitForIndexRecoveryEnd(long waitMillis) {
  return waitForIndexRecovery(INDEXRECOVERY_DONE, waitMillis);
}
/**
 * Wait until the index recovery state reaches {@code expected} or the store
 * starts closing, for at most {@code waitMillis} milliseconds (negative
 * means wait forever). Rethrows any failure recorded by the recovery task
 * (via checkIndexRecoveryFailure) and honors cancellation.
 *
 * @return true when no timeout occurred; on timeout, whether the expected
 *         state was reached or the store is closing
 */
private boolean waitForIndexRecovery(int expected, long waitMillis) {
  long endMillis;
  if (waitMillis < 0) {
    // negative means wait "forever"
    endMillis = waitMillis = Long.MAX_VALUE;
  }
  else {
    long currentTime = System.currentTimeMillis();
    endMillis = currentTime + waitMillis;
    if (endMillis < currentTime) {
      // addition overflowed: clamp to "forever"
      endMillis = Long.MAX_VALUE;
    }
  }
  // wake up at least once a second to recheck cancellation and failure
  final long loopMillis = Math.min(1000L, waitMillis);
  synchronized (this.indexRecoveryState) {
    while (this.indexRecoveryState[0] < expected && !isClosing()) {
      Throwable t = null;
      try {
        if (logger.fineEnabled()) {
          logger.fine("DSI: waiting for indexRecovery=" + expected + " for: "
              + toString());
        }
        this.indexRecoveryState.wait(loopMillis);
      } catch (InterruptedException ie) {
        // re-assert the interrupt; checkCancelInProgress below decides
        // how to surface it
        Thread.currentThread().interrupt();
        t = ie;
      }
      getCancelCriterion().checkCancelInProgress(t);
      checkIndexRecoveryFailure();
      if (System.currentTimeMillis() >= endMillis) {
        return (this.indexRecoveryState[0] >= expected || isClosing());
      }
    }
  }
  checkIndexRecoveryFailure();
  return true;
}
/**
 * Rethrow any failure recorded by the index recovery task:
 * RuntimeExceptions and Errors as-is, anything else wrapped in an
 * IndexMaintenanceException.
 */
private void checkIndexRecoveryFailure() {
  final Throwable t = this.indexRecoveryFailure.get();
  if (t == null) {
    return;
  }
  if (t instanceof RuntimeException) {
    throw (RuntimeException)t;
  }
  if (t instanceof Error) {
    throw (Error)t;
  }
  throw new IndexMaintenanceException(t);
}
/**
 * {@inheritDoc}
 *
 * Caps the async queue at its current size while memory is critical
 * (unless the cache is closing) or under eviction pressure, and restores
 * the configured capacity otherwise.
 */
@Override
public void onEvent(MemoryEvent event) {
  final MemoryState memoryState = event.getState();
  if (this.logger.fineEnabled()) {
    this.logger.fine("DiskStoreImpl " + getName()
        + ": received memory event " + event + " with queueSize="
        + this.asyncQueue.size());
  }
  if (memoryState.isCritical()) {
    if (!this.cache.isClosing) {
      setAsyncQueueCapacityToCurrent();
    }
  } else if (memoryState.isEviction()) {
    setAsyncQueueCapacityToCurrent();
  } else {
    resetAsyncQueueCapacity();
  }
}
}