All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.gemstone.gemfire.internal.cache.Oplog Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2010-2015 Pivotal Software, Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License. See accompanying
 * LICENSE file.
 */
package com.gemstone.gemfire.internal.cache;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.RandomAccessFile;
import java.io.SyncFailedException;
import java.nio.ByteBuffer;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;

import com.gemstone.gemfire.CancelException;
import com.gemstone.gemfire.DataSerializer;
import com.gemstone.gemfire.SerializationException;
import com.gemstone.gemfire.cache.CacheClosedException;
import com.gemstone.gemfire.cache.CacheWriterException;
import com.gemstone.gemfire.cache.DiskAccessException;
import com.gemstone.gemfire.cache.EntryDestroyedException;
import com.gemstone.gemfire.cache.EntryEvent;
import com.gemstone.gemfire.cache.EntryNotFoundException;
import com.gemstone.gemfire.cache.RegionDestroyedException;
import com.gemstone.gemfire.cache.TimeoutException;
import com.gemstone.gemfire.distributed.OplogCancelledException;
import com.gemstone.gemfire.distributed.internal.DM;
import com.gemstone.gemfire.i18n.LogWriterI18n;
import com.gemstone.gemfire.internal.Assert;
import com.gemstone.gemfire.internal.ByteArrayDataInput;
import com.gemstone.gemfire.internal.HeapDataOutputStream;
import com.gemstone.gemfire.internal.InsufficientDiskSpaceException;
import com.gemstone.gemfire.internal.InternalDataSerializer;
import com.gemstone.gemfire.internal.InternalStatisticsDisabledException;
import com.gemstone.gemfire.internal.Sendable;
import com.gemstone.gemfire.internal.cache.DiskInitFile.DiskRegionFlag;
import com.gemstone.gemfire.internal.cache.DiskStoreImpl.OplogCompactor;
import com.gemstone.gemfire.internal.cache.DiskStoreImpl.OplogEntryIdSet;
import com.gemstone.gemfire.internal.cache.DistributedRegion.DiskPosition;
import com.gemstone.gemfire.internal.cache.GemFireCacheImpl.StaticSystemCallbacks;
import com.gemstone.gemfire.internal.cache.OplogIndex.IndexData;
import com.gemstone.gemfire.internal.cache.delta.Delta;
import com.gemstone.gemfire.internal.cache.locks.LockMode;
import com.gemstone.gemfire.internal.cache.locks.LockingPolicy;
import com.gemstone.gemfire.internal.cache.lru.EnableLRU;
import com.gemstone.gemfire.internal.cache.persistence.BytesAndBits;
import com.gemstone.gemfire.internal.cache.persistence.DiskRecoveryStore;
import com.gemstone.gemfire.internal.cache.persistence.DiskRegionView;
import com.gemstone.gemfire.internal.cache.persistence.DiskStoreID;
import com.gemstone.gemfire.internal.cache.versions.CompactVersionHolder;
import com.gemstone.gemfire.internal.cache.versions.RegionVersionHolder;
import com.gemstone.gemfire.internal.cache.versions.RegionVersionVector;
import com.gemstone.gemfire.internal.cache.versions.VersionHolder;
import com.gemstone.gemfire.internal.cache.versions.VersionSource;
import com.gemstone.gemfire.internal.cache.versions.VersionStamp;
import com.gemstone.gemfire.internal.cache.versions.VersionTag;
import com.gemstone.gemfire.internal.concurrent.ConcurrentTHashSet;
import com.gemstone.gemfire.internal.i18n.LocalizedStrings;
import com.gemstone.gemfire.internal.offheap.OffHeapHelper;
import com.gemstone.gemfire.internal.offheap.SimpleMemoryAllocatorImpl;
import com.gemstone.gemfire.internal.offheap.StoredObject;
import com.gemstone.gemfire.internal.offheap.annotations.Released;
import com.gemstone.gemfire.internal.offheap.annotations.Retained;
import com.gemstone.gemfire.internal.sequencelog.EntryLogger;
import com.gemstone.gemfire.internal.shared.NativeCalls;
import com.gemstone.gemfire.internal.shared.UnsupportedGFXDVersionException;
import com.gemstone.gemfire.internal.shared.Version;
import com.gemstone.gemfire.internal.shared.unsafe.ChannelBufferUnsafeDataInputStream;
import com.gemstone.gemfire.internal.util.IOUtils;
import com.gemstone.gemfire.internal.util.TransformUtils;
import com.gemstone.gemfire.pdx.internal.PdxWriterImpl;
import com.gemstone.gnu.trove.THashSet;
import com.gemstone.gnu.trove.TLongHashSet;

/**
 * Implements an operation log to write to disk.
 * As of prPersistSprint2 this file only supports persistent regions.
 * For overflow only regions see {@link OverflowOplog}.
 * 
 * @author Darrel Schneider
 * @author Mitul Bid
 * @author Asif
 * 
 * @since 5.1
 */

public final class Oplog implements CompactableOplog {

  /** Extension of the oplog file * */
  public static final String CRF_FILE_EXT = ".crf";
  public static final String DRF_FILE_EXT = ".drf";
  public static final String KRF_FILE_EXT = ".krf";
  public static final Pattern IDX_PATTERN = Pattern.compile(".*\\.([0-9]+)\\.idxkrf");
  
  /** The file which will be created on disk * */
  private File diskFile;

  /** boolean marked true when this oplog is closed * */
  private volatile boolean closed;

  private final OplogFile crf = new OplogFile();
  private final OplogFile drf = new OplogFile();
  private final KRFile krf = new KRFile();
  private final OplogIndex idxkrf;
  final ConcurrentTHashSet indexesWritten =
      new ConcurrentTHashSet(2);

  /** preallocated space available for writing to* */
  // volatile private long opLogSpace = 0L;
  /** The stats for this store */
  private final DiskStoreStats stats;

  /** The store that owns this Oplog* */
  private final DiskStoreImpl parent;
  
  /**
   * The oplog set this oplog is part of 
   */
  private final PersistentOplogSet oplogSet;

  /** oplog id * */
  protected final long oplogId;
  
  /** recovered gemfire version * */
  protected Version gfversion;

  /**
   * Recovered version of the data. Usually this is same as {@link #gfversion}
   * except for the case of upgrading disk store from previous version in which
   * case the keys/values are carried forward as is and need to be interpreted
   * in load by latest product code if required.
   */
  protected Version dataVersion;

  /** Directory in which the file is present* */
  private DirectoryHolder dirHolder;

  /** The max Oplog size (user configurable) * */
  private final long maxOplogSize;
  private long maxCrfSize;
  private long maxDrfSize;

  private final AtomicBoolean hasDeletes = new AtomicBoolean();

  private boolean firstRecord = true;

  /**
   * The HighWaterMark of recentValues.
   */
  private final AtomicLong totalCount = new AtomicLong(0);
  /**
   * The number of records in this oplog that contain the most recent
   * value of the entry.
   */
  private final AtomicLong totalLiveCount = new AtomicLong(0);

  private final ConcurrentMap regionMap
    = new ConcurrentHashMap();
  
  /**
   * Set to true once compact is called on this oplog.
   * @since prPersistSprint1
   */
  private volatile boolean compacting = false;

  /**
   * Set to true after the first drf recovery.
   */
  private boolean haveRecoveredDrf = true;
  /**
   * Set to true after the first crf recovery.
   */
  private boolean haveRecoveredCrf = true;
  private OpState opState;
  
  /** OPCODES - byte appended before being written to disk* */

  /**
   * Written to CRF, and DRF.
   */
  private static final byte OPLOG_EOF_ID = 0;
  private static final byte END_OF_RECORD_ID = 21;

  /**
   * Written to CRF and DRF.
   * Followed by 16 bytes which is the leastSigBits and mostSigBits of a UUID
   * for the disk store we belong to.
   * 1: EndOfRecord
   * Is written once at the beginning of every oplog file.
   */
  private static final byte OPLOG_DISK_STORE_ID = 62;
  static final int OPLOG_DISK_STORE_REC_SIZE = 1+16+1;

  /**
   * Written to CRF.
   * Followed by 8 bytes which is the BASE_ID to use for any NEW_ENTRY records.
   * 1: EndOfRecord
   * Only needs to be written once per oplog and must preceed any OPLOG_NEW_ENTRY_0ID records.
   * @since prPersistSprint1
   */
  private static final byte OPLOG_NEW_ENTRY_BASE_ID = 63;
  static final int OPLOG_NEW_ENTRY_BASE_REC_SIZE = 1+8+1;
  /**
   * Written to CRF.
   * The OplogEntryId is +1 the previous new_entry OplogEntryId.
   * Byte Format:
   * 1: userBits
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_NEW_ENTRY_0ID = 64;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 1 byte.
   * Byte Format:
   * 1: userBits
   * 1: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_1ID = 65;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 2 bytes.
   * Byte Format:
   * 1: userBits
   * 2: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_2ID = 66;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 3 bytes.
   * Byte Format:
   * 1: userBits
   * 3: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_3ID = 67;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 4 bytes.
   * Byte Format:
   * 1: userBits
   * 4: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_4ID = 68;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 5 bytes.
   * Byte Format:
   * 1: userBits
   * 5: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_5ID = 69;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 6 bytes.
   * Byte Format:
   * 1: userBits
   * 6: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_6ID = 70;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 7 bytes.
   * Byte Format:
   * 1: userBits
   * 7: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_7ID = 71;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 8 bytes.
   * Byte Format:
   * 1: userBits
   * 8: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_8ID = 72;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 1 byte.
   * Byte Format:
   * 1: userBits
   * 1: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_1ID = 73;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 2 bytes.
   * Byte Format:
   * 1: userBits
   * 2: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_2ID = 74;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 3 bytes.
   * Byte Format:
   * 1: userBits
   * 3: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_3ID = 75;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 4 bytes.
   * Byte Format:
   * 1: userBits
   * 4: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_4ID = 76;

  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 5 bytes.
   * Byte Format:
   * 1: userBits
   * 5: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_5ID = 77;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 6 bytes.
   * Byte Format:
   * 1: userBits
   * 6: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_6ID = 78;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 7 bytes.
   * Byte Format:
   * 1: userBits
   * 7: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_7ID = 79;
  /**
   * Written to CRF.
   * The OplogEntryId is relative to the previous mod_entry OplogEntryId.
   * The signed difference is encoded in 8 bytes.
   * Byte Format:
   * 1: userBits
   * 8: OplogEntryId
   * RegionId
   * 4: valueLength (optional depending on bits)
   * valueLength: value bytes (optional depending on bits)
   * 4: keyLength
   * keyLength: key bytes
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_MOD_ENTRY_WITH_KEY_8ID = 80;

  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 1 byte.
   * Byte Format:
   * 1: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_1ID = 81;
  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 2 bytes.
   * Byte Format:
   * 2: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_2ID = 82;

  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 3 bytes.
   * Byte Format:
   * 3: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_3ID = 83;

  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 4 bytes.
   * Byte Format:
   * 4: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_4ID = 84;

  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 5 bytes.
   * Byte Format:
   * 5: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_5ID = 85;
  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 6 bytes.
   * Byte Format:
   * 6: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_6ID = 86;
  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 7 bytes.
   * Byte Format:
   * 7: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_7ID = 87;
  /**
   * Written to DRF.
   * The OplogEntryId is relative to the previous del_entry OplogEntryId.
   * The signed difference is encoded in 8 bytes.
   * Byte Format:
   * 8: OplogEntryId
   * 1: EndOfRecord
   *
   * @since prPersistSprint1
   */
  private static final byte OPLOG_DEL_ENTRY_8ID = 88;

  /**
   * The maximum size of a DEL_ENTRY record in bytes.
   * Currenty this is 10; 1 for opcode and 8 for oplogEntryId and 1 for END_OF_RECORD_ID
   */
  private static final int MAX_DELETE_ENTRY_RECORD_BYTES = 1+8+1;
  
  /**
   * Written to beginning of each CRF. Contains the RVV for 
   * all regions in the CRF.
   * Byte Format
   * 8: number of regions (variable length encoded number)
   * for each region
   *   4: number of members (variable length encoded number)
   *   for each member
   *     4: canonical member id (variable length encoded number)
   *     8: version id (variable length encoded number)
   *     4: number of exceptions (variable length encoded number)
   *     variable: exceptions
   */
  private static final byte OPLOG_RVV = 89;

  /**
   * When detected conflict, besides persisting the golden copy by modify(),
   * also persist the conflict operation's region version and member id. and failedWritten to beginning of each CRF. Contains the RVV for 
   * all regions in the CRF.
   * Byte Format
   * regionId
   * versions
   */
  private static final byte OPLOG_CONFLICT_VERSION = 90;

  /**
   * persist Gemfire version string into crf, drf, krf
   * Byte Format
   * variable gemfire version string, such as 7.0.0.beta
   * EndOfRecord
   */
  private static final byte OPLOG_GEMFIRE_VERSION = 91;
  static final int OPLOG_GEMFIRE_VERSION_REC_SIZE = 1+3+1;

  /** Compact this oplogs or no. A client configurable property * */
  private final boolean compactOplogs;

  /**
   * Asif: This object is used to correctly identify the OpLog size so as to
   * cause a switch of oplogs
   */
  final Object lock = new Object();

  private boolean lockedForKRFcreate = false;
  
  /**
   * Set to true when this oplog will no longer be written to.
   * Never set to false once it becomes true.
   */
  private boolean doneAppending = false;

  protected final LogWriterI18n logger;

  static final int DEFAULT_BUFFER_SIZE = 32 * 1024;
  static final int LARGE_BUFFER_SIZE = 128 * 1024;

  // ///////////////////// Constructors ////////////////////////
  /**
   * Creates new Oplog for the given region.
   * 
   * @param oplogId
   *          int identifying the new oplog
   * @param dirHolder
   *          The directory in which to create new Oplog
   * 
   * @throws DiskAccessException
   *           if the disk files can not be initialized
   */
  Oplog(long oplogId, PersistentOplogSet parent, DirectoryHolder dirHolder) {
    if (oplogId > DiskId.MAX_OPLOG_ID) {
      throw new IllegalStateException("Too many oplogs. The oplog id can not exceed " + DiskId.MAX_OPLOG_ID);
    }
    this.oplogId = oplogId;
    this.oplogSet = parent;
    this.parent = parent.getParent();
    this.dirHolder = dirHolder;
    // Pretend we have already seen the first record.
    // This will cause a large initial record to force a switch
    // which allows the maxDirSize to be checked.
    this.firstRecord = false; 
    this.logger = getParent().getCache().getLoggerI18n();
    this.opState = new OpState();
    long maxOplogSizeParam = getParent().getMaxOplogSizeInBytes();
    long availableSpace = this.dirHolder.getAvailableSpace();
    if (availableSpace < maxOplogSizeParam) {
      if (DiskStoreImpl.PREALLOCATE_OPLOGS && !DiskStoreImpl.SET_IGNORE_PREALLOCATE) {
        throw new InsufficientDiskSpaceException(
            LocalizedStrings.Oplog_PreAllocate_Failure_Init.toLocalizedString(
                this.dirHolder, maxOplogSizeParam), new IOException(
                "not enough space left to create and pre grow oplog files, available="
                    + availableSpace + ", required=" + maxOplogSizeParam),
            getParent());
      }
      this.maxOplogSize = availableSpace;
      if (this.logger.warningEnabled()) {
        logger.warning(LocalizedStrings.DEBUG, "Reducing maxOplogSize to " + availableSpace + " because that is all the room remaining in the directory.");
      }
    } else {
      long diff = availableSpace - maxOplogSizeParam;
      long minRequired = DiskStoreImpl.MIN_DISK_SPACE_FOR_LOGS * 1024 * 1024;
      if (minRequired > diff) {
        if (DiskStoreImpl.PREALLOCATE_OPLOGS && !DiskStoreImpl.SET_IGNORE_PREALLOCATE) {
          throw new InsufficientDiskSpaceException(
              LocalizedStrings.Oplog_PreAllocate_Failure_Init.toLocalizedString(
                  this.dirHolder, maxOplogSizeParam), new IOException(
                  "not enough space left to create and pre grow oplog files, available="
                      + availableSpace + ", required=" + maxOplogSizeParam),
              getParent());
        }
      }
      this.maxOplogSize = maxOplogSizeParam;
    }
    setMaxCrfDrfSize();
    this.stats = getParent().getStats();
    this.compactOplogs = getParent().getAutoCompact();

    this.closed = false;
    String n = getParent().getName();
    this.diskFile = new File(this.dirHolder.getDir(),
                             oplogSet.getPrefix()
                             + n + "_" + oplogId);
    this.idxkrf = new OplogIndex(this);
    try {
      createDrf(null);
      createCrf(null);
      // open krf for offline compaction
      if (getParent().isOfflineCompacting()) {
        krfFileCreate();
      }
    }
    catch (Exception ex) {
      close();
      
      getParent().getCancelCriterion().checkCancelInProgress(ex);
      if (ex instanceof DiskAccessException) {
        throw (DiskAccessException) ex;
      }
      throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_CREATING_OPERATION_LOG_BECAUSE_0.toLocalizedString(ex), getParent());
    }
  }

  /**
   * Asif: A copy constructor used for creating a new oplog based on the
   * previous Oplog. This constructor is invoked only from the function
   * switchOplog
   * 
   * @param oplogId
   *          integer identifying the new oplog
   * @param dirHolder
   *          The directory in which to create new Oplog
   * @param prevOplog
   *          The previous oplog
   */
  private Oplog(long oplogId, DirectoryHolder dirHolder, Oplog prevOplog) {
    if (oplogId > DiskId.MAX_OPLOG_ID) {
      throw new IllegalStateException("Too many oplogs. The oplog id can not exceed " + DiskId.MAX_OPLOG_ID);
    }
    this.oplogId = oplogId;
    this.parent = prevOplog.parent;
    this.oplogSet = prevOplog.oplogSet;
    this.dirHolder = dirHolder;
    this.opState = new OpState();
    this.logger = prevOplog.logger;
    long maxOplogSizeParam = getParent().getMaxOplogSizeInBytes();
    long availableSpace = this.dirHolder.getAvailableSpace();
    if (prevOplog.compactOplogs) {
      this.maxOplogSize = maxOplogSizeParam;
    } else {
      if (availableSpace < maxOplogSizeParam) {
        this.maxOplogSize = availableSpace;
        if (this.logger.warningEnabled()) {
          logger.warning(LocalizedStrings.DEBUG, "Reducing maxOplogSize to " + availableSpace + " because that is all the room remaining in the directory.");
        }
      } else {
        this.maxOplogSize = maxOplogSizeParam;
      }
    }
    setMaxCrfDrfSize();
    this.stats = prevOplog.stats;
    this.compactOplogs = prevOplog.compactOplogs;
    // copy over the previous Oplog's data version since data is not being
    // transformed at this point
    this.dataVersion = prevOplog.getDataVersionIfOld();

    this.closed = false;
    String n = getParent().getName();
    this.diskFile = new File(this.dirHolder.getDir(),
                             oplogSet.getPrefix()
                             + n + "_" + oplogId);
    this.idxkrf = new OplogIndex(this);
    try {
      createDrf(prevOplog.drf);
      createCrf(prevOplog.crf);
      // open krf for offline compaction
      if (getParent().isOfflineCompacting()) {
        krfFileCreate();
      }
    }
    catch (Exception ex) {
      close();
      
      getParent().getCancelCriterion().checkCancelInProgress(ex);
      if (ex instanceof DiskAccessException) {
        throw (DiskAccessException) ex;
      }
      throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_CREATING_OPERATION_LOG_BECAUSE_0.toLocalizedString(ex), getParent());
    }
  }
  
  public void replaceIncompatibleEntry(DiskRegionView dr, DiskEntry old, DiskEntry repl) {
    boolean useNextOplog = false;
    synchronized (this.lock) {
      if (getOplogSet().getChild() != this) {
        // make sure to only call replaceIncompatibleEntry for child, because this.lock
        // can only sync with compaction thread on child oplog
        useNextOplog = true;
      } else {
        // This method is use in recovery only and will not be called by compaction.
        // It's only called before or after compaction. It will replace DiskEntry
        // in DiskRegion without modifying DiskId (such as to a new oplogId),
        // Not to change the entry count in oplog either. While doing that,
        // this.lock will lock the current child to sync with compaction thread.
        // If replace thread got this.lock, DiskEntry "old" will not be removed from
        // current oplog (maybe not child). If compaction thread got this.lock,
        // DiskEntry "old" should have been moved to child oplog when replace thread
        // processes it.

        // See #48032.  A new region entry has been put into the region map, but we
        // also have to replace it in the oplog live entries that are used to write
        // the krf.  If we don't, we will recover the wrong (old) value.
        getOrCreateDRI(dr).replaceLive(old, repl);
        if (DiskStoreImpl.TRACE_RECOVERY) {
          logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY replacing incompatible entry" 
              + " key = " + old.getKey()
              + " old = " + System.identityHashCode(old)
              + " new = " + System.identityHashCode(repl)
              + " old diskId = " + old.getDiskId()
              + " new diskId = " + repl.getDiskId()
              + " tag = " + old.getVersionStamp()
              + " in child oplog #"+this.getOplogId());
        }
      }
    }
    if (useNextOplog) {
      if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) {
        CacheObserverHolder.getInstance().afterSwitchingOplog();
      }
      Assert.assertTrue(getOplogSet().getChild() != this);
      getOplogSet().getChild().replaceIncompatibleEntry(dr, old, repl);
    }
  }

  public Collection getRegionRecoveryMap() {
    return Collections.unmodifiableCollection(this.regionMap.values());
  }

  private void writeDiskStoreRecord(OplogFile olf) throws IOException {
    this.opState = new OpState();
    this.opState.initialize(getParent().getDiskStoreID());
    writeOpLogBytes(olf, false, true); // fix for bug 41928
    olf.currSize += getOpStateSize();
    this.dirHolder.incrementTotalOplogSize(getOpStateSize());
  }

  private void writeGemfireVersionRecord(OplogFile olf) throws IOException {
    if (this.gfversion == null) {
      this.gfversion = Version.CURRENT;
    }
    Version dataVersion = getDataVersionIfOld();
    if (dataVersion == null) {
      dataVersion = Version.CURRENT;
    }
    // if gfversion and dataVersion are not same, then write a special token
    // version and then write both, else write gfversion as before
    // this is for backward compatibility with 7.0
    this.opState = new OpState();
    if (this.gfversion == dataVersion) {
      writeProductVersionRecord(this.gfversion, olf);
    }
    else {
      writeProductVersionRecord(Version.TOKEN, olf);
      clearOpState();
      writeProductVersionRecord(this.gfversion, olf);
      clearOpState();
      writeProductVersionRecord(dataVersion, olf);
    }
  }

  private void writeProductVersionRecord(Version version, OplogFile olf)
      throws IOException {
    this.opState.initialize(version.ordinal());
    writeOpLogBytes(olf, false, true);
    olf.currSize += getOpStateSize();
    this.dirHolder.incrementTotalOplogSize(getOpStateSize());
  }

  public final Version currentRecoveredGFVersion() {
    return this.gfversion;
  }
  
  /**
   * Write an RVV record containing all of the live disk regions.
   */
  private void writeRVVRecord(OplogFile olf, boolean writeGCRVV) throws IOException {
    writeRVVRecord(olf, getParent().getAllDiskRegions(), writeGCRVV);
  }
  
  /**
   * Write the RVV record for the given regions.
   * @param olf the oplog to write to 
   * @param diskRegions the set of disk regions we should write the RVV of
   * @param writeGCRVV true to write write the GC RVV
   * @throws IOException
   */
  private void writeRVVRecord(OplogFile olf,
      Map diskRegions, boolean writeGCRVV)
      throws IOException {
    this.opState = new OpState();
    this.opState.initialize(diskRegions, writeGCRVV);
    writeOpLogBytes(olf, false, true); // fix for bug 41928
    olf.currSize += getOpStateSize();
    this.dirHolder.incrementTotalOplogSize(getOpStateSize());
  }
  
  private boolean wroteNewEntryBase = false;
  /**
   * Write a OPLOG_NEW_ENTRY_BASE_ID to this oplog.
   * Must be called before any OPLOG_NEW_ENTRY_0ID records are written
   * to this oplog.
   */
  private boolean writeNewEntryBaseRecord(boolean async) throws IOException {
    if (this.wroteNewEntryBase) return false;
    this.wroteNewEntryBase = true;
    long newEntryBase = getOplogSet().getOplogEntryId();
//     logger.info(LocalizedStrings.DEBUG, "DEBUG newEntryBase=" + newEntryBase + " oplog#" + getOplogId());

    OpState saved = this.opState;
    try {
      this.opState = new OpState();
      this.opState.initialize(newEntryBase);
      writeOpLogBytes(this.crf, async, false/*no need to flush this record*/);
      this.dirHolder.incrementTotalOplogSize(getOpStateSize());
    } finally {
      this.opState = saved;
    }
    //       {
    //         LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n();
    //         l.info(LocalizedStrings.DEBUG, "base inc=" + OPLOG_NEW_ENTRY_BASE_REC_SIZE + " currSize=" + this.crf.currSize);
    //       }
    return true;
  }

  /**
   * Return true if this oplog has a drf but does not have a crf
   */
  boolean isDrfOnly() {
    return this.drf.f != null && this.crf.f == null;
  }
  
  /**
   * This constructor will get invoked only in case of persistent region
   * when it is recovering an oplog.
   * @param oplogId
   * @param parent
   */
  Oplog(long oplogId, PersistentOplogSet parent) {
    // @todo have the crf and drf use different directories.
    if (oplogId > DiskId.MAX_OPLOG_ID) {
      throw new IllegalStateException("Too many oplogs. The oplog id can not exceed " + DiskId.MAX_OPLOG_ID);
    }
    this.isRecovering = true;
    this.oplogId = oplogId;
    this.parent = parent.getParent();
    this.oplogSet = parent;
    this.logger = getParent().getCache().getLoggerI18n();
    this.opState = new OpState();
    long maxOplogSizeParam = getParent().getMaxOplogSizeInBytes();
    this.maxOplogSize = maxOplogSizeParam;
    setMaxCrfDrfSize();
    this.stats = getParent().getStats();
    this.compactOplogs = getParent().getAutoCompact();
    this.closed = true;
    this.crf.RAFClosed = true;
    this.deleted.set(true);
    this.haveRecoveredCrf = false;
    this.haveRecoveredDrf = false;
    this.newOplog = false;
    this.idxkrf = new OplogIndex(this);
  }

  private boolean newOplog = true;
  /**
   * Returns true if added file was crf; false if drf
   * @param foundDrfs 
   * @param foundCrfs 
   */
  boolean addRecoveredFile(File f, DirectoryHolder dh, TLongHashSet foundCrfs, TLongHashSet foundDrfs) {
    String fname = f.getName();
    if (this.dirHolder != null) {
      if (!dh.equals(this.dirHolder)) {
        throw new DiskAccessException("Oplog#" + getOplogId()
                                      + " has files in two different directories: \""
                                      + this.dirHolder
                                      + "\", and \""
                                      + dh
                                      + "\". Both the crf and drf for this oplog should be in the same directory.",
                                      getParent());
      }
    } else {
      this.dirHolder = dh;
    }
    if (fname.endsWith(Oplog.CRF_FILE_EXT)) {
      this.crf.f = f;
      foundCrfs.add(this.oplogId);
    } else if (fname.endsWith(Oplog.DRF_FILE_EXT)) {
      this.drf.f = f;
      foundDrfs.add(this.oplogId);
//    } else if (fname.endsWith(Oplog.KRF_FILE_EXT)) {
//      this.krf.f = f;
    } else if (Oplog.IDX_PATTERN.matcher(fname).matches()) {
      idxkrf.addRecoveredFile(fname);
    }else {
      assert false : fname;
    }
    return false;
  }

  void setRecoveredDrfSize(long size) {
    this.drf.currSize += size;
    this.drf.bytesFlushed += size;
  }
  void setRecoveredCrfSize(long size) {
    this.crf.currSize += size;
    this.crf.bytesFlushed += size;
//       {
//         LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n();
//         l.info(LocalizedStrings.DEBUG, "setRecoveredCrfSize to=" + this.crf.currSize);
//       }
  }
  private boolean isRecovering;
  boolean isRecovering() {
    return this.isRecovering;
  }

  public final DiskStoreImpl getParent() {
    return this.parent;
  }
  
  private PersistentOplogSet getOplogSet() {
    return oplogSet;
  }

  void initAfterRecovery(boolean offline) {
    this.isRecovering = false;
    this.closed = false;
    this.deleted.set(false);
    String n = getParent().getName();
    // crf might not exist; but drf always will
    this.diskFile = new File(this.drf.f.getParentFile(),
                             oplogSet.getPrefix()
                             + n + "_" + this.oplogId);
    try {
      // This is a recovered oplog and we only read from its crf.
      // No need to open the drf.
      this.doneAppending = true;
      if (this.crf.f != null && !hasNoLiveValues()) {
        this.closed = false;
        // truncate crf/drf if their actual size is less than their pre-blow size
        this.crf.raf = new RandomAccessFile(this.crf.f, "rw");
        this.crf.RAFClosed = false;
        this.crf.channel = this.crf.raf.getChannel();
        unpreblow(this.crf, getMaxCrfSize());
        this.crf.raf.close();
        // make crf read only
        this.crf.raf = new RandomAccessFile(this.crf.f, "r");
        this.crf.channel = this.crf.raf.getChannel();
        this.stats.incOpenOplogs();
        
        //drf.raf is null at this point. create one and close it to retain existing behavior
        try {
          this.drf.raf = new RandomAccessFile(this.drf.f, "rw");
          this.drf.RAFClosed = false;
          this.drf.channel = this.drf.raf.getChannel();
          unpreblow(this.drf, getMaxDrfSize());
        } finally {
          this.drf.raf.close();
          this.drf.raf = null;
          this.drf.RAFClosed = true;
        }
        // no need to seek to the end; we will not be writing to a recovered oplog; only reading
        // this.crf.raf.seek(this.crf.currSize);
      } else if (!offline) {
        // drf exists but crf has been deleted (because it was empty).
        // I don't think the drf needs to be opened. It is only used during recovery.
        // At some point the compacter my identify that it can be deleted.
        this.crf.RAFClosed = true;
        deleteCRF();
        this.closed = true;
        this.deleted.set(true);
      }
      this.drf.RAFClosed = true; // since we never open it on a recovered oplog
    }catch (IOException ex) {
      getParent().getCancelCriterion().checkCancelInProgress(ex);
      throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_CREATING_OPERATION_LOG_BECAUSE_0.toLocalizedString(ex), getParent());
    }
    if (hasNoLiveValues() && !offline) {
      getOplogSet().removeOplog(getOplogId(), true, getHasDeletes() ? this : null);
      if (!getHasDeletes()) {
        getOplogSet().drfDelete(this.oplogId);
        deleteFile(this.drf);
      }
    } else if (needsCompaction()) {
      // just leave it in the list it is already in
    } else {
      // remove it from the compactable list
      getOplogSet().removeOplog(getOplogId(), true/* say we are deleting so that undeletedOplogSize is not inced */, null);
      // add it to the inactive list
      getOplogSet().addInactive(this);
    }
  }

  boolean getHasDeletes() {
    return this.hasDeletes.get();
  }
  private void setHasDeletes(boolean v) {
    this.hasDeletes.set(v);
  }

  private void closeAndDeleteAfterEx(IOException ex, OplogFile olf) {
    if (olf == null) {
      return;
    }
    
    if (olf.raf != null) {
      try {
        olf.raf.close();
      } catch (IOException e) {
        logger.warning(LocalizedStrings.Oplog_Close_Failed, olf.f.getAbsolutePath(), e);
      }
    }
    olf.RAFClosed = true;
    if (!olf.f.delete() && olf.f.exists()) {
      throw new DiskAccessException(
          LocalizedStrings.Oplog_COULD_NOT_DELETE__0_.toLocalizedString(olf.f
              .getAbsolutePath()), ex, getParent());
    }
  }
  
  private void preblow(OplogFile olf, long maxSize) throws IOException {
//     logger.info(LocalizedStrings.DEBUG, "DEBUG preblow(" + maxSize + ")  dirAvailSpace=" + this.dirHolder.getAvailableSpace());
    long availableSpace = this.dirHolder.getAvailableSpace();
    if (availableSpace >= maxSize) {
      try {
        NativeCalls.getInstance().preBlow(olf.f.getAbsolutePath(), maxSize,
            (DiskStoreImpl.PREALLOCATE_OPLOGS && !DiskStoreImpl.SET_IGNORE_PREALLOCATE));
      }
      catch (IOException ioe) {
        logger.warning(LocalizedStrings.DEBUG, "Could not pregrow oplog to " + maxSize + " because: " + ioe);
//         if (this.logger.warningEnabled()) {
//           this.logger.warning(
//               LocalizedStrings.Oplog_OPLOGCREATEOPLOGEXCEPTION_IN_PREBLOWING_THE_FILE_A_NEW_RAF_OBJECT_FOR_THE_OPLOG_FILE_WILL_BE_CREATED_WILL_NOT_BE_PREBLOWNEXCEPTION_STRING_IS_0,
//               ioe, null);
//         }
        // I don't think I need any of this. If setLength throws then
        // the file is still ok.
        // I need this on windows. I'm seeing this in testPreblowErrorCondition:
// Caused by: java.io.IOException: The parameter is incorrect
// 	at sun.nio.ch.FileDispatcher.write0(Native Method)
// 	at sun.nio.ch.FileDispatcher.write(FileDispatcher.java:44)
// 	at sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:104)
// 	at sun.nio.ch.IOUtil.write(IOUtil.java:60)
// 	at sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:206)
// 	at com.gemstone.gemfire.internal.cache.Oplog.flush(Oplog.java:3377)
// 	at com.gemstone.gemfire.internal.cache.Oplog.flushAll(Oplog.java:3419)
        /*
        {
          String os = System.getProperty("os.name");
          if (os != null) {
	    if (os.indexOf("Windows") != -1) {
              olf.raf.close();
              olf.RAFClosed = true;
              if (!olf.f.delete() && olf.f.exists()) {
                throw new DiskAccessException(LocalizedStrings.Oplog_COULD_NOT_DELETE__0_.toLocalizedString(olf.f.getAbsolutePath()), getParent());
              }
              if (logger.fineEnabled()) {
                logger.fine("recreating operation log file " + olf.f);
              }
              olf.raf = new RandomAccessFile(olf.f, SYNC_WRITES ? "rwd" : "rw");
              olf.RAFClosed = false;
            }
          }
        }
        */
        closeAndDeleteAfterEx(ioe, olf);
        throw new InsufficientDiskSpaceException(
            LocalizedStrings.Oplog_PreAllocate_Failure.toLocalizedString(
                olf.f.getAbsolutePath(), maxSize), ioe, getParent());
      }
    }
    // TODO: Perhaps the test flag is not requierd here. Will re-visit.
    else if (DiskStoreImpl.PREALLOCATE_OPLOGS && !DiskStoreImpl.SET_IGNORE_PREALLOCATE) {
      throw new InsufficientDiskSpaceException(
          LocalizedStrings.Oplog_PreAllocate_Failure.toLocalizedString(
              olf.f.getAbsolutePath(), maxSize), new IOException(
              "not enough space left to pre-blow, available=" + availableSpace
                  + ", required=" + maxSize), getParent());
    }
  }

  private void unpreblow(OplogFile olf, long maxSize) {
    synchronized (/*olf*/this.lock) {
      if (!olf.RAFClosed && !olf.unpreblown) {
        olf.unpreblown = true;
        if (olf.currSize < maxSize) {
          try {
             olf.raf.setLength(olf.currSize);
//             {
//               LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n();
//               l.info(LocalizedStrings.DEBUG, "after setLength setting size to=" + olf.currSize
//                      + " fp=" + olf.raf.getFilePointer()
//                      + " oplog#" + getOplogId());
//             }
          }
          catch (IOException ignore) {
          }
        }
      }
    }
  }
  /**
   * Creates the crf oplog file
   * 
   * @throws IOException
   */
  private void createCrf(OplogFile prevOlf) throws IOException
  {
    File f = new File(this.diskFile.getPath() + CRF_FILE_EXT);
    if (logger.fineEnabled()) {
      logger.fine("Creating operation log file " + f);
    }
    this.crf.f = f;
    preblow(this.crf, getMaxCrfSize());
    this.crf.raf = new RandomAccessFile(f,
        getParent().getSyncWrites() ? "rwd" : "rw");
    this.crf.RAFClosed = false;
    oplogSet.crfCreate(this.oplogId);
    this.crf.writeBuf = allocateWriteBuf(prevOlf);

    if (logger.infoEnabled()) {
      logger.info(LocalizedStrings.Oplog_CREATE_0_1_2,
                  new Object[] {toString(),
                                getFileType(this.crf),
                                getParent().getName()});
    }
    this.crf.channel = this.crf.raf.getChannel();

    this.stats.incOpenOplogs();
    writeDiskStoreRecord(this.crf);
    writeGemfireVersionRecord(this.crf);    
    writeRVVRecord(this.crf, false);
    
    //Fix for bug 41654 - don't count the header
    //size against the size of the oplog. This ensures that
    //even if we have a large RVV, we can still write up to
    //max-oplog-size bytes to this oplog.
    this.maxCrfSize += this.crf.currSize;
  }

  private static ByteBuffer allocateWriteBuf(OplogFile prevOlf) {
    if (prevOlf != null && prevOlf.writeBuf != null) {
      ByteBuffer result = prevOlf.writeBuf;
      prevOlf.writeBuf = null;
      return result;
    } else {
      return ByteBuffer.allocateDirect(Integer.getInteger("WRITE_BUF_SIZE",
          DEFAULT_BUFFER_SIZE).intValue());
    }
  }
  
  /**
   * Creates the drf oplog file
   * 
   * @throws IOException
   */
  private void createDrf(OplogFile prevOlf) throws IOException
  {
    String drfFilePath = this.diskFile.getPath() + DRF_FILE_EXT;
    File f = new File(drfFilePath);
    this.drf.f = f;
    if (logger.fineEnabled()) {
      logger.fine("Creating operation log file " + f);
    }
    preblow(this.drf, getMaxDrfSize());
    this.drf.raf = new RandomAccessFile(f,
        getParent().getSyncWrites() ? "rwd" : "rw");
    this.drf.RAFClosed = false;
    this.oplogSet.drfCreate(this.oplogId);
    this.drf.writeBuf = allocateWriteBuf(prevOlf);
    if (logger.infoEnabled()) {
      logger.info(LocalizedStrings.Oplog_CREATE_0_1_2,
                  new Object[] {toString(),
                                getFileType(this.drf),
                                getParent().getName()});
    }
    this.drf.channel = this.drf.raf.getChannel();
    writeDiskStoreRecord(this.drf);
    writeGemfireVersionRecord(this.drf);
    writeRVVRecord(this.drf, true);
  }
  
  /**
   * Returns the DiskStoreStats for this oplog
   */
  public DiskStoreStats getStats()
  {
    return this.stats;
  }
  
  /**
   * Flushes any pending writes to disk.
   * 
   * public final void flush() { forceFlush(); }
   */

  /**
   * Test Method to be used only for testing purposes. Gets the underlying File
   * object for the Oplog . Oplog class uses this File object to obtain the
   * RandomAccessFile object. Before returning the File object , the dat present
   * in the buffers of the RandomAccessFile object is flushed. Otherwise, for
   * windows the actual file length does not match with the File size obtained
   * from the File object
   * 
   * @throws IOException
   * @throws SyncFailedException
   */
  File getOplogFile() throws SyncFailedException, IOException
  {
    // @todo check callers for drf
    synchronized (this.lock/*crf*/) {
      if (!this.crf.RAFClosed) {
        this.crf.raf.getFD().sync();
      }
      return this.crf.f;
    }
  }

  /**
   * Given a set of Oplog file names return a Set of the oplog files that match those names that are 
   * managed by this Oplog.
   * @param baselineFiles a Set of operation log file names in the baseline
   * @param filesNeedingBackup a set of files still needing backup. Files will be removed
   * from this set if they are found in the baseline.
   */
  Set gatherMatchingOplogFiles(Set baselineFiles, Set filesNeedingBackup) {
    Set matchingFiles = new LinkedHashSet();

    for(Iterator itr = filesNeedingBackup.iterator(); itr.hasNext(); ) {
      File file = itr.next();
      // If the file is in the baseline, add it to the baseline map and remove
      // it from the set of files to backup.
      if(baselineFiles.contains(file.getName())) {
        matchingFiles.add(file.getName());
        itr.remove();
      }
    }
    
    return matchingFiles;
  }
  
  /**
   * Returns the set of valid files assocatiated with this oplog - the
   * crf, drf, krf, and idxkrf if they are present.
   */
  public Set getAllFiles() {
 // Check for crf existence
    Set files = new LinkedHashSet(4);
    if((null != this.crf.f) && this.crf.f.exists()) {
      files.add(IOUtils.tryGetCanonicalFileElseGetAbsoluteFile(this.crf.f));
    }
    
    // Check for drf existence
    if((null != this.drf.f) && this.drf.f.exists()) {
      files.add(IOUtils.tryGetCanonicalFileElseGetAbsoluteFile(this.drf.f));
    }
    
    // Check for krf existence
    if(getParent().getDiskInitFile().hasKrf(this.oplogId)) {
      File krfFile = new File(getKrfFilePath());
      if(krfFile.exists()) {
        files.add(IOUtils.tryGetCanonicalFileElseGetAbsoluteFile(krfFile));      
      }
      
      File idxFile = getIndexFileIfValid();
      if(idxFile != null && idxFile.exists()) {
        files.add(IOUtils.tryGetCanonicalFileElseGetAbsoluteFile(idxFile));      
      }
    }
    return files;
  }
  
  /**
   * Returns a map of baseline oplog files to copy that match this oplog's files for a currently running backup.
   * @param baselineOplogFiles a List of files to match this oplog's filenames against.
   * @param allFiles - a set of all files for the oplog. This set will be modified to remove all of the files
   * that are present in the baseline.
   * @return a map of baslineline oplog files to copy.  May be empty if total current set for this oplog
   * does not match the baseline.
   */
  Map mapBaseline(List baselineOplogFiles, Set allFiles) {
    // Map of baseline oplog file name to oplog file
    Map baselineOplogMap = TransformUtils.transformAndMap(baselineOplogFiles,TransformUtils.fileNameTransformer);
    
    // Returned Map of baseline file to current oplog file
    Map baselineToOplogMap = new HashMap();
    
    for(Iterator itr = allFiles.iterator(); itr.hasNext(); ) {
      File file = itr.next();
      // If the file is in the baseline, add it to the baseline map and remove
      // it from the set of files to backup.
      if(baselineOplogMap.containsKey(file.getName())) {
        baselineToOplogMap.put(baselineOplogMap.get(file.getName()), file);
        itr.remove();
      }
    }
    
    return baselineToOplogMap;
  }

  /** the oplog identifier * */
  public long getOplogId()
  {
    return this.oplogId;
  }

  /** Returns the unserialized bytes and bits for the given Entry. 
   * If Oplog is destroyed while querying, then the DiskRegion is queried again to
   * obatin the value This method should never get invoked for an entry which
   * has been destroyed
   * 
   * @since 3.2.1 
   * @param id The DiskId for the entry @param offset The offset in
   *        this OpLog where the entry is present. @param faultingIn @param
   *        bitOnly boolean indicating whether to extract just the UserBit or
   *        UserBit with value @return BytesAndBits object wrapping the value &
   *        user bit
   */
  public final BytesAndBits getBytesAndBits(DiskRegionView dr, DiskId id, boolean faultingIn,
                                            boolean bitOnly)
  {
    Oplog retryOplog = null;
    long offset = 0;
    synchronized (id) {
      long opId = id.getOplogId();
      if (opId != getOplogId()) {
        // the oplog changed on us so we need to do a recursive
        // call after unsyncing
        retryOplog = getOplogSet().getChild(opId);
      } else {
        // fetch this while synced so it will be consistent with oplogId
        offset = id.getOffsetInOplog();
      }
    }
    if (retryOplog != null) {
      return retryOplog.getBytesAndBits(dr, id, faultingIn, bitOnly);
    }
    BytesAndBits bb = null;
    long start = this.stats.startRead();

    // Asif: If the offset happens to be -1, still it is possible that
    // the data is present in the current oplog file.
    if (offset == -1) {
      // Asif: Since it is given that a get operation has alreadty
      // taken a
      // lock on an entry , no put operation could have modified the
      // oplog ID
      // there fore synchronization is not needed
      // synchronized (id) {
      // if (id.getOplogId() == this.oplogId) {
      offset = id.getOffsetInOplog();
      // }
      // }
    }

    // Asif :If the current OpLog is not destroyed ( its opLogRaf file
    // is still open) we can retrieve the value from this oplog.
    try {
      bb = basicGet(dr, offset, bitOnly, id.getValueLength(), id.getUserBits());
    }
    catch (DiskAccessException dae) {
      if (this.logger.errorEnabled()) {
        this.logger
          .error(
                 LocalizedStrings.Oplog_OPLOGBASICGET_ERROR_IN_READING_THE_DATA_FROM_DISK_FOR_DISK_ID_HAVING_DATA_AS_0,
                 id, dae);
      }
      throw dae;
    }

    if (bb == null) {
      throw new EntryDestroyedException(LocalizedStrings.Oplog_NO_VALUE_WAS_FOUND_FOR_ENTRY_WITH_DISK_ID_0_ON_A_REGION_WITH_SYNCHRONOUS_WRITING_SET_TO_1
                                        .toLocalizedString(new Object[] {id, Boolean.valueOf(dr.isSync())}));
    }
    if (bitOnly) {
      dr.endRead(start, this.stats.endRead(start, 1), 1);
    } else {
      dr.endRead(start, this.stats.endRead(start, bb.getBytes().length), bb.getBytes().length);
    }
    return bb;

  }

  /**
   * Returns the object stored on disk with the given id. This method is used
   * for testing purposes only. As such, it bypasses the buffer and goes
   * directly to the disk. This is not a thread safe function , in the sense, it
   * is possible that by the time the OpLog is queried , data might move HTree
   * with the oplog being destroyed
   * 
   * @param id
   *                A DiskId object for which the value on disk will be fetched
   * 
   */
  public final BytesAndBits getNoBuffer(DiskRegion dr, DiskId id)
  {
    if (logger.finerEnabled()) {
      logger
          .finer("Oplog::getNoBuffer:Before invoking Oplog.basicGet for DiskID ="
              + id);
    }

    try {
      BytesAndBits bb = basicGet(dr, id.getOffsetInOplog(), false,
                                 id.getValueLength(), id.getUserBits());
      return bb;
    }
    catch (DiskAccessException dae) {
      if (logger.errorEnabled()) {
        logger.error(
            LocalizedStrings.Oplog_OPLOGGETNOBUFFEREXCEPTION_IN_RETRIEVING_VALUE_FROM_DISK_FOR_DISKID_0,
            id, dae);
      }
      throw dae;
    }
    catch (IllegalStateException ise) {
      if (logger.errorEnabled()) {
        logger.error(
            LocalizedStrings.Oplog_OPLOGGETNOBUFFEREXCEPTION_IN_RETRIEVING_VALUE_FROM_DISK_FOR_DISKID_0,
            id, ise);
      }
      throw ise;
    }
  }

  void close(DiskRegion dr) {
    // while a krf is being created can not close a region
    lockCompactor();
    try {
//      if (logger.infoEnabled()) {
//        logger.info(LocalizedStrings.DEBUG, "DEBUG closing dr=" + dr.getId()
//            + " on oplog " + this);
//      }
    addUnrecoveredRegion(dr.getId());
    DiskRegionInfo dri = getDRI(dr);
    if (dri != null) {
      long clearCount = dri.clear(null);
      if (clearCount != 0) {
        this.totalLiveCount.addAndGet(-clearCount);
        // no need to call handleNoLiveValues because we now have an unrecovered region.
      }
      this.regionMap.remove(dr.getId(), dri);
    }
    } finally {
      unlockCompactor();
    }
  }

  void clear(DiskRegion dr, RegionVersionVector rvv) {
    DiskRegionInfo dri = getDRI(dr);
    if (dri != null) {
      long clearCount = dri.clear(rvv);
      if (clearCount != 0) {
        this.totalLiveCount.addAndGet(-clearCount);
        if (!isCompacting() || calledByCompactorThread()) {
          handleNoLiveValues();
        }
      }
    }
  }

  void destroy(DiskRegion dr) {
    DiskRegionInfo dri = getDRI(dr);
    if (dri != null) {
      long clearCount = dri.clear(null);
      if (clearCount != 0) {
        this.totalLiveCount.addAndGet(-clearCount);
        if (!isCompacting() || calledByCompactorThread()) {
          handleNoLiveValues();
        }
      }
      this.regionMap.remove(dr.getId(), dri);
    }
  }

  long getMaxRecoveredOplogEntryId() {
    long result = this.recoverNewEntryId;
    if (this.recoverModEntryIdHWM > result) {
      result = this.recoverModEntryIdHWM;
    }
    if (this.recoverDelEntryIdHWM > result) {
      result = this.recoverDelEntryIdHWM;
    }
    return result;
  }

  /**
   * Used during recovery to calculate the OplogEntryId of the next NEW_ENTRY record.
   * @since prPersistSprint1
   */
  private long recoverNewEntryId = DiskStoreImpl.INVALID_ID;
  /**
   * Used during writing to remember the last MOD_ENTRY OplogEntryId written to this oplog.
   * @since prPersistSprint1
   */
  private long writeModEntryId = DiskStoreImpl.INVALID_ID;
  /**
   * Used during recovery to calculate the OplogEntryId of the next MOD_ENTRY record.
   * @since prPersistSprint1
   */
  private long recoverModEntryId = DiskStoreImpl.INVALID_ID;
  /**
   * Added to fix bug 41301. High water mark of modified entries.
   */
  private long recoverModEntryIdHWM = DiskStoreImpl.INVALID_ID;
  /**
   * Added to fix bug 41340. High water mark of deleted entries.
   */
  private long recoverDelEntryIdHWM = DiskStoreImpl.INVALID_ID;
  /**
   * Used during writing to remember the last DEL_ENTRY OplogEntryId written to this oplog.
   * @since prPersistSprint1
   */
  private long writeDelEntryId = DiskStoreImpl.INVALID_ID;
  /**
   * Used during recovery to calculate the OplogEntryId of the next DEL_ENTRY record.
   * @since prPersistSprint1
   */
  private long recoverDelEntryId = DiskStoreImpl.INVALID_ID;

  private void setRecoverNewEntryId(long v) {
    this.recoverNewEntryId = v;
  }
  private long incRecoverNewEntryId() {
    this.recoverNewEntryId++;
    return this.recoverNewEntryId;
  }
  /**
   * Given a delta calculate the OplogEntryId for a MOD_ENTRY.
   */
  public long calcModEntryId(long delta) {
    long oplogKeyId = this.recoverModEntryId + delta;
    if (DiskStoreImpl.TRACE_RECOVERY) {
      logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY calcModEntryId delta=" + delta
                  + " recoverModEntryId=" + this.recoverModEntryId
                  + " oplogKeyId=" + oplogKeyId);
    }
    this.recoverModEntryId = oplogKeyId;
    if (oplogKeyId > this.recoverModEntryIdHWM) { 
      this.recoverModEntryIdHWM = oplogKeyId; // fixes bug 41301
    }
    return oplogKeyId;
  }
  /**
   * Given a delta calculate the OplogEntryId for a DEL_ENTRY.
   */
  public long calcDelEntryId(long delta) {
    long oplogKeyId = this.recoverDelEntryId + delta;
    if (DiskStoreImpl.TRACE_RECOVERY) {
      logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY calcDelEntryId delta=" + delta
                  + " recoverDelEntryId=" + this.recoverDelEntryId
                  + " oplogKeyId=" + oplogKeyId);
    }
    this.recoverDelEntryId = oplogKeyId;
    if (oplogKeyId > this.recoverDelEntryIdHWM) { 
      this.recoverDelEntryIdHWM = oplogKeyId; // fixes bug 41340
    }
    return oplogKeyId;
  }
  private boolean crashed;
  boolean isCrashed() {
    return this.crashed;
  }
  
  /**
   * Return bytes read.
   */
  long recoverDrf(OplogEntryIdSet deletedIds,
                  boolean alreadyRecoveredOnce,
                  boolean latestOplog) {
    File drfFile = this.drf.f;
    if (drfFile == null) {
      this.haveRecoveredDrf = true;
      return 0L;
    }
    lockCompactor();
    try {
      if (this.haveRecoveredDrf && !getHasDeletes()) return 0L; // do this while holding lock
      if (!this.haveRecoveredDrf) {
        this.haveRecoveredDrf = true;
      }
    if (logger.infoEnabled()) {
      logger.info(LocalizedStrings.DiskRegion_RECOVERING_OPLOG_0_1_2,
                  new Object[] {toString(),
                                drfFile.getAbsolutePath(),
                                getParent().getName()});
    }
    this.recoverDelEntryId = DiskStoreImpl.INVALID_ID;
    boolean readLastRecord = true;
    CountingDataInputStream dis = null;
    try {
      int recordCount = 0;
      boolean foundDiskStoreRecord = false;
      FileInputStream fis = null;
      try {
        fis = new FileInputStream(drfFile);
        dis = new CountingDataInputStream(new BufferedInputStream(fis,
            DEFAULT_BUFFER_SIZE), drfFile.length());
        boolean endOfLog = false;
        while (!endOfLog) {
          if (dis.atEndOfFile()) {
            endOfLog = true;
            break;
          }
          readLastRecord = false;
          byte opCode = dis.readByte();
          if (DiskStoreImpl.TRACE_RECOVERY) {
            logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY drf byte=" + opCode + " location=" + Long.toHexString(dis.getCount()));
          }
          switch (opCode) {
          case OPLOG_EOF_ID:
            // we are at the end of the oplog. So we need to back up one byte
            dis.decrementCount();
            endOfLog = true;
            break;
          case OPLOG_DEL_ENTRY_1ID:
          case OPLOG_DEL_ENTRY_2ID:
          case OPLOG_DEL_ENTRY_3ID:
          case OPLOG_DEL_ENTRY_4ID:
          case OPLOG_DEL_ENTRY_5ID:
          case OPLOG_DEL_ENTRY_6ID:
          case OPLOG_DEL_ENTRY_7ID:
          case OPLOG_DEL_ENTRY_8ID:
            readDelEntry(dis, opCode, deletedIds, parent);
            recordCount++;
            break;
          case OPLOG_DISK_STORE_ID:
            readDiskStoreRecord(dis, this.drf.f);
            foundDiskStoreRecord = true;
            recordCount++;
            break;
          case OPLOG_GEMFIRE_VERSION:
            readGemfireVersionRecord(dis, this.drf.f);
            recordCount++;
            break;
            
          case OPLOG_RVV:
              dis.getCount();
            readRVVRecord(dis, this.drf.f, true, latestOplog);
            recordCount++;
            break;

          default:
            throw new DiskAccessException(LocalizedStrings.Oplog_UNKNOWN_OPCODE_0_FOUND_IN_DISK_OPERATION_LOG.toLocalizedString(opCode), getParent());
          }
          readLastRecord = true;
          // @todo
//           if (rgn.isDestroyed()) {
//             break;
//           }
        } // while
      }
      finally {
        if (dis != null) {
          dis.close();
        }
        if (fis != null) {
          fis.close();
        }
      }
      if (!foundDiskStoreRecord && recordCount > 0) {
        throw new DiskAccessException("The oplog file \""
                                      + this.drf.f
                                      + "\" does not belong to the init file \""
                                      + getParent().getInitFile() + "\". Drf did not contain a disk store id.",
                                      getParent());
      }
    }
    catch (EOFException ex) {
      // ignore since a partial record write can be caused by a crash
//       if (byteCount < fileLength) {
//         throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_READING_FILE_DURING_RECOVERY_FROM_0
//             .toLocalizedString(drfFile.getPath()), ex, getParent());
//       }// else do nothing, this is expected in crash scenarios
    }
    catch (IOException ex) {
      getParent().getCancelCriterion().checkCancelInProgress(ex);
      throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_READING_FILE_DURING_RECOVERY_FROM_0
          .toLocalizedString(drfFile.getPath()), ex, getParent());
    }
    catch (CancelException ignore) {
      if (logger.fineEnabled()) {
        logger.fine("Oplog::readOplog:Error in recovery as Cache was closed",
            ignore);
      }
    }
    catch (RegionDestroyedException ignore) {
      if (logger.fineEnabled()) {
        logger.fine(
            "Oplog::readOplog:Error in recovery as Region was destroyed",
            ignore);
      }
    }
    catch (IllegalStateException ex) {
      // @todo
//       if (!rgn.isClosed()) {
        throw ex;
//       }
    }
    //Add the Oplog size to the Directory Holder which owns this oplog,
    // so that available space is correctly calculated & stats updated. 
    long byteCount = 0;
    if (!readLastRecord) {
      // this means that there was a crash
      // and hence we should not continue to read
      // the next oplog
      this.crashed = true;
      if (dis != null) {
        byteCount = dis.getFileLength();
      }
    } else {
      if (dis != null) {
        byteCount = dis.getCount();
      }
    }
    if (!alreadyRecoveredOnce) {
      setRecoveredDrfSize(byteCount);
//       {
//         LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n();
//         l.info(LocalizedStrings.DEBUG, "drfSize inc=" + byteCount);
//       }
      this.dirHolder.incrementTotalOplogSize(byteCount);
    }
    return byteCount;
    } finally {
      unlockCompactor();
    }
  }

  /**
   * This map is used during recovery to keep track of what entries were
   * recovered. Its keys are the oplogEntryId; its values are the actual
   * logical keys that end up in the Region's keys.
   * It used to be a local variable in basicInitializeOwner
   * but now that it needs to live longer than that method
   * I made it an instance variable
   * It is now only alive during recoverRegionsThatAreReady so it could
   * once again be passed down into each oplog.
   * 

If offlineCompaction the value in this map will have the key bytes, * values bytes, user bits, etc (any info we need to copy forward). */ private OplogEntryIdMap kvMap; public final OplogEntryIdMap getRecoveryMap() { return this.kvMap; } private volatile OplogEntryIdMap kvInitMap; public final OplogEntryIdMap getInitRecoveryMap() { return this.kvInitMap; } final void clearInitRecoveryMap() { this.kvInitMap = null; } /** * This map is used during recover to keep track of keys * that are skipped. Later modify records in the same oplog * may use this map to retrieve the correct key. */ private OplogEntryIdMap skippedKeyBytes; private boolean readKrf(OplogEntryIdSet deletedIds, boolean recoverValues, boolean recoverValuesSync, Set oplogsNeedingValueRecovery, boolean latestOplog) { File f = new File(this.diskFile.getPath() + KRF_FILE_EXT); if (!f.exists()) { return false; } if(!getParent().getDiskInitFile().hasKrf(this.oplogId)) { logger.info(LocalizedStrings.Oplog_REMOVING_INCOMPLETE_KRF, new Object[] { f.getName(), this.oplogId, getParent().getName() }); f.delete(); return false; } // Set krfCreated to true since we have a krf. if (logger.fineEnabled()) { logger.info(LocalizedStrings.DEBUG, "readKrf:: setting krfcreated to true for oplog: " + this); } this.krfCreated.set(true); //Fix for 42741 - we do this after creating setting the krfCreated flag //so that we don't try to recreate the krf. if(recoverValuesSync) { return false; } FileInputStream fis; try { fis = new FileInputStream(f); } catch (FileNotFoundException ex) { return false; } try { if (getParent().isOffline() && !getParent().FORCE_KRF_RECOVERY) { return false; } if (logger.infoEnabled()) { logger .info(LocalizedStrings.DiskRegion_RECOVERING_OPLOG_0_1_2, new Object[] { toString(), f.getAbsolutePath(), getParent().getName() }); } this.recoverNewEntryId = DiskStoreImpl.INVALID_ID; this.recoverModEntryId = DiskStoreImpl.INVALID_ID; this.recoverModEntryIdHWM = DiskStoreImpl.INVALID_ID; long oplogKeyIdHWM = DiskStoreImpl.INVALID_ID; int krfEntryCount = 0; //DataInputStream dis = new DataInputStream(new BufferedInputStream(fis, // LARGE_BUFFER_SIZE)); final Version version = getProductVersionIfOld(); final ByteArrayDataInput in = new ByteArrayDataInput(); final long currentTime = getParent().getCache().cacheTimeMillis(); try { final ChannelBufferUnsafeDataInputStream dis = new ChannelBufferUnsafeDataInputStream(fis.getChannel(), LARGE_BUFFER_SIZE); readDiskStoreRecord(dis, f); readGemfireVersionRecord(dis, f); readTotalCountRecord(dis, f); readRVVRecord(dis, f, false, latestOplog); long lastOffset = 0; byte[] keyBytes = DataSerializer.readByteArray(dis); while (keyBytes != null) { byte userBits = dis.readByte(); int valueLength = InternalDataSerializer.readArrayLength(dis); byte[] valueBytes = null; long drId = DiskInitFile.readDiskRegionID(dis); DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); // read version VersionTag tag = null; long lastModifiedTime = 0; if (EntryBits.isWithVersions(userBits)) { tag = readVersionsFromOplog(dis); // Update the RVV with the new entry if (drs != null) { drs.recordRecoveredVersionTag(tag); } } // read last modified time for no-versions case else if (EntryBits.isLastModifiedTime(userBits)) { lastModifiedTime = InternalDataSerializer.readUnsignedVL(dis); } else { lastModifiedTime = currentTime; } long oplogKeyId = InternalDataSerializer.readVLOld(dis); long oplogOffset; if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { oplogOffset = -1; } else { oplogOffset = lastOffset + InternalDataSerializer.readVLOld(dis); lastOffset = oplogOffset; } if (oplogKeyId > oplogKeyIdHWM) { oplogKeyIdHWM = oplogKeyId; } if (okToSkipModifyRecord(deletedIds, drId, drs, oplogKeyId, true, tag).skip()) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readNewEntry skipping oplogKeyId=<" + oplogKeyId + ">" + " drId=" + drId + " userBits=" + userBits + " oplogOffset=" + oplogOffset + " valueLen=" + valueLength); } // logger.info(LocalizedStrings.DEBUG, // "DEBUG: recover krf skipping oplogKeyId=" + oplogKeyId); this.stats.incRecoveryRecordsSkipped(); incSkipped(); } else { if (EntryBits.isAnyInvalid(userBits)) { if (EntryBits.isInvalid(userBits)) { valueBytes = DiskEntry.INVALID_BYTES; } else { valueBytes = DiskEntry.LOCAL_INVALID_BYTES; } } else if (EntryBits.isTombstone(userBits)) { valueBytes = DiskEntry.TOMBSTONE_BYTES; } Object key = deserializeKey(keyBytes, version, in); // logger.info(LocalizedStrings.DEBUG, "DEBUG: recover krf key=" + key // + " id=" + oplogKeyId); /* { Object oldValue = getRecoveryMap().put(oplogKeyId, key); if (oldValue != null) { throw new AssertionError(LocalizedStrings.Oplog_DUPLICATE_CREATE .toLocalizedString(oplogKeyId)); } } */ DiskEntry de = drs.getDiskEntry(key); if (de == null) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readNewEntry oplogKeyId=<" + oplogKeyId + ">" + " drId=" + drId + " key=<" + key + ">" + " userBits=" + userBits + " oplogOffset=" + oplogOffset + " valueLen=" + valueLength); // + " kvMapSize=" + getRecoveryMap().size() // + " kvMapKeys=" + laToString(getRecoveryMap().keys())); } DiskEntry.RecoveredEntry re = createRecoveredEntry(valueBytes, valueLength, userBits, getOplogId(), oplogOffset, oplogKeyId, false, version, in); if (tag != null) { re.setVersionTag(tag); } if (lastModifiedTime != 0) { re.setLastModifiedTime(lastModifiedTime); } de = initRecoveredEntry(drs.getDiskRegionView(), drs .initializeRecoveredEntry(key, re)); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); krfEntryCount++; } else { DiskId curdid = de.getDiskId(); //assert curdid.getOplogId() != getOplogId(); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY ignore readNewEntry because getOplogId()=" + getOplogId() + " != curdid.getOplogId()=" + curdid.getOplogId() + " for drId=" + drId + " key=" + key); } } Object oldEntry = getRecoveryMap().put(oplogKeyId, de); if (oldEntry != null) { throw new AssertionError(LocalizedStrings.Oplog_DUPLICATE_CREATE .toLocalizedString(oplogKeyId)); } } keyBytes = DataSerializer.readByteArray(dis); } // while setRecoverNewEntryId(oplogKeyIdHWM); } catch (IOException ex) { try { fis.close(); fis = null; } catch (IOException ignore) { } throw new DiskAccessException( "Unable to recover from krf file for oplogId=" + oplogId + ", file=" + f.getName() + ". This file is corrupt, but may be safely deleted.", ex, getParent()); } if (recoverValues && krfEntryCount > 0) { oplogsNeedingValueRecovery.add(this); // TODO optimize this code and make it async // It should also honor the lru limit // The fault in logic might not work until // the region is actually created. // Instead of reading the crf it might be better to iterate the live entry // list that was built during KRF recovery. Just fault values in until we // hit the LRU limit (if we have one). Only fault in values for entries // recovered from disk that are still in this oplog. // Defer faulting in values until all oplogs for the ds have been // recovered. } } finally { // fix for bug 42776 if (fis != null) { try { fis.close(); fis = null; } catch (IOException ignore) { } } } return true; } /** * Return number of bytes read */ private long readCrf(OplogEntryIdSet deletedIds, boolean recoverValues, boolean latestOplog) { this.recoverNewEntryId = DiskStoreImpl.INVALID_ID; this.recoverModEntryId = DiskStoreImpl.INVALID_ID; this.recoverModEntryIdHWM = DiskStoreImpl.INVALID_ID; boolean readLastRecord = true; CountingDataInputStream dis = null; try { final LocalRegion currentRegion = LocalRegion.getInitializingRegion(); final boolean keyRequiresRegionContext = currentRegion != null ? currentRegion.keyRequiresRegionContext() : false; final Version version = getProductVersionIfOld(); final ByteArrayDataInput in = new ByteArrayDataInput(); final HeapDataOutputStream hdos = new HeapDataOutputStream(Version.CURRENT); final long currentTime = getParent().getCache().cacheTimeMillis(); int recordCount = 0; boolean foundDiskStoreRecord = false; FileInputStream fis = null; try { fis = new FileInputStream(this.crf.f); dis = new CountingDataInputStream(new BufferedInputStream(fis, LARGE_BUFFER_SIZE), this.crf.f.length()); boolean endOfLog = false; while (!endOfLog) { // long startPosition = byteCount; if (dis.atEndOfFile()) { endOfLog = true; break; } readLastRecord = false; byte opCode = dis.readByte(); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY Oplog opCode=" + opCode); } switch (opCode) { case OPLOG_EOF_ID: // we are at the end of the oplog. So we need to back up one byte dis.decrementCount(); endOfLog = true; break; case OPLOG_CONFLICT_VERSION: this.readVersionTagOnlyEntry(dis, opCode); break; case OPLOG_NEW_ENTRY_BASE_ID: { long newEntryBase = dis.readLong(); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY newEntryBase=" + newEntryBase); } readEndOfRecord(dis); setRecoverNewEntryId(newEntryBase); recordCount++; } break; case OPLOG_NEW_ENTRY_0ID: readNewEntry(dis, opCode, deletedIds, recoverValues, currentRegion, keyRequiresRegionContext, version, in, hdos, currentTime); recordCount++; break; case OPLOG_MOD_ENTRY_1ID: case OPLOG_MOD_ENTRY_2ID: case OPLOG_MOD_ENTRY_3ID: case OPLOG_MOD_ENTRY_4ID: case OPLOG_MOD_ENTRY_5ID: case OPLOG_MOD_ENTRY_6ID: case OPLOG_MOD_ENTRY_7ID: case OPLOG_MOD_ENTRY_8ID: readModifyEntry(dis, opCode, deletedIds, recoverValues, currentRegion, keyRequiresRegionContext, version, in, hdos, currentTime); recordCount++; break; case OPLOG_MOD_ENTRY_WITH_KEY_1ID: case OPLOG_MOD_ENTRY_WITH_KEY_2ID: case OPLOG_MOD_ENTRY_WITH_KEY_3ID: case OPLOG_MOD_ENTRY_WITH_KEY_4ID: case OPLOG_MOD_ENTRY_WITH_KEY_5ID: case OPLOG_MOD_ENTRY_WITH_KEY_6ID: case OPLOG_MOD_ENTRY_WITH_KEY_7ID: case OPLOG_MOD_ENTRY_WITH_KEY_8ID: readModifyEntryWithKey(dis, opCode, deletedIds, recoverValues, currentRegion, keyRequiresRegionContext, version, in, hdos, currentTime); recordCount++; break; case OPLOG_DISK_STORE_ID: readDiskStoreRecord(dis, this.crf.f); foundDiskStoreRecord = true; recordCount++; break; case OPLOG_GEMFIRE_VERSION: readGemfireVersionRecord(dis, this.crf.f); recordCount++; break; case OPLOG_RVV: readRVVRecord(dis, this.drf.f, false, latestOplog); recordCount++; break; default: throw new DiskAccessException(LocalizedStrings.Oplog_UNKNOWN_OPCODE_0_FOUND_IN_DISK_OPERATION_LOG.toLocalizedString(opCode), getParent()); } readLastRecord = true; // @todo // if (rgn.isDestroyed()) { // break; // } } // while } finally { if (dis != null) { dis.close(); } if (fis != null) { fis.close(); } } if (!foundDiskStoreRecord && recordCount > 0) { throw new DiskAccessException("The oplog file \"" + this.crf.f + "\" does not belong to the init file \"" + getParent().getInitFile() + "\". Crf did not contain a disk store id.", getParent()); } } catch (EOFException ex) { // ignore since a partial record write can be caused by a crash // if (byteCount < fileLength) { // throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_READING_FILE_DURING_RECOVERY_FROM_0 // .toLocalizedString(this.crf.f.getPath()), ex, getParent()); // }// else do nothing, this is expected in crash scenarios } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_READING_FILE_DURING_RECOVERY_FROM_0 .toLocalizedString(this.crf.f.getPath()), ex, getParent()); } catch (CancelException ignore) { if (logger.fineEnabled()) { logger.fine("Oplog::readOplog:Error in recovery as Cache was closed", ignore); } } catch (RegionDestroyedException ignore) { if (logger.fineEnabled()) { logger.fine( "Oplog::readOplog:Error in recovery as Region was destroyed", ignore); } } catch (IllegalStateException ex) { // @todo // if (!rgn.isClosed()) { throw ex; // } } //Add the Oplog size to the Directory Holder which owns this oplog, // so that available space is correctly calculated & stats updated. long byteCount = 0; if (!readLastRecord) { // this means that there was a crash // and hence we should not continue to read // the next oplog this.crashed = true; if (dis != null) { byteCount = dis.getFileLength(); } } else { if (dis != null) { byteCount = dis.getCount(); } } return byteCount; } /** * @throws DiskAccessException if this file does not belong to our parent */ private void readDiskStoreRecord(DataInput dis, File f) throws IOException { long leastSigBits = dis.readLong(); long mostSigBits = dis.readLong(); DiskStoreID readDSID = new DiskStoreID(mostSigBits, leastSigBits); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY diskStoreId=" + readDSID); } readEndOfRecord(dis); DiskStoreID dsid = getParent().getDiskStoreID(); if (!readDSID.equals(dsid)) { throw new DiskAccessException("The oplog file \"" + f + "\" does not belong to the init file \"" + getParent().getInitFile() + "\".", getParent()); } } /** * @throws DiskAccessException if this file does not belong to our parent */ private void readGemfireVersionRecord(DataInput dis, File f) throws IOException { Version recoveredGFVersion = readProductVersionRecord(dis, f); final boolean hasDataVersion; if ((hasDataVersion = (recoveredGFVersion == Version.TOKEN))) { // actual GFE version will be the next record in this case byte opCode = dis.readByte(); if (opCode != OPLOG_GEMFIRE_VERSION) { throw new DiskAccessException(LocalizedStrings .Oplog_UNKNOWN_OPCODE_0_FOUND_IN_DISK_OPERATION_LOG .toLocalizedString(opCode), getParent()); } recoveredGFVersion = readProductVersionRecord(dis, f); } if (this.gfversion == null) { this.gfversion = recoveredGFVersion; } else { assert this.gfversion == recoveredGFVersion; } if (hasDataVersion) { byte opCode = dis.readByte(); if (opCode != OPLOG_GEMFIRE_VERSION) { throw new DiskAccessException(LocalizedStrings .Oplog_UNKNOWN_OPCODE_0_FOUND_IN_DISK_OPERATION_LOG .toLocalizedString(opCode), getParent()); } recoveredGFVersion = readProductVersionRecord(dis, f); if (this.dataVersion == null) { this.dataVersion = recoveredGFVersion; } else { assert this.dataVersion == recoveredGFVersion; } } } private Version readProductVersionRecord(DataInput dis, File f) throws IOException { Version recoveredGFVersion; short ver = Version.readOrdinal(dis); try { recoveredGFVersion = Version.fromOrdinal(ver, false); } catch (UnsupportedGFXDVersionException e) { throw new DiskAccessException(LocalizedStrings .Oplog_UNEXPECTED_PRODUCT_VERSION_0.toLocalizedString(ver), e, getParent()); } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY version=" + recoveredGFVersion); } readEndOfRecord(dis); return recoveredGFVersion; } private void readTotalCountRecord(DataInput dis, File f) throws IOException { long recoveredCount = InternalDataSerializer.readUnsignedVL(dis); this.totalCount.set(recoveredCount); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY totalCount=" + totalCount); } readEndOfRecord(dis); } private void readRVVRecord(DataInput dis, File f, boolean gcRVV, boolean latestOplog) throws IOException { long numRegions = InternalDataSerializer.readUnsignedVL(dis); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readRVV entry numRegions=" + numRegions); } for(int region =0; region < numRegions; region++) { long drId = InternalDataSerializer.readUnsignedVL(dis); //Get the drs. This may be null if this region is not currently recovering DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); if (drs instanceof AbstractRegion && !((AbstractRegion)drs).getConcurrencyChecksEnabled()) { drs = null; } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readRVV drId=" + drId + " region=" + drs); } if(gcRVV) { //Read the GCC RV long rvvSize = InternalDataSerializer.readUnsignedVL(dis); for(int memberNum = 0; memberNum < rvvSize; memberNum++) { //for each member, read the member id and version long memberId = InternalDataSerializer.readUnsignedVL(dis); long gcVersion = InternalDataSerializer.readUnsignedVL(dis); //if we have a recovery store, add the recovered regions if(drs != null) { Object member = getParent().getDiskInitFile().getCanonicalObject((int)memberId); drs.recordRecoveredGCVersion((VersionSource) member, gcVersion); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY adding gcRVV entry drId=" + drId + ",member=" + memberId + ",version=" + gcVersion); } } else { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY skipping gcRVV entry drId=" + drId + ",member=" + memberId + ",version=" + gcVersion); } } } } else { boolean rvvTrusted = InternalDataSerializer.readBoolean(dis); if(drs != null) { if (latestOplog) { // only set rvvtrust based on the newest oplog recovered drs.setRVVTrusted(rvvTrusted); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY marking RVV trusted drId=" + drId + ",rvvTrusted=" + rvvTrusted); } } } //Read a regular RVV long rvvSize = InternalDataSerializer.readUnsignedVL(dis); for(int memberNum = 0; memberNum < rvvSize; memberNum++) { //for each member, read the member id and version long memberId = InternalDataSerializer.readUnsignedVL(dis); RegionVersionHolder versionHolder = new RegionVersionHolder(dis); if(drs != null) { Object member = getParent().getDiskInitFile().getCanonicalObject((int)memberId); drs.recordRecoveredVersonHolder((VersionSource) member, versionHolder, latestOplog); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY adding RVV entry drId=" + drId + ",member=" + memberId + ",versionHolder=" + versionHolder+",latestOplog="+latestOplog+",oplogId="+getOplogId()); } } else { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY skipping RVV entry drId=" + drId + ",member=" + memberId + ",versionHolder=" + versionHolder); } } } } } readEndOfRecord(dis); } /** * Recovers one oplog * @param latestOplog - true if this oplog is the latest oplog in the disk * store. */ long recoverCrf(OplogEntryIdSet deletedIds, boolean recoverValues, boolean recoverValuesSync, boolean alreadyRecoveredOnce, Set oplogsNeedingValueRecovery, boolean latestOplog, boolean initialRecovery) { // crf might not exist; but drf always will this.diskFile = new File(this.drf.f.getParentFile(), oplogSet.getPrefix() + getParent().getName() + "_" + this.oplogId); File crfFile = this.crf.f; if (crfFile == null) { this.haveRecoveredCrf = true; // logger.info(LocalizedStrings.DEBUG, "DEBUG crfFile=null"); return 0L; } lockCompactor(); this.kvMap = new OplogEntryIdMap(); this.skippedKeyBytes = new OplogEntryIdMap(); try { // logger.info(LocalizedStrings.DEBUG, "DEBUG haveRecoveredCrf=" // + this.haveRecoveredCrf + " isDestroyed()=" + isDeleted() // + " alreadyRecoveredOnce=" + alreadyRecoveredOnce); if (this.haveRecoveredCrf && isDeleted()) return 0; // do this check while holding lock if (!this.haveRecoveredCrf) { this.haveRecoveredCrf = true; } long byteCount; // if we have a KRF then read it and delay reading the CRF. // Unless we are in synchronous recovery mode if (!readKrf(deletedIds, recoverValues, recoverValuesSync, oplogsNeedingValueRecovery, latestOplog)) { //If the data extraction tool is trying to recover keys using // the FORCE_KRF_RECOVERY flag and for any reason the readKrf returns false //DO NOT proceed further and read the CRF file. if (getParent().FORCE_KRF_RECOVERY && getParent().dataExtractionKrfRecovery) { return 0L; } if (logger.infoEnabled()) { logger.info(LocalizedStrings.DiskRegion_RECOVERING_OPLOG_0_1_2, new Object[] { toString(), crfFile.getAbsolutePath(), getParent().getName() }); } byteCount = readCrf(deletedIds, recoverValues, latestOplog); // if (this.idxBuilder != null) { // this.idxBuilder.sortRecords(); // } } else { byteCount = this.crf.f.length(); } if (!isPhase2()) { if (getParent().isOfflineCompacting()) { getParent().incLiveEntryCount(getRecoveryMap().size()); } getParent().incDeadRecordCount(getRecordsSkipped()); } if (getParent().isOfflineCompacting()) { offlineCompact(deletedIds, latestOplog); } if (!alreadyRecoveredOnce) { setRecoveredCrfSize(byteCount); // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "crfSize inc=" + byteCount); // } this.dirHolder.incrementTotalOplogSize(byteCount); } if (getParent().isOfflineCompacting()) { if (isOplogEmpty()) { this.deleted.set(false); destroy(); } } // initialize the kvMap used by index recovery if required; index recovery // is now done in IndexRecoveryTask so the condition for copying // here should be same as that for creating an IndexRecoveryTask else if (!getParent().isOffline()) { this.kvInitMap = this.kvMap; } return byteCount; } finally { this.kvMap = null; this.skippedKeyBytes = null; unlockCompactor(); } } private boolean offlineCompactPhase2 = false; private boolean isPhase1() { return !this.offlineCompactPhase2; } private boolean isPhase2() { return this.offlineCompactPhase2; } private void offlineCompact(OplogEntryIdSet deletedIds, boolean latestOplog) { // If we only do this if "(getRecordsSkipped() > 0)" then it will only compact // an oplog that has some garbage in it. // Instead if we do every oplog in case they set maxOplogSize // then all oplogs will be converted to obey maxOplogSize. // 45777: for normal offline compaction, we only do it when getRecordsSkipped() > 0 // but for upgrade disk store, we have to do it for pure creates oplog if (getRecordsSkipped() > 0 || getHasDeletes() || getParent().isUpgradeVersionOnly()) { this.offlineCompactPhase2 = true; if(getOplogSet().getChild() == null) { getOplogSet().initChild(); } readCrf(deletedIds, true, latestOplog); this.deleted.set(false); destroyCrfOnly(); } else { // For every live entry in this oplog add it to the deleted set // so that we will skip it when we recovery the next oplogs. for (OplogEntryIdMap.Iterator it = getRecoveryMap().iterator(); it.hasNext();) { it.advance(); deletedIds.add(it.key()); } close(); } } private DiskEntry.RecoveredEntry createRecoveredEntry(byte[] valueBytes, int valueLength, byte userBits, long oplogId, long offsetInOplog, long oplogKeyId, boolean recoverValue, Version version, ByteArrayDataInput in) { DiskEntry.RecoveredEntry re = null; if (recoverValue || EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { Object value; if (EntryBits.isLocalInvalid(userBits)) { value = Token.LOCAL_INVALID; valueLength = 0; } else if (EntryBits.isInvalid(userBits)) { value = Token.INVALID; valueLength = 0; } else if (EntryBits.isSerialized(userBits)) { value = DiskEntry.Helper .readSerializedValue(valueBytes, version, in, false); } else if (EntryBits.isTombstone(userBits)) { value = Token.TOMBSTONE; } else { final StaticSystemCallbacks sysCb; if (version != null && (sysCb = GemFireCacheImpl .getInternalProductCallbacks()) != null) { // may need to change serialized shape for GemFireXD value = sysCb .fromVersion(valueBytes, valueLength, false, version, in); } else { value = valueBytes; } } re = new DiskEntry.RecoveredEntry(oplogKeyId, oplogId, offsetInOplog, userBits, valueLength, value); } else { re = new DiskEntry.RecoveredEntry(oplogKeyId, oplogId, offsetInOplog, userBits, valueLength); } return re; } private void readEndOfRecord(DataInput di) throws IOException { int b = di.readByte(); if (b != END_OF_RECORD_ID) { if (b == 0) { logger.warning(LocalizedStrings.Oplog_PARTIAL_RECORD); // this is expected if this is the last record and we died while writing it. throw new EOFException("found partial last record"); } else { // Our implementation currently relies on all unwritten bytes having // a value of 0. So throw this exception if we find one we didn't expect. throw new IllegalStateException("expected end of record (byte==" + END_OF_RECORD_ID + ") or zero but found " + b); } } } private static void forceSkipBytes(CountingDataInputStream dis, int len) throws IOException { int skipped = dis.skipBytes(len); while (skipped < len) { dis.readByte(); skipped++; } } private int recordsSkippedDuringRecovery = 0; private void incSkipped() { this.recordsSkippedDuringRecovery++; } int getRecordsSkipped() { return this.recordsSkippedDuringRecovery; } private VersionTag readVersionsFromOplog(DataInput dis) throws IOException { if (Version.GFE_70.compareTo(currentRecoveredGFVersion()) <= 0) { // this version format is for gemfire 7.0 // if we have different version format in 7.1, it will be handled in "else if" int entryVersion = (int)InternalDataSerializer.readSignedVL(dis); long regionVersion = InternalDataSerializer.readUnsignedVL(dis); int memberId = (int)InternalDataSerializer.readUnsignedVL(dis); Object member = getParent().getDiskInitFile().getCanonicalObject(memberId); long timestamp = InternalDataSerializer.readUnsignedVL(dis); int dsId = (int) InternalDataSerializer.readSignedVL(dis); VersionTag vt = VersionTag.create((VersionSource)member); vt.setEntryVersion(entryVersion); vt.setRegionVersion(regionVersion); vt.setMemberID((VersionSource)member); vt.setVersionTimeStamp(timestamp); vt.setDistributedSystemId(dsId); return vt; } else { // pre-7.0 return null; } } private synchronized VersionTag createDummyTag(DiskRecoveryStore drs, long currentTime) { DiskStoreID member = getParent().getDiskStoreID(); getParent().getDiskInitFile().getOrCreateCanonicalId(member); long regionVersion = drs.getVersionForMember(member); VersionTag vt = VersionTag.create(member); vt.setEntryVersion(1); vt.setRegionVersion(regionVersion+1); vt.setMemberID(member); vt.setVersionTimeStamp(currentTime); vt.setDistributedSystemId(-1); return vt; } /** * Reads an oplog entry of type Create * * @param dis * DataInputStream from which the oplog is being read * @param opcode * byte whether the id is short/int/long * @param recoverValue * @throws IOException */ private void readNewEntry(CountingDataInputStream dis, byte opcode, OplogEntryIdSet deletedIds, boolean recoverValue, final LocalRegion currentRegion, boolean keyRequiresRegionContext, Version version, ByteArrayDataInput in, HeapDataOutputStream hdos, final long currentTime) throws IOException { long oplogOffset = -1; byte userBits = dis.readByte(); byte[] objValue = null; int valueLength =0; long oplogKeyId = incRecoverNewEntryId(); long drId = DiskInitFile.readDiskRegionID(dis); DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); // read versions VersionTag tag = null; final boolean withVersions = EntryBits.isWithVersions(userBits); long lastModifiedTime = 0L; if (withVersions) { tag = readVersionsFromOplog(dis); } else if (getParent().isUpgradeVersionOnly() && drs != null /* Sqlfire 1.1 1099 has no version tags */ && !Version.CURRENT.equals(Version.SQLF_1099) && !Version.CURRENT.equals(Version.SQLF_11)) { tag = this.createDummyTag(drs, currentTime); userBits = EntryBits.setWithVersions(userBits, true); } // read last modified time for no-versions case if (!withVersions) { if (EntryBits.isLastModifiedTime(userBits)) { lastModifiedTime = InternalDataSerializer.readUnsignedVL(dis); } else if (tag == null) { lastModifiedTime = currentTime; } } OkToSkipResult skipResult = okToSkipModifyRecord(deletedIds, drId, drs, oplogKeyId, true, tag); if (skipResult.skip()) { if (!isPhase2()) { this.stats.incRecoveryRecordsSkipped(); incSkipped(); } } else if (recoverValue && drs.lruLimitExceeded() && !getParent().isOfflineCompacting()) { this.stats.incRecoveredValuesSkippedDueToLRU(); recoverValue = false; } CompactionRecord p2cr = null; long crOffset; if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { if (EntryBits.isInvalid(userBits)) { objValue = DiskEntry.INVALID_BYTES; } else if (EntryBits.isTombstone(userBits)) { objValue = DiskEntry.TOMBSTONE_BYTES; } else { objValue = DiskEntry.LOCAL_INVALID_BYTES; } crOffset = dis.getCount(); if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } } else { int len = dis.readInt(); oplogOffset = dis.getCount(); crOffset = oplogOffset; valueLength = len; if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } if (recoverValue && !skipResult.skip()) { byte[] valueBytes = new byte[len]; dis.readFully(valueBytes); objValue = valueBytes; validateValue(valueBytes, userBits, version, in); } else { forceSkipBytes(dis, len); } } { int len = dis.readInt(); incTotalCount(); if (skipResult.skip()) { if(skipResult.skipKey()) { forceSkipBytes(dis, len); } else { byte[] keyBytes = new byte[len]; dis.readFully(keyBytes); skippedKeyBytes.put(oplogKeyId, keyBytes); } readEndOfRecord(dis); if(drs != null && tag != null) { //Update the RVV with the new entry //This must be done after reading the end of record to make sure //we don't have a corrupt record. See bug #45538 drs.recordRecoveredVersionTag(tag); } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readNewEntry SKIPPING oplogKeyId=<" + oplogKeyId + ">" + " drId=" + drId + " userBits=" + userBits + " keyLen=" + len + " valueLen=" + valueLength + " tag=" + tag ); } } else { byte[] keyBytes = null; if (isPhase2()) { forceSkipBytes(dis, len); } else { keyBytes = new byte[len]; dis.readFully(keyBytes); } readEndOfRecord(dis); if(drs != null && tag != null) { //Update the RVV with the new entry //This must be done after reading the end of record to make sure //we don't have a corrupt record. See bug #45538 drs.recordRecoveredVersionTag(tag); } if (getParent().isOfflineCompacting()) { if (isPhase1()) { CompactionRecord cr = new CompactionRecord(keyBytes, crOffset); getRecoveryMap().put(oplogKeyId, cr); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); } else { // phase2 assert p2cr != null; // may need to change the key/value bytes for GemFireXD keyBytes = p2cr.getKeyBytes(); if (version != null && !Version.CURRENT.equals(version)) { final StaticSystemCallbacks sysCb = GemFireCacheImpl .getInternalProductCallbacks(); if (sysCb != null) { keyBytes = sysCb.fromVersionToBytes(keyBytes, keyBytes.length, true, version, in, hdos); objValue = sysCb.fromVersionToBytes(objValue, objValue.length, EntryBits.isSerialized(userBits), version, in, hdos); } } getOplogSet().getChild().copyForwardForOfflineCompact(oplogKeyId, keyBytes, objValue, userBits, drId, tag, lastModifiedTime, currentTime); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readNewEntry copyForward oplogKeyId=<" + oplogKeyId + ">"); } // add it to the deletedIds set so we will ignore it in earlier oplogs deletedIds.add(oplogKeyId); } } else { Object key = deserializeKey(keyBytes, version, in); if (keyRequiresRegionContext) { ((KeyWithRegionContext)key).setRegionContext(currentRegion); } /* { Object oldValue = getRecoveryMap().put(oplogKeyId, key); if (oldValue != null) { throw new AssertionError(LocalizedStrings.Oplog_DUPLICATE_CREATE.toLocalizedString(oplogKeyId)); } } */ DiskEntry de = drs.getDiskEntry(key); if (de == null) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readNewEntry oplogKeyId=<" + oplogKeyId + ">" + " drId=" + drId + " key=<"+ key + ">" + " userBits=" + userBits + " oplogOffset=" + oplogOffset + " valueLen=" + valueLength + " tag=" + tag ); // + " kvMapSize=" + getRecoveryMap().size() // + " kvMapKeys=" + laToString(getRecoveryMap().keys())); } DiskEntry.RecoveredEntry re = createRecoveredEntry(objValue, valueLength, userBits, getOplogId(), oplogOffset, oplogKeyId, recoverValue, version, in); if (tag != null) { re.setVersionTag(tag); } if (lastModifiedTime != 0) { re.setLastModifiedTime(lastModifiedTime); } de = initRecoveredEntry(drs.getDiskRegionView(), drs.initializeRecoveredEntry(key, re)); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); } else { DiskId curdid = de.getDiskId(); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY ignore readNewEntry because getOplogId()=" + getOplogId() + " != curdid.getOplogId()=" + curdid.getOplogId() + " oplogKeyId=<" + oplogKeyId + ">" + " for drId=" + drId + " tag=" + tag + " key=" + key); } assert curdid.getOplogId() != getOplogId(); } Object oldEntry = getRecoveryMap().put(oplogKeyId, de); if (oldEntry != null) { throw new AssertionError( LocalizedStrings.Oplog_DUPLICATE_CREATE .toLocalizedString(oplogKeyId)); } } } } } /** * Reads an oplog entry of type Modify * * @param dis * DataInputStream from which the oplog is being read * @param opcode * byte whether the id is short/int/long * @param recoverValue * @param currentRegion * @param keyRequiresRegionContext * @throws IOException */ private void readModifyEntry(CountingDataInputStream dis, byte opcode, OplogEntryIdSet deletedIds, boolean recoverValue, LocalRegion currentRegion, boolean keyRequiresRegionContext, Version version, ByteArrayDataInput in, HeapDataOutputStream hdos, final long currentTime) throws IOException { long oplogOffset = -1; byte userBits = dis.readByte(); int idByteCount = (opcode - OPLOG_MOD_ENTRY_1ID) + 1; // long debugRecoverModEntryId = this.recoverModEntryId; long oplogKeyId = getModEntryId(dis, idByteCount); // long debugOplogKeyId = dis.readLong(); // //assert oplogKeyId == debugOplogKeyId // // : "expected=" + debugOplogKeyId + " actual=" + oplogKeyId // assert debugRecoverModEntryId == debugOplogKeyId // : "expected=" + debugOplogKeyId + " actual=" + debugRecoverModEntryId // + " idByteCount=" + idByteCount // + " delta=" + this.lastDelta; long drId = DiskInitFile.readDiskRegionID(dis); DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); // read versions VersionTag tag = null; final boolean withVersions = EntryBits.isWithVersions(userBits); long lastModifiedTime = 0L; if (withVersions) { tag = readVersionsFromOplog(dis); } else if (getParent().isUpgradeVersionOnly() && drs != null /* Sqlfire 1.1 and 1099 has no version tags */ && !Version.CURRENT.equals(Version.SQLF_1099) && !Version.CURRENT.equals(Version.SQLF_11)) { tag = this.createDummyTag(drs, currentTime); userBits = EntryBits.setWithVersions(userBits, true); } // read last modified time for no-versions case if (!withVersions) { if (EntryBits.isLastModifiedTime(userBits)) { lastModifiedTime = InternalDataSerializer.readUnsignedVL(dis); } else if (tag == null) { lastModifiedTime = currentTime; } } OkToSkipResult skipResult = okToSkipModifyRecord(deletedIds, drId, drs, oplogKeyId, false, tag); if (skipResult.skip()) { if (!isPhase2()) { incSkipped(); this.stats.incRecoveryRecordsSkipped(); } } else if (recoverValue && drs.lruLimitExceeded() && !getParent().isOfflineCompacting()) { this.stats.incRecoveredValuesSkippedDueToLRU(); recoverValue = false; } byte[] objValue = null; int valueLength = 0; CompactionRecord p2cr = null; long crOffset; if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { if (EntryBits.isInvalid(userBits)) { objValue = DiskEntry.INVALID_BYTES; } else if (EntryBits.isTombstone(userBits)) { objValue = DiskEntry.TOMBSTONE_BYTES; } else { objValue = DiskEntry.LOCAL_INVALID_BYTES; } crOffset = dis.getCount(); if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } } else { int len = dis.readInt(); oplogOffset = dis.getCount(); crOffset = oplogOffset; valueLength = len; if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } if (!skipResult.skip() && recoverValue) { byte[] valueBytes = new byte[len]; dis.readFully(valueBytes); objValue = valueBytes; validateValue(valueBytes, userBits, version, in); } else { forceSkipBytes(dis, len); } } readEndOfRecord(dis); if(drs != null && tag != null) { //Update the RVV with the new entry //This must be done after reading the end of record to make sure //we don't have a corrupt record. See bug #45538 drs.recordRecoveredVersionTag(tag); } incTotalCount(); if (!skipResult.skip()) { Object key = null; Object entry = getRecoveryMap().get(oplogKeyId); DiskEntry de = null; byte[] keyBytes = null; //if the key is not in the recover map, it's possible it //was previously skipped. Check the skipped bytes map for the key. if(entry == null) { keyBytes = (byte[]) skippedKeyBytes.get(oplogKeyId); if (keyBytes != null) { key = deserializeKey(keyBytes, version, in); if (keyRequiresRegionContext) { ((KeyWithRegionContext)key).setRegionContext(currentRegion); } } } else if (entry instanceof DiskEntry) { de = (DiskEntry)entry; key = de.getKeyCopy(); } else { key = entry; } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readModifyEntry oplogKeyId=<" + oplogKeyId + ">" + "drId=" + drId + " key=<" + key + "> userBits=" + userBits + " oplogOffset=" + oplogOffset + " tag=" + tag + " valueLen=" + valueLength); // + " kvMapSize=" + getRecoveryMap().size()); } // Will no longer be null since 1st modify record in any oplog // will now be a MOD_ENTRY_WITH_KEY record. assert key != null; if (getParent().isOfflineCompacting()) { if (isPhase1()) { CompactionRecord cr = (CompactionRecord)key; incSkipped(); // we are going to compact the previous record away cr.update(crOffset); } else { // phase2 assert p2cr != null; // may need to change the key/value bytes for GemFireXD keyBytes = p2cr.getKeyBytes(); if (version != null && !Version.CURRENT.equals(version)) { final StaticSystemCallbacks sysCb = GemFireCacheImpl .getInternalProductCallbacks(); if (sysCb != null) { keyBytes = sysCb.fromVersionToBytes(keyBytes, keyBytes.length, true, version, in, hdos); objValue = sysCb.fromVersionToBytes(objValue, objValue.length, EntryBits.isSerialized(userBits), version, in, hdos); } } getOplogSet().getChild().copyForwardForOfflineCompact(oplogKeyId, keyBytes, objValue, userBits, drId, tag, lastModifiedTime, currentTime); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readModifyEntry copyForward oplogKeyId=<" + oplogKeyId + ">"); } // add it to the deletedIds set so we will ignore it in earlier oplogs deletedIds.add(oplogKeyId); } } else { // Check the actual region to see if it has this key from // a previous recovered oplog. if (de == null && key != null) { de = drs.getDiskEntry(key); } //This may actually be create, if the previous create or modify //of this entry was cleared through the RVV clear. if (de == null) { DiskRegionView drv = drs.getDiskRegionView(); // and create an entry DiskEntry.RecoveredEntry re = createRecoveredEntry(objValue, valueLength, userBits, getOplogId(), oplogOffset, oplogKeyId, recoverValue, version, in); if (tag != null) { re.setVersionTag(tag); } if (lastModifiedTime != 0) { re.setLastModifiedTime(lastModifiedTime); } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readModEntryWK init oplogKeyId=<" + oplogKeyId + ">" + "drId=" + drId + "key=<"+ key + ">" + " oplogOffset=" + oplogOffset + " userBits=" + userBits + " valueLen=" + valueLength + " tag=" + tag ); // + " kvMapSize=" + .getRecoveryMap().size() // + " kvMapKeys=" + laToString(getRecoveryMap().keys())); } de = initRecoveredEntry(drv, drs.initializeRecoveredEntry(key, re)); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); } else { DiskEntry.RecoveredEntry re = createRecoveredEntry(objValue, valueLength, userBits, getOplogId(), oplogOffset, oplogKeyId, recoverValue, version, in); if (tag != null) { re.setVersionTag(tag); } de = drs.updateRecoveredEntry(key, de, re); updateRecoveredEntry(drs.getDiskRegionView(), de, re); this.stats.incRecoveredEntryUpdates(); } } } else { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY skipping readModifyEntry oplogKeyId=<" + oplogKeyId + ">" + "drId=" + drId); } } } private void readVersionTagOnlyEntry(CountingDataInputStream dis, byte opcode) throws IOException { long drId = DiskInitFile.readDiskRegionID(dis); DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); // read versions VersionTag tag = readVersionsFromOplog(dis); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readVersionTagOnlyEntry " +" drId="+drId+" tag="+tag); } readEndOfRecord(dis); //Update the RVV with the new entry if(drs != null) { drs.recordRecoveredVersionTag(tag); } } private void validateValue(byte[] valueBytes, byte userBits, Version version, ByteArrayDataInput in) { if (getParent().isValidating()) { if (EntryBits.isSerialized(userBits)) { // make sure values are deserializable if (!PdxWriterImpl.isPdx(valueBytes)) { // fix bug 43011 try { DiskEntry.Helper.readSerializedValue(valueBytes, version, in, true); } catch (SerializationException ex) { logger.warning(LocalizedStrings.DEBUG, "Could not deserialize recovered value: " + ex.getCause()); } } } } } /** * Reads an oplog entry of type ModifyWithKey * * @param dis * DataInputStream from which the oplog is being read * @param opcode * byte whether the id is short/int/long * @param deletedIds * @param recoverValue * @throws IOException */ private void readModifyEntryWithKey(CountingDataInputStream dis, byte opcode, OplogEntryIdSet deletedIds, boolean recoverValue, final LocalRegion currentRegion, final boolean keyRequiresRegionContext, Version version, ByteArrayDataInput in, HeapDataOutputStream hdos, final long currentTime) throws IOException { long oplogOffset = -1; byte userBits = dis.readByte(); int idByteCount = (opcode - OPLOG_MOD_ENTRY_WITH_KEY_1ID) + 1; // long debugRecoverModEntryId = this.recoverModEntryId; long oplogKeyId = getModEntryId(dis, idByteCount); // long debugOplogKeyId = dis.readLong(); // //assert oplogKeyId == debugOplogKeyId // // : "expected=" + debugOplogKeyId + " actual=" + oplogKeyId // assert debugRecoverModEntryId == debugOplogKeyId // : "expected=" + debugOplogKeyId + " actual=" + debugRecoverModEntryId // + " idByteCount=" + idByteCount // + " delta=" + this.lastDelta; long drId = DiskInitFile.readDiskRegionID(dis); DiskRecoveryStore drs = getOplogSet().getCurrentlyRecovering(drId); // read version VersionTag tag = null; final boolean withVersions = EntryBits.isWithVersions(userBits); long lastModifiedTime = 0L; if (withVersions) { tag = readVersionsFromOplog(dis); } else if (getParent().isUpgradeVersionOnly() && drs != null /* Sqlfire 1.1 and 1099 has no version tags */ && !Version.CURRENT.equals(Version.SQLF_1099) && !Version.CURRENT.equals(Version.SQLF_11)) { tag = this.createDummyTag(drs, currentTime); userBits = EntryBits.setWithVersions(userBits, true); } // read last modified time for no-versions case if (!withVersions) { if (EntryBits.isLastModifiedTime(userBits)) { lastModifiedTime = InternalDataSerializer.readUnsignedVL(dis); } else if (tag == null) { lastModifiedTime = currentTime; } } OkToSkipResult skipResult = okToSkipModifyRecord(deletedIds, drId, drs, oplogKeyId, true, tag); if (skipResult.skip()) { if (!isPhase2()) { incSkipped(); this.stats.incRecoveryRecordsSkipped(); } } else if (recoverValue && drs.lruLimitExceeded() && !getParent().isOfflineCompacting()) { this.stats.incRecoveredValuesSkippedDueToLRU(); recoverValue = false; } byte[] objValue = null; int valueLength = 0; CompactionRecord p2cr = null; long crOffset; if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { if (EntryBits.isInvalid(userBits)) { objValue = DiskEntry.INVALID_BYTES; } else if (EntryBits.isTombstone(userBits)) { objValue = DiskEntry.TOMBSTONE_BYTES; } else { objValue = DiskEntry.LOCAL_INVALID_BYTES; } crOffset = dis.getCount(); if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } } else { int len = dis.readInt(); oplogOffset = dis.getCount(); crOffset = oplogOffset; valueLength = len; if (!skipResult.skip()) { if (isPhase2()) { p2cr = (CompactionRecord)getRecoveryMap().get(oplogKeyId); if (p2cr.getOffset() != crOffset) { skipResult = OkToSkipResult.SKIP_RECORD; } } } if (!skipResult.skip() && recoverValue) { byte[] valueBytes = new byte[len]; dis.readFully(valueBytes); objValue = valueBytes; validateValue(valueBytes, userBits, version, in); } else { forceSkipBytes(dis, len); } } int keyLen = dis.readInt(); incTotalCount(); if (skipResult.skip()) { if(skipResult.skipKey()) { forceSkipBytes(dis, keyLen); } else { byte[] keyBytes = new byte[keyLen]; dis.readFully(keyBytes); skippedKeyBytes.put(oplogKeyId, keyBytes); } readEndOfRecord(dis); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY skipping readModEntryWK init oplogKeyId=<" + oplogKeyId + ">" + "drId=" + drId); } } else { // read the key byte[] keyBytes = null; if (isPhase2()) { forceSkipBytes(dis, keyLen); } else { keyBytes = new byte[keyLen]; dis.readFully(keyBytes); } readEndOfRecord(dis); if(drs != null && tag != null) { //Update the RVV with the new entry //This must be done after reading the end of record to make sure //we don't have a corrupt record. See bug #45538 drs.recordRecoveredVersionTag(tag); } assert oplogKeyId >= 0; if (getParent().isOfflineCompacting()) { if (isPhase1()) { CompactionRecord cr = new CompactionRecord(keyBytes, crOffset); getRecoveryMap().put(oplogKeyId, cr); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); } else { // phase2 assert p2cr != null; // may need to change the key/value bytes for GemFireXD keyBytes = p2cr.getKeyBytes(); if (version != null && !Version.CURRENT.equals(version)) { final StaticSystemCallbacks sysCb = GemFireCacheImpl .getInternalProductCallbacks(); if (sysCb != null) { keyBytes = sysCb.fromVersionToBytes(keyBytes, keyBytes.length, true, version, in, hdos); objValue = sysCb.fromVersionToBytes(objValue, objValue.length, EntryBits.isSerialized(userBits), version, in, hdos); } } getOplogSet().getChild().copyForwardForOfflineCompact(oplogKeyId, keyBytes, objValue, userBits, drId, tag, lastModifiedTime, currentTime); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readModifyEntryWithKey copyForward oplogKeyId=<" + oplogKeyId + ">"); } // add it to the deletedIds set so we will ignore it in earlier oplogs deletedIds.add(oplogKeyId); } } else { Object key = deserializeKey(keyBytes, version, in); if (keyRequiresRegionContext) { ((KeyWithRegionContext)key).setRegionContext(currentRegion); } /* Object oldValue = getRecoveryMap().put(oplogKeyId, key); if (oldValue != null) { throw new AssertionError(LocalizedStrings.Oplog_DUPLICATE_CREATE.toLocalizedString(oplogKeyId)); } */ // Check the actual region to see if it has this key from // a previous recovered oplog. DiskEntry de = drs.getDiskEntry(key); if (de == null) { DiskRegionView drv = drs.getDiskRegionView(); // and create an entry DiskEntry.RecoveredEntry re = createRecoveredEntry(objValue, valueLength, userBits, getOplogId(), oplogOffset, oplogKeyId, recoverValue, version, in); if (tag != null) { re.setVersionTag(tag); } if (lastModifiedTime != 0) { re.setLastModifiedTime(lastModifiedTime); } if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "" + " oplogKeyId=<" + oplogKeyId + ">" + "drId=" + drId + "key=<"+ key + ">" + " oplogOffset=" + oplogOffset + " userBits=" + userBits + " valueLen=" + valueLength + " tag=" + tag ); // + " kvMapSize=" + .getRecoveryMap().size() // + " kvMapKeys=" + laToString(getRecoveryMap().keys())); } de = drs.initializeRecoveredEntry(key, re); initRecoveredEntry(drv, de); drs.getDiskRegionView().incRecoveredEntryCount(); this.stats.incRecoveredEntryCreates(); } else { DiskId curdid = de.getDiskId(); assert curdid.getOplogId() != getOplogId() : "Mutiple ModEntryWK in the same oplog for getOplogId()=" + getOplogId() + " , curdid.getOplogId()=" + curdid.getOplogId() + " , for drId=" + drId + " , key=" + key; if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY ignore readModEntryWK because getOplogId()=" + getOplogId() + " != curdid.getOplogId()=" + curdid.getOplogId() + " for drId=" + drId + " key=" + key); } // if (DiskStoreImpl.TRACE_RECOVERY) { // logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readModEntryWK update oplogKeyId=<" + oplogKeyId + ">" // + "key=<"+ key + ">" // + " userBits=" + userBits // + " valueLen=" + valueLength // ); // } // de = drs.updateRecoveredEntry(key, re); // updateRecoveredEntry(drv, de, re); // this.stats.incRecoveredEntryUpdates(); } Object oldEntry = getRecoveryMap().put(oplogKeyId, de); if (oldEntry != null) { throw new AssertionError( LocalizedStrings.Oplog_DUPLICATE_CREATE .toLocalizedString(oplogKeyId)); } } } } /** * Reads an oplog entry of type Delete * * @param dis * DataInputStream from which the oplog is being read * @param opcode * byte whether the id is short/int/long * @param parent instance of disk region * @throws IOException */ private void readDelEntry(CountingDataInputStream dis, byte opcode, OplogEntryIdSet deletedIds, DiskStoreImpl parent) throws IOException { int idByteCount = (opcode - OPLOG_DEL_ENTRY_1ID) + 1; // long debugRecoverDelEntryId = this.recoverDelEntryId; long oplogKeyId = getDelEntryId(dis, idByteCount); // long debugOplogKeyId = dis.readLong(); readEndOfRecord(dis); // assert debugRecoverDelEntryId == debugOplogKeyId // : "expected=" + debugOplogKeyId + " actual=" + debugRecoverDelEntryId // + " idByteCount=" + idByteCount // + " delta=" + this.lastDelta; deletedIds.add(oplogKeyId); setHasDeletes(true); this.stats.incRecoveredEntryDestroys(); if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY readDelEntry oplogKeyId=<" + oplogKeyId + ">"); } } /** * Keeps track of the drId of Regions that have records in this oplog that * have not yet been recovered. * If this count is > 0 then this oplog can not be compacted. */ private final AtomicInteger unrecoveredRegionCount = new AtomicInteger(); private void addUnrecoveredRegion(Long drId) { DiskRegionInfo dri = getOrCreateDRI(drId); if (dri.testAndSetUnrecovered()) { this.unrecoveredRegionCount.incrementAndGet(); } } /** * For each dri that this oplog has that is currently unrecoverable check to see * if a DiskRegion that is recoverable now exists. */ void checkForRecoverableRegion(DiskRegionView dr) { if (this.unrecoveredRegionCount.get() > 0) { DiskRegionInfo dri = getDRI(dr); if (dri != null) { if (dri.testAndSetRecovered(dr)) { this.unrecoveredRegionCount.decrementAndGet(); } } } } void updateDiskRegion(DiskRegionView dr) { DiskRegionInfo dri = getDRI(dr); if (dri != null) { dri.setDiskRegion(dr); } } /** * Returns true if it is ok the skip the current modify record * which had the given oplogEntryId. * It is ok to skip if any of the following are true: * 1. deletedIds contains the id * 2. the last modification of the entry was done by a record read * from an oplog other than this oplog * @param tag */ private OkToSkipResult okToSkipModifyRecord(OplogEntryIdSet deletedIds, long drId, DiskRecoveryStore drs, long oplogEntryId, boolean checkRecoveryMap, VersionTag tag) { if (deletedIds.contains(oplogEntryId)) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because oplogEntryId=" + oplogEntryId + " was deleted for drId=" + drId); } return OkToSkipResult.SKIP_RECORD; } // if (dr == null || !dr.isReadyForRecovery()) { // // Region has not yet been created (it is not in the diskStore drMap). // // or it is not ready for recovery (i.e. it is a ProxyBucketRegion). // if (getParent().getDiskInitFile().regionExists(drId) // || (dr != null && !dr.isReadyForRecovery())) { // // Prevent compactor from removing this oplog. // // It needs to be in this state until all the regions stored it in // // are recovered. // if (DiskStoreImpl.TRACE_RECOVERY) { // logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY adding unrecoveredRegion drId=" + drId); // } // addUnrecoveredRegion(drId); // } else { // // someone must have deleted the region from the initFile (with our public tool?) // // so skip this record and don't count it as live so that the compactor can gc it. // } // if (DiskStoreImpl.TRACE_RECOVERY) { // logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because dr=null drId=" + drId); // } // return true; // } else if (drs == null) { // we are not currently recovering this region if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because drs is null for drId=" + drId); } // Now when the diskStore is created we recover all the regions immediately. // After that they can close and reopen a region but the close code calls // addUnrecoveredRegion. So I think at this point we don't need to do anything. // // Prevent compactor from removing this oplog. // // It needs to be in this state until all the regions stored it in // // are recovered. // if (DiskStoreImpl.TRACE_RECOVERY) { // logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY adding unrecoveredRegion drId=" + drId); // } // addUnrecoveredRegion(drId); return OkToSkipResult.SKIP_RECORD; } if (!checkRecoveryMap && !getParent().isOfflineCompacting()) { Object entry = getRecoveryMap().get(oplogEntryId); if (entry != null) { //DiskEntry de = drs.getDiskEntry(key); DiskEntry de = (DiskEntry)entry; if (de != null) { DiskId curdid = de.getDiskId(); if (curdid != null) { if (curdid.getOplogId() != getOplogId()) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because getOplogId()=" + getOplogId() + " != curdid.getOplogId()=" + curdid.getOplogId() + " for drId=" + drId + " key=" + de.getKeyCopy()); } return OkToSkipResult.SKIP_RECORD; } } } } } return okToSkipRegion(drs.getDiskRegionView(), oplogEntryId, tag); } /** * Returns true if the drId region has been destroyed or * if oplogKeyId preceeds the last clear done on the drId region * @param tag */ private OkToSkipResult okToSkipRegion(DiskRegionView drv, long oplogKeyId, VersionTag tag) { long lastClearKeyId = drv.getClearOplogEntryId(); if (lastClearKeyId != DiskStoreImpl.INVALID_ID) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY lastClearKeyId=" + lastClearKeyId + " oplogKeyId=" + oplogKeyId); } if (lastClearKeyId >= 0) { if (oplogKeyId <= lastClearKeyId) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because oplogKeyId=" + oplogKeyId + " <= lastClearKeyId=" + lastClearKeyId + " for drId=" + drv.getId()); } // @todo add some wraparound logic return OkToSkipResult.SKIP_RECORD; } } else { // lastClearKeyId is < 0 which means it wrapped around // treat it like an unsigned value (-1 == MAX_UNSIGNED) if (oplogKeyId > 0 || oplogKeyId <= lastClearKeyId) { // If oplogKeyId > 0 then it happened before the clear // (assume clear happened after we wrapped around to negative). // If oplogKeyId < 0 then it happened before the clear // if it is < lastClearKeyId if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because oplogKeyId=" + oplogKeyId + " <= lastClearKeyId=" + lastClearKeyId + " for drId=" + drv.getId()); } return OkToSkipResult.SKIP_RECORD; } } } RegionVersionVector clearRVV = drv.getClearRVV(); if(clearRVV != null) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY clearRVV=" + clearRVV + " tag=" + tag); } if (clearRVV.contains(tag.getMemberID(), tag.getRegionVersion())) { if (DiskStoreImpl.TRACE_RECOVERY) { logger.info(LocalizedStrings.DEBUG, "TRACE_RECOVERY okToSkip because tag=" + tag + " <= clearRVV=" + clearRVV + " for drId=" + drv.getId()); } //For an RVV clear, we can only skip the value during recovery //because later modifies may use the oplog key id. return OkToSkipResult.SKIP_VALUE; } } return OkToSkipResult.DONT_SKIP; } private long getModEntryId(CountingDataInputStream dis, int idByteCount) throws IOException { return calcModEntryId(getEntryIdDelta(dis, idByteCount)); } private long getDelEntryId(CountingDataInputStream dis, int idByteCount) throws IOException { return calcDelEntryId(getEntryIdDelta(dis, idByteCount)); } private /*HACK DEBUG*/ static long getEntryIdDelta(CountingDataInputStream dis, int idByteCount) throws IOException { assert idByteCount >= 1 && idByteCount <= 8 : idByteCount; long delta; byte firstByte = dis.readByte(); // if (firstByte < 0) { // delta = 0xFFFFFFFFFFFFFF00L | firstByte; // } else { // delta = firstByte; // } delta = firstByte; idByteCount--; while (idByteCount > 0) { delta <<= 8; delta |= (0x00FF & dis.readByte()); idByteCount--; } // this.lastDelta = delta; // HACK DEBUG return delta; } // private long lastDelta; // HACK DEBUG /** * Call this when the cache is closed or region is destroyed. * Deletes the lock files. */ public void close() { if (this.closed) { return; } if( logger.fineEnabled()){ logger.fine("Oplog::close: Store name ="+ parent.getName() + " Oplog ID = "+oplogId); } basicClose(false); } /** * Close the files of a oplog but don't set any state. Used by unit tests */ public void testClose() { try { this.crf.channel.close(); } catch (IOException ignore) { } try { this.crf.raf.close(); } catch (IOException ignore) { } this.crf.RAFClosed = true; try { this.drf.channel.close(); } catch (IOException ignore) { } try { this.drf.raf.close(); } catch (IOException ignore) { } this.drf.RAFClosed = true; } private void basicClose(boolean forceDelete) { flushAll(); synchronized (this.lock/*crf*/) { unpreblow(this.crf, getMaxCrfSize()); // logger.info(LocalizedStrings.DEBUG, "DEBUG closing oplog#" + getOplogId() // + " liveCount=" + this.totalLiveCount.get(), // new RuntimeException("STACK")); if (!this.crf.RAFClosed) { try { this.crf.channel.close(); } catch (IOException ignore) { } try { this.crf.raf.close(); } catch (IOException ignore) { } this.crf.RAFClosed = true; this.stats.decOpenOplogs(); } this.closed = true; } synchronized (this.lock/*drf*/) { unpreblow(this.drf, getMaxDrfSize()); if (!this.drf.RAFClosed) { try { this.drf.channel.close(); } catch (IOException ignore) { } try { this.drf.raf.close(); } catch (IOException ignore) { } this.drf.RAFClosed = true; } } if (forceDelete) { this.deleteFiles(false); } } /** * Used by tests to confirm that an oplog was compacted */ boolean testConfirmCompacted() { return this.closed && this.deleted.get() && getOplogSize() == 0; } // @todo add state to determine when both crf and drf and been deleted. /** * Note that this can return true even when we still need to keep the oplog around * because its drf is still needed. */ boolean isDeleted() { return this.deleted.get(); } /** * Destroys this oplog. First it will call close which will cleanly close all * Async threads and then the oplog file will be deleted. The * deletion of lock files will be taken care of by the close. * */ public void destroy() { lockCompactor(); try { if (!this.closed) { this.basicClose(true /* force delete */); } else { // do the following even if we were already closed deleteFiles(false); } } finally { unlockCompactor(); } } /* In offline compaction, after compacted each oplog, only the crf * will be deleted. Oplog with drf only will be housekepted later. */ public void destroyCrfOnly() { lockCompactor(); try { if (!this.closed) { this.basicClose(true /* force delete */); } else { // do the following even if we were already closed deleteFiles(true); } } finally { unlockCompactor(); } } /** * A check to confirm that the oplog has been closed because of the cache * being closed * */ private void checkClosed() { getParent().getCancelCriterion().checkCancelInProgress(null); if (!this.closed) { return; } throw new OplogCancelledException("This Oplog has been closed."); } /** * Return the number of bytes needed to encode the given long. * Value returned will be >= 1 and <= 8. */ static int bytesNeeded(long v) { if (v < 0) { v = ~v; } return ((64 - Long.numberOfLeadingZeros(v)) / 8)+1; } /** * Return absolute value of v. */ static long abs(long v) { if (v < 0) { return -v; } else { return v; } } private long calcDelta(long opLogKeyID, byte opCode) { long delta; if (opCode == OPLOG_DEL_ENTRY_1ID) { delta = opLogKeyID - this.writeDelEntryId; this.writeDelEntryId = opLogKeyID; } else { delta = opLogKeyID - this.writeModEntryId; this.writeModEntryId = opLogKeyID; // logger.info(LocalizedStrings.DEBUG, "DEBUG calcDelta delta=" + delta // + " writeModEntryId=" + oplogKeyId); } return delta; } /** * This function records all the data for the current op * into this.opState. * * @param opCode * The int value identifying whether it is create/modify or delete * operation * @param entry * The DiskEntry object being operated upon * @param value * The byte array representing the value * @param userBits * @throws IOException */ private void initOpState(byte opCode, DiskRegionView dr, DiskEntry entry, byte[] value, byte userBits, boolean notToUseUserBits) throws IOException { int len = value != null ? value.length : 0; initOpState(opCode, dr, entry, value, len, userBits, notToUseUserBits); } private void initOpState(byte opCode, DiskRegionView dr, DiskEntry entry, byte[] value, int valueLength, byte userBits, boolean notToUseUserBits) throws IOException { this.opState.initialize(opCode, dr, entry, value, valueLength, userBits, notToUseUserBits); } private void clearOpState() { this.opState.clear(); } /** * Returns the number of bytes it will take to serialize this.opState. */ private int getOpStateSize() { return this.opState.getSize(); } private int getOpStateValueOffset() { return this.opState.getValueOffset(); } private byte calcUserBits(byte[] value, boolean isSerializedObject) { byte userBits = 0x0; if (isSerializedObject) { if (value == DiskEntry.INVALID_BYTES) { // its the invalid token userBits = EntryBits.setInvalid(userBits, true); } else if (value == DiskEntry.LOCAL_INVALID_BYTES) { // its the local-invalid token userBits = EntryBits.setLocalInvalid(userBits, true); } else if (value == DiskEntry.TOMBSTONE_BYTES) { // its the tombstone token userBits = EntryBits.setTombstone(userBits, true); } else { if (value == null) { throw new IllegalStateException("userBits==1 and value is null"); } else if (value.length == 0) { throw new IllegalStateException("userBits==1 and value is zero length"); } userBits = EntryBits.setSerialized(userBits, true); } } return userBits; } /** * Returns true if the given entry has not yet been written to this oplog. */ private boolean modNeedsKey(DiskEntry entry) { DiskId did = entry.getDiskId(); synchronized (did) { if (did.getOplogId() != getOplogId()) { // the last record for it was written in a different oplog // so we need the key. return true; } else { return false; } } } /** * Asif: Modified the code so as to reuse the already created ByteBuffer * during transition. Creates a key/value pair from a region entry * on disk. Updates all of the necessary * {@linkplain DiskStoreStats statistics} and invokes basicCreate * * @param entry * The DiskEntry object for this key/value pair. * @param value * byte array representing the value * @param isSerializedObject * boolean indicating whether the byte array is a serialized value or * not Do the bytes in value contain a serialized * object (or an actually byte array)? * @throws DiskAccessException * @throws IllegalStateException * */ public final void create(LocalRegion region, DiskEntry entry, byte[] value, boolean isSerializedObject, boolean async) { if (this != getOplogSet().getChild()) { getOplogSet().getChild().create(region, entry, value, isSerializedObject, async); } else { DiskId did = entry.getDiskId(); boolean exceptionOccured = false; byte prevUsrBit = did.getUserBits(); int len = did.getValueLength(); try { // It is ok to do this outside of "lock" because // create records do not need to change. byte userBits = calcUserBits(value, isSerializedObject); // save versions for creates and updates even if value is bytearrary in 7.0 if (entry.getVersionStamp()!=null) { if(entry.getVersionStamp().getMemberID() == null) { throw new AssertionError("Version stamp should have a member at this point for entry " + entry); } // pdx and tx will not use version userBits = EntryBits.setWithVersions(userBits, true); } basicCreate(region.getDiskRegion(), entry, value, userBits, async); } catch (IOException ex) { exceptionOccured = true; region.getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, region.getFullPath()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); exceptionOccured = true; region.getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, region.getFullPath()); } finally { if (exceptionOccured) { did.setValueLength(len); did.setUserBits(prevUsrBit); } } } } /** * Return true if no records have been written to the oplog yet. */ private boolean isFirstRecord() { return this.firstRecord; } /** * Asif: A helper function which identifies whether to create the entry in the * current oplog or to make the switch to the next oplog. This function * enables us to reuse the byte buffer which got created for an oplog which no * longer permits us to use itself * * @param entry * DiskEntry object representing the current Entry * @throws IOException * @throws InterruptedException */ private void basicCreate(DiskRegion dr, DiskEntry entry, byte[] value, byte userBits, boolean async) throws IOException, InterruptedException { DiskId id = entry.getDiskId(); boolean useNextOplog = false; long startPosForSynchOp = -1; if (DiskStoreImpl.KRF_DEBUG) { // wait for cache close to create krf System.out.println("basicCreate KRF_DEBUG"); Thread.sleep(1000); } synchronized (this.lock) { // TODO soplog perf analysis shows this as a contention point //synchronized (this.crf) { initOpState(OPLOG_NEW_ENTRY_0ID, dr, entry, value, userBits, false); // Asif : Check if the current data in ByteBuffer will cause a // potential increase in the size greater than the max allowed long temp = (getOpStateSize() + this.crf.currSize); if (!this.wroteNewEntryBase) { temp += OPLOG_NEW_ENTRY_BASE_REC_SIZE; } if (this != getOplogSet().getChild()) { useNextOplog = true; } else if (temp > getMaxCrfSize() && !isFirstRecord()) { switchOpLog(dr, getOpStateSize(), entry, false); useNextOplog = true; } else { if (this.lockedForKRFcreate) { throw new CacheClosedException("The disk store is closed."); } this.firstRecord = false; writeNewEntryBaseRecord(async); // Now we can finally call newOplogEntryId. // We need to make sure the create records // are written in the same order as they are created. // This allows us to not encode the oplogEntryId explicitly in the record long createOplogEntryId = getOplogSet().newOplogEntryId(); id.setKeyId(createOplogEntryId); // startPosForSynchOp = this.crf.currSize; // Asif: Allow it to be added to the OpLOg so increase the // size of currenstartPosForSynchOpt oplog int dataLength = getOpStateSize(); // Asif: It is necessary that we set the // Oplog ID here without releasing the lock on object as we are // writing to the file after releasing the lock. This can cause // a situation where the // switching thread has added Oplog for compaction while the previous // thread has still not started writing. Thus compactor can // miss an entry as the oplog Id was not set till then. // This is because a compactor thread will iterate over the entries & // use only those which have OplogID equal to that of Oplog being // compacted without taking any lock. A lock is taken only if the // entry is a potential candidate. // Further the compactor may delete the file as a compactor thread does // not require to take any shared/exclusive lock at DiskStoreImpl // or Oplog level. // It is also assumed that compactor thread will take a lock on both // entry as well as DiskID while compacting. In case of synch // mode we can // safely set OplogID without taking lock on DiskId. But // for asynch mode // we have to take additional precaution as the asynch // writer of previous // oplog can interfere with the current oplog. id.setOplogId(getOplogId()); // do the io while holding lock so that switch can set doneAppending // Write the data to the opLog for the synch mode startPosForSynchOp = writeOpLogBytes(this.crf, async, true); // if (this.crf.currSize != startPosForSynchOp) { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // logger.info(LocalizedStrings.DEBUG, "currSize=" + this.crf.currSize // + " startPosForSynchOp=" + startPosForSynchOp // + " oplog#" + getOplogId()); // assert false; // } this.crf.currSize = temp; // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "create setting size to=" + temp // + " oplog#" + getOplogId()); // } // if (temp != this.crf.raf.getFilePointer()) // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "create setting size to=" + temp // + " fp=" + this.crf.raf.getFilePointer() // + " oplog#" + getOplogId()); // } if (EntryBits.isNeedsValue(userBits)) { id.setValueLength(value.length); } else { id.setValueLength(0); } id.setUserBits(userBits); if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicCreate:About to Release ByteBuffer with data for Disk ID = " + id.toString()); } if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicCreate:Release dByteBuffer with data for Disk ID = " + id.toString()); } // Asif: As such for any put or get operation , a synch is taken // on the Entry object in the DiskEntry's Helper functions. // Compactor thread will also take a lock on entry object. Therefore // we do not require a lock on DiskID, as concurrent access for // value will not occur. startPosForSynchOp += getOpStateValueOffset(); if (DiskStoreImpl.TRACE_WRITES) { VersionTag tag = null; if (entry.getVersionStamp()!=null) { tag = entry.getVersionStamp().asVersionTag(); } this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES basicCreate: id=<" + abs(id.getKeyId()) + "> key=<" + entry.getKeyCopy() + ">" + " valueOffset=" + startPosForSynchOp + " userBits=" + userBits + " valueLen=" + (value != null ? value.length : 0) + " valueBytes=<" + baToString(value) + ">" + " drId=" + dr.getId() + " versionTag=" + tag + " oplog#" + getOplogId()); } id.setOffsetInOplog(startPosForSynchOp); addLive(dr, entry); // Size of the current oplog being increased // due to 'create' operation. Set the change in stats. // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "create inc=" + dataLength); // } this.dirHolder.incrementTotalOplogSize(dataLength); incTotalCount(); //Update the region version vector for the disk store. //This needs to be done under lock so that we don't switch oplogs //unit the version vector accurately represents what is in this oplog RegionVersionVector rvv = dr.getRegionVersionVector(); final VersionStamp version; if (rvv != null && (version = entry.getVersionStamp()) != null) { rvv.recordVersion(version.getMemberID(), version.getRegionVersion()); } // getKeyCopy() is a potentially expensive operation in GemFireXD so // qualify with EntryLogger.isEnabled() first if (EntryLogger.isEnabled()) { EntryLogger.logPersistPut(dr.getName(), entry.getKeyCopy(), dr.getDiskStoreID()); } } clearOpState(); // } } if (useNextOplog) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } Assert.assertTrue(this != getOplogSet().getChild()); getOplogSet().getChild().basicCreate(dr, entry, value, userBits, async); } else { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance() .afterSettingOplogOffSet(startPosForSynchOp); } } } /** * This oplog will be forced to switch to a new oplog * * * public void forceRolling() { if (getOplogSet().getChild() == this) { * synchronized (this.lock) { if (getOplogSet().getChild() == this) { * switchOpLog(0, null); } } if (!this.sync) { * this.writer.activateThreadToTerminate(); } } } */ /** * This oplog will be forced to switch to a new oplog */ void forceRolling(DiskRegion dr, boolean blocking) { if (getOplogSet().getChild() == this) { synchronized (this.lock) { if (getOplogSet().getChild() == this) { switchOpLog(dr, 0, null, blocking); } } if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } } } /** * Return true if it is possible that compaction of this oplog will be done. */ private boolean isCompactionPossible() { return getOplogSet().isCompactionPossible(); } /** * Asif: This function is used to switch from one op Log to another , when the * size of the current oplog has reached the maximum permissible. It is always * called from synch block with lock object being the OpLog File object We * will reuse the ByteBuffer Pool. We should add the current Oplog for compaction * first & then try to get next directory holder as in case there is only a * single directory with space being full, compaction has to happen before it can * be given a new directory. If the operation causing the switching is on an * Entry which already is referencing the oplog to be compacted, then the compactor * thread will skip compaction that entry & the switching thread will roll the * entry explicitly. * * @param lengthOfOperationCausingSwitch * length of the operation causing the switch * @param entryCausingSwitch * DiskEntry object operation on which caused the switching of Oplog. * This can be null if the switching has been invoked by the * forceRolling which does not need an operation on entry to cause * the switch */ private void switchOpLog(DiskRegionView dr, int lengthOfOperationCausingSwitch, DiskEntry entryCausingSwitch, boolean blocking) { String drName; if (dr != null) { drName = dr.getName(); } else { drName = getParent().getName(); } flushAll(); // needed in case of async lengthOfOperationCausingSwitch += 20; // for worstcase overhead of writing first record // logger.info(LocalizedStrings.DEBUG, "DEBUG: recSize=" // + lengthOfOperationCausingSwitch // + " crf.currSize=" + this.crf.currSize // + " drf.currSize=" + this.drf.currSize // + " crf.maxSize=" + getMaxCrfSize() // + " drf.maxSize=" + getMaxDrfSize() // , new RuntimeException("STACK")); // if length of operation is greater than max Dir Size than an exception // is // thrown if (this.logger.fineEnabled()) { this.logger .fine("Oplog::switchOpLog: Entry causing Oplog switch has diskID=" + (entryCausingSwitch != null ? entryCausingSwitch.getDiskId() .toString() : "Entry is null")); } if (lengthOfOperationCausingSwitch > getParent().getMaxDirSize()) { throw new DiskAccessException(LocalizedStrings.Oplog_OPERATION_SIZE_CANNOT_EXCEED_THE_MAXIMUM_DIRECTORY_SIZE_SWITCHING_PROBLEM_FOR_ENTRY_HAVING_DISKID_0.toLocalizedString((entryCausingSwitch != null ? entryCausingSwitch.getDiskId().toString() : "\"null Entry\"")), drName); } if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().beforeSwitchingOplog(); } if (this.logger.fineEnabled()) { this.logger.fine("Oplog::switchOpLog: About to add the Oplog = " + this.oplogId + " for compaction. Entry causing the switch is having DiskID = " + (entryCausingSwitch != null ? entryCausingSwitch.getDiskId() .toString() : "null Entry")); } if (needsCompaction()) { addToBeCompacted(); } else { getOplogSet().addInactive(this); } try { DirectoryHolder nextDirHolder = getOplogSet().getNextDir( lengthOfOperationCausingSwitch); Oplog newOplog = new Oplog(this.oplogId + 1, nextDirHolder, this); newOplog.firstRecord = true; getOplogSet().setChild(newOplog); finishedAppending(); //Defer the unpreblow and close of the RAF. We saw pauses in testing from //unpreblow of the drf, maybe because it is freeing pages that were //preallocated. Close can pause if another thread is calling force on the //file channel - see 50254. These operations should be safe to defer, //a concurrent read will synchronize on the oplog and use or reopen the RAF //as needed. getParent().executeDelayedExpensiveWrite(new Runnable() { public void run() { // need to truncate crf and drf if their actual size is less than their pregrow size unpreblow(Oplog.this.crf, getMaxCrfSize()); unpreblow(Oplog.this.drf, getMaxDrfSize()); // Close the crf using closeRAF. We will reopen the crf if we // need it to fault in or to read values during compaction. closeRAF(); // I think at this point the drf no longer needs to be open synchronized (Oplog.this.lock/*drf*/) { if (!Oplog.this.drf.RAFClosed) { try { Oplog.this.drf.channel.close(); } catch (IOException ignore) { } try { Oplog.this.drf.raf.close(); } catch (IOException ignore) { } Oplog.this.drf.RAFClosed = true; } } } }); // offline compaction will not execute create Krf in the task, becasue // this.totalLiveCount.get() == 0 if (getParent().isOfflineCompacting()) { krfClose(true, false); } else { if (blocking) { createKrf(false); } else { createKrfAsync(); } } } catch (DiskAccessException dae) { // Asif: Remove the Oplog which was added in the DiskStoreImpl // for compaction as compaction cannot be done. // However, it is also possible that compactor // may have done the compaction of the Oplog but the switching thread // encountered timeout exception. // So the code below may or may not actually // ensure that the Oplog has been compacted or not. getOplogSet().removeOplog(this.oplogId) ; throw dae; } } /** * Schedule a task to create a krf asynchronously. */ protected void createKrfAsync() { if (logger.infoEnabled()) { this.logger.info( LocalizedStrings.DEBUG, "createKrfAsync called for oplog: " + this + ", parent: " + parent.getName()); } boolean submitted = getParent().executeDiskStoreTask(new Runnable() { public void run() { // for GemFireXD first wait for first phase DDL replay to finish so that // indexes, regions etc. are in a stable state boolean signalEnd = getParent().waitBeforeAsyncDiskTask(); try { // return if diskstore is closing if (getParent().isClosing()) { return; } createKrf(false); } finally { if (signalEnd) { getParent().endAsyncDiskTask(); } } } }); if (!submitted) { if (logger.infoEnabled()) { this.logger.info(LocalizedStrings.DEBUG, "createKrfAsync createKrf job for oplog: " + this + ", parent: " + parent.getName() + " could not be submitted successfully"); } this.krfCreationCancelled.set(true); } } /** * Used when creating a KRF to keep track of what DiskRegionView a DiskEntry * belongs to. */ public static final class KRFEntry { private final DiskEntry de; private final DiskRegionView drv; /** Fix for 42733 - a stable snapshot * of the offset so we can sort * It doesn't matter that this is stale, * we'll filter out these entries later. */ private final long offsetInOplog; private final VersionHolder versionTag; public KRFEntry(DiskRegionView drv, DiskEntry de, VersionHolder tag) { this.de = de; this.drv = drv; DiskId diskId = de.getDiskId(); this.offsetInOplog = diskId != null ? diskId.getOffsetInOplog() : 0; this.versionTag = tag; } public DiskEntry getDiskEntry() { return this.de; } public DiskRegionView getDiskRegionView() { return this.drv; } public long getOffsetInOplogForSorting() { return offsetInOplog; } } private boolean writeOneKeyEntryForKRF(KRFEntry ke, long currentTime) throws IOException { DiskEntry de = ke.getDiskEntry(); long diskRegionId = ke.getDiskRegionView().getId(); long oplogKeyId; byte userBits; long valueOffset; int valueLength; Object deKey; VersionHolder tag = ke.versionTag; synchronized (de) { DiskId di = de.getDiskId(); if (di == null) { return false; } if(de.isRemovedFromDisk()) { //the entry was concurrently removed return false; } synchronized (di) { // Make sure each one is still in this oplog. if (di.getOplogId() != getOplogId()) { return false; } userBits = di.getUserBits(); oplogKeyId = Math.abs(di.getKeyId()); valueOffset = di.getOffsetInOplog(); valueLength = di.getValueLength(); deKey = de.getKeyCopy(); if (valueOffset < 0) { assert (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)); } } if(tag ==null) { if (EntryBits.isWithVersions(userBits) && de.getVersionStamp()!=null) { tag = de.getVersionStamp().asVersionTag(); } else if(de.getVersionStamp() != null) { throw new AssertionError("No version bits on entry we're writing to the krf " + de); } } } if(DiskStoreImpl.TRACE_WRITES) { this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES krf oplogId=" + oplogId + " key=" + deKey + " oplogKeyId=" + oplogKeyId + " de=" + System.identityHashCode(de) + " vo=" + valueOffset + " vl=" + valueLength+ " diskRegionId="+diskRegionId+" version tag="+tag); } byte[] keyBytes = EntryEventImpl.serialize(deKey); //skip the invalid entries, theire valueOffset is -1 return writeOneKeyEntryForKRF(keyBytes, userBits, valueLength, diskRegionId, oplogKeyId, valueOffset, tag, de.getLastModified(), currentTime); } private boolean writeOneKeyEntryForKRF(byte[] keyBytes, byte userBits, int valueLength, long diskRegionId, long oplogKeyId, long valueOffset, VersionHolder tag, long lastModifiedTime, long currentTime) throws IOException { if (getParent().isValidating()) { return false; } if (!getParent().isOfflineCompacting()) { assert (this.krf.dos!=null); } else { if (this.krf.dos == null) { // krf already exist, thus not re-opened for write return false; } } DataSerializer.writeByteArray(keyBytes, this.krf.dos); boolean withVersions = EntryBits.isWithVersions(userBits); if (withVersions && tag == null) { userBits = EntryBits.setWithVersions(userBits, false); withVersions = false; } if (!withVersions) { userBits = EntryBits.setHasLastModifiedTime(userBits); } this.krf.dos.writeByte(EntryBits.getPersistentBits(userBits)); InternalDataSerializer.writeArrayLength(valueLength, this.krf.dos); DiskInitFile.writeDiskRegionID(this.krf.dos, diskRegionId); if (withVersions) { serializeVersionTag(tag, this.krf.dos); } else { if (lastModifiedTime == 0) { lastModifiedTime = currentTime; } InternalDataSerializer.writeUnsignedVL(lastModifiedTime, this.krf.dos); } InternalDataSerializer.writeVLOld(oplogKeyId, this.krf.dos); //skip the invalid entries, theire valueOffset is -1 if(!EntryBits.isAnyInvalid(userBits) && !EntryBits.isTombstone(userBits)) { InternalDataSerializer.writeVLOld((valueOffset - this.krf.lastOffset), this.krf.dos); // save the lastOffset in krf object this.krf.lastOffset = valueOffset; } this.krf.keyNum ++; return true; } private final AtomicBoolean krfCreated = new AtomicBoolean(); private final AtomicBoolean krfCreationCancelled = new AtomicBoolean(); private String getKrfFilePath() { return this.diskFile.getPath() + KRF_FILE_EXT; } public void krfFileCreate() throws IOException { // this method is only used by offline compaction. validating will not create krf assert (getParent().isValidating() == false); this.krf.f = new File(getKrfFilePath()); if (this.krf.f.exists()) { throw new IllegalStateException("krf file " + this.krf.f + " already exists."); } this.krf.fos = new FileOutputStream(this.krf.f); this.krf.bos = new BufferedOutputStream(this.krf.fos, DEFAULT_BUFFER_SIZE); this.krf.dos = new DataOutputStream(this.krf.bos); //write the disk store id to the krf this.krf.dos.writeLong(getParent().getDiskStoreID().getLeastSignificantBits()); this.krf.dos.writeLong(getParent().getDiskStoreID().getMostSignificantBits()); this.krf.dos.writeByte(END_OF_RECORD_ID); // write product versions assert this.gfversion != null; // write both gemfire and data versions if the two are different else write // only gemfire version; a token distinguishes the two cases while reading // like in writeGemFireVersionRecord Version dataVersion = getDataVersionIfOld(); if (dataVersion == null) { dataVersion = Version.CURRENT; } if (this.gfversion == dataVersion) { this.gfversion.writeOrdinal(this.krf.dos, false); } else { Version.TOKEN.writeOrdinal(this.krf.dos, false); this.krf.dos.writeByte(END_OF_RECORD_ID); this.krf.dos.writeByte(OPLOG_GEMFIRE_VERSION); this.gfversion.writeOrdinal(this.krf.dos, false); this.krf.dos.writeByte(END_OF_RECORD_ID); this.krf.dos.writeByte(OPLOG_GEMFIRE_VERSION); dataVersion.writeOrdinal(this.krf.dos, false); } this.krf.dos.writeByte(END_OF_RECORD_ID); //Write the total entry count to the krf so that when we recover, //our compaction statistics will be accurate InternalDataSerializer.writeUnsignedVL(this.totalCount.get(), this.krf.dos); this.krf.dos.writeByte(END_OF_RECORD_ID); //Write the RVV to the krf. Map drMap = getParent().getAllDiskRegions(); byte[] rvvBytes = serializeRVVs(drMap, false); this.krf.dos.write(rvvBytes); this.krf.dos.writeByte(END_OF_RECORD_ID); } // if IOException happened during krf creation, close and delete it private void closeAndDeleteKrf() { try { if (this.krf.dos != null) { this.krf.dos.close(); this.krf.dos = null; } } catch (IOException ignore) { } try { if (this.krf.bos != null) { this.krf.bos.close(); this.krf.bos = null; } } catch (IOException ignore) { } try { if (this.krf.fos != null) { this.krf.fos.close(); this.krf.fos = null; } } catch (IOException ignore) { } if (this.krf.f.exists()) { this.krf.f.delete(); } } public void krfClose(boolean deleteEmptyKRF, boolean persistIndexes) { boolean allClosed = false; try { if (this.krf.fos != null) { DataSerializer.writeByteArray(null, this.krf.dos); } else { return; } this.krf.dos.close(); this.krf.dos = null; this.krf.bos.close(); this.krf.bos = null; this.krf.fos.close(); this.krf.fos = null; if (this.krf.keyNum == 0 && deleteEmptyKRF) { // this is an empty krf file this.krf.f.delete(); assert this.krf.f.exists() == false; } else { //Mark that this krf is complete. if (!persistIndexes) { getParent().getDiskInitFile().krfCreate(this.oplogId); } if (logger.infoEnabled()) { logger.info(LocalizedStrings.Oplog_CREATE_0_1_2, new Object[] { toString(), "krf", getParent().getName() }); } } allClosed = true; } catch (IOException e) { throw new DiskAccessException("Fail to close krf file "+this.krf.f, e, getParent()); } finally { if (!allClosed) { // IOException happened during close, delete this krf closeAndDeleteKrf(); } } } /** * Create the KRF file for this oplog. It is ok for this method to be async. * finishKRF will be called and it must block until KRF generation is * complete. * * @param cancel * if true then prevent the krf from being created if possible */ void createKrf(boolean cancel) { //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf called for oplog: " + this + " parent: " + this.parent.getName()); if (cancel) { this.krfCreated.compareAndSet(false, true); //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 1"); return; } if (!couldHaveKrf()) { //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 2"); return; } final boolean persistIndexes = this.parent.isPersistIndexes(); // Make sure regions can not become unrecovered while creating the KRF. getParent().acquireCompactorReadLock(); try { if (!getParent().allowKrfCreation()) { return; } //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf locking compactor"); lockCompactor(); try { synchronized(this.lock) { // 42733: after set it to true, we will not reset it, since this oplog will be // inactive forever this.lockedForKRFcreate = true; } //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf locked for krfcreate"); synchronized (this.krfCreated) { if (this.krfCreated.get()) { //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 3"); return; } this.krfCreated.set(true); if (this.unrecoveredRegionCount.get() > 0) { // if we have unrecovered regions then we can't create // a KRF because we don't have the list of live entries. //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 4"); return; } int tlc = (int) this.totalLiveCount.get(); if (tlc <= 0) { // no need to create a KRF since this oplog will be deleted. // TODO should we create an empty KRF anyway? //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 5"); return; } // logger.info(LocalizedStrings.DEBUG, "DEBUG: tlc=" + tlc); Collection regions = this.regionMap.values(); List sortedLiveEntries = getSortedLiveEntries(regions); if (sortedLiveEntries == null || sortedLiveEntries.isEmpty()) { //no need to create a krf if there are no live entries. //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 6"); return; } if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { if (!CacheObserverHolder.getInstance().shouldCreateKRFIRF()) { //logger.info(LocalizedStrings.DEBUG, "DEBUG: createKrf ret 7"); return; } } boolean krfCreateSuccess = false; try { krfFileCreate(); final HashSet notWrittenKRFs = new HashSet(); // sortedLiveEntries are now sorted // so we can start writing them to disk. if (sortedLiveEntries != null) { final long currentTime = getParent().getCache().cacheTimeMillis(); for (KRFEntry ke : sortedLiveEntries) { boolean written = writeOneKeyEntryForKRF(ke, currentTime); if (!written) { if (persistIndexes) { notWrittenKRFs.add(ke); } } } } // If index persistence is ON then we would delay writing the dif record // until the IRF is also generated. krfClose(true, persistIndexes); krfCreateSuccess = true; for(DiskRegionInfo dri : regions) { dri.afterKrfCreated(); } if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterKrfCreated(); } if (persistIndexes) { writeIRF(sortedLiveEntries, notWrittenKRFs, null, null); if (logger.fineEnabled()) { logger.info(LocalizedStrings.DEBUG, "createKrf: going to " + "write krfCreate and irfCreate records for: " + this); } this.parent.flushAndSync(true); DiskInitFile initFile = getParent().getDiskInitFile(); initFile.krfCreate(this.oplogId); initFile.irfCreate(this.oplogId); } } catch (FileNotFoundException ex) { // handle exception; we couldn't open the krf file throw new IllegalStateException("could not create krf " + this.krf.f, ex); } catch (IOException ex) { // handle io exceptions; we failed to write to the file throw new IllegalStateException("failed writing krf " + this.krf.f, ex); } finally { synchronized (this.krfCreated) { this.krfCreated.notifyAll(); } // if IOException happened in writeOneKeyEntryForKRF(), delete krf here if (!krfCreateSuccess) { closeAndDeleteKrf(); } } } } finally { unlockCompactor(); } } finally { getParent().releaseCompactorReadLock(); } } /** * Write index records for given list of region entries in this Oplog. * * @param sortedLiveEntries * the list of region entries to write * @param notWrittenKRFs * any entries that were not successfully written to KRF * @param dumpIndexes * the index containers for which to write the index data, or null to * write for all indexes * @param loadIndexes * if any index containers have also to be populated with data from * this oplog */ @SuppressWarnings("unchecked") public long writeIRF(List sortedLiveEntries, final HashSet notWrittenKRFs, Set dumpIndexes, Map loadIndexes) throws IOException { final GemFireCacheImpl.StaticSystemCallbacks sysCb = GemFireCacheImpl .getInternalProductCallbacks(); final boolean traceOn = sysCb.tracePersistFinestON(); // wait for krf creation to complete if in progress if (!this.krfCreated.get()) { final long start = System.currentTimeMillis(); final long maxWait = 60000L; synchronized (this.krfCreated) { while (!this.krfCreated.get()) { if (System.currentTimeMillis() > (start + maxWait)) { throw new DiskAccessException("Failed to write index file due " + "to missing key file (" + getKrfFilePath() + ')', getParent()); } try { this.krfCreated.wait(500L); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); DiskStoreImpl dsi = getParent(); dsi.getCache().getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException("Failed to write index file due " + "to missing key file (" + getKrfFilePath() + ')', ie, dsi); } } } } // truncate existing idxkrf if records for all indexes need to be written this.idxkrf.initializeForWriting(dumpIndexes == null); long numEntries = 0; if (sortedLiveEntries != null && (numEntries = sortedLiveEntries.size()) > 0) { if (notWrittenKRFs != null) { numEntries -= notWrittenKRFs.size(); } if (dumpIndexes == null) { dumpIndexes = sysCb.getAllLocalIndexes(getParent()); this.indexesWritten.clear(); } else if (!this.indexesWritten.isEmpty()) { // remove the indexes already written for this oplog dumpIndexes = new THashSet(dumpIndexes); dumpIndexes.removeAll(this.indexesWritten); } if (logger.fineEnabled() || traceOn) { logger.fine("writeIRF called for diskstore: " + this.parent.getName() + " and oplog id: " + this.oplogId + ", indexes: " + dumpIndexes); if (logger.finerEnabled() || traceOn) { logger.finer("List of krf entries during createKrf of diskstore: " + this.parent.getName() + " and oplog id: " + this.oplogId); for (KRFEntry ke : sortedLiveEntries) { logger.finer("oplogentryid: " + ke.de.getDiskId().getKeyId() + " and region key: " + ke.de.getKey()); } logger.finer("List of krf entries during createKrf of diskstore: " + this.parent.getName() + " and oplog id: " + this.oplogId + " ends"); } if (notWrittenKRFs != null) { for (KRFEntry ke : notWrittenKRFs) { logger.fine("notWritten: oplogentryid: " + ke.de.getDiskId().getKeyId() + " and region key: " + ke.de.getKey()); } } } this.idxkrf.writeIndexRecords(sortedLiveEntries, notWrittenKRFs, dumpIndexes, loadIndexes); if (logger.fineEnabled()) { logger.fine("writeIRF ends for diskstore: " + this.parent.getName() + " and oplog id: " + this.oplogId); } this.indexesWritten.addAll(dumpIndexes); } if (logger.fineEnabled() || DiskStoreImpl.INDEX_LOAD_DEBUG) { logger.info(LocalizedStrings.DEBUG, "writeIRF going to flush and close idkkrf for: " + this); } this.idxkrf.close(); if (logger.infoEnabled()) { logger.info(LocalizedStrings.Oplog_CREATE_0_1_2, new Object[]{ toString(), this.idxkrf.getIndexFile().getAbsolutePath(), getParent().getName()}); } return numEntries; } public OplogIndex getOplogIndex() { return this.idxkrf; } public Collection getTargetRegionsForIndexes( Set indexes) { if (indexes != null) { ArrayList targetRegions = new ArrayList( this.regionMap.size()); THashSet usedRegionIDs = new THashSet(indexes.size()); for (SortedIndexContainer index : indexes) { usedRegionIDs.add(index.getBaseRegion().getRegionID()); } for (DiskRegionInfo regionInfo : this.regionMap.values()) { DiskRegionView drv = regionInfo.getDiskRegion(); String baseRegionID = getParentRegionID(drv); if (usedRegionIDs.contains(baseRegionID)) { targetRegions.add(regionInfo); } } return targetRegions; } else { return getRegionRecoveryMap(); } } private String printList(List list) { StringBuilder sb = new StringBuilder(); for (KRFEntry ke : list) { sb.append(ke.de.getDiskId().getKeyId()); sb.append(" - "); sb.append(ke.de.getKey()); sb.append(" ** "); } return sb.toString(); } public List getSortedLiveEntries(Collection targetRegions) { int tlc = (int) this.totalLiveCount.get(); if (tlc <= 0) { // no need to create a KRF since this oplog will be deleted. // TODO should we create an empty KRF anyway? return null; } KRFEntry[] sortedLiveEntries = new KRFEntry[tlc]; int idx = 0; for (DiskRegionInfo dri : targetRegions) { // logger.info(LocalizedStrings.DEBUG, "DEBUG: dri=" + dri); if (dri.getDiskRegion() != null) { idx = dri.addLiveEntriesToList(sortedLiveEntries, idx); // logger.info(LocalizedStrings.DEBUG, "DEBUG: idx=" + idx); } } // idx is now the length of sortedLiveEntries Arrays.sort(sortedLiveEntries, 0, idx, new Comparator() { public int compare(KRFEntry o1, KRFEntry o2) { long val1 = o1.getOffsetInOplogForSorting(); long val2 = o2.getOffsetInOplogForSorting(); return Long.signum(val1 - val2); } }); return Arrays.asList(sortedLiveEntries).subList(0, idx); } /** * Asif:This function retrieves the value for an entry being compacted subject to * entry referencing the oplog being compacted. Attempt is made to retrieve the * value from in memory , if available, else from asynch buffers ( if asynch * mode is enabled), else from the Oplog being compacted. It is invoked from * switchOplog as well as OplogCompactor's compact function. * * @param entry * DiskEntry being compacted referencing the Oplog being compacted * @param wrapper * Object of type BytesAndBitsForCompactor. The data if found is * set in the wrapper Object. The wrapper Object also contains * the user bit associated with the entry * @return boolean false indicating that entry need not be compacted. If true it * means that wrapper has been appropriately filled with data */ private boolean getBytesAndBitsForCompaction(DiskRegionView dr, DiskEntry entry, BytesAndBitsForCompactor wrapper) { // caller is synced on did DiskId did = entry.getDiskId(); byte userBits = 0; long oplogOffset = did.getOffsetInOplog(); SimpleMemoryAllocatorImpl.skipRefCountTracking(); @Retained @Released Object value = entry._getValueRetain(dr, true); // OFFHEAP for now copy into heap CD; todo optimize by keeping offheap for life of wrapper SimpleMemoryAllocatorImpl.unskipRefCountTracking(); // TODO:KIRK:OK Object value = entry.getValueWithContext(dr); boolean foundData = false; if (value == null) { // Asif: If the mode is synch it is guaranteed to be present in the disk foundData = basicGetForCompactor(dr, oplogOffset, false, did.getValueLength(), did.getUserBits(), wrapper); // after we have done the get do one more check to see if the // disk id of interest is still stored in the current oplog. // Do this to fix bug 40648 // Since we now call this with the diskId synced I think // it is impossible for this oplogId to change. if (did.getOplogId() != getOplogId()) { // if it is not then no need to compact it // logger.info(LocalizedStrings.DEBUG, "DEBUG skipping #2 did.Oplog#" + did.getOplogId() + " was not oplog#" + getOplogId()); return false; } else { // if the disk id indicates its most recent value is in oplogInFocus // then we should have found data assert foundData : "compactor get failed on oplog#" + getOplogId(); } userBits = wrapper.getBits(); if (EntryBits.isAnyInvalid(userBits)) { if (EntryBits.isInvalid(userBits)) { wrapper.setData(DiskEntry.INVALID_BYTES, userBits, DiskEntry.INVALID_BYTES.length, false/* Can not be reused*/); } else { wrapper.setData(DiskEntry.LOCAL_INVALID_BYTES, userBits, DiskEntry.LOCAL_INVALID_BYTES.length, false/* Can not be reused*/); } } else if (EntryBits.isTombstone(userBits)) { wrapper.setData(DiskEntry.TOMBSTONE_BYTES, userBits, DiskEntry.TOMBSTONE_BYTES.length, false/* Can not be reused*/); } if (EntryBits.isWithVersions(did.getUserBits())) { userBits = EntryBits.setWithVersions(userBits, true); } } else { foundData = true; userBits = 0; if (EntryBits.isRecoveredFromDisk(did.getUserBits())) { userBits = EntryBits.setRecoveredFromDisk(userBits, true); } if (EntryBits.isWithVersions(did.getUserBits())) { userBits = EntryBits.setWithVersions(userBits, true); } // no need to preserve pendingAsync bit since we will clear it anyway since we // (the compactor) are writing the value out to disk. if (value == Token.INVALID) { userBits = EntryBits.setInvalid(userBits, true); wrapper.setData(DiskEntry.INVALID_BYTES, userBits, DiskEntry.INVALID_BYTES.length, false /* Cannot be reused */); } else if (value == Token.LOCAL_INVALID) { userBits = EntryBits.setLocalInvalid(userBits, true); wrapper.setData(DiskEntry.LOCAL_INVALID_BYTES, userBits, DiskEntry.LOCAL_INVALID_BYTES.length, false /* Cannot be reused */); } else if (value == Token.TOMBSTONE) { userBits = EntryBits.setTombstone(userBits, true); wrapper.setData(DiskEntry.TOMBSTONE_BYTES, userBits, DiskEntry.TOMBSTONE_BYTES.length, false /* Cannot be reused */); } else if (value instanceof CachedDeserializable) { CachedDeserializable proxy = (CachedDeserializable)value; if (proxy instanceof StoredObject) { @Released StoredObject ohproxy = (StoredObject) proxy; try { if (ohproxy.isSerialized()) { userBits = EntryBits.setSerialized(userBits, true); } ohproxy.fillSerializedValue(wrapper, userBits); } finally { OffHeapHelper.releaseWithNoTracking(ohproxy); } } else { userBits = EntryBits.setSerialized(userBits, true); proxy.fillSerializedValue(wrapper, userBits); } } else if (value instanceof byte[]) { byte[] valueBytes = (byte[])value; // Asif: If the value is already a byte array then the user bit // is 0, which is the default value of the userBits variable, // indicating that it is non serialized data. Thus it is // to be used as it is & not to be deserialized to // convert into Object wrapper.setData(valueBytes, userBits, valueBytes.length, false /* the wrapper is not reusable */); } else if (Token.isRemoved(value) && value != Token.TOMBSTONE) { //TODO - RVV - We need to handle tombstones differently here! if (entry.getDiskId().isPendingAsync()) { entry.getDiskId().setPendingAsync(false); try { getOplogSet().getChild().basicRemove(dr, entry, false, false); } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, dr.getName()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); getParent().getCache().getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, dr.getName()); } } else { rmLive(dr, entry); } foundData = false; } else if (value instanceof Delta && !((Delta)value).allowCreate()) { // skip ListOfDeltas foundData = false; } else { userBits = EntryBits.setSerialized(userBits, true); EntryEventImpl.fillSerializedValue(wrapper, value, userBits); } } if (foundData) { // since the compactor is writing it out clear the async flag entry.getDiskId().setPendingAsync(false); } return foundData; } /** * Modifies a key/value pair from a region entry on disk. Updates all of the * necessary {@linkplain DiskStoreStats statistics} and invokes basicModify * * @param entry * DiskEntry object representing the current Entry * * @param value * byte array representing the value * @param isSerializedObject * Do the bytes in value contain a serialized object * (or an actually byte array)? * @throws DiskAccessException * @throws IllegalStateException */ /* * Asif: Modified the code so as to reuse the already created ByteBuffer * during transition. Minimizing the synchronization allowing multiple put * operations for different entries to proceed concurrently for asynch mode */ public final void modify(LocalRegion region, DiskEntry entry, byte[] value, boolean isSerializedObject, boolean async) { if (getOplogSet().getChild() != this) { getOplogSet().getChild().modify(region, entry, value, isSerializedObject, async); } else { DiskId did = entry.getDiskId(); boolean exceptionOccured = false; byte prevUsrBit = did.getUserBits(); int len = did.getValueLength(); try { byte userBits = calcUserBits(value, isSerializedObject); // save versions for creates and updates even if value is bytearrary in 7.0 if (entry.getVersionStamp()!=null) { if(entry.getVersionStamp().getMemberID() == null) { throw new AssertionError("Version stamp should have a member at this point for entry " + entry); } // pdx and tx will not use version userBits = EntryBits.setWithVersions(userBits, true); } int valueLen = value != null ? value.length : 0; basicModify(region.getDiskRegion(), entry, value, valueLen, userBits, async, false); } catch (IOException ex) { exceptionOccured = true; region.getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, region.getFullPath()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); exceptionOccured = true; region.getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, region.getFullPath()); } finally { if (exceptionOccured) { did.setValueLength(len); did.setUserBits(prevUsrBit); } } } } public final void saveConflictVersionTag(LocalRegion region, VersionTag tag, boolean async) { if (getOplogSet().getChild() != this) { getOplogSet().getChild().saveConflictVersionTag(region, tag, async); } else { try { basicSaveConflictVersionTag(region.getDiskRegion(), tag, async); } catch (IOException ex) { region.getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_CONFLICT_VERSION_TAG_0.toLocalizedString(this.diskFile.getPath()), ex, region.getFullPath()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); region.getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_CONFLICT_VERSION_TAG_0.toLocalizedString(this.diskFile.getPath()), ie, region.getFullPath()); } } } private final void copyForwardForOfflineCompact(long oplogKeyId, byte[] keyBytes, byte[] valueBytes, byte userBits, long drId, VersionTag tag, long lastModifiedTime, final long currentTime) { try { basicCopyForwardForOfflineCompact(oplogKeyId, keyBytes, valueBytes, userBits, drId, tag, lastModifiedTime, currentTime); } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, getParent()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); getParent().getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, getParent()); } } private final void copyForwardModifyForCompact(DiskRegionView dr, DiskEntry entry, byte[] value, int valueLength, byte userBits) { if (getOplogSet().getChild() != this) { getOplogSet().getChild().copyForwardModifyForCompact(dr, entry, value, valueLength, userBits); } else { DiskId did = entry.getDiskId(); boolean exceptionOccured = false; int len = did.getValueLength(); try { // Compactor always says to do an async basicModify so that its writes // will be grouped. This is not a true async write; just a grouped one. basicModify(dr, entry, value, valueLength, userBits, true, true); } catch (IOException ex) { exceptionOccured = true; getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, getParent()); } catch (InterruptedException ie) { exceptionOccured = true; Thread.currentThread().interrupt(); getParent().getCancelCriterion().checkCancelInProgress(ie); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, getParent()); } finally { if (exceptionOccured) { did.setValueLength(len); } } } } /** * Asif: A helper function which identifies whether to modify the entry in the * current oplog or to make the switch to the next oplog. This function * enables us to reuse the byte buffer which got created for an oplog which no * longer permits us to use itself. It will also take acre of compaction if * required * * @param entry * DiskEntry object representing the current Entry * @throws IOException * @throws InterruptedException */ private void basicModify(DiskRegionView dr, DiskEntry entry, byte[] value, int valueLength, byte userBits, boolean async, boolean calledByCompactor) throws IOException, InterruptedException { DiskId id = entry.getDiskId(); boolean useNextOplog = false; long startPosForSynchOp = -1L; int adjustment = 0; Oplog emptyOplog = null; if (DiskStoreImpl.KRF_DEBUG) { // wait for cache close to create krf System.out.println("basicModify KRF_DEBUG"); Thread.sleep(1000); } synchronized (this.lock) { // synchronized (this.crf) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else { initOpState(OPLOG_MOD_ENTRY_1ID, dr, entry, value, valueLength, userBits, false); adjustment = getOpStateSize(); assert adjustment > 0; long temp = (this.crf.currSize + adjustment); if (temp > getMaxCrfSize() && !isFirstRecord()) { switchOpLog(dr, adjustment, entry, false); // we can't reuse it since it contains variable length data useNextOplog = true; } else { if (this.lockedForKRFcreate) { throw new CacheClosedException("The disk store is closed."); } this.firstRecord = false; // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "modify setting size to=" + temp // + " oplog#" + getOplogId()); // } long oldOplogId; // do the io while holding lock so that switch can set doneAppending // Write the data to the opLog for the synch mode startPosForSynchOp = writeOpLogBytes(this.crf, async, true); // if (this.crf.currSize != startPosForSynchOp) { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "currSize=" + this.crf.currSize // + " startPosForSynchOp=" + startPosForSynchOp // + " oplog#" + getOplogId()); // assert false; // } this.crf.currSize = temp; // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "modify stratPosForSyncOp=" + startPosForSynchOp // + " oplog#" + getOplogId()); // } // if (temp != this.crf.raf.getFilePointer()) { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "modify setting size to=" + temp // + " fp=" + this.crf.raf.getFilePointer() // + " oplog#" + getOplogId()); // } startPosForSynchOp += getOpStateValueOffset(); if (DiskStoreImpl.TRACE_WRITES) { VersionTag tag = null; if (entry.getVersionStamp()!=null) { tag = entry.getVersionStamp().asVersionTag(); } this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES basicModify: id=<" + abs(id.getKeyId()) + "> key=<" + entry.getKeyCopy() + ">" + " valueOffset=" + startPosForSynchOp + " userBits=" + userBits + " valueLen=" + valueLength + " valueBytes=<" + baToString(value, valueLength) + ">" + " drId=" + dr.getId() + " versionStamp=" + tag + " oplog#" + getOplogId()); } if (EntryBits.isNeedsValue(userBits)) { id.setValueLength(valueLength); } else { id.setValueLength(0); } id.setUserBits(userBits); if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicModify:About to Release ByteBuffer with data for Disk ID = " + id.toString()); } if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicModify:Released ByteBuffer with data for Disk ID = " + id.toString()); } synchronized (id) { // Need to do this while synced on id // now that we compact forward to most recent oplog. // @todo darrel: The sync logic in the disk code is so complex // a really doubt is is correct. // I think we need to do a fresh rewrite of it. oldOplogId = id.setOplogId(getOplogId()); if(EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits)) { id.setOffsetInOplog(-1); } else { id.setOffsetInOplog(startPosForSynchOp); } } // Set the oplog size change for stats // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "modify inc=" + adjustment); // } this.dirHolder.incrementTotalOplogSize(adjustment); this.incTotalCount(); // getKeyCopy() is a potentially expensive operation in GemFireXD so // qualify with EntryLogger.isEnabled() first if (EntryLogger.isEnabled()) { EntryLogger.logPersistPut(dr.getName(), entry.getKeyCopy(), dr.getDiskStoreID()); } if (oldOplogId != getOplogId()) { Oplog oldOplog = getOplogSet().getChild(oldOplogId); if (oldOplog != null) { oldOplog.rmLive(dr, entry); emptyOplog = oldOplog; } addLive(dr, entry); // Note if this mod was done to oldOplog then this entry is already in // the linked list. All we needed to do in this case is call incTotalCount } else { getOrCreateDRI(dr).update(entry); } //Update the region version vector for the disk store. //This needs to be done under lock so that we don't switch oplogs //unit the version vector accurately represents what is in this oplog RegionVersionVector rvv = dr.getRegionVersionVector(); final VersionStamp version; if (rvv != null && (version = entry.getVersionStamp()) != null) { //TODO: Asif: Temporary fix for Bug #47395. //We need to find out the actual cause as to why the DiskEntry's Version Tag does not contain DiskStoreID if(version.getMemberID() == null) { version.setMemberID(dr.getDiskStoreID()); } rvv.recordVersion(version.getMemberID(), version.getRegionVersion()); } } clearOpState(); } // } } if (useNextOplog) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } Assert.assertTrue(getOplogSet().getChild() != this); getOplogSet().getChild().basicModify(dr, entry, value, valueLength, userBits, async, calledByCompactor); } else { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance() .afterSettingOplogOffSet(startPosForSynchOp); } if (emptyOplog != null && (!emptyOplog.isCompacting() || emptyOplog.calledByCompactorThread())) { if (calledByCompactor && emptyOplog.hasNoLiveValues()) { // Since compactor will only append to crf no need to flush drf. // Before we have the compactor delete an oplog it has emptied out // we want to have it flush anything it has written to the current oplog. // Note that since sync writes may be done to the same oplog we are doing // async writes to any sync writes will cause a flush to be done immediately. flushAll(true); } emptyOplog.handleNoLiveValues(); } } } private void basicSaveConflictVersionTag(DiskRegionView dr, VersionTag tag, boolean async) throws IOException, InterruptedException { boolean useNextOplog = false; int adjustment = 0; synchronized (this.lock) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else { this.opState.initialize(OPLOG_CONFLICT_VERSION, dr.getId(), tag); adjustment = getOpStateSize(); assert adjustment > 0; long temp = (this.crf.currSize + adjustment); if (temp > getMaxCrfSize() && !isFirstRecord()) { switchOpLog(dr, adjustment, null, false); // we can't reuse it since it contains variable length data useNextOplog = true; } else { if (this.lockedForKRFcreate) { throw new CacheClosedException("The disk store is closed."); } this.firstRecord = false; writeOpLogBytes(this.crf, async, true); this.crf.currSize = temp; if (DiskStoreImpl.TRACE_WRITES) { this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES basicSaveConflictVersionTag:" + " drId=" + dr.getId() + " versionStamp=" + tag + " oplog#" + getOplogId()); } this.dirHolder.incrementTotalOplogSize(adjustment); //Update the region version vector for the disk store. //This needs to be done under lock so that we don't switch oplogs //unit the version vector accurately represents what is in this oplog RegionVersionVector rvv = dr.getRegionVersionVector(); if(rvv != null) { rvv.recordVersion(tag.getMemberID(), tag.getRegionVersion()); } } clearOpState(); } } if (useNextOplog) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } Assert.assertTrue(getOplogSet().getChild() != this); getOplogSet().getChild().basicSaveConflictVersionTag(dr, tag, async); } } private void basicCopyForwardForOfflineCompact(long oplogKeyId, byte[] keyBytes, byte[] valueBytes, byte userBits, long drId, VersionTag tag, long lastModifiedTime, final long currentTime) throws IOException, InterruptedException { boolean useNextOplog = false; long startPosForSynchOp = -1L; int adjustment = 0; synchronized (this.lock) { // synchronized (this.crf) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else { this.opState.initialize(oplogKeyId, keyBytes, valueBytes, userBits, drId, tag, lastModifiedTime, false); adjustment = getOpStateSize(); assert adjustment > 0; long temp = (this.crf.currSize + adjustment); if (temp > getMaxCrfSize() && !isFirstRecord()) { switchOpLog(null, adjustment, null, false); // we can't reuse it since it contains variable length data useNextOplog = true; } else { this.firstRecord = false; // do the io while holding lock so that switch can set doneAppending // Write the data to the opLog async since we are offline compacting startPosForSynchOp = writeOpLogBytes(this.crf, true, true); this.crf.currSize = temp; startPosForSynchOp += getOpStateValueOffset(); getOplogSet().getChild().writeOneKeyEntryForKRF(keyBytes, userBits, valueBytes.length, drId, oplogKeyId, startPosForSynchOp, tag, lastModifiedTime, currentTime); if (DiskStoreImpl.TRACE_WRITES) { this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES basicCopyForwardForOfflineCompact: id=<" + oplogKeyId + "> keyBytes=<" + baToString(keyBytes) + ">" + " valueOffset=" + startPosForSynchOp + " userBits=" + userBits + " valueLen=" + valueBytes.length + " valueBytes=<" + baToString(valueBytes) + ">" + " drId=" + drId + " oplog#" + getOplogId()); } this.dirHolder.incrementTotalOplogSize(adjustment); this.incTotalCount(); } clearOpState(); } // } } if (useNextOplog) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } Assert.assertTrue(getOplogSet().getChild() != this); getOplogSet().getChild().basicCopyForwardForOfflineCompact(oplogKeyId, keyBytes, valueBytes, userBits, drId, tag, lastModifiedTime, currentTime); } } private boolean isCompacting() { return this.compacting; } private void addLive(DiskRegionView dr, DiskEntry de) { // logger.info(LocalizedStrings.DEBUG, "DEBUG: addLive oplog#" + getOplogId() // + " de=" + de); getOrCreateDRI(dr).addLive(de); incLiveCount(); } private void rmLive(DiskRegionView dr, DiskEntry de) { if (getOrCreateDRI(dr).rmLive(de)) { // logger.info(LocalizedStrings.DEBUG, "DEBUG: rmLive oplog#" + getOplogId() // + " de=" + de); decLiveCount(); } } private DiskRegionInfo getDRI(Long drId) { return this.regionMap.get(drId); } private DiskRegionInfo getDRI(DiskRegionView dr) { return getDRI(dr.getId()); } public DiskRegionInfo getOrCreateDRI(DiskRegionView dr) { DiskRegionInfo dri = getDRI(dr); if (dri == null) { dri = (isCompactionPossible() || couldHaveKrf()) ? new DiskRegionInfoWithList(dr, couldHaveKrf(), this.krfCreated.get()) : new DiskRegionInfoNoList(dr); DiskRegionInfo oldDri = this.regionMap.putIfAbsent(dr.getId(), dri); if (oldDri != null) { dri = oldDri; } } return dri; } public boolean needsKrf() { return couldHaveKrf() && !krfCreated.get(); } /** * @return true if this Oplog could end up having a KRF file. */ private boolean couldHaveKrf() { return getOplogSet().couldHaveKrf(); } private DiskRegionInfo getOrCreateDRI(Long drId) { DiskRegionInfo dri = getDRI(drId); if (dri == null) { dri = (isCompactionPossible() || couldHaveKrf()) ? new DiskRegionInfoWithList(null, couldHaveKrf(), this.krfCreated.get()) : new DiskRegionInfoNoList(null); DiskRegionInfo oldDri = this.regionMap.putIfAbsent(drId, dri); if (oldDri != null) { dri = oldDri; } } return dri; } /** * Removes the key/value pair with the given id on disk. * * @param entry * DiskEntry object on which remove operation is called */ public final void remove(LocalRegion region, DiskEntry entry, boolean async, boolean isClear) { DiskRegion dr = region.getDiskRegion(); if (getOplogSet().getChild() != this) { getOplogSet().getChild().remove(region, entry, async, isClear); } else { DiskId did = entry.getDiskId(); boolean exceptionOccured = false; byte prevUsrBit = did.getUserBits(); int len = did.getValueLength(); try { basicRemove(dr, entry, async, isClear); } catch (IOException ex) { exceptionOccured = true; getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, dr.getName()); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); region.getCancelCriterion().checkCancelInProgress(ie); exceptionOccured = true; throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0_DUE_TO_FAILURE_IN_ACQUIRING_READ_LOCK_FOR_ASYNCH_WRITING.toLocalizedString(this.diskFile.getPath()), ie, dr.getName()); } finally { if (exceptionOccured) { did.setValueLength(len); did.setUserBits(prevUsrBit); } } } } /** * Write the GC RVV for a single region to disk */ public final void writeGCRVV(DiskRegion dr) { boolean useNextOplog = false; synchronized (this.lock) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else { try { writeRVVRecord( this.drf, Collections. singletonMap(dr.getId(), dr), true); } catch (IOException ex) { dr.getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException( LocalizedStrings.Oplog_FAILED_RECORDING_RVV_BECAUSE_OF_0.toLocalizedString(this.diskFile .getPath()), ex, dr.getName()); } } } if(useNextOplog) { getOplogSet().getChild().writeGCRVV(dr); } else { DiskStoreObserver.endWriteGCRVV(dr); } } /** * There're 3 cases to use writeRVV: * 1) endGII: DiskRegion.writeRVV(region=null, true), Oplog.writeRVV(true,null) * 2) beginGII: DiskRegion.writeRVV(region=this, false), Oplog.writeRVV(false,sourceRVV!=null) * 3) clear: DiskRegion.writeRVV(region=this, null), Oplog.writeRVV(null,sourceRVV!=null) */ public void writeRVV(DiskRegion dr, RegionVersionVector sourceRVV, Boolean isRVVTrusted) { boolean useNextOplog = false; synchronized (this.lock) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else { try { //We'll update the RVV of the disk region while holding the lock on the oplog, //to make sure we don't switch oplogs while we're in the middle of this. if (sourceRVV != null) { dr.getRegionVersionVector().recordVersions(sourceRVV); } else { // it's original EndGII, not to write duplicate rvv if its trusted if (dr.getRVVTrusted()) { return; } } if (isRVVTrusted != null) { // isRVVTrusted == null means "as is" dr.setRVVTrusted(isRVVTrusted); } writeRVVRecord( this.crf, Collections. singletonMap(dr.getId(), dr), false); } catch (IOException ex) { dr.getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException( LocalizedStrings.Oplog_FAILED_RECORDING_RVV_BECAUSE_OF_0.toLocalizedString(this.diskFile .getPath()), ex, dr.getName()); } } } if(useNextOplog) { getOplogSet().getChild().writeRVV(dr, sourceRVV, isRVVTrusted); } } private long getMaxCrfSize() { return this.maxCrfSize; } private long getMaxDrfSize() { return this.maxDrfSize; } private void setMaxCrfDrfSize() { int crfPct = Integer.getInteger("gemfire.CRF_MAX_PERCENTAGE", 90); if (crfPct > 100 || crfPct < 0) { crfPct = 90; } this.maxCrfSize = (long)(this.maxOplogSize * (crfPct / 100.0)); this.maxDrfSize = this.maxOplogSize - this.maxCrfSize; } /** * * Asif: A helper function which identifies whether to record a removal of * entry in the current oplog or to make the switch to the next oplog. This * function enables us to reuse the byte buffer which got created for an oplog * which no longer permits us to use itself. It will also take acre of * compaction if required * * @param entry * DiskEntry object representing the current Entry * @throws IOException * @throws InterruptedException */ private void basicRemove(DiskRegionView dr, DiskEntry entry, boolean async, boolean isClear) throws IOException, InterruptedException { DiskId id = entry.getDiskId(); boolean useNextOplog = false; long startPosForSynchOp = -1; Oplog emptyOplog = null; if (DiskStoreImpl.KRF_DEBUG) { // wait for cache close to create krf System.out.println("basicRemove KRF_DEBUG"); Thread.sleep(1000); } synchronized (this.lock) { if (getOplogSet().getChild() != this) { useNextOplog = true; } else if ((this.drf.currSize + MAX_DELETE_ENTRY_RECORD_BYTES) > getMaxDrfSize() && !isFirstRecord()) { switchOpLog(dr, MAX_DELETE_ENTRY_RECORD_BYTES, entry, false); useNextOplog = true; } else { if (this.lockedForKRFcreate) { throw new CacheClosedException("The disk store is closed."); } long oldOplogId = id.setOplogId(getOplogId()); if(!isClear) { this.firstRecord = false; // Ok now we can go ahead and find out its actual size // This is the only place to set notToUseUserBits=true initOpState(OPLOG_DEL_ENTRY_1ID, dr, entry, null, (byte)0, true); int adjustment = getOpStateSize(); this.drf.currSize += adjustment; // do the io while holding lock so that switch can set doneAppending if (this.logger.finerEnabled()) { this.logger .finer(" Oplog::basicRemove: Recording the Deletion of entry in the Oplog with id = " + getOplogId() + " The Oplog Disk ID for the entry being deleted =" + id + " Mode is Synch"); } // Write the data to the opLog for the synch mode // @todo if we don't sync write destroys what will happen if // we do 1. create k1 2. destroy k1 3. create k1? // It would be possible for the crf to be flushed but not the drf. // Then during recovery we will find identical keys with different entryIds. // I think we can safely have drf writes be async as long as we flush the drf // before we flush the crf. // However we can't have removes by async if we are doing a sync write // because we might be killed right after we do this write. startPosForSynchOp = writeOpLogBytes(this.drf, async, true); setHasDeletes(true); if (DiskStoreImpl.TRACE_WRITES) { this.logger.info(LocalizedStrings.DEBUG, "TRACE_WRITES basicRemove: id=<" + abs(id.getKeyId()) + "> key=<" + entry.getKeyCopy() + ">" + " drId=" + dr.getId() + " oplog#" + getOplogId()); } // new RuntimeException("STACK")); if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicRemove:About to Release ByteBuffer for Disk ID = " + id.toString()); } if (this.logger.finerEnabled()) { this.logger .finer("Oplog::basicRemove:Released ByteBuffer for Disk ID = " + id.toString()); } this.dirHolder.incrementTotalOplogSize(adjustment); } //Set the oplog size change for stats // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "rm inc=" + adjustment); // } id.setOffsetInOplog(-1); // getKeyCopy() is a potentially expensive operation in GemFireXD so // qualify with EntryLogger.isEnabled() first if (EntryLogger.isEnabled()) { EntryLogger.logPersistDestroy(dr.getName(), entry.getKeyCopy(), dr.getDiskStoreID()); } { Oplog rmOplog = null; if (oldOplogId == getOplogId()) { rmOplog = this; } else { rmOplog = getOplogSet().getChild(oldOplogId); } if (rmOplog != null) { rmOplog.rmLive(dr, entry); emptyOplog = rmOplog; } } clearOpState(); } } if (useNextOplog) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance().afterSwitchingOplog(); } Assert.assertTrue(getOplogSet().getChild() != this); getOplogSet().getChild().basicRemove(dr, entry, async, isClear); } else { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { CacheObserverHolder.getInstance() .afterSettingOplogOffSet(startPosForSynchOp); } if (emptyOplog != null && (!emptyOplog.isCompacting() || emptyOplog.calledByCompactorThread())) { emptyOplog.handleNoLiveValues(); } } } // /** // * This is only used for an assertion check. // */ // private long lastWritePos = -1; /** * test hook */ public final ByteBuffer getWriteBuf() { return this.crf.writeBuf; } private final void flushNoSync(OplogFile olf) throws IOException { flushAllNoSync(false); // @todo //flush(olf, false); } private final void flushAndSync(OplogFile olf) throws IOException { flushAll(false); // @todo //flush(olf, true); } private final void flush(OplogFile olf, boolean doSync, boolean dofsync) throws IOException { try { synchronized (this.lock/*olf*/) { if (olf.RAFClosed) { // logger.info(LocalizedStrings.DEBUG, "DEBUG: no need to flush because RAFClosed" // + " oplog#" + getOplogId() // + ((olf==this.crf)? "crf" : "drf")); return; } ByteBuffer bb = olf.writeBuf; // logger.info(LocalizedStrings.DEBUG, "DEBUG: flush " // + " oplog#" + getOplogId() // + " bb.position()=" + ((bb != null) ? bb.position() : "null") // + ((olf==this.crf)? "crf" : "drf")); if (bb != null && bb.position() != 0) { bb.flip(); int flushed = 0; do { // { // byte[] toPrint = new byte[bb.remaining()]; // for (int i=0; i < bb.remaining(); i++) { // toPrint[i] = bb.get(i); // } // logger.info(LocalizedStrings.DEBUG, "DEBUG: flush writing bytes at offset=" + olf.bytesFlushed // + " position=" + olf.channel.position() // + " bytes=" + baToString(toPrint) // + " oplog#" + getOplogId() // + ((olf==this.crf)? "crf" : "drf")); // } flushed += olf.channel.write(bb); // logger.info(LocalizedStrings.DEBUG, "DEBUG: flush bytesFlushed=" + olf.bytesFlushed // + " position=" + olf.channel.position() // + " oplog#" + getOplogId() // + ((olf==this.crf)? "crf" : "drf")); } while (bb.hasRemaining()); // update bytesFlushed after entire writeBuffer is flushed to fix bug 41201 olf.bytesFlushed += flushed; bb.clear(); } } if (doSync) { if (dofsync && !DiskStoreImpl.DISABLE_SYNC_WRITES_FOR_TESTS) { // Synch Meta Data as well as content olf.channel.force(true); } } } catch (ClosedChannelException ignore) { // It is possible for a channel to be closed when our code does not // explicitly call channel.close (when we will set RAFclosed). // This can happen when a thread is doing an io op and is interrupted. // That thread will see ClosedByInterruptException but it will also // close the channel and then we will see ClosedChannelException. } } public final void flushAll() { flushAll(false); } public final void flushAllNoSync(boolean skipDrf) { flushAll(skipDrf, false); } public final void flushAll(boolean skipDrf) { flushAll(skipDrf, true/*doSync*/); } public final void flushAll(boolean skipDrf, boolean doSync) { flushAll(skipDrf, doSync, getParent().getSyncWrites()); } public final void flushAll(boolean skipDrf, boolean doSync, boolean dofsync) { try { // if (!skipDrf) { // @todo if skipDrf then only need to do drf if crf has flushable data flush(this.drf, doSync, dofsync); // } flush(this.crf, doSync, dofsync); } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_WRITING_KEY_TO_0.toLocalizedString(this.diskFile.getPath()), ex, getParent()); } } final void flushAllAndSync() { lockCompactor(); try { flushAll(false, true, true); } finally { unlockCompactor(); } } final void flushAllAndSync(boolean noCompactorLock) { flushAll(false, true, true); } /** * Asif: Since the ByteBuffer being writen to can have additional bytes which * are used for extending the size of the file, it is necessary that the * ByteBuffer provided should have limit which is set to the position till * which it contains the actual bytes. If the mode is synched write then only * we will write up to the capacity & opLogSpace variable have any meaning. * For asynch mode it will be zero. Also this method must be synchronized on * the file , whether we use synch or asynch write because the fault in * operations can clash with the asynch writing. Write the specified bytes to * the oplog. Note that since extending a file is expensive this code will * possibly write OPLOG_EXTEND_SIZE zero bytes to reduce the number of times * the file is extended. * * * @param olf the file to write the bytes to * @return The long offset at which the data present in the ByteBuffer gets * written to */ private long writeOpLogBytes(OplogFile olf, boolean async, boolean doFlushIfSync) throws IOException { long startPos = -1L; synchronized (this.lock/*olf*/) { Assert.assertTrue(!this.doneAppending); if (this.closed) { Assert.assertTrue( false, "The Oplog " + this.oplogId + " for store " + getParent().getName() + " has been closed for synch mode while writing is going on. This should not happen"); } // Asif : It is assumed that the file pointer is already at the // appropriate position in the file so as to allow writing at the end. // Any fault in operations will set the pointer back to the write location. // Also it is only in case of synch writing, we are writing more // than what is actually needed, we will have to reset the pointer. // Also need to add in offset in writeBuf in case we are not flushing writeBuf startPos = olf.channel.position() + olf.writeBuf.position(); // logger.info(LocalizedStrings.DEBUG, "writeOpLogBytes" // + " position=" + olf.channel.position() // + " writeBufPos=" + olf.writeBuf.position() // + " startPos=" + startPos // + " " + getFileType(olf) + "#" + getOplogId() // + " opStateSize=" + getOpStateSize() // + this.opState.debugStr()); // Assert.assertTrue(startPos > lastWritePos, // "startPos=" + startPos + // " was not > lastWritePos=" + lastWritePos); long bytesWritten = this.opState.write(olf); if (!async && doFlushIfSync) { flushAndSync(olf); } getStats().incWrittenBytes(bytesWritten, async); // // Moved the set of lastWritePos to after write // // so if write throws an exception it will not be updated. // // This fixes bug 40449. // this.lastWritePos = startPos; } return startPos; } boolean isRAFOpen() { return !this.crf.RAFClosed; // volatile read } private boolean okToReopen; boolean closeRAF() { if (this.beingRead) return false; synchronized (this.lock/*crf*/) { if (this.beingRead) return false; if (!this.doneAppending) return false; if (this.crf.RAFClosed) { return false; } else { try { this.crf.raf.close(); } catch (IOException ignore) { } this.crf.RAFClosed = true; this.okToReopen = true; this.stats.decOpenOplogs(); return true; } } } private volatile boolean beingRead; /** * If crfRAF has been closed then attempt to reopen the oplog for this read. * Verify that this only happens when test methods are invoked. * @return true if oplog file is open and can be read from; false if not */ private boolean reopenFileIfClosed() throws IOException { synchronized (this.lock/*crf*/) { boolean result = !this.crf.RAFClosed; if (!result && this.okToReopen) { result = true; this.crf.raf = new RandomAccessFile(this.crf.f, "r"); this.stats.incOpenOplogs(); this.crf.RAFClosed = false; this.okToReopen = false; } return result; } } private BytesAndBits attemptGet(DiskRegionView dr, long offsetInOplog, boolean bitOnly, int valueLength, byte userBits) throws IOException { boolean didReopen = false; boolean accessedInactive = false; try { synchronized (this.lock/*crf*/) { // if (this.closed || this.deleted.get()) { // throw new DiskAccessException("attempting get on " // + (this.deleted.get() ? "destroyed" : "closed") // + " oplog #" + getOplogId(), this.owner); // } this.beingRead = true; final long readPosition = offsetInOplog; if (/*!getParent().isSync() since compactor groups writes && */ (readPosition+valueLength) > this.crf.bytesFlushed && !this.closed) { flushAllNoSync(true); // fix for bug 41205 } try { RandomAccessFile myRAF = null; if (this.crf.RAFClosed) { myRAF = new RandomAccessFile(this.crf.f, "r"); this.stats.incOpenOplogs(); if (this.okToReopen) { this.crf.RAFClosed = false; this.okToReopen = false; this.crf.raf = myRAF; didReopen = true; } } else { myRAF = this.crf.raf; accessedInactive = true; } BytesAndBits bb = null; try { final long writePosition = (this.doneAppending) ? this.crf.bytesFlushed : myRAF.getFilePointer(); if ((readPosition+valueLength) > writePosition) { // logger.info(LocalizedStrings.DEBUG, "DEBUG: crfSize=" + this.crf.currSize // + " fp=" + myRAF.getFilePointer() // + " rp=" + readPosition // + " oplog#" + getOplogId()); throw new DiskAccessException(LocalizedStrings.Oplog_TRIED_TO_SEEK_TO_0_BUT_THE_FILE_LENGTH_IS_1_OPLOG_FILE_OBJECT_USED_FOR_READING_2.toLocalizedString(new Object[] {readPosition+valueLength, writePosition, this.crf.raf}), dr.getName()); } else if (readPosition < 0) { throw new DiskAccessException(LocalizedStrings.Oplog_CANNOT_FIND_RECORD_0_WHEN_READING_FROM_1.toLocalizedString(new Object[] {offsetInOplog, this.diskFile.getPath()}), dr.getName()); } try { myRAF.seek(readPosition); // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "after seek rp=" + readPosition // + " fp=" + myRAF.getFilePointer() // + " oplog#" + getOplogId()); // } this.stats.incOplogSeeks(); byte[] valueBytes = new byte[valueLength]; myRAF.readFully(valueBytes); // logger.info(LocalizedStrings.DEBUG, // "DEBUG attemptGet readPosition=" + readPosition // + " valueLength=" + valueLength // + " value=<" + baToString(valueBytes) + ">" // + " oplog#" + getOplogId()); this.stats.incOplogReads(); bb = new BytesAndBits(valueBytes, userBits); // also set the product version for an older product final Version version = getProductVersionIfOld(); if (version != null) { bb.setVersion(version); } } finally { // if this oplog is no longer being appended to then don't waste disk io if (!this.doneAppending) { // by seeking back to writePosition myRAF.seek(writePosition); // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "after seek wp=" + writePosition // + " position=" + this.crf.channel.position() // + " fp=" + myRAF.getFilePointer() // + " oplog#" + getOplogId()); // } this.stats.incOplogSeeks(); } } return bb; } finally { if (myRAF != this.crf.raf) { try { myRAF.close(); } catch (IOException ignore) { } } } } finally { this.beingRead = false; // if (this.closed || this.deleted.get()) { // throw new DiskAccessException("attempting get on " // + (this.deleted.get() ? "destroyed" : "closed") // + " oplog #" + getOplogId(), this.owner); // } } } // sync } finally { if (accessedInactive) { getOplogSet().inactiveAccessed(this); } else if (didReopen) { getOplogSet().inactiveReopened(this); } } } /** * Asif: Extracts the Value byte array & UserBit from the OpLog * * @param offsetInOplog * The starting position from which to read the data in the opLog * @param bitOnly * boolean indicating whether the value needs to be extracted along * with the UserBit or not. * @param valueLength * The length of the byte array which represents the value * @param userBits * The userBits of the value. * @return BytesAndBits object which wraps the extracted value & user bit */ private BytesAndBits basicGet(DiskRegionView dr, long offsetInOplog, boolean bitOnly, int valueLength, byte userBits) { BytesAndBits bb = null; if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits) || bitOnly || valueLength == 0) { if (EntryBits.isInvalid(userBits)) { bb = new BytesAndBits(DiskEntry.INVALID_BYTES, userBits); } else if (EntryBits.isTombstone(userBits)) { bb = new BytesAndBits(DiskEntry.TOMBSTONE_BYTES, userBits); } else { bb = new BytesAndBits(DiskEntry.LOCAL_INVALID_BYTES, userBits); } } else { if (offsetInOplog == -1) return null; try { for (;;) { dr.getCancelCriterion().checkCancelInProgress(null); boolean interrupted = Thread.interrupted(); try { bb = attemptGet(dr, offsetInOplog, bitOnly, valueLength, userBits); break; } catch (InterruptedIOException e) { // bug 39756 // ignore, we'll clear and retry. } finally { if (interrupted) { Thread.currentThread().interrupt(); } } } // for } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException(LocalizedStrings.Oplog_FAILED_READING_FROM_0_OPLOGID_1_OFFSET_BEING_READ_2_CURRENT_OPLOG_SIZE_3_ACTUAL_FILE_SIZE_4_IS_ASYNCH_MODE_5_IS_ASYNCH_WRITER_ALIVE_6 .toLocalizedString( new Object[] { this.diskFile.getPath(), Long.valueOf(this.oplogId), Long.valueOf(offsetInOplog), Long.valueOf(this.crf.currSize), Long.valueOf(this.crf.bytesFlushed), Boolean.valueOf(!dr.isSync()), Boolean.valueOf(false) }), ex, dr.getName()); } catch (IllegalStateException ex) { checkClosed(); throw ex; } } return bb; } /** * Asif: Extracts the Value byte array & UserBit from the OpLog and inserts it * in the wrapper Object of type BytesAndBitsForCompactor which is passed * * @param offsetInOplog * The starting position from which to read the data in the * opLog * @param bitOnly * boolean indicating whether the value needs to be extracted * along with the UserBit or not. * @param valueLength * The length of the byte array which represents the value * @param userBits * The userBits of the value. * @param wrapper * Object of type BytesAndBitsForCompactor. The data is set in the * wrapper Object. The wrapper Object also contains the user * bit associated with the entry * @return true if data is found false if not */ private boolean basicGetForCompactor(DiskRegionView dr, long offsetInOplog, boolean bitOnly, int valueLength, byte userBits, BytesAndBitsForCompactor wrapper) { if (EntryBits.isAnyInvalid(userBits) || EntryBits.isTombstone(userBits) || bitOnly || valueLength == 0) { if (EntryBits.isInvalid(userBits)) { wrapper.setData(DiskEntry.INVALID_BYTES, userBits, DiskEntry.INVALID_BYTES.length, false /* Cannot be reused */); } else if (EntryBits.isTombstone(userBits)) { wrapper.setData(DiskEntry.TOMBSTONE_BYTES, userBits, DiskEntry.TOMBSTONE_BYTES.length, false /* Cannot be reused */); } else { wrapper.setData(DiskEntry.LOCAL_INVALID_BYTES, userBits, DiskEntry.LOCAL_INVALID_BYTES.length, false /* Cannot be reused */); } } else { try { synchronized (this.lock/*crf*/) { final long readPosition = offsetInOplog; if (/*!getParent().isSync() since compactor groups writes && */ (readPosition+valueLength) > this.crf.bytesFlushed && !this.closed) { flushAllNoSync(true); // fix for bug 41205 } if (!reopenFileIfClosed()) { return false; // fix for bug 40648 } final long writePosition = (this.doneAppending) ? this.crf.bytesFlushed : this.crf.raf.getFilePointer(); if ((readPosition+valueLength) > writePosition) { throw new DiskAccessException( LocalizedStrings.Oplog_TRIED_TO_SEEK_TO_0_BUT_THE_FILE_LENGTH_IS_1_OPLOG_FILE_OBJECT_USED_FOR_READING_2.toLocalizedString( new Object[] {readPosition+valueLength, writePosition, this.crf.raf}), dr.getName()); } else if (readPosition < 0) { throw new DiskAccessException( LocalizedStrings.Oplog_CANNOT_FIND_RECORD_0_WHEN_READING_FROM_1 .toLocalizedString( new Object[] { Long.valueOf(offsetInOplog), this.diskFile.getPath()}), dr.getName()); } // if (this.closed || this.deleted.get()) { // throw new DiskAccessException("attempting get on " // + (this.deleted.get() ? "destroyed" : "closed") // + " oplog #" + getOplogId(), this.owner); // } try { this.crf.raf.seek(readPosition); // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "after seek rp=" + readPosition // + " fp=" + this.crf.raf.getFilePointer() // + " oplog#" + getOplogId()); // } this.stats.incOplogSeeks(); byte[] valueBytes = null; if (wrapper.getBytes().length < valueLength) { valueBytes = new byte[valueLength]; this.crf.raf.readFully(valueBytes); } else { valueBytes = wrapper.getBytes(); this.crf.raf.readFully(valueBytes, 0, valueLength); } // this.logger.info(LocalizedStrings.DEBUG, "DEBUG: basicGetForCompact readPosition=" // + readPosition // + " length=" + valueLength // + " valueBytes=" + baToString(valueBytes)); this.stats.incOplogReads(); Version version = getProductVersionIfOld(); if (version != null) { wrapper.setVersion(version); } wrapper.setData(valueBytes, userBits, valueLength, true); } finally { // if this oplog is no longer being appended to then don't waste disk io if (!this.doneAppending) { this.crf.raf.seek(writePosition); // { // LogWriterI18n l = parent.getOwner().getCache().getLoggerI18n(); // l.info(LocalizedStrings.DEBUG, "after seek wp=" + writePosition // + " position=" + this.crf.channel.position() // + " fp=" + this.crf.raf.getFilePointer() // + " oplog#" + getOplogId()); // } this.stats.incOplogSeeks(); } // if (this.closed || this.deleted.get()) { // throw new DiskAccessException("attempting get on " // + (this.deleted.get() ? "destroyed" : "closed") // + " oplog #" + getOplogId(), this.owner); // } } } } catch (IOException ex) { getParent().getCancelCriterion().checkCancelInProgress(ex); throw new DiskAccessException( LocalizedStrings.Oplog_FAILED_READING_FROM_0_OPLOG_DETAILS_1_2_3_4_5_6 .toLocalizedString( new Object[] { this.diskFile.getPath(), Long.valueOf(this.oplogId), Long.valueOf(offsetInOplog), Long.valueOf(this.crf.currSize), Long.valueOf(this.crf.bytesFlushed), Boolean.valueOf(/*!dr.isSync() @todo */false), Boolean.valueOf(false)}), ex, dr.getName()); } catch (IllegalStateException ex) { checkClosed(); throw ex; } } return true; } private final AtomicBoolean deleted = new AtomicBoolean(); /** * deletes the oplog's file(s) */ void deleteFiles(boolean crfOnly) { // try doing the removeOplog unconditionally since I'm see an infinite loop // in destroyOldestReadyToCompact boolean needsDestroy = this.deleted.compareAndSet(false, true); if (needsDestroy) { // I don't under stand why the compactor would have anything to do with // an oplog file that we are removing from disk. // So I'm commenting out the following if // if (!isCompactionPossible()) { // moved this from close to fix bug 40574 // If we get to the point that it is ok to close the file // then we no longer need the parent to be able to find this // oplog using its id so we can unregister it now. // If compaction is possible then we need to leave this // oplog registered with the parent and allow the compactor to unregister it. // } deleteCRF(); if (!crfOnly || !getHasDeletes()) { setHasDeletes(false); deleteDRF(); // no need to call removeDrf since parent removeOplog did it //getParent().removeDrf(this); // getParent().oplogSetRemove(this); } this.idxkrf.deleteIRF(null); //Fix for bug 42495 - Don't remove the oplog from this list //of oplogs until it has been removed from the init file. This guarantees //that if the oplog is in the init file, the backup code can find it and //try to back it up. boolean addToDrfOnly = crfOnly && getHasDeletes(); getOplogSet().removeOplog(getOplogId(), true, addToDrfOnly ? this : null); } else if (!crfOnly && getHasDeletes()) { setHasDeletes(false); deleteDRF(); getOplogSet().removeDrf(this); // getParent().oplogSetRemove(this); } } public void deleteCRF() { oplogSet.crfDelete(this.oplogId); DiskStoreBackup inProgressBackup = getParent().getInProgressBackup(); if(inProgressBackup == null || !inProgressBackup.deferCrfDelete(this)) { deleteCRFFileOnly(); } } public void deleteCRFFileOnly() { deleteFile(this.crf); // replace .crf at the end with .krf if (this.crf.f != null) { final File krf = new File(this.crf.f.getAbsolutePath().replaceFirst( "\\" + CRF_FILE_EXT + "$", KRF_FILE_EXT)); if (!krf.exists()) { return; } getParent().executeDelayedExpensiveWrite(new Runnable() { public void run() { if (!krf.delete()) { if (krf.exists()) { logger.warning(LocalizedStrings.Oplog_DELETE_FAIL_0_1_2, new Object[] { Oplog.this.toString(), "krf", getParent().getName()}); } } else { if (logger.infoEnabled()) { logger.info(LocalizedStrings.Oplog_DELETE_0_1_2, new Object[] { Oplog.this.toString(), "krf", getParent().getName() }); } } } }); } } public void deleteDRF() { getOplogSet().drfDelete(this.oplogId); DiskStoreBackup inProgressBackup = getParent().getInProgressBackup(); if(inProgressBackup == null || !inProgressBackup.deferDrfDelete(this)) { deleteDRFFileOnly(); } } public void deleteDRFFileOnly() { deleteFile(this.drf); } /** * Returns "crf" or "drf". */ private static String getFileType(OplogFile olf) { String name = olf.f.getName(); int index = name.lastIndexOf('.'); return name.substring(index+1); } private void deleteFile(final OplogFile olf) { synchronized(this.lock) { if (olf.currSize != 0) { this.dirHolder.decrementTotalOplogSize(olf.currSize); olf.currSize = 0; } if (olf.f == null) return; if (!olf.f.exists()) return; assert olf.RAFClosed == true; if (!olf.RAFClosed || olf.raf != null) { try { olf.raf.close(); olf.RAFClosed = true; } catch (IOException ignore) { } } //Delete the file asynchronously. Based on perf testing, deletes //can block at the filesystem level. See #50254 //It's safe to do this asynchronously, because we have already //marked this file as deleted in the init file. //Note - could we run out of disk space because the compaction thread is //doing this and creating files? For a real fix, you probably need a bounded //queue getParent().executeDelayedExpensiveWrite(new Runnable() { public void run() { if (!olf.f.delete() && olf.f.exists()) { logger.warning(LocalizedStrings.Oplog_DELETE_FAIL_0_1_2, new Object[] {Oplog.this.toString(), getFileType(olf), getParent().getName()}); } else if (logger.infoEnabled()) { logger.info(LocalizedStrings.Oplog_DELETE_0_1_2, new Object[] {Oplog.this.toString(), getFileType(olf), getParent().getName()}); // logger.info(LocalizedStrings.DEBUG, "DEBUG deleteFile " + olf.f, // new RuntimeException("STACK")); } } }); } } /** * Helper function for the test * * @return FileChannel object representing the Oplog */ FileChannel getFileChannel() { return this.crf.channel; } DirectoryHolder getDirectoryHolder() { return this.dirHolder; } /** * The current size of Oplog. It may be less than the actual Oplog file size ( * in case of asynch writing as it also takes into account data present in * asynch buffers which will get flushed in course of time o * * @return long value indicating the current size of the oplog. */ long getOplogSize() { // logger.info(LocalizedStrings.DEBUG, "getOplogSize crfSize=" + // this.crf.currSize + " drfSize=" + this.drf.currSize); return this.crf.currSize + this.drf.currSize; } boolean isOplogEmpty() { return this.crf.currSize <= OPLOG_DISK_STORE_REC_SIZE && this.drf.currSize <= OPLOG_DISK_STORE_REC_SIZE; } void incLiveCount() { this.totalLiveCount.incrementAndGet(); } private void decLiveCount() { this.totalLiveCount.decrementAndGet(); } /** * Return true if a record (crf or drf) has been added to this oplog */ boolean hasBeenUsed() { return this.hasDeletes.get() || this.totalCount.get() > 0; } void incTotalCount() { if (!isPhase2()) { this.totalCount.incrementAndGet(); } } private void finishedAppending() { synchronized (this.lock/*crf*/) { this.doneAppending = true; } handleNoLiveValues(); // I'm deadcoding the following because it is not safe unless we change to // always recover values. If we don't recover values then // an oplog we recovered from may still need to fault values in from memory. // if (!getParent().isOverflowEnabled()) { // // If !overflow then we can close the file even // // when it has recent values because // // we will never need to fault values in from this // // file since they are all in memory. // close(); // } } boolean needsCompaction() { // logger.info(LocalizedStrings.DEBUG, // "DEBUG isCompactionPossible=" + isCompactionPossible()); if (!isCompactionPossible()) return false; // logger.info(LocalizedStrings.DEBUG, // "DEBUG unrecoveredRegionCount=" + this.unrecoveredRegionCount.get()); if (this.unrecoveredRegionCount.get() > 0) return false; // logger.info(LocalizedStrings.DEBUG, // "DEBUG compactionThreshold=" + parent.getCompactionThreshold()); if (parent.getCompactionThreshold() == 100) return true; if (parent.getCompactionThreshold() == 0) return false; // otherwise check if we have enough garbage to collect with a compact long rvHWMtmp = this.totalCount.get(); // logger.info(LocalizedStrings.DEBUG, "DEBUG rvHWM=" + rvHWMtmp); if (rvHWMtmp > 0) { long tlc = this.totalLiveCount.get(); if (tlc < 0) { tlc = 0; } double rv = tlc; // logger.info(LocalizedStrings.DEBUG, "DEBUG rv=" + rv); double rvHWM = rvHWMtmp; if (((rv / rvHWM) * 100) <= parent.getCompactionThreshold()) { return true; } } else { return true; } return false; } public boolean hadLiveEntries() { return this.totalCount.get() != 0; } public boolean hasNoLiveValues() { return this.totalLiveCount.get() <= 0 // if we have an unrecoveredRegion then we don't know how many liveValues we have && this.unrecoveredRegionCount.get() == 0 && !getParent().isOfflineCompacting(); } private void handleEmptyAndOldest(boolean calledByCompactor) { if (!calledByCompactor) { if (logger.infoEnabled()) { logger.info(LocalizedStrings.DEBUG, "Deleting oplog early because it is empty. It is for disk store " + getParent().getName() + " and has oplog#" + oplogId); } } destroy(); getOplogSet().destroyOldestReadyToCompact(); } private void handleEmpty(boolean calledByCompactor) { lockCompactor(); try { if (!calledByCompactor) { if (logger.infoEnabled()) { logger.info(LocalizedStrings.Oplog_CLOSING_EMPTY_OPLOG_0_1, new Object[] {getParent().getName(), toString()}); } } cancelKrf(); close(); deleteFiles(getHasDeletes()); } finally { unlockCompactor(); } } void cancelKrf() { createKrf(true); } private final static ThreadLocal isCompactorThread = new ThreadLocal(); private boolean calledByCompactorThread() { if (!this.compacting) return false; Object v = isCompactorThread.get(); return v != null && v == Boolean.TRUE; } private void handleNoLiveValues() { // logger.info(LocalizedStrings.DEBUG, "DEBUG handleNoLiveValues" // + " count=" + this.totalLiveCount.get() // + " totalCount=" + this.totalCount.get() // + " doneAppending=" + this.doneAppending // + " needsCompaction=" + needsCompaction()); if (!this.doneAppending) return; if (hasNoLiveValues()) { if (LocalRegion.ISSUE_CALLBACKS_TO_CACHE_OBSERVER) { if (calledByCompactorThread()) { // after compaction, remove the oplog from the list & destroy it CacheObserverHolder.getInstance().beforeDeletingCompactedOplog(this); } else { CacheObserverHolder.getInstance().beforeDeletingEmptyOplog(this); } } if (isOldest()) { if (calledByCompactorThread()) { // do it in this compactor thread handleEmptyAndOldest(true); } else { // schedule another thread to do it getParent().executeDiskStoreTask(new Runnable() { public void run() { handleEmptyAndOldest(false); } }); } } else { if (calledByCompactorThread()) { // do it in this compactor thread handleEmpty(true); } else { // schedule another thread to do it getParent().executeDiskStoreTask(new Runnable() { public void run() { handleEmpty(false); } }); } } } else if (needsCompaction()) { addToBeCompacted(); } } /** * Return true if this oplog is the oldest one of those ready to compact */ private boolean isOldest() { long myId = getOplogId(); return getOplogSet().isOldestExistingOplog(myId); } private boolean added = false; private synchronized void addToBeCompacted() { if (this.added) return; this.added = true; getOplogSet().addToBeCompacted(this); if (this.logger.fineEnabled()) { this.logger.fine("Oplog::switchOpLog: Added the Oplog = " + this.oplogId + " for compacting. "); } } private DiskEntry initRecoveredEntry(DiskRegionView drv, DiskEntry de) { addLive(drv, de); return de; } /** * The oplogId in re points to the oldOplogId. * "this" oplog is the current oplog. */ private void updateRecoveredEntry(DiskRegionView drv, DiskEntry de, DiskEntry.RecoveredEntry re) { if (getOplogId() != re.getOplogId()) { Oplog oldOplog = getOplogSet().getChild(re.getOplogId()); oldOplog.rmLive(drv, de); initRecoveredEntry(drv, de); } else { getDRI(drv).update(de); } } public void prepareForCompact() { this.compacting = true; } private final Lock compactorLock = new ReentrantLock(); public void lockCompactor() { this.compactorLock.lock(); } public void unlockCompactor() { this.compactorLock.unlock(); } /** * Copy any live entries last stored in this oplog to the current oplog. * No need to copy deletes in the drf. * Backup only needs them until all the older crfs are empty. */ public int compact(OplogCompactor compactor) { if (!needsCompaction()) { return 0; // @todo check new logic that deals with not compacting oplogs which have unrecovered regions } isCompactorThread.set(Boolean.TRUE); assert calledByCompactorThread(); getParent().acquireCompactorReadLock(); try { if (!compactor.keepCompactorRunning()) { return 0; } lockCompactor(); try { if (hasNoLiveValues()) { // logger.info(LocalizedStrings.DEBUG, "DEBUG oplog#" + getOplogId() // + " hasNoLiveValues()=" + hasNoLiveValues()); handleNoLiveValues(); return 0; // do this while holding compactorLock } //logger.info(LocalizedStrings.DEBUG, "DEBUG compacting " + this, new RuntimeException("STACK")); //Asif:Start with a fresh wrapper on every compaction so that //if previous run used some high memory byte array which was // exceptional, it gets garbage collected. long opStart = getStats().getStatTime(); BytesAndBitsForCompactor wrapper = new BytesAndBitsForCompactor(); DiskEntry de; DiskEntry lastDe = null; boolean compactFailed = /*getParent().getOwner().isDestroyed || */ !compactor.keepCompactorRunning(); int totalCount = 0; final ByteArrayDataInput in = new ByteArrayDataInput(); final HeapDataOutputStream hdos = new HeapDataOutputStream( Version.CURRENT); for (DiskRegionInfo dri: this.regionMap.values()) { final DiskRegionView dr = dri.getDiskRegion(); if (dr == null) continue; boolean didCompact = false; while ((de = dri.getNextLiveEntry()) != null) { //logger.info(LocalizedStrings.DEBUG, "DEBUG compact de=" + de); if (/*getParent().getOwner().isDestroyed ||*/ !compactor.keepCompactorRunning()) { compactFailed = true; break; } if (lastDe != null) { if (lastDe == de) { // logger.info(LocalizedStrings.DEBUG, "DEBUG duplicate entry " + de // + " didCompact=" + didCompact // + " le=" + this.liveEntries // + " leNext=" + this.liveEntries.getNext() // + " dePrev=" + de.getPrev() // + " deNext=" + de.getNext()); throw new IllegalStateException("compactor would have gone into infinite loop"); } assert lastDe != de; } lastDe = de; didCompact = false; synchronized (de) { // fix for bug 41797 DiskId did = de.getDiskId(); assert did != null; synchronized (did) { long oplogId = did.getOplogId(); if (oplogId != getOplogId()) { // logger.info(LocalizedStrings.DEBUG, "DEBUG compact skipping#2 entry " + de + " because it is in oplog#" + oplogId + " instead of oplog#" + getOplogId()); continue; } boolean toCompact = getBytesAndBitsForCompaction(dr, de, wrapper); if (toCompact) { byte[] valueBytes = wrapper.getBytes(); int length = wrapper.getValidLength(); byte userBits = wrapper.getBits(); // TODO: compaction needs to get version? if (oplogId != did.getOplogId()) { // @todo: Is this even possible? Perhaps I should just assert here // skip this guy his oplogId changed if (!wrapper.isReusable()) { wrapper = new BytesAndBitsForCompactor(); } // logger.info(LocalizedStrings.DEBUG, "DEBUG compact skipping#3 entry because it is in oplog#" + oplogId + " instead of oplog#" + getOplogId()); continue; } Version version = wrapper.getVersion(); if (version != null && !Version.CURRENT.equals(version)) { if (logger.finerEnabled()) { logger.finer("Oplog " + this + " Converting " + valueBytes.length + " bytes from version " + version + " to version " + Version.CURRENT); } final StaticSystemCallbacks sysCb = GemFireCacheImpl .getInternalProductCallbacks(); if (sysCb != null) { byte[] newValueBytes = sysCb.fromVersionToBytes( valueBytes, length, EntryBits.isSerialized(userBits), version, in, hdos); if (valueBytes != newValueBytes) { valueBytes = newValueBytes; length = newValueBytes.length; } } } // write it to the current oplog getOplogSet().getChild() .copyForwardModifyForCompact(dr, de, valueBytes, length, userBits); // the did's oplogId will now be set to the current active oplog didCompact = true; } } // did } // de if (didCompact) { totalCount++; getStats().endCompactionUpdate(opStart); opStart = getStats().getStatTime(); //Asif: Check if the value byte array happens to be any of the constant //static byte arrays or references the value byte array of underlying RegionEntry. // If so for preventing data corruption across regions //( in case of static byte arrays) & for RegionEntry, //recreate the wrapper if (!wrapper.isReusable()) { wrapper = new BytesAndBitsForCompactor(); } } } } if (!compactFailed) { // logger.info(LocalizedStrings.DEBUG, "DEBUG totalCount="+totalCount); if (totalCount == 0) { // Need to still remove the oplog even if it had nothing to compact. handleNoLiveValues(); } // We can't assert hasNoLiveValues() because a race condition exists // in which our liveEntries list is empty but the liveCount has not // yet been decremented. } return totalCount; } finally { unlockCompactor(); } } finally { getParent().releaseCompactorReadLock(); assert calledByCompactorThread(); isCompactorThread.remove(); } } public static boolean isCRFFile(String filename) { return filename.endsWith(Oplog.CRF_FILE_EXT); } public static boolean isDRFFile(String filename) { return filename.endsWith(Oplog.DRF_FILE_EXT); } public static boolean isIRFFile(String filename) { return Oplog.IDX_PATTERN.matcher(filename).matches(); } public static String getKRFFilenameFromCRFFilename(String crfFilename) { return crfFilename.substring(0, crfFilename.length() - Oplog.CRF_FILE_EXT.length()) + Oplog.KRF_FILE_EXT; } long testGetOplogFileLength() throws IOException { long result = 0; if (this.crf.raf != null) { result += this.crf.raf.length(); } if (this.drf.raf != null) { result += this.drf.raf.length(); } return result; } /** * This method is called by the async value recovery * task to recover the values from the crf if the * keys were recovered from the krf. * If the defer regions argument is non-null, then disk regions that are marked to be * deferred ({@link DiskRegionFlag#DEFER_RECOVERY}) are skipped from recovery * and those regions are filled in the passed map as the result. * @param diskRecoveryStores */ public void recoverValuesIfNeeded( Map diskRecoveryStores, Map deferredRegions, Object sync) { //Early out if we start closing the parent. if (getParent().isClosing() || diskRecoveryStores.isEmpty() || this.regionMap.isEmpty()) { return; } List sortedLiveEntries; HashMap targetRegions = new HashMap(this.regionMap); synchronized (sync) { //Don't bother to include any stores that have reached the lru limit Iterator itr = diskRecoveryStores.values().iterator(); while(itr.hasNext()) { DiskRecoveryStore store = itr.next(); if(store.lruLimitExceeded()) { itr.remove(); } } // Get the sorted list of live entries from the target regions Iterator targetItr = targetRegions.keySet().iterator(); while (targetItr.hasNext()) { Long diskRegionId = targetItr.next(); DiskRecoveryStore drs = diskRecoveryStores.get(diskRegionId); if (drs == null) { if (this.logger.fineEnabled()) { this.logger.fine("Oplog::recoverValuesIfNeeded: skipping region " + "with null disk info for id=" + diskRegionId); } targetItr.remove(); } /* else if (deferredRegions != null && drs.getDiskRegionView().getFlags() .contains(DiskRegionFlag.DEFER_RECOVERY)) { if (this.logger.fineEnabled()) { this.logger.fine("Oplog::recoverValuesIfNeeded: skipping region " + "deferred for value recovery: " + drs); } targetItr.remove(); deferredRegions.put(diskRegionId, drs); } */ else { if (this.logger.fineEnabled()) { this.logger.fine("Oplog::recoverValuesIfNeeded: will try " + "to recover for region: " + drs); } } } } if (targetRegions.isEmpty()) { // no regions to recover return; } sortedLiveEntries = getSortedLiveEntries(targetRegions.values()); if(sortedLiveEntries == null) { //There are no live entries in this oplog to recover. return; } if (this.logger.infoEnabled()) { this.logger.info(LocalizedStrings.ONE_ARG, "Oplog::recoverValuesIfNeeded: recovering values from " + toString()); } final ByteArrayDataInput in = new ByteArrayDataInput(); for(KRFEntry entry : sortedLiveEntries) { //Early out if we start closing the parent. if(getParent().isClosing() || diskRecoveryStores.isEmpty()) { return; } DiskEntry diskEntry = entry.getDiskEntry(); DiskRegionView diskRegionView = entry.getDiskRegionView(); Long diskRegionId = diskRegionView.getId(); //TODO DAN ok, here's what we need to do // 1) lock and obtain the correct RegionEntry that we are recovering too. // this will likely mean obtaining the correct DiskRecoveryStore, since with // that we can find the region entry I believe. // 2) Make sure that the lru limit is not exceeded // 3) Update the region entry with the value from disk, assuming the value from // disk is still valid. That is going to be something like synchronized (sync) { DiskRecoveryStore diskRecoveryStore = diskRecoveryStores.get(diskRegionId); if(diskRecoveryStore == null) { continue; } if(diskRecoveryStore.lruLimitExceeded()) { diskRecoveryStores.remove(diskRegionId); continue; } synchronized(diskEntry) { //Make sure the entry hasn't been modified if(diskEntry.getDiskId() != null && diskEntry.getDiskId().getOplogId() == oplogId) { //dear lord, this goes through a lot of layers. Maybe we should skip some? //* specifically, this could end up faulting in from a different oplog, causing // us to seek. //* Also, there may be lock ordering issues here, Really, I guess I want // a flavor of faultInValue that only faults in from this oplog. //* We could have some churn here, opening and closing this oplog //* We also might not be buffering adjacent entries? Not sure about that one //* Ideally, this would fault the thing in only if it were in this oplog and the lru limit wasn't hit // and it would return a status if the lru limit was hit to make us remove the store. try { DiskEntry.Helper.recoverValue(diskEntry, getOplogId(), diskRecoveryStore, in); } catch(RegionDestroyedException e) { //This region has been destroyed, stop recovering from it. diskRecoveryStores.remove(diskRegionId); } } } } } } public static String getParentRegionID(DiskRegionView drv) { String rpath; if (drv.isBucket()) { String bn = PartitionedRegionHelper.getBucketName(drv.getName()); rpath = PartitionedRegionHelper.getPRPath(bn); } else { rpath = drv.getName(); } rpath = LocalRegion.getIDFromPath(rpath, drv.getUUID()); return (rpath.charAt(0) == '/') ? rpath : ("/" + rpath); } /** * Recover given indexes from the oplog rather than the oplog index files. For * latter, use {@link OplogIndex#recoverIndexes}. */ public long recoverIndexes( Map indexes) { // Early out if we start closing the parent. if (getParent().isClosing() || indexes.isEmpty() || this.regionMap.isEmpty()) { return 0; } final LogWriterI18n logger = this.logger; final boolean logEnabled = DiskStoreImpl.INDEX_LOAD_DEBUG || logger.fineEnabled(); // store the affected indexes and the parent region against each disk region final HashMap indexRecoveryMap = new HashMap(this.regionMap.size()); ArrayList targetRegions = new ArrayList( this.regionMap.size()); this.idxkrf.getDiskIdToIndexDataMap(null, indexes, 0, indexRecoveryMap, targetRegions); if (targetRegions.isEmpty()) { // no index affected return 0; } List sortedLiveEntries = getSortedLiveEntries(targetRegions); if (sortedLiveEntries == null) { // There are no live entries in this oplog to recover. return 0; } if (logger.infoEnabled()) { logger.info(LocalizedStrings.ONE_ARG, "Oplog#recoverIndexes: recovering values from " + toString()); } if (logEnabled) { logger.info(LocalizedStrings.DEBUG, "Oplog#recoverIndexes: recovering for oplog=" + toString() + ", indexes=" + indexes + ", targetRegions=" + targetRegions); } long numRecovered = 0; for (KRFEntry entry : sortedLiveEntries) { // Early out if we start closing the parent. if (getParent().isClosing() || indexRecoveryMap.isEmpty()) { return numRecovered; } DiskEntry diskEntry = entry.getDiskEntry(); DiskRegionView diskRegionView = entry.getDiskRegionView(); Long diskRegionId = diskRegionView.getId(); final IndexData[] affectedIndexes = indexRecoveryMap.get(diskRegionId); if (affectedIndexes == null) { continue; } final LocalRegion baseRegion = affectedIndexes[0].index.getBaseRegion(); synchronized (diskEntry) { // Make sure the entry hasn't been modified final DiskId diskId = diskEntry.getDiskId(); if (diskId != null && diskId.getOplogId() == oplogId) { @Released Object val = null; try { val = DiskEntry.Helper.getValueOffHeapOrDiskWithoutFaultIn(diskEntry, diskRegionView, baseRegion); if (val != null && !Token.isInvalidOrRemoved(val)) { for (IndexData indexData : affectedIndexes) { SortedIndexKey indexKey = indexData.index.getIndexKey(val, diskEntry); indexData.indexJob.addJob(indexKey, diskEntry); } numRecovered++; } } catch (RegionDestroyedException rde) { // This region has been destroyed, stop recovering from it. indexRecoveryMap.remove(diskRegionId); } finally { OffHeapHelper.release(val); } } } } if (logEnabled || DiskStoreImpl.INDEX_LOAD_PERF_DEBUG) { logger.info(LocalizedStrings.DEBUG, "Oplog#recoverIndexes: " + "Processed oplog=" + toString() + " for indexes: " + indexes); } return numRecovered; } private byte[] serializeRVVs(Map drMap, boolean gcRVV) throws IOException { HeapDataOutputStream out = new HeapDataOutputStream(Version.CURRENT); //Write the size first InternalDataSerializer.writeUnsignedVL(drMap.size(), out); //Now write regions RVV. for(Map.Entry regionEntry: drMap.entrySet()) { //For each region, write the RVV for the region. Long diskRegionID = regionEntry.getKey(); AbstractDiskRegion dr = regionEntry.getValue(); RegionVersionVector rvv = dr.getRegionVersionVector(); if (rvv == null) { continue; } if (DiskStoreImpl.TRACE_WRITES) { this.logger.info(LocalizedStrings.DEBUG, "serializeRVVs: isGCRVV="+gcRVV+" drId="+diskRegionID +" rvv="+rvv.fullToString()+" oplog#" + getOplogId()); } //Write the disk region id InternalDataSerializer.writeUnsignedVL(diskRegionID, out); if(gcRVV) { //For the GC RVV, we will just write the GC versions Map memberToVersion = rvv.getMemberToGCVersion(); InternalDataSerializer.writeUnsignedVL(memberToVersion.size(), out); for(Entry memberEntry : memberToVersion.entrySet()) { //For each member, write the canonicalized member id, //and the version number for that member VersionSource member = memberEntry.getKey(); Long gcVersion = memberEntry.getValue(); int id = getParent().getDiskInitFile().getOrCreateCanonicalId(member); InternalDataSerializer.writeUnsignedVL(id, out); InternalDataSerializer.writeUnsignedVL(gcVersion, out); } } else { InternalDataSerializer.writeBoolean(dr.getRVVTrusted(), out); //Otherwise, we will write the version and exception list for each member Map memberToVersion = rvv.getMemberToVersion(); InternalDataSerializer.writeUnsignedVL(memberToVersion.size(), out); for(Map.Entry memberEntry : memberToVersion.entrySet()) { //For each member, right the canonicalized member id, //and the version number with exceptions for that member VersionSource member = memberEntry.getKey(); RegionVersionHolder versionHolder = memberEntry.getValue(); int id = getParent().getDiskInitFile().getOrCreateCanonicalId(member); InternalDataSerializer.writeUnsignedVL(id, out); synchronized(versionHolder) { InternalDataSerializer.invokeToData(versionHolder, out); } } } } byte[] rvvBytes = out.toByteArray(); return rvvBytes; } // // Comparable code // // public int compareTo(Oplog o) { // return getOplogId() - o.getOplogId(); // } // public boolean equals(Object o) { // if (o instanceof Oplog) { // return compareTo((Oplog)o) == 0; // } else { // return false; // } // } // public int hashCode() { // return getOplogId(); // } @Override public String toString() { return "oplog#" + getOplogId() /* + "DEBUG" + System.identityHashCode(this) */; // return this.parent.getName() + "#oplog#" + getOplogId() /* + "DEBUG" + System.identityHashCode(this) */; } // //////// Methods used during recovery ////////////// // ////////////////////Inner Classes ////////////////////// private static class OplogFile { public File f; public RandomAccessFile raf; public volatile boolean RAFClosed = true; public FileChannel channel; public ByteBuffer writeBuf; public long currSize; public long bytesFlushed; public boolean unpreblown; } private static class KRFile { public File f; FileOutputStream fos; BufferedOutputStream bos; DataOutputStream dos; long lastOffset = 0; int keyNum = 0; } private static String baToString(byte[] ba) { return baToString(ba, ba != null ? ba.length : 0); } private static String baToString(byte[] ba, int len) { if ( ba == null) return "null"; StringBuilder sb = new StringBuilder(); for (int i=0; i < len; i++) { sb.append(ba[i]).append(", "); } return sb.toString(); } void serializeVersionTag(VersionHolder tag, DataOutput out) throws IOException { int entryVersion = tag.getEntryVersion(); long regionVersion = tag.getRegionVersion(); VersionSource versionMember = tag.getMemberID(); long timestamp = tag.getVersionTimeStamp(); int dsId = tag.getDistributedSystemId(); serializeVersionTag(entryVersion, regionVersion, versionMember, timestamp, dsId, out); } byte[] serializeVersionTag(VersionTag tag) throws IOException { int entryVersion = tag.getEntryVersion(); long regionVersion = tag.getRegionVersion(); VersionSource versionMember = tag.getMemberID(); long timestamp = tag.getVersionTimeStamp(); int dsId = tag.getDistributedSystemId(); return serializeVersionTag(entryVersion, regionVersion, versionMember, timestamp, dsId); } byte[] serializeVersionTag(VersionStamp stamp) throws IOException { int entryVersion = stamp.getEntryVersion(); long regionVersion = stamp.getRegionVersion(); VersionSource versionMember = stamp.getMemberID(); long timestamp = stamp.getVersionTimeStamp(); int dsId = stamp.getDistributedSystemId(); return serializeVersionTag(entryVersion, regionVersion, versionMember, timestamp, dsId); } private byte[] serializeVersionTag(int entryVersion, long regionVersion, VersionSource versionMember, long timestamp, int dsId) throws IOException { HeapDataOutputStream out = new HeapDataOutputStream(4 + 8 + 4 + 8 + 4, Version.CURRENT); serializeVersionTag(entryVersion, regionVersion, versionMember, timestamp, dsId, out); byte[] versionsBytes = out.toByteArray(); return versionsBytes; } private void serializeVersionTag(int entryVersion, long regionVersion, VersionSource versionMember, long timestamp, int dsId, DataOutput out) throws IOException { int memberId = getParent().getDiskInitFile().getOrCreateCanonicalId(versionMember); InternalDataSerializer.writeSignedVL(entryVersion, out); InternalDataSerializer.writeUnsignedVL(regionVersion, out); InternalDataSerializer.writeUnsignedVL(memberId, out); InternalDataSerializer.writeUnsignedVL(timestamp, out); InternalDataSerializer.writeSignedVL(dsId, out); } /** * Holds all the state for the current operation. * Since an oplog can only have one operation in progress at any given * time we only need a single instance of this class per oplog. */ private class OpState { private byte opCode; private byte userBits; private boolean notToUseUserBits; // currently only DestroyFromDisk will not use userBits /** * How many bytes it will be when serialized */ private int size; private boolean needsValue; private byte[] value; private int valueLength; private int drIdLength; // 1..9 private final byte[] drIdBytes = new byte[DiskInitFile.DR_ID_MAX_BYTES]; private byte[] keyBytes; private final byte[] deltaIdBytes = new byte[8]; private int deltaIdBytesLength; private long newEntryBase; private DiskStoreID diskStoreId; private byte[] versionsBytes; private long lastModifiedTime; private int lmtBytes; private short gfversion; // private int entryVersion; // private long regionVersion; // private int memberId; // canonicalId of memberID public final int getSize() { return this.size; } public String debugStr() { StringBuilder sb = new StringBuilder(); sb.append(" opcode=").append(this.opCode) .append(" len=").append(this.valueLength) .append(" vb=").append(baToString(this.value, this.valueLength)); return sb.toString(); } private final void write(OplogFile olf, byte[] bytes, int byteLength) throws IOException { int offset = 0; final int maxOffset = byteLength; ByteBuffer bb = olf.writeBuf; while (offset < maxOffset) { int bytesThisTime = maxOffset - offset; boolean needsFlush = false; if (bytesThisTime > bb.remaining()) { needsFlush = true; bytesThisTime = bb.remaining(); } // logger.info(LocalizedStrings.DEBUG, // "DEBUG offset=" + offset // + " maxOffset=" + maxOffset // + " bytesThisTime=" + bytesThisTime // + " needsFlush=" + needsFlush // + " bb.remaining()=" + bb.remaining()); bb.put(bytes, offset, bytesThisTime); offset += bytesThisTime; if (needsFlush) { flushNoSync(olf); } } } private final void writeByte(OplogFile olf, byte v) throws IOException { ByteBuffer bb = olf.writeBuf; if (1 > bb.remaining()) { flushNoSync(olf); } bb.put(v); } private final void writeOrdinal(OplogFile olf, short ordinal) throws IOException { ByteBuffer bb = olf.writeBuf; if (3 > bb.remaining()) { flushNoSync(olf); } // don't compress since we setup fixed size of buffers Version.writeOrdinal(bb, ordinal, false); } private final void writeInt(OplogFile olf, int v) throws IOException { ByteBuffer bb = olf.writeBuf; if (4 > bb.remaining()) { flushNoSync(olf); } bb.putInt(v); } private final void writeLong(OplogFile olf, long v) throws IOException { ByteBuffer bb = olf.writeBuf; if (8 > bb.remaining()) { flushNoSync(olf); } bb.putLong(v); } private final void writeUnsignedVL(OplogFile olf, long v, int numBytes) throws IOException { ByteBuffer bb = olf.writeBuf; if (numBytes > bb.remaining()) { flushNoSync(olf); } InternalDataSerializer.writeUnsignedVL(v, bb); } public void initialize(long newEntryBase) { this.opCode = OPLOG_NEW_ENTRY_BASE_ID; this.newEntryBase = newEntryBase; this.size = OPLOG_NEW_ENTRY_BASE_REC_SIZE; } public void initialize(short gfversion) { this.opCode = OPLOG_GEMFIRE_VERSION; this.gfversion = gfversion; this.size = OPLOG_GEMFIRE_VERSION_REC_SIZE; } public void initialize(DiskStoreID diskStoreId) { this.opCode = OPLOG_DISK_STORE_ID; this.diskStoreId = diskStoreId; this.size = OPLOG_DISK_STORE_REC_SIZE; } public void initialize(Map drMap, boolean gcRVV) throws IOException { this.opCode = OPLOG_RVV; byte[] rvvBytes = serializeRVVs(drMap, gcRVV); this.value = rvvBytes; //Size is opCode + length + end of record this.valueLength = rvvBytes.length; this.size = 1 + rvvBytes.length + 1; } public void initialize(long oplogKeyId, byte[] keyBytes, byte[] valueBytes, byte userBits, long drId, VersionTag tag, long lastModifiedTime, boolean notToUseUserBits) throws IOException { this.opCode = OPLOG_MOD_ENTRY_WITH_KEY_1ID; this.size = 1;// for the opcode saveUserBits(notToUseUserBits, userBits); this.keyBytes = keyBytes; this.value = valueBytes; this.valueLength = this.value.length; if (this.userBits == 1 && this.valueLength == 0) { throw new IllegalStateException("userBits==1 and valueLength is 0"); } this.needsValue = EntryBits.isNeedsValue(this.userBits); this.size += (4 + this.keyBytes.length); saveDrId(drId); initVersionsBytes(tag, lastModifiedTime); if (this.needsValue) { this.size += 4 + this.valueLength; } this.deltaIdBytesLength = 0; { long delta = calcDelta(oplogKeyId, this.opCode); this.deltaIdBytesLength = bytesNeeded(delta); this.size += this.deltaIdBytesLength; this.opCode += this.deltaIdBytesLength - 1; for (int i=this.deltaIdBytesLength-1; i >= 0; i--) { this.deltaIdBytes[i] = (byte)(delta & 0xFF); delta >>= 8; } } this.size++; // for END_OF_RECORD_ID } private void initVersionsBytes(VersionTag tag, long lastModifiedTime) throws IOException { if (EntryBits.isWithVersions(this.userBits)) { this.versionsBytes = serializeVersionTag(tag); this.size += this.versionsBytes.length; } else { // persist last modified time for no-versions case this.userBits = EntryBits.setHasLastModifiedTime(this.userBits); initLastModifiedTime(lastModifiedTime); } } private void initVersionsBytes(DiskEntry entry) throws IOException { // persist entry version, region version and memberId // The versions in entry are initialized to 0. So we will not persist the 3 // types of data if region version is 0. // TODO: This method will be called 2 times, one for persisting into crf // another for persisting into krf, since we did not save the byte arrary // for the verstion tag. if (EntryBits.isWithVersions(this.userBits)) { VersionStamp stamp = entry.getVersionStamp(); assert (stamp != null); this.versionsBytes = serializeVersionTag(stamp); this.size += this.versionsBytes.length; } else { // persist last modified time for no-versions case this.userBits = EntryBits.setHasLastModifiedTime(this.userBits); initLastModifiedTime(entry.getLastModified()); } } private void initLastModifiedTime(long lastModifiedTime) { if (lastModifiedTime == 0) { lastModifiedTime = getParent().getCache().cacheTimeMillis(); } this.lastModifiedTime = lastModifiedTime; this.lmtBytes = InternalDataSerializer .getUnsignedVLSize(lastModifiedTime); this.size += this.lmtBytes; } public void initialize(byte opCode, DiskRegionView dr, DiskEntry entry, byte[] value, int valueLength, byte userBits, boolean notToUseUserBits) throws IOException { this.opCode = opCode; this.size = 1;// for the opcode saveUserBits(notToUseUserBits, userBits); this.value = value; this.valueLength = valueLength; if (this.userBits == 1 && this.valueLength == 0) { throw new IllegalStateException("userBits==1 and valueLength is 0"); } boolean needsKey = false; if (this.opCode == OPLOG_MOD_ENTRY_1ID) { if (modNeedsKey(entry)) { needsKey = true; this.opCode = OPLOG_MOD_ENTRY_WITH_KEY_1ID; } this.needsValue = EntryBits.isNeedsValue(this.userBits); initVersionsBytes(entry); } else if (this.opCode == OPLOG_NEW_ENTRY_0ID) { needsKey = true; this.needsValue = EntryBits.isNeedsValue(this.userBits); initVersionsBytes(entry); } else if (this.opCode == OPLOG_DEL_ENTRY_1ID) { needsKey = false; this.needsValue = false; } if (needsKey) { Object key = entry.getKeyCopy(); this.keyBytes = EntryEventImpl.serialize(key); this.size += (4 + this.keyBytes.length); } else { this.keyBytes = null; } if (this.opCode == OPLOG_DEL_ENTRY_1ID) { this.drIdLength = 0; } else { long drId = dr.getId(); saveDrId(drId); } if (this.needsValue) { this.size += 4 + this.valueLength; } this.deltaIdBytesLength = 0; if (this.opCode != OPLOG_NEW_ENTRY_0ID) { // if (this.opCode == OPLOG_DEL_ENTRY_1ID) { // this.newEntryBase = writeDelEntryId/*abs(entry.getDiskId().getKeyId())*/; this.size += 8; // HACK DEBUG // } else { // this.newEntryBase = writeModEntryId/*abs(entry.getDiskId().getKeyId())*/; this.size += 8; // HACK DEBUG // } long keyId = entry.getDiskId().getKeyId(); if(keyId == 0) { Assert.fail("Attempting to write an entry with keyId=0 to oplog. Entry key=" + entry.getKey() + " diskId=" + entry.getDiskId() + " region=" + dr); } long delta = calcDelta(abs(keyId), this.opCode); this.deltaIdBytesLength = bytesNeeded(delta); this.size += this.deltaIdBytesLength; this.opCode += this.deltaIdBytesLength - 1; for (int i=this.deltaIdBytesLength-1; i >= 0; i--) { this.deltaIdBytes[i] = (byte)(delta & 0xFF); delta >>= 8; } } this.size++; // for END_OF_RECORD_ID } private void saveUserBits(boolean notToUseUserBits, byte userBits) { this.notToUseUserBits = notToUseUserBits; if (notToUseUserBits) { this.userBits = 0; } else { this.userBits = EntryBits.getPersistentBits(userBits); this.size++; // for the userBits } } private void saveDrId(long drId) { // If the drId is <= 255 (max unsigned byte) then // encode it as a single byte. // Otherwise write a byte whose value is the number of bytes // it will be encoded by and then follow it with that many bytes. // Note that drId are not allowed to have a value in the range 1..8 inclusive. if (drId >= 0 && drId <= 255) { this.drIdLength = 1; this.drIdBytes[0] = (byte)drId; } else { byte bytesNeeded = (byte)Oplog.bytesNeeded(drId); this.drIdLength = bytesNeeded+1; this.drIdBytes[0] = bytesNeeded; for (int i=bytesNeeded; i >=1; i--) { this.drIdBytes[i] = (byte)(drId & 0xFF); drId >>=8; } } this.size += this.drIdLength; } public void initialize(byte opCode, long drId, VersionTag tag) throws IOException { this.opCode = opCode; assert this.opCode == OPLOG_CONFLICT_VERSION; this.size = 1;// for the opcode saveDrId(drId); this.versionsBytes = serializeVersionTag(tag); this.size += this.versionsBytes.length; this.size++; // for END_OF_RECORD_ID } /** * Returns the offset to the first byte of the value bytes. */ public int getValueOffset() { if (!this.needsValue) return 0; int result = this.deltaIdBytesLength // + 8 /* HACK DEBUG */ + this.drIdLength + 1/* opcode */ + 4/* value length */; if (this.notToUseUserBits == false) { result++; } if (EntryBits.isWithVersions(this.userBits) && this.versionsBytes != null) { result += this.versionsBytes.length; } if (this.lastModifiedTime != 0) { result += this.lmtBytes; } return result; } public long write(OplogFile olf) throws IOException { long bytesWritten = 0; writeByte(olf, this.opCode); bytesWritten++; if (this.opCode == OPLOG_NEW_ENTRY_BASE_ID) { writeLong(olf, this.newEntryBase); bytesWritten += 8; } else if (this.opCode == OPLOG_DISK_STORE_ID) { writeLong(olf, this.diskStoreId.getLeastSignificantBits()); writeLong(olf, this.diskStoreId.getMostSignificantBits()); bytesWritten += 16; } else if (this.opCode == OPLOG_RVV) { write(olf, this.value, this.valueLength); bytesWritten+= this.valueLength; } else if (this.opCode == OPLOG_GEMFIRE_VERSION) { writeOrdinal(olf, this.gfversion); bytesWritten++; } else if (this.opCode == OPLOG_CONFLICT_VERSION) { if (this.drIdLength > 0) { write(olf, this.drIdBytes, this.drIdLength); bytesWritten += this.drIdLength; } assert this.versionsBytes.length > 0; write(olf, this.versionsBytes, this.versionsBytes.length); bytesWritten += this.versionsBytes.length; } else { if (this.notToUseUserBits == false) { writeByte(olf, this.userBits); bytesWritten++; } if (this.deltaIdBytesLength > 0) { write(olf, this.deltaIdBytes, this.deltaIdBytesLength); bytesWritten += this.deltaIdBytesLength; // writeLong(olf, this.newEntryBase); bytesWritten += 8; // HACK DEBUG } if (this.drIdLength > 0) { write(olf, this.drIdBytes, this.drIdLength); bytesWritten += this.drIdLength; } if (EntryBits.isWithVersions(this.userBits) && this.versionsBytes != null && this.opCode != OPLOG_DEL_ENTRY_1ID) { write(olf, this.versionsBytes, this.versionsBytes.length); bytesWritten += this.versionsBytes.length; } if (this.lastModifiedTime != 0) { writeUnsignedVL(olf, this.lastModifiedTime, this.lmtBytes); bytesWritten += this.lmtBytes; } if (this.needsValue) { writeInt(olf, this.valueLength); bytesWritten += 4; if (this.valueLength > 0) { write(olf, this.value, this.valueLength); bytesWritten += this.valueLength; } } if (this.keyBytes != null) { writeInt(olf, this.keyBytes.length); bytesWritten += 4; if (this.keyBytes.length > 0) { write(olf, this.keyBytes, this.keyBytes.length); bytesWritten += this.keyBytes.length; } } } writeByte(olf, END_OF_RECORD_ID); bytesWritten++; return bytesWritten; } /** * Free up any references to possibly large data. */ public void clear() { this.value = null; this.keyBytes = null; this.notToUseUserBits = false; this.versionsBytes = null; this.lastModifiedTime = 0; } } /** * Fake disk entry used to implement the circular linked list of entries * an oplog has. Each Oplog will have one OplogDiskEntry whose prev and next * fields point to the actual DiskEntrys currently stored in its crf. * Items are added at "next" so the most recent entry written will be at next * and the oldest item written will be at "prev". */ static class OplogDiskEntry implements DiskEntry, RegionEntry { private DiskEntry next = this; private DiskEntry prev = this; public synchronized DiskEntry getPrev() { return this.prev; } public synchronized void setPrev(DiskEntry v) { this.prev = v; } public synchronized DiskEntry getNext() { return this.next; } public synchronized void setNext(DiskEntry v) { this.next = v; } /** * returns the number of entries cleared * @param rvv * @param pendingKrfTags */ public synchronized int clear(RegionVersionVector rvv, Map pendingKrfTags) { if(rvv == null) { if(pendingKrfTags != null) { pendingKrfTags.clear(); } return clear(); } else { //Clearing the list is handled in AbstractRegionMap.clear for RVV //based clears, because it removes each entry. //It needs to be handled there because the entry is synched at that point return 0; } } /** * Clear using an RVV. Remove live entries that are contained within * the clear RVV. * @param pendingKrfTags */ private int clearWithRVV(RegionVersionVector rvv, Map pendingKrfTags) { //TODO this doesn't work, because we can end up removing entries from here before //they are removed from the region map. Reverting this to the old, leaky, behavior //until I fix the region map code. return 0; // int result = 0; // DiskEntry n = getNext(); // while (n != this) { // DiskEntry nextEntry = n.getNext(); // VersionSource member = null; // long version = -1; // if(pendingKrfTags != null) { // VersionTag tag = pendingKrfTags.get(n); // if(tag != null) { // member = tag.getMemberID(); // version = tag.getRegionVersion(); // } // } // if(member == null) { // VersionStamp stamp = n.getVersionStamp(); // member = stamp.getMemberID(); // version = stamp.getRegionVersion(); // } // // if(rvv.contains(member, version)) { // result++; // remove(n); // if(pendingKrfTags != null) { // pendingKrfTags.remove(n); // } // } // n = nextEntry; // } // return result; } /** * Clear without an RVV. Empties the entire list. */ private int clear() { int result = 0; // Need to iterate over the list and set each prev field to null // so that if remove is called it will know that the DiskEntry // has already been removed. DiskEntry n = getNext(); setNext(this); setPrev(this); while (n != this) { result++; n.setPrev(null); n = n.getNext(); } return result; } public synchronized boolean remove(DiskEntry v) { DiskEntry p = v.getPrev(); if (p != null) { v.setPrev(null); DiskEntry n = v.getNext(); v.setNext(null); n.setPrev(p); p.setNext(n); return true; } else { return false; } } public synchronized void insert(DiskEntry v) { assert v.getPrev() == null; // checkForDuplicate(v); DiskEntry n = getNext(); setNext(v); n.setPrev(v); v.setNext(n); v.setPrev(this); } public synchronized void replace(DiskEntry old, DiskEntry v) { DiskEntry p = old.getPrev(); if (p != null) { old.setPrev(null); v.setPrev(p); p.setNext(v); } DiskEntry n = old.getNext(); if (n != null) { old.setNext(null); v.setNext(n); n.setPrev(v); } if (getNext() == old) { setNext(v); } } // private synchronized void checkForDuplicate(DiskEntry v) { // DiskEntry de = getPrev(); // final long newKeyId = v.getDiskId().getKeyId(); // while (de != this) { // if (de.getDiskId().getKeyId() == newKeyId) { // throw new IllegalStateException( // "DEBUG: found duplicate for oplogKeyId=" + newKeyId + " de=" // + System.identityHashCode(v) + " ode=" // + System.identityHashCode(de) + " deKey=" + v.getKey() // + " odeKey=" + de.getKey() + " deOffset=" // + v.getDiskId().getOffsetInOplog() + " odeOffset=" // + de.getDiskId().getOffsetInOplog()); // } // de = de.getPrev(); // } // } @Override public Object getKey() {throw new IllegalStateException();} @Override public Object getKeyCopy() {throw new IllegalStateException();} @Override public Object _getValue() {throw new IllegalStateException();} @Override public Token getValueAsToken() {throw new IllegalStateException();} @Override public Object _getValueRetain(RegionEntryContext context, boolean decompress) {throw new IllegalStateException();} @Override public void setValueWithContext(RegionEntryContext context,Object value) {throw new IllegalStateException();} @Override public void handleValueOverflow(RegionEntryContext context) {throw new IllegalStateException();} @Override public void afterValueOverflow(RegionEntryContext context) {throw new IllegalStateException();} @Override public Object prepareValueForCache(RegionEntryContext r, Object val, boolean isEntryUpdate, boolean valHasMetadataForGfxdOffHeapUpdate) { throw new IllegalStateException("Should never be called"); } public void _removePhase1() {throw new IllegalStateException();} public DiskId getDiskId() {throw new IllegalStateException();} public long getLastModified() {throw new IllegalStateException();} public boolean isRecovered() {throw new IllegalStateException();} public boolean isValueNull() {throw new IllegalStateException();} public boolean isRemovedFromDisk() {throw new IllegalStateException();} public int updateAsyncEntrySize(EnableLRU capacityController) {throw new IllegalStateException();} public void _setLastModified(long lastModifiedTime) { throw new IllegalStateException(); } public void setLastModified(long lastModifiedTime) { throw new IllegalStateException(); } public boolean isLockedForCreate() {throw new IllegalStateException();} public Object getRawKey() { throw new IllegalStateException(); } public void setOwner(LocalRegion owner) { throw new IllegalStateException(); } public Object getContainerInfo() { throw new IllegalStateException(); } /** * {@inheritDoc} */ @Override public Object setContainerInfo(LocalRegion owner, Object val) { throw new IllegalStateException(); } /** * Adds any live entries in this list to liveEntries and returns the index * of the next free slot. * * @param liveEntries * the array to fill with the live entries * @param idx * the first free slot in liveEntries * @param drv * the disk region these entries are on * @param pendingKrfTags * @return the next free slot in liveEntries */ public synchronized int addLiveEntriesToList(KRFEntry[] liveEntries, int idx, DiskRegionView drv, Map pendingKrfTags) { DiskEntry de = getPrev(); while (de != this) { VersionHolder tag = null; if(pendingKrfTags != null) { tag = pendingKrfTags.get(de); } liveEntries[idx] = new KRFEntry(drv, de, tag); idx++; de = de.getPrev(); } return idx; } /* (non-Javadoc) * @see com.gemstone.gemfire.internal.cache.DiskEntry#getVersionStamp() */ @Override public VersionStamp getVersionStamp() { // dummy entry as start of live list return null; } @Override public boolean hasStats() { // TODO Auto-generated method stub return false; } @Override public long getLastAccessed() throws InternalStatisticsDisabledException { // TODO Auto-generated method stub return 0; } @Override public long getHitCount() throws InternalStatisticsDisabledException { // TODO Auto-generated method stub return 0; } @Override public long getMissCount() throws InternalStatisticsDisabledException { // TODO Auto-generated method stub return 0; } @Override public void updateStatsForPut(long lastModifiedTime) { // TODO Auto-generated method stub } @Override public VersionTag generateVersionTag(VersionSource member, boolean isRemoteVersionSource, boolean withDelta, LocalRegion region, EntryEventImpl event) { // TODO Auto-generated method stub return null; } @Override public boolean dispatchListenerEvents(EntryEventImpl event) throws InterruptedException { // TODO Auto-generated method stub return false; } @Override public void setRecentlyUsed() { // TODO Auto-generated method stub } @Override public void updateStatsForGet(boolean hit, long time) { // TODO Auto-generated method stub } @Override public void txDidDestroy(long currTime) { // TODO Auto-generated method stub } @Override public void resetCounts() throws InternalStatisticsDisabledException { // TODO Auto-generated method stub } @Override public void makeTombstone(LocalRegion r, VersionTag version) throws RegionClearedException { // TODO Auto-generated method stub } @Override public void removePhase1(LocalRegion r, boolean clear) throws RegionClearedException { // TODO Auto-generated method stub } @Override public void removePhase2() { // TODO Auto-generated method stub } @Override public boolean isRemoved() { // TODO Auto-generated method stub return false; } @Override public boolean isRemovedPhase2() { // TODO Auto-generated method stub return false; } @Override public boolean isTombstone() { // TODO Auto-generated method stub return false; } @Override public boolean fillInValue(LocalRegion r, com.gemstone.gemfire.internal.cache.InitialImageOperation.Entry entry, ByteArrayDataInput in, DM mgr, Version targetVersion) { // TODO Auto-generated method stub return false; } @Override public boolean isOverflowedToDisk(LocalRegion r, DiskPosition dp) { // TODO Auto-generated method stub return false; } @Override public Object getValue(RegionEntryContext context) { // TODO Auto-generated method stub return null; } @Override public void setValue(RegionEntryContext context, Object value) throws RegionClearedException { // TODO Auto-generated method stub } @Override public void setValueWithTombstoneCheck(Object value, EntryEvent event) throws RegionClearedException { // TODO Auto-generated method stub } @Override public Object getTransformedValue() { // TODO Auto-generated method stub return null; } @Override public Object getValueInVM(RegionEntryContext context) { // TODO Auto-generated method stub return null; } @Override public Object getValueOnDisk(LocalRegion r) throws EntryNotFoundException { // TODO Auto-generated method stub return null; } @Override public Object getValueOnDiskOrBuffer(LocalRegion r) throws EntryNotFoundException { // TODO Auto-generated method stub return null; } @Override public boolean initialImagePut(LocalRegion region, long lastModified, Object newValue, boolean wasRecovered, boolean acceptedVersionTag) throws RegionClearedException { // TODO Auto-generated method stub return false; } @Override public boolean initialImageInit(LocalRegion region, long lastModified, Object newValue, boolean create, boolean wasRecovered, boolean acceptedVersionTag) throws RegionClearedException { // TODO Auto-generated method stub return false; } @Override public boolean destroy(LocalRegion region, EntryEventImpl event, boolean inTokenMode, boolean cacheWrite, Object expectedOldValue, boolean forceDestroy, boolean removeRecoveredEntry) throws CacheWriterException, EntryNotFoundException, TimeoutException, RegionClearedException { // TODO Auto-generated method stub return false; } @Override public Object getSerializedValueOnDisk(LocalRegion localRegion) { // TODO Auto-generated method stub return null; } @Override public Object getValueInVMOrDiskWithoutFaultIn(LocalRegion owner) { // TODO Auto-generated method stub return null; } @Override public Object getValueOffHeapOrDiskWithoutFaultIn(LocalRegion owner) { // TODO Auto-generated method stub return null; } @Override public boolean isUpdateInProgress() { // TODO Auto-generated method stub return false; } @Override public void setUpdateInProgress(boolean underUpdate) { // TODO Auto-generated method stub } @Override public boolean isMarkedForEviction() { // TODO Auto-generated method stub return false; } @Override public void setMarkedForEviction() { // TODO Auto-generated method stub } @Override public void clearMarkedForEviction() { // TODO Auto-generated method stub } @Override public boolean isInvalid() { // TODO Auto-generated method stub return false; } @Override public boolean isDestroyed() { // TODO Auto-generated method stub return false; } @Override public boolean isDestroyedOrRemoved() { // TODO Auto-generated method stub return false; } @Override public boolean isDestroyedOrRemovedButNotTombstone() { // TODO Auto-generated method stub return false; } @Override public boolean isInvalidOrRemoved() { // TODO Auto-generated method stub return false; } @Override public void setValueToNull() { // TODO Auto-generated method stub } @Override public void returnToPool() { // TODO Auto-generated method stub } /** * {@inheritDoc} */ @Override public Object getOwnerId(Object context) { // TODO Auto-generated method stub return null; } /** * {@inheritDoc} */ @Override public boolean attemptLock(LockMode mode, int flags, LockingPolicy lockPolicy, long msecs, Object owner, Object context) { // TODO Auto-generated method stub return false; } /** * {@inheritDoc} */ @Override public void releaseLock(LockMode mode, boolean releaseAll, Object owner, Object context) { // TODO Auto-generated method stub } /** * {@inheritDoc} */ @Override public int numSharedLocks() { // TODO Auto-generated method stub return 0; } /** * {@inheritDoc} */ @Override public int numReadOnlyLocks() { // TODO Auto-generated method stub return 0; } /** * {@inheritDoc} */ @Override public boolean hasExclusiveLock(Object owner, Object context) { // TODO Auto-generated method stub return false; } /** * {@inheritDoc} */ @Override public boolean hasExclusiveSharedLock(Object ownerId, Object context) { // TODO Auto-generated method stub return false; } /** * {@inheritDoc} */ @Override public int getState() { // TODO Auto-generated method stub return 0; } /** * {@inheritDoc} */ @Override public boolean hasAnyLock() { // TODO Auto-generated method stub return false; } @Override public boolean isCacheListenerInvocationInProgress() { // TODO Auto-generated method stub return false; } @Override public void setCacheListenerInvocationInProgress(boolean isListenerInvoked) { // TODO Auto-generated method stub } } /** * Used as the value in the regionMap. Tracks information about what the * region has in this oplog. */ public interface DiskRegionInfo { public DiskRegionView getDiskRegion(); public int addLiveEntriesToList(KRFEntry[] liveEntries, int idx); public void addLive(DiskEntry de); public void update(DiskEntry entry); public void replaceLive(DiskEntry old, DiskEntry de); public boolean rmLive(DiskEntry de); public DiskEntry getNextLiveEntry(); public void setDiskRegion(DiskRegionView dr); public long clear(RegionVersionVector rvv); /** * Return true if we are the first guy to set it to true */ public boolean testAndSetUnrecovered(); public boolean getUnrecovered(); /** * Return true if we are the first guy to set it to false */ public boolean testAndSetRecovered(DiskRegionView dr); /** * Callback to indicate that this oplog has created a krf. */ public void afterKrfCreated(); } public abstract static class AbstractDiskRegionInfo implements DiskRegionInfo { private DiskRegionView dr; private boolean unrecovered = false; public AbstractDiskRegionInfo(DiskRegionView dr) { this.dr = dr; } public abstract void addLive(DiskEntry de); public abstract boolean rmLive(DiskEntry de); public abstract DiskEntry getNextLiveEntry(); public abstract long clear(RegionVersionVector rvv); final public DiskRegionView getDiskRegion() { return this.dr; } final public void setDiskRegion(DiskRegionView dr) { this.dr = dr; } synchronized public boolean testAndSetUnrecovered() { boolean result = !this.unrecovered; if (result) { this.unrecovered = true; this.dr = null; } return result; } final synchronized public boolean getUnrecovered() { return this.unrecovered; } final synchronized public boolean testAndSetRecovered(DiskRegionView dr) { boolean result = this.unrecovered; if (result) { this.unrecovered = false; this.dr = dr; } return result; } } public static class DiskRegionInfoNoList extends AbstractDiskRegionInfo { private final AtomicInteger liveCount = new AtomicInteger(); public DiskRegionInfoNoList(DiskRegionView dr) { super(dr); } @Override public void addLive(DiskEntry de) { this.liveCount.incrementAndGet(); } @Override public void update(DiskEntry entry) { //nothing to do } @Override public void replaceLive(DiskEntry old, DiskEntry de) { } @Override public boolean rmLive(DiskEntry de) { return this.liveCount.decrementAndGet() >= 0; } @Override public DiskEntry getNextLiveEntry() { return null; } @Override public long clear(RegionVersionVector rvv) { return this.liveCount.getAndSet(0); } public int addLiveEntriesToList(KRFEntry[] liveEntries, int idx) { // nothing needed since no linked list return idx; } public void afterKrfCreated() { //do nothing } } public static class DiskRegionInfoWithList extends AbstractDiskRegionInfo { /** * A linked list of the live entries in this oplog. Updates to pendingKrfTags * are protected by synchronizing on object. */ private final OplogDiskEntry liveEntries = new OplogDiskEntry(); /** * A map of DiskEntry to the VersionTag that is written to disk associated * with this tag. Only needed for async regions so that we can generate a * krf with a version tag that matches the the tag we have written to disk * for this oplog. */ private Map pendingKrfTags; public DiskRegionInfoWithList(DiskRegionView dr, boolean couldHaveKrf, boolean krfExists) { super(dr); // we need to keep track of the version tags for entries so that we write the correct entry to the krf // both in sync and async disk write cases if (!krfExists && couldHaveKrf) { pendingKrfTags = new HashMap(200); } else { pendingKrfTags = null; } } @Override public void addLive(DiskEntry de) { synchronized(liveEntries) { this.liveEntries.insert(de); if(pendingKrfTags != null && de.getVersionStamp() != null) { //Remember the version tag of the entry as it was written to the crf. pendingKrfTags.put(de, new CompactVersionHolder(de.getVersionStamp())); } } } @Override public void update(DiskEntry de) { if(pendingKrfTags != null && de.getVersionStamp() != null) { //Remember the version tag of the entry as it was written to the crf. pendingKrfTags.put(de, new CompactVersionHolder(de.getVersionStamp())); } } @Override public void replaceLive(DiskEntry old, DiskEntry de) { synchronized (liveEntries) { this.liveEntries.replace(old, de); if (pendingKrfTags != null && de.getVersionStamp() != null) { // Remember the version tag of the entry as it was written to the crf. pendingKrfTags.remove(old); pendingKrfTags.put(de, new CompactVersionHolder(de.getVersionStamp())); } } } @Override public boolean rmLive(DiskEntry de) { synchronized(liveEntries) { boolean removed = this.liveEntries.remove(de); if(removed && pendingKrfTags != null) { pendingKrfTags.remove(de); } return removed; } } @Override public DiskEntry getNextLiveEntry() { DiskEntry result = this.liveEntries.getPrev(); if (result == this.liveEntries) { result = null; } return result; } @Override public long clear(RegionVersionVector rvv) { synchronized(this.liveEntries) { return this.liveEntries.clear(rvv, this.pendingKrfTags); } } /** * Return true if we are the first guy to set it to true */ @Override synchronized public boolean testAndSetUnrecovered() { boolean result = super.testAndSetUnrecovered(); if (result) { this.liveEntries.clear(); } return result; } public int addLiveEntriesToList(KRFEntry[] liveEntries, int idx) { synchronized(liveEntries) { int result = this.liveEntries.addLiveEntriesToList(liveEntries, idx, getDiskRegion(), pendingKrfTags); return result; } } public void afterKrfCreated() { synchronized(liveEntries) { this.pendingKrfTags = null; } } } /** * Used during offline compaction to hold information that may need to be copied forward. */ private static class CompactionRecord { private final byte[] keyBytes; private long offset; public CompactionRecord(byte[] kb, long offset) { this.keyBytes = kb; this.offset = offset; } public void update(long offset) { this.offset = offset; } public byte[] getKeyBytes() { return this.keyBytes; } public long getOffset() { return this.offset; } } /** * Map of OplogEntryIds (longs). * Memory is optimized by using an int[] for ids in the unsigned int range. */ public static class OplogEntryIdMap { private final TStatelessIntObjectHashMap ints = new TStatelessIntObjectHashMap((int)DiskStoreImpl.INVALID_ID); private final TStatelessLongObjectHashMap longs = new TStatelessLongObjectHashMap(DiskStoreImpl.INVALID_ID); public Object put(long id, Object v) { Object result; if (id >= 0 && id <= 0x00000000FFFFFFFFL) { result = this.ints.put((int)id, v); } else { result = this.longs.put(id, v); } return result; } public int size() { return this.ints.size() + this.longs.size(); } public Object get(long id) { Object result; if (id >= 0 && id <= 0x00000000FFFFFFFFL) { result = this.ints.get((int)id); } else { result = this.longs.get(id); } return result; } public Iterator iterator() { return new Iterator(); } public class Iterator { private boolean doingInt = true; TStatelessIntObjectIterator intIt = ints.iterator(); TStatelessLongObjectIterator longIt = longs.iterator(); public boolean hasNext() { if (this.intIt.hasNext()) { return true; } else { doingInt = false; return this.longIt.hasNext(); } } public void advance() { if (doingInt) { this.intIt.advance(); } else { this.longIt.advance(); } } public long key() { if (doingInt) { return this.intIt.key(); } else { return this.longIt.key(); } } public Object value() { if (doingInt) { return this.intIt.value(); } else { return this.longIt.value(); } } } } void finishKrf() { createKrf(false); } void prepareForClose() { try { finishKrf(); } catch(CancelException e) { //workaround for 50465 logger.fine("Got a cancel exception while creating a krf during shutown", e); } } private Object deserializeKey(byte[] keyBytes, final Version version, final ByteArrayDataInput in) { if (!getParent().isOffline() || !PdxWriterImpl.isPdx(keyBytes)) { return EntryEventImpl.deserialize(keyBytes, version, in); } else { return new RawByteKey(keyBytes); } } /** * If this OpLog is from an older version of the product, then return that * {@link Version} else return null. */ public Version getProductVersionIfOld() { final Version version = this.gfversion; if (version == null) { // check for the case of diskstore upgrade from 6.6 to >= 7.0 if (getParent().isUpgradeVersionOnly()) { // assume previous release version return Version.GFE_66; } else { return null; } } else if (version == Version.CURRENT) { return null; } else { // version changed so return that for VersionedDataStream return version; } } /** * If this OpLog has data that was written by an older version of the product, * then return that {@link Version} else return null. */ public Version getDataVersionIfOld() { final Version version = this.dataVersion; if (version == null) { // check for the case of diskstore upgrade from 6.6 to >= 7.0 if (getParent().isUpgradeVersionOnly()) { // assume previous release version return Version.GFE_66; } else { return null; } } else if (version == Version.CURRENT) { return null; } else { // version changed so return that for VersionedDataStream return version; } } /** * Used in offline mode to prevent pdx deserialization of keys. * The raw bytes are a serialized pdx. * @author darrel * @since 6.6 */ private static class RawByteKey implements Sendable { final byte[] bytes; final int hashCode; public RawByteKey(byte[] keyBytes) { this.bytes = keyBytes; this.hashCode = Arrays.hashCode(keyBytes); } @Override public int hashCode() { return this.hashCode; } @Override public boolean equals(Object other) { if (!(other instanceof RawByteKey)) { return false; } return Arrays.equals(this.bytes, ((RawByteKey)other).bytes); } public void sendTo(DataOutput out) throws IOException { out.write(this.bytes); } } public File getIndexFileIfValid() { return this.idxkrf.getIndexFileIfValid(); } public boolean isNewOplog() { return this.newOplog; } /** * Enumeration of operation log file types. * @author rholmes */ enum OplogFileType { OPLOG_CRF, // Creates and updates OPLOG_DRF, // Deletes OPLOG_KRF // Keys } /** * Enumeration of the possible results of * the okToSkipModifyRecord * @author dsmith * */ private static enum OkToSkipResult { SKIP_RECORD, //Skip reading the key and value SKIP_VALUE, //skip reading just the value DONT_SKIP; //don't skip the record public boolean skip() { return this != DONT_SKIP; } public boolean skipKey() { return this == SKIP_RECORD; } } public String getDiskFilePath() { assert this.diskFile != null; return this.diskFile.getPath(); } public String getDiskFileName() { assert this.diskFile != null; return this.diskFile.getName(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy