All downloads are free. The search and download features use the official Maven repository.

oracle.kv.impl.rep.stats.PartitionScan Maven / Gradle / Ivy

Go to download

NoSQL Database Server - supplies build and runtime support for the server (store) side of the Oracle NoSQL Database.

There is a newer version: 18.3.10
Show newest version
/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.rep.stats;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import oracle.kv.Key;
import oracle.kv.impl.admin.param.RepNodeParams;
import oracle.kv.impl.api.TopologyManager;
import oracle.kv.impl.api.table.DroppedTableException;
import oracle.kv.impl.api.table.TableImpl;
import oracle.kv.impl.api.table.TableMetadata;
import oracle.kv.impl.metadata.Metadata.MetadataType;
import oracle.kv.impl.rep.RNTaskCoordinator;
import oracle.kv.impl.rep.RepNode;
import oracle.kv.impl.rep.stats.PartitionLeaseManager.PartitionLeaseInfo;
import oracle.kv.impl.systables.TableStatsPartitionDesc;
import oracle.kv.impl.topo.Partition;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.TxnUtil;
import oracle.kv.table.Row;
import oracle.kv.table.Table;
import oracle.kv.table.TableAPI;
import oracle.kv.table.TimeToLive;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Environment;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
import com.sleepycat.je.dbi.CursorImpl;
import com.sleepycat.je.utilint.TaskCoordinator.Permit;

/**
 * Scans one partition database to calculate primary key statistics and
 * stores the scanned results as rows of the partition statistics table
 * ({@link TableStatsPartitionDesc#TABLE_NAME}).
 *
 * <p>Statistics are accumulated per table name while the partition is
 * scanned in batches. Keys that do not map to any table are accounted to
 * the pseudo table {@link #KV_STATS_TABLE_NAME}. While a scan is in progress
 * the instance is registered as a topology post-update listener so that the
 * scan is stopped if the partition is no longer owned by this node's group.
 *
 * <p>NOTE(review): the batching loop, lease handling and row persistence
 * live in {@link StatsScan}, which is not visible here; the type arguments
 * (if any) of {@code StatsScan} and {@code StatsLeaseManager} should be
 * confirmed against that class.
 */
public class PartitionScan extends StatsScan
                           implements TopologyManager.PostUpdateListener {

    /* The partition being scanned. */
    private final PartitionId partId;

    /* The group (shard) ID of the node running the scan. */
    private final int groupId;

    /* The statistics table, resolved lazily by checkStatsTable(). */
    private Table tableStatsTable;

    /*
     * Names of the tables for which no record has been seen so far in the
     * current scan. Seeded in preScan() and drained as records are found, so
     * that wrapResult() can still emit a zero-count row for empty tables.
     */
    private final Set<String> emptyTableSet = new HashSet<>();

    /* The key to record the last read one. It is used as a resume key */
    private byte[] resumeKey = null;

    /*
     * The name of the fake internal table used to store statistics for
     * KV pairs.
     *
     * NOTE(review): this field is not final; it is only ever read in this
     * class. Left as is to keep the public interface unchanged.
     */
    public static String KV_STATS_TABLE_NAME = "$KV$";

    /* Accumulated statistics of the current scan, keyed by table name. */
    final Map<String, StatsAccumulator> tableAccMap = new HashMap<>();

    /**
     * Creates a scan for the specified partition. All arguments other than
     * {@code partId} are handed to the {@link StatsScan} constructor.
     *
     * @param tableAPI the table API passed to the superclass
     * @param partId the partition to scan
     * @param repNode the hosting rep node; also supplies the group ID
     * @param leaseManager the lease manager passed to the superclass
     * @param leaseInfo the lease information passed to the superclass
     * @param intervalStart the interval start passed to the superclass
     * @param ttl the time-to-live passed to the superclass
     * @param logger the logger passed to the superclass
     */
    PartitionScan(TableAPI tableAPI,
                  PartitionId partId,
                  RepNode repNode,
                  StatsLeaseManager leaseManager,
                  PartitionLeaseInfo leaseInfo,
                  long intervalStart,
                  TimeToLive ttl,
                  Logger logger) {
        super(repNode, tableAPI, leaseManager, leaseInfo,
              intervalStart, ttl, logger);
        this.partId = partId;
        this.groupId = repNode.getRepNodeId().getGroupId();
    }

    /**
     * Resolves the partition statistics table from the given metadata,
     * caching it once found.
     *
     * @param md the table metadata to look the statistics table up in
     * @return true if the statistics table is known, false if it does not
     * exist in {@code md}
     */
    @Override
    boolean checkStatsTable(TableMetadata md) {
        if (tableStatsTable != null) {
            return true;
        }

        tableStatsTable = md.getTable(null, TableStatsPartitionDesc.TABLE_NAME);

        /* If the table does not exist, stop gathering statistics info */
        return tableStatsTable != null;
    }

    /**
     * Accounts one scanned record to the statistics of the table it belongs
     * to. Records in the internal key space and records of dropped tables
     * are ignored.
     *
     * @param key the key of the record at the cursor position
     * @param cursor the cursor positioned on the record; only used to obtain
     * the storage size for tables that have a size limit
     */
    @Override
    void accumulateResult(byte[] key, Cursor cursor) {
        /*
         * Determine which table the key belongs to. If the key belongs to a
         * table, its statistics are stored in the map under the table name;
         * if not, they are associated with the fake internal table:
         * KV_STATS_TABLE_NAME.
         */

        /* Filter out internal key space record */
        if (Key.keySpaceIsInternal(key)) {
            return;
        }

        final TableImpl table;
        try {
            table = repNode.getTableManager().getTable(key);
        } catch (DroppedTableException dte) {
            /* Do not accumulate records for dropped tables. */
            return;
        }

        final String tableName = (table == null) ? KV_STATS_TABLE_NAME :
                                                   table.getFullNamespaceName();

        StatsAccumulator csa = tableAccMap.get(tableName);

        if (csa == null) {
            csa = new StatsAccumulator();
            tableAccMap.put(tableName, csa);

            /* Found data for the table, so it is no longer empty */
            emptyTableSet.remove(tableName);
        }
        csa.addKeySize(key.length);

        // TODO - should we just always collect the size??
        if ((table != null) && table.hasSizeLimit()) {
            final CursorImpl cursorImpl = DbInternal.getCursorImpl(cursor);
            /*
             * getStorageSize returns the estimated disk storage size for the
             * record at the current position.
             */
            csa.addTableSize(cursorImpl.getStorageSize());
        }
    }

    /**
     * Converts the accumulated statistics into rows of the statistics table
     * and hands each row to {@code addRow}. Tables for which no record was
     * seen get a row with zero values.
     */
    @Override
    void wrapResult() {
        /* Deal with empty tables: give each one an all-zero accumulator */
        for (String tableName : emptyTableSet) {
            tableAccMap.put(tableName, new StatsAccumulator());
        }

        /*
         * Convert individual table statistics into rows that can be inserted
         * into table TableStatsPartition.
         */
        for (Map.Entry<String, StatsAccumulator> entry :
                tableAccMap.entrySet()) {

            final StatsAccumulator sa = entry.getValue();
            final Row row = tableStatsTable.createRow();
            row.setTTL(ttl);
            row.put(TableStatsPartitionDesc.COL_NAME_TABLE_NAME,
                    entry.getKey());
            row.put(TableStatsPartitionDesc.COL_NAME_PARTITION_ID,
                    partId.getPartitionId());
            row.put(TableStatsPartitionDesc.COL_NAME_SHARD_ID, groupId);
            row.put(TableStatsPartitionDesc.COL_NAME_COUNT,
                    sa.count);
            row.put(TableStatsPartitionDesc.COL_NAME_AVG_KEY_SIZE,
                    sa.getAvgKeySize());
            row.put(TableStatsPartitionDesc.COL_NAME_TABLE_SIZE,
                    sa.totalTableSize);

            addRow(row);
        }
    }

    /**
     * Resets the per-scan state, seeds the set of (so far) empty tables from
     * the table metadata and registers this scan as a topology listener.
     *
     * @return false if no table metadata is available (in which case the
     * listener is not registered), true otherwise
     */
    @Override
    boolean preScan() {
        tableAccMap.clear();
        emptyTableSet.clear();
        resumeKey = null;

        final TableMetadata metadata =
                (TableMetadata)repNode.getMetadata(MetadataType.TABLE);
        if (metadata == null) {
            return false;
        }

        /* Initialize with fake table name. */
        emptyTableSet.add(KV_STATS_TABLE_NAME);
        for (String tableName : metadata.listTables(null, true)) {
            emptyTableSet.add(tableName);
        }

        repNode.getTopologyManager().addPostUpdateListener(this);
        return true;
    }

    /**
     * Unregisters the topology listener registered by {@link #preScan()}.
     *
     * @param scanCompleted not used by this implementation
     */
    @Override
    void postScan(boolean scanCompleted) {
        repNode.getTopologyManager().removePostUpdateListener(this);
    }

    /**
     * Returns the database of the partition being scanned.
     */
    @Override
    Database getDatabase() {
        return repNode.getPartitionDB(partId);
    }

    /* -- From TopologyManager.PostUpdateListener -- */

    /**
     * Stops the scan if, in the updated topology, the partition no longer
     * exists or is assigned to a different group than this node's.
     *
     * @param topology the updated topology
     * @return true if the scan was stopped, false otherwise. NOTE(review):
     * how the caller interprets the return value is defined by
     * TopologyManager.PostUpdateListener, which is not visible here.
     */
    @Override
    public boolean postUpdate(Topology topology) {
        final Partition p = topology.getPartitionMap().get(partId);
        if ((p == null) || (p.getRepGroupId().getGroupId() != groupId)) {
            logger.log(Level.INFO,
                       "Stopping partition scan, partition {0} has moved",
                       partId);
            stop();
            return true;
        }
        return false;
    }

    /**
     * A class to assist to record and accumulate the result of scanning.
     * A freshly created instance represents an empty table (all zeros).
     */
    private static class StatsAccumulator {
        /* Number of keys added. */
        private long count;
        /* Sum of the sizes of all keys added. */
        private long totalKeySize;
        /* Sum of the storage sizes added. */
        private long totalTableSize;

        /* Counts one more key and adds its size to the running total. */
        private void addKeySize(long keySize) {
            count++;
            totalKeySize += keySize;
        }

        /*
         * Returns the average key size, truncated to an int, or 0 if no key
         * has been added.
         */
        private int getAvgKeySize() {
            return count == 0 ? 0 : (int)(totalKeySize / count);
        }

        /* Adds the storage size of one record to the running total. */
        private void addTableSize(int size) {
            totalTableSize += size;
        }
    }

    /**
     * Scans one batch of at most {@code BATCH_SIZE} records, starting after
     * the resume key left by the previous batch (or at the beginning of the
     * database if there is none), and accumulates each record.
     *
     * @param env the environment used to begin the read transaction
     * @param db the database to scan
     * @return true if the batch was filled, meaning more records may remain;
     * false if the end of the database was reached, the scan was stopped, or
     * a database error ended the batch
     * @throws InterruptedException declared for the permit acquisition
     */
    @Override
    boolean scanDatabase(Environment env, Database db)
        throws InterruptedException {

        Cursor cursor = null;
        Transaction txn = null;
        /* Acquire a permit for each batch of keys. */
        final RepNodeParams repNodeParams = repNode.getRepNodeParams();
        final long permitTimeoutMs =
            repNodeParams.getPermitTimeoutMs(RNTaskCoordinator.KV_STORAGE_STATS_TASK);
        final long permitLeaseMs =
            repNodeParams.getPermitLeaseMs(RNTaskCoordinator.KV_STORAGE_STATS_TASK);

        /*
         * Acquire a permit before scanning each batch. If permits are in short
         * supply the permit may be a deficit permit, but we choose not to act
         * on it for now to keep things simple.
         */
        try (final Permit permit = repNode.getTaskCoordinator().
             acquirePermit(RNTaskCoordinator.KV_STORAGE_STATS_TASK,
                           permitTimeoutMs, permitLeaseMs,
                           TimeUnit.MILLISECONDS)) {
            txn = env.beginTransaction(null, txnConfig);
            txn.setTxnTimeout(TXN_TIME_OUT, TimeUnit.MILLISECONDS);

            int nRecords = 0;
            cursor = db.openCursor(txn, cursorConfig);
            cursor.setCacheMode(CacheMode.UNCHANGED);

            final DatabaseEntry keyEntry = new DatabaseEntry();
            final DatabaseEntry dataEntry = new DatabaseEntry();
            /* Only keys are needed, so request zero bytes of data */
            dataEntry.setPartial(0, 0, true);
            OperationStatus status;

            if (resumeKey == null) {
                /* First batch: start at the first record */
                status = cursor.getNext(keyEntry, dataEntry,
                                        LockMode.READ_UNCOMMITTED);
            } else {
                /*
                 * Later batch: position at the first key >= the resume key.
                 * If that is the resume key itself it has already been
                 * accumulated, so step past it.
                 */
                keyEntry.setData(resumeKey);
                status = cursor.getSearchKeyRange(keyEntry, dataEntry,
                                                  LockMode.READ_UNCOMMITTED);
                if (status == OperationStatus.SUCCESS &&
                        Arrays.equals(resumeKey, keyEntry.getData())) {
                    status = cursor.getNext(keyEntry, dataEntry,
                                            LockMode.READ_UNCOMMITTED);
                }
            }

            if (status != OperationStatus.SUCCESS) {
                return false;
            }

            boolean hasMoreElement = false;
            while (status == OperationStatus.SUCCESS && !stop) {
                /* Record the latest key as a resume key */
                resumeKey = keyEntry.getData();

                /* Accumulate the key into results */
                accumulateResult(resumeKey, cursor);
                nRecords++;

                if (nRecords >= BATCH_SIZE) {
                    hasMoreElement = true;
                    break;
                }
                dataEntry.setPartial(0, 0, true);
                status = cursor.getNext(keyEntry, dataEntry,
                                        LockMode.READ_UNCOMMITTED);
            }
            totalRecords += nRecords;
            return hasMoreElement;
        } catch (DatabaseException | IllegalArgumentException e) {
            /*
             * The exception is passed as the third argument, which selects
             * Logger.log(Level, String, Throwable). That overload attaches
             * the exception to the log record but performs no message
             * parameter substitution, so the message has no placeholder.
             */
            logger.log(Level.FINE,
                       "Scanning encounters exception, " +
                       "iteration scanning exits", e);
        } finally {
            if (cursor != null) {
                TxnUtil.close(cursor);
            }

            /* We are just reading. Abort every transaction */
            TxnUtil.abort(txn);
        }
        return false;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy