// com.fasterxml.clustermate.service.cluster.ClusterPeerImpl
// From clustermate-service: building blocks for ClusterMate-based services and servers.
package com.fasterxml.clustermate.service.cluster;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.skife.config.TimeSpan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.storemate.shared.*;
import com.fasterxml.storemate.shared.util.IOUtil;
import com.fasterxml.storemate.store.*;
import com.fasterxml.storemate.store.state.NodeStateStore;
import com.fasterxml.storemate.store.util.BoundedInputStream;
import com.fasterxml.clustermate.api.*;
import com.fasterxml.clustermate.service.SharedServiceStuff;
import com.fasterxml.clustermate.service.state.ActiveNodeState;
import com.fasterxml.clustermate.service.store.StoredEntry;
import com.fasterxml.clustermate.service.store.StoredEntryConverter;
import com.fasterxml.clustermate.service.sync.*;
import com.fasterxml.clustermate.service.util.StoreUtil;
public class ClusterPeerImpl<K extends EntryKey, E extends StoredEntry<K>>
    extends ClusterPeer
    implements com.fasterxml.storemate.shared.StartAndStoppable
{
/**
* If access to peer's sync/list fails, wait for this duration before
* trying again.
*/
private final static long SLEEP_FOR_SYNCLIST_ERRORS_MSECS = 10000L;
private final static long SLEEP_FOR_SYNCPULL_ERRORS_MSECS = 3000L;
// no point trying to sleep for trivial time
private final static long MINIMAL_SLEEP_MSECS = 10L;
/**
 * If synclist is empty and the server does not instruct us otherwise, simply sleep for
 * 1 second (to try to avoid congestion)
 */
private final static long SLEEP_FOR_EMPTY_SYNCLIST_MSECS = 1000L;
// no real hurry; use 10 seconds to account for GC, congestion etc
private final static TimeSpan TIMEOUT_FOR_SYNCLIST = new TimeSpan(10L, TimeUnit.SECONDS);
/**
* Lowish timeout for "bye bye" message, so it won't block shutdown
*/
private final static TimeSpan TIMEOUT_FOR_BYEBYE = new TimeSpan(250L, TimeUnit.MILLISECONDS);
/**
 * We will limit the maximum estimated response size to a reasonable
 * value: starting with 250 megabytes. The idea is to use sizes big enough
 * for efficient bulk transfer, but small enough not to cause timeouts
 * during normal operation.
 */
private final static long MAX_TOTAL_PAYLOAD = 250 * 1000 * 1000;
/**
 * During fetching of items to sync, let's cap the number of failures at some
 * value; this should make it easier to recover from cases where the peer
 * shuts down during an individual sync operation (after the sync list is received
 * but before all entries are fetched)
 */
private final static int MAX_SYNC_FAILURES = 8;
/**
 * Also, let's limit the maximum number of individual calls per sync-pull fetch,
 * to avoid excessive calls.
 */
private final int MAX_FETCH_TRIES = 20;
private final static Logger LOG = LoggerFactory.getLogger(ClusterPeer.class);
/*
/**********************************************************************
/* Configuration, general helpers
/**********************************************************************
*/
protected final SharedServiceStuff _stuff;
/**
* This object is necessary to support "virtual time" for test cases.
*/
protected final TimeMaster _timeMaster;
/*
/**********************************************************************
/* Operation state of the peer object
/**********************************************************************
*/
/**
* Synchronization thread if (and only if) this peer shares part of keyspace
* with the local node; otherwise null.
* Note that threads may be started and stopped based on changes to cluster
* configuration.
*/
protected Thread _syncThread;
/**
* Flag used to request termination of the sync thread.
*/
protected AtomicBoolean _running = new AtomicBoolean(false);
/**
 * Let's keep track of the number of failures (as per caught exceptions); mostly
 * so that tests can verify expected behavior, but also potentially for monitoring.
 */
protected AtomicInteger _failCount = new AtomicInteger(0);
/*
/**********************************************************************
/* Helper objects for entry handling
/**********************************************************************
*/
/**
 * And to store fetched missing entries, we need the entry store
 */
protected final StorableStore _entryStore;
/**
* Need to construct metadata nuggets with this factory
*/
protected final StoredEntryConverter _entryConverter;
/*
/**********************************************************************
/* Helper objects for sync handling
/**********************************************************************
*/
protected final ClusterViewByServerUpdatable _cluster;
/**
* Object used to access Node State information, needed to construct
* view of the cluster.
*/
protected final ClusterStatusAccessor _statusAccessor;
/**
* Helper object used for doing HTTP requests
*/
protected final SyncListAccessor _syncListAccessor;
/**
* Persistent data store in which we store information regarding
* synchronization.
*/
protected final NodeStateStore _stateStore;
/**
* Hash code of contents of the last cluster view we received from
* the peer. Used as optimization: cluster view only piggy-backed
* on list response if hash differs.
*/
protected long _lastClusterHash;
/*
/**********************************************************************
/* Local information for peer (which for us is external but...)
/**********************************************************************
*/
/**
* Synchronization state of this peer
*/
protected ActiveNodeState _syncState;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
public ClusterPeerImpl(SharedServiceStuff stuff, ClusterViewByServerUpdatable cluster,
NodeStateStore stateStore, StorableStore entryStore,
ActiveNodeState state,
ClusterStatusAccessor accessor)
{
super();
_cluster = cluster;
_stuff = stuff;
_syncListAccessor = new SyncListAccessor(stuff);
_syncState = state;
_stateStore = stateStore;
_entryStore = entryStore;
_timeMaster = stuff.getTimeMaster();
_entryConverter = stuff.getEntryConverter();
_statusAccessor = accessor;
}
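/* A minimal usage sketch (type arguments and wiring are hypothetical; in practice the
 * owning cluster view / service bootstrap constructs and manages peers):
 *
 *   ClusterPeerImpl<MyKey, MyEntry> peer = new ClusterPeerImpl<MyKey, MyEntry>(stuff,
 *       cluster, stateStore, entryStore, initialState, statusAccessor);
 *   peer.start();          // starts the "NodeSync-<address>" daemon thread
 *   // ... later, during shutdown:
 *   peer.prepareForStop(); // early notification: requests the sync thread to stop
 *   peer.stop();           // final stop; the exiting sync thread sends a "bye-bye"
 *                          // status update (unless testing, or the peer is disabled)
 */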
@Override
public void start() {
startSyncing();
}
/**
 * If we get notified about a shutdown in the future, we can take some
 * preparatory steps to make the eventual shutdown simpler, safer
 * and quicker.
 */
@Override
public void prepareForStop() {
_stop(false);
}
/**
* Method called when the system is shutting down.
*/
@Override
public void stop() {
_stop(true);
}
protected void _stop(boolean forced)
{
// stopSyncing():
Thread t;
synchronized (this) {
_running.set(false);
t = _syncThread;
if (t != null) {
_syncThread = null;
LOG.info("Stop requested for sync thread for peer at {}", _syncState.getAddress());
}
}
if (t != null) {
// t.notify();
t.interrupt();
}
_syncListAccessor.stop();
}
/*
/**********************************************************************
/* Actual synchronization task
/**********************************************************************
*/
/**
* Method that can be called to ensure that there is a synchronization
* thread running to sync between the local node and this peer
*
* @return True if a new sync thread was started; false if there already
* was a thread
*/
public boolean startSyncing()
{
Thread t;
synchronized (this) {
t = _syncThread;
if (t != null) { // sanity check
return false;
}
_running.set(true);
_syncThread = t = new Thread(new Runnable() {
@Override
public void run() {
syncLoop();
}
});
_syncThread.setDaemon(true);
_syncThread.setName("NodeSync-"+_syncState.getAddress());
}
t.start();
return true;
}
/*
/**********************************************************************
/* State access
/**********************************************************************
*/
@Override
public int getFailCount() { return _failCount.get(); }
@Override
public void resetFailCount() { _failCount.set(0); }
@Override
public long getSyncedUpTo() {
return _syncState.getSyncedUpTo();
}
/*
/**********************************************************************
/* Public API
/**********************************************************************
*/
@Override
public IpAndPort getAddress() {
return _syncState.getAddress();
}
@Override
public KeyRange getActiveRange() {
return _syncState.getRangeActive();
}
@Override
public KeyRange getTotalRange() {
return _syncState.totalRange();
}
/**
* Accessor for getting key range that is shared between the local node
* and this peer; for non-overlapping nodes this may be an empty range.
*/
@Override
public KeyRange getSyncRange() {
return _syncState.getRangeSync();
}
/*
/**********************************************************************
/* Extended accessors
/**********************************************************************
*/
public ActiveNodeState getSyncState() {
return _syncState;
}
public boolean isDisabled() {
return _syncState.isDisabled();
}
/*
/**********************************************************************
/* Background synchronization processing
/**********************************************************************
*/
/**
* Main synchronization loop
*/
protected void syncLoop()
{
LOG.info("Starting sync thread for peer at {}", _syncState.getAddress());
// For testing (and only testing!), let's add a little bit of
// virtual sleep (TimeMaster will block threads) before starting
// the loop; this is to stabilize the situation
if (_stuff.isRunningTests()) {
try {
_timeMaster.sleep(1L);
} catch (InterruptedException e) { }
}
/* At high level, we have two kinds of tasks, depending on whether
* there is any overlap:
*
* 1. If ranges overlap, we need to do proper sync list/pull handling
* 2. If no overlap, we just need to keep an eye towards changes, to
* try to keep whole cluster view up to date (since clients need it)
*/
while (_running.get()) {
try {
if (hasOverlap(_cluster.getLocalState(), _syncState)) {
doRealSync();
} else {
doMinimalSync();
}
} catch (InterruptedException e) {
if (_running.get()) {
LOG.warn("syncLoop() interrupted without clearing '_running' flag; ignoring");
}
} catch (Exception e) {
LOG.warn("Uncaught processing exception during syncLoop(): ({}) {}",
e.getClass().getName(), e.getMessage());
if (_running.get()) {
// Ignore failures during shutdown, so only increase here
_failCount.addAndGet(1);
try {
_timeMaster.sleep(SLEEP_FOR_SYNCPULL_ERRORS_MSECS);
} catch (InterruptedException e2) { }
}
}
}
if (_stuff.isRunningTests()) {
LOG.info("Stopped sync thread for peer at {} -- testing, all done!", _syncState.getAddress());
return;
}
// And send the byebye message if peer is NOT (known to be) disabled
if (_syncState.isDisabled()) {
LOG.info("Stopped sync thread for peer at {}: is disabled, no need to send bye-bye", _syncState.getAddress());
return;
}
LOG.info("Stopped sync thread for peer at {}: let's send bye-bye", _syncState.getAddress());
long start = System.currentTimeMillis();
_syncListAccessor.sendStatusUpdate(_cluster, TIMEOUT_FOR_BYEBYE,
_syncState.getAddress(), ClusterMateConstants.STATE_INACTIVE);
LOG.info("Bye-bye message to {} sent in {} msec", _syncState.getAddress(), System.currentTimeMillis()-start);
}
protected void doRealSync() throws Exception
{
/* The sequence for each iteration consists of:
 *
 * 1. Fetch list of newly inserted/deleted entries from peer (sync/list)
 * 2a. Find subset of entries unknown to this node, if any
 * 2b. Fetch unknown entries, possibly with multiple requests
 *
 * and we will also add a bit of sleep between requests, depending on how many
 * entries we get in step 1.
 */
long listTime = _timeMaster.currentTimeMillis();
SyncListResponse<StoredEntry<K>> syncResp = _fetchSyncList();
if (!_running.get()) { // short-circuit during shutdown
return;
}
if (syncResp == null) { // only for hard errors
_timeMaster.sleep(SLEEP_FOR_SYNCLIST_ERRORS_MSECS);
return;
}
// First things first:
if (syncResp.clusterStatus != null) {
_cluster.updateWith(syncResp.clusterStatus);
_lastClusterHash = syncResp.clusterHash;
} else {
// This is fine, as long as hashes match
if (syncResp.clusterHash != _lastClusterHash) {
LOG.warn("Did not get cluster status from {} even though hashes differ 0x{} (old) vs 0x{} (response)",
_syncState.getAddress(),
Long.toHexString(_lastClusterHash), Long.toHexString(syncResp.clusterHash));
}
}
final long lastSeenTimestamp = syncResp.lastSeen();
// Sanity check; should never happen
if (lastSeenTimestamp <= 0L) {
LOG.error("Invalid lastSeen timestamp value {} for SyncList, from {}",
lastSeenTimestamp, _syncState.getAddress());
// should we sleep a bit extra? Probably good to avoid flooding logs, if we end up here
Thread.sleep(100L);
}
// comment out or remove for production; left here during testing:
//long diff = (listTime - syncResp.lastSeen()) >> 10; // in seconds
//LOG.warn("Received syncList with {} responses; last timestamp {} secs ago", syncResp.size(), diff);
List<SyncListResponseEntry> newEntries = syncResp.entries;
int insertedEntryCount = newEntries.size();
if (insertedEntryCount == 0) { // nothing to update
// may still need to update timestamp?
_updatePersistentState(listTime, lastSeenTimestamp);
// Ok: maybe server instructed us as to how long to sleep?
long sleepMsecs = syncResp.clientWait;
if (sleepMsecs < MINIMAL_SLEEP_MSECS) { // if not, use some lowish default amount
sleepMsecs = SLEEP_FOR_EMPTY_SYNCLIST_MSECS;
}
// long timeSpent = _timeMaster.currentTimeMillis() - listTime;
_timeMaster.sleep(sleepMsecs);
return;
}
// Ok, we got something, good.
// First: handle tombstones we may be getting:
@SuppressWarnings("unused")
int tombstoneCount = _handleTombstones(newEntries);
// then filter out entries that we already have:
_filterSeen(newEntries);
if (!_running.get()) { // short-circuit during shutdown
return;
}
if (newEntries.isEmpty()) { // nope: just update state then
/*
long msecs = syncResp.lastSeen() - _syncState.syncedUpTo;
if (!_stuff.isRunningTests()) {
LOG.warn("No unseen entries out of {} entries: timestamp = {} (+{} sec)",
new Object[] { insertedEntryCount, syncResp.lastSeen(), String.format("%.1f", msecs/1000.0)});
}
*/
_updatePersistentState(listTime, lastSeenTimestamp);
} else { // yes: need to do batch updates
// but can at least update syncUpTo to first entry, right?
int newCount = newEntries.size();
AtomicInteger rounds = new AtomicInteger(0);
long lastProcessed = _fetchMissing(newEntries, rounds);
int fetched = newCount - newEntries.size();
double secs = (_timeMaster.currentTimeMillis() - listTime) / 1000.0;
String timeDesc = String.format("%.2f", secs);
LOG.info("Fetched {}/{} missing entries from {} in {} seconds ({} rounds)",
new Object[] { fetched, newCount, getAddress(), timeDesc, rounds.get()});
_updatePersistentState(listTime, lastProcessed);
}
// And then sleep a bit, before doing next round of syncing
long msecsBehind = (_timeMaster.currentTimeMillis() - _syncState.getSyncedUpTo());
long delay = _calculateSleepBetweenSync(insertedEntryCount, msecsBehind);
if (delay > 0L) {
// only bother informing if above 50 msec sleep
if (delay >= 50L) {
double secsBehind = delay / 1000.0;
LOG.info("With {} listed entries, {} seconds behind, will do {} msec sleep",
new Object[] { insertedEntryCount, String.format("%.2f", secsBehind), delay});
}
_timeMaster.sleep(delay);
}
}
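/* To illustrate one doRealSync() round with made-up numbers (purely hypothetical):
 * sync/list returns 120 entries; 15 are tombstones and get soft-deleted locally by
 * _handleTombstones(); 80 more are already present and removed by _filterSeen(); the
 * remaining 25 are fetched via one or more sync/pull calls in _fetchMissing(). The
 * persisted 'syncedUpTo' then advances to the timestamp of the last entry processed,
 * and _calculateSleepBetweenSync() decides how long to pause before the next round.
 */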
/**
* Method called when there is no key range overlap, and at most we want to
* synchronize cluster view occasionally.
*/
protected void doMinimalSync() throws Exception
{
LOG.info("doMinimalSync(): let's just... Sleep for a bit (TBD)");
Thread.sleep(30 * 1000L);
// !!! TODO: do something!
}
/*
/**********************************************************************
/* Internal methods, cluster state updates
/**********************************************************************
*/
/**
* Method called to indicate that the node should (or should not) be
* disabled.
*/
public void markDisabled(long timestamp, boolean isDisabled)
{
if (timestamp <= 0L) { // optional
timestamp = _syncState.getDisabledUpdated();
}
ActiveNodeState state = _syncState.withDisabled(timestamp, isDisabled);
if (state != _syncState) {
_syncState = state;
try {
_stateStore.upsertEntry(state.getAddress(), state);
} catch (Exception e) {
LOG.error("Failed to update node state (disabled to {}) for {}. Problem ({}): {}",
isDisabled, _syncState, e.getClass().getName(), e.getMessage());
}
}
}
/*
/**********************************************************************
/* Internal methods, other
/**********************************************************************
*/
/**
 * Helper method called to update persistent state, based on sync list
 * information.
 *
 * @param syncStartTime Timestamp of when the sync attempt was made
 * @param lastSeen Timestamp up to which entries from the peer have been processed;
 *   used to advance 'syncedUpTo' if newer than the current value
 */
private void _updatePersistentState(long syncStartTime, long lastSeen)
{
ActiveNodeState orig = _syncState;
_syncState = _syncState.withLastSyncAttempt(syncStartTime);
if (lastSeen > _syncState.getSyncedUpTo()) {
_syncState = _syncState.withSyncedUpTo(lastSeen);
}
if (_syncState != orig) {
//LOG.warn("Saving sync state ({}) (args: {}, {}): lastStartTime {}, lastSeen {}", orig.address, syncStartTime, lastSeen, _syncState.lastSyncAttempt, _syncState.syncedUpTo);
try {
_stateStore.upsertEntry(_syncState.getAddress(), _syncState);
} catch (Exception e) {
LOG.error("Failed to update node state for {}. Problem ({}): {}",
_syncState, e.getClass().getName(), e.getMessage());
}
}
}
private SyncListResponse<StoredEntry<K>> _fetchSyncList() throws InterruptedException
{
try {
return _syncListAccessor.fetchSyncList(_cluster,
TIMEOUT_FOR_SYNCLIST, _syncState, _lastClusterHash);
} catch (InterruptedException e) {
// no point in complaining if we are being shut down:
if (_running.get()) {
LOG.warn("Failed to fetch syncList from {} ({}): {}",
new Object[] { _syncState.getAddress(), e.getClass().getName(), e.getMessage()});
}
}
return null;
}
/**
* Helper method called to handle removal of entries, by handling
* tombstones received and converting existing non-deleted local
* entries into tombstones.
*
* @return Number of tombstone entries found on the list
*/
protected int _handleTombstones(List<SyncListResponseEntry> entries)
throws IOException, StoreException
{
int count = 0;
Iterator<SyncListResponseEntry> it = entries.iterator();
while (it.hasNext()) {
SyncListResponseEntry entry = it.next();
// Tombstone: if we have an entry, convert to a tombstone.
/* 06-Jul-2012, tatu: But if we don't have one, should we create one?
* Could think of it either way; but for now, let's not waste time and space
*/
if (entry.deleted()) {
++count;
it.remove();
_entryStore.softDelete(StoreOperationSource.SYNC, null, entry.key, true, true);
}
}
return count;
}
protected void _filterSeen(List<SyncListResponseEntry> entries)
throws IOException, StoreException
{
Iterator<SyncListResponseEntry> it = entries.iterator();
while (it.hasNext()) {
SyncListResponseEntry remoteEntry = it.next();
/* 11-Jun-2013, tatu: Although tombstones have been handled and removed,
* need to pay attention here, since conflict resolution may be
* necessary.
*/
Storable localEntry = _entryStore.findEntry(StoreOperationSource.SYNC, null, remoteEntry.key);
if (localEntry != null) {
// Do we have an actual conflict? If so, needs resolution as per:
if (StoreUtil.needToPullRemoteToResolve(localEntry.getLastModified(), localEntry.getContentHash(),
remoteEntry.insertionTime, remoteEntry.hash)) {
continue;
}
it.remove();
}
}
}
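/* As read by _fetchMissing() and _pullEntry() below, the sync-pull response body is a
 * simple length-prefixed sequence with one segment per requested entry:
 *
 *   [header length] [header: SyncPullEntry, 'header length' bytes] [payload: 'storageSize' bytes]
 *   ... repeated ...
 *   [SyncHandler.LENGTH_EOF marker]
 *
 * A header length of 0 means the entry was not available on the peer (for example, it
 * expired); a header with 'isDeleted' set means the entry became a tombstone after the
 * sync list was built, and it is simply soft-deleted locally.
 */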
/**
 * Helper method that handles the actual fetching of missing entries, to synchronize
 * content.
 *
 * @param missingEntries Entries to try to fetch
 * @param rounds Counter to update with the number of rounds done to sync things completely
 *
 * @return Timestamp to use as the new 'syncedUpTo' value
 */
private long _fetchMissing(List<SyncListResponseEntry> missingEntries, AtomicInteger rounds)
throws InterruptedException
{
// initially create as big batches as possible
int maxToFetch = missingEntries.size();
int tries = 0;
int fails = 0;
long syncedUpTo = 0L;
do {
++tries;
final long startTime = _timeMaster.currentTimeMillis();
AtomicInteger payloadSize = new AtomicInteger(0);
SyncPullRequest req = _buildSyncPullRequest(missingEntries, maxToFetch, payloadSize);
final int expCount = req.size();
if (expCount == 0) { // sanity check, shouldn't happen but...
throw new IllegalStateException("Internal error: empty syncPullRequest list ("+missingEntries.size()+" missing entries)");
}
rounds.addAndGet(1);
AtomicInteger status = new AtomicInteger(0);
InputStream in = null;
try {
in = _syncListAccessor.readLocalSyncPullResponse(req, TIMEOUT_FOR_SYNCLIST,
getAddress(), status, payloadSize.get());
// } catch (org.apache.http.conn.HttpHostConnectException e) { // if using Apache HC
} catch (java.net.ConnectException e) {
++fails;
LOG.warn("Failed to connect server "+getAddress()+" to fetch missing entries", e);
_timeMaster.sleep(SLEEP_FOR_SYNCPULL_ERRORS_MSECS);
} catch (Exception e) {
LOG.warn("Problem trying to make syncPull call to fetch "+expCount+" entries: ("
+e.getClass().getName() + ") " + e.getMessage(), e);
++fails;
_timeMaster.sleep(SLEEP_FOR_SYNCPULL_ERRORS_MSECS);
}
if (in == null) {
LOG.warn("Problem trying to fetch {} entries, received status code of {}",
expCount, status.get());
_timeMaster.sleep(SLEEP_FOR_SYNCPULL_ERRORS_MSECS);
++fails;
continue;
}
Iterator<SyncListResponseEntry> it = missingEntries.iterator();
int count = 0;
int headerLength = 0;
long payloadLength = 0;
final PullProblems probs = new PullProblems();
try {
// let's see if we can correlate entries nicely
headerLength = -1;
payloadLength = -1;
for (; it.hasNext(); ++count, it.remove()) {
SyncListResponseEntry reqEntry = it.next();
headerLength = SyncPullResponse.readHeaderLength(in);
// Service will indicate end-of-response with marker length
if (headerLength == SyncHandler.LENGTH_EOF) {
break;
}
// sanity check:
if (count == expCount) {
++probs.other;
LOG.warn("Server returned more than expected {} entries; ignoring rest!", expCount);
break;
}
// missing header? Unexpected, but not illegal
if (headerLength == 0) {
if (probs.missing++ == 0) {
LOG.warn("Missing entry {}/{} (from {}), id {}: expired? (will only report first)",
new Object[] { count, expCount, _syncState.getAddress(), reqEntry.key});
}
continue;
}
byte[] headerBytes = new byte[headerLength];
int len = IOUtil.readFully(in, headerBytes);
if (len < headerLength) {
throw new IOException("Unexpected end-of-input: got "+len+" bytes; needed "+headerLength);
}
SyncPullEntry header = _syncListAccessor.decodePullEntry(headerBytes);
payloadLength = header.storageSize;
// and then create the actual entry:
_pullEntry(reqEntry, header, in, probs);
syncedUpTo = reqEntry.insertionTime;
}
if (count < expCount) {
// let's consider 0 entries to be an error, to prevent infinite loops
if (count == 0) {
LOG.warn("Server returned NO entries, when requested "+expCount);
++fails;
}
LOG.warn("Server returned fewer entries than requested for sync pull: {} vs {} (in {} msecs)",
new Object[] { count, expCount, (_timeMaster.currentTimeMillis() - startTime)});
}
if (probs.hasIssues()) {
LOG.warn("Problems with pull request to {}: {}", _syncState.getAddress(), probs);
}
} catch (Exception e) {
LOG.warn("Problem trying to fetch syncPull entry {}/{} (header-length: {}, length: {}): ({}) {}",
new Object[] { count+1, expCount, headerLength, payloadLength, e.getClass().getName(), e.getMessage() } );
_timeMaster.sleep(SLEEP_FOR_SYNCPULL_ERRORS_MSECS);
++fails;
} finally {
if (in != null) {
try { in.close(); } catch (Exception e) { // shouldn't really happen
LOG.warn("Failed to close HTTP stream: {}", e.getMessage());
}
}
}
} while (fails < MAX_SYNC_FAILURES && !missingEntries.isEmpty() && tries < MAX_FETCH_TRIES);
if (fails > 0) {
_failCount.addAndGet(fails);
}
return syncedUpTo;
}
private SyncPullRequest _buildSyncPullRequest(List<SyncListResponseEntry> missingEntries,
int maxEntries, AtomicInteger expectedPayloadSize)
{
SyncPullRequest req = new SyncPullRequest();
Iterator<SyncListResponseEntry> it = missingEntries.iterator();
SyncListResponseEntry entry = it.next();
req.addEntry(entry.key);
long expSize = entry.size;
while (it.hasNext() && req.size() < maxEntries) {
entry = it.next();
expSize += entry.size;
if (expSize > MAX_TOTAL_PAYLOAD) {
expSize -= entry.size;
break;
}
req.addEntry(entry.key);
}
expectedPayloadSize.set((int) expSize);
return req;
}
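/* Worked example of the payload cap above (entry sizes hypothetical): with MAX_TOTAL_PAYLOAD
 * of 250 million bytes and missing entries of 200M, 40M and 30M bytes, the first entry is
 * always included (200M), the second still fits (240M total), but adding the third would
 * exceed the cap (270M), so the request stops at two entries and the rest are left for a
 * later round of the fetch loop in _fetchMissing().
 */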
/**
 * Helper method called to figure out how long to sleep before doing the next syncList call.
 * Note that sleep times are rather arbitrary: we hope to be able to better
 * tune these in future.
 *
 * @param listedCount Number of 'newly inserted' entries that were returned
 * @param msecsBehind Number of milliseconds that we are "behind" current time (note: due to grace period,
 * will never be zero, but more like a minute or so at minimum)
 */
private long _calculateSleepBetweenSync(int listedCount, long msecsBehind)
{
// if we are behind by more than 3 minutes, or get >75% "full" response, no delay:
final int FULL_ENOUGH = _stuff.getServiceConfig().cfgMaxEntriesPerSyncList * 3 / 4;
if ((msecsBehind >= (3 * 60 * 1000)) || (listedCount >= FULL_ENOUGH)) {
return 0L;
}
// otherwise moderate delay; bit longer for shorter lists
if (listedCount < 5) {
return 300L;
}
if (listedCount <= 10) {
return 200L;
}
if (listedCount < 40) {
return 100L;
}
return 50L;
}
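/* Worked example (assuming, hypothetically, cfgMaxEntriesPerSyncList of 500, making
 * FULL_ENOUGH 375): being 5 minutes behind, or getting 400 listed entries, means no
 * delay at all; 3 listed entries sleep 300 msec, 10 entries 200 msec, 25 entries
 * 100 msec, and anything from 40 up to 374 entries 50 msec.
 */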
/**
* Method that does the heavy lifting of pulling a single synchronized entry,
* if and as necessary.
*/
private void _pullEntry(SyncListResponseEntry reqEntry, SyncPullEntry header,
InputStream in, PullProblems probs)
throws IOException
{
final StorableKey key = header.key;
/* first things first: either read things in memory (for inline inclusion),
* or pipe into a file.
*/
long expSize = header.storageSize;
// Sanity check: although rare, deletion could have occurred after we got
// the initial sync list, so:
if (header.isDeleted) {
_entryStore.softDelete(StoreOperationSource.SYNC, null, key, true, true);
return;
}
StorableCreationResult result;
StorableCreationMetadata stdMetadata = new StorableCreationMetadata(header.compression,
header.checksum, header.checksumForCompressed);
stdMetadata.uncompressedSize = header.size;
stdMetadata.storageSize = header.storageSize;
// 16-Apr-2014, tatu: Need to remember to set replica flag now
stdMetadata.replicated = true;
/* 25-Apr-2014, As per [#32], we need to compensate time-to-live settings so that
 * they are not reset; rather, they stay as close to the remaining TTL as possible.
 *
 * Note that this means that "maxTTLSecs" IS modified, and "minTTLSecs" is NOT, since
 * the former is measured from creation and the latter (if used) from last-access.
 */
ByteContainer customMetadata = _entryConverter.createMetadata(_timeMaster.currentTimeMillis(),
header.lastAccessMethod, header.minTTLSecs, header.maxTTLSecs);
// although not 100% required, we can simplify handling of smallest entries
if (expSize <= _stuff.getServiceConfig().storeConfig.maxInlinedStorageSize) { // inlineable
ByteContainer data;
if (expSize == 0) {
data = ByteContainer.emptyContainer();
} else {
byte[] bytes = new byte[(int) expSize];
int len = IOUtil.readFully(in, bytes);
if (len < expSize) {
throw new IOException("Unexpected end-of-input: got "+len+" bytes; needed "+expSize);
}
data = ByteContainer.simple(bytes);
}
// 19-Sep-2013, tatu: May need to upsert, when resolving conflicts
result = _entryStore.upsertConditionally(StoreOperationSource.SYNC, null, key, data,
stdMetadata, customMetadata, true,
new ConflictOverwriteChecker(reqEntry.insertionTime));
} else {
/* 21-Sep-2012, tatu: Important -- we must ensure that store only reads
* bytes that belong to the entry payload. The easiest way is by adding
* a wrapper stream that ensures this...
*/
BoundedInputStream bin = new BoundedInputStream(in, stdMetadata.storageSize, false);
// 19-Sep-2013, tatu: May need to upsert, when resolving conflicts
result = _entryStore.upsertConditionally(StoreOperationSource.SYNC, null, key, bin,
stdMetadata, customMetadata, true,
new ConflictOverwriteChecker(reqEntry.insertionTime));
if (result.succeeded() && !bin.isCompletelyRead()) { // error or warning?
Storable entry = result.getNewEntry();
long ssize = (entry == null) ? -1L : entry.getStorageLength();
++probs.other;
LOG.warn("Problems with sync-pull for '{}': read {} bytes, should have read {} more; entry storageSize: {}",
new Object[] { header.key, bin.bytesRead(), bin.bytesLeft(), ssize });
}
}
// should we care whether this was redundant or not?
if (!result.succeeded()) {
if (probs.redundant++ == 0) {
if (result.getPreviousEntry() != null) {
// most likely ok: already had the entry
LOG.info("Redundant sync-pull for '{}' (from {}): entry already existed locally (will only report first)",
header.key, _syncState.getAddress());
} else {
// should this add to 'failCount'? For now, don't
LOG.warn("Failed sync-pull for '{}' (from {}): no old entry. Strange! (will only report first)",
header.key, _syncState.getAddress());
}
}
}
}
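/* The split between the two upsert paths above is driven purely by size (the threshold
 * comes from configuration): assuming, say, a maxInlinedStorageSize of 4000 bytes, a 2 kB
 * entry is read fully into a ByteContainer, while a 1 MB entry is streamed through a
 * BoundedInputStream so that only 'storageSize' bytes are consumed from the shared
 * response stream.
 */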
protected final boolean hasOverlap(NodeState state1, NodeState state2)
{
return state1.totalRange().overlapsWith(state2.totalRange());
}
private static class PullProblems {
public int redundant = 0;
public int missing = 0;
public int other = 0;
public boolean hasIssues() {
return (redundant > 0) || (missing > 0) || (other > 0);
}
@Override
public String toString() {
return new StringBuilder(60)
.append(redundant).append(" redundant, ")
.append(missing).append(" missing entries and ")
.append(other).append(" other problems")
.toString();
}
}
}