// NOTE: the following lines are artifacts of the web page this source was scraped from,
// not part of the original Java file:
//   All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//   org.apache.bookkeeper.client.LedgerHandle Maven / Gradle / Ivy
//   There is a newer version: 4.17.1
//   Show newest version
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
package org.apache.bookkeeper.client;

import static com.google.common.base.Preconditions.checkState;
import static org.apache.bookkeeper.client.api.BKException.Code.ClientClosedException;
import static org.apache.bookkeeper.client.api.BKException.Code.WriteException;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.RateLimiter;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.bookkeeper.client.AsyncCallback.AddCallback;
import org.apache.bookkeeper.client.AsyncCallback.AddCallbackWithLatency;
import org.apache.bookkeeper.client.AsyncCallback.CloseCallback;
import org.apache.bookkeeper.client.AsyncCallback.ReadCallback;
import org.apache.bookkeeper.client.AsyncCallback.ReadLastConfirmedCallback;
import org.apache.bookkeeper.client.BKException.BKIncorrectParameterException;
import org.apache.bookkeeper.client.BKException.BKReadException;
import org.apache.bookkeeper.client.DistributionSchedule.WriteSet;
import org.apache.bookkeeper.client.SyncCallbackUtils.FutureReadLastConfirmed;
import org.apache.bookkeeper.client.SyncCallbackUtils.FutureReadLastConfirmedAndEntry;
import org.apache.bookkeeper.client.SyncCallbackUtils.SyncAddCallback;
import org.apache.bookkeeper.client.SyncCallbackUtils.SyncCloseCallback;
import org.apache.bookkeeper.client.SyncCallbackUtils.SyncReadCallback;
import org.apache.bookkeeper.client.SyncCallbackUtils.SyncReadLastConfirmedCallback;
import org.apache.bookkeeper.client.api.BKException.Code;
import org.apache.bookkeeper.client.api.LastConfirmedAndEntry;
import org.apache.bookkeeper.client.api.LedgerEntries;
import org.apache.bookkeeper.client.api.LedgerMetadata;
import org.apache.bookkeeper.client.api.WriteFlag;
import org.apache.bookkeeper.client.api.WriteHandle;
import org.apache.bookkeeper.client.impl.LedgerEntryImpl;
import org.apache.bookkeeper.common.concurrent.FutureEventListener;
import org.apache.bookkeeper.common.concurrent.FutureUtils;
import org.apache.bookkeeper.common.util.MathUtils;
import org.apache.bookkeeper.net.BookieId;
import org.apache.bookkeeper.proto.BookieProtocol;
import org.apache.bookkeeper.proto.checksum.DigestManager;
import org.apache.bookkeeper.stats.Counter;
import org.apache.bookkeeper.stats.Gauge;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.bookkeeper.versioning.Versioned;
import org.apache.commons.collections4.IteratorUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Ledger handle contains ledger metadata and is used to access the read and
 * write operations to a ledger.
 */
public class LedgerHandle implements WriteHandle {
    static final Logger LOG = LoggerFactory.getLogger(LedgerHandle.class);

    // Sentinel meaning "sticky reads disabled / no sticky bookie chosen".
    private static final int STICKY_READ_BOOKIE_INDEX_UNSET = -1;

    final ClientContext clientCtx;

    // Master key sent to bookies with each add request; derived from the ledger password.
    final byte[] ledgerKey;
    // Current ledger metadata with its store version; replaced atomically via setLedgerMetadata.
    private Versioned<LedgerMetadata> versionedMetadata;
    final long ledgerId;
    // Highest entry id handed to a PendingAddOp (not necessarily acknowledged yet).
    long lastAddPushed;

    // Lifecycle of this handle, independent of whether the ledger metadata is closed.
    private enum HandleState {
        OPEN,
        CLOSED
    }

    private HandleState handleState = HandleState.OPEN;
    // Completed (or failed) exactly once when the close protocol finishes.
    private final CompletableFuture<Void> closePromise = new CompletableFuture<>();

    /**
     * Last entryId which has been confirmed to be written durably to the bookies.
     * This value is used by readers, the LAC protocol
     */
    volatile long lastAddConfirmed;

    /**
     * Next entryId which is expected to move forward during {@link #sendAddSuccessCallbacks() }. This is important
     * in order to have an ordered sequence of addEntry acknowledged to the writer
     */
    volatile long pendingAddsSequenceHead;

    /**
     * If bookie sticky reads are enabled, this will contain the index of the bookie
     * selected as "sticky" for this ledger. The bookie is chosen at random when the
     * LedgerHandle is created.
     *
     * 

     * <p>In case of failures, the bookie index will be updated (to the next bookie in
     * the ensemble) to avoid continuing to attempt to read from a failed bookie.
     *

If the index is -1, it means the sticky reads are disabled. */ private int stickyBookieIndex; long length; final DigestManager macManager; final DistributionSchedule distributionSchedule; final RateLimiter throttler; final LoadingCache bookieFailureHistory; final BookiesHealthInfo bookiesHealthInfo; final EnumSet writeFlags; ScheduledFuture timeoutFuture = null; @VisibleForTesting final Map delayedWriteFailedBookies = new HashMap(); /** * Invalid entry id. This value is returned from methods which * should return an entry id but there is no valid entry available. */ public static final long INVALID_ENTRY_ID = BookieProtocol.INVALID_ENTRY_ID; /** * Invalid ledger id. Ledger IDs must be greater than or equal to 0. * Large negative used to make it easy to spot in logs if erroneously used. */ public static final long INVALID_LEDGER_ID = -0xABCDABCDL; final Object metadataLock = new Object(); boolean changingEnsemble = false; final AtomicInteger numEnsembleChanges = new AtomicInteger(0); Queue pendingAddOps; ExplicitLacFlushPolicy explicitLacFlushPolicy; final Counter ensembleChangeCounter; final Counter lacUpdateHitsCounter; final Counter lacUpdateMissesCounter; private final OpStatsLogger clientChannelWriteWaitStats; LedgerHandle(ClientContext clientCtx, long ledgerId, Versioned versionedMetadata, BookKeeper.DigestType digestType, byte[] password, EnumSet writeFlags) throws GeneralSecurityException, NumberFormatException { this.clientCtx = clientCtx; this.versionedMetadata = versionedMetadata; this.pendingAddOps = new ConcurrentLinkedQueue(); this.writeFlags = writeFlags; LedgerMetadata metadata = versionedMetadata.getValue(); if (metadata.isClosed()) { lastAddConfirmed = lastAddPushed = metadata.getLastEntryId(); length = metadata.getLength(); } else { lastAddConfirmed = lastAddPushed = INVALID_ENTRY_ID; length = 0; } this.pendingAddsSequenceHead = lastAddConfirmed; this.ledgerId = ledgerId; if (clientCtx.getConf().enableStickyReads && 
getLedgerMetadata().getEnsembleSize() == getLedgerMetadata().getWriteQuorumSize()) { stickyBookieIndex = clientCtx.getPlacementPolicy().getStickyReadBookieIndex(metadata, Optional.empty()); } else { stickyBookieIndex = STICKY_READ_BOOKIE_INDEX_UNSET; } if (clientCtx.getConf().throttleValue > 0) { this.throttler = RateLimiter.create(clientCtx.getConf().throttleValue); } else { this.throttler = null; } macManager = DigestManager.instantiate(ledgerId, password, BookKeeper.DigestType.toProtoDigestType(digestType), clientCtx.getByteBufAllocator(), clientCtx.getConf().useV2WireProtocol); // If the password is empty, pass the same random ledger key which is generated by the hash of the empty // password, so that the bookie can avoid processing the keys for each entry this.ledgerKey = DigestManager.generateMasterKey(password); distributionSchedule = new RoundRobinDistributionSchedule( metadata.getWriteQuorumSize(), metadata.getAckQuorumSize(), metadata.getEnsembleSize()); this.bookieFailureHistory = CacheBuilder.newBuilder() .expireAfterWrite(clientCtx.getConf().bookieFailureHistoryExpirationMSec, TimeUnit.MILLISECONDS) .build(new CacheLoader() { @Override public Long load(BookieId key) { return -1L; } }); this.bookiesHealthInfo = new BookiesHealthInfo() { @Override public long getBookieFailureHistory(BookieId bookieSocketAddress) { Long lastFailure = bookieFailureHistory.getIfPresent(bookieSocketAddress); return lastFailure == null ? 
-1L : lastFailure; } @Override public long getBookiePendingRequests(BookieId bookieSocketAddress) { return clientCtx.getBookieClient().getNumPendingRequests(bookieSocketAddress, ledgerId); } }; ensembleChangeCounter = clientCtx.getClientStats().getEnsembleChangeCounter(); lacUpdateHitsCounter = clientCtx.getClientStats().getLacUpdateHitsCounter(); lacUpdateMissesCounter = clientCtx.getClientStats().getLacUpdateMissesCounter(); clientChannelWriteWaitStats = clientCtx.getClientStats().getClientChannelWriteWaitLogger(); clientCtx.getClientStats().registerPendingAddsGauge(new Gauge() { @Override public Integer getDefaultValue() { return 0; } @Override public Integer getSample() { return pendingAddOps.size(); } }); initializeWriteHandleState(); } /** * Notify the LedgerHandle that a read operation was failed on a particular bookie. */ void recordReadErrorOnBookie(int bookieIndex) { // If sticky bookie reads are enabled, switch the sticky bookie to the // next bookie in the ensemble so that we avoid to keep reading from the // same failed bookie if (stickyBookieIndex != STICKY_READ_BOOKIE_INDEX_UNSET) { // This will be idempotent when we have multiple read errors on the // same bookie. 
The net result is that we just go to the next bookie stickyBookieIndex = clientCtx.getPlacementPolicy().getStickyReadBookieIndex(getLedgerMetadata(), Optional.of(bookieIndex)); } } protected void initializeWriteHandleState() { if (clientCtx.getConf().explicitLacInterval > 0) { explicitLacFlushPolicy = new ExplicitLacFlushPolicy.ExplicitLacFlushPolicyImpl( this, clientCtx); } else { explicitLacFlushPolicy = ExplicitLacFlushPolicy.VOID_EXPLICITLAC_FLUSH_POLICY; } if (clientCtx.getConf().addEntryQuorumTimeoutNanos > 0) { this.timeoutFuture = clientCtx.getScheduler().scheduleAtFixedRate( () -> monitorPendingAddOps(), clientCtx.getConf().timeoutMonitorIntervalSec, clientCtx.getConf().timeoutMonitorIntervalSec, TimeUnit.SECONDS); } } private void tearDownWriteHandleState() { explicitLacFlushPolicy.stopExplicitLacFlush(); if (timeoutFuture != null) { timeoutFuture.cancel(false); } } /** * Get the id of the current ledger. * * @return the id of the ledger */ @Override public long getId() { return ledgerId; } @VisibleForTesting public EnumSet getWriteFlags() { return writeFlags; } /** * {@inheritDoc} */ @Override public synchronized long getLastAddConfirmed() { return lastAddConfirmed; } synchronized void setLastAddConfirmed(long lac) { this.lastAddConfirmed = lac; } /** * {@inheritDoc} */ @Override public synchronized long getLastAddPushed() { return lastAddPushed; } /** * Get the Ledger's key/password. * * @return byte array for the ledger's key/password. 
*/ public byte[] getLedgerKey() { return Arrays.copyOf(ledgerKey, ledgerKey.length); } /** * {@inheritDoc} */ @Override public LedgerMetadata getLedgerMetadata() { return versionedMetadata.getValue(); } Versioned getVersionedLedgerMetadata() { return versionedMetadata; } boolean setLedgerMetadata(Versioned expected, Versioned newMetadata) { synchronized (this) { // ensure that we only update the metadata if it is the object we expect it to be if (versionedMetadata == expected) { versionedMetadata = newMetadata; LedgerMetadata metadata = versionedMetadata.getValue(); if (metadata.isClosed()) { lastAddConfirmed = lastAddPushed = metadata.getLastEntryId(); length = metadata.getLength(); } return true; } else { return false; } } } /** * Get this ledger's customMetadata map. * * @return map containing user provided customMetadata. */ public Map getCustomMetadata() { return getLedgerMetadata().getCustomMetadata(); } /** * Get the number of fragments that makeup this ledger. * * @return the count of fragments */ public synchronized long getNumFragments() { return getLedgerMetadata().getAllEnsembles().size(); } /** * Get the count of unique bookies that own part of this ledger * by going over all the fragments of the ledger. * * @return count of unique bookies */ public synchronized long getNumBookies() { Map> m = getLedgerMetadata().getAllEnsembles(); Set s = Sets.newHashSet(); for (List aList : m.values()) { s.addAll(aList); } return s.size(); } /** * Get the DigestManager. * * @return DigestManager for the LedgerHandle */ DigestManager getDigestManager() { return macManager; } /** * Add to the length of the ledger in bytes. * * @param delta * @return the length of the ledger after the addition */ synchronized long addToLength(long delta) { this.length += delta; return this.length; } /** * Returns the length of the ledger in bytes. 
* * @return the length of the ledger in bytes */ @Override public synchronized long getLength() { return this.length; } /** * Returns the ledger creation time. * * @return the ledger creation time */ public long getCtime() { return getLedgerMetadata().getCtime(); } /** * Get the Distribution Schedule. * * @return DistributionSchedule for the LedgerHandle */ DistributionSchedule getDistributionSchedule() { return distributionSchedule; } /** * Get the health info for bookies for this ledger. * * @return BookiesHealthInfo for every bookie in the write set. */ BookiesHealthInfo getBookiesHealthInfo() { return bookiesHealthInfo; } /** * {@inheritDoc} */ @Override public void close() throws InterruptedException, BKException { SyncCallbackUtils.waitForResult(closeAsync()); } /** * {@inheritDoc} */ @Override public CompletableFuture closeAsync() { CompletableFuture result = new CompletableFuture<>(); SyncCloseCallback callback = new SyncCloseCallback(result); asyncClose(callback, null); return result; } /** * Asynchronous close, any adds in flight will return errors. * *

Closing a ledger will ensure that all clients agree on what the last entry * of the ledger is. This ensures that, once the ledger has been closed, all * reads from the ledger will return the same set of entries. * * @param cb * callback implementation * @param ctx * control object */ public void asyncClose(CloseCallback cb, Object ctx) { asyncCloseInternal(cb, ctx, BKException.Code.LedgerClosedException); } /** * {@inheritDoc} */ @Override public synchronized boolean isClosed() { return getLedgerMetadata().isClosed(); } boolean isHandleWritable() { return !getLedgerMetadata().isClosed() && handleState == HandleState.OPEN; } void asyncCloseInternal(final CloseCallback cb, final Object ctx, final int rc) { try { doAsyncCloseInternal(cb, ctx, rc); } catch (RejectedExecutionException re) { if (LOG.isDebugEnabled()) { LOG.debug("Failed to close ledger {} : ", ledgerId, re); } errorOutPendingAdds(BookKeeper.getReturnRc(clientCtx.getBookieClient(), rc)); cb.closeComplete(BookKeeper.getReturnRc(clientCtx.getBookieClient(), BKException.Code.InterruptedException), this, ctx); } } /** * Same as public version of asyncClose except that this one takes an * additional parameter which is the return code to hand to all the pending * add ops. 
* * @param cb * @param ctx * @param rc */ void doAsyncCloseInternal(final CloseCallback cb, final Object ctx, final int rc) { executeOrdered(() -> { final HandleState prevHandleState; final List pendingAdds; final long lastEntry; final long finalLength; closePromise.whenComplete((ignore, ex) -> { if (ex != null) { cb.closeComplete( BKException.getExceptionCode(ex, BKException.Code.UnexpectedConditionException), LedgerHandle.this, ctx); } else { cb.closeComplete(BKException.Code.OK, LedgerHandle.this, ctx); } }); synchronized (LedgerHandle.this) { prevHandleState = handleState; // drain pending adds first pendingAdds = drainPendingAddsAndAdjustLength(); // taking the length must occur after draining, as draining changes the length lastEntry = lastAddPushed = LedgerHandle.this.lastAddConfirmed; finalLength = LedgerHandle.this.length; handleState = HandleState.CLOSED; } // error out all pending adds during closing, the callbacks shouldn't be // running under any bk locks. try { errorOutPendingAdds(rc, pendingAdds); } catch (Throwable e) { closePromise.completeExceptionally(e); return; } if (prevHandleState != HandleState.CLOSED) { if (LOG.isDebugEnabled()) { LOG.debug("Closing ledger: {} at entryId {} with {} bytes", getId(), lastEntry, finalLength); } tearDownWriteHandleState(); new MetadataUpdateLoop( clientCtx.getLedgerManager(), getId(), LedgerHandle.this::getVersionedLedgerMetadata, (metadata) -> { if (metadata.isClosed()) { /* If the ledger has been closed with the same lastEntry * and length that we planned to close with, we have nothing to do, * so just return success */ if (lastEntry == metadata.getLastEntryId() && finalLength == metadata.getLength()) { return false; } else { LOG.error("Metadata conflict when closing ledger {}." + " Another client may have recovered the ledger while " + "there" + " were writes outstanding. 
(local lastEntry:{} " + "length:{}) " + " (metadata lastEntry:{} length:{})", getId(), lastEntry, finalLength, metadata.getLastEntryId(), metadata.getLength()); throw new BKException.BKMetadataVersionException(); } } else { return true; } }, (metadata) -> { return LedgerMetadataBuilder.from(metadata) .withClosedState().withLastEntryId(lastEntry) .withLength(finalLength).build(); }, LedgerHandle.this::setLedgerMetadata) .run().whenComplete((metadata, ex) -> { if (ex != null) { closePromise.completeExceptionally(ex); } else { FutureUtils.complete(closePromise, null); } }); } } ); } /** * Read a sequence of entries synchronously. * * @param firstEntry * id of first entry of sequence (included) * @param lastEntry * id of last entry of sequence (included) * * @see #asyncReadEntries(long, long, ReadCallback, Object) */ public Enumeration readEntries(long firstEntry, long lastEntry) throws InterruptedException, BKException { CompletableFuture> result = new CompletableFuture<>(); asyncReadEntries(firstEntry, lastEntry, new SyncReadCallback(result), null); return SyncCallbackUtils.waitForResult(result); } /** * Read a sequence of entries synchronously, allowing to read after the LastAddConfirmed range.
* This is the same of * {@link #asyncReadUnconfirmedEntries(long, long, ReadCallback, Object) } * * @param firstEntry * id of first entry of sequence (included) * @param lastEntry * id of last entry of sequence (included) * * @see #readEntries(long, long) * @see #asyncReadUnconfirmedEntries(long, long, ReadCallback, java.lang.Object) * @see #asyncReadLastConfirmed(ReadLastConfirmedCallback, java.lang.Object) */ public Enumeration readUnconfirmedEntries(long firstEntry, long lastEntry) throws InterruptedException, BKException { CompletableFuture> result = new CompletableFuture<>(); asyncReadUnconfirmedEntries(firstEntry, lastEntry, new SyncReadCallback(result), null); return SyncCallbackUtils.waitForResult(result); } /** * Read a sequence of entries asynchronously. * * @param firstEntry * id of first entry of sequence * @param lastEntry * id of last entry of sequence * @param cb * object implementing read callback interface * @param ctx * control object */ public void asyncReadEntries(long firstEntry, long lastEntry, ReadCallback cb, Object ctx) { // Little sanity check if (firstEntry < 0 || firstEntry > lastEntry) { LOG.error("IncorrectParameterException on ledgerId:{} firstEntry:{} lastEntry:{}", ledgerId, firstEntry, lastEntry); cb.readComplete(BKException.Code.IncorrectParameterException, this, null, ctx); return; } if (lastEntry > lastAddConfirmed) { LOG.error("ReadEntries exception on ledgerId:{} firstEntry:{} lastEntry:{} lastAddConfirmed:{}", ledgerId, firstEntry, lastEntry, lastAddConfirmed); cb.readComplete(BKException.Code.ReadException, this, null, ctx); return; } asyncReadEntriesInternal(firstEntry, lastEntry, cb, ctx, false); } /** * Read a sequence of entries asynchronously, allowing to read after the LastAddConfirmed range. *
This is the same of * {@link #asyncReadEntries(long, long, ReadCallback, Object) } * but it lets the client read without checking the local value of LastAddConfirmed, so that it is possibile to * read entries for which the writer has not received the acknowledge yet.
* For entries which are within the range 0..LastAddConfirmed BookKeeper guarantees that the writer has successfully * received the acknowledge.
* For entries outside that range it is possible that the writer never received the acknowledge * and so there is the risk that the reader is seeing entries before the writer and this could result in * a consistency issue in some cases.
* With this method you can even read entries before the LastAddConfirmed and entries after it with one call, * the expected consistency will be as described above for each subrange of ids. * * @param firstEntry * id of first entry of sequence * @param lastEntry * id of last entry of sequence * @param cb * object implementing read callback interface * @param ctx * control object * * @see #asyncReadEntries(long, long, ReadCallback, Object) * @see #asyncReadLastConfirmed(ReadLastConfirmedCallback, Object) * @see #readUnconfirmedEntries(long, long) */ public void asyncReadUnconfirmedEntries(long firstEntry, long lastEntry, ReadCallback cb, Object ctx) { // Little sanity check if (firstEntry < 0 || firstEntry > lastEntry) { LOG.error("IncorrectParameterException on ledgerId:{} firstEntry:{} lastEntry:{}", ledgerId, firstEntry, lastEntry); cb.readComplete(BKException.Code.IncorrectParameterException, this, null, ctx); return; } asyncReadEntriesInternal(firstEntry, lastEntry, cb, ctx, false); } /** * Read a sequence of entries asynchronously. * * @param firstEntry * id of first entry of sequence * @param lastEntry * id of last entry of sequence */ @Override public CompletableFuture readAsync(long firstEntry, long lastEntry) { // Little sanity check if (firstEntry < 0 || firstEntry > lastEntry) { LOG.error("IncorrectParameterException on ledgerId:{} firstEntry:{} lastEntry:{}", ledgerId, firstEntry, lastEntry); return FutureUtils.exception(new BKIncorrectParameterException()); } if (lastEntry > lastAddConfirmed) { LOG.error("ReadAsync exception on ledgerId:{} firstEntry:{} lastEntry:{} lastAddConfirmed:{}", ledgerId, firstEntry, lastEntry, lastAddConfirmed); return FutureUtils.exception(new BKReadException()); } return readEntriesInternalAsync(firstEntry, lastEntry, false); } /** * Read a sequence of entries asynchronously, allowing to read after the LastAddConfirmed range. *
This is the same of * {@link #asyncReadEntries(long, long, ReadCallback, Object) } * but it lets the client read without checking the local value of LastAddConfirmed, so that it is possibile to * read entries for which the writer has not received the acknowledge yet.
* For entries which are within the range 0..LastAddConfirmed BookKeeper guarantees that the writer has successfully * received the acknowledge.
* For entries outside that range it is possible that the writer never received the acknowledge * and so there is the risk that the reader is seeing entries before the writer and this could result in * a consistency issue in some cases.
* With this method you can even read entries before the LastAddConfirmed and entries after it with one call, * the expected consistency will be as described above for each subrange of ids. * * @param firstEntry * id of first entry of sequence * @param lastEntry * id of last entry of sequence * * @see #asyncReadEntries(long, long, ReadCallback, Object) * @see #asyncReadLastConfirmed(ReadLastConfirmedCallback, Object) * @see #readUnconfirmedEntries(long, long) */ @Override public CompletableFuture readUnconfirmedAsync(long firstEntry, long lastEntry) { // Little sanity check if (firstEntry < 0 || firstEntry > lastEntry) { LOG.error("IncorrectParameterException on ledgerId:{} firstEntry:{} lastEntry:{}", ledgerId, firstEntry, lastEntry); return FutureUtils.exception(new BKIncorrectParameterException()); } return readEntriesInternalAsync(firstEntry, lastEntry, false); } void asyncReadEntriesInternal(long firstEntry, long lastEntry, ReadCallback cb, Object ctx, boolean isRecoveryRead) { if (!clientCtx.isClientClosed()) { readEntriesInternalAsync(firstEntry, lastEntry, isRecoveryRead) .whenCompleteAsync(new FutureEventListener() { @Override public void onSuccess(LedgerEntries entries) { cb.readComplete( Code.OK, LedgerHandle.this, IteratorUtils.asEnumeration( Iterators.transform(entries.iterator(), le -> { LedgerEntry entry = new LedgerEntry((LedgerEntryImpl) le); le.close(); return entry; })), ctx); } @Override public void onFailure(Throwable cause) { if (cause instanceof BKException) { BKException bke = (BKException) cause; cb.readComplete(bke.getCode(), LedgerHandle.this, null, ctx); } else { cb.readComplete(Code.UnexpectedConditionException, LedgerHandle.this, null, ctx); } } }, clientCtx.getMainWorkerPool().chooseThread(ledgerId)); } else { cb.readComplete(Code.ClientClosedException, LedgerHandle.this, null, ctx); } } /* * Read the last entry in the ledger * * @param cb * object implementing read callback interface * @param ctx * control object */ public void 
asyncReadLastEntry(ReadCallback cb, Object ctx) { long lastEntryId = getLastAddConfirmed(); if (lastEntryId < 0) { // Ledger was empty, so there is no last entry to read cb.readComplete(BKException.Code.NoSuchEntryException, this, null, ctx); } else { asyncReadEntriesInternal(lastEntryId, lastEntryId, cb, ctx, false); } } public LedgerEntry readLastEntry() throws InterruptedException, BKException { long lastEntryId = getLastAddConfirmed(); if (lastEntryId < 0) { // Ledger was empty, so there is no last entry to read throw new BKException.BKNoSuchEntryException(); } else { CompletableFuture> result = new CompletableFuture<>(); asyncReadEntries(lastEntryId, lastEntryId, new SyncReadCallback(result), null); return SyncCallbackUtils.waitForResult(result).nextElement(); } } CompletableFuture readEntriesInternalAsync(long firstEntry, long lastEntry, boolean isRecoveryRead) { PendingReadOp op = new PendingReadOp(this, clientCtx, firstEntry, lastEntry, isRecoveryRead); if (!clientCtx.isClientClosed()) { // Waiting on the first one. // This is not very helpful if there are multiple ensembles or if bookie goes into unresponsive // state later after N requests sent. // Unfortunately it seems that alternatives are: // - send reads one-by-one (up to the app) // - rework LedgerHandle to send requests one-by-one (maybe later, potential perf impact) // - block worker pool (not good) // Even with this implementation one should be more concerned about OOME when all read responses arrive // or about overloading bookies with these requests then about submission of many small requests. // Naturally one of the solutions would be to submit smaller batches and in this case // current implementation will prevent next batch from starting when bookie is // unresponsive thus helpful enough. 
if (clientCtx.getConf().waitForWriteSetMs >= 0) { DistributionSchedule.WriteSet ws = distributionSchedule.getWriteSet(firstEntry); try { if (!waitForWritable(ws, ws.size() - 1, clientCtx.getConf().waitForWriteSetMs)) { op.allowFailFastOnUnwritableChannel(); } } finally { ws.recycle(); } } if (isHandleWritable()) { // Ledger handle in read/write mode: submit to OSE for ordered execution. executeOrdered(op); } else { // Read-only ledger handle: bypass OSE and execute read directly in client thread. // This avoids a context-switch to OSE thread and thus reduces latency. op.run(); } } else { op.future().completeExceptionally(BKException.create(ClientClosedException)); } return op.future(); } /** * Add entry synchronously to an open ledger. * * @param data * array of bytes to be written to the ledger * do not reuse the buffer, bk-client will release it appropriately * @return the entryId of the new inserted entry */ public long addEntry(byte[] data) throws InterruptedException, BKException { return addEntry(data, 0, data.length); } /** * {@inheritDoc} */ @Override public CompletableFuture appendAsync(ByteBuf data) { SyncAddCallback callback = new SyncAddCallback(); asyncAddEntry(data, callback, null); return callback; } /** * Add entry synchronously to an open ledger. This can be used only with * {@link LedgerHandleAdv} returned through ledgers created with {@link * BookKeeper#createLedgerAdv(int, int, int, BookKeeper.DigestType, byte[])}. * * * @param entryId * entryId to be added * @param data * array of bytes to be written to the ledger * do not reuse the buffer, bk-client will release it appropriately * @return the entryId of the new inserted entry */ public long addEntry(final long entryId, byte[] data) throws InterruptedException, BKException { LOG.error("To use this feature Ledger must be created with createLedgerAdv interface."); throw BKException.create(BKException.Code.IllegalOpException); } /** * Add entry synchronously to an open ledger. 
* * @param data * array of bytes to be written to the ledger * do not reuse the buffer, bk-client will release it appropriately * @param offset * offset from which to take bytes from data * @param length * number of bytes to take from data * @return the entryId of the new inserted entry */ public long addEntry(byte[] data, int offset, int length) throws InterruptedException, BKException { if (LOG.isDebugEnabled()) { LOG.debug("Adding entry {}", data); } SyncAddCallback callback = new SyncAddCallback(); asyncAddEntry(data, offset, length, callback, null); return SyncCallbackUtils.waitForResult(callback); } /** * Add entry synchronously to an open ledger. This can be used only with * {@link LedgerHandleAdv} returned through ledgers created with {@link * BookKeeper#createLedgerAdv(int, int, int, BookKeeper.DigestType, byte[])}. * * @param entryId * entryId to be added. * @param data * array of bytes to be written to the ledger * do not reuse the buffer, bk-client will release it appropriately * @param offset * offset from which to take bytes from data * @param length * number of bytes to take from data * @return entryId */ public long addEntry(final long entryId, byte[] data, int offset, int length) throws InterruptedException, BKException { LOG.error("To use this feature Ledger must be created with createLedgerAdv() interface."); throw BKException.create(BKException.Code.IllegalOpException); } /** * Add entry asynchronously to an open ledger. * * @param data * array of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param cb * object implementing callbackinterface * @param ctx * some control object */ public void asyncAddEntry(final byte[] data, final AddCallback cb, final Object ctx) { asyncAddEntry(data, 0, data.length, cb, ctx); } /** * Add entry asynchronously to an open ledger. 
This can be used only with * {@link LedgerHandleAdv} returned through ledgers created with {@link * BookKeeper#createLedgerAdv(int, int, int, BookKeeper.DigestType, byte[])}. * * @param entryId * entryId to be added * @param data * array of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param cb * object implementing callbackinterface * @param ctx * some control object */ public void asyncAddEntry(final long entryId, final byte[] data, final AddCallback cb, final Object ctx) { LOG.error("To use this feature Ledger must be created with createLedgerAdv() interface."); cb.addCompleteWithLatency(BKException.Code.IllegalOpException, LedgerHandle.this, entryId, 0, ctx); } /** * Add entry asynchronously to an open ledger, using an offset and range. * * @param data * array of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param offset * offset from which to take bytes from data * @param length * number of bytes to take from data * @param cb * object implementing callbackinterface * @param ctx * some control object * @throws ArrayIndexOutOfBoundsException if offset or length is negative or * offset and length sum to a value higher than the length of data. */ public void asyncAddEntry(final byte[] data, final int offset, final int length, final AddCallback cb, final Object ctx) { if (offset < 0 || length < 0 || (offset + length) > data.length) { throw new ArrayIndexOutOfBoundsException( "Invalid values for offset(" + offset + ") or length(" + length + ")"); } asyncAddEntry(Unpooled.wrappedBuffer(data, offset, length), cb, ctx); } public void asyncAddEntry(ByteBuf data, final AddCallback cb, final Object ctx) { PendingAddOp op = PendingAddOp.create(this, clientCtx, getCurrentEnsemble(), data, writeFlags, cb, ctx); doAsyncAddEntry(op); } /** * Add entry asynchronously to an open ledger, using an offset and range. 
* This can be used only with {@link LedgerHandleAdv} returned through * ledgers created with * {@link BookKeeper#createLedgerAdv(int, int, int, BookKeeper.DigestType, byte[])}. * * @param entryId * entryId of the entry to add. * @param data * array of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param offset * offset from which to take bytes from data * @param length * number of bytes to take from data * @param cb * object implementing callbackinterface * @param ctx * some control object * @throws ArrayIndexOutOfBoundsException * if offset or length is negative or offset and length sum to a * value higher than the length of data. */ public void asyncAddEntry(final long entryId, final byte[] data, final int offset, final int length, final AddCallback cb, final Object ctx) { LOG.error("To use this feature Ledger must be created with createLedgerAdv() interface."); cb.addCompleteWithLatency(BKException.Code.IllegalOpException, LedgerHandle.this, entryId, 0, ctx); } /** * Add entry asynchronously to an open ledger, using an offset and range. * * @param entryId * entryId of the entry to add * @param data * array of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param offset * offset from which to take bytes from data * @param length * number of bytes to take from data * @param cb * object implementing callbackinterface * @param ctx * some control object * @throws ArrayIndexOutOfBoundsException * if offset or length is negative or offset and length sum to a * value higher than the length of data. 
*/ public void asyncAddEntry(final long entryId, final byte[] data, final int offset, final int length, final AddCallbackWithLatency cb, final Object ctx) { LOG.error("To use this feature Ledger must be created with createLedgerAdv() interface."); cb.addCompleteWithLatency(BKException.Code.IllegalOpException, LedgerHandle.this, entryId, 0, ctx); } /** * Add entry asynchronously to an open ledger, using an offset and range. * This can be used only with {@link LedgerHandleAdv} returned through * ledgers created with {@link BookKeeper#createLedgerAdv(int, int, int, BookKeeper.DigestType, byte[])}. * * @param entryId * entryId of the entry to add. * @param data * io.netty.buffer.ByteBuf of bytes to be written * do not reuse the buffer, bk-client will release it appropriately * @param cb * object implementing callbackinterface * @param ctx * some control object */ public void asyncAddEntry(final long entryId, ByteBuf data, final AddCallbackWithLatency cb, final Object ctx) { LOG.error("To use this feature Ledger must be created with createLedgerAdv() interface."); cb.addCompleteWithLatency(BKException.Code.IllegalOpException, LedgerHandle.this, entryId, 0, ctx); } /** * {@inheritDoc} */ @Override public CompletableFuture force() { CompletableFuture result = new CompletableFuture<>(); ForceLedgerOp op = new ForceLedgerOp(this, clientCtx.getBookieClient(), getCurrentEnsemble(), result); boolean wasClosed = false; synchronized (this) { // synchronized on this to ensure that // the ledger isn't closed between checking and // updating lastAddPushed if (!isHandleWritable()) { wasClosed = true; } } if (wasClosed) { // make sure the callback is triggered in main worker pool try { executeOrdered(new Runnable() { @Override public void run() { LOG.warn("Force() attempted on a closed ledger: {}", ledgerId); result.completeExceptionally(new BKException.BKLedgerClosedException()); } @Override public String toString() { return String.format("force(lid=%d)", ledgerId); } }); } catch 
(RejectedExecutionException e) { result.completeExceptionally(new BKException.BKInterruptedException()); } return result; } // early exit: no write has been issued yet if (pendingAddsSequenceHead == INVALID_ENTRY_ID) { executeOrdered(new Runnable() { @Override public void run() { FutureUtils.complete(result, null); } @Override public String toString() { return String.format("force(lid=%d)", ledgerId); } }); return result; } try { executeOrdered(op); } catch (RejectedExecutionException e) { result.completeExceptionally(new BKException.BKInterruptedException()); } return result; } /** * Make a recovery add entry request. Recovery adds can add to a ledger even * if it has been fenced. * *

This is only valid for bookie and ledger recovery, which may need to replicate * entries to a quorum of bookies to ensure data safety. * *

Normal client should never call this method. */ void asyncRecoveryAddEntry(final byte[] data, final int offset, final int length, final AddCallback cb, final Object ctx) { PendingAddOp op = PendingAddOp.create(this, clientCtx, getCurrentEnsemble(), Unpooled.wrappedBuffer(data, offset, length), writeFlags, cb, ctx) .enableRecoveryAdd(); doAsyncAddEntry(op); } private boolean isWriteSetWritable(DistributionSchedule.WriteSet writeSet, int allowedNonWritableCount) { if (allowedNonWritableCount < 0) { allowedNonWritableCount = 0; } final int sz = writeSet.size(); final int requiredWritable = sz - allowedNonWritableCount; int nonWritableCount = 0; List currentEnsemble = getCurrentEnsemble(); for (int i = 0; i < sz; i++) { int writeBookieIndex = writeSet.get(i); if (writeBookieIndex < currentEnsemble.size() && !clientCtx.getBookieClient().isWritable(currentEnsemble.get(writeBookieIndex), ledgerId)) { nonWritableCount++; if (nonWritableCount >= allowedNonWritableCount) { return false; } } else { final int knownWritable = i - nonWritableCount; if (knownWritable >= requiredWritable) { return true; } } } return true; } @VisibleForTesting protected boolean waitForWritable(DistributionSchedule.WriteSet writeSet, int allowedNonWritableCount, long durationMs) { if (durationMs < 0) { return true; } final long startTime = MathUtils.nowInNano(); boolean writableResult = isWriteSetWritable(writeSet, allowedNonWritableCount); if (!writableResult && durationMs > 0) { int backoff = 1; final int maxBackoff = 4; final long deadline = startTime + TimeUnit.MILLISECONDS.toNanos(durationMs); while (!(writableResult = isWriteSetWritable(writeSet, allowedNonWritableCount))) { if (MathUtils.nowInNano() < deadline) { long maxSleep = MathUtils.elapsedMSec(startTime); if (maxSleep < 0) { maxSleep = 1; } long sleepMs = Math.min(backoff, maxSleep); try { TimeUnit.MILLISECONDS.sleep(sleepMs); } catch (InterruptedException e) { Thread.currentThread().interrupt(); writableResult = 
isWriteSetWritable(writeSet, allowedNonWritableCount); break; } if (backoff <= maxBackoff) { backoff++; } } else { writableResult = false; break; } } if (backoff > 1) { LOG.info("Spent {} ms waiting for {} writable channels, writable result {}", MathUtils.elapsedMSec(startTime), writeSet.size() - allowedNonWritableCount, writableResult); } } if (writableResult) { clientChannelWriteWaitStats.registerSuccessfulEvent( MathUtils.elapsedNanos(startTime), TimeUnit.NANOSECONDS); } else { clientChannelWriteWaitStats.registerFailedEvent( MathUtils.elapsedNanos(startTime), TimeUnit.NANOSECONDS); } return writableResult; } protected void doAsyncAddEntry(final PendingAddOp op) { if (throttler != null) { throttler.acquire(); } boolean wasClosed = false; synchronized (this) { // synchronized on this to ensure that // the ledger isn't closed between checking and // updating lastAddPushed if (isHandleWritable()) { long entryId = ++lastAddPushed; long currentLedgerLength = addToLength(op.payload.readableBytes()); op.setEntryId(entryId); op.setLedgerLength(currentLedgerLength); pendingAddOps.add(op); } else { wasClosed = true; } } if (wasClosed) { // make sure the callback is triggered in main worker pool try { executeOrdered(new Runnable() { @Override public void run() { LOG.warn("Attempt to add to closed ledger: {}", ledgerId); op.cb.addCompleteWithLatency(BKException.Code.LedgerClosedException, LedgerHandle.this, INVALID_ENTRY_ID, 0, op.ctx); op.recyclePendAddOpObject(); } @Override public String toString() { return String.format("AsyncAddEntryToClosedLedger(lid=%d)", ledgerId); } }); } catch (RejectedExecutionException e) { op.cb.addCompleteWithLatency(BookKeeper.getReturnRc(clientCtx.getBookieClient(), BKException.Code.InterruptedException), LedgerHandle.this, INVALID_ENTRY_ID, 0, op.ctx); op.recyclePendAddOpObject(); } return; } if (clientCtx.getConf().waitForWriteSetMs >= 0) { DistributionSchedule.WriteSet ws = distributionSchedule.getWriteSet(op.getEntryId()); try { if 
(!waitForWritable(ws, 0, clientCtx.getConf().waitForWriteSetMs)) { op.allowFailFastOnUnwritableChannel(); } } finally { ws.recycle(); } } op.initiate(); } synchronized void updateLastConfirmed(long lac, long len) { if (lac > lastAddConfirmed) { lastAddConfirmed = lac; lacUpdateHitsCounter.inc(); } else { lacUpdateMissesCounter.inc(); } lastAddPushed = Math.max(lastAddPushed, lac); length = Math.max(length, len); } /** * Obtains asynchronously the last confirmed write from a quorum of bookies. This * call obtains the last add confirmed each bookie has received for this ledger * and returns the maximum. If the ledger has been closed, the value returned by this * call may not correspond to the id of the last entry of the ledger, since it reads * the hint of bookies. Consequently, in the case the ledger has been closed, it may * return a different value than getLastAddConfirmed, which returns the local value * of the ledger handle. * * @see #getLastAddConfirmed() * * @param cb * @param ctx */ public void asyncReadLastConfirmed(final ReadLastConfirmedCallback cb, final Object ctx) { if (clientCtx.getConf().useV2WireProtocol) { // in v2 protocol we don't support readLAC RPC asyncReadPiggybackLastConfirmed(cb, ctx); } else { asyncReadExplicitLastConfirmed(cb, ctx); } } private void asyncReadPiggybackLastConfirmed(final ReadLastConfirmedCallback cb, final Object ctx) { boolean isClosed; long lastEntryId; synchronized (this) { LedgerMetadata metadata = getLedgerMetadata(); isClosed = metadata.isClosed(); lastEntryId = metadata.getLastEntryId(); } if (isClosed) { cb.readLastConfirmedComplete(BKException.Code.OK, lastEntryId, ctx); return; } ReadLastConfirmedOp.LastConfirmedDataCallback innercb = new ReadLastConfirmedOp.LastConfirmedDataCallback() { @Override public void readLastConfirmedDataComplete(int rc, DigestManager.RecoveryData data) { if (rc == BKException.Code.OK) { updateLastConfirmed(data.getLastAddConfirmed(), data.getLength()); cb.readLastConfirmedComplete(rc, 
data.getLastAddConfirmed(), ctx); } else { cb.readLastConfirmedComplete(rc, INVALID_ENTRY_ID, ctx); } } }; new ReadLastConfirmedOp(clientCtx.getBookieClient(), distributionSchedule, macManager, ledgerId, getCurrentEnsemble(), ledgerKey, innercb).initiate(); } /** * Obtains asynchronously the last confirmed write from a quorum of bookies. * It is similar as * {@link #asyncReadLastConfirmed(org.apache.bookkeeper.client.AsyncCallback.ReadLastConfirmedCallback, Object)}, * but it doesn't wait all the responses from the quorum. It would callback * immediately if it received a LAC which is larger than current LAC. * * @see #asyncTryReadLastConfirmed(org.apache.bookkeeper.client.AsyncCallback.ReadLastConfirmedCallback, Object) * * @param cb * callback to return read last confirmed * @param ctx * callback context */ public void asyncTryReadLastConfirmed(final ReadLastConfirmedCallback cb, final Object ctx) { boolean isClosed; long lastEntryId; synchronized (this) { LedgerMetadata metadata = getLedgerMetadata(); isClosed = metadata.isClosed(); lastEntryId = metadata.getLastEntryId(); } if (isClosed) { cb.readLastConfirmedComplete(BKException.Code.OK, lastEntryId, ctx); return; } ReadLastConfirmedOp.LastConfirmedDataCallback innercb = new ReadLastConfirmedOp.LastConfirmedDataCallback() { AtomicBoolean completed = new AtomicBoolean(false); @Override public void readLastConfirmedDataComplete(int rc, DigestManager.RecoveryData data) { if (rc == BKException.Code.OK) { updateLastConfirmed(data.getLastAddConfirmed(), data.getLength()); if (completed.compareAndSet(false, true)) { cb.readLastConfirmedComplete(rc, data.getLastAddConfirmed(), ctx); } } else { if (completed.compareAndSet(false, true)) { cb.readLastConfirmedComplete(rc, INVALID_ENTRY_ID, ctx); } } } }; new TryReadLastConfirmedOp(this, clientCtx.getBookieClient(), getCurrentEnsemble(), innercb, getLastAddConfirmed()).initiate(); } /** * {@inheritDoc} */ @Override public CompletableFuture tryReadLastAddConfirmedAsync() { 
FutureReadLastConfirmed result = new FutureReadLastConfirmed(); asyncTryReadLastConfirmed(result, null); return result; } /** * {@inheritDoc} */ @Override public CompletableFuture readLastAddConfirmedAsync() { FutureReadLastConfirmed result = new FutureReadLastConfirmed(); asyncReadLastConfirmed(result, null); return result; } /** * {@inheritDoc} */ @Override public CompletableFuture readLastAddConfirmedAndEntryAsync(long entryId, long timeOutInMillis, boolean parallel) { FutureReadLastConfirmedAndEntry result = new FutureReadLastConfirmedAndEntry(); asyncReadLastConfirmedAndEntry(entryId, timeOutInMillis, parallel, result, null); return result; } /** * Asynchronous read next entry and the latest last add confirmed. * If the next entryId is less than known last add confirmed, the call will read next entry directly. * If the next entryId is ahead of known last add confirmed, the call will issue a long poll read * to wait for the next entry entryId. * *

The callback will return the latest last add confirmed and next entry if it is available within timeout * period timeOutInMillis. * * @param entryId * next entry id to read * @param timeOutInMillis * timeout period to wait for the entry id to be available (for long poll only) * @param parallel * whether to issue the long poll reads in parallel * @param cb * callback to return the result * @param ctx * callback context */ public void asyncReadLastConfirmedAndEntry(final long entryId, final long timeOutInMillis, final boolean parallel, final AsyncCallback.ReadLastConfirmedAndEntryCallback cb, final Object ctx) { boolean isClosed; long lac; synchronized (this) { LedgerMetadata metadata = getLedgerMetadata(); isClosed = metadata.isClosed(); lac = metadata.getLastEntryId(); } if (isClosed) { if (entryId > lac) { cb.readLastConfirmedAndEntryComplete(BKException.Code.OK, lac, null, ctx); return; } } else { lac = getLastAddConfirmed(); } if (entryId <= lac) { asyncReadEntries(entryId, entryId, new ReadCallback() { @Override public void readComplete(int rc, LedgerHandle lh, Enumeration seq, Object ctx) { if (BKException.Code.OK == rc) { if (seq.hasMoreElements()) { cb.readLastConfirmedAndEntryComplete(rc, getLastAddConfirmed(), seq.nextElement(), ctx); } else { cb.readLastConfirmedAndEntryComplete(rc, getLastAddConfirmed(), null, ctx); } } else { cb.readLastConfirmedAndEntryComplete(rc, INVALID_ENTRY_ID, null, ctx); } } }, ctx); return; } // wait for entry entryId ReadLastConfirmedAndEntryOp.LastConfirmedAndEntryCallback innercb = new ReadLastConfirmedAndEntryOp.LastConfirmedAndEntryCallback() { AtomicBoolean completed = new AtomicBoolean(false); @Override public void readLastConfirmedAndEntryComplete(int rc, long lastAddConfirmed, LedgerEntry entry) { if (rc == BKException.Code.OK) { if (completed.compareAndSet(false, true)) { cb.readLastConfirmedAndEntryComplete(rc, lastAddConfirmed, entry, ctx); } } else { if (completed.compareAndSet(false, true)) { 
cb.readLastConfirmedAndEntryComplete(rc, INVALID_ENTRY_ID, null, ctx); } } } }; new ReadLastConfirmedAndEntryOp(this, clientCtx, getCurrentEnsemble(), innercb, entryId - 1, timeOutInMillis) .parallelRead(parallel) .initiate(); } /** * Context objects for synchronous call to read last confirmed. */ static class LastConfirmedCtx { static final long ENTRY_ID_PENDING = -10; long response; int rc; LastConfirmedCtx() { this.response = ENTRY_ID_PENDING; } void setLastConfirmed(long lastConfirmed) { this.response = lastConfirmed; } long getlastConfirmed() { return this.response; } void setRC(int rc) { this.rc = rc; } int getRC() { return this.rc; } boolean ready() { return (this.response != ENTRY_ID_PENDING); } } /** * Obtains synchronously the last confirmed write from a quorum of bookies. This call * obtains the last add confirmed each bookie has received for this ledger * and returns the maximum. If the ledger has been closed, the value returned by this * call may not correspond to the id of the last entry of the ledger, since it reads * the hint of bookies. Consequently, in the case the ledger has been closed, it may * return a different value than getLastAddConfirmed, which returns the local value * of the ledger handle. * * @see #getLastAddConfirmed() * * @return The entry id of the last confirmed write or {@link #INVALID_ENTRY_ID INVALID_ENTRY_ID} * if no entry has been confirmed * @throws InterruptedException * @throws BKException */ public long readLastConfirmed() throws InterruptedException, BKException { LastConfirmedCtx ctx = new LastConfirmedCtx(); asyncReadLastConfirmed(new SyncReadLastConfirmedCallback(), ctx); synchronized (ctx) { while (!ctx.ready()) { ctx.wait(); } } if (ctx.getRC() != BKException.Code.OK) { throw BKException.create(ctx.getRC()); } return ctx.getlastConfirmed(); } /** * Obtains synchronously the last confirmed write from a quorum of bookies. 
* It is similar as {@link #readLastConfirmed()}, but it doesn't wait all the responses * from the quorum. It would callback immediately if it received a LAC which is larger * than current LAC. * * @see #readLastConfirmed() * * @return The entry id of the last confirmed write or {@link #INVALID_ENTRY_ID INVALID_ENTRY_ID} * if no entry has been confirmed * @throws InterruptedException * @throws BKException */ public long tryReadLastConfirmed() throws InterruptedException, BKException { LastConfirmedCtx ctx = new LastConfirmedCtx(); asyncTryReadLastConfirmed(new SyncReadLastConfirmedCallback(), ctx); synchronized (ctx) { while (!ctx.ready()) { ctx.wait(); } } if (ctx.getRC() != BKException.Code.OK) { throw BKException.create(ctx.getRC()); } return ctx.getlastConfirmed(); } /** * Obtains asynchronously the explicit last add confirmed from a quorum of * bookies. This call obtains Explicit LAC value and piggy-backed LAC value (just like * {@link #asyncReadLastConfirmed(ReadLastConfirmedCallback, Object)}) from each * bookie in the ensemble and returns the maximum. * If in the write LedgerHandle, explicitLAC feature is not enabled then this call behavior * will be similar to {@link #asyncReadLastConfirmed(ReadLastConfirmedCallback, Object)}. * If the read explicit lastaddconfirmed is greater than getLastAddConfirmed, then it updates the * lastAddConfirmed of this ledgerhandle. If the ledger has been closed, it * returns the value of the last add confirmed from the metadata. 
* * @see #getLastAddConfirmed() * * @param cb * callback to return read explicit last confirmed * @param ctx * callback context */ public void asyncReadExplicitLastConfirmed(final ReadLastConfirmedCallback cb, final Object ctx) { boolean isClosed; synchronized (this) { LedgerMetadata metadata = getLedgerMetadata(); isClosed = metadata.isClosed(); if (isClosed) { lastAddConfirmed = metadata.getLastEntryId(); length = metadata.getLength(); } } if (isClosed) { cb.readLastConfirmedComplete(BKException.Code.OK, lastAddConfirmed, ctx); return; } PendingReadLacOp.LacCallback innercb = new PendingReadLacOp.LacCallback() { @Override public void getLacComplete(int rc, long lac) { if (rc == BKException.Code.OK) { // here we are trying to update lac only but not length updateLastConfirmed(lac, 0); cb.readLastConfirmedComplete(rc, lac, ctx); } else { cb.readLastConfirmedComplete(rc, INVALID_ENTRY_ID, ctx); } } }; new PendingReadLacOp(this, clientCtx.getBookieClient(), getCurrentEnsemble(), innercb).initiate(); } /* * Obtains synchronously the explicit last add confirmed from a quorum of * bookies. This call obtains Explicit LAC value and piggy-backed LAC value (just like * {@Link #readLastAddConfirmed()) from each bookie in the ensemble and returns the maximum. * If in the write LedgerHandle, explicitLAC feature is not enabled then this call behavior * will be similar to {@Link #readLastAddConfirmed()}. * If the read explicit lastaddconfirmed is greater than getLastAddConfirmed, then it updates the * lastAddConfirmed of this ledgerhandle. If the ledger has been closed, it * returns the value of the last add confirmed from the metadata. * * @see #getLastAddConfirmed() * * @return The entry id of the explicit last confirmed write or * {@link #INVALID_ENTRY_ID INVALID_ENTRY_ID} if no entry has been * confirmed. 
* @throws InterruptedException * @throws BKException */ public long readExplicitLastConfirmed() throws InterruptedException, BKException { LastConfirmedCtx ctx = new LastConfirmedCtx(); asyncReadExplicitLastConfirmed(new SyncReadLastConfirmedCallback(), ctx); synchronized (ctx) { while (!ctx.ready()) { ctx.wait(); } } if (ctx.getRC() != BKException.Code.OK) { throw BKException.create(ctx.getRC()); } return ctx.getlastConfirmed(); } // close the ledger and send fails to all the adds in the pipeline void handleUnrecoverableErrorDuringAdd(int rc) { if (getLedgerMetadata().getState() == LedgerMetadata.State.IN_RECOVERY) { // we should not close ledger if ledger is recovery mode // otherwise we may lose entry. errorOutPendingAdds(rc); return; } LOG.error("Closing ledger {} due to {}", ledgerId, BKException.codeLogger(rc)); asyncCloseInternal(NoopCloseCallback.instance, null, rc); } private void monitorPendingAddOps() { int timedOut = 0; for (PendingAddOp op : pendingAddOps) { if (op.maybeTimeout()) { timedOut++; } } if (timedOut > 0) { LOG.info("Timed out {} add ops", timedOut); } } void errorOutPendingAdds(int rc) { errorOutPendingAdds(rc, drainPendingAddsAndAdjustLength()); } synchronized List drainPendingAddsAndAdjustLength() { PendingAddOp pendingAddOp; List opsDrained = new ArrayList(pendingAddOps.size()); while ((pendingAddOp = pendingAddOps.poll()) != null) { addToLength(-pendingAddOp.entryLength); opsDrained.add(pendingAddOp); } return opsDrained; } void errorOutPendingAdds(int rc, List ops) { for (PendingAddOp op : ops) { op.submitCallback(rc); } } void sendAddSuccessCallbacks() { // Start from the head of the queue and proceed while there are // entries that have had all their responses come back PendingAddOp pendingAddOp; while ((pendingAddOp = pendingAddOps.peek()) != null && !changingEnsemble) { if (!pendingAddOp.completed) { if (LOG.isDebugEnabled()) { LOG.debug("pending add not completed: {}", pendingAddOp); } return; } // Check if it is the next entry in 
the sequence. if (pendingAddOp.entryId != 0 && pendingAddOp.entryId != pendingAddsSequenceHead + 1) { if (LOG.isDebugEnabled()) { LOG.debug("Head of the queue entryId: {} is not the expected value: {}", pendingAddOp.entryId, pendingAddsSequenceHead + 1); } return; } pendingAddOps.remove(); explicitLacFlushPolicy.updatePiggyBackedLac(lastAddConfirmed); pendingAddsSequenceHead = pendingAddOp.entryId; if (!writeFlags.contains(WriteFlag.DEFERRED_SYNC)) { this.lastAddConfirmed = pendingAddsSequenceHead; } pendingAddOp.submitCallback(BKException.Code.OK); } } @VisibleForTesting boolean hasDelayedWriteFailedBookies() { return !delayedWriteFailedBookies.isEmpty(); } void notifyWriteFailed(int index, BookieId addr) { synchronized (metadataLock) { delayedWriteFailedBookies.put(index, addr); } } void maybeHandleDelayedWriteBookieFailure() { synchronized (metadataLock) { if (delayedWriteFailedBookies.isEmpty()) { return; } Map toReplace = new HashMap<>(delayedWriteFailedBookies); delayedWriteFailedBookies.clear(); // Original intent of this change is to do a best-effort ensemble change. // But this is not possible until the local metadata is completely immutable. // Until the feature "Make LedgerMetadata Immutable #610" Is complete we will use // handleBookieFailure() to handle delayed writes as regular bookie failures. handleBookieFailure(toReplace); } } void handleBookieFailure(final Map failedBookies) { if (clientCtx.getConf().disableEnsembleChangeFeature.isAvailable()) { if (LOG.isDebugEnabled()) { LOG.debug("Ensemble change is disabled. Retry sending to failed bookies {} for ledger {}.", failedBookies, ledgerId); } executeOrdered(() -> unsetSuccessAndSendWriteRequest(getCurrentEnsemble(), failedBookies.keySet())); return; } if (writeFlags.contains(WriteFlag.DEFERRED_SYNC)) { if (LOG.isDebugEnabled()) { LOG.debug("Cannot perform ensemble change with write flags {}. 
" + "Failed bookies {} for ledger {}.", writeFlags, failedBookies, ledgerId); } handleUnrecoverableErrorDuringAdd(WriteException); return; } boolean triggerLoop = false; Map toReplace = null; List origEnsemble = null; synchronized (metadataLock) { if (changingEnsemble) { delayedWriteFailedBookies.putAll(failedBookies); } else { changingEnsemble = true; triggerLoop = true; toReplace = new HashMap<>(delayedWriteFailedBookies); delayedWriteFailedBookies.clear(); toReplace.putAll(failedBookies); origEnsemble = getCurrentEnsemble(); } } if (triggerLoop) { ensembleChangeLoop(origEnsemble, toReplace); } } void ensembleChangeLoop(List origEnsemble, Map failedBookies) { int ensembleChangeId = numEnsembleChanges.incrementAndGet(); ensembleChangeCounter.inc(); String logContext = String.format("[EnsembleChange(ledger:%d, change-id:%010d)]", ledgerId, ensembleChangeId); // when the ensemble changes are too frequent, close handle if (ensembleChangeId > clientCtx.getConf().maxAllowedEnsembleChanges) { LOG.info("{} reaches max allowed ensemble change number {}", logContext, clientCtx.getConf().maxAllowedEnsembleChanges); handleUnrecoverableErrorDuringAdd(WriteException); return; } if (LOG.isDebugEnabled()) { LOG.debug("{} Replacing {} in {}", logContext, failedBookies, origEnsemble); } AtomicInteger attempts = new AtomicInteger(0); new MetadataUpdateLoop( clientCtx.getLedgerManager(), getId(), this::getVersionedLedgerMetadata, (metadata) -> metadata.getState() == LedgerMetadata.State.OPEN && failedBookies.entrySet().stream().anyMatch( e -> LedgerMetadataUtils.getLastEnsembleValue(metadata) .get(e.getKey()).equals(e.getValue())), (metadata) -> { attempts.incrementAndGet(); List currentEnsemble = getCurrentEnsemble(); List newEnsemble = EnsembleUtils.replaceBookiesInEnsemble( clientCtx.getBookieWatcher(), metadata, currentEnsemble, failedBookies, logContext); Long lastEnsembleKey = LedgerMetadataUtils.getLastEnsembleKey(metadata); LedgerMetadataBuilder builder = 
LedgerMetadataBuilder.from(metadata); long newEnsembleStartEntry = getLastAddConfirmed() + 1; checkState(lastEnsembleKey <= newEnsembleStartEntry, "New ensemble must either replace the last ensemble, or add a new one"); if (LOG.isDebugEnabled()) { LOG.debug("{}[attempt:{}] changing ensemble from: {} to: {} starting at entry: {}", logContext, attempts.get(), currentEnsemble, newEnsemble, newEnsembleStartEntry); } if (lastEnsembleKey.equals(newEnsembleStartEntry)) { return builder.replaceEnsembleEntry(newEnsembleStartEntry, newEnsemble).build(); } else { return builder.newEnsembleEntry(newEnsembleStartEntry, newEnsemble).build(); } }, this::setLedgerMetadata) .run().whenCompleteAsync((metadata, ex) -> { if (ex != null) { LOG.warn("{}[attempt:{}] Exception changing ensemble", logContext, attempts.get(), ex); handleUnrecoverableErrorDuringAdd(BKException.getExceptionCode(ex, WriteException)); } else if (metadata.getValue().isClosed()) { if (LOG.isDebugEnabled()) { LOG.debug("{}[attempt:{}] Metadata closed during attempt to replace bookie." + " Another client must have recovered the ledger.", logContext, attempts.get()); } handleUnrecoverableErrorDuringAdd(BKException.Code.LedgerClosedException); } else if (metadata.getValue().getState() == LedgerMetadata.State.IN_RECOVERY) { if (LOG.isDebugEnabled()) { LOG.debug("{}[attempt:{}] Metadata marked as in-recovery during attempt to replace bookie." 
+ " Another client must be recovering the ledger.", logContext, attempts.get()); } handleUnrecoverableErrorDuringAdd(BKException.Code.LedgerFencedException); } else { if (LOG.isDebugEnabled()) { LOG.debug("{}[attempt:{}] Success updating metadata.", logContext, attempts.get()); } List newEnsemble = null; Set replaced = null; synchronized (metadataLock) { if (!delayedWriteFailedBookies.isEmpty()) { Map toReplace = new HashMap<>(delayedWriteFailedBookies); delayedWriteFailedBookies.clear(); ensembleChangeLoop(origEnsemble, toReplace); } else { newEnsemble = getCurrentEnsemble(); replaced = EnsembleUtils.diffEnsemble(origEnsemble, newEnsemble); LOG.info("New Ensemble: {} for ledger: {}", newEnsemble, ledgerId); changingEnsemble = false; } } if (newEnsemble != null) { // unsetSuccess outside of lock unsetSuccessAndSendWriteRequest(newEnsemble, replaced); } } }, clientCtx.getMainWorkerPool().chooseThread(ledgerId)); } void unsetSuccessAndSendWriteRequest(List ensemble, final Set bookies) { for (PendingAddOp pendingAddOp : pendingAddOps) { for (Integer bookieIndex: bookies) { pendingAddOp.unsetSuccessAndSendWriteRequest(ensemble, bookieIndex); } } } void registerOperationFailureOnBookie(BookieId bookie, long entryId) { if (clientCtx.getConf().enableBookieFailureTracking) { bookieFailureHistory.put(bookie, entryId); } } static class NoopCloseCallback implements CloseCallback { static NoopCloseCallback instance = new NoopCloseCallback(); @Override public void closeComplete(int rc, LedgerHandle lh, Object ctx) { if (rc != BKException.Code.OK) { LOG.warn("Close failed: {}", BKException.codeLogger(rc)); } // noop } } /** * Get the current ensemble from the ensemble list. The current ensemble * is the last ensemble in the list. The ledger handle uses this ensemble when * triggering operations which work on the end of the ledger, such as adding new * entries or reading the last add confirmed. * *

This method is also used by ReadOnlyLedgerHandle during recovery, and when * tailing a ledger. * *

Generally, this method should only be called by LedgerHandle and not by the * operations themselves, to avoid adding more dependencies between the classes. * There are too many already. */ List getCurrentEnsemble() { // Getting current ensemble from the metadata is only a temporary // thing until metadata is immutable. At that point, current ensemble // becomes a property of the LedgerHandle itself. return LedgerMetadataUtils.getCurrentEnsemble(versionedMetadata.getValue()); } /** * Return a {@link WriteSet} suitable for reading a particular entry. * This will include all bookies that are cotna */ WriteSet getWriteSetForReadOperation(long entryId) { if (stickyBookieIndex != STICKY_READ_BOOKIE_INDEX_UNSET) { // When sticky reads are enabled we want to make sure to take // advantage of read-ahead (or, anyway, from efficiencies in // reading sequential data from disk through the page cache). // For this, all the entries that a given bookie prefetches, // should read from that bookie. // For example, with e=2, w=2, a=2 we would have // B-1 B-2 // e-0 X X // e-1 X X // e-2 X X // // In this case we want all the requests to be issued to B-1 (by // preference), so that cache hits will be maximized. // // We can only enable sticky reads if the ensemble==writeQuorum // otherwise the same bookie will not have all the entries // stored return distributionSchedule.getWriteSet(stickyBookieIndex); } else { return distributionSchedule.getWriteSet(entryId); } } /** * Execute the callback in the thread pinned to the ledger. * @param runnable * @throws RejectedExecutionException */ void executeOrdered(Runnable runnable) throws RejectedExecutionException { clientCtx.getMainWorkerPool().executeOrdered(ledgerId, runnable); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy