org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskRunner

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.seekablestream;


import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import org.apache.druid.data.input.Committer;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.discovery.DiscoveryDruidNode;
import org.apache.druid.discovery.LookupNodeService;
import org.apache.druid.discovery.NodeType;
import org.apache.druid.indexer.IngestionState;
import org.apache.druid.indexer.TaskStatus;
import org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReport;
import org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData;
import org.apache.druid.indexing.common.LockGranularity;
import org.apache.druid.indexing.common.TaskLockType;
import org.apache.druid.indexing.common.TaskRealtimeMetricsMonitorBuilder;
import org.apache.druid.indexing.common.TaskReport;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.CheckPointDataSourceMetadataAction;
import org.apache.druid.indexing.common.actions.ResetDataSourceMetadataAction;
import org.apache.druid.indexing.common.actions.SegmentLockAcquireAction;
import org.apache.druid.indexing.common.actions.TimeChunkLockAcquireAction;
import org.apache.druid.indexing.common.stats.RowIngestionMeters;
import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory;
import org.apache.druid.indexing.common.task.IndexTaskUtils;
import org.apache.druid.indexing.common.task.RealtimeIndexTask;
import org.apache.druid.indexing.seekablestream.common.OrderedPartitionableRecord;
import org.apache.druid.indexing.seekablestream.common.OrderedSequenceNumber;
import org.apache.druid.indexing.seekablestream.common.RecordSupplier;
import org.apache.druid.indexing.seekablestream.common.StreamPartition;
import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.collect.Utils;
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.segment.indexing.RealtimeIOConfig;
import org.apache.druid.segment.realtime.FireDepartment;
import org.apache.druid.segment.realtime.FireDepartmentMetrics;
import org.apache.druid.segment.realtime.appenderator.Appenderator;
import org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult;
import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager;
import org.apache.druid.segment.realtime.appenderator.SegmentsAndMetadata;
import org.apache.druid.segment.realtime.appenderator.StreamAppenderatorDriver;
import org.apache.druid.segment.realtime.firehose.ChatHandler;
import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
import org.apache.druid.server.security.Access;
import org.apache.druid.server.security.Action;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.utils.CircularBuffer;
import org.apache.druid.utils.CollectionUtils;
import org.joda.time.DateTime;

import javax.annotation.Nullable;
import javax.servlet.http.HttpServletRequest;
import javax.validation.constraints.NotNull;
import javax.ws.rs.Consumes;
import javax.ws.rs.DefaultValue;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

/**
 * Interface for abstracting the indexing task run logic.
 *
 * @param <PartitionIdType>    Partition Number Type
 * @param <SequenceOffsetType> Sequence Number Type
 */
public abstract class SeekableStreamIndexTaskRunner<PartitionIdType, SequenceOffsetType> implements ChatHandler
{
  public enum Status
  {
    NOT_STARTED,
    STARTING,
    READING,
    PAUSED,
    PUBLISHING
  }

  private static final EmittingLogger log = new EmittingLogger(SeekableStreamIndexTaskRunner.class);
  static final String METADATA_NEXT_PARTITIONS = "nextPartitions";
  static final String METADATA_PUBLISH_PARTITIONS = "publishPartitions";

  private final Map<PartitionIdType, SequenceOffsetType> endOffsets;

  // lastReadOffsets are the last offsets that were read and processed.
  private final Map<PartitionIdType, SequenceOffsetType> lastReadOffsets = new HashMap<>();

  // currOffsets are what should become the start offsets of the next reader, if we stopped reading now. They are
  // initialized to the start offsets when the task begins.
  private final ConcurrentMap<PartitionIdType, SequenceOffsetType> currOffsets = new ConcurrentHashMap<>();
  private final ConcurrentMap<PartitionIdType, SequenceOffsetType> lastPersistedOffsets = new ConcurrentHashMap<>();

  // The pause lock and associated conditions are to support coordination between the Jetty threads and the main
  // ingestion loop. The goal is to provide callers of the API a guarantee that if pause() returns successfully
  // the ingestion loop has been stopped at the returned sequences and will not ingest any more data until resumed. The
  // fields are used as follows (every step requires acquiring [pauseLock]):
  //   Pausing:
  //   - In pause(), [pauseRequested] is set to true and then execution waits for [status] to change to PAUSED, with the
  //     condition checked when [hasPaused] is signalled.
  //   - In possiblyPause() called from the main loop, if [pauseRequested] is true, [status] is set to PAUSED,
  //     [hasPaused] is signalled, and execution pauses until [pauseRequested] becomes false, either by being set or by
  //     the [pauseMillis] timeout elapsing. [pauseRequested] is checked when [shouldResume] is signalled.
  //   Resuming:
  //   - In resume(), [pauseRequested] is set to false, [shouldResume] is signalled, and execution waits for [status] to
  //     change to something other than PAUSED, with the condition checked when [shouldResume] is signalled.
  //   - In possiblyPause(), when [shouldResume] is signalled, if [pauseRequested] has become false the pause loop ends,
  //     [status] is changed to STARTING and [shouldResume] is signalled.
  private final Lock pauseLock = new ReentrantLock();
  private final Condition hasPaused = pauseLock.newCondition();
  private final Condition shouldResume = pauseLock.newCondition();
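
  // The three fields above implement a guarded-wait handshake: one lock, two conditions, and a request flag. A
  // minimal, self-contained sketch of the same pattern is shown below (a hypothetical PauseGate helper, not part of
  // this class; the caller side corresponds to pause()/resume() and the worker side to possiblyPause() further down).
  //
  //   final class PauseGate
  //   {
  //     private final Lock lock = new ReentrantLock();
  //     private final Condition hasPaused = lock.newCondition();
  //     private final Condition shouldResume = lock.newCondition();
  //     private boolean pauseRequested;
  //     private boolean paused;
  //
  //     // Caller side: request a pause and block until the worker acknowledges it.
  //     void requestPauseAndAwait() throws InterruptedException
  //     {
  //       lock.lockInterruptibly();
  //       try {
  //         pauseRequested = true;
  //         while (!paused) {
  //           hasPaused.await();
  //         }
  //       }
  //       finally {
  //         lock.unlock();
  //       }
  //     }
  //
  //     // Caller side: clear the request and wake the worker.
  //     void resume()
  //     {
  //       lock.lock();
  //       try {
  //         pauseRequested = false;
  //         shouldResume.signalAll();
  //       }
  //       finally {
  //         lock.unlock();
  //       }
  //     }
  //
  //     // Worker side: called from the main loop; parks while a pause is requested, then signals that it resumed.
  //     void maybePause() throws InterruptedException
  //     {
  //       lock.lockInterruptibly();
  //       try {
  //         while (pauseRequested) {
  //           paused = true;
  //           hasPaused.signalAll();
  //           shouldResume.await();
  //         }
  //         paused = false;
  //         shouldResume.signalAll();
  //       }
  //       finally {
  //         lock.unlock();
  //       }
  //     }
  //   }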

  protected final AtomicBoolean stopRequested = new AtomicBoolean(false);
  private final AtomicBoolean publishOnStop = new AtomicBoolean(false);

  // [statusLock] is used to synchronize the Jetty thread calling stopGracefully() with the main run thread. It prevents
  // the main run thread from switching into a publishing state while the stopGracefully() thread thinks it's still in
  // a pre-publishing state. This is important because stopGracefully() will try to use the [stopRequested] flag to stop
  // the main thread where possible, but this flag is not honored once publishing has begun so in this case we must
  // interrupt the thread. The lock ensures that if the run thread is about to transition into publishing state, it
  // blocks until after stopGracefully() has set [stopRequested] and then does a final check on [stopRequested] before
  // transitioning to publishing state.
  private final Object statusLock = new Object();

  protected final Lock pollRetryLock = new ReentrantLock();
  protected final Condition isAwaitingRetry = pollRetryLock.newCondition();

  private final SeekableStreamIndexTask<PartitionIdType, SequenceOffsetType> task;
  private final SeekableStreamIndexTaskIOConfig<PartitionIdType, SequenceOffsetType> ioConfig;
  private final SeekableStreamIndexTaskTuningConfig tuningConfig;
  private final InputRowParser<ByteBuffer> parser;
  private final AuthorizerMapper authorizerMapper;
  private final Optional<ChatHandlerProvider> chatHandlerProvider;
  private final CircularBuffer<Throwable> savedParseExceptions;
  private final String stream;
  private final RowIngestionMeters rowIngestionMeters;
  private final AppenderatorsManager appenderatorsManager;

  private final Set<String> publishingSequences = Sets.newConcurrentHashSet();
  private final List<ListenableFuture<SegmentsAndMetadata>> publishWaitList = new ArrayList<>();
  private final List<ListenableFuture<SegmentsAndMetadata>> handOffWaitList = new ArrayList<>();

  private final LockGranularity lockGranularityToUse;

  private volatile DateTime startTime;
  private volatile Status status = Status.NOT_STARTED; // this is only ever set by the task runner thread (runThread)
  private volatile TaskToolbox toolbox;
  private volatile Thread runThread;
  private volatile Appenderator appenderator;
  private volatile StreamAppenderatorDriver driver;
  private volatile IngestionState ingestionState;

  protected volatile boolean pauseRequested = false;
  private volatile long nextCheckpointTime;

  private volatile CopyOnWriteArrayList<SequenceMetadata<PartitionIdType, SequenceOffsetType>> sequences;
  private volatile Throwable backgroundThreadException;

  public SeekableStreamIndexTaskRunner(
      final SeekableStreamIndexTask<PartitionIdType, SequenceOffsetType> task,
      final InputRowParser<ByteBuffer> parser,
      final AuthorizerMapper authorizerMapper,
      final Optional<ChatHandlerProvider> chatHandlerProvider,
      final CircularBuffer<Throwable> savedParseExceptions,
      final RowIngestionMetersFactory rowIngestionMetersFactory,
      final AppenderatorsManager appenderatorsManager,
      final LockGranularity lockGranularityToUse
  )
  {
    Preconditions.checkNotNull(task);
    this.task = task;
    this.ioConfig = task.getIOConfig();
    this.tuningConfig = task.getTuningConfig();
    this.parser = parser;
    this.authorizerMapper = authorizerMapper;
    this.chatHandlerProvider = chatHandlerProvider;
    this.savedParseExceptions = savedParseExceptions;
    this.stream = ioConfig.getStartSequenceNumbers().getStream();
    this.rowIngestionMeters = rowIngestionMetersFactory.createRowIngestionMeters();
    this.appenderatorsManager = appenderatorsManager;
    this.endOffsets = new ConcurrentHashMap<>(ioConfig.getEndSequenceNumbers().getPartitionSequenceNumberMap());
    this.sequences = new CopyOnWriteArrayList<>();
    this.ingestionState = IngestionState.NOT_STARTED;
    this.lockGranularityToUse = lockGranularityToUse;

    resetNextCheckpointTime();
  }

  public TaskStatus run(TaskToolbox toolbox)
  {
    try {
      return runInternal(toolbox);
    }
    catch (Exception e) {
      log.error(e, "Encountered exception while running task.");
      final String errorMsg = Throwables.getStackTraceAsString(e);
      toolbox.getTaskReportFileWriter().write(task.getId(), getTaskCompletionReports(errorMsg));
      return TaskStatus.failure(
          task.getId(),
          errorMsg
      );
    }
  }

  private Set<PartitionIdType> computeExclusiveStartPartitionsForSequence(
      Map<PartitionIdType, SequenceOffsetType> sequenceStartOffsets
  )
  {
    if (sequenceStartOffsets.equals(ioConfig.getStartSequenceNumbers().getPartitionSequenceNumberMap())) {
      return ioConfig.getStartSequenceNumbers().getExclusivePartitions();
    } else {
      return isEndOffsetExclusive() ? Collections.emptySet() : sequenceStartOffsets.keySet();
    }
  }

  @VisibleForTesting
  public void setToolbox(TaskToolbox toolbox)
  {
    this.toolbox = toolbox;
  }

  @VisibleForTesting
  public void initializeSequences() throws IOException
  {
    if (!restoreSequences()) {
      final TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> checkpoints = getCheckPointsFromContext(
          toolbox,
          task.getContextValue(SeekableStreamSupervisor.CHECKPOINTS_CTX_KEY)
      );
      if (checkpoints != null) {
        Iterator<Map.Entry<Integer, Map<PartitionIdType, SequenceOffsetType>>> sequenceOffsets = checkpoints.entrySet()
                                                                                                            .iterator();
        Map.Entry<Integer, Map<PartitionIdType, SequenceOffsetType>> previous = sequenceOffsets.next();
        while (sequenceOffsets.hasNext()) {
          Map.Entry<Integer, Map<PartitionIdType, SequenceOffsetType>> current = sequenceOffsets.next();
          final Set<PartitionIdType> exclusiveStartPartitions = computeExclusiveStartPartitionsForSequence(
              previous.getValue()
          );
          addSequence(
              new SequenceMetadata<>(
                  previous.getKey(),
                  StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), previous.getKey()),
                  previous.getValue(),
                  current.getValue(),
                  true,
                  exclusiveStartPartitions
              )
          );
          previous = current;
        }
        final Set<PartitionIdType> exclusiveStartPartitions = computeExclusiveStartPartitionsForSequence(
            previous.getValue()
        );
        addSequence(
            new SequenceMetadata<>(
                previous.getKey(),
                StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), previous.getKey()),
                previous.getValue(),
                endOffsets,
                false,
                exclusiveStartPartitions
            )
        );
      } else {
        addSequence(
            new SequenceMetadata<>(
                0,
                StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), 0),
                ioConfig.getStartSequenceNumbers().getPartitionSequenceNumberMap(),
                endOffsets,
                false,
                ioConfig.getStartSequenceNumbers().getExclusivePartitions()
            )
        );
      }
    }

    log.info("Starting with sequences:  %s", sequences);
  }

  private TaskStatus runInternal(TaskToolbox toolbox) throws Exception
  {
    log.info("SeekableStream indexing task starting up!");
    startTime = DateTimes.nowUtc();
    status = Status.STARTING;

    setToolbox(toolbox);
    initializeSequences();

    if (chatHandlerProvider.isPresent()) {
      log.info("Found chat handler of class[%s]", chatHandlerProvider.get().getClass().getName());
      chatHandlerProvider.get().register(task.getId(), this, false);
    } else {
      log.warn("No chat handler detected");
    }

    runThread = Thread.currentThread();

    // Set up FireDepartmentMetrics
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(
        task.getDataSchema(),
        new RealtimeIOConfig(null, null),
        null
    );
    FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    toolbox.getMonitorScheduler()
           .addMonitor(TaskRealtimeMetricsMonitorBuilder.build(task, fireDepartmentForMetrics, rowIngestionMeters));

    final String lookupTier = task.getContextValue(RealtimeIndexTask.CTX_KEY_LOOKUP_TIER);
    final LookupNodeService lookupNodeService = lookupTier == null ?
                                                toolbox.getLookupNodeService() :
                                                new LookupNodeService(lookupTier);

    final DiscoveryDruidNode discoveryDruidNode = new DiscoveryDruidNode(
        toolbox.getDruidNode(),
        NodeType.PEON,
        ImmutableMap.of(
            toolbox.getDataNodeService().getName(), toolbox.getDataNodeService(),
            lookupNodeService.getName(), lookupNodeService
        )
    );

    Throwable caughtExceptionOuter = null;
    try (final RecordSupplier<PartitionIdType, SequenceOffsetType> recordSupplier = task.newTaskRecordSupplier()) {

      if (appenderatorsManager.shouldTaskMakeNodeAnnouncements()) {
        toolbox.getDataSegmentServerAnnouncer().announce();
        toolbox.getDruidNodeAnnouncer().announce(discoveryDruidNode);
      }
      appenderator = task.newAppenderator(fireDepartmentMetrics, toolbox);
      driver = task.newDriver(appenderator, toolbox, fireDepartmentMetrics);

      // Start up, set up initial sequences.
      final Object restoredMetadata = driver.startJob(
          segmentId -> {
            try {
              if (lockGranularityToUse == LockGranularity.SEGMENT) {
                return toolbox.getTaskActionClient().submit(
                    new SegmentLockAcquireAction(
                        TaskLockType.EXCLUSIVE,
                        segmentId.getInterval(),
                        segmentId.getVersion(),
                        segmentId.getShardSpec().getPartitionNum(),
                        1000L
                    )
                ).isOk();
              } else {
                return toolbox.getTaskActionClient().submit(
                    new TimeChunkLockAcquireAction(
                        TaskLockType.EXCLUSIVE,
                        segmentId.getInterval(),
                        1000L
                    )
                ) != null;
              }
            }
            catch (IOException e) {
              throw new RuntimeException(e);
            }
          }
      );
      if (restoredMetadata == null) {
        // no persist has happened so far
        // so either this is a brand new task or replacement of a failed task
        Preconditions.checkState(sequences.get(0).startOffsets.entrySet().stream().allMatch(
            partitionOffsetEntry ->
                createSequenceNumber(partitionOffsetEntry.getValue()).compareTo(
                    createSequenceNumber(ioConfig.getStartSequenceNumbers()
                                                 .getPartitionSequenceNumberMap()
                                                 .get(partitionOffsetEntry.getKey())
                    )) >= 0
        ), "Sequence sequences are not compatible with start sequences of task");
        currOffsets.putAll(sequences.get(0).startOffsets);
      } else {
        @SuppressWarnings("unchecked")
        final Map<String, Object> restoredMetadataMap = (Map) restoredMetadata;
        final SeekableStreamEndSequenceNumbers<PartitionIdType, SequenceOffsetType> restoredNextPartitions =
            deserializePartitionsFromMetadata(
                toolbox.getObjectMapper(),
                restoredMetadataMap.get(METADATA_NEXT_PARTITIONS)
            );

        currOffsets.putAll(restoredNextPartitions.getPartitionSequenceNumberMap());

        // Sanity checks.
        if (!restoredNextPartitions.getStream().equals(ioConfig.getStartSequenceNumbers().getStream())) {
          throw new ISE(
              "WTF?! Restored stream[%s] but expected stream[%s]",
              restoredNextPartitions.getStream(),
              ioConfig.getStartSequenceNumbers().getStream()
          );
        }

        if (!currOffsets.keySet().equals(ioConfig.getStartSequenceNumbers().getPartitionSequenceNumberMap().keySet())) {
          throw new ISE(
              "WTF?! Restored partitions[%s] but expected partitions[%s]",
              currOffsets.keySet(),
              ioConfig.getStartSequenceNumbers().getPartitionSequenceNumberMap().keySet()
          );
        }
        // sequences size can be 0 only when all sequences got published and task stopped before it could finish
        // which is super rare
        if (sequences.size() == 0 || getLastSequenceMetadata().isCheckpointed()) {
          this.endOffsets.putAll(sequences.size() == 0
                                 ? currOffsets
                                 : getLastSequenceMetadata().getEndOffsets());
          log.info("End sequences changed to [%s]", endOffsets);
        }
      }

      // Filter out partitions with END_OF_SHARD markers since these partitions have already been fully read. This
      // should have been done by the supervisor already so this is defensive.
      int numPreFilterPartitions = currOffsets.size();
      if (currOffsets.entrySet().removeIf(x -> isEndOfShard(x.getValue()))) {
        log.info(
            "Removed [%d] partitions from assignment which have already been closed",
            numPreFilterPartitions - currOffsets.size()
        );
      }

      // Initialize lastReadOffsets immediately after restoring currOffsets. This is only done when end offsets are
      // inclusive, because the point of initializing lastReadOffsets here is so we know when to skip the start record.
      // When end offsets are exclusive, we never skip the start record.
      if (!isEndOffsetExclusive()) {
        for (Map.Entry<PartitionIdType, SequenceOffsetType> entry : currOffsets.entrySet()) {
          final boolean isAtStart = entry.getValue().equals(
              ioConfig.getStartSequenceNumbers().getPartitionSequenceNumberMap().get(entry.getKey())
          );

          if (!isAtStart || ioConfig.getStartSequenceNumbers().getExclusivePartitions().contains(entry.getKey())) {
            lastReadOffsets.put(entry.getKey(), entry.getValue());
          }
        }
      }
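
      // Worked example of the loop above (hypothetical Long offsets, inclusive end offsets as in the Kinesis flavor):
      // suppose partition "p0" started at offset 5. If currOffsets still holds 5 and "p0" is not in the
      // exclusive-start set, nothing is recorded, so the record at 5 will be processed. If currOffsets has advanced
      // (say to 8), or "p0" is marked exclusive-start, that offset is recorded in lastReadOffsets, marking it as
      // already consumed so the same record is not processed again after the restore.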

      // Set up committer.
      final Supplier<Committer> committerSupplier = () -> {
        final Map<PartitionIdType, SequenceOffsetType> snapshot = ImmutableMap.copyOf(currOffsets);
        lastPersistedOffsets.clear();
        lastPersistedOffsets.putAll(snapshot);

        return new Committer()
        {
          @Override
          public Object getMetadata()
          {
            return ImmutableMap.of(METADATA_NEXT_PARTITIONS, new SeekableStreamEndSequenceNumbers<>(stream, snapshot));
          }

          @Override
          public void run()
          {
            // Do nothing.
          }
        };
      };
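
      // For illustration only (hypothetical Kafka-style stream name, partition ids and Long offsets): the object
      // returned by getMetadata() above is, conceptually,
      //
      //   ImmutableMap.of(
      //       METADATA_NEXT_PARTITIONS,
      //       new SeekableStreamEndSequenceNumbers<>("my-topic", ImmutableMap.of(0, 42L, 1, 17L))
      //   )
      //
      // On a task restart, driver.startJob() hands this same structure back as restoredMetadata, and currOffsets is
      // re-seeded from its partitionSequenceNumberMap (see the restore branch earlier in this method).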

      // restart publishing of sequences (if any)
      maybePersistAndPublishSequences(committerSupplier);

      Set<StreamPartition<PartitionIdType>> assignment = assignPartitions(recordSupplier);
      possiblyResetDataSourceMetadata(toolbox, recordSupplier, assignment);
      seekToStartingSequence(recordSupplier, assignment);

      ingestionState = IngestionState.BUILD_SEGMENTS;

      // Main loop.
      // Could eventually support leader/follower mode (for keeping replicas more in sync)
      boolean stillReading = !assignment.isEmpty();
      status = Status.READING;
      Throwable caughtExceptionInner = null;

      try {
        while (stillReading) {
          if (possiblyPause()) {
            // The partition assignments may have changed while paused by a call to setEndOffsets() so reassign
            // partitions upon resuming. Don't call "seekToStartingSequence" after "assignPartitions", because there's
            // no need to re-seek here. All we're going to be doing is dropping partitions.
            assignment = assignPartitions(recordSupplier);
            possiblyResetDataSourceMetadata(toolbox, recordSupplier, assignment);

            if (assignment.isEmpty()) {
              log.info("All partitions have been fully read");
              publishOnStop.set(true);
              stopRequested.set(true);
            }
          }

          // if stop is requested or task's end sequence is set by call to setEndOffsets method with finish set to true
          if (stopRequested.get() || sequences.size() == 0 || getLastSequenceMetadata().isCheckpointed()) {
            status = Status.PUBLISHING;
          }

          if (stopRequested.get()) {
            break;
          }

          if (backgroundThreadException != null) {
            throw new RuntimeException(backgroundThreadException);
          }

          checkPublishAndHandoffFailure();

          maybePersistAndPublishSequences(committerSupplier);

          // calling getRecord() ensures that exceptions specific to kafka/kinesis like OffsetOutOfRangeException
          // are handled in the subclasses.
          List<OrderedPartitionableRecord<PartitionIdType, SequenceOffsetType>> records = getRecords(
              recordSupplier,
              toolbox
          );

          // note: getRecords() also updates assignment
          stillReading = !assignment.isEmpty();

          SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceToCheckpoint = null;
          for (OrderedPartitionableRecord<PartitionIdType, SequenceOffsetType> record : records) {
            final boolean shouldProcess = verifyRecordInRange(record.getPartitionId(), record.getSequenceNumber());

            log.trace(
                "Got stream[%s] partition[%s] sequenceNumber[%s], shouldProcess[%s].",
                record.getStream(),
                record.getPartitionId(),
                record.getSequenceNumber(),
                shouldProcess
            );

            if (shouldProcess) {
              try {
                final List<byte[]> valueBytess = record.getData();
                final List<InputRow> rows;
                if (valueBytess == null || valueBytess.isEmpty()) {
                  rows = Utils.nullableListOf((InputRow) null);
                } else {
                  rows = new ArrayList<>();
                  for (byte[] valueBytes : valueBytess) {
                    rows.addAll(parser.parseBatch(ByteBuffer.wrap(valueBytes)));
                  }
                }
                boolean isPersistRequired = false;

                final SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceToUse = sequences
                    .stream()
                    .filter(sequenceMetadata -> sequenceMetadata.canHandle(this, record))
                    .findFirst()
                    .orElse(null);

                if (sequenceToUse == null) {
                  throw new ISE(
                      "WTH?! cannot find any valid sequence for record with partition [%s] and sequenceNumber [%s]. Current sequences: %s",
                      record.getPartitionId(),
                      record.getSequenceNumber(),
                      sequences
                  );
                }

                for (InputRow row : rows) {
                  if (row != null && task.withinMinMaxRecordTime(row)) {
                    final AppenderatorDriverAddResult addResult = driver.add(
                        row,
                        sequenceToUse.getSequenceName(),
                        committerSupplier,
                        true,
                        // do not allow incremental persists to happen until all the rows from this batch
                        // of rows are indexed
                        false
                    );

                    if (addResult.isOk()) {
                      // If the number of rows in the segment exceeds the threshold after adding a row,
                      // move the segment out from the active segments of BaseAppenderatorDriver to make a new segment.
                      final boolean isPushRequired = addResult.isPushRequired(
                          tuningConfig.getPartitionsSpec().getMaxRowsPerSegment(),
                          tuningConfig.getPartitionsSpec().getMaxTotalRows()
                      );
                      if (isPushRequired && !sequenceToUse.isCheckpointed()) {
                        sequenceToCheckpoint = sequenceToUse;
                      }
                      isPersistRequired |= addResult.isPersistRequired();
                    } else {
                      // Failure to allocate segment puts determinism at risk, bail out to be safe.
                      // May want configurable behavior here at some point.
                      // If we allow continuing, then consider blacklisting the interval for a while to avoid constant checks.
                      throw new ISE("Could not allocate segment for row with timestamp[%s]", row.getTimestamp());
                    }

                    if (addResult.getParseException() != null) {
                      handleParseException(addResult.getParseException(), record);
                    } else {
                      rowIngestionMeters.incrementProcessed();
                    }
                  } else {
                    rowIngestionMeters.incrementThrownAway();
                  }
                }
                if (isPersistRequired) {
                  Futures.addCallback(
                      driver.persistAsync(committerSupplier.get()),
                      new FutureCallback<Object>()
                      {
                        @Override
                        public void onSuccess(@Nullable Object result)
                        {
                          log.info("Persist completed with metadata [%s]", result);
                        }

                        @Override
                        public void onFailure(Throwable t)
                        {
                          log.error("Persist failed, dying");
                          backgroundThreadException = t;
                        }
                      }
                  );
                }
              }
              catch (ParseException e) {
                handleParseException(e, record);
              }

              // in kafka, we can easily get the next offset by adding 1, but for kinesis, there's no way
              // to get the next sequence number without having to make an expensive api call. So the behavior
              // here for kafka is to +1 while for kinesis we simply save the current sequence number
              lastReadOffsets.put(record.getPartitionId(), record.getSequenceNumber());
              currOffsets.put(record.getPartitionId(), getNextStartOffset(record.getSequenceNumber()));
            }
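
            // getNextStartOffset() is the stream-specific hook referred to above. A sketch of the two flavors,
            // assuming the Kafka runner uses Long offsets and the Kinesis runner uses opaque String sequence numbers
            // (illustrative only, not the exact subclass code):
            //
            //   // Kafka-style: offsets are contiguous longs, so the next start offset is simply offset + 1.
            //   protected Long getNextStartOffset(Long sequenceNumber) { return sequenceNumber + 1; }
            //
            //   // Kinesis-style: the current sequence number is returned as-is, and re-reading it is avoided via
            //   // the exclusive-start / lastReadOffsets bookkeeping instead.
            //   protected String getNextStartOffset(String sequenceNumber) { return sequenceNumber; }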

            // Use record.getSequenceNumber() in the moreToRead check, since currOffsets might not have been
            // updated if we were skipping records for being beyond the end.
            final boolean moreToReadAfterThisRecord = isMoreToReadAfterReadingRecord(
                record.getSequenceNumber(),
                endOffsets.get(record.getPartitionId())
            );

            if (!moreToReadAfterThisRecord && assignment.remove(record.getStreamPartition())) {
              log.info("Finished reading stream[%s], partition[%s].", record.getStream(), record.getPartitionId());
              recordSupplier.assign(assignment);
              stillReading = !assignment.isEmpty();
            }
          }

          if (System.currentTimeMillis() > nextCheckpointTime) {
            sequenceToCheckpoint = getLastSequenceMetadata();
          }

          if (sequenceToCheckpoint != null && stillReading) {
            Preconditions.checkArgument(
                getLastSequenceMetadata()
                         .getSequenceName()
                         .equals(sequenceToCheckpoint.getSequenceName()),
                "Cannot checkpoint a sequence [%s] which is not the latest one, sequences %s",
                sequenceToCheckpoint,
                sequences
            );
            requestPause();
            final CheckPointDataSourceMetadataAction checkpointAction = new CheckPointDataSourceMetadataAction(
                task.getDataSource(),
                ioConfig.getTaskGroupId(),
                task.getIOConfig().getBaseSequenceName(),
                null,
                createDataSourceMetadata(
                    new SeekableStreamStartSequenceNumbers<>(
                        stream,
                        sequenceToCheckpoint.getStartOffsets(),
                        sequenceToCheckpoint.getExclusiveStartPartitions()
                    )
                )
            );
            if (!toolbox.getTaskActionClient().submit(checkpointAction)) {
              throw new ISE("Checkpoint request with sequences [%s] failed, dying", currOffsets);
            }
          }
        }
        ingestionState = IngestionState.COMPLETED;
      }
      catch (Exception e) {
        // (1) catch all exceptions while reading from kafka
        caughtExceptionInner = e;
        log.error(e, "Encountered exception in run() before persisting.");
        throw e;
      }
      finally {
        log.info("Persisting all pending data");
        try {
          driver.persist(committerSupplier.get()); // persist pending data
        }
        catch (Exception e) {
          if (caughtExceptionInner != null) {
            caughtExceptionInner.addSuppressed(e);
          } else {
            throw e;
          }
        }
      }

      synchronized (statusLock) {
        if (stopRequested.get() && !publishOnStop.get()) {
          throw new InterruptedException("Stopping without publishing");
        }

        status = Status.PUBLISHING;
      }

      for (int i = 0; i < sequences.size(); i++) {
        final SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceMetadata = sequences.get(i);
        if (!publishingSequences.contains(sequenceMetadata.getSequenceName())) {
          final boolean isLast = i == (sequences.size() - 1);
          if (isLast) {
            // Shorten endOffsets of the last sequence to match currOffsets.
            sequenceMetadata.setEndOffsets(currOffsets);
          }

          // Update assignments of the sequence, which should clear them. (This will be checked later, when the
          // Committer is built.)
          sequenceMetadata.updateAssignments(currOffsets, this::isMoreToReadAfterReadingRecord);
          publishingSequences.add(sequenceMetadata.getSequenceName());
          // persist already done in finally, so directly add to publishQueue
          publishAndRegisterHandoff(sequenceMetadata);
        }
      }

      if (backgroundThreadException != null) {
        throw new RuntimeException(backgroundThreadException);
      }

      // Wait for publish futures to complete.
      Futures.allAsList(publishWaitList).get();

      // Wait for handoff futures to complete.
      // Note that every publishing task (created by calling AppenderatorDriver.publish()) has a corresponding
      // handoffFuture. handoffFuture can throw an exception if 1) the corresponding publishFuture failed or 2) it
      // failed to persist sequences. It might also return null if handoff failed, but was recoverable.
      // See publishAndRegisterHandoff() for details.
      List<SegmentsAndMetadata> handedOffList = Collections.emptyList();
      if (tuningConfig.getHandoffConditionTimeout() == 0) {
        handedOffList = Futures.allAsList(handOffWaitList).get();
      } else {
        try {
          handedOffList = Futures.allAsList(handOffWaitList)
                                 .get(tuningConfig.getHandoffConditionTimeout(), TimeUnit.MILLISECONDS);
        }
        catch (TimeoutException e) {
          // Handoff timeout is not an indexing failure, but coordination failure. We simply ignore timeout exception
          // here.
          log.makeAlert("Timed out after [%d] millis waiting for handoffs", tuningConfig.getHandoffConditionTimeout())
             .addData("TaskId", task.getId())
             .emit();
        }
      }

      for (SegmentsAndMetadata handedOff : handedOffList) {
        log.info(
            "Handoff completed for segments %s with metadata[%s].",
            Lists.transform(handedOff.getSegments(), DataSegment::getId),
            Preconditions.checkNotNull(handedOff.getCommitMetadata(), "commitMetadata")
        );
      }

      appenderator.close();
    }
    catch (InterruptedException | RejectedExecutionException e) {
      // (2) catch InterruptedException and RejectedExecutionException thrown for the whole ingestion steps including
      // the final publishing.
      caughtExceptionOuter = e;
      try {
        Futures.allAsList(publishWaitList).cancel(true);
        Futures.allAsList(handOffWaitList).cancel(true);
        if (appenderator != null) {
          appenderator.closeNow();
        }
      }
      catch (Exception e2) {
        e.addSuppressed(e2);
      }

      // handle the InterruptedException that gets wrapped in a RejectedExecutionException
      if (e instanceof RejectedExecutionException
          && (e.getCause() == null || !(e.getCause() instanceof InterruptedException))) {
        throw e;
      }

      // if we were interrupted because we were asked to stop, handle the exception and return success, else rethrow
      if (!stopRequested.get()) {
        Thread.currentThread().interrupt();
        throw e;
      }

      log.info("The task was asked to stop before completing");
    }
    catch (Exception e) {
      // (3) catch all other exceptions thrown for the whole ingestion steps including the final publishing.
      caughtExceptionOuter = e;
      try {
        Futures.allAsList(publishWaitList).cancel(true);
        Futures.allAsList(handOffWaitList).cancel(true);
        if (appenderator != null) {
          appenderator.closeNow();
        }
      }
      catch (Exception e2) {
        e.addSuppressed(e2);
      }
      throw e;
    }
    finally {
      try {

        if (driver != null) {
          driver.close();
        }
        if (chatHandlerProvider.isPresent()) {
          chatHandlerProvider.get().unregister(task.getId());
        }

        if (appenderatorsManager.shouldTaskMakeNodeAnnouncements()) {
          toolbox.getDruidNodeAnnouncer().unannounce(discoveryDruidNode);
          toolbox.getDataSegmentServerAnnouncer().unannounce();
        }
      }
      catch (Throwable e) {
        if (caughtExceptionOuter != null) {
          caughtExceptionOuter.addSuppressed(e);
        } else {
          throw e;
        }
      }
    }

    toolbox.getTaskReportFileWriter().write(task.getId(), getTaskCompletionReports(null));
    return TaskStatus.success(task.getId());
  }

  private void checkPublishAndHandoffFailure() throws ExecutionException, InterruptedException
  {
    // Check if any publishFuture failed.
    final List<ListenableFuture<SegmentsAndMetadata>> publishFinished = publishWaitList
        .stream()
        .filter(Future::isDone)
        .collect(Collectors.toList());

    for (ListenableFuture<SegmentsAndMetadata> publishFuture : publishFinished) {
      // If publishFuture failed, the line below will throw an exception that is caught by (1), and then (2) or (3).
      publishFuture.get();
    }

    publishWaitList.removeAll(publishFinished);

    // Check if any handoffFuture failed.
    final List<ListenableFuture<SegmentsAndMetadata>> handoffFinished = handOffWaitList
        .stream()
        .filter(Future::isDone)
        .collect(Collectors.toList());

    for (ListenableFuture<SegmentsAndMetadata> handoffFuture : handoffFinished) {
      // If handoffFuture failed, the line below will throw an exception that is caught by (1), and then (2) or (3).
      handoffFuture.get();
    }

    handOffWaitList.removeAll(handoffFinished);
  }

  private void publishAndRegisterHandoff(SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceMetadata)
  {
    log.info("Publishing segments for sequence [%s]", sequenceMetadata);

    final ListenableFuture<SegmentsAndMetadata> publishFuture = Futures.transform(
        driver.publish(
            sequenceMetadata.createPublisher(this, toolbox, ioConfig.isUseTransaction()),
            sequenceMetadata.getCommitterSupplier(this, stream, lastPersistedOffsets).get(),
            Collections.singletonList(sequenceMetadata.getSequenceName())
        ),
        (Function<SegmentsAndMetadata, SegmentsAndMetadata>) publishedSegmentsAndMetadata -> {
          if (publishedSegmentsAndMetadata == null) {
            throw new ISE(
                "Transaction failure publishing segments for sequence [%s]",
                sequenceMetadata
            );
          } else {
            return publishedSegmentsAndMetadata;
          }
        }
    );
    publishWaitList.add(publishFuture);

    // Create a handoffFuture for every publishFuture. The created handoffFuture must fail if publishFuture fails.
    final SettableFuture<SegmentsAndMetadata> handoffFuture = SettableFuture.create();
    handOffWaitList.add(handoffFuture);

    Futures.addCallback(
        publishFuture,
        new FutureCallback<SegmentsAndMetadata>()
        {
          @Override
          public void onSuccess(SegmentsAndMetadata publishedSegmentsAndMetadata)
          {
            log.info(
                "Published segments %s with metadata[%s].",
                Lists.transform(publishedSegmentsAndMetadata.getSegments(), DataSegment::getId),
                Preconditions.checkNotNull(publishedSegmentsAndMetadata.getCommitMetadata(), "commitMetadata")
            );

            sequences.remove(sequenceMetadata);
            publishingSequences.remove(sequenceMetadata.getSequenceName());
            try {
              persistSequences();
            }
            catch (IOException e) {
              log.error(e, "Unable to persist state, dying");
              handoffFuture.setException(e);
              throw new RuntimeException(e);
            }

            Futures.transform(
                driver.registerHandoff(publishedSegmentsAndMetadata),
                new Function<SegmentsAndMetadata, Void>()
                {
                  @Nullable
                  @Override
                  public Void apply(@Nullable SegmentsAndMetadata handoffSegmentsAndMetadata)
                  {
                    if (handoffSegmentsAndMetadata == null) {
                      log.warn(
                          "Failed to handoff segments %s",
                          Lists.transform(publishedSegmentsAndMetadata.getSegments(), DataSegment::getId)
                      );
                    }
                    handoffFuture.set(handoffSegmentsAndMetadata);
                    return null;
                  }
                }
            );
          }

          @Override
          public void onFailure(Throwable t)
          {
            log.error(t, "Error while publishing segments for sequenceNumber[%s]", sequenceMetadata);
            handoffFuture.setException(t);
          }
        }
    );
  }

  private static File getSequencesPersistFile(TaskToolbox toolbox)
  {
    return new File(toolbox.getPersistDir(), "sequences.json");
  }

  private boolean restoreSequences() throws IOException
  {
    final File sequencesPersistFile = getSequencesPersistFile(toolbox);
    if (sequencesPersistFile.exists()) {
      sequences = new CopyOnWriteArrayList<>(
          toolbox.getObjectMapper().<List<SequenceMetadata<PartitionIdType, SequenceOffsetType>>>readValue(
              sequencesPersistFile,
              getSequenceMetadataTypeReference()
          )
      );
      return true;
    } else {
      return false;
    }
  }

  private synchronized void persistSequences() throws IOException
  {
    log.info("Persisting Sequences Metadata [%s]", sequences);
    toolbox.getObjectMapper().writerWithType(
        getSequenceMetadataTypeReference()
    ).writeValue(getSequencesPersistFile(toolbox), sequences);
  }
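
  // A minimal round-trip sketch of the persistence above, outside of any Druid machinery (assumes only Jackson and
  // Guava; SequenceMetadata is replaced by a plain map and the path is hypothetical, purely to keep the example
  // self-contained):
  //
  //   ObjectMapper mapper = new ObjectMapper();
  //   File file = new File("/tmp/sequences.json");
  //   List<Map<String, Object>> toSave = Collections.singletonList(ImmutableMap.of("sequenceId", 0));
  //   mapper.writerFor(new TypeReference<List<Map<String, Object>>>() {}).writeValue(file, toSave);
  //   List<Map<String, Object>> restored = mapper.readValue(file, new TypeReference<List<Map<String, Object>>>() {});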

  private Map<String, TaskReport> getTaskCompletionReports(@Nullable String errorMsg)
  {
    return TaskReport.buildTaskReports(
        new IngestionStatsAndErrorsTaskReport(
            task.getId(),
            new IngestionStatsAndErrorsTaskReportData(
                ingestionState,
                getTaskCompletionUnparseableEvents(),
                getTaskCompletionRowStats(),
                errorMsg
            )
        )
    );
  }

  private Map<String, Object> getTaskCompletionUnparseableEvents()
  {
    Map<String, Object> unparseableEventsMap = new HashMap<>();
    List<String> buildSegmentsParseExceptionMessages = IndexTaskUtils.getMessagesFromSavedParseExceptions(
        savedParseExceptions
    );
    if (buildSegmentsParseExceptionMessages != null) {
      unparseableEventsMap.put(RowIngestionMeters.BUILD_SEGMENTS, buildSegmentsParseExceptionMessages);
    }
    return unparseableEventsMap;
  }

  private Map<String, Object> getTaskCompletionRowStats()
  {
    Map<String, Object> metrics = new HashMap<>();
    metrics.put(
        RowIngestionMeters.BUILD_SEGMENTS,
        rowIngestionMeters.getTotals()
    );
    return metrics;
  }


  private void maybePersistAndPublishSequences(Supplier<Committer> committerSupplier)
      throws InterruptedException
  {
    for (SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceMetadata : sequences) {
      sequenceMetadata.updateAssignments(currOffsets, this::isMoreToReadBeforeReadingRecord);
      if (!sequenceMetadata.isOpen() && !publishingSequences.contains(sequenceMetadata.getSequenceName())) {
        publishingSequences.add(sequenceMetadata.getSequenceName());
        try {
          Object result = driver.persist(committerSupplier.get());
          log.info(
              "Persist completed with results: [%s], adding sequence [%s] to publish queue",
              result,
              sequenceMetadata
          );
          publishAndRegisterHandoff(sequenceMetadata);
        }
        catch (InterruptedException e) {
          log.warn("Interrupted while persisting sequence [%s]", sequenceMetadata);
          throw e;
        }
      }
    }
  }

  private Set<StreamPartition<PartitionIdType>> assignPartitions(
      RecordSupplier<PartitionIdType, SequenceOffsetType> recordSupplier
  )
  {
    final Set<StreamPartition<PartitionIdType>> assignment = new HashSet<>();
    for (Map.Entry<PartitionIdType, SequenceOffsetType> entry : currOffsets.entrySet()) {
      final PartitionIdType partition = entry.getKey();
      final SequenceOffsetType currOffset = entry.getValue();
      final SequenceOffsetType endOffset = endOffsets.get(partition);

      if (!isRecordAlreadyRead(partition, endOffset) && isMoreToReadBeforeReadingRecord(currOffset, endOffset)) {
        log.info(
            "Adding partition[%s], start[%s] -> end[%s] to assignment.",
            partition,
            currOffset,
            endOffset
        );

        assignment.add(StreamPartition.of(stream, partition));
      } else {
        log.info("Finished reading partition[%s].", partition);
      }
    }

    recordSupplier.assign(assignment);

    return assignment;
  }

  private void addSequence(final SequenceMetadata<PartitionIdType, SequenceOffsetType> sequenceMetadata)
  {
    // Sanity check that the start of the new sequence matches up with the end of the prior sequence.
    for (Map.Entry<PartitionIdType, SequenceOffsetType> entry : sequenceMetadata.getStartOffsets().entrySet()) {
      final PartitionIdType partition = entry.getKey();
      final SequenceOffsetType startOffset = entry.getValue();

      if (!sequences.isEmpty()) {
        final SequenceOffsetType priorOffset = getLastSequenceMetadata().endOffsets.get(partition);

        if (!startOffset.equals(priorOffset)) {
          throw new ISE(
              "New sequence startOffset[%s] does not equal expected prior offset[%s]",
              startOffset,
              priorOffset
          );
        }
      }
    }

    if (!isEndOffsetExclusive() && !sequences.isEmpty()) {
      final SequenceMetadata<PartitionIdType, SequenceOffsetType> lastMetadata = getLastSequenceMetadata();
      if (!lastMetadata.endOffsets.keySet().equals(sequenceMetadata.getExclusiveStartPartitions())) {
        throw new ISE(
            "Exclusive start partitions[%s] for new sequence don't match to the prior offset[%s]",
            sequenceMetadata.getExclusiveStartPartitions(),
            lastMetadata
        );
      }
    }

    // Actually do the add.
    sequences.add(sequenceMetadata);
  }

  private SequenceMetadata<PartitionIdType, SequenceOffsetType> getLastSequenceMetadata()
  {
    Preconditions.checkState(!sequences.isEmpty(), "Empty sequences");
    return sequences.get(sequences.size() - 1);
  }

  /**
   * Returns true if the given record has already been read, based on lastReadOffsets.
   */
  private boolean isRecordAlreadyRead(
      final PartitionIdType recordPartition,
      final SequenceOffsetType recordSequenceNumber
  )
  {
    final SequenceOffsetType lastReadOffset = lastReadOffsets.get(recordPartition);

    if (lastReadOffset == null) {
      return false;
    } else {
      return createSequenceNumber(recordSequenceNumber).compareTo(createSequenceNumber(lastReadOffset)) <= 0;
    }
  }

  /**
   * Returns true if, given that we want to start reading from recordSequenceNumber and end at endSequenceNumber, there
   * is more left to read. Used in pre-read checks to determine if there is anything left to read.
   */
  private boolean isMoreToReadBeforeReadingRecord(
      final SequenceOffsetType recordSequenceNumber,
      final SequenceOffsetType endSequenceNumber
  )
  {
    final int compareToEnd = createSequenceNumber(recordSequenceNumber)
        .compareTo(createSequenceNumber(endSequenceNumber));

    return isEndOffsetExclusive() ? compareToEnd < 0 : compareToEnd <= 0;
  }

  /**
   * Returns true if, given that recordSequenceNumber has already been read and we want to end at endSequenceNumber,
   * there is more left to read. Used in post-read checks to determine if there is anything left to read.
   */
  private boolean isMoreToReadAfterReadingRecord(
      final SequenceOffsetType recordSequenceNumber,
      final SequenceOffsetType endSequenceNumber
  )
  {
    final int compareNextToEnd = createSequenceNumber(getNextStartOffset(recordSequenceNumber))
        .compareTo(createSequenceNumber(endSequenceNumber));

    // Unlike isMoreToReadBeforeReadingRecord, we don't care if the end is exclusive or not. If we read it, we're done.
    return compareNextToEnd < 0;
  }
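
  // Worked example of the two checks above, assuming simple Long offsets where getNextStartOffset(n) == n + 1
  // (Kafka-style) and an end offset of 10:
  //   - inclusive end: isMoreToReadBeforeReadingRecord(10, 10) -> true  (10 itself still has to be read), while
  //     isMoreToReadAfterReadingRecord(10, 10) -> false (the next start would be 11, which is past the end).
  //   - exclusive end: isMoreToReadBeforeReadingRecord(10, 10) -> false, since 10 is never read at all.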

  private void seekToStartingSequence(
      RecordSupplier<PartitionIdType, SequenceOffsetType> recordSupplier,
      Set<StreamPartition<PartitionIdType>> partitions
  ) throws InterruptedException
  {
    for (final StreamPartition<PartitionIdType> partition : partitions) {
      final SequenceOffsetType sequence = currOffsets.get(partition.getPartitionId());
      log.info("Seeking partition[%s] to sequenceNumber[%s].", partition.getPartitionId(), sequence);
      recordSupplier.seek(partition, sequence);
    }
  }

  /**
   * Checks if the pauseRequested flag was set and if so blocks:
   * a) if pauseMillis == PAUSE_FOREVER, until pauseRequested is cleared
   * b) if pauseMillis != PAUSE_FOREVER, until pauseMillis elapses -or- pauseRequested is cleared
   * <p/>
   * If pauseMillis is changed while paused, the new pause timeout will be applied. This allows adjustment of the
   * pause timeout (making a timed pause into an indefinite pause and vice versa is valid) without having to resume
   * and ensures that the loop continues to stay paused without ingesting any new events. You will need to signal
   * shouldResume after adjusting pauseMillis for the new value to take effect.
   * <p/>
   * Sets paused = true and signals paused so callers can be notified when the pause command has been accepted.
   * <p/>
   * Additionally, pauses if all partitions assignments have been read and pauseAfterRead flag is set.
   *
   * @return true if a pause request was handled, false otherwise
   */
  private boolean possiblyPause() throws InterruptedException
  {
    pauseLock.lockInterruptibly();
    try {
      if (pauseRequested) {
        status = Status.PAUSED;
        hasPaused.signalAll();

        while (pauseRequested) {
          log.info("Pausing ingestion until resumed");
          shouldResume.await();
        }

        status = Status.READING;
        shouldResume.signalAll();
        log.info("Ingestion loop resumed");
        return true;
      }
    }
    finally {
      pauseLock.unlock();
    }

    return false;
  }

  private void handleParseException(ParseException pe, OrderedPartitionableRecord<PartitionIdType, SequenceOffsetType> record)
  {
    if (pe.isFromPartiallyValidRow()) {
      rowIngestionMeters.incrementProcessedWithError();
    } else {
      rowIngestionMeters.incrementUnparseable();
    }

    if (tuningConfig.isLogParseExceptions()) {
      log.error(
          pe,
          "Encountered parse exception on row from partition[%s] sequenceNumber[%s]",
          record.getPartitionId(),
          record.getSequenceNumber()
      );
    }

    if (savedParseExceptions != null) {
      savedParseExceptions.add(pe);
    }

    if (rowIngestionMeters.getUnparseable() + rowIngestionMeters.getProcessedWithError()
        > tuningConfig.getMaxParseExceptions()) {
      log.error("Max parse exceptions exceeded, terminating task...");
      throw new RuntimeException("Max parse exceptions exceeded, terminating task...");
    }
  }

  private boolean isPaused()
  {
    return status == Status.PAUSED;
  }

  private void requestPause()
  {
    pauseRequested = true;
  }

  protected void sendResetRequestAndWait(
      Map<StreamPartition<PartitionIdType>, SequenceOffsetType> outOfRangePartitions,
      TaskToolbox taskToolbox
  ) throws IOException
  {
    Map<PartitionIdType, SequenceOffsetType> partitionOffsetMap = CollectionUtils.mapKeys(
        outOfRangePartitions,
        StreamPartition::getPartitionId
    );

    boolean result = taskToolbox
        .getTaskActionClient()
        .submit(
            new ResetDataSourceMetadataAction(
                task.getDataSource(),
                createDataSourceMetadata(
                    new SeekableStreamEndSequenceNumbers<>(
                        ioConfig.getStartSequenceNumbers().getStream(),
                        partitionOffsetMap
                    )
                )
            )
        );

    if (result) {
      log.makeAlert("Resetting sequences for datasource [%s]", task.getDataSource())
         .addData("partitions", partitionOffsetMap.keySet())
         .emit();

      requestPause();
    } else {
      log.makeAlert("Failed to send reset request for partitions [%s]", partitionOffsetMap.keySet()).emit();
    }
  }

  /**
   * Authorizes action to be performed on this task's datasource
   *
   * @return authorization result
   */
  private Access authorizationCheck(final HttpServletRequest req, Action action)
  {
    return IndexTaskUtils.datasourceAuthorizationCheck(req, action, task.getDataSource(), authorizerMapper);
  }

  public Appenderator getAppenderator()
  {
    return appenderator;
  }

  @VisibleForTesting
  public RowIngestionMeters getRowIngestionMeters()
  {
    return rowIngestionMeters;
  }

  public void stopForcefully()
  {
    log.info("Stopping forcefully (status: [%s])", status);
    stopRequested.set(true);
    runThread.interrupt();
  }

  public void stopGracefully()
  {
    log.info("Stopping gracefully (status: [%s])", status);
    stopRequested.set(true);

    synchronized (statusLock) {
      if (status == Status.PUBLISHING) {
        runThread.interrupt();
        return;
      }
    }

    try {
      if (pauseLock.tryLock(SeekableStreamIndexTask.LOCK_ACQUIRE_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        try {
          if (pauseRequested) {
            pauseRequested = false;
            shouldResume.signalAll();
          }
        }
        finally {
          pauseLock.unlock();
        }
      } else {
        log.warn("While stopping: failed to acquire pauseLock before timeout, interrupting run thread");
        runThread.interrupt();
        return;
      }

      if (pollRetryLock.tryLock(SeekableStreamIndexTask.LOCK_ACQUIRE_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        try {
          isAwaitingRetry.signalAll();
        }
        finally {
          pollRetryLock.unlock();
        }
      } else {
        log.warn("While stopping: failed to acquire pollRetryLock before timeout, interrupting run thread");
        runThread.interrupt();
      }
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @POST
  @Path("/stop")
  public Response stop(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.WRITE);
    stopGracefully();
    return Response.status(Response.Status.OK).build();
  }

  @GET
  @Path("/status")
  @Produces(MediaType.APPLICATION_JSON)
  public Status getStatusHTTP(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return status;
  }

  @VisibleForTesting
  public Status getStatus()
  {
    return status;
  }

  @GET
  @Path("/offsets/current")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<PartitionIdType, SequenceOffsetType> getCurrentOffsets(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return getCurrentOffsets();
  }

  public ConcurrentMap<PartitionIdType, SequenceOffsetType> getCurrentOffsets()
  {
    return currOffsets;
  }

  @GET
  @Path("/offsets/end")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<PartitionIdType, SequenceOffsetType> getEndOffsetsHTTP(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return getEndOffsets();
  }

  public Map<PartitionIdType, SequenceOffsetType> getEndOffsets()
  {
    return endOffsets;
  }

  @POST
  @Path("/offsets/end")
  @Consumes(MediaType.APPLICATION_JSON)
  @Produces(MediaType.APPLICATION_JSON)
  public Response setEndOffsetsHTTP(
      Map<PartitionIdType, SequenceOffsetType> sequences,
      @QueryParam("finish") @DefaultValue("true") final boolean finish,
      // this field is only for internal purposes, shouldn't be usually set by users
      @Context final HttpServletRequest req
  ) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    return setEndOffsets(sequences, finish);
  }

  @GET
  @Path("/rowStats")
  @Produces(MediaType.APPLICATION_JSON)
  public Response getRowStats(
      @Context final HttpServletRequest req
  )
  {
    authorizationCheck(req, Action.READ);
    Map<String, Object> returnMap = new HashMap<>();
    Map<String, Object> totalsMap = new HashMap<>();
    Map<String, Object> averagesMap = new HashMap<>();

    totalsMap.put(
        RowIngestionMeters.BUILD_SEGMENTS,
        rowIngestionMeters.getTotals()
    );
    averagesMap.put(
        RowIngestionMeters.BUILD_SEGMENTS,
        rowIngestionMeters.getMovingAverages()
    );

    returnMap.put("movingAverages", averagesMap);
    returnMap.put("totals", totalsMap);
    return Response.ok(returnMap).build();
  }

  @GET
  @Path("/unparseableEvents")
  @Produces(MediaType.APPLICATION_JSON)
  public Response getUnparseableEvents(
      @Context final HttpServletRequest req
  )
  {
    authorizationCheck(req, Action.READ);
    List<String> events = IndexTaskUtils.getMessagesFromSavedParseExceptions(savedParseExceptions);
    return Response.ok(events).build();
  }

  @VisibleForTesting
  public Response setEndOffsets(
      Map<PartitionIdType, SequenceOffsetType> sequenceNumbers,
      boolean finish // this field is only for internal purposes, shouldn't be usually set by users
  ) throws InterruptedException
  {
    if (sequenceNumbers == null) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity("Request body must contain a map of { partition:endOffset }")
                     .build();
    } else if (!endOffsets.keySet().containsAll(sequenceNumbers.keySet())) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity(
                         StringUtils.format(
                             "Request contains partitions not being handled by this task, my partitions: %s",
                             endOffsets.keySet()
                         )
                     )
                     .build();
    } else {
      try {
        pauseLock.lockInterruptibly();
        // Perform all sequence related checks before checking for isPaused()
        // and after acquiring pauseLock to correctly guard against duplicate requests
        Preconditions.checkState(sequenceNumbers.size() > 0, "WTH?! No Sequences found to set end sequences");
        Preconditions.checkState(sequenceNumbers.size() > 0, "WTH?! No Sequences found to set end sequences");

        final SequenceMetadata<PartitionIdType, SequenceOffsetType> latestSequence = getLastSequenceMetadata();
        final Set<PartitionIdType> exclusiveStartPartitions;

        if (isEndOffsetExclusive()) {
          // When end offsets are exclusive, there's no need for marking the next sequence as having any
          // exclusive-start partitions. It should always start from the end offsets of the prior sequence.
          exclusiveStartPartitions = Collections.emptySet();
        } else {
          // When end offsets are inclusive, we must mark all partitions as exclusive-start, to avoid reading
          // their final messages (which have already been read).
          exclusiveStartPartitions = sequenceNumbers.keySet();
        }

        if ((latestSequence.getStartOffsets().equals(sequenceNumbers)
             && latestSequence.getExclusiveStartPartitions().equals(exclusiveStartPartitions)
             && !finish)
            || (latestSequence.getEndOffsets().equals(sequenceNumbers) && finish)) {
          log.warn("Ignoring duplicate request, end sequences already set for sequences [%s]", sequenceNumbers);
          resume();
          return Response.ok(sequenceNumbers).build();
        } else if (latestSequence.isCheckpointed()) {
          return Response.status(Response.Status.BAD_REQUEST)
                         .entity(StringUtils.format(
                             "WTH?! Sequence [%s] has already endOffsets set, cannot set to [%s]",
                             latestSequence,
                             sequenceNumbers
                         )).build();
        } else if (!isPaused()) {
          return Response.status(Response.Status.BAD_REQUEST)
                         .entity("Task must be paused before changing the end sequences")
                         .build();
        }

        for (Map.Entry<PartitionIdType, SequenceOffsetType> entry : sequenceNumbers.entrySet()) {
          if (createSequenceNumber(entry.getValue()).compareTo(createSequenceNumber(currOffsets.get(entry.getKey()))) < 0) {
            return Response.status(Response.Status.BAD_REQUEST)
                           .entity(
                               StringUtils.format(
                                   "End sequence must be >= current sequence for partition [%s] (current: %s)",
                                   entry.getKey(),
                                   currOffsets.get(entry.getKey())
                               )
                           )
                           .build();
          }
        }

        resetNextCheckpointTime();
        latestSequence.setEndOffsets(sequenceNumbers);

        if (finish) {
          log.info("Updating endOffsets from [%s] to [%s]", endOffsets, sequenceNumbers);
          endOffsets.putAll(sequenceNumbers);
        } else {
          // create new sequence
          log.info("Creating new sequence with startOffsets [%s] and endOffsets [%s]", sequenceNumbers, endOffsets);
          final SequenceMetadata<PartitionIdType, SequenceOffsetType> newSequence = new SequenceMetadata<>(
              latestSequence.getSequenceId() + 1,
              StringUtils.format("%s_%d", ioConfig.getBaseSequenceName(), latestSequence.getSequenceId() + 1),
              sequenceNumbers,
              endOffsets,
              false,
              exclusiveStartPartitions
          );
          addSequence(newSequence);
        }
        persistSequences();
      }
      catch (Exception e) {
        log.error(e, "Unable to set end sequences, dying");
        backgroundThreadException = e;
        // should resume to immediately finish kafka index task as failed
        resume();
        return Response.status(Response.Status.INTERNAL_SERVER_ERROR)
                       .entity(Throwables.getStackTraceAsString(e))
                       .build();
      }
      finally {
        pauseLock.unlock();
      }
    }

    resume();

    return Response.ok(sequenceNumbers).build();
  }

  private void resetNextCheckpointTime()
  {
    nextCheckpointTime = DateTimes.nowUtc().plus(tuningConfig.getIntermediateHandoffPeriod()).getMillis();
  }

  @VisibleForTesting
  public CopyOnWriteArrayList<SequenceMetadata<PartitionIdType, SequenceOffsetType>> getSequences()
  {
    return sequences;
  }

  @GET
  @Path("/checkpoints")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<Integer, Map<PartitionIdType, SequenceOffsetType>> getCheckpointsHTTP(
      @Context final HttpServletRequest req
  )
  {
    authorizationCheck(req, Action.READ);
    return getCheckpoints();
  }

  private Map<Integer, Map<PartitionIdType, SequenceOffsetType>> getCheckpoints()
  {
    return new TreeMap<>(sequences.stream()
                                  .collect(Collectors.toMap(
                                      SequenceMetadata::getSequenceId,
                                      SequenceMetadata::getStartOffsets
                                  )));
  }

  /**
   * Signals the ingestion loop to pause.
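   * <p>
   * On a successful pause, the 200 OK response body is the task's current partition-to-offset map serialized as
   * JSON; an illustrative Kafka-style payload might look like {"0":1234,"1":5678}, though the actual key and value
   * types depend on the concrete PartitionIdType and SequenceOffsetType of the task.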
   *
   * @return one of the following Responses: 400 Bad Request if the task has started publishing; 202 Accepted if the
   * method has timed out and returned before the task has paused; 200 OK with a map of the current partition sequences
   * in the response body if the task successfully paused
   */
  @POST
  @Path("/pause")
  @Produces(MediaType.APPLICATION_JSON)
  public Response pauseHTTP(
      @Context final HttpServletRequest req
  ) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    return pause();
  }

  @VisibleForTesting
  public Response pause() throws InterruptedException
  {
    if (!(status == Status.PAUSED || status == Status.READING)) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity(StringUtils.format("Can't pause, task is not in a pausable state (state: [%s])", status))
                     .build();
    }

    pauseLock.lockInterruptibly();
    try {
      pauseRequested = true;

      pollRetryLock.lockInterruptibly();
      try {
        isAwaitingRetry.signalAll();
      }
      finally {
        pollRetryLock.unlock();
      }

      if (isPaused()) {
        shouldResume.signalAll(); // kick the monitor so it re-awaits with the new pauseMillis
      }

      long nanos = TimeUnit.SECONDS.toNanos(2);
      while (!isPaused()) {
        if (nanos <= 0L) {
          return Response.status(Response.Status.ACCEPTED)
                         .entity("Request accepted but task has not yet paused")
                         .build();
        }
        nanos = hasPaused.awaitNanos(nanos);
      }
    }
    finally {
      pauseLock.unlock();
    }

    try {
      return Response.ok().entity(toolbox.getObjectMapper().writeValueAsString(getCurrentOffsets())).build();
    }
    catch (JsonProcessingException e) {
      throw new RuntimeException(e);
    }
  }

  @POST
  @Path("/resume")
  public Response resumeHTTP(@Context final HttpServletRequest req) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    resume();
    return Response.status(Response.Status.OK).build();
  }

  @VisibleForTesting
  public void resume() throws InterruptedException
  {
    pauseLock.lockInterruptibly();
    try {
      pauseRequested = false;
      shouldResume.signalAll();

      long nanos = TimeUnit.SECONDS.toNanos(5);
      while (isPaused()) {
        if (nanos <= 0L) {
          throw new RuntimeException("Resume command was not accepted within 5 seconds");
        }
        nanos = shouldResume.awaitNanos(nanos);
      }
    }
    finally {
      pauseLock.unlock();
    }
  }

  @GET
  @Path("/time/start")
  @Produces(MediaType.APPLICATION_JSON)
  public DateTime getStartTime(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.WRITE);
    return startTime;
  }

  /**
   * This method does two things:
   *
   * 1) Verifies that the sequence numbers we read are at least as high as those read previously, and throws an
   *    exception if not.
   * 2) Returns false if we should skip this record, because it is either (a) the first record in a partition whose
   *    start offset is exclusive, or (b) past the endOffsets and therefore too late to read.
   */
  private boolean verifyRecordInRange(
      final PartitionIdType partition,
      final SequenceOffsetType recordOffset
  )
  {
    // Verify that the record is at least as high as its currOffset.
    final SequenceOffsetType currOffset = Preconditions.checkNotNull(
        currOffsets.get(partition),
        "Current offset is null for sequenceNumber[%s] and partition[%s]",
        recordOffset,
        partition
    );
    final OrderedSequenceNumber<SequenceOffsetType> recordSequenceNumber = createSequenceNumber(recordOffset);
    final OrderedSequenceNumber<SequenceOffsetType> currentSequenceNumber = createSequenceNumber(currOffset);
    final int comparisonToCurrent = recordSequenceNumber.compareTo(currentSequenceNumber);
    if (comparisonToCurrent < 0) {
      throw new ISE(
          "Record sequenceNumber[%s] is smaller than current sequenceNumber[%s] for partition[%s]",
          recordOffset,
          currOffset,
          partition
      );
    }

    // Check if the record has already been read.
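    // (Per the contract above, a record counts as already read when it is the first record of a partition whose
    // start offset is exclusive; such records were consumed by the previous sequence and should be skipped.)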
    if (isRecordAlreadyRead(partition, recordOffset)) {
      return false;
    }

    // Finally, check if this record comes before the endOffsets for this partition.
    return isMoreToReadBeforeReadingRecord(recordSequenceNumber.get(), endOffsets.get(partition));
  }

  /**
   * Checks if the input seqNum marks the end of a shard. Used by Kinesis only.
   */
  protected abstract boolean isEndOfShard(SequenceOffsetType seqNum);

  /**
   * Deserializes the checkpoints stored in the task context into a map of
   * sequenceId -> (partition -> sequence offset).
   *
   * @param toolbox           task toolbox
   * @param checkpointsString the json-serialized checkpoint string
   *
   * @return the deserialized checkpoints, or null if none are present
   *
   * @throws IOException if the checkpoint string cannot be deserialized
   */
  @Nullable
  protected abstract TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> getCheckPointsFromContext(
      TaskToolbox toolbox,
      String checkpointsString
  ) throws IOException;

  /**
   * Calculates the sequence number used to update currOffsets after finishing reading a record.
   * This is what would become the start offsets of the next reader, if we stopped reading now.
   *
   * @param sequenceNumber the sequence number that has already been processed
   *
   * @return next sequence number to be stored
   */
  protected abstract SequenceOffsetType getNextStartOffset(SequenceOffsetType sequenceNumber);

  /**
   * Deserializes stored metadata into SeekableStreamEndSequenceNumbers.
   *
   * @param mapper json objectMapper
   * @param object metadata
   *
   * @return SeekableStreamEndSequenceNumbers
   */
  protected abstract SeekableStreamEndSequenceNumbers<PartitionIdType, SequenceOffsetType> deserializePartitionsFromMetadata(
      ObjectMapper mapper,
      Object object
  );

  /**
   * Polls the next set of records from the recordSupplier. The main purpose of having a separate method here
   * is to catch and handle exceptions specific to Kafka/Kinesis.
   *
   * @param recordSupplier supplier to poll records from
   * @param toolbox        task toolbox
   *
   * @return list of records polled, can be empty but cannot be null
   *
   * @throws Exception if polling fails in a stream-specific, non-recoverable way
   */
  @NotNull
  protected abstract List<OrderedPartitionableRecord<PartitionIdType, SequenceOffsetType>> getRecords(
      RecordSupplier<PartitionIdType, SequenceOffsetType> recordSupplier,
      TaskToolbox toolbox
  ) throws Exception;

  /**
   * Creates the specific implementation of Kafka/Kinesis datasource metadata.
   *
   * @param partitions partitions used to create the datasource metadata
   *
   * @return datasource metadata
   */
  protected abstract SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> createDataSourceMetadata(
      SeekableStreamSequenceNumbers<PartitionIdType, SequenceOffsetType> partitions
  );

  /**
   * Creates a specific implementation of a Kafka/Kinesis sequence number/offset, used mostly for comparisons.
   *
   * @param sequenceNumber the raw sequence number/offset
   *
   * @return a specific OrderedSequenceNumber instance for Kafka/Kinesis
   */
  protected abstract OrderedSequenceNumber<SequenceOffsetType> createSequenceNumber(SequenceOffsetType sequenceNumber);

  /**
   * Checks if the sequence offsets stored in currOffsets are still valid compared to the
   * earliest sequence offsets fetched from the stream.
   */
  protected abstract void possiblyResetDataSourceMetadata(
      TaskToolbox toolbox,
      RecordSupplier<PartitionIdType, SequenceOffsetType> recordSupplier,
      Set<StreamPartition<PartitionIdType>> assignment
  );

  /**
   * In Kafka, the endOffsets are exclusive, so the record at the end offset is not read.
   * In Kinesis, the endOffsets are inclusive.
   */
  protected abstract boolean isEndOffsetExclusive();

  protected abstract TypeReference<List<SequenceMetadata<PartitionIdType, SequenceOffsetType>>> getSequenceMetadataTypeReference();
}
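The pause, resume, and offset endpoints defined above are normally driven by the supervisor through the task's chat handler, but they can also be called directly when debugging a stuck task. The following standalone sketch (not part of the Druid codebase) illustrates that request flow with java.net.http; the base URL, the /druid/worker/v1/chat/{taskId} path, the host, port, and task id are placeholder assumptions, and authentication/TLS are omitted.

// Illustrative client sketch only; endpoint base URL and chat path are assumed placeholders.
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class TaskPauseExample
{
  public static void main(String[] args) throws Exception
  {
    // Hypothetical task chat URL; substitute the real host, port, and task id for your deployment.
    String baseUrl = "http://middlemanager.example.com:8100/druid/worker/v1/chat/my-task-id";
    HttpClient client = HttpClient.newHttpClient();

    // POST /pause: 200 OK returns the current offsets as JSON; 202 Accepted means the pause was
    // requested but the ingestion loop has not yet parked.
    HttpResponse<String> pauseResponse = client.send(
        HttpRequest.newBuilder(URI.create(baseUrl + "/pause")).POST(HttpRequest.BodyPublishers.noBody()).build(),
        HttpResponse.BodyHandlers.ofString()
    );
    System.out.println("pause -> " + pauseResponse.statusCode() + " " + pauseResponse.body());

    // GET /offsets/current: the partition -> sequence number map currently held by the task.
    HttpResponse<String> offsets = client.send(
        HttpRequest.newBuilder(URI.create(baseUrl + "/offsets/current")).GET().build(),
        HttpResponse.BodyHandlers.ofString()
    );
    System.out.println("offsets/current -> " + offsets.body());

    // POST /resume: releases the pause and lets the ingestion loop continue reading.
    HttpResponse<String> resumeResponse = client.send(
        HttpRequest.newBuilder(URI.create(baseUrl + "/resume")).POST(HttpRequest.BodyPublishers.noBody()).build(),
        HttpResponse.BodyHandlers.ofString()
    );
    System.out.println("resume -> " + resumeResponse.statusCode());
  }
}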