/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment.metadata;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.base.Stopwatch;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import org.apache.druid.client.InternalQueryConfig;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.guava.Yielder;
import org.apache.druid.java.util.common.guava.Yielders;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.query.QueryContexts;
import org.apache.druid.query.TableDataSource;
import org.apache.druid.query.metadata.metadata.AllColumnIncluderator;
import org.apache.druid.query.metadata.metadata.ColumnAnalysis;
import org.apache.druid.query.metadata.metadata.SegmentAnalysis;
import org.apache.druid.query.metadata.metadata.SegmentMetadataQuery;
import org.apache.druid.query.spec.MultipleSpecificSegmentSpec;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.segment.column.Types;
import org.apache.druid.server.QueryLifecycleFactory;
import org.apache.druid.server.coordination.DruidServerMetadata;
import org.apache.druid.server.coordination.ServerType;
import org.apache.druid.server.security.Access;
import org.apache.druid.server.security.Escalator;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.SegmentId;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* An abstract class that listens for segment change events and caches segment metadata. It periodically refreshes
* the segments by fetching their metadata, which includes schema information, from sources such as
* data nodes, tasks, and the metadata database, and builds the table schema.
*
* At startup, the cache awaits the initialization of the timeline.
* If the cache employs a segment metadata query to retrieve segment schema, it attempts to refresh a maximum
* of {@code MAX_SEGMENTS_PER_QUERY} segments for each datasource in each refresh cycle.
* Once all datasources have undergone this process, the initial schema of each datasource is constructed,
* and the cache is marked as initialized.
* Subsequently, the cache continues to periodically refresh segments and update the datasource schema.
* It is also important to note that a failure in segment refresh results in pausing the refresh work,
* and the process is resumed in the next refresh cycle.
*
* This class has an abstract method {@link #refresh(Set, Set)} which the child class must override
* with the logic to build and cache table schema.
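*
* A minimal subclass sketch, for illustration only. The subclass name is hypothetical, the
* {@code DataSourceInformation(dataSource, rowSignature)} constructor is assumed, and real implementations do
* considerably more work:
* <pre>{@code
* class SimpleSegmentMetadataCache extends AbstractSegmentMetadataCache<DataSourceInformation>
* {
*   SimpleSegmentMetadataCache(
*       QueryLifecycleFactory queryLifecycleFactory,
*       SegmentMetadataCacheConfig config,
*       Escalator escalator,
*       InternalQueryConfig internalQueryConfig,
*       ServiceEmitter emitter
*   )
*   {
*     super(queryLifecycleFactory, config, escalator, internalQueryConfig, emitter);
*   }
*
*   public void start()
*   {
*     // run the periodic refresh loop on the cache executor
*     cacheExec.submit(this::cacheExecLoop);
*   }
*
*   public void stop()
*   {
*     cacheExec.shutdownNow();
*     callbackExec.shutdownNow();
*   }
*
*   public void refresh(Set<SegmentId> segmentsToRefresh, Set<String> dataSourcesToRebuild) throws IOException
*   {
*     // refresh segment signatures, then rebuild the schema of every datasource marked for rebuild
*     refreshSegments(segmentsToRefresh);
*     synchronized (lock) {
*       dataSourcesToRebuild.addAll(dataSourcesNeedingRebuild);
*       dataSourcesNeedingRebuild.clear();
*     }
*     for (String dataSource : dataSourcesToRebuild) {
*       RowSignature rowSignature = buildDataSourceRowSignature(dataSource);
*       if (rowSignature != null) {
*         tables.put(dataSource, new DataSourceInformation(dataSource, rowSignature));
*       }
*     }
*   }
*
*   protected void removeSegmentAction(SegmentId segmentId)
*   {
*     // nothing extra to clean up in this sketch
*   }
* }
* }</pre>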
*
* Note on handling tombstone segments:
* These segments lack data or column information.
* Additionally, segment metadata queries, which are not yet implemented for tombstone segments
* (see: https://github.com/apache/druid/pull/12137), do not provide metadata for tombstones,
* leading to indefinite refresh attempts for these segments.
* Therefore, these segments are never added to the set of segments being refreshed.
*
* @param <T> The type of information associated with the data source, which must extend {@link DataSourceInformation}.
*/
public abstract class AbstractSegmentMetadataCache<T extends DataSourceInformation>
{
private static final EmittingLogger log = new EmittingLogger(AbstractSegmentMetadataCache.class);
private static final int MAX_SEGMENTS_PER_QUERY = 15000;
private static final long DEFAULT_NUM_ROWS = 0;
private final QueryLifecycleFactory queryLifecycleFactory;
private final SegmentMetadataCacheConfig config;
// Escalator, so we can attach an authentication result to queries we generate.
private final Escalator escalator;
private final ColumnTypeMergePolicy columnTypeMergePolicy;
// For awaitInitialization.
private final CountDownLatch initialized = new CountDownLatch(1);
// Configured context to attach to internally generated queries.
private final InternalQueryConfig internalQueryConfig;
@GuardedBy("lock")
private boolean refreshImmediately = false;
/**
* Counts the total number of known segments. This variable is used only for the segments table in the system schema
* to initialize a map with a more proper size when it creates a snapshot. As a result, it doesn't have to be exact,
* and thus there is no concurrency control for this variable.
*/
private int totalSegments = 0;
// Newest segments first, so they override older ones.
protected static final Comparator<SegmentId> SEGMENT_ORDER = Comparator
.comparing((SegmentId segmentId) -> segmentId.getInterval().getStart())
.reversed()
.thenComparing(Function.identity());
protected static final Interner<RowSignature> ROW_SIGNATURE_INTERNER = Interners.newWeakInterner();
/**
* DataSource -> Segment -> AvailableSegmentMetadata(contains RowSignature) for that segment.
* Use SortedMap for segments so they are merged in deterministic order, from newest to oldest.
*
* This map is updated by these two threads.
*
* - {@link #callbackExec} can update it in {@link #addSegment}, {@link #removeServerSegment},
* and {@link #removeSegment}.
* - {@link #cacheExec} can update it in {@link #refreshSegmentsForDataSource}.
*
* While it is being updated, this map is read by these two types of thread.
*
* - {@link #cacheExec} can iterate all {@link AvailableSegmentMetadata}s per datasource.
* See {@link #buildDataSourceRowSignature}.
* - Query threads can create a snapshot of the entire map for processing queries on the system table.
* See {@link #getSegmentMetadataSnapshot()}.
*
* As the access pattern of this map is read-intensive, we should minimize the contention between writers and readers.
* Since there are two threads that can update this map at the same time, those writers should lock the inner map
* first and then lock the entry before it updates segment metadata. This can be done using
* {@link ConcurrentMap#compute} as below. Note that, if you need to update the variables guarded by {@link #lock}
* inside of compute(), you should acquire the lock before calling compute() so that the function executed
* inside compute() stays inexpensive.
*
*
* segmentMetadataInfo.compute(
* datasourceParam,
* (datasource, segmentsMap) -> {
* if (segmentsMap == null) return null;
* else {
* segmentsMap.compute(
* segmentIdParam,
* (segmentId, segmentMetadata) -> {
* // update segmentMetadata
* }
* );
* return segmentsMap;
* }
* }
* );
*
*
* Readers can simply delegate the locking to the concurrent map and iterate map entries.
*/
protected final ConcurrentHashMap<String, ConcurrentSkipListMap<SegmentId, AvailableSegmentMetadata>> segmentMetadataInfo
= new ConcurrentHashMap<>();
protected final ExecutorService cacheExec;
protected final ExecutorService callbackExec;
@GuardedBy("lock")
protected boolean isServerViewInitialized = false;
protected final ServiceEmitter emitter;
/**
* Map of datasource and generic object extending DataSourceInformation.
* This structure can be accessed by {@link #cacheExec} and {@link #callbackExec} threads.
*/
protected final ConcurrentHashMap<String, T> tables = new ConcurrentHashMap<>();
/**
* This lock coordinates the access from multiple threads to those variables guarded by this lock.
* Currently, there are 2 threads that can access these variables.
*
* - {@link #callbackExec} executes the timeline callbacks whenever ServerView changes.
* - {@code cacheExec} periodically refreshes segment metadata and {@link DataSourceInformation} if necessary
* based on the information collected via timeline callbacks.
*/
protected final Object lock = new Object();
// All mutable segments.
@GuardedBy("lock")
protected final TreeSet<SegmentId> mutableSegments = new TreeSet<>(SEGMENT_ORDER);
// All datasources that need tables regenerated.
@GuardedBy("lock")
protected final Set<String> dataSourcesNeedingRebuild = new HashSet<>();
// All segments that need to be refreshed.
@GuardedBy("lock")
protected final TreeSet<SegmentId> segmentsNeedingRefresh = new TreeSet<>(SEGMENT_ORDER);
public AbstractSegmentMetadataCache(
final QueryLifecycleFactory queryLifecycleFactory,
final SegmentMetadataCacheConfig config,
final Escalator escalator,
final InternalQueryConfig internalQueryConfig,
final ServiceEmitter emitter
)
{
this.queryLifecycleFactory = Preconditions.checkNotNull(queryLifecycleFactory, "queryLifecycleFactory");
this.config = Preconditions.checkNotNull(config, "config");
this.columnTypeMergePolicy = config.getMetadataColumnTypeMergePolicy();
this.cacheExec = Execs.singleThreaded("DruidSchema-Cache-%d");
this.callbackExec = Execs.singleThreaded("DruidSchema-Callback-%d");
this.escalator = escalator;
this.internalQueryConfig = internalQueryConfig;
this.emitter = emitter;
}
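/**
* Main loop that runs on {@link #cacheExec}. It waits for the configured refresh period (or an immediate-refresh
* signal), collects the pending segments and datasources, and invokes {@link #refresh(Set, Set)}. The cache is
* marked as initialized after the first successful pass; on failure the pending work is re-queued and retried in
* the next cycle.
*/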
protected void cacheExecLoop()
{
final Stopwatch stopwatch = Stopwatch.createStarted();
long lastRefresh = 0L;
long lastFailure = 0L;
try {
refreshWaitCondition();
while (!Thread.currentThread().isInterrupted()) {
final Set<SegmentId> segmentsToRefresh = new TreeSet<>();
final Set<String> dataSourcesToRebuild = new TreeSet<>();
try {
synchronized (lock) {
final long nextRefreshNoFuzz = DateTimes
.utc(lastRefresh)
.plus(config.getMetadataRefreshPeriod())
.getMillis();
// Fuzz a bit to spread load out when we have multiple brokers.
final long nextRefresh = nextRefreshNoFuzz + (long) ((nextRefreshNoFuzz - lastRefresh) * 0.10);
while (true) {
// Do not refresh if it's too soon after a failure (to avoid rapid cycles of failure).
final boolean wasRecentFailure = DateTimes.utc(lastFailure)
.plus(config.getMetadataRefreshPeriod())
.isAfterNow();
if (isServerViewInitialized &&
!wasRecentFailure &&
shouldRefresh() &&
(refreshImmediately || nextRefresh < System.currentTimeMillis())) {
// We need to do a refresh. Break out of the waiting loop.
break;
}
// lastFailure != 0L means exceptions happened before and some refresh work was not completed,
// so even if the ServerView is initialized, we can't let the broker complete initialization yet.
if (isServerViewInitialized && lastFailure == 0L) {
// Server view is initialized, but we don't need to do a refresh. Could happen if there are
// no segments in the system yet. Just mark us as initialized, then.
setInitializedAndReportInitTime(stopwatch);
}
// Wait some more, we'll wake up when it might be time to do another refresh.
lock.wait(Math.max(1, nextRefresh - System.currentTimeMillis()));
}
segmentsToRefresh.addAll(segmentsNeedingRefresh);
segmentsNeedingRefresh.clear();
// Mutable segments need a refresh every period, since new columns could be added dynamically.
segmentsNeedingRefresh.addAll(mutableSegments);
lastFailure = 0L;
lastRefresh = System.currentTimeMillis();
refreshImmediately = false;
}
refresh(segmentsToRefresh, dataSourcesToRebuild);
setInitializedAndReportInitTime(stopwatch);
}
catch (InterruptedException e) {
// Fall through.
throw e;
}
catch (Exception e) {
log.warn(e, "Metadata refresh failed, trying again soon.");
synchronized (lock) {
// Add our segments and datasources back to their refresh and rebuild lists.
segmentsNeedingRefresh.addAll(segmentsToRefresh);
dataSourcesNeedingRebuild.addAll(dataSourcesToRebuild);
lastFailure = System.currentTimeMillis();
}
}
}
}
catch (InterruptedException e) {
// Just exit.
}
catch (Throwable e) {
// Throwables that fall out to here (not caught by an inner try/catch) are potentially gnarly, like
// OOMEs. Anyway, let's just emit an alert and stop refreshing metadata.
log.makeAlert(e, "Metadata refresh failed permanently").emit();
throw e;
}
finally {
log.info("Metadata refresh stopped.");
}
}
/**
* Lifecycle start method.
*/
public abstract void start() throws InterruptedException;
/**
* Lifecycle stop method.
*/
public abstract void stop();
private void setInitializedAndReportInitTime(Stopwatch stopwatch)
{
// report the cache init time
if (initialized.getCount() == 1) {
long elapsedTime = stopwatch.elapsed(TimeUnit.MILLISECONDS);
emitter.emit(ServiceMetricEvent.builder().setMetric("metadatacache/init/time", elapsedTime));
log.info("%s initialized in [%,d] ms.", getClass().getSimpleName(), elapsedTime);
stopwatch.stop();
}
initialized.countDown();
}
public void refreshWaitCondition() throws InterruptedException
{
// noop
}
/**
* Refresh is executed only when there are segments or datasources needing refresh.
*/
@SuppressWarnings("GuardedBy")
protected boolean shouldRefresh()
{
return (!segmentsNeedingRefresh.isEmpty() || !dataSourcesNeedingRebuild.isEmpty());
}
public void awaitInitialization() throws InterruptedException
{
initialized.await();
}
/**
* Fetch schema for the given datasource.
*
* @param name datasource
*
* @return schema information for the given datasource
*/
@Nullable
public T getDatasource(String name)
{
return tables.get(name);
}
/**
* @return Map of datasource and corresponding schema information.
*/
public Map<String, T> getDataSourceInformationMap()
{
return ImmutableMap.copyOf(tables);
}
/**
* @return Set of datasources for which schema information is cached.
*/
public Set<String> getDatasourceNames()
{
return tables.keySet();
}
/**
* Get metadata for all the cached segments, which includes information like RowSignature, realtime & numRows etc.
*
* @return Map of segmentId and corresponding metadata.
*/
public Map<SegmentId, AvailableSegmentMetadata> getSegmentMetadataSnapshot()
{
final Map<SegmentId, AvailableSegmentMetadata> segmentMetadata = Maps.newHashMapWithExpectedSize(getTotalSegments());
final Iterator<AvailableSegmentMetadata> it = iterateSegmentMetadata();
while (it.hasNext()) {
final AvailableSegmentMetadata availableSegmentMetadata = it.next();
segmentMetadata.put(availableSegmentMetadata.getSegment().getId(), availableSegmentMetadata);
}
return segmentMetadata;
}
/**
* Get metadata for all the cached segments, which includes information like RowSignature, realtime & numRows etc.
* This is a lower-overhead method than {@link #getSegmentMetadataSnapshot()}.
*
* @return iterator of metadata.
*/
public Iterator<AvailableSegmentMetadata> iterateSegmentMetadata()
{
return FluentIterable.from(segmentMetadataInfo.values())
.transformAndConcat(Map::values)
.iterator();
}
/**
* Get metadata for the specified segment, which includes information like RowSignature, realtime & numRows.
*
* @param datasource segment datasource
* @param segmentId segment Id
*
* @return Metadata information for the given segment
*/
@Nullable
public AvailableSegmentMetadata getAvailableSegmentMetadata(String datasource, SegmentId segmentId)
{
final ConcurrentSkipListMap<SegmentId, AvailableSegmentMetadata> dataSourceMap =
segmentMetadataInfo.get(datasource);
if (dataSourceMap == null) {
return null;
} else {
return dataSourceMap.get(segmentId);
}
}
/**
* Returns the total number of segments. This method intentionally doesn't use the lock, to avoid expensive contention.
* As a result, the returned value might be inexact.
*/
public int getTotalSegments()
{
return totalSegments;
}
/**
* The child classes must override this method with the logic to build and cache table schema.
*
* @param segmentsToRefresh segments for which the schema might have changed
* @param dataSourcesToRebuild datasources for which the schema might have changed
* @throws IOException if an error occurs while querying segment schema from data nodes and tasks
*/
public abstract void refresh(Set<SegmentId> segmentsToRefresh, Set<String> dataSourcesToRebuild) throws IOException;
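/**
* Timeline callback: records that {@code segment} is now served by {@code server}. New segments get a placeholder
* {@link AvailableSegmentMetadata} and (unless they are tombstones) are marked for refresh; known segments get
* their replica set and realtime flag updated.
*/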
@VisibleForTesting
public void addSegment(final DruidServerMetadata server, final DataSegment segment)
{
// Get lock first so that we won't wait in ConcurrentMap.compute().
synchronized (lock) {
// someday we could hypothetically remove broker special casing, whenever BrokerServerView supports tracking
// broker-served segments in the timeline, to ensure that the removeSegment event is triggered accurately
if (server.getType().equals(ServerType.BROKER)) {
// a segment on a broker means a broadcast datasource; skip the metadata because we'll also see this segment on a
// historical, but mark the datasource for refresh because it needs to be globalized
markDataSourceAsNeedRebuild(segment.getDataSource());
} else {
segmentMetadataInfo.compute(
segment.getDataSource(),
(datasource, segmentsMap) -> {
if (segmentsMap == null) {
segmentsMap = new ConcurrentSkipListMap<>(SEGMENT_ORDER);
}
segmentsMap.compute(
segment.getId(),
(segmentId, segmentMetadata) -> {
if (segmentMetadata == null) {
// Unknown segment.
totalSegments++;
// segmentReplicatable is used to determine if segments are served by historical or realtime servers
long isRealtime = server.isSegmentReplicationTarget() ? 0 : 1;
segmentMetadata = AvailableSegmentMetadata
.builder(segment, isRealtime, ImmutableSet.of(server), null, DEFAULT_NUM_ROWS) // Added without needing a refresh
.build();
if (segment.isTombstone()) {
log.debug("Skipping refresh for tombstone segment.");
} else {
markSegmentAsNeedRefresh(segment.getId());
}
if (!server.isSegmentReplicationTarget()) {
log.debug("Added new mutable segment [%s].", segment.getId());
markSegmentAsMutable(segment.getId());
} else {
log.debug("Added new immutable segment [%s].", segment.getId());
}
} else {
// We know this segment.
final Set<DruidServerMetadata> segmentServers = segmentMetadata.getReplicas();
final ImmutableSet<DruidServerMetadata> servers = new ImmutableSet.Builder<DruidServerMetadata>()
.addAll(segmentServers)
.add(server)
.build();
segmentMetadata = AvailableSegmentMetadata
.from(segmentMetadata)
.withReplicas(servers)
.withRealtime(recomputeIsRealtime(servers))
.build();
if (server.isSegmentReplicationTarget()) {
// If a segment shows up on a replicatable (historical) server at any point, then it must be immutable,
// even if it's also available on non-replicatable (realtime) servers.
unmarkSegmentAsMutable(segment.getId());
log.debug("Segment[%s] has become immutable.", segment.getId());
}
}
assert segmentMetadata != null;
return segmentMetadata;
}
);
return segmentsMap;
}
);
}
if (!tables.containsKey(segment.getDataSource())) {
refreshImmediately = true;
}
lock.notifyAll();
}
}
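/**
* Timeline callback: removes {@code segment} from the cache and marks its datasource for a schema rebuild, or
* removes the datasource entirely if no segments remain.
*/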
@VisibleForTesting
public void removeSegment(final DataSegment segment)
{
// Get lock first so that we won't wait in ConcurrentMap.compute().
synchronized (lock) {
log.debug("Segment [%s] is gone.", segment.getId());
segmentsNeedingRefresh.remove(segment.getId());
unmarkSegmentAsMutable(segment.getId());
segmentMetadataInfo.compute(
segment.getDataSource(),
(dataSource, segmentsMap) -> {
if (segmentsMap == null) {
log.warn("Unknown segment [%s] was removed from the cluster. Ignoring this event.", segment.getId());
return null;
} else {
if (segmentsMap.remove(segment.getId()) == null) {
log.warn("Unknown segment [%s] was removed from the cluster. Ignoring this event.", segment.getId());
} else {
totalSegments--;
}
removeSegmentAction(segment.getId());
if (segmentsMap.isEmpty()) {
tables.remove(segment.getDataSource());
log.info("dataSource [%s] no longer exists, all metadata removed.", segment.getDataSource());
return null;
} else {
markDataSourceAsNeedRebuild(segment.getDataSource());
return segmentsMap;
}
}
}
);
lock.notifyAll();
}
}
/**
* This method should be overridden by child classes to execute any action on segment removal.
*/
protected abstract void removeSegmentAction(SegmentId segmentId);
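/**
* Timeline callback: records that {@code segment} is no longer served by {@code server}, updating the segment's
* replica set and realtime flag.
*/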
@VisibleForTesting
public void removeServerSegment(final DruidServerMetadata server, final DataSegment segment)
{
// Get lock first so that we won't wait in ConcurrentMap.compute().
synchronized (lock) {
log.debug("Segment [%s] is gone from server [%s]", segment.getId(), server.getName());
segmentMetadataInfo.compute(
segment.getDataSource(),
(datasource, knownSegments) -> {
if (knownSegments == null) {
log.warn(
"Unknown segment [%s] is removed from server [%s]. Ignoring this event",
segment.getId(),
server.getHost()
);
return null;
}
if (server.getType().equals(ServerType.BROKER)) {
// for brokers, if the segment drops from all historicals before the broker this could be null.
if (!knownSegments.isEmpty()) {
// a segment on a broker means a broadcast datasource; skip the metadata because we'll also see this segment on a
// historical, but mark the datasource for refresh because it might no longer be broadcast
markDataSourceAsNeedRebuild(segment.getDataSource());
}
} else {
knownSegments.compute(
segment.getId(),
(segmentId, segmentMetadata) -> {
if (segmentMetadata == null) {
log.warn(
"Unknown segment [%s] is removed from server [%s]. Ignoring this event",
segment.getId(),
server.getHost()
);
return null;
} else {
final Set<DruidServerMetadata> segmentServers = segmentMetadata.getReplicas();
final ImmutableSet<DruidServerMetadata> servers = FluentIterable
.from(segmentServers)
.filter(Predicates.not(Predicates.equalTo(server)))
.toSet();
return AvailableSegmentMetadata
.from(segmentMetadata)
.withReplicas(servers)
.withRealtime(recomputeIsRealtime(servers))
.build();
}
}
);
}
if (knownSegments.isEmpty()) {
return null;
} else {
return knownSegments;
}
}
);
lock.notifyAll();
}
}
protected void markSegmentAsNeedRefresh(SegmentId segmentId)
{
synchronized (lock) {
segmentsNeedingRefresh.add(segmentId);
}
}
private void markSegmentAsMutable(SegmentId segmentId)
{
synchronized (lock) {
mutableSegments.add(segmentId);
}
}
protected void unmarkSegmentAsMutable(SegmentId segmentId)
{
synchronized (lock) {
mutableSegments.remove(segmentId);
}
}
@VisibleForTesting
public void markDataSourceAsNeedRebuild(String datasource)
{
synchronized (lock) {
dataSourcesNeedingRebuild.add(datasource);
}
}
/**
* Attempt to refresh "segmentSignatures" for a set of segments. Returns the set of segments actually refreshed,
* which may be a subset of the asked-for set.
*/
@VisibleForTesting
public Set<SegmentId> refreshSegments(final Set<SegmentId> segments) throws IOException
{
final Set<SegmentId> retVal = new HashSet<>();
// Organize segments by datasource.
final Map<String, TreeSet<SegmentId>> segmentMap = new TreeMap<>();
for (SegmentId segmentId : segments) {
segmentMap.computeIfAbsent(segmentId.getDataSource(), x -> new TreeSet<>(SEGMENT_ORDER))
.add(segmentId);
}
for (Map.Entry<String, TreeSet<SegmentId>> entry : segmentMap.entrySet()) {
final String dataSource = entry.getKey();
retVal.addAll(refreshSegmentsForDataSource(dataSource, entry.getValue()));
}
return retVal;
}
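/**
* Recomputes the realtime flag for a segment from the servers currently serving it: returns 0 if the set is empty
* or contains any historical server, and 1 otherwise.
*/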
private long recomputeIsRealtime(ImmutableSet<DruidServerMetadata> servers)
{
if (servers.isEmpty()) {
return 0;
}
final Optional<DruidServerMetadata> historicalServer = servers
.stream()
// Ideally, this filter should have checked whether it's a broadcast segment loaded in brokers.
// However, we don't currently track the broadcast segments loaded in brokers, so this filter is still valid.
// See addSegment(), removeServerSegment(), and removeSegment()
.filter(metadata -> metadata.getType().equals(ServerType.HISTORICAL))
.findAny();
// if there is any historical server in the replicas, isRealtime flag should be unset
return historicalServer.isPresent() ? 0 : 1;
}
/**
* Attempt to refresh "segmentSignatures" for a set of segments for a particular dataSource. Returns the set of
* segments actually refreshed, which may be a subset of the asked-for set.
*/
public Set<SegmentId> refreshSegmentsForDataSource(final String dataSource, final Set<SegmentId> segments)
throws IOException
{
final Stopwatch stopwatch = Stopwatch.createStarted();
if (!segments.stream().allMatch(segmentId -> segmentId.getDataSource().equals(dataSource))) {
// Sanity check. We definitely expect this to pass.
throw new ISE("'segments' must all match 'dataSource'!");
}
log.debug("Refreshing metadata for datasource[%s].", dataSource);
final ServiceMetricEvent.Builder builder =
new ServiceMetricEvent.Builder().setDimension(DruidMetrics.DATASOURCE, dataSource);
emitter.emit(builder.setMetric("metadatacache/refresh/count", segments.size()));
// Segment id string -> SegmentId object.
final Map<String, SegmentId> segmentIdMap = Maps.uniqueIndex(segments, SegmentId::toString);
final Set<SegmentId> retVal = new HashSet<>();
logSegmentsToRefresh(dataSource, segments);
final Sequence<SegmentAnalysis> sequence = runSegmentMetadataQuery(
Iterables.limit(segments, MAX_SEGMENTS_PER_QUERY)
);
Yielder<SegmentAnalysis> yielder = Yielders.each(sequence);
try {
while (!yielder.isDone()) {
final SegmentAnalysis analysis = yielder.get();
final SegmentId segmentId = segmentIdMap.get(analysis.getId());
if (segmentId == null) {
log.warn("Got analysis for segment [%s] we didn't ask for, ignoring.", analysis.getId());
} else {
final RowSignature rowSignature = analysisToRowSignature(analysis);
log.debug("Segment[%s] has signature[%s].", segmentId, rowSignature);
if (segmentMetadataQueryResultHandler(dataSource, segmentId, rowSignature, analysis)) {
retVal.add(segmentId);
}
}
yielder = yielder.next(null);
}
}
finally {
yielder.close();
}
long refreshDurationMillis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
emitter.emit(builder.setMetric("metadatacache/refresh/time", refreshDurationMillis));
log.debug(
"Refreshed metadata for datasource [%s] in %,d ms (%d segments queried, %d segments left).",
dataSource,
refreshDurationMillis,
retVal.size(),
segments.size() - retVal.size()
);
return retVal;
}
/**
* Log the details of the segments to be refreshed for a datasource, for debugging purposes.
*/
void logSegmentsToRefresh(String dataSource, Set<SegmentId> ids)
{
// no-op
}
/**
* Action to be executed on the result of a segment metadata query.
* Returns whether the segment metadata was updated.
*/
protected boolean segmentMetadataQueryResultHandler(
String dataSource,
SegmentId segmentId,
RowSignature rowSignature,
SegmentAnalysis analysis
)
{
AtomicBoolean added = new AtomicBoolean(false);
segmentMetadataInfo.compute(
dataSource,
(datasourceKey, dataSourceSegments) -> {
if (dataSourceSegments == null) {
// Datasource may have been removed or become unavailable while this refresh was ongoing.
log.warn(
"No segment map found with datasource [%s], skipping refresh of segment [%s]",
datasourceKey,
segmentId
);
return null;
} else {
dataSourceSegments.compute(
segmentId,
(segmentIdKey, segmentMetadata) -> {
if (segmentMetadata == null) {
log.warn("No segment [%s] found, skipping refresh", segmentId);
return null;
} else {
final AvailableSegmentMetadata updatedSegmentMetadata = AvailableSegmentMetadata
.from(segmentMetadata)
.withRowSignature(rowSignature)
.withNumRows(analysis.getNumRows())
.build();
added.set(true);
return updatedSegmentMetadata;
}
}
);
if (dataSourceSegments.isEmpty()) {
return null;
} else {
return dataSourceSegments;
}
}
}
);
return added.get();
}
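/**
* Builds the {@link RowSignature} for a datasource by merging the signatures of all of its cached segments using
* the configured {@link ColumnTypeMergePolicy}. Returns null if the datasource has no cached segments.
*/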
@VisibleForTesting
@Nullable
public RowSignature buildDataSourceRowSignature(final String dataSource)
{
ConcurrentSkipListMap<SegmentId, AvailableSegmentMetadata> segmentsMap = segmentMetadataInfo.get(dataSource);
// Preserve order.
final Map<String, ColumnType> columnTypes = new LinkedHashMap<>();
if (segmentsMap != null && !segmentsMap.isEmpty()) {
for (AvailableSegmentMetadata availableSegmentMetadata : segmentsMap.values()) {
final RowSignature rowSignature = availableSegmentMetadata.getRowSignature();
if (rowSignature != null) {
for (String column : rowSignature.getColumnNames()) {
final ColumnType columnType =
rowSignature.getColumnType(column)
.orElseThrow(() -> new ISE("Encountered null type for column [%s]", column));
columnTypes.compute(column, (c, existingType) -> columnTypeMergePolicy.merge(existingType, columnType));
}
}
}
} else {
// table has no segments
return null;
}
final RowSignature.Builder builder = RowSignature.builder();
columnTypes.forEach(builder::add);
return builder.build();
}
@VisibleForTesting
public TreeSet<SegmentId> getSegmentsNeedingRefresh()
{
synchronized (lock) {
return segmentsNeedingRefresh;
}
}
@VisibleForTesting
public TreeSet<SegmentId> getMutableSegments()
{
synchronized (lock) {
return mutableSegments;
}
}
@VisibleForTesting
public Set<String> getDataSourcesNeedingRebuild()
{
synchronized (lock) {
return dataSourcesNeedingRebuild;
}
}
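/**
* Whether segment metadata queries issued by {@link #runSegmentMetadataQuery} should also analyze aggregators.
* Defaults to false; child classes may override.
*/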
protected boolean fetchAggregatorsInSegmentMetadataQuery()
{
return false;
}
/**
* Execute a SegmentMetadata query and return a {@link Sequence} of {@link SegmentAnalysis}.
*
* @param segments Iterable of {@link SegmentId} objects that are subject of the SegmentMetadata query.
* @return {@link Sequence} of {@link SegmentAnalysis} objects
*/
@VisibleForTesting
public Sequence<SegmentAnalysis> runSegmentMetadataQuery(
final Iterable<SegmentId> segments
)
{
// Sanity check: getOnlyElement of a set, to ensure all segments have the same datasource.
final String dataSource = Iterables.getOnlyElement(
StreamSupport.stream(segments.spliterator(), false)
.map(SegmentId::getDataSource).collect(Collectors.toSet())
);
final MultipleSpecificSegmentSpec querySegmentSpec = new MultipleSpecificSegmentSpec(
StreamSupport.stream(segments.spliterator(), false)
.map(SegmentId::toDescriptor).collect(Collectors.toList())
);
final SegmentMetadataQuery segmentMetadataQuery = new SegmentMetadataQuery(
new TableDataSource(dataSource),
querySegmentSpec,
new AllColumnIncluderator(),
false,
// disable the parallel merge because we don't care about the merge and don't want to consume its resources
QueryContexts.override(
internalQueryConfig.getContext(),
QueryContexts.BROKER_PARALLEL_MERGE_KEY,
false
),
fetchAggregatorsInSegmentMetadataQuery()
? EnumSet.of(SegmentMetadataQuery.AnalysisType.AGGREGATORS)
: EnumSet.noneOf(SegmentMetadataQuery.AnalysisType.class),
false,
null,
null // we don't care about merging strategy because merge is false
);
return queryLifecycleFactory
.factorize()
.runSimple(segmentMetadataQuery, escalator.createEscalatedAuthenticationResult(), Access.OK).getResults();
}
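/**
* Converts a {@link SegmentAnalysis} into a {@link RowSignature}, skipping columns with analysis errors and
* falling back to the legacy type string (or a complex type) when the type signature is missing. The result is
* interned to reduce memory footprint.
*/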
@VisibleForTesting
static RowSignature analysisToRowSignature(final SegmentAnalysis analysis)
{
final RowSignature.Builder rowSignatureBuilder = RowSignature.builder();
for (Map.Entry<String, ColumnAnalysis> entry : analysis.getColumns().entrySet()) {
if (entry.getValue().isError()) {
// Skip columns with analysis errors.
continue;
}
ColumnType valueType = entry.getValue().getTypeSignature();
// this shouldn't happen, but if it does, first try to fall back to legacy type information field in case
// standard upgrade order was not followed for 0.22 to 0.23+, and if that also fails, then assume types are some
// flavor of COMPLEX.
if (valueType == null) {
// at some point in the future this can be simplified to the contents of the catch clause here, once the
// likelihood of upgrading from some version lower than 0.23 is low
try {
valueType = ColumnType.fromString(entry.getValue().getType());
if (valueType == null) {
valueType = ColumnType.ofComplex(entry.getValue().getType());
}
}
catch (IllegalArgumentException ignored) {
valueType = ColumnType.UNKNOWN_COMPLEX;
}
}
rowSignatureBuilder.add(entry.getKey(), valueType);
}
return ROW_SIGNATURE_INTERNER.intern(rowSignatureBuilder.build());
}
/**
* This method is not thread-safe and must be used only in unit tests.
*/
@VisibleForTesting
public void setAvailableSegmentMetadata(final SegmentId segmentId, final AvailableSegmentMetadata availableSegmentMetadata)
{
final ConcurrentSkipListMap<SegmentId, AvailableSegmentMetadata> dataSourceSegments = segmentMetadataInfo
.computeIfAbsent(
segmentId.getDataSource(),
k -> new ConcurrentSkipListMap<>(SEGMENT_ORDER)
);
if (dataSourceSegments.put(segmentId, availableSegmentMetadata) == null) {
totalSegments++;
}
}
/**
* This is a helper method for unit tests to emulate heavy work done with {@link #lock}.
* It must be used only in unit tests.
*/
@VisibleForTesting
protected void doInLock(Runnable runnable)
{
synchronized (lock) {
runnable.run();
}
}
/**
* ColumnTypeMergePolicy defines the rules of which type to use when faced with the possibility of different types
* for the same column from segment to segment. It is used to help compute a {@link RowSignature} for a table in
* Druid based on the segment metadata of all segments, merging the types of each column encountered to end up with
* a single type to represent it globally.
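*
* A brief illustrative example (assumes the standard {@code ColumnType} constants; under the
* {@code leastRestrictive} policy LONG and DOUBLE merge to DOUBLE, while the {@code latestInterval} policy keeps
* the first type it sees):
* <pre>{@code
* ColumnTypeMergePolicy policy = ColumnTypeMergePolicy.fromString("leastRestrictive");
* ColumnType merged = policy.merge(ColumnType.LONG, ColumnType.DOUBLE); // DOUBLE
* }</pre>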
*/
@FunctionalInterface
public interface ColumnTypeMergePolicy
{
ColumnType merge(ColumnType existingType, ColumnType newType);
@JsonCreator
static ColumnTypeMergePolicy fromString(String type)
{
if (LeastRestrictiveTypeMergePolicy.NAME.equalsIgnoreCase(type)) {
return LeastRestrictiveTypeMergePolicy.INSTANCE;
}
if (FirstTypeMergePolicy.NAME.equalsIgnoreCase(type)) {
return FirstTypeMergePolicy.INSTANCE;
}
throw new IAE("Unknown type [%s]", type);
}
}
/**
* Classic logic: we use the first type we encounter. This policy is effectively 'newest first' because we iterate
* segments starting from the most recent time chunk, so this typically results in the most recently used type being
* chosen, at least for systems that are continuously updated with 'current' data.
*
* Since {@link ColumnTypeMergePolicy} is used to compute the SQL schema, at least in systems whose SQL schemas are
* partially or fully computed by this cache, this merge policy can result in query-time errors when incompatible
* types are mixed and the chosen type is more restrictive than the types of some segments. If data is likely to vary
* in type across segments, consider using {@link LeastRestrictiveTypeMergePolicy} instead.
*/
public static class FirstTypeMergePolicy implements ColumnTypeMergePolicy
{
public static final String NAME = "latestInterval";
private static final FirstTypeMergePolicy INSTANCE = new FirstTypeMergePolicy();
@Override
public ColumnType merge(ColumnType existingType, ColumnType newType)
{
if (existingType == null) {
return newType;
}
if (newType == null) {
return existingType;
}
// if any are json, then all are json
if (ColumnType.NESTED_DATA.equals(newType) || ColumnType.NESTED_DATA.equals(existingType)) {
return ColumnType.NESTED_DATA;
}
// "existing type" is the 'newest' type, since we iterate the segments list by newest start time
return existingType;
}
@Override
public int hashCode()
{
return Objects.hash(NAME);
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
return o != null && getClass() == o.getClass();
}
@Override
public String toString()
{
return NAME;
}
}
/**
* Resolves types using {@link ColumnType#leastRestrictiveType(ColumnType, ColumnType)} to find the ColumnType that
* can best represent all data contained across all segments.
*/
public static class LeastRestrictiveTypeMergePolicy implements ColumnTypeMergePolicy
{
public static final String NAME = "leastRestrictive";
private static final LeastRestrictiveTypeMergePolicy INSTANCE = new LeastRestrictiveTypeMergePolicy();
@Override
public ColumnType merge(ColumnType existingType, ColumnType newType)
{
try {
return ColumnType.leastRestrictiveType(existingType, newType);
}
catch (Types.IncompatibleTypeException incompatibleTypeException) {
// fall back to first encountered type if they are not compatible for some reason
return FirstTypeMergePolicy.INSTANCE.merge(existingType, newType);
}
}
@Override
public int hashCode()
{
return Objects.hash(NAME);
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
return o != null && getClass() == o.getClass();
}
@Override
public String toString()
{
return NAME;
}
}
}