
org.apache.druid.sql.calcite.schema.DruidSchema Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.sql.calcite.schema;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import com.google.inject.Inject;
import org.apache.calcite.schema.Table;
import org.apache.calcite.schema.impl.AbstractSchema;
import org.apache.druid.client.ServerView;
import org.apache.druid.client.TimelineServerView;
import org.apache.druid.guice.ManageLifecycle;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.concurrent.ScheduledExecutors;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.guava.Yielder;
import org.apache.druid.java.util.common.guava.Yielders;
import org.apache.druid.java.util.common.lifecycle.LifecycleStart;
import org.apache.druid.java.util.common.lifecycle.LifecycleStop;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.query.TableDataSource;
import org.apache.druid.query.metadata.metadata.AllColumnIncluderator;
import org.apache.druid.query.metadata.metadata.ColumnAnalysis;
import org.apache.druid.query.metadata.metadata.SegmentAnalysis;
import org.apache.druid.query.metadata.metadata.SegmentMetadataQuery;
import org.apache.druid.query.spec.MultipleSpecificSegmentSpec;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.server.QueryLifecycleFactory;
import org.apache.druid.server.coordination.DruidServerMetadata;
import org.apache.druid.server.coordination.ServerType;
import org.apache.druid.server.security.AuthenticationResult;
import org.apache.druid.server.security.Escalator;
import org.apache.druid.sql.calcite.planner.PlannerConfig;
import org.apache.druid.sql.calcite.table.DruidTable;
import org.apache.druid.sql.calcite.table.RowSignature;
import org.apache.druid.sql.calcite.view.DruidViewMacro;
import org.apache.druid.sql.calcite.view.ViewManager;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.SegmentId;

import java.io.IOException;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

@ManageLifecycle
public class DruidSchema extends AbstractSchema
{
  // Newest segments first, so they override older ones.
  private static final Comparator<SegmentId> SEGMENT_ORDER = Comparator
      .comparing((SegmentId segmentId) -> segmentId.getInterval().getStart())
      .reversed()
      .thenComparing(Function.identity());

  public static final String NAME = "druid";

  private static final EmittingLogger log = new EmittingLogger(DruidSchema.class);
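  // MAX_SEGMENTS_PER_QUERY caps how many segments go into a single SegmentMetadataQuery per dataSource per refresh
  // pass; anything beyond the cap stays in segmentsNeedingRefresh and is picked up on a later pass.
  // DEFAULT_NUM_ROWS is the placeholder row count used for a segment before its metadata has been fetched.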
  private static final int MAX_SEGMENTS_PER_QUERY = 15000;
  private static final long DEFAULT_NUM_ROWS = 0;

  private final QueryLifecycleFactory queryLifecycleFactory;
  private final PlannerConfig config;
  private final ViewManager viewManager;
  private final ExecutorService cacheExec;
  private final ConcurrentMap<String, DruidTable> tables;

  // For awaitInitialization.
  private final CountDownLatch initialized = new CountDownLatch(1);

  // Protects access to segmentMetadataInfo, mutableSegments, segmentsNeedingRefresh, lastRefresh, isServerViewInitialized
  private final Object lock = new Object();

  // DataSource -> Segment -> AvailableSegmentMetadata(contains RowSignature) for that segment.
  // Use TreeMap for segments so they are merged in deterministic order, from newest to oldest.
  @GuardedBy("lock")
  private final Map<String, TreeMap<SegmentId, AvailableSegmentMetadata>> segmentMetadataInfo = new HashMap<>();
  private int totalSegments = 0;

  // All mutable segments.
  private final Set<SegmentId> mutableSegments = new TreeSet<>(SEGMENT_ORDER);

  // All dataSources that need tables regenerated.
  private final Set<String> dataSourcesNeedingRebuild = new HashSet<>();

  // All segments that need to be refreshed.
  private final TreeSet<SegmentId> segmentsNeedingRefresh = new TreeSet<>(SEGMENT_ORDER);

  // Escalator, so we can attach an authentication result to queries we generate.
  private final Escalator escalator;

  private boolean refreshImmediately = false;
  private long lastRefresh = 0L;
  private long lastFailure = 0L;
  private boolean isServerViewInitialized = false;

  @Inject
  public DruidSchema(
      final QueryLifecycleFactory queryLifecycleFactory,
      final TimelineServerView serverView,
      final PlannerConfig config,
      final ViewManager viewManager,
      final Escalator escalator
  )
  {
    this.queryLifecycleFactory = Preconditions.checkNotNull(queryLifecycleFactory, "queryLifecycleFactory");
    Preconditions.checkNotNull(serverView, "serverView");
    this.config = Preconditions.checkNotNull(config, "config");
    this.viewManager = Preconditions.checkNotNull(viewManager, "viewManager");
    this.cacheExec = ScheduledExecutors.fixed(1, "DruidSchema-Cache-%d");
    this.tables = new ConcurrentHashMap<>();
    this.escalator = escalator;

    serverView.registerTimelineCallback(
        Execs.directExecutor(),
        new TimelineServerView.TimelineCallback()
        {
          @Override
          public ServerView.CallbackAction timelineInitialized()
          {
            synchronized (lock) {
              isServerViewInitialized = true;
              lock.notifyAll();
            }

            return ServerView.CallbackAction.CONTINUE;
          }

          @Override
          public ServerView.CallbackAction segmentAdded(final DruidServerMetadata server, final DataSegment segment)
          {
            addSegment(server, segment);
            return ServerView.CallbackAction.CONTINUE;
          }

          @Override
          public ServerView.CallbackAction segmentRemoved(final DataSegment segment)
          {
            removeSegment(segment);
            return ServerView.CallbackAction.CONTINUE;
          }

          @Override
          public ServerView.CallbackAction serverSegmentRemoved(
              final DruidServerMetadata server,
              final DataSegment segment
          )
          {
            removeServerSegment(server, segment);
            return ServerView.CallbackAction.CONTINUE;
          }
        }
    );
  }

  @LifecycleStart
  public void start() throws InterruptedException
  {
    cacheExec.submit(
        new Runnable()
        {
          @Override
          public void run()
          {
            try {
              while (!Thread.currentThread().isInterrupted()) {
                final Set<SegmentId> segmentsToRefresh = new TreeSet<>();
                final Set<String> dataSourcesToRebuild = new TreeSet<>();

                try {
                  synchronized (lock) {
                    final long nextRefreshNoFuzz = DateTimes
                        .utc(lastRefresh)
                        .plus(config.getMetadataRefreshPeriod())
                        .getMillis();

                    // Fuzz a bit to spread load out when we have multiple brokers.
                    final long nextRefresh = nextRefreshNoFuzz + (long) ((nextRefreshNoFuzz - lastRefresh) * 0.10);

                    while (true) {
                      // Do not refresh if it's too soon after a failure (to avoid rapid cycles of failure).
                      final boolean wasRecentFailure = DateTimes.utc(lastFailure)
                                                                .plus(config.getMetadataRefreshPeriod())
                                                                .isAfterNow();

                      if (isServerViewInitialized &&
                          !wasRecentFailure &&
                          (!segmentsNeedingRefresh.isEmpty() || !dataSourcesNeedingRebuild.isEmpty()) &&
                          (refreshImmediately || nextRefresh < System.currentTimeMillis())) {
                        // We need to do a refresh. Break out of the waiting loop.
                        break;
                      }

                      if (isServerViewInitialized) {
                        // Server view is initialized, but we don't need to do a refresh. Could happen if there are
                        // no segments in the system yet. Just mark us as initialized, then.
                        initialized.countDown();
                      }

                      // Wait some more, we'll wake up when it might be time to do another refresh.
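                      // Math.max(1, ...) keeps the timeout positive: wait(0) would block until notified and a
                      // negative value would throw IllegalArgumentException.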
                      lock.wait(Math.max(1, nextRefresh - System.currentTimeMillis()));
                    }

                    segmentsToRefresh.addAll(segmentsNeedingRefresh);
                    segmentsNeedingRefresh.clear();

                    // Mutable segments need a refresh every period, since new columns could be added dynamically.
                    segmentsNeedingRefresh.addAll(mutableSegments);

                    lastFailure = 0L;
                    lastRefresh = System.currentTimeMillis();
                    refreshImmediately = false;
                  }

                  // Refresh the segments.
                  final Set<SegmentId> refreshed = refreshSegments(segmentsToRefresh);

                  synchronized (lock) {
                    // Add missing segments back to the refresh list.
                    segmentsNeedingRefresh.addAll(Sets.difference(segmentsToRefresh, refreshed));

                    // Compute the list of dataSources to rebuild tables for.
                    dataSourcesToRebuild.addAll(dataSourcesNeedingRebuild);
                    refreshed.forEach(segment -> dataSourcesToRebuild.add(segment.getDataSource()));
                    dataSourcesNeedingRebuild.clear();

                    lock.notifyAll();
                  }

                  // Rebuild the dataSources.
                  for (String dataSource : dataSourcesToRebuild) {
                    final DruidTable druidTable = buildDruidTable(dataSource);
                    final DruidTable oldTable = tables.put(dataSource, druidTable);
                    if (oldTable == null || !oldTable.getRowSignature().equals(druidTable.getRowSignature())) {
                      log.info("dataSource [%s] has new signature: %s.", dataSource, druidTable.getRowSignature());
                    } else {
                      log.debug("dataSource [%s] signature is unchanged.", dataSource);
                    }
                  }

                  initialized.countDown();
                }
                catch (InterruptedException e) {
                  // Fall through.
                  throw e;
                }
                catch (Exception e) {
                  log.warn(e, "Metadata refresh failed, trying again soon.");

                  synchronized (lock) {
                    // Add our segments and dataSources back to their refresh and rebuild lists.
                    segmentsNeedingRefresh.addAll(segmentsToRefresh);
                    dataSourcesNeedingRebuild.addAll(dataSourcesToRebuild);
                    lastFailure = System.currentTimeMillis();
                    lock.notifyAll();
                  }
                }
              }
            }
            catch (InterruptedException e) {
              // Just exit.
            }
            catch (Throwable e) {
              // Throwables that fall out to here (not caught by an inner try/catch) are potentially gnarly, like
              // OOMEs. Anyway, let's just emit an alert and stop refreshing metadata.
              log.makeAlert(e, "Metadata refresh failed permanently").emit();
              throw e;
            }
            finally {
              log.info("Metadata refresh stopped.");
            }
          }
        }
    );

    if (config.isAwaitInitializationOnStart()) {
      final long startNanos = System.nanoTime();
      log.debug("%s waiting for initialization.", getClass().getSimpleName());
      awaitInitialization();
      log.info("%s initialized in [%,d] ms.", getClass().getSimpleName(), (System.nanoTime() - startNanos) / 1000000);
    }
  }

  @LifecycleStop
  public void stop()
  {
    cacheExec.shutdownNow();
  }

  public void awaitInitialization() throws InterruptedException
  {
    initialized.await();
  }

  @Override
  protected Map<String, Table> getTableMap()
  {
    return ImmutableMap.copyOf(tables);
  }

  @Override
  protected Multimap<String, org.apache.calcite.schema.Function> getFunctionMultimap()
  {
    final ImmutableMultimap.Builder<String, org.apache.calcite.schema.Function> builder = ImmutableMultimap.builder();
    for (Map.Entry<String, DruidViewMacro> entry : viewManager.getViews().entrySet()) {
      builder.put(entry);
    }
    return builder.build();
  }

  @VisibleForTesting
  void addSegment(final DruidServerMetadata server, final DataSegment segment)
  {
    synchronized (lock) {
      final Map<SegmentId, AvailableSegmentMetadata> knownSegments = segmentMetadataInfo.get(segment.getDataSource());
      AvailableSegmentMetadata segmentMetadata = knownSegments != null ? knownSegments.get(segment.getId()) : null;
      if (segmentMetadata == null) {
        // segmentReplicatable is used to determine if segments are served by historical or realtime servers
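        // isRealtime is kept as a long (0 or 1) rather than a boolean, presumably so it can be surfaced directly as
        // a numeric value (for example by the sys.segments system table).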
        long isRealtime = server.segmentReplicatable() ? 0 : 1;
        segmentMetadata = AvailableSegmentMetadata.builder(
            segment,
            isRealtime,
            ImmutableSet.of(server),
            null,
            DEFAULT_NUM_ROWS
        ).build();
        // Unknown segment.
        setAvailableSegmentMetadata(segment.getId(), segmentMetadata);
        segmentsNeedingRefresh.add(segment.getId());
        if (!server.segmentReplicatable()) {
          log.debug("Added new mutable segment[%s].", segment.getId());
          mutableSegments.add(segment.getId());
        } else {
          log.debug("Added new immutable segment[%s].", segment.getId());
        }
      } else {
        final Set<DruidServerMetadata> segmentServers = segmentMetadata.getReplicas();
        final ImmutableSet<DruidServerMetadata> servers = new ImmutableSet.Builder<DruidServerMetadata>()
            .addAll(segmentServers)
            .add(server)
            .build();
        final AvailableSegmentMetadata metadataWithNumReplicas = AvailableSegmentMetadata
            .from(segmentMetadata)
            .withReplicas(servers)
            .withRealtime(recomputeIsRealtime(servers))
            .build();
        knownSegments.put(segment.getId(), metadataWithNumReplicas);
        if (server.segmentReplicatable()) {
          // If a segment shows up on a replicatable (historical) server at any point, then it must be immutable,
          // even if it's also available on non-replicatable (realtime) servers.
          mutableSegments.remove(segment.getId());
          log.debug("Segment[%s] has become immutable.", segment.getId());
        }
      }
      if (!tables.containsKey(segment.getDataSource())) {
        refreshImmediately = true;
      }

      lock.notifyAll();
    }
  }

  @VisibleForTesting
  void removeSegment(final DataSegment segment)
  {
    synchronized (lock) {
      log.debug("Segment[%s] is gone.", segment.getId());

      dataSourcesNeedingRebuild.add(segment.getDataSource());
      segmentsNeedingRefresh.remove(segment.getId());
      mutableSegments.remove(segment.getId());

      final Map<SegmentId, AvailableSegmentMetadata> dataSourceSegments =
          segmentMetadataInfo.get(segment.getDataSource());
      if (dataSourceSegments.remove(segment.getId()) != null) {
        totalSegments--;
      }

      if (dataSourceSegments.isEmpty()) {
        segmentMetadataInfo.remove(segment.getDataSource());
        tables.remove(segment.getDataSource());
        log.info("dataSource[%s] no longer exists, all metadata removed.", segment.getDataSource());
      }

      lock.notifyAll();
    }
  }

  @VisibleForTesting
  void removeServerSegment(final DruidServerMetadata server, final DataSegment segment)
  {
    synchronized (lock) {
      log.debug("Segment[%s] is gone from server[%s]", segment.getId(), server.getName());
      final Map<SegmentId, AvailableSegmentMetadata> knownSegments = segmentMetadataInfo.get(segment.getDataSource());
      final AvailableSegmentMetadata segmentMetadata = knownSegments.get(segment.getId());
      final Set<DruidServerMetadata> segmentServers = segmentMetadata.getReplicas();
      final ImmutableSet<DruidServerMetadata> servers = FluentIterable
          .from(segmentServers)
          .filter(Predicates.not(Predicates.equalTo(server)))
          .toSet();

      final AvailableSegmentMetadata metadataWithNumReplicas = AvailableSegmentMetadata
          .from(segmentMetadata)
          .withReplicas(servers)
          .withRealtime(recomputeIsRealtime(servers))
          .build();
      knownSegments.put(segment.getId(), metadataWithNumReplicas);
      lock.notifyAll();
    }
  }

  /**
   * Attempt to refresh "segmentSignatures" for a set of segments. Returns the set of segments actually refreshed,
   * which may be a subset of the asked-for set.
   */
  @VisibleForTesting
  Set<SegmentId> refreshSegments(final Set<SegmentId> segments) throws IOException
  {
    final Set<SegmentId> retVal = new HashSet<>();

    // Organize segments by dataSource.
    final Map<String, TreeSet<SegmentId>> segmentMap = new TreeMap<>();

    for (SegmentId segmentId : segments) {
      segmentMap.computeIfAbsent(segmentId.getDataSource(), x -> new TreeSet<>(SEGMENT_ORDER))
                .add(segmentId);
    }

    for (Map.Entry<String, TreeSet<SegmentId>> entry : segmentMap.entrySet()) {
      final String dataSource = entry.getKey();
      retVal.addAll(refreshSegmentsForDataSource(dataSource, entry.getValue()));
    }

    return retVal;
  }

  private long recomputeIsRealtime(ImmutableSet<DruidServerMetadata> servers)
  {
    final Optional<DruidServerMetadata> historicalServer = servers
        .stream()
        .filter(metadata -> metadata.getType().equals(ServerType.HISTORICAL))
        .findAny();

    // if there is any historical server in the replicas, isRealtime flag should be unset
    final long isRealtime = historicalServer.isPresent() ? 0 : 1;
    return isRealtime;
  }

  /**
   * Attempt to refresh "segmentSignatures" for a set of segments for a particular dataSource. Returns the set of
   * segments actually refreshed, which may be a subset of the asked-for set.
   */
  private Set<SegmentId> refreshSegmentsForDataSource(final String dataSource, final Set<SegmentId> segments)
      throws IOException
  {
    if (!segments.stream().allMatch(segmentId -> segmentId.getDataSource().equals(dataSource))) {
      // Sanity check. We definitely expect this to pass.
      throw new ISE("'segments' must all match 'dataSource'!");
    }

    log.debug("Refreshing metadata for dataSource[%s].", dataSource);

    final long startTime = System.currentTimeMillis();

    // Segment id string -> SegmentId object.
    final Map<String, SegmentId> segmentIdMap = Maps.uniqueIndex(segments, SegmentId::toString);

    final Set<SegmentId> retVal = new HashSet<>();
    final Sequence<SegmentAnalysis> sequence = runSegmentMetadataQuery(
        queryLifecycleFactory,
        Iterables.limit(segments, MAX_SEGMENTS_PER_QUERY),
        escalator.createEscalatedAuthenticationResult()
    );

    Yielder<SegmentAnalysis> yielder = Yielders.each(sequence);

    try {
      while (!yielder.isDone()) {
        final SegmentAnalysis analysis = yielder.get();
        final SegmentId segmentId = segmentIdMap.get(analysis.getId());

        if (segmentId == null) {
          log.warn("Got analysis for segment[%s] we didn't ask for, ignoring.", analysis.getId());
        } else {
          synchronized (lock) {
            final RowSignature rowSignature = analysisToRowSignature(analysis);
            log.debug("Segment[%s] has signature[%s].", segmentId, rowSignature);
            final Map<SegmentId, AvailableSegmentMetadata> dataSourceSegments = segmentMetadataInfo.get(dataSource);
            if (dataSourceSegments == null) {
              // Datasource may have been removed or become unavailable while this refresh was ongoing.
              log.warn(
                  "No segment map found with datasource[%s], skipping refresh of segment[%s]",
                  dataSource,
                  segmentId
              );
            } else {
              final AvailableSegmentMetadata segmentMetadata = dataSourceSegments.get(segmentId);
              if (segmentMetadata == null) {
                log.warn("No segment[%s] found, skipping refresh", segmentId);
              } else {
                final AvailableSegmentMetadata updatedSegmentMetadata = AvailableSegmentMetadata
                    .from(segmentMetadata)
                    .withRowSignature(rowSignature)
                    .withNumRows(analysis.getNumRows())
                    .build();
                dataSourceSegments.put(segmentId, updatedSegmentMetadata);
                setAvailableSegmentMetadata(segmentId, updatedSegmentMetadata);
                retVal.add(segmentId);
              }
            }
          }
        }

        yielder = yielder.next(null);
      }
    }
    finally {
      yielder.close();
    }

    log.debug(
        "Refreshed metadata for dataSource[%s] in %,d ms (%d segments queried, %d segments left).",
        dataSource,
        System.currentTimeMillis() - startTime,
        retVal.size(),
        segments.size() - retVal.size()
    );

    return retVal;
  }

  @VisibleForTesting
  void setAvailableSegmentMetadata(final SegmentId segmentId, final AvailableSegmentMetadata availableSegmentMetadata)
  {
    synchronized (lock) {
      TreeMap<SegmentId, AvailableSegmentMetadata> dataSourceSegments = segmentMetadataInfo.computeIfAbsent(
          segmentId.getDataSource(),
          x -> new TreeMap<>(SEGMENT_ORDER)
      );
      if (dataSourceSegments.put(segmentId, availableSegmentMetadata) == null) {
        totalSegments++;
      }
    }
  }

  private DruidTable buildDruidTable(final String dataSource)
  {
    synchronized (lock) {
      final Map<SegmentId, AvailableSegmentMetadata> segmentMap = segmentMetadataInfo.get(dataSource);
      final Map<String, ValueType> columnTypes = new TreeMap<>();

      if (segmentMap != null) {
        for (AvailableSegmentMetadata availableSegmentMetadata : segmentMap.values()) {
          final RowSignature rowSignature = availableSegmentMetadata.getRowSignature();
          if (rowSignature != null) {
            for (String column : rowSignature.getRowOrder()) {
              // Newer column types should override older ones.
              columnTypes.putIfAbsent(column, rowSignature.getColumnType(column));
            }
          }
        }
      }

      final RowSignature.Builder builder = RowSignature.builder();
      columnTypes.forEach(builder::add);
      return new DruidTable(new TableDataSource(dataSource), builder.build());
    }
  }

  private static Sequence<SegmentAnalysis> runSegmentMetadataQuery(
      final QueryLifecycleFactory queryLifecycleFactory,
      final Iterable<SegmentId> segments,
      final AuthenticationResult authenticationResult
  )
  {
    // Sanity check: getOnlyElement of a set, to ensure all segments have the same dataSource.
    final String dataSource = Iterables.getOnlyElement(
        StreamSupport.stream(segments.spliterator(), false)
                     .map(SegmentId::getDataSource).collect(Collectors.toSet())
    );

    final MultipleSpecificSegmentSpec querySegmentSpec = new MultipleSpecificSegmentSpec(
        StreamSupport.stream(segments.spliterator(), false)
                     .map(SegmentId::toDescriptor).collect(Collectors.toList())
    );

    final SegmentMetadataQuery segmentMetadataQuery = new SegmentMetadataQuery(
        new TableDataSource(dataSource),
        querySegmentSpec,
        new AllColumnIncluderator(),
        false,
        ImmutableMap.of(),
        EnumSet.noneOf(SegmentMetadataQuery.AnalysisType.class),
        false,
        false
    );

    return queryLifecycleFactory.factorize().runSimple(segmentMetadataQuery, authenticationResult, null);
  }

  private static RowSignature analysisToRowSignature(final SegmentAnalysis analysis)
  {
    final RowSignature.Builder rowSignatureBuilder = RowSignature.builder();
    for (Map.Entry<String, ColumnAnalysis> entry : analysis.getColumns().entrySet()) {
      if (entry.getValue().isError()) {
        // Skip columns with analysis errors.
        continue;
      }

      ValueType valueType;
      try {
        valueType = ValueType.valueOf(StringUtils.toUpperCase(entry.getValue().getType()));
      }
      catch (IllegalArgumentException e) {
        // Assume unrecognized types are some flavor of COMPLEX. This throws away information about exactly
        // what kind of complex column it is, which we may want to preserve some day.
        valueType = ValueType.COMPLEX;
      }

      rowSignatureBuilder.add(entry.getKey(), valueType);
    }
    return rowSignatureBuilder.build();
  }

  Map<SegmentId, AvailableSegmentMetadata> getSegmentMetadataSnapshot()
  {
    final Map<SegmentId, AvailableSegmentMetadata> segmentMetadata = new HashMap<>();
    synchronized (lock) {
      for (TreeMap<SegmentId, AvailableSegmentMetadata> val : segmentMetadataInfo.values()) {
        segmentMetadata.putAll(val);
      }
    }
    return segmentMetadata;
  }

  int getTotalSegments()
  {
    return totalSegments;
  }
}
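
A minimal usage sketch (illustrative only, not part of the file above), assuming the five constructor dependencies (queryLifecycleFactory, serverView, plannerConfig, viewManager, escalator) are already available, for example from Guice or a test harness. getTableMap() is protected, so the lookup below only compiles from within the same package, as Druid's own tests do; start() and awaitInitialization() throw InterruptedException, so the calling code must declare or handle it. The dataSource name "foo" is hypothetical.

DruidSchema schema = new DruidSchema(queryLifecycleFactory, serverView, plannerConfig, viewManager, escalator);
schema.start();                // spawns the background metadata-refresh task on the cache executor
schema.awaitInitialization();  // blocks until the first refresh pass has populated the table map
Table fooTable = schema.getTableMap().get("foo");  // Calcite Table for dataSource "foo", or null if unknown
schema.stop();                 // shuts down the cache executor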



