All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.druid.query.materializedview.DataSourceOptimizer Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.query.materializedview;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedSet;
import com.google.inject.Inject;
import org.apache.druid.client.TimelineServerView;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.query.Query;
import org.apache.druid.query.TableDataSource;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.spec.MultipleIntervalSegmentSpec;
import org.apache.druid.query.timeseries.TimeseriesQuery;
import org.apache.druid.query.topn.TopNQuery;
import org.apache.druid.timeline.TimelineObjectHolder;
import org.joda.time.Interval;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;

public class DataSourceOptimizer
{
  private final ReadWriteLock lock = new ReentrantReadWriteLock();
  private final TimelineServerView serverView;
  private ConcurrentHashMap derivativesHitCount = new ConcurrentHashMap<>();
  private ConcurrentHashMap totalCount = new ConcurrentHashMap<>();
  private ConcurrentHashMap hitCount = new ConcurrentHashMap<>();
  private ConcurrentHashMap costTime = new ConcurrentHashMap<>();
  private ConcurrentHashMap, AtomicLong>> missFields = new ConcurrentHashMap<>();

  @Inject
  public DataSourceOptimizer(TimelineServerView serverView)
  {
    this.serverView = serverView;
  }

  /**
   * Do main work about materialized view selection: transform user query to one or more sub-queries.
   *
   * In the sub-query, the dataSource is the derivative of dataSource in user query, and sum of all sub-queries'
   * intervals equals the interval in user query
   *
   * Derived dataSource with smallest average data size per segment granularity have highest priority to replace the
   * datasource in user query
   *
   * @param query only TopNQuery/TimeseriesQuery/GroupByQuery can be optimized
   * @return a list of queries with specified derived dataSources and intervals
   */
  public List optimize(Query query)
  {
    long start = System.currentTimeMillis();
    // only topN/timeseries/groupby query can be optimized
    // only TableDataSource can be optimiezed
    if (!(query instanceof TopNQuery || query instanceof TimeseriesQuery || query instanceof GroupByQuery)
        || !(query.getDataSource() instanceof TableDataSource)) {
      return Collections.singletonList(query);
    }
    String datasourceName = ((TableDataSource) query.getDataSource()).getName();
    // get all derivatives for datasource in query. The derivatives set is sorted by average size of
    // per segment granularity.
    Set derivatives = DerivativeDataSourceManager.getDerivatives(datasourceName);

    if (derivatives.isEmpty()) {
      return Collections.singletonList(query);
    }
    lock.readLock().lock();
    try {
      totalCount.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0)).incrementAndGet();
      hitCount.putIfAbsent(datasourceName, new AtomicLong(0));
      AtomicLong costTimeOfDataSource = costTime.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0));

      // get all fields which the query required
      Set requiredFields = MaterializedViewUtils.getRequiredFields(query);

      Set derivativesWithRequiredFields = new HashSet<>();
      for (DerivativeDataSource derivativeDataSource : derivatives) {
        derivativesHitCount.putIfAbsent(derivativeDataSource.getName(), new AtomicLong(0));
        if (derivativeDataSource.getColumns().containsAll(requiredFields)) {
          derivativesWithRequiredFields.add(derivativeDataSource);
        }
      }
      // if no derivatives contains all required dimensions, this materialized view selection failed.
      if (derivativesWithRequiredFields.isEmpty()) {
        missFields
            .computeIfAbsent(datasourceName, dsName -> new ConcurrentHashMap<>())
            .computeIfAbsent(requiredFields, rf -> new AtomicLong(0))
            .incrementAndGet();
        costTimeOfDataSource.addAndGet(System.currentTimeMillis() - start);
        return Collections.singletonList(query);
      }

      List queries = new ArrayList<>();
      List remainingQueryIntervals = query.getIntervals();

      for (DerivativeDataSource derivativeDataSource : ImmutableSortedSet.copyOf(derivativesWithRequiredFields)) {
        TableDataSource tableDataSource = new TableDataSource(derivativeDataSource.getName());
        final List derivativeIntervals = remainingQueryIntervals.stream()
            .flatMap(interval -> serverView
                .getTimeline(tableDataSource)
                .orElseThrow(() -> new ISE("No timeline for dataSource: %s", derivativeDataSource.getName()))
                .lookup(interval)
                .stream()
                .map(TimelineObjectHolder::getInterval)
            )
            .collect(Collectors.toList());
        // if the derivative does not contain any parts of intervals in the query, the derivative will
        // not be selected.
        if (derivativeIntervals.isEmpty()) {
          continue;
        }

        remainingQueryIntervals = MaterializedViewUtils.minus(remainingQueryIntervals, derivativeIntervals);
        queries.add(
            query.withDataSource(new TableDataSource(derivativeDataSource.getName()))
                .withQuerySegmentSpec(new MultipleIntervalSegmentSpec(derivativeIntervals))
        );
        derivativesHitCount.get(derivativeDataSource.getName()).incrementAndGet();
        if (remainingQueryIntervals.isEmpty()) {
          break;
        }
      }

      if (queries.isEmpty()) {
        costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
        return Collections.singletonList(query);
      }

      //after materialized view selection, the result of the remaining query interval will be computed based on
      // the original datasource.
      if (!remainingQueryIntervals.isEmpty()) {
        queries.add(query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(remainingQueryIntervals)));
      }
      hitCount.get(datasourceName).incrementAndGet();
      costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
      return queries;
    }
    finally {
      lock.readLock().unlock();
    }
  }

  public List getAndResetStats()
  {
    ImmutableMap derivativesHitCountSnapshot;
    ImmutableMap totalCountSnapshot;
    ImmutableMap hitCountSnapshot;
    ImmutableMap costTimeSnapshot;
    ImmutableMap, AtomicLong>> missFieldsSnapshot;
    lock.writeLock().lock();
    try {
      derivativesHitCountSnapshot = ImmutableMap.copyOf(derivativesHitCount);
      totalCountSnapshot = ImmutableMap.copyOf(totalCount);
      hitCountSnapshot = ImmutableMap.copyOf(hitCount);
      costTimeSnapshot = ImmutableMap.copyOf(costTime);
      missFieldsSnapshot = ImmutableMap.copyOf(missFields);
      derivativesHitCount.clear();
      totalCount.clear();
      hitCount.clear();
      costTime.clear();
      missFields.clear();
    }
    finally {
      lock.writeLock().unlock();
    }
    List stats = new ArrayList<>();
    Map> baseToDerivatives = DerivativeDataSourceManager.getAllDerivatives();
    for (Map.Entry> entry : baseToDerivatives.entrySet()) {
      Map derivativesStat = new HashMap<>();
      for (DerivativeDataSource derivative : entry.getValue()) {
        derivativesStat.put(
            derivative.getName(),
            derivativesHitCountSnapshot.getOrDefault(derivative.getName(), new AtomicLong(0)).get()
        );
      }
      stats.add(
          new DataSourceOptimizerStats(
              entry.getKey(),
              hitCountSnapshot.getOrDefault(entry.getKey(), new AtomicLong(0)).get(),
              totalCountSnapshot.getOrDefault(entry.getKey(), new AtomicLong(0)).get(),
              costTimeSnapshot.getOrDefault(entry.getKey(), new AtomicLong(0)).get(),
              missFieldsSnapshot.getOrDefault(entry.getKey(), new ConcurrentHashMap<>()),
              derivativesStat
          )
      );
    }
    return stats;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy