com.yahoo.bard.webservice.data.DruidQueryBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fili-core Show documentation
Fili web service library provides core capabilities for RESTful aggregation navigation, query planning and metadata
There is a newer version: 1.1.13
// Copyright 2017 Yahoo Inc.
// Licensed under the terms of the Apache license. Please see LICENSE.md file distributed with this work for terms.
package com.yahoo.bard.webservice.data;

import static com.yahoo.bard.webservice.web.ErrorMessageFormat.TOP_N_UNSORTED;

import com.yahoo.bard.webservice.config.BardFeatureFlag;
import com.yahoo.bard.webservice.data.dimension.Dimension;
import com.yahoo.bard.webservice.data.dimension.FilterBuilderException;
import com.yahoo.bard.webservice.data.dimension.VirtualDimension;
import com.yahoo.bard.webservice.data.metric.TemplateDruidQuery;
import com.yahoo.bard.webservice.data.time.Granularity;
import com.yahoo.bard.webservice.druid.model.builders.DruidFilterBuilder;
import com.yahoo.bard.webservice.druid.model.builders.DruidHavingBuilder;
import com.yahoo.bard.webservice.druid.model.datasource.DataSource;
import com.yahoo.bard.webservice.druid.model.datasource.QueryDataSource;
import com.yahoo.bard.webservice.druid.model.datasource.TableDataSource;
import com.yahoo.bard.webservice.druid.model.datasource.UnionDataSource;
import com.yahoo.bard.webservice.druid.model.filter.Filter;
import com.yahoo.bard.webservice.druid.model.having.Having;
import com.yahoo.bard.webservice.druid.model.orderby.LimitSpec;
import com.yahoo.bard.webservice.druid.model.orderby.OrderByColumn;
import com.yahoo.bard.webservice.druid.model.orderby.TopNMetric;
import com.yahoo.bard.webservice.druid.model.query.DruidAggregationQuery;
import com.yahoo.bard.webservice.druid.model.query.GroupByQuery;
import com.yahoo.bard.webservice.druid.model.query.TimeSeriesQuery;
import com.yahoo.bard.webservice.druid.model.query.TopNQuery;
import com.yahoo.bard.webservice.table.ConstrainedTable;
import com.yahoo.bard.webservice.table.LogicalTable;
import com.yahoo.bard.webservice.table.LogicalTableDictionary;
import com.yahoo.bard.webservice.table.TableGroup;
import com.yahoo.bard.webservice.table.TableIdentifier;
import com.yahoo.bard.webservice.table.resolver.NoMatchFoundException;
import com.yahoo.bard.webservice.table.resolver.PhysicalTableResolver;
import com.yahoo.bard.webservice.table.resolver.QueryPlanningConstraint;
import com.yahoo.bard.webservice.web.apirequest.DataApiRequest;
import com.yahoo.bard.webservice.web.filters.ApiFilters;

import org.joda.time.DateTimeZone;
import org.joda.time.Interval;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import javax.inject.Inject;
import javax.inject.Singleton;

/**
 * Druid Query builder class.
 */
@Singleton
public class DruidQueryBuilder {

    private static final Logger LOG = LoggerFactory.getLogger(DruidQueryBuilder.class);
    protected final LogicalTableDictionary tableDictionary;
    protected final PhysicalTableResolver resolver;
    protected final DruidFilterBuilder druidFilterBuilder;
    protected final DruidHavingBuilder druidHavingBuilder;

    /**
     * Constructor.
     *
     * @param tableDictionary  Dictionary of logical tables used to look up table groups
     * @param resolver  Strategy for resolving the physical table
     * @param druidFilterBuilder  A factory methods for druid filters
     * @param druidHavingBuilder  A factory methods for druid havings
     */
    @Inject
    public DruidQueryBuilder(
            LogicalTableDictionary tableDictionary,
            PhysicalTableResolver resolver,
            DruidFilterBuilder druidFilterBuilder,
            DruidHavingBuilder druidHavingBuilder
    ) {
        this.tableDictionary = tableDictionary;
        this.resolver = resolver;
        this.druidFilterBuilder = druidFilterBuilder;
        this.druidHavingBuilder = druidHavingBuilder;
        LOG.trace("Table dictionary: {} \nPhysical table resolver: {}", tableDictionary, resolver);
    }

    /**
     * Build a druid query object from an API request and it's templateDruidQuery.
     *
     * @param request  DataApiRequest to use in building the query
     * @param template  TemplateDruidQuery to build out the query with
     *
     * @return a DruidAggregationQuery
     * @throws FilterBuilderException if filters cannot be built or resolve to no rows
     * @throws NoMatchFoundException if no PhysicalTable satisfies this request
     */
    public DruidAggregationQuery buildQuery(
            DataApiRequest request,
            TemplateDruidQuery template
    ) throws FilterBuilderException, NoMatchFoundException {

        LOG.trace("Building druid query with DataApiRequest: {} and TemplateDruidQuery: {}", request, template);

        // Whether to build the orderBy clause
        LimitSpec druidOrderBy;
        // Whether to build a topN query
        TopNMetric druidTopNMetric;

        if (request.getTopN().isPresent()) {
            //This is a topN query

            if (canOptimizeTopN(request, template)) {
                druidOrderBy = null;
                OrderByColumn sortBy = request.getSorts().iterator().next();
                druidTopNMetric = new TopNMetric(sortBy.getDimension(), sortBy.getDirection());
            } else if (request.getSorts().size() > 0) {
                druidOrderBy = new LimitSpec(request.getSorts());
                druidTopNMetric = null;
            } else {
                // We don't expect to reach this point. This is checked in DataApiRequest. Here for completeness
                throw new UnsupportedOperationException(TOP_N_UNSORTED.format(request.getTopN()));
            }
        } else if (request.getSorts().isEmpty() && !request.getCount().isPresent()) {
            //This is an arbitrary groupBy query
            druidOrderBy = null;
            druidTopNMetric = null;
        } else {
            //This is a sorted and/or a limited groupBy query
            druidOrderBy = new LimitSpec(request.getSorts(), request.getCount());
            druidTopNMetric = null;
        }

        // Get the tableGroup from the logical table and the alias
        LogicalTable logicalTable = tableDictionary.get(TableIdentifier.create(request));
        TableGroup group = logicalTable.getTableGroup();

        // combine the filters on the requested logical table with the api query filters
        DataApiRequest tableFilteredRequest = request.withFilters(
                logicalTable.getFilters()
                        .map(f -> ApiFilters.union(f, request.getApiFilters()))
                        .orElseGet(request::getApiFilters)
        );

        // Resolve the table from the the group, the combined dimensions in request, and template time grain
        QueryPlanningConstraint constraint = new QueryPlanningConstraint(tableFilteredRequest, template);
        ConstrainedTable table = resolver.resolve(group.getPhysicalTables(), constraint).withConstraint(constraint);


        Filter filter = druidFilterBuilder.buildFilters(tableFilteredRequest.getApiFilters());

        Set nonVirtualGroupingDimensions = tableFilteredRequest.getAllGroupingDimensions().stream()
                .filter(dim -> ! (dim instanceof VirtualDimension))
                .collect(Collectors.toCollection(LinkedHashSet::new));

        return druidTopNMetric != null ?
            buildTopNQuery(
                    template,
                    table,
                    tableFilteredRequest.getGranularity(),
                    tableFilteredRequest.getTimeZone(),
                    nonVirtualGroupingDimensions,
                    filter,
                    tableFilteredRequest.getIntervals(),
                    druidTopNMetric,
                    tableFilteredRequest.getTopN().get()
            ) :
             canOptimizeTimeSeries(tableFilteredRequest, template) ?
                buildTimeSeriesQuery(
                        template,
                        table,
                        tableFilteredRequest.getGranularity(),
                        tableFilteredRequest.getTimeZone(),
                        filter,
                        tableFilteredRequest.getIntervals()
                ) :
                buildGroupByQuery(
                        template,
                        table,
                        tableFilteredRequest.getGranularity(),
                        tableFilteredRequest.getTimeZone(),
                        nonVirtualGroupingDimensions,
                        filter,
                        druidHavingBuilder.buildHavings(tableFilteredRequest.getHavings()),
                        tableFilteredRequest.getIntervals(),
                        druidOrderBy
                );
    }

    /**
     * Builds a druid groupBy query recursively nesting dataSource based on the TemplateDruidQuery.
     *
     * @param template  The query template, possibly nested
     * @param table  The physical table that underlies the lowest-level datasource
     * @param granularity  The granularity from the request
     * @param timeZone  The time zone from the request
     * @param groupByDimensions  The grouping dimensions from the request
     * @param filter  The filters specified in the request (only applied at the lowest level)
     * @param having  The having clause specified in the request.
     * @param intervals  The intervals specified from the request
     * @param druidOrderBy  The order by
     *
     * @return a GroupByQuery
     */
    protected GroupByQuery buildGroupByQuery(
            TemplateDruidQuery template,
            ConstrainedTable table,
            Granularity granularity,
            DateTimeZone timeZone,
            Set groupByDimensions,
            Filter filter,
            Having having,
            List intervals,
            LimitSpec druidOrderBy
    ) {
        LOG.trace(
                "Building druid groupBy query with following parameters \n" +
                        "TemplateDruidQuery: {} \n" +
                        "TimeGrain: {} \n" +
                        "TimeZone: {} \n" +
                        "Table: {} \n" +
                        "Group by dimensions: {} \n" +
                        "Filter: {} \n" +
                        "Intervals: {} \n",
                template,
                granularity,
                timeZone,
                table,
                groupByDimensions,
                filter,
                intervals
        );

        Filter mergedFilter = filter;

        // Override the grain with what's set in the template if it has one set
        Granularity mergedGranularity = template.getTimeGrain() != null
                ? template.getTimeGrain().buildZonedTimeGrain(timeZone)
                : granularity;

        DataSource dataSource;
        if (!template.isNested()) {
            LOG.trace("Building a single pass druid groupBy query");
            dataSource = buildTableDataSource(table);
        } else {
            LOG.trace("Building a multi pass druid groupBy query");
            // Build the inner query without an order by, since we only want to do that at the top level
            // Sorts and Having don't apply to inner queries and Filters only apply to the innermost query
            GroupByQuery query = buildGroupByQuery(
                    template.getInnerQuery().get(),
                    table,
                    mergedGranularity,
                    timeZone,
                    groupByDimensions,
                    mergedFilter,
                    (Having) null,
                    intervals,
                    (LimitSpec) null
            );
            dataSource = new QueryDataSource(query);
            // Filters have been handled by the inner query, are not needed/allowed on the outer query
            mergedFilter = null;
        }

        // Filters must be applied at the lowest level as they exclude data from aggregates
        return new GroupByQuery(
                dataSource,
                mergedGranularity,
                groupByDimensions,
                mergedFilter,
                having,
                template.getAggregations(),
                template.getPostAggregations(),
                intervals,
                druidOrderBy,
                null,
                false,
                template.getVirtualColumns()
        );
    }

    /**
     * Build a data source from a table.
     *
     * @param table A fact table or fact table view
     *
     * @return A table datasource for a fact table or a union data source for a fact table view
     */
    private DataSource buildTableDataSource(ConstrainedTable table) {
        if (table.getDataSourceNames().size() == 1) {
            return new TableDataSource(table);
        } else {
            return new UnionDataSource(table);
        }
    }

    /**
     * Builds a druid topN query.
     *
     * @param template  The query template. Not nested since nesting is not supported in druid topN queries
     * @param table  The physical table that underlies the lowest-level datasource
     * @param granularity  The grain from the request
     * @param timeZone  The time zone from the request
     * @param groupByDimension  The grouping dimension from the request
     * @param filter  The filters specified in the request
     * @param intervals  The intervals specified from the request
     * @param metricSpec The topn metric spec
     * @param topN  The number of requested top entries per time bucket
     *
     * @return a TopNQuery
     */
    protected TopNQuery buildTopNQuery(
            TemplateDruidQuery template,
            ConstrainedTable table,
            Granularity granularity,
            DateTimeZone timeZone,
            Set groupByDimension,
            Filter filter,
            List intervals,
            TopNMetric metricSpec,
            int topN
    ) {
        LOG.trace(
                "Building druid topN query with following parameters \n" +
                        "TemplateDruidQuery: {} \n" +
                        "TimeGrain: {} \n" +
                        "TimeZone: {} \n" +
                        "Table: {} \n" +
                        "Group by dimensions: {} \n" +
                        "Filter: {} \n" +
                        "Intervals: {} \n" +
                        "MetricSpec: {} \n" +
                        "Threshold: {} \n",
                template,
                granularity,
                timeZone,
                table,
                groupByDimension,
                filter,
                intervals,
                metricSpec,
                topN
        );

        // Override the grain with what's set in the template if it has one set
        Granularity mergeGrain = (template.getTimeGrain() != null) ?
                template.getTimeGrain().buildZonedTimeGrain(timeZone) :
                granularity;

        LOG.trace("Building a single pass druid topN query");

        // The data source is the table directly, since there is no nested query below us
        return new TopNQuery(
                buildTableDataSource(table),
                // The check that the set of dimensions has exactly one element is currently done above
                mergeGrain,
                groupByDimension.iterator().next(),
                filter,
                template.getAggregations(),
                template.getPostAggregations(),
                intervals,
                topN,
                metricSpec,
                template.getVirtualColumns()
        );
    }

    /**
     * Builds a druid TimeSeries query.
     *
     * @param template  The query template. Not nested since nesting is not supported in druid timeseries queries
     * @param table  The physical table that underlies the lowest-level datasource
     * @param granularity  The grain from the request
     * @param timeZone  The time zone from the request
     * @param filter  The filters specified in the request
     * @param intervals  The intervals specified from the request
     *
     * @return a TimeSeriesQuery
     */
    protected TimeSeriesQuery buildTimeSeriesQuery(
            TemplateDruidQuery template,
            ConstrainedTable table,
            Granularity granularity,
            DateTimeZone timeZone,
            Filter filter,
            List intervals
    ) {
        LOG.trace(
                "Building druid timeseries query with following parameters \n" +
                        "TemplateDruidQuery: {} \n" +
                        "TimeGrain: {} \n" +
                        "Time Zone: {} \n" +
                        "Table: {} \n" +
                        "Filter: {} \n" +
                        "Intervals: {} \n",
                template,
                granularity,
                timeZone,
                table,
                filter,
                intervals
        );

        // Override the grain with what's set in the template if it has one set
        if (template.getTimeGrain() != null) {
            granularity = template.getTimeGrain().buildZonedTimeGrain(timeZone);
        }

        LOG.trace("Building a single pass druid timeseries query");

        return new TimeSeriesQuery(
                buildTableDataSource(table),
                granularity,
                filter,
                template.getAggregations(),
                template.getPostAggregations(),
                intervals,
                template.getVirtualColumns()
        );
    }

    /**
     * Determine if the optimization to a TopN query can be done.
     *
     * @param apiRequest  The request data
     * @param templateDruidQuery  The template query
     *
     * @return true if the optimization can be done, false if it can't
     */
    protected boolean canOptimizeTopN(DataApiRequest apiRequest, TemplateDruidQuery templateDruidQuery) {
        return apiRequest.optimizeBackendQuery() &&
                apiRequest.getDimensions().size() == 1 &&
                apiRequest.getSorts().size() == 1 &&
                !templateDruidQuery.isNested() &&
                BardFeatureFlag.TOP_N.isOn() &&
                apiRequest.getHavings().isEmpty();
    }

    /**
     * Determine if the optimization to a Timeseries query can be done.
     *
     * @param apiRequest  The request data
     * @param templateDruidQuery  The template query
     *
     * @return true if the optimization can be done, false if it can't
     */
    protected boolean canOptimizeTimeSeries(DataApiRequest apiRequest, TemplateDruidQuery templateDruidQuery) {
        return apiRequest.optimizeBackendQuery() &&
            apiRequest.getDimensions().isEmpty() &&
                !templateDruidQuery.isNested() &&
                apiRequest.getSorts().isEmpty() &&
                !apiRequest.getCount().isPresent() &&
                apiRequest.getHavings().isEmpty();
    }
}