All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.cdap.plugin.gcp.bigquery.sqlengine.builder.BigQueryDeduplicateSQLBuilder Maven / Gradle / Ivy

/*
 * Copyright © 2022 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.bigquery.sqlengine.builder;

import com.google.common.annotations.VisibleForTesting;
import io.cdap.cdap.etl.api.aggregation.DeduplicateAggregationDefinition;
import io.cdap.cdap.etl.api.relational.Expression;
import io.cdap.plugin.gcp.bigquery.relational.SQLExpression;
import io.cdap.plugin.gcp.bigquery.sqlengine.util.BigQuerySQLEngineUtils;

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Helper class used to generate BigQuery SQL Statements for Select statements.
 */
public class BigQueryDeduplicateSQLBuilder extends BigQueryBaseSQLBuilder {
  private static final String ROW_NUM_PREFIX = "rn_";

  private final DeduplicateAggregationDefinition deduplicationDefinition;
  private final String source;
  private final String sourceAlias;
  private final String rowNumColumnAlias;

  public BigQueryDeduplicateSQLBuilder(DeduplicateAggregationDefinition deduplicationDefinition,
                                       String sourceExpression,
                                       String sourceAlias) {
    this(deduplicationDefinition,
         sourceExpression,
         sourceAlias,
         ROW_NUM_PREFIX + BigQuerySQLEngineUtils.newIdentifier());
  }

  @VisibleForTesting
  protected BigQueryDeduplicateSQLBuilder(DeduplicateAggregationDefinition deduplicationDefinition,
                                          String source,
                                          String sourceAlias,
                                          String rowNumColumnAlias) {
    this.deduplicationDefinition = deduplicationDefinition;
    this.source = source;
    this.sourceAlias = sourceAlias;
    // This is the alias used to store the row number value. Format is "rn_"
    this.rowNumColumnAlias = rowNumColumnAlias;
  }

  public String getQuery() {
    return String.format(SELECT_DEDUPLICATE_STATEMENT,
                         rowNumColumnAlias,
                         getInnerSelect(),
                         rowNumColumnAlias);
  }

  /**
   * Get select statement for the underlying relation which includes a row number column used to deduplicate
   * @return select statement that can be used to enumerate duplicate rows
   */
  @VisibleForTesting
  protected String getInnerSelect() {
    return SELECT + getSelectedFields(deduplicationDefinition) +
      FROM + OPEN_GROUP + SPACE + source + SPACE + CLOSE_GROUP + AS + sourceAlias;
  }

  /**
   * Gets selected fields as a string. This also includes a field used for assigning row numbers.
   *
   * @return selected fields separated by commas
   */
  @VisibleForTesting
  protected String getSelectedFields(DeduplicateAggregationDefinition def) {
    Set columns = getSelectColumnsStream(def.getSelectExpressions())
      .collect(Collectors.toCollection(LinkedHashSet::new));
    columns.add(getRowNumColumn(def));
    return String.join(COMMA, columns);
  }

  /**
   * Build statement to generate a row number based on the supplied deduplication definition.
   * @param def deduplication definition
   * @return statement used to assign row numbers to output columns.
   */
  @VisibleForTesting
  protected String getRowNumColumn(DeduplicateAggregationDefinition def) {
    StringBuilder window = new StringBuilder();
    // Add partition by clause for windowing
    window.append(PARTITION_BY).append(getPartitionByFields(def.getGroupByExpressions()));
    // Add ordering clause if specified
    if (def.getFilterExpressions() != null && def.getFilterExpressions().size() > 0) {
      window.append(SPACE).append(ORDER_BY).append(getOrderByFields(def.getFilterExpressions()));
    }
    return String.format(ROW_NUMBER_PARTITION_COLUMN,
                         window.toString(),
                         rowNumColumnAlias);
  }

  /**
   * Get fields used for partitioning
   * @param partitionByExpressions expressions used for partitioning
   * @return expressions separated by a comma.
   */
  @VisibleForTesting
  protected String getPartitionByFields(List partitionByExpressions) {
    return getExpressionSQLStream(partitionByExpressions)
      .collect(Collectors.joining(COMMA));
  }

  /**
   * Get fields used for ordering.
   * @param orderByExpression expressions used for ordering
   * @return order by expressions separated by a comma.
   */
  @VisibleForTesting
  protected String getOrderByFields(List orderByExpression) {
    return orderByExpression
      .stream()
      .map(this::getOrderByField)
      .collect(Collectors.joining(COMMA));
  }

  /**
   * Buinds Order By expression based on a filter expression.
   * @param filterExpression supplied expression
   * @return Order by SQL expression
   */
  protected String getOrderByField(DeduplicateAggregationDefinition.FilterExpression filterExpression) {
    String exp = ((SQLExpression) filterExpression.getExpression()).extract();

    switch (filterExpression.getFilterFunction()) {
      case MIN:
        // MIN of a value means ORDER ASCENDING and selecting the first result.
        // ...[ORDER BY] exp ASC NULLS LAST
        return exp + SPACE + ORDER_ASC + SPACE + NULLS_LAST;
      case MAX:
        // MAX of a value means ORDER DESCENDING and selecting the first result.
        // ...[ORDER BY] exp DESC NULLS LAST
        return exp + SPACE + ORDER_DESC + SPACE + NULLS_LAST;
      case ANY_NULLS_FIRST:
        // ANY_NULLS_FIRST means order with null values first and pick the first.
        // ...[ORDER BY] IF(exp IS NULL , 0 , 1) ASC
        return IF_FUNCTION + OPEN_GROUP + exp + IS_NULL + COMMA + ZERO + COMMA + ONE + CLOSE_GROUP + SPACE + ORDER_ASC;
      case ANY_NULLS_LAST:
      default:
        // ANY_NULLS_LAST means order with null values first and pick the first.
        // ...[ORDER BY] IF(exp IS NULL , 0 , 1) DESC
        return IF_FUNCTION + OPEN_GROUP + exp + IS_NULL + COMMA + ZERO + COMMA + ONE + CLOSE_GROUP + SPACE + ORDER_DESC;
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy