/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.indexing;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import org.apache.druid.common.utils.IdUtils;
import org.apache.druid.data.input.impl.AggregateProjectionSpec;
import org.apache.druid.data.input.impl.DimensionSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.data.input.impl.ParseSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.error.DruidException;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.apache.druid.segment.transform.TransformSpec;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;


/**
 * Specifies the schema of data ingested into a datasource: the datasource name, the timestamp and dimensions of
 * input rows, the metrics (aggregators), the segment and query granularity, any input-row transforms, and optional
 * pre-aggregated projections. The deprecated "parser" map is retained so that old task specs, which bundled the
 * timestamp and dimensions specs inside the parser, can still be deserialized.
 */
public class DataSchema
{
  private static final Logger log = new Logger(DataSchema.class);

  public static Builder builder()
  {
    return new Builder();
  }

  public static Builder builder(DataSchema schema)
  {
    return new Builder(schema);
  }
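
  /*
   * A minimal builder usage sketch. The values are hypothetical, and StringDimensionSchema,
   * CountAggregatorFactory, and Granularities are Druid classes not imported by this file:
   *
   *   DataSchema schema = DataSchema.builder()
   *       .withDataSource("wikipedia")
   *       .withTimestamp(new TimestampSpec("time", "iso", null))
   *       .withDimensions(new StringDimensionSchema("channel"), new StringDimensionSchema("page"))
   *       .withAggregators(new CountAggregatorFactory("count"))
   *       .withGranularity(new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null))
   *       .build();
   */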

  private final String dataSource;
  private final AggregatorFactory[] aggregators;
  private final GranularitySpec granularitySpec;
  private final TransformSpec transformSpec;
  private final Map<String, Object> parserMap;
  private final ObjectMapper objectMapper;

  // The fields below can be initialized lazily from the parser, for backward compatibility.
  private TimestampSpec timestampSpec;
  private DimensionsSpec dimensionsSpec;

  // This is used for backward compatibility
  private InputRowParser inputRowParser;
  private List<AggregateProjectionSpec> projections;

  @JsonCreator
  public DataSchema(
      @JsonProperty("dataSource") String dataSource,
      @JsonProperty("timestampSpec") @Nullable TimestampSpec timestampSpec, // can be null in old task spec
      @JsonProperty("dimensionsSpec") @Nullable DimensionsSpec dimensionsSpec, // can be null in old task spec
      @JsonProperty("metricsSpec") AggregatorFactory[] aggregators,
      @JsonProperty("granularitySpec") GranularitySpec granularitySpec,
      @JsonProperty("transformSpec") TransformSpec transformSpec,
      @JsonProperty("projections") @Nullable List projections,
      @Deprecated @JsonProperty("parser") @Nullable Map parserMap,
      @JacksonInject ObjectMapper objectMapper
  )
  {
    validateDatasourceName(dataSource);
    this.dataSource = dataSource;

    this.timestampSpec = timestampSpec;
    this.aggregators = aggregators == null ? new AggregatorFactory[]{} : aggregators;
    this.dimensionsSpec = dimensionsSpec == null
                          ? null
                          : computeDimensionsSpec(
                              Preconditions.checkNotNull(timestampSpec, "timestampSpec"),
                              dimensionsSpec,
                              this.aggregators
                          );

    if (granularitySpec == null) {
      log.warn("No granularitySpec has been specified. Using UniformGranularitySpec as default.");
      this.granularitySpec = new UniformGranularitySpec(null, null, null);
    } else {
      this.granularitySpec = granularitySpec;
    }
    this.transformSpec = transformSpec == null ? TransformSpec.NONE : transformSpec;
    this.projections = projections;
    this.parserMap = parserMap;
    this.objectMapper = objectMapper;

    // Fail-fast if there are output name collisions. Note: because of the pull-from-parser magic in getDimensionsSpec,
    // this validation is not necessarily going to be able to catch everything. It will run again in getDimensionsSpec.
    computeAndValidateOutputFieldNames(this.dimensionsSpec, this.aggregators);

    if (this.granularitySpec.isRollup() && this.aggregators.length == 0) {
      log.warn(
          "Rollup is enabled for dataSource [%s] but no metricsSpec has been provided. "
          + "Are you sure this is what you want?",
          dataSource
      );
    }
  }
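
  /*
   * For illustration, a "dataSchema" JSON block like the following binds to the constructor above. The field
   * values are hypothetical; the shapes follow the standard Druid ingestion spec:
   *
   *   "dataSchema": {
   *     "dataSource": "wikipedia",
   *     "timestampSpec": {"column": "time", "format": "iso"},
   *     "dimensionsSpec": {"dimensions": ["channel", "page"]},
   *     "metricsSpec": [{"type": "count", "name": "count"}],
   *     "granularitySpec": {"type": "uniform", "segmentGranularity": "day", "queryGranularity": "none"}
   *   }
   */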

  private static void validateDatasourceName(String dataSource)
  {
    IdUtils.validateId("dataSource", dataSource);
  }

  /**
   * Computes the {@link DimensionsSpec} that we will actually use. It is derived from, but not necessarily identical
   * to, the one that we were given.
   */
  private static DimensionsSpec computeDimensionsSpec(
      final TimestampSpec timestampSpec,
      final DimensionsSpec dimensionsSpec,
      final AggregatorFactory[] aggregators
  )
  {
    final Set<String> inputFieldNames = computeInputFieldNames(timestampSpec, dimensionsSpec, aggregators);
    final Set<String> outputFieldNames = computeAndValidateOutputFieldNames(dimensionsSpec, aggregators);

    // Set up additional exclusions: all inputs and outputs, minus defined dimensions.
    final Set<String> additionalDimensionExclusions = new HashSet<>();
    additionalDimensionExclusions.addAll(inputFieldNames);
    additionalDimensionExclusions.addAll(outputFieldNames);
    additionalDimensionExclusions.removeAll(dimensionsSpec.getDimensionNames());

    return dimensionsSpec.withDimensionExclusions(additionalDimensionExclusions);
  }
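
  /*
   * A hypothetical worked example: given timestamp column "t", declared dimensions ["country"], and an
   * aggregator named "revenueSum" reading input field "revenue", the inputs are {t, country, revenue} and the
   * outputs are {__time, country, revenueSum}; removing the declared dimensions leaves additional exclusions
   * {t, revenue, revenueSum, __time}, so schema-discovered dimensions won't re-ingest the timestamp, aggregator
   * inputs, or metric names.
   */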

  private static Set<String> computeInputFieldNames(
      final TimestampSpec timestampSpec,
      final DimensionsSpec dimensionsSpec,
      final AggregatorFactory[] aggregators
  )
  {
    final Set<String> fields = new HashSet<>();

    fields.add(timestampSpec.getTimestampColumn());
    fields.addAll(dimensionsSpec.getDimensionNames());
    Arrays.stream(aggregators)
          .flatMap(aggregator -> aggregator.requiredFields().stream())
          .forEach(fields::add);

    return fields;
  }

  /**
   * Computes the set of field names that are specified by the provided dimensions and aggregator lists.
   *
   * If either list is null, it is ignored.
   *
   * @throws IllegalArgumentException if there are duplicate field names, or if any dimension or aggregator
   *                                  has a null name
   */
  private static Set<String> computeAndValidateOutputFieldNames(
      @Nullable final DimensionsSpec dimensionsSpec,
      @Nullable final AggregatorFactory[] aggregators
  )
  {
    // Field name -> where it was seen
    final Map<String, Multiset<String>> fields = new TreeMap<>();

    fields.computeIfAbsent(ColumnHolder.TIME_COLUMN_NAME, k -> TreeMultiset.create()).add(
        StringUtils.format(
            "primary timestamp (%s cannot appear elsewhere except as long-typed dimension)",
            ColumnHolder.TIME_COLUMN_NAME
        )
    );

    if (dimensionsSpec != null) {
      boolean sawTimeDimension = false;

      for (int i = 0; i < dimensionsSpec.getDimensions().size(); i++) {
        final DimensionSchema dimSchema = dimensionsSpec.getDimensions().get(i);
        final String field = dimSchema.getName();
        if (Strings.isNullOrEmpty(field)) {
          throw DruidException
              .forPersona(DruidException.Persona.USER)
              .ofCategory(DruidException.Category.INVALID_INPUT)
              .build("Encountered dimension with null or empty name at position[%d]", i);
        }

        if (ColumnHolder.TIME_COLUMN_NAME.equals(field)) {
          if (i > 0 && dimensionsSpec.isForceSegmentSortByTime()) {
            throw DruidException
                .forPersona(DruidException.Persona.USER)
                .ofCategory(DruidException.Category.INVALID_INPUT)
                .build(
                    "Encountered dimension[%s] at position[%d]. This is only supported when the dimensionsSpec "
                    + "parameter[%s] is set to[false]. %s",
                    field,
                    i,
                    DimensionsSpec.PARAMETER_FORCE_TIME_SORT,
                    DimensionsSpec.WARNING_NON_TIME_SORT_ORDER
                );
          } else if (!dimSchema.getColumnType().is(ValueType.LONG)) {
            throw DruidException
                .forPersona(DruidException.Persona.USER)
                .ofCategory(DruidException.Category.INVALID_INPUT)
                .build(
                    "Encountered dimension[%s] with incorrect type[%s]. Type must be 'long'.",
                    field,
                    dimSchema.getColumnType()
                );
          } else if (!sawTimeDimension) {
            // Skip adding __time to "fields" (once) if it's listed as a dimension, so it doesn't show up as an error.
            sawTimeDimension = true;
            continue;
          }
        }

        fields.computeIfAbsent(field, k -> TreeMultiset.create()).add("dimensions list");
      }
    }

    if (aggregators != null) {
      for (int i = 0; i < aggregators.length; i++) {
        final String field = aggregators[i].getName();
        if (Strings.isNullOrEmpty(field)) {
          throw new IAE("Encountered metric with null or empty name at position %d", i);
        }

        fields.computeIfAbsent(field, k -> TreeMultiset.create()).add("metricsSpec list");
      }
    }

    final List<String> errors = new ArrayList<>();

    for (Map.Entry<String, Multiset<String>> fieldEntry : fields.entrySet()) {
      if (fieldEntry.getValue().entrySet().stream().mapToInt(Multiset.Entry::getCount).sum() > 1) {
        errors.add(
            StringUtils.format(
                "[%s] seen in %s",
                fieldEntry.getKey(),
                fieldEntry.getValue().entrySet().stream().map(
                    entry ->
                        StringUtils.format(
                            "%s%s",
                            entry.getElement(),
                            entry.getCount() == 1 ? "" : StringUtils.format(
                                " (%d occurrences)",
                                entry.getCount()
                            )
                        )
                ).collect(Collectors.joining(", "))
            )
        );
      }
    }

    if (errors.isEmpty()) {
      return fields.keySet();
    } else {
      throw DruidException.forPersona(DruidException.Persona.USER)
                          .ofCategory(DruidException.Category.INVALID_INPUT)
                          .build("Cannot specify a column more than once: %s", String.join("; ", errors));
    }
  }
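
  /*
   * For example, a schema declaring "count" both as a dimension and as a metric fails the validation above with:
   *   Cannot specify a column more than once: [count] seen in dimensions list, metricsSpec list
   */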

  @JsonProperty
  public String getDataSource()
  {
    return dataSource;
  }

  @Nullable
  @JsonProperty("timestampSpec")
  private TimestampSpec getGivenTimestampSpec()
  {
    return timestampSpec;
  }

  public TimestampSpec getTimestampSpec()
  {
    if (timestampSpec == null) {
      timestampSpec = Preconditions.checkNotNull(getParser(), "inputRowParser").getParseSpec().getTimestampSpec();
    }
    return timestampSpec;
  }

  @Nullable
  @JsonProperty("dimensionsSpec")
  private DimensionsSpec getGivenDimensionsSpec()
  {
    return dimensionsSpec;
  }

  public DimensionsSpec getDimensionsSpec()
  {
    if (dimensionsSpec == null) {
      dimensionsSpec = computeDimensionsSpec(
          getTimestampSpec(),
          Preconditions.checkNotNull(getParser(), "inputRowParser").getParseSpec().getDimensionsSpec(),
          aggregators
      );
    }
    return dimensionsSpec;
  }

  @JsonProperty("metricsSpec")
  public AggregatorFactory[] getAggregators()
  {
    return aggregators;
  }

  @JsonProperty
  public GranularitySpec getGranularitySpec()
  {
    return granularitySpec;
  }

  @JsonProperty
  public TransformSpec getTransformSpec()
  {
    return transformSpec;
  }

  @JsonProperty
  @JsonInclude(JsonInclude.Include.NON_NULL)
  public List<AggregateProjectionSpec> getProjections()
  {
    return projections;
  }

  @Deprecated
  @JsonProperty("parser")
  @Nullable
  @JsonInclude(Include.NON_NULL)
  public Map<String, Object> getParserMap()
  {
    return parserMap;
  }

  @Nullable
  public InputRowParser getParser()
  {
    if (inputRowParser == null) {
      if (parserMap == null) {
        return null;
      }
      //noinspection unchecked
      inputRowParser = transformSpec.decorate(objectMapper.convertValue(this.parserMap, InputRowParser.class));
      ParseSpec parseSpec = inputRowParser.getParseSpec();
      parseSpec = parseSpec.withDimensionsSpec(
          computeDimensionsSpec(parseSpec.getTimestampSpec(), parseSpec.getDimensionsSpec(), aggregators)
      );
      if (timestampSpec != null) {
        parseSpec = parseSpec.withTimestampSpec(timestampSpec);
      }
      if (dimensionsSpec != null) {
        parseSpec = parseSpec.withDimensionsSpec(dimensionsSpec);
      }
      inputRowParser = inputRowParser.withParseSpec(parseSpec);
    }
    return inputRowParser;
  }
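
  /*
   * For illustration, a legacy "parser" map like the following (hypothetical values, using the standard legacy
   * string-parser shape) would be converted to an InputRowParser above, then re-decorated with any explicitly
   * given timestampSpec/dimensionsSpec:
   *
   *   "parser": {
   *     "type": "string",
   *     "parseSpec": {
   *       "format": "json",
   *       "timestampSpec": {"column": "time", "format": "iso"},
   *       "dimensionsSpec": {"dimensions": ["channel", "page"]}
   *     }
   *   }
   */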

  public DataSchema withGranularitySpec(GranularitySpec granularitySpec)
  {
    return builder(this).withGranularity(granularitySpec).build();
  }

  public DataSchema withTransformSpec(TransformSpec transformSpec)
  {
    return builder(this).withTransform(transformSpec).build();
  }

  public DataSchema withDimensionsSpec(DimensionsSpec dimensionsSpec)
  {
    return builder(this).withDimensions(dimensionsSpec).build();
  }

  @Override
  public String toString()
  {
    return "DataSchema{" +
           "dataSource='" + dataSource + '\'' +
           ", aggregators=" + Arrays.toString(aggregators) +
           ", granularitySpec=" + granularitySpec +
           ", transformSpec=" + transformSpec +
           ", parserMap=" + parserMap +
           ", timestampSpec=" + timestampSpec +
           ", dimensionsSpec=" + dimensionsSpec +
           ", projections=" + projections +
           ", inputRowParser=" + inputRowParser +
           '}';
  }

  public static class Builder
  {
    private String dataSource;
    private AggregatorFactory[] aggregators;
    private GranularitySpec granularitySpec;
    private TransformSpec transformSpec;
    private Map<String, Object> parserMap;
    private ObjectMapper objectMapper;
    private TimestampSpec timestampSpec;
    private DimensionsSpec dimensionsSpec;
    private List<AggregateProjectionSpec> projections;

    public Builder()
    {

    }

    public Builder(DataSchema schema)
    {
      this.dataSource = schema.dataSource;
      this.timestampSpec = schema.timestampSpec;
      this.dimensionsSpec = schema.dimensionsSpec;
      this.transformSpec = schema.transformSpec;
      this.aggregators = schema.aggregators;
      this.projections = schema.projections;
      this.granularitySpec = schema.granularitySpec;
      this.parserMap = schema.parserMap;
      this.objectMapper = schema.objectMapper;
    }

    public Builder withDataSource(String dataSource)
    {
      this.dataSource = dataSource;
      return this;
    }

    public Builder withTimestamp(TimestampSpec timestampSpec)
    {
      this.timestampSpec = timestampSpec;
      return this;
    }

    public Builder withDimensions(DimensionsSpec dimensionsSpec)
    {
      this.dimensionsSpec = dimensionsSpec;
      return this;
    }

    public Builder withDimensions(List<DimensionSchema> dimensions)
    {
      this.dimensionsSpec = DimensionsSpec.builder().setDimensions(dimensions).build();
      return this;
    }

    public Builder withDimensions(DimensionSchema... dimensions)
    {
      return withDimensions(Arrays.asList(dimensions));
    }

    public Builder withAggregators(AggregatorFactory... aggregators)
    {
      this.aggregators = aggregators;
      return this;
    }

    public Builder withGranularity(GranularitySpec granularitySpec)
    {
      this.granularitySpec = granularitySpec;
      return this;
    }

    public Builder withTransform(TransformSpec transformSpec)
    {
      this.transformSpec = transformSpec;
      return this;
    }

    public Builder withProjections(List<AggregateProjectionSpec> projections)
    {
      this.projections = projections;
      return this;
    }

    @Deprecated
    public Builder withObjectMapper(ObjectMapper objectMapper)
    {
      this.objectMapper = objectMapper;
      return this;
    }

    @Deprecated
    public Builder withParserMap(Map<String, Object> parserMap)
    {
      this.parserMap = parserMap;
      return this;
    }

    public DataSchema build()
    {
      return new DataSchema(
          dataSource,
          timestampSpec,
          dimensionsSpec,
          aggregators,
          granularitySpec,
          transformSpec,
          projections,
          parserMap,
          objectMapper
      );
    }
  }
}