org.apache.druid.indexing.input.GeneratorInputSource
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.input;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowListPlusRawValues;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.InputStats;
import org.apache.druid.data.input.MapBasedInputRow;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.data.input.impl.MapInputRowParser;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.guice.IndexingServiceInputSourceModule;
import org.apache.druid.java.util.common.CloseableIterators;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.segment.generator.DataGenerator;
import org.apache.druid.segment.generator.GeneratorBasicSchemas;
import org.apache.druid.segment.generator.GeneratorColumnSchema;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.File;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.stream.LongStream;
import java.util.stream.Stream;

/**
 * {@link InputSource} that can be used to seed a Druid cluster with test data, using either the built-in schemas
 * defined in {@link GeneratorBasicSchemas}, or by directly supplying a list of {@link GeneratorColumnSchema}, to
 * construct a {@link DataGenerator}. To produce a stable set of data, a fixed {@link #seed} may be supplied, which
 * is used for all data generated by the columns. When {@link #numSplits} is greater than 1, the {@link #seed} is
 * instead used to pick a new seed for each split, allowing each split to produce a different set of data, but
 * still in a stable manner.
 */
public class GeneratorInputSource extends AbstractInputSource implements SplittableInputSource<Long>
{
  private static final int DEFAULT_NUM_ROWS = 1000;
  private static final int DEFAULT_NUM_SPLITS = 1;
  private static final long DEFAULT_SEED = 1024L;
  private static final long DEFAULT_START_TIME = DateTimes.nowUtc().minusDays(1).getMillis();
  private static final int DEFAULT_CONSECUTIVE_TIMESTAMPS = 100;
  private static final double DEFAULT_TIMESTAMP_INCREMENT = 1.0;

  private final String schemaName;
  private final List<GeneratorColumnSchema> schema;
  private final int numRows;
  private final Integer numSplits;
  private final Long seed;
  private final Long startTime;
  private final Integer numConsecutiveTimestamps;
  private final Double timestampIncrement;
  
  @JsonCreator
  public GeneratorInputSource(
      @JsonProperty("schemaName") @Nullable String schemaName,
      @JsonProperty("schema") @Nullable List schema,
      @JsonProperty("numRows") Integer numRows,
      @JsonProperty("numSplits") Integer numSplits,
      @JsonProperty("seed") Long seed,
      @JsonProperty("startTime") Long startTime,
      @JsonProperty("numConsecutiveTimestamps") Integer numConsecutiveTimestamps,
      @JsonProperty("timestampIncrement") Double timestampIncrement
  )
  {
    Preconditions.checkArgument(
        schemaName != null || schema != null,
        "Must specify either 'schemaName' or 'schema'"
    );
    this.schemaName = schemaName;
    this.schema = schema != null
                         ? schema
                         : GeneratorBasicSchemas.SCHEMA_MAP.get(schemaName).getColumnSchemas();
    this.numRows = numRows != null ? numRows : DEFAULT_NUM_ROWS;
    this.numSplits = numSplits != null ? numSplits : DEFAULT_NUM_SPLITS;
    this.seed = seed != null ? seed : DEFAULT_SEED;
    this.startTime = startTime != null ? startTime : DEFAULT_START_TIME;
    this.numConsecutiveTimestamps = numConsecutiveTimestamps != null
                                    ? numConsecutiveTimestamps
                                    : DEFAULT_CONSECUTIVE_TIMESTAMPS;
    this.timestampIncrement = timestampIncrement != null ? timestampIncrement : DEFAULT_TIMESTAMP_INCREMENT;
  }

  @JsonIgnore
  @Nonnull
  @Override
  public Set<String> getTypes()
  {
    return Collections.singleton(IndexingServiceInputSourceModule.GENERATOR_SCHEME);
  }

  // Derive one pseudo-random seed per split from the top-level seed, so that each split
  // generates a distinct but reproducible set of rows.
  @Override
  public Stream<InputSplit<Long>> createSplits(
      InputFormat inputFormat,
      @Nullable SplitHintSpec splitHintSpec
  )
  {
    Random r = new Random(seed);
    return LongStream.range(0, numSplits).mapToObj(i -> new InputSplit<>(r.nextLong()));
  }

  @Override
  public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
  {
    return numSplits;
  }

  // A chosen split becomes a single-split source, seeded with the value drawn for it in createSplits().
  @Override
  public InputSource withSplit(InputSplit<Long> split)
  {
    return new GeneratorInputSource(
        schemaName,
        schema,
        numRows,
        1,
        split.get(),
        startTime,
        numConsecutiveTimestamps,
        timestampIncrement
    );
  }

  @Override
  public boolean needsFormat()
  {
    return false;
  }

  @Override
  protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory)
  {
    return new InputSourceReader()
    {
      @Override
      public CloseableIterator<InputRow> read(InputStats inputStats)
      {
        return CloseableIterators.withEmptyBaggage(new Iterator<InputRow>()
        {
          int rowCount = 0;
          private final DataGenerator generator = makeGenerator();

          @Override
          public boolean hasNext()
          {
            return rowCount < numRows;
          }

          @Override
          public InputRow next()
          {
            rowCount++;
            return MapInputRowParser.parse(
                inputRowSchema,
                generator.nextRaw(inputRowSchema.getTimestampSpec().getTimestampColumn())
            );
          }
        });
      }

      @Override
      public CloseableIterator<InputRowListPlusRawValues> sample()
      {
        return CloseableIterators.withEmptyBaggage(new Iterator<InputRowListPlusRawValues>()
        {
          int rowCount = 0;
          private final DataGenerator generator = makeGenerator();

          @Override
          public boolean hasNext()
          {
            return rowCount < numRows;
          }

          @Override
          public InputRowListPlusRawValues next()
          {
            rowCount++;
            InputRow row = generator.nextRow();
            return InputRowListPlusRawValues.of(row, ((MapBasedInputRow) row).getEvent());
          }
        });
      }
    };
  }

  @JsonProperty
  public String getSchemaName()
  {
    return schemaName;
  }

  // Serialize the explicit column schemas only when no named schema was supplied, keeping
  // JSON round-trips compact and unambiguous.
  @JsonProperty
  public List<GeneratorColumnSchema> getSchema()
  {
    return schemaName == null ? schema : null;
  }

  @JsonProperty
  public int getNumRows()
  {
    return numRows;
  }

  @JsonProperty
  public Integer getNumSplits()
  {
    return numSplits;
  }

  @JsonProperty
  public Long getSeed()
  {
    return seed;
  }

  @JsonProperty
  public Long getStartTime()
  {
    return startTime;
  }

  @JsonProperty
  public Integer getNumConsecutiveTimestamps()
  {
    return numConsecutiveTimestamps;
  }

  @JsonProperty
  public Double getTimestampIncrement()
  {
    return timestampIncrement;
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    GeneratorInputSource that = (GeneratorInputSource) o;
    return numRows == that.numRows &&
           Objects.equals(schemaName, that.schemaName) &&
           Objects.equals(schema, that.schema) &&
           Objects.equals(numSplits, that.numSplits) &&
           Objects.equals(seed, that.seed) &&
           Objects.equals(startTime, that.startTime) &&
           Objects.equals(numConsecutiveTimestamps, that.numConsecutiveTimestamps) &&
           Objects.equals(timestampIncrement, that.timestampIncrement);
  }

  @Override
  public int hashCode()
  {
    return Objects.hash(
        schemaName,
        schema,
        numRows,
        numSplits,
        seed,
        startTime,
        numConsecutiveTimestamps,
        timestampIncrement
    );
  }

  private DataGenerator makeGenerator()
  {
    return new DataGenerator(
        schema,
        seed,
        startTime,
        numConsecutiveTimestamps,
        timestampIncrement
    );
  }
}
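
For reference, here is a minimal, hypothetical usage sketch. It is not part of this file: it assumes that "basic" is one of the schema keys registered in GeneratorBasicSchemas.SCHEMA_MAP, and it relies only on the public InputSource API exercised above; the class name GeneratorInputSourceExample is illustrative.

// Hypothetical usage sketch (not part of the Druid source tree).
import org.apache.druid.data.input.ColumnsFilter;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.indexing.input.GeneratorInputSource;
import org.apache.druid.java.util.common.parsers.CloseableIterator;

import java.util.Collections;

public class GeneratorInputSourceExample
{
  public static void main(String[] args) throws Exception
  {
    // Ten rows from the built-in "basic" schema (assumed key), with a fixed seed for
    // reproducibility; null arguments fall back to the defaults set in the constructor.
    GeneratorInputSource source = new GeneratorInputSource(
        "basic", // schemaName
        null,    // schema (resolved from schemaName)
        10,      // numRows
        1,       // numSplits
        1024L,   // seed
        null,    // startTime
        null,    // numConsecutiveTimestamps
        null     // timestampIncrement
    );

    InputRowSchema rowSchema = new InputRowSchema(
        new TimestampSpec("__time", "auto", null),
        new DimensionsSpec(Collections.emptyList()),
        ColumnsFilter.all()
    );

    // needsFormat() returns false, so no InputFormat or temporary directory is required.
    try (CloseableIterator<InputRow> rows = source.reader(rowSchema, null, null).read(null)) {
      while (rows.hasNext()) {
        System.out.println(rows.next());
      }
    }
  }
}

Because the generator is seeded, repeated runs with the same arguments walk the same deterministic sequence of rows; changing only numSplits redistributes the seeding across splits as described in the class Javadoc.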



