All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.druid.indexing.firehose.IngestSegmentFirehoseFactory Maven / Gradle / Ivy

There is a newer version: 0.12.3
Show newest version
/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.indexing.firehose;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import io.druid.java.util.emitter.EmittingLogger;
import io.druid.data.input.Firehose;
import io.druid.data.input.FirehoseFactory;
import io.druid.data.input.impl.InputRowParser;
import io.druid.indexing.common.TaskToolbox;
import io.druid.indexing.common.actions.SegmentListUsedAction;
import io.druid.java.util.common.parsers.ParseException;
import io.druid.query.filter.DimFilter;
import io.druid.segment.IndexIO;
import io.druid.segment.QueryableIndexStorageAdapter;
import io.druid.segment.loading.SegmentLoadingException;
import io.druid.segment.realtime.firehose.IngestSegmentFirehose;
import io.druid.segment.realtime.firehose.WindowedStorageAdapter;
import io.druid.segment.transform.TransformSpec;
import io.druid.timeline.DataSegment;
import io.druid.timeline.TimelineObjectHolder;
import io.druid.timeline.VersionedIntervalTimeline;
import io.druid.timeline.partition.PartitionChunk;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class IngestSegmentFirehoseFactory implements FirehoseFactory
{
  private static final EmittingLogger log = new EmittingLogger(IngestSegmentFirehoseFactory.class);
  private final String dataSource;
  private final Interval interval;
  private final DimFilter dimFilter;
  private final List dimensions;
  private final List metrics;
  private final IndexIO indexIO;
  private TaskToolbox taskToolbox;

  @JsonCreator
  public IngestSegmentFirehoseFactory(
      @JsonProperty("dataSource") final String dataSource,
      @JsonProperty("interval") Interval interval,
      @JsonProperty("filter") DimFilter dimFilter,
      @JsonProperty("dimensions") List dimensions,
      @JsonProperty("metrics") List metrics,
      @JacksonInject IndexIO indexIO
  )
  {
    Preconditions.checkNotNull(dataSource, "dataSource");
    Preconditions.checkNotNull(interval, "interval");
    this.dataSource = dataSource;
    this.interval = interval;
    this.dimFilter = dimFilter;
    this.dimensions = dimensions;
    this.metrics = metrics;
    this.indexIO = Preconditions.checkNotNull(indexIO, "null IndexIO");
  }

  @JsonProperty
  public String getDataSource()
  {
    return dataSource;
  }

  @JsonProperty
  public Interval getInterval()
  {
    return interval;
  }

  @JsonProperty("filter")
  public DimFilter getDimensionsFilter()
  {
    return dimFilter;
  }

  @JsonProperty
  public List getDimensions()
  {
    return dimensions;
  }

  @JsonProperty
  public List getMetrics()
  {
    return metrics;
  }

  public void setTaskToolbox(TaskToolbox taskToolbox)
  {
    this.taskToolbox = taskToolbox;
  }

  @Override
  public Firehose connect(InputRowParser inputRowParser, File temporaryDirectory) throws IOException, ParseException
  {
    log.info("Connecting firehose: dataSource[%s], interval[%s]", dataSource, interval);

    Preconditions.checkNotNull(taskToolbox, "taskToolbox is not set");

    try {
      final List usedSegments = taskToolbox
          .getTaskActionClient()
          .submit(new SegmentListUsedAction(dataSource, interval, null));
      final Map segmentFileMap = taskToolbox.fetchSegments(usedSegments);
      final List> timeLineSegments = VersionedIntervalTimeline
          .forSegments(usedSegments)
          .lookup(interval);

      final List dims;
      if (dimensions != null) {
        dims = dimensions;
      } else if (inputRowParser.getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
        dims = inputRowParser.getParseSpec().getDimensionsSpec().getDimensionNames();
      } else {
        dims = getUniqueDimensions(
            timeLineSegments,
            inputRowParser.getParseSpec().getDimensionsSpec().getDimensionExclusions()
        );
      }

      final List metricsList = metrics == null ? getUniqueMetrics(timeLineSegments) : metrics;

      final List adapters = Lists.newArrayList(
          Iterables.concat(
              Iterables.transform(
                  timeLineSegments,
                  new Function, Iterable>()
                  {
                    @Override
                    public Iterable apply(final TimelineObjectHolder holder)
                    {
                      return
                          Iterables.transform(
                              holder.getObject(),
                              new Function, WindowedStorageAdapter>()
                              {
                                @Override
                                public WindowedStorageAdapter apply(final PartitionChunk input)
                                {
                                  final DataSegment segment = input.getObject();
                                  try {
                                    return new WindowedStorageAdapter(
                                        new QueryableIndexStorageAdapter(
                                            indexIO.loadIndex(
                                                Preconditions.checkNotNull(
                                                    segmentFileMap.get(segment),
                                                    "File for segment %s", segment.getIdentifier()
                                                )
                                            )
                                        ),
                                        holder.getInterval()
                                    );
                                  }
                                  catch (IOException e) {
                                    throw Throwables.propagate(e);
                                  }
                                }
                              }
                          );
                    }
                  }
              )
          )
      );

      final TransformSpec transformSpec = TransformSpec.fromInputRowParser(inputRowParser);
      return new IngestSegmentFirehose(adapters, transformSpec, dims, metricsList, dimFilter);
    }
    catch (IOException | SegmentLoadingException e) {
      throw Throwables.propagate(e);
    }
  }

  @VisibleForTesting
  static List getUniqueDimensions(
      List> timelineSegments,
      @Nullable Set excludeDimensions
  )
  {
    final BiMap uniqueDims = HashBiMap.create();

    // Here, we try to retain the order of dimensions as they were specified since the order of dimensions may be
    // optimized for performance.
    // Dimensions are extracted from the recent segments to olders because recent segments are likely to be queried more
    // frequently, and thus the performance should be optimized for recent ones rather than old ones.

    // timelineSegments are sorted in order of interval
    int index = 0;
    for (TimelineObjectHolder timelineHolder : Lists.reverse(timelineSegments)) {
      for (PartitionChunk chunk : timelineHolder.getObject()) {
        for (String dimension : chunk.getObject().getDimensions()) {
          if (!uniqueDims.containsKey(dimension) &&
              (excludeDimensions == null || !excludeDimensions.contains(dimension))) {
            uniqueDims.put(dimension, index++);
          }
        }
      }
    }

    final BiMap orderedDims = uniqueDims.inverse();
    return IntStream.range(0, orderedDims.size())
                    .mapToObj(orderedDims::get)
                    .collect(Collectors.toList());
  }

  @VisibleForTesting
  static List getUniqueMetrics(List> timelineSegments)
  {
    final BiMap uniqueMetrics = HashBiMap.create();

    // Here, we try to retain the order of metrics as they were specified. Metrics are extracted from the recent
    // segments to olders.

    // timelineSegments are sorted in order of interval
    int index = 0;
    for (TimelineObjectHolder timelineHolder : Lists.reverse(timelineSegments)) {
      for (PartitionChunk chunk : timelineHolder.getObject()) {
        for (String metric : chunk.getObject().getMetrics()) {
          if (!uniqueMetrics.containsKey(metric)) {
            uniqueMetrics.put(metric, index++);
          }
        }
      }
    }

    final BiMap orderedMetrics = uniqueMetrics.inverse();
    return IntStream.range(0, orderedMetrics.size())
        .mapToObj(orderedMetrics::get)
        .collect(Collectors.toList());
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy