org.apache.druid.indexer.path.DatasourcePathSpec Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.indexer.path;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.druid.indexer.hadoop.DatasourceIngestionSpec;
import org.apache.druid.indexer.hadoop.DatasourceInputFormat;
import org.apache.druid.indexer.hadoop.WindowedDataSegment;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class DatasourcePathSpec implements PathSpec
{
private static final Logger logger = new Logger(DatasourcePathSpec.class);
public static final String TYPE = "dataSource";
private final DatasourceIngestionSpec ingestionSpec;
private final long maxSplitSize;
private final List segments;
/*
Note: User would set this flag when they are doing pure re-indexing and would like to have a different
set of aggregators than the ones used during original indexing.
Default behavior is to expect same aggregators as used in original data ingestion job to support delta-ingestion
use case.
*/
private final boolean useNewAggs;
private static final String USE_NEW_AGGS_KEY = "useNewAggs";
@JsonCreator
public DatasourcePathSpec(
@JsonProperty("segments") List segments,
@JsonProperty("ingestionSpec") DatasourceIngestionSpec spec,
@JsonProperty("maxSplitSize") Long maxSplitSize,
@JsonProperty(USE_NEW_AGGS_KEY) boolean useNewAggs
)
{
this.segments = segments;
this.ingestionSpec = Preconditions.checkNotNull(spec, "null ingestionSpec");
if (maxSplitSize == null) {
this.maxSplitSize = 0;
} else {
this.maxSplitSize = maxSplitSize.longValue();
}
this.useNewAggs = useNewAggs;
}
@JsonProperty
public boolean isUseNewAggs()
{
return useNewAggs;
}
@JsonProperty
public List getSegments()
{
return segments;
}
@JsonProperty
public DatasourceIngestionSpec getIngestionSpec()
{
return ingestionSpec;
}
@JsonProperty
public long getMaxSplitSize()
{
return maxSplitSize;
}
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
{
if (segments == null || segments.isEmpty()) {
if (ingestionSpec.isIgnoreWhenNoSegments()) {
logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
return job;
} else {
throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
}
}
logger.info(
"Found total [%d] segments for [%s] in interval [%s]",
segments.size(),
ingestionSpec.getDataSource(),
ingestionSpec.getIntervals()
);
DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
if (updatedIngestionSpec.getDimensions() == null) {
List dims;
if (config.getParser().getParseSpec().getDimensionsSpec().hasFixedDimensions()) {
dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
} else {
Set dimSet = Sets.newHashSet(
Iterables.concat(
Iterables.transform(
segments,
new Function>()
{
@Override
public Iterable apply(WindowedDataSegment dataSegment)
{
return dataSegment.getSegment().getDimensions();
}
}
)
)
);
dims = Lists.newArrayList(
Sets.difference(
dimSet,
config.getParser()
.getParseSpec()
.getDimensionsSpec()
.getDimensionExclusions()
)
);
}
updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
}
if (updatedIngestionSpec.getMetrics() == null) {
Set metrics = new HashSet<>();
final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
if (cols != null) {
if (useNewAggs) {
for (AggregatorFactory col : cols) {
metrics.addAll(col.requiredFields());
}
} else {
for (AggregatorFactory col : cols) {
metrics.add(col.getName());
}
}
}
updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
}
updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());
// propagate in the transformSpec from the overall job config
updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(
config.getSchema().getDataSchema().getTransformSpec()
);
DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
return job;
}
public static boolean checkIfReindexingAndIsUseAggEnabled(Map configuredPathSpec)
{
return TYPE.equals(configuredPathSpec.get("type")) && Boolean.parseBoolean(configuredPathSpec.getOrDefault(
USE_NEW_AGGS_KEY,
false
).toString());
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DatasourcePathSpec that = (DatasourcePathSpec) o;
if (maxSplitSize != that.maxSplitSize) {
return false;
}
if (!ingestionSpec.equals(that.ingestionSpec)) {
return false;
}
return !(segments != null ? !segments.equals(that.segments) : that.segments != null);
}
@Override
public int hashCode()
{
int result = ingestionSpec.hashCode();
result = 31 * result + (int) (maxSplitSize ^ (maxSplitSize >>> 32));
result = 31 * result + (segments != null ? segments.hashCode() : 0);
return result;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy