/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.segment.indexing;

import org.apache.druid.data.input.impl.DimensionSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.java.util.common.parsers.JSONPathFieldSpec;
import org.apache.druid.java.util.common.parsers.JSONPathFieldType;
import org.apache.druid.java.util.common.parsers.JSONPathSpec;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.filter.DimFilter;
import org.apache.druid.segment.transform.Transform;
import org.apache.druid.segment.transform.TransformSpec;

import javax.annotation.Nullable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
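
/**
 * Utility for determining which input columns actually have to be read during ingestion.
 * {@link #getColumnsRequiredForIngestion} inspects the timestampSpec, dimensionsSpec, transformSpec,
 * metricsSpec (aggregators), and optional flattenSpec, and returns the subset of the input schema
 * those specs reference, falling back to the full input schema whenever the set of required columns
 * cannot be narrowed down (for example schemaless ingestion, wildcards, or deep-scan JSON paths).
 */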
public class ReaderUtils
{
  private static final Logger LOG = new Logger(ReaderUtils.class);
  private static final Pattern JSON_PATH_PATTERN = Pattern.compile("\\[(.*?)]");
  private static final Pattern BRACKET_NOTATED_CHILD_PATTERN = Pattern.compile("'(.*?)'");
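
  /**
   * Computes the set of input columns that must be read to satisfy the given ingestion specs.
   *
   * @param fullInputSchema all column names available in the input data
   * @param timestampSpec   the timestamp spec; its timestamp column is always required
   * @param dimensionsSpec  the dimensions spec; explicitly declared dimensions are required
   * @param transformSpec   the transform spec; columns used by its transforms and filter are required
   * @param aggregators     the metrics spec; columns used by the aggregators are required
   * @param flattenSpec     the optional flatten spec, parsed to find the top-level columns its paths read
   * @return the required columns restricted to {@code fullInputSchema}, or {@code fullInputSchema}
   *         itself when the requirement cannot be narrowed down
   */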
  public static Set<String> getColumnsRequiredForIngestion(
      Set<String> fullInputSchema,
      TimestampSpec timestampSpec,
      DimensionsSpec dimensionsSpec,
      TransformSpec transformSpec,
      AggregatorFactory[] aggregators,
      @Nullable JSONPathSpec flattenSpec
  )
  {
    Set<String> fieldsRequired = new HashSet<>();

    // We always need to read the timestamp column for the Druid timestamp field
    fieldsRequired.add(timestampSpec.getTimestampColumn());

    // Find the columns we need to read for the flattenSpec
    if (flattenSpec != null) {
      if (dimensionsSpec.getDimensions().isEmpty() && flattenSpec.isUseFieldDiscovery()) {
        // Schemaless ingestion with useFieldDiscovery needs to read all columns
        return fullInputSchema;
      }

      // Parse the columns needed from the flattenSpec
      for (JSONPathFieldSpec fields : flattenSpec.getFields()) {
        if (fields.getType() == JSONPathFieldType.ROOT) {
          // A ROOT field just reads a top-level column, using the expr as the key
          fieldsRequired.add(fields.getExpr());
        } else if (fields.getType() == JSONPathFieldType.PATH) {
          // Parse a PATH field to determine the columns it needs
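          // (the compiled path is expected to be bracket-notated, e.g. "$['product']['price']";
          // only its first segment can name a top-level input column)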
          String parsedPath;
          try {
            parsedPath = JSONPathFieldSpec.getCompilePath(fields.getExpr());
          }
          catch (Exception e) {
            // We can skip columns used in this path as the path is invalid
            LOG.debug("Ignoring columns from JSON path [%s] as path expression is invalid", fields.getExpr());
            continue;
          }
          // Remove the $
          parsedPath = parsedPath.substring(1);
          // If the first level is a deep scan, then we need all columns
          if (parsedPath.length() >= 2 && "..".equals(parsedPath.substring(0, 2))) {
            return fullInputSchema;
          }
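          // Grab the first bracketed segment of the compiled path, e.g. ['region'], [0], or [*]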
          Matcher jsonPathMatcher = JSON_PATH_PATTERN.matcher(parsedPath);
          if (!jsonPathMatcher.find()) {
            LOG.warn("Failed to parse JSON path for required column from path [%s]", fields.getExpr());
            return fullInputSchema;
          }
          String matchedGroup = jsonPathMatcher.group();
          Matcher childMatcher = BRACKET_NOTATED_CHILD_PATTERN.matcher(matchedGroup);
          if (childMatcher.find()) {
            // Get the name of the column from the bracket-notated child, i.e. ['region']
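            // (a single bracket group may list several quoted children, e.g. ['country','region'],
            // so collect every quoted name it contains)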
            childMatcher.reset();
            while (childMatcher.find()) {
              String columnName = childMatcher.group();
              // Remove the quotes around the column name
              fieldsRequired.add(columnName.substring(1, columnName.length() - 1));
            }
          } else if ("[*]".equals(matchedGroup)) {
            // If the first level is a wildcard, then we need all columns
            return fullInputSchema;
          } else {
            // This can happen if it is a filter expression, slice operator, or index / indexes.
            // We just return all columns...
            return fullInputSchema;
          }
        } else {
          // Other types aren't supported, but return the full schema just in case...
          LOG.warn("Got unexpected JSONPathFieldType [%s]", fields.getType());
          return fullInputSchema;
        }
      }

      // If useFieldDiscovery is false then we have already determined all the columns we need to read
      // (as only explicitly specified fields will be available to use in the other specs)
      if (!flattenSpec.isUseFieldDiscovery()) {
        fieldsRequired.retainAll(fullInputSchema);
        return fieldsRequired;
      }
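      // Otherwise field discovery is enabled, so the other specs may also reference discovered
      // columns; keep collecting their required columns below.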
    } else {
      // Without a flattenSpec, useFieldDiscovery defaults to true, so schemaless ingestion needs to
      // read all columns
      if (dimensionsSpec.getDimensions().isEmpty()) {
        return fullInputSchema;
      }
    }

    // Determine any fields we need to read from the input file that are used in the transforms of the transformSpec
    List<Transform> transforms = transformSpec.getTransforms();
    for (Transform transform : transforms) {
      fieldsRequired.addAll(transform.getRequiredColumns());
    }

    // Determine any fields we need to read from the input file that are used in the filter of the transformSpec
    DimFilter filter = transformSpec.getFilter();
    if (filter != null) {
      fieldsRequired.addAll(filter.getRequiredColumns());
    }

    // Determine any fields we need to read from the input file that are used in the dimensionsSpec
    List<DimensionSchema> dimensionSchema = dimensionsSpec.getDimensions();
    for (DimensionSchema dim : dimensionSchema) {
      fieldsRequired.add(dim.getName());
    }

    // Determine any fields we need to read from the input file that are used in the metricsSpec
    for (AggregatorFactory agg : aggregators) {
      fieldsRequired.addAll(agg.requiredFields());
    }

    // Only keep required fields that actually exist in the input schema
    fieldsRequired.retainAll(fullInputSchema);
    return fieldsRequired;
  }
}
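
/*
 * Illustrative usage (a rough sketch, not part of the original file; the spec constructors and the
 * DoubleSumAggregatorFactory used below are assumptions and may differ across Druid versions):
 *
 *   Set<String> inputColumns = ImmutableSet.of("time", "country", "price", "unused");
 *   Set<String> required = ReaderUtils.getColumnsRequiredForIngestion(
 *       inputColumns,
 *       new TimestampSpec("time", "auto", null),
 *       new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("country"))),
 *       TransformSpec.NONE,
 *       new AggregatorFactory[]{new DoubleSumAggregatorFactory("priceSum", "price")},
 *       null
 *   );
 *   // With no flattenSpec and an explicit dimension list, this would return
 *   // {"time", "country", "price"} and leave "unused" out.
 */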