
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hudi.common.table.read;

import org.apache.hudi.common.engine.HoodieReaderContext;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;

import org.apache.avro.Schema;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchemaDedupNested;
import static org.apache.hudi.avro.AvroSchemaUtils.createNewSchemaFromFieldsWithReference;
import static org.apache.hudi.avro.AvroSchemaUtils.findNestedField;
/**
 * This class is responsible for handling the schema for the file group reader.
 */
public class HoodieFileGroupReaderSchemaHandler<T> {

  protected final Schema dataSchema;

  // requestedSchema: the schema that the caller requests
  protected final Schema requestedSchema;

  // requiredSchema: the requestedSchema with any additional columns required for merging etc
  protected final Schema requiredSchema;

  protected final InternalSchema internalSchema;

  protected final HoodieTableConfig hoodieTableConfig;

  protected final HoodieReaderContext<T> readerContext;

  protected final HoodieRecordMerger recordMerger;

  protected final boolean hasBootstrapBaseFile;
  protected boolean needsBootstrapMerge;

  protected final boolean needsMORMerge;
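
  /**
   * Builds the schema handler, eagerly deriving the required schema (the requested schema plus
   * any fields the record merger needs) and pruning the evolved internal schema down to it.
   *
   * <p>A minimal usage sketch; the reader context, schemas, and table config are assumed to be
   * engine-specific values obtained elsewhere:
   * <pre>{@code
   * HoodieFileGroupReaderSchemaHandler<T> handler = new HoodieFileGroupReaderSchemaHandler<>(
   *     readerContext, dataSchema, requestedSchema, internalSchemaOpt, tableConfig);
   * Schema requiredSchema = handler.getRequiredSchema();
   * // if merging added columns, project the merged records back to the requested schema
   * Option<UnaryOperator<T>> converter = handler.getOutputConverter();
   * }</pre>
   */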
  public HoodieFileGroupReaderSchemaHandler(HoodieReaderContext<T> readerContext,
                                            Schema dataSchema,
                                            Schema requestedSchema,
                                            Option<InternalSchema> internalSchemaOpt,
                                            HoodieTableConfig hoodieTableConfig) {
    this.readerContext = readerContext;
    this.hasBootstrapBaseFile = readerContext.getHasBootstrapBaseFile();
    this.needsMORMerge = readerContext.getHasLogFiles();
    this.recordMerger = readerContext.getRecordMerger();
    this.dataSchema = dataSchema;
    this.requestedSchema = requestedSchema;
    this.hoodieTableConfig = hoodieTableConfig;
    this.requiredSchema = prepareRequiredSchema();
    this.internalSchema = pruneInternalSchema(requiredSchema, internalSchemaOpt);
    readerContext.setNeedsBootstrapMerge(this.needsBootstrapMerge);
  }
  public Schema getDataSchema() {
    return this.dataSchema;
  }

  public Schema getRequestedSchema() {
    return this.requestedSchema;
  }

  public Schema getRequiredSchema() {
    return this.requiredSchema;
  }

  public InternalSchema getInternalSchema() {
    return this.internalSchema;
  }
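
  /**
   * Returns a function that projects records from the required schema back to the requested
   * schema, or an empty {@link Option} when the two schemas already match and no projection
   * is needed.
   */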
  public Option<UnaryOperator<T>> getOutputConverter() {
    if (!requestedSchema.equals(requiredSchema)) {
      return Option.of(readerContext.projectRecord(requiredSchema, requestedSchema));
    }
    return Option.empty();
  }
  private static InternalSchema pruneInternalSchema(Schema requiredSchema, Option<InternalSchema> internalSchemaOption) {
    if (!internalSchemaOption.isPresent()) {
      return InternalSchema.getEmptyInternalSchema();
    }
    InternalSchema notPruned = internalSchemaOption.get();
    if (notPruned == null || notPruned.isEmptySchema()) {
      return InternalSchema.getEmptyInternalSchema();
    }
    return AvroInternalSchemaConverter.pruneAvroSchemaToInternalSchema(requiredSchema, notPruned);
  }
  private Schema generateRequiredSchema() {
    // this may need to change if query types other than MOR also have mandatory fields
    if (!needsMORMerge) {
      return requestedSchema;
    }

    List<Schema.Field> addedFields = new ArrayList<>();
    for (String field : recordMerger.getMandatoryFieldsForMerging(hoodieTableConfig)) {
      if (!findNestedField(requestedSchema, field).isPresent()) {
        Option<Schema.Field> foundFieldOpt = findNestedField(dataSchema, field);
        if (!foundFieldOpt.isPresent()) {
          throw new IllegalArgumentException("Field: " + field + " does not exist in the table schema");
        }
        Schema.Field foundField = foundFieldOpt.get();
        addedFields.add(foundField);
      }
    }

    if (addedFields.isEmpty()) {
      return requestedSchema;
    }

    return appendFieldsToSchemaDedupNested(requestedSchema, addedFields);
  }
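
  /**
   * Finalizes the required schema. For bootstrap reads that need both meta and data columns,
   * the fields are reordered so that all meta columns precede the data columns, matching how
   * the two halves of a bootstrap file group are stitched back together.
   */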
  protected Schema prepareRequiredSchema() {
    Schema preReorderRequiredSchema = generateRequiredSchema();
    Pair<List<Schema.Field>, List<Schema.Field>> requiredFields = getDataAndMetaCols(preReorderRequiredSchema);
    this.needsBootstrapMerge = hasBootstrapBaseFile && !requiredFields.getLeft().isEmpty() && !requiredFields.getRight().isEmpty();
    return needsBootstrapMerge
        ? createSchemaFromFields(Stream.concat(requiredFields.getLeft().stream(), requiredFields.getRight().stream()).collect(Collectors.toList()))
        : preReorderRequiredSchema;
  }
  public Pair<List<Schema.Field>, List<Schema.Field>> getBootstrapRequiredFields() {
    return getDataAndMetaCols(requiredSchema);
  }
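
  /**
   * Returns the full data schema's fields split into Hudi meta columns (left) and data
   * columns (right).
   */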
  public Pair<List<Schema.Field>, List<Schema.Field>> getBootstrapDataFields() {
    return getDataAndMetaCols(dataSchema);
  }
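
  /**
   * Partitions a schema's fields into Hudi meta columns (left) and data columns (right),
   * skipping the temporary row-index column used for position-based merging.
   */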
  private static Pair<List<Schema.Field>, List<Schema.Field>> getDataAndMetaCols(Schema schema) {
    Map<Boolean, List<Schema.Field>> fieldsByMeta = schema.getFields().stream()
        // filter out the temporary row-index column so it is never mistaken for a data column
        .filter(f -> !Objects.equals(f.name(), HoodiePositionBasedFileGroupRecordBuffer.ROW_INDEX_TEMPORARY_COLUMN_NAME))
        .collect(Collectors.partitioningBy(f -> HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION.contains(f.name())));
    return Pair.of(fieldsByMeta.getOrDefault(true, Collections.emptyList()),
        fieldsByMeta.getOrDefault(false, Collections.emptyList()));
  }
  public Schema createSchemaFromFields(List<Schema.Field> fields) {
    // the fields carry positions from their source schema; Avro's setFields rejects fields that
    // are already attached to a schema, so recreate each one without a position
    for (int i = 0; i < fields.size(); i++) {
      Schema.Field curr = fields.get(i);
      fields.set(i, new Schema.Field(curr.name(), curr.schema(), curr.doc(), curr.defaultVal()));
    }
    return createNewSchemaFromFieldsWithReference(dataSchema, fields);
  }
}