// org.apache.hudi.common.engine.HoodieReaderContext (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.engine;
import org.apache.hudi.common.config.RecordMergeMode;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.table.read.HoodieFileGroupReaderSchemaHandler;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.function.UnaryOperator;
import static org.apache.hudi.common.model.HoodieRecord.RECORD_KEY_METADATA_FIELD;
/**
 * An abstract reader context class for {@code HoodieFileGroupReader} to use, containing APIs for
 * engine-specific implementation on reading data files, getting field values from a record,
 * transforming a record, etc.
 *
 * <p>For each query engine, this class should be extended and plugged into
 * {@code HoodieFileGroupReader} to realize the file group reading.
 *
 * @param <T> The type of engine-specific record representation, e.g., {@code InternalRow} in Spark
 *            and {@code RowData} in Flink.
 */
public abstract class HoodieReaderContext<T> {

  // These internal key names are only used in memory for record metadata and merging,
  // and should not be persisted to storage.
  public static final String INTERNAL_META_RECORD_KEY = "_0";
  public static final String INTERNAL_META_PARTITION_PATH = "_1";
  public static final String INTERNAL_META_ORDERING_FIELD = "_2";
  public static final String INTERNAL_META_OPERATION = "_3";
  public static final String INTERNAL_META_INSTANT_TIME = "_4";
  public static final String INTERNAL_META_SCHEMA_ID = "_5";

  // NOTE(review): the handler type is kept exactly as written because its declaration is not
  // visible here; if HoodieFileGroupReaderSchemaHandler is generic, parameterize it with <T>.
  private HoodieFileGroupReaderSchemaHandler schemaHandler = null;
  private String tablePath = null;
  private String latestCommitTime = null;
  private Option<HoodieRecordMerger> recordMerger = null;
  // The flags below are boxed and start as null so that "never set" is distinguishable from
  // false; their getters unbox the value and therefore fail if the flag was never set.
  private Boolean hasLogFiles = null;
  private Boolean hasBootstrapBaseFile = null;
  private Boolean needsBootstrapMerge = null;
  private Boolean shouldMergeUseRecordPosition = null;

  /**
   * @return the schema handler, or {@code null} if it has not been set.
   */
  public HoodieFileGroupReaderSchemaHandler getSchemaHandler() {
    return schemaHandler;
  }

  /**
   * @param schemaHandler the schema handler to use; it is also used by the metadata helpers of
   *                      this class to encode Avro schemas.
   */
  public void setSchemaHandler(HoodieFileGroupReaderSchemaHandler schemaHandler) {
    this.schemaHandler = schemaHandler;
  }

  /**
   * @return the table path.
   * @throws IllegalStateException if the table path has not been set.
   */
  public String getTablePath() {
    if (tablePath == null) {
      throw new IllegalStateException("Table path not set in reader context.");
    }
    return tablePath;
  }

  /**
   * @param tablePath the table path to store in this context.
   */
  public void setTablePath(String tablePath) {
    this.tablePath = tablePath;
  }

  /**
   * @return the latest commit time, or {@code null} if it has not been set.
   */
  public String getLatestCommitTime() {
    return latestCommitTime;
  }

  /**
   * @param latestCommitTime the latest commit time to store in this context.
   */
  public void setLatestCommitTime(String latestCommitTime) {
    this.latestCommitTime = latestCommitTime;
  }

  /**
   * @return the record merger option stored in this context, or {@code null} if it has not
   *         been set.
   */
  public Option<HoodieRecordMerger> getRecordMerger() {
    return recordMerger;
  }

  /**
   * @param recordMerger the record merger option to store in this context.
   */
  public void setRecordMerger(Option<HoodieRecordMerger> recordMerger) {
    this.recordMerger = recordMerger;
  }

  /**
   * @return the {@code hasLogFiles} flag.
   * @throws NullPointerException if the flag has not been set (a {@code null} value is unboxed).
   */
  public boolean getHasLogFiles() {
    return hasLogFiles;
  }

  /**
   * @param hasLogFiles the value of the {@code hasLogFiles} flag.
   */
  public void setHasLogFiles(boolean hasLogFiles) {
    this.hasLogFiles = hasLogFiles;
  }

  /**
   * @return the {@code hasBootstrapBaseFile} flag.
   * @throws NullPointerException if the flag has not been set (a {@code null} value is unboxed).
   */
  public boolean getHasBootstrapBaseFile() {
    return hasBootstrapBaseFile;
  }

  /**
   * @param hasBootstrapBaseFile the value of the {@code hasBootstrapBaseFile} flag.
   */
  public void setHasBootstrapBaseFile(boolean hasBootstrapBaseFile) {
    this.hasBootstrapBaseFile = hasBootstrapBaseFile;
  }

  /**
   * @return the {@code needsBootstrapMerge} flag.
   * @throws NullPointerException if the flag has not been set (a {@code null} value is unboxed).
   */
  public boolean getNeedsBootstrapMerge() {
    return needsBootstrapMerge;
  }

  /**
   * @param needsBootstrapMerge the value of the {@code needsBootstrapMerge} flag.
   */
  public void setNeedsBootstrapMerge(boolean needsBootstrapMerge) {
    this.needsBootstrapMerge = needsBootstrapMerge;
  }

  /**
   * @return the {@code shouldMergeUseRecordPosition} flag.
   * @throws NullPointerException if the flag has not been set (a {@code null} value is unboxed).
   */
  public boolean getShouldMergeUseRecordPosition() {
    return shouldMergeUseRecordPosition;
  }

  /**
   * @param shouldMergeUseRecordPosition the value of the {@code shouldMergeUseRecordPosition} flag.
   */
  public void setShouldMergeUseRecordPosition(boolean shouldMergeUseRecordPosition) {
    this.shouldMergeUseRecordPosition = shouldMergeUseRecordPosition;
  }

  /**
   * Gets the record iterator based on the type of engine-specific record representation from the
   * file.
   *
   * @param filePath       {@link StoragePath} instance of a file.
   * @param start          Starting byte to start reading.
   * @param length         Bytes to read.
   * @param dataSchema     Schema of records in the file in {@link Schema}.
   * @param requiredSchema Schema containing required fields to read in {@link Schema} for projection.
   * @param storage        {@link HoodieStorage} for reading records.
   * @return {@link ClosableIterator} that can return all records through iteration.
   * @throws IOException declared for implementations that fail while reading the file.
   */
  public abstract ClosableIterator<T> getFileRecordIterator(
      StoragePath filePath, long start, long length, Schema dataSchema, Schema requiredSchema,
      HoodieStorage storage) throws IOException;

  /**
   * Gets the record iterator based on the type of engine-specific record representation from the
   * file. This default implementation delegates to the {@link StoragePath} overload using
   * {@code storagePathInfo.getPath()}.
   *
   * @param storagePathInfo {@link StoragePathInfo} instance of a file.
   * @param start           Starting byte to start reading.
   * @param length          Bytes to read.
   * @param dataSchema      Schema of records in the file in {@link Schema}.
   * @param requiredSchema  Schema containing required fields to read in {@link Schema} for projection.
   * @param storage         {@link HoodieStorage} for reading records.
   * @return {@link ClosableIterator} that can return all records through iteration.
   * @throws IOException propagated from the {@link StoragePath} overload.
   */
  public ClosableIterator<T> getFileRecordIterator(
      StoragePathInfo storagePathInfo, long start, long length, Schema dataSchema, Schema requiredSchema,
      HoodieStorage storage) throws IOException {
    return getFileRecordIterator(storagePathInfo.getPath(), start, length, dataSchema, requiredSchema, storage);
  }

  /**
   * Converts an Avro record, e.g., serialized in the log files, to an engine-specific record.
   *
   * @param avroRecord The Avro record.
   * @return An engine-specific record in Type {@code T}.
   */
  public abstract T convertAvroRecord(IndexedRecord avroRecord);

  /**
   * Converts an engine-specific record to an Avro {@link GenericRecord}.
   *
   * @param record The record in engine-specific type.
   * @param schema The Avro schema to use for the conversion.
   * @return The Avro record.
   */
  public abstract GenericRecord convertToAvroRecord(T record, Schema schema);

  /**
   * @param mergeMode        record merge mode
   * @param mergeStrategyId  record merge strategy ID
   * @param mergeImplClasses custom implementation classes for record merging
   *
   * @return {@link HoodieRecordMerger} to use.
   */
  public abstract Option<HoodieRecordMerger> getRecordMerger(RecordMergeMode mergeMode, String mergeStrategyId, String mergeImplClasses);

  /**
   * Gets the field value.
   *
   * @param record    The record in engine-specific type.
   * @param schema    The Avro schema of the record.
   * @param fieldName The field name.
   * @return The field value.
   */
  public abstract Object getValue(T record, Schema schema, String fieldName);

  /**
   * Gets the record key in String, read from the {@code RECORD_KEY_METADATA_FIELD} field.
   *
   * @param record The record in engine-specific type.
   * @param schema The Avro schema of the record.
   * @return The record key in String.
   * @throws NullPointerException if the record key field value is {@code null}.
   */
  public String getRecordKey(T record, Schema schema) {
    Object val = getValue(record, schema, RECORD_KEY_METADATA_FIELD);
    return val.toString();
  }

  /**
   * Gets the ordering value in particular type.
   *
   * <p>If {@code metadataMap} already holds an entry for {@link #INTERNAL_META_ORDERING_FIELD},
   * that entry is returned as is. Otherwise, when either the record or the ordering field type is
   * absent, the default is returned without touching the map. In the remaining case the value is
   * read from the record, cast to the ordering field type (or replaced by the default when the
   * field value is {@code null}), stored in {@code metadataMap} and returned.
   *
   * @param recordOption         An option of record.
   * @param metadataMap          A map containing the record metadata; may be updated with the
   *                             resolved ordering value.
   * @param schema               The Avro schema of the record.
   * @param orderingFieldName    name of the ordering field
   * @param orderingFieldTypeOpt type of the ordering field
   * @param orderingFieldDefault default value for ordering
   * @return The ordering value.
   */
  public Comparable getOrderingValue(Option<T> recordOption,
                                     Map<String, Object> metadataMap,
                                     Schema schema,
                                     String orderingFieldName,
                                     Option<Schema.Type> orderingFieldTypeOpt,
                                     Comparable orderingFieldDefault) {
    if (metadataMap.containsKey(INTERNAL_META_ORDERING_FIELD)) {
      return (Comparable) metadataMap.get(INTERNAL_META_ORDERING_FIELD);
    }

    if (!recordOption.isPresent() || !orderingFieldTypeOpt.isPresent()) {
      return orderingFieldDefault;
    }

    Object value = getValue(recordOption.get(), schema, orderingFieldName);
    Comparable finalOrderingVal = value != null ? castValue((Comparable) value, orderingFieldTypeOpt.get()) : orderingFieldDefault;
    // Cache the resolved value so later lookups for the same record take the fast path above.
    metadataMap.put(INTERNAL_META_ORDERING_FIELD, finalOrderingVal);
    return finalOrderingVal;
  }

  /**
   * Constructs a new {@link HoodieRecord} based on the record of engine-specific type and metadata for merging.
   *
   * @param recordOption An option of the record in engine-specific type if exists.
   * @param metadataMap  The record metadata.
   * @return A new instance of {@link HoodieRecord}.
   */
  public abstract HoodieRecord<T> constructHoodieRecord(Option<T> recordOption,
                                                        Map<String, Object> metadataMap);

  /**
   * Seals the engine-specific record to make sure the data referenced in memory do not change.
   *
   * @param record The record.
   * @return The record containing the same data that do not change in memory over time.
   */
  public abstract T seal(T record);

  /**
   * Generates metadata map based on the information.
   *
   * @param recordKey         Record key in String.
   * @param partitionPath     Partition path in String.
   * @param orderingVal       Ordering value.
   * @param orderingFieldType Type of the ordering field; when present, {@code orderingVal} is
   *                          cast to this type before being stored, otherwise it is stored as is.
   * @return A mapping containing the metadata.
   */
  public Map<String, Object> generateMetadataForRecord(
      String recordKey, String partitionPath, Comparable orderingVal, Option<Schema.Type> orderingFieldType) {
    Map<String, Object> meta = new HashMap<>();
    meta.put(INTERNAL_META_RECORD_KEY, recordKey);
    meta.put(INTERNAL_META_PARTITION_PATH, partitionPath);
    meta.put(INTERNAL_META_ORDERING_FIELD, orderingFieldType.map(type -> castValue(orderingVal, type)).orElse(orderingVal));
    return meta;
  }

  /**
   * Generates metadata of the record. Only fetches record key that is necessary for merging,
   * plus the encoded schema produced by the schema handler, which must have been set beforehand.
   *
   * @param record The record.
   * @param schema The Avro schema of the record.
   * @return A mapping containing the metadata.
   */
  public Map<String, Object> generateMetadataForRecord(T record, Schema schema) {
    Map<String, Object> meta = new HashMap<>();
    meta.put(INTERNAL_META_RECORD_KEY, getRecordKey(record, schema));
    meta.put(INTERNAL_META_SCHEMA_ID, this.schemaHandler.encodeAvroSchema(schema));
    return meta;
  }

  /**
   * Updates the schema and reset the ordering value in existing metadata mapping of a record.
   * The map is modified in place; the schema handler must have been set beforehand.
   *
   * @param meta   Metadata in a mapping.
   * @param schema New schema to set.
   * @return The input metadata mapping.
   */
  public Map<String, Object> updateSchemaAndResetOrderingValInMetadata(Map<String, Object> meta,
                                                                       Schema schema) {
    meta.remove(INTERNAL_META_ORDERING_FIELD);
    meta.put(INTERNAL_META_SCHEMA_ID, this.schemaHandler.encodeAvroSchema(schema));
    return meta;
  }

  /**
   * Merge the skeleton file and data file iterators into a single iterator that will produce rows that contain all columns from the
   * skeleton file iterator, followed by all columns in the data file iterator
   *
   * @param skeletonFileIterator   iterator over bootstrap skeleton files that contain hudi metadata columns
   * @param skeletonRequiredSchema required schema of the records from the skeleton file iterator
   * @param dataFileIterator       iterator over data files that were bootstrapped into the hudi table
   * @param dataRequiredSchema     required schema of the records from the data file iterator
   * @return iterator that concatenates the skeletonFileIterator and dataFileIterator
   */
  public abstract ClosableIterator<T> mergeBootstrapReaders(ClosableIterator<T> skeletonFileIterator,
                                                            Schema skeletonRequiredSchema,
                                                            ClosableIterator<T> dataFileIterator,
                                                            Schema dataRequiredSchema);

  /**
   * Creates a function that will reorder records of schema "from" to schema of "to"
   * all fields in "to" must be in "from", but not all fields in "from" must be in "to"
   *
   * @param from           the schema of records to be passed into UnaryOperator
   * @param to             the schema of records produced by UnaryOperator
   * @param renamedColumns map of renamed columns where the key is the new name from the query and
   *                       the value is the old name that exists in the file
   * @return a function that takes in a record and returns the record with reordered columns
   */
  public abstract UnaryOperator<T> projectRecord(Schema from, Schema to, Map<String, String> renamedColumns);

  /**
   * Same as {@link #projectRecord(Schema, Schema, Map)} with no renamed columns.
   *
   * @param from the schema of records to be passed into UnaryOperator
   * @param to   the schema of records produced by UnaryOperator
   * @return a function that takes in a record and returns the record with reordered columns
   */
  public final UnaryOperator<T> projectRecord(Schema from, Schema to) {
    return projectRecord(from, to, Collections.emptyMap());
  }

  /**
   * Casts a comparable value to the given Avro type. The conversion rules are defined by the
   * engine-specific implementation.
   *
   * @param value   The value to cast.
   * @param newType The target Avro type.
   * @return The value after the cast.
   */
  public abstract Comparable castValue(Comparable value, Schema.Type newType);

  /**
   * Extracts the record position value from the record itself.
   *
   * <p>The position is only read from the record when {@link #supportsParquetRowIndex()} returns
   * {@code true}; otherwise {@code providedPositionIfNeeded} is returned unchanged.
   *
   * @param record                   The record in engine-specific type.
   * @param schema                   The Avro schema of the record.
   * @param fieldName                The name of the field holding the position.
   * @param providedPositionIfNeeded The position to fall back to when it is not read from the record.
   * @return the record position in the base file.
   * @throws IllegalStateException if the position should be read from the record but the field
   *                               value is {@code null}.
   */
  public long extractRecordPosition(T record, Schema schema, String fieldName, long providedPositionIfNeeded) {
    if (supportsParquetRowIndex()) {
      Object position = getValue(record, schema, fieldName);
      if (position != null) {
        return (long) position;
      } else {
        throw new IllegalStateException("Record position extraction failed");
      }
    }
    return providedPositionIfNeeded;
  }

  /**
   * @return whether record positions can be read from the record itself, see
   *         {@link #extractRecordPosition(Object, Schema, String, long)}. Returns {@code false}
   *         unless overridden by a subclass.
   */
  public boolean supportsParquetRowIndex() {
    return false;
  }
}