/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.HoodieReaderConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.DeleteRecord;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieDeltaWriteStat;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieOperation;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.model.HoodiePayloadProps;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
import org.apache.hudi.common.model.IOType;
import org.apache.hudi.common.model.MetadataValues;
import org.apache.hudi.common.table.log.AppendResult;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.HoodieRecordUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.SizeEstimator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieAppendException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.collectColumnRangeMetadata;
/**
* IO Operation to append data onto an existing file.
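* <p>
* Incoming records are buffered in memory and flushed to the file group's log files as
* data blocks (or delete blocks for deletes) once the configured max log block size is reached.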
*/
public class HoodieAppendHandle<T, I, K, O> extends HoodieWriteHandle<T, I, K, O> {
private static final Logger LOG = LoggerFactory.getLogger(HoodieAppendHandle.class);
// This acts as the sequenceID for records written
private static final AtomicLong RECORD_COUNTER = new AtomicLong(1);
private static final int NUMBER_OF_RECORDS_TO_ESTIMATE_RECORD_SIZE = 100;
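// Whether record positions should be included when writing log blocks (driven by the write config)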
private final boolean shouldWriteRecordPositions;
// Buffer for holding records in memory before they are flushed to disk
private final List<HoodieRecord> recordList = new ArrayList<>();
// Buffer for holding records (to be deleted), along with their position in log block, in memory before they are flushed to disk
private final List<Pair<DeleteRecord, Long>> recordsToDeleteWithPositions = new ArrayList<>();
// Incoming records to be written to logs.
protected Iterator<HoodieRecord<T>> recordItr;
// Writer to log into the file group's latest slice.
protected Writer writer;
protected final List<WriteStatus> statuses;
// Total number of records written during appending
protected long recordsWritten = 0;
// Total number of records deleted during appending
protected long recordsDeleted = 0;
// Total number of records updated during appending
protected long updatedRecordsWritten = 0;
// Total number of new records inserted into the delta file
protected long insertRecordsWritten = 0;
// Average record size for a HoodieRecord. This size is updated at the end of every log block flushed to disk
private long averageRecordSize = 0;
// Flag used to initialize some metadata
private boolean doInit = true;
// Total number of bytes written during this append phase (an estimate)
protected long estimatedNumberOfBytesWritten;
// Number of records that must be written to meet the max block size for a log block
private long numberOfRecords = 0;
// Max block size to limit to for a log block
private final long maxBlockSize = config.getLogFileDataBlockMaxSize();
// Header metadata for a log block
protected final Map<HeaderMetadataType, String> header = new HashMap<>();
private final SizeEstimator<HoodieRecord> sizeEstimator;
// This is used to distinguish between a normal append and a log compaction append operation.
private boolean isLogCompaction = false;
// Use the writer schema for log compaction.
private boolean useWriterSchema = false;
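// Properties (populated from the write config) passed down to record payload/merge calls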
private final Properties recordProperties = new Properties();
/**
* This is used by log compaction only.
*/
public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr,
TaskContextSupplier taskContextSupplier, Map<HeaderMetadataType, String> header) {
this(config, instantTime, hoodieTable, partitionPath, fileId, recordItr, taskContextSupplier);
this.useWriterSchema = true;
this.isLogCompaction = true;
this.header.putAll(header);
}
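/**
* Creates an append handle that buffers the given records and appends them as log blocks
* to the file group identified by {@code fileId} under {@code partitionPath}.
*/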
public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr, TaskContextSupplier taskContextSupplier) {
super(config, instantTime, partitionPath, fileId, hoodieTable,
config.shouldWritePartialUpdates()
// When enabling writing partial updates to the data blocks in log files,
// i.e., partial update schema is set, the writer schema is the partial
// schema containing the updated fields only
? Option.of(new Schema.Parser().parse(config.getPartialUpdateSchema()))
: Option.empty(),
taskContextSupplier);
this.recordItr = recordItr;
this.sizeEstimator = new DefaultSizeEstimator();
this.statuses = new ArrayList<>();
this.recordProperties.putAll(config.getProps());
this.shouldWriteRecordPositions = config.shouldWriteRecordPositions();
}
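/**
* Convenience constructor for callers that do not supply a record iterator up front.
*/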
public HoodieAppendHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, TaskContextSupplier sparkTaskContextSupplier) {
this(config, instantTime, hoodieTable, partitionPath, fileId, null, sparkTaskContextSupplier);
}
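/**
* Lazily initializes the handle on the first record: resolves the latest file slice
* (base file and log files are needed when CDC is enabled), prepares the first
* {@link HoodieDeltaWriteStat}, saves the partition metadata if absent, and opens the log writer.
*/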
private void init(HoodieRecord record) {
if (!doInit) {
return;
}
String prevCommit = instantTime;
String baseFile = "";
List<String> logFiles = new ArrayList<>();
if (config.isCDCEnabled()) {
// The CDC reader needs the base file metadata to produce a deterministic update sequence.
TableFileSystemView.SliceView rtView = hoodieTable.getSliceView();
Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
if (fileSlice.isPresent()) {
prevCommit = fileSlice.get().getBaseInstantTime();
baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse("");
logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList());
}
}
// Prepare the first write status
HoodieDeltaWriteStat deltaWriteStat = new HoodieDeltaWriteStat();
writeStatus.setStat(deltaWriteStat);
writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath);
averageRecordSize = sizeEstimator.sizeEstimate(record);
deltaWriteStat.setPrevCommit(prevCommit);
deltaWriteStat.setPartitionPath(partitionPath);
deltaWriteStat.setFileId(fileId);
deltaWriteStat.setBaseFile(baseFile);
deltaWriteStat.setLogFiles(logFiles);
try {
// Save hoodie partition meta in the partition path
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(storage, instantTime,
new StoragePath(config.getBasePath()),
FSUtils.constructAbsolutePath(config.getBasePath(), partitionPath),
hoodieTable.getPartitionMetafileFormat());
partitionMetadata.trySave();
this.writer = createLogWriter(getFileInstant(record));
} catch (Exception e) {
LOG.error("Error in update task at commit " + instantTime, e);
writeStatus.setGlobalError(e);
throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit "
+ instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + "/" + partitionPath, e);
}
doInit = false;
}
/**
* Returns the instant time to use in the log file name.
*/
private String getFileInstant(HoodieRecord<?> record) {
if (config.isConsistentHashingEnabled()) {
// Handle log file only case. This is necessary for the concurrent clustering and writer case (e.g., consistent hashing bucket index).
// NOTE: the Flink engine uses instantTime to mark the operation type; see BaseFlinkCommitActionExecutor::execute
String taggedInstant = HoodieRecordUtils.getCurrentLocationInstant(record);
if (HoodieInstantTimeGenerator.isValidInstantTime(taggedInstant) && !instantTime.equals(taggedInstant)) {
// The tagged instant is the pending clustering instant; use it in the file name so that
// the dual-written file is shadowed from the reader view.
return taggedInstant;
}
}
return instantTime;
}
/**
* Returns whether the hoodie record is an UPDATE.
*/
protected boolean isUpdateRecord(HoodieRecord hoodieRecord) {
// If currentLocation is present, then this is an update
return hoodieRecord.getCurrentLocation() != null;
}
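/**
* Prepares an incoming record before it is buffered for the next log block, returning
* an empty Option when there is nothing to add to the data block (e.g., for a delete).
*/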
private Option<HoodieRecord> prepareRecord(HoodieRecord hoodieRecord) {
Option