/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.source;

import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.cdc.HoodieCDCExtractor;
import org.apache.hudi.common.table.cdc.HoodieCDCFileSplit;
import org.apache.hudi.common.table.cdc.HoodieCDCUtils;
import org.apache.hudi.common.table.log.InstantRange;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.sink.partitioner.profile.WriteProfiles;
import org.apache.hudi.source.prune.PartitionPruners;
import org.apache.hudi.table.format.cdc.CdcInputSplit;
import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;
import org.apache.hudi.util.ClusteringUtil;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.fs.FileStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS;

/**
 * Utilities to generate incremental input splits {@link MergeOnReadInputSplit}.
 * The input splits are used for streaming and incremental read.
 *
 * <p>How to generate the input splits:
 * <ol>
 *   <li>first fetch all the commit metadata for the incremental instants;</li>
 *   <li>resolve the incremental commit file paths;</li>
 *   <li>filter the full file paths by required partitions;</li>
 *   <li>use the file paths from step 3 as the back-up of the filesystem view.</li>
 * </ol>
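 *
 * <p>A minimal usage sketch; the {@code conf}, {@code rowType} and {@code metaClient} references
 * and all option values below are illustrative placeholders, not defaults:
 * <pre>{@code
 * IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
 *     .conf(conf)                                  // Flink read configuration
 *     .path(new Path("/tmp/hudi_table"))           // table base path
 *     .rowType(rowType)                            // table row type
 *     .maxCompactionMemoryInBytes(100 * 1024 * 1024L)
 *     .skipCompaction(false)
 *     .skipClustering(false)
 *     .build();
 * // batch (non-streaming) incremental read with cdc disabled
 * IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, false);
 * if (!result.isEmpty()) {
 *   List<MergeOnReadInputSplit> splits = result.getInputSplits();
 * }
 * }</pre>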
 */
public class IncrementalInputSplits implements Serializable {

  private static final long serialVersionUID = 1L;

  private static final Logger LOG = LoggerFactory.getLogger(IncrementalInputSplits.class);

  private final Configuration conf;
  private final Path path;
  private final RowType rowType;
  private final long maxCompactionMemoryInBytes;
  // for partition pruning
  private final PartitionPruners.PartitionPruner partitionPruner;
  // skip compaction
  private final boolean skipCompaction;
  // skip clustering
  private final boolean skipClustering;

  private IncrementalInputSplits(
      Configuration conf,
      Path path,
      RowType rowType,
      long maxCompactionMemoryInBytes,
      @Nullable PartitionPruners.PartitionPruner partitionPruner,
      boolean skipCompaction,
      boolean skipClustering) {
    this.conf = conf;
    this.path = path;
    this.rowType = rowType;
    this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
    this.partitionPruner = partitionPruner;
    this.skipCompaction = skipCompaction;
    this.skipClustering = skipClustering;
  }

  /**
   * Returns the builder.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Returns the incremental input splits.
   *
   * @param metaClient The meta client
   * @param cdcEnabled Whether cdc is enabled
   *
   * @return The list of incremental input splits or empty if there are no new instants
   */
  public Result inputSplits(
      HoodieTableMetaClient metaClient,
      boolean cdcEnabled) {
    HoodieTimeline commitTimeline = getReadTimeline(metaClient);
    if (commitTimeline.empty()) {
      LOG.warn("No splits found for the table under path " + path);
      return Result.EMPTY;
    }
    final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
    final String endCommit = this.conf.getString(FlinkOptions.READ_END_COMMIT);
    final boolean startFromEarliest = FlinkOptions.START_COMMIT_EARLIEST.equalsIgnoreCase(startCommit);
    final boolean startOutOfRange = startCommit != null && commitTimeline.isBeforeTimelineStarts(startCommit);
    final boolean endOutOfRange = endCommit != null && commitTimeline.isBeforeTimelineStarts(endCommit);
    // We better add another premise: whether the endCommit is cleaned.
    boolean fullTableScan = startFromEarliest || startOutOfRange || endOutOfRange;
    List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, null);

    // Step1: generates the instant range
    // if the specified end commit is archived, still uses the specified timestamp,
    // else uses the latest filtered instant time
    // (would be the latest instant time if the specified end commit is greater than the latest instant time)
    final String rangeEnd = endOutOfRange || instants.isEmpty() ? endCommit : instants.get(instants.size() - 1).getTimestamp();
    // keep the same semantics with streaming read, default start from the latest commit
    final String rangeStart = startFromEarliest ? null : (startCommit == null ? rangeEnd : startCommit);
    final InstantRange instantRange;
    if (!fullTableScan) {
      instantRange = InstantRange.builder().startInstant(rangeStart).endInstant(rangeEnd)
          .rangeType(InstantRange.RangeType.CLOSE_CLOSE).nullableBoundary(cdcEnabled).build();
    } else if (startFromEarliest && endCommit == null) {
      // short-cut for snapshot read
      instantRange = null;
    } else {
      instantRange = InstantRange.builder().startInstant(rangeStart).endInstant(rangeEnd)
          .rangeType(InstantRange.RangeType.CLOSE_CLOSE).nullableBoundary(true).build();
    }

    // Step2: decides the read end commit
    final String endInstant = endOutOfRange || endCommit == null
        ? commitTimeline.lastInstant().get().getTimestamp() : rangeEnd;
    // Step3: find out the files to read, tries to read the files from the commit metadata first,
    // fallback to full table scan if any of the following conditions matches:
    //   1. there are files in the metadata that have been deleted;
    //   2. read from earliest;
    //   3. the start commit is archived;
    //   4. the end commit is archived.
    Set<String> readPartitions;
    final FileStatus[] fileStatuses;
    if (fullTableScan) {
      // scans the partitions and files directly.
      FileIndex fileIndex = getFileIndex();
      readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths());
      if (readPartitions.size() == 0) {
        LOG.warn("No partitions found for reading in user provided path.");
        return Result.EMPTY;
      }
      fileStatuses = fileIndex.getFilesInPartitions();
    } else {
      if (instants.size() == 0) {
        LOG.info("No new instant found for the table under path " + path + ", skip reading");
        return Result.EMPTY;
      }
      if (cdcEnabled) {
        // case1: cdc change log enabled
        List<MergeOnReadInputSplit> inputSplits = getCdcInputSplits(metaClient, instantRange);
        return Result.instance(inputSplits, endInstant);
      }
      // case2: normal incremental read
      String tableName = conf.getString(FlinkOptions.TABLE_NAME);
      List<HoodieCommitMetadata> metadataList = instants.stream()
          .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList());
      readPartitions = getReadPartitions(metadataList);
      if (readPartitions.size() == 0) {
        LOG.warn("No partitions found for reading in user provided path.");
        return Result.EMPTY;
      }
      FileStatus[] files = WriteProfiles.getFilesFromMetadata(path, metaClient.getHadoopConf(), metadataList, metaClient.getTableType(), false);
      if (files == null) {
        LOG.warn("Found deleted files in metadata, fall back to full table scan.");
        // fallback to full table scan
        // reading from the earliest, scans the partitions and files directly.
        FileIndex fileIndex = getFileIndex();
        readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths());
        if (readPartitions.size() == 0) {
          LOG.warn("No partitions found for reading in user provided path.");
          return Result.EMPTY;
        }
        fileStatuses = fileIndex.getFilesInPartitions();
      } else {
        fileStatuses = files;
      }
    }

    if (fileStatuses.length == 0) {
      LOG.warn("No files found for reading in user provided path.");
      return Result.EMPTY;
    }

    List<MergeOnReadInputSplit> inputSplits = getInputSplits(metaClient, commitTimeline,
        fileStatuses, readPartitions, endInstant, instantRange, false);

    return Result.instance(inputSplits, endInstant);
  }

  /**
   * Returns the incremental input splits.
   *
   * @param metaClient    The meta client
   * @param issuedInstant The last issued instant, only valid in streaming read
   * @param issuedOffset  The last issued offset, only valid in streaming read
   * @param cdcEnabled    Whether cdc is enabled
   *
   * @return The list of incremental input splits or empty if there are no new instants
   */
  public Result inputSplits(
      HoodieTableMetaClient metaClient,
      @Nullable String issuedInstant,
      @Nullable String issuedOffset,
      boolean cdcEnabled) {
    metaClient.reloadActiveTimeline();
    HoodieTimeline commitTimeline = getReadTimeline(metaClient);
    if (commitTimeline.empty()) {
      LOG.warn("No splits found for the table under path " + path);
      return Result.EMPTY;
    }

    // Assumes a timeline:
    // c1.inflight, c2(issued instant), c3, c4
    // -> c1, c2(issued instant), c3, c4
    // c1, c3 and c4 are the candidate instants,
    // we call c1 a 'hollow' instant which has a lower version number but a greater completion time,
    // filtering the timeline using just c2 could cause data loss,
    // check these hollow instants first.
    Result hollowSplits = getHollowInputSplits(metaClient, metaClient.getHadoopConf(), issuedInstant, issuedOffset, commitTimeline, cdcEnabled);

    List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, issuedInstant);
    // get the latest instant that satisfies the condition
    final String endInstant = instants.size() == 0 ? null : instants.get(instants.size() - 1).getTimestamp();
    final InstantRange instantRange;
    if (endInstant != null) {
      // when cdc is enabled, returns an instant range with nullable boundary
      // to filter the reading instants on the timeline
      instantRange = getInstantRange(issuedInstant, endInstant, cdcEnabled);
    } else if (hollowSplits.isEmpty()) {
      LOG.info("No new instant found for the table under path " + path + ", skip reading");
      return Result.EMPTY;
    } else {
      return hollowSplits;
    }

    // version number should be monotonically increasing
    // fetch the instant offset by completion time
    String offsetToIssue = instants.stream().map(HoodieInstant::getStateTransitionTime).max(String::compareTo).orElse(endInstant);

    if (instantRange == null) {
      // reading from the earliest, scans the partitions and files directly.
      FileIndex fileIndex = getFileIndex();
      Set<String> readPartitions = new TreeSet<>(fileIndex.getOrBuildPartitionPaths());
      if (readPartitions.size() == 0) {
        LOG.warn("No partitions found for reading under path: " + path);
        return Result.EMPTY;
      }
      FileStatus[] fileStatuses = fileIndex.getFilesInPartitions();
      if (fileStatuses.length == 0) {
        LOG.warn("No files found for reading under path: " + path);
        return Result.EMPTY;
      }
      List<MergeOnReadInputSplit> inputSplits = getInputSplits(metaClient, commitTimeline,
          fileStatuses, readPartitions, endInstant, null, false);
      return Result.instance(inputSplits, endInstant, offsetToIssue);
    } else {
      List<MergeOnReadInputSplit> inputSplits = getIncInputSplits(metaClient, metaClient.getHadoopConf(),
          commitTimeline, instants, instantRange, endInstant, cdcEnabled);
      return Result.instance(mergeList(hollowSplits.getInputSplits(), inputSplits), endInstant, offsetToIssue);
    }
  }

  /**
   * Returns the input splits for streaming incremental read.
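   *
   * @param metaClient     The meta client
   * @param hadoopConf     The hadoop configuration
   * @param commitTimeline The commit timeline
   * @param instants       The instants to read
   * @param instantRange   The instant range for filtering the file slices
   * @param endInstant     The end instant to consume to
   * @param cdcEnabled     Whether cdc is enabled
   *
   * @return the list of input splits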
   */
  private List<MergeOnReadInputSplit> getIncInputSplits(
      HoodieTableMetaClient metaClient,
      org.apache.hadoop.conf.Configuration hadoopConf,
      HoodieTimeline commitTimeline,
      List<HoodieInstant> instants,
      InstantRange instantRange,
      String endInstant,
      boolean cdcEnabled) {
    // streaming read
    if (cdcEnabled) {
      // case1: cdc change log enabled
      return getCdcInputSplits(metaClient, instantRange);
    }
    // case2: normal streaming read
    String tableName = conf.getString(FlinkOptions.TABLE_NAME);
    List<HoodieCommitMetadata> activeMetadataList = instants.stream()
        .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList());
    List<HoodieCommitMetadata> archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName);
    if (archivedMetadataList.size() > 0) {
      LOG.warn("\n"
          + "--------------------------------------------------------------------------------\n"
          + "---------- caution: the reader has fallen behind too much from the writer,\n"
          + "---------- tweak 'read.tasks' option to add parallelism of read tasks.\n"
          + "--------------------------------------------------------------------------------");
    }
    // IMPORTANT: the merged metadata list must be in ascending order by instant time
    List<HoodieCommitMetadata> metadataList = mergeList(archivedMetadataList, activeMetadataList);

    Set<String> readPartitions = getReadPartitions(metadataList);
    if (readPartitions.size() == 0) {
      LOG.warn("No partitions found for reading under path: " + path);
      return Collections.emptyList();
    }
    FileStatus[] fileStatuses = WriteProfiles.getFilesFromMetadata(path, hadoopConf, metadataList, metaClient.getTableType());

    if (fileStatuses.length == 0) {
      LOG.warn("No files found for reading under path: " + path);
      return Collections.emptyList();
    }

    return getInputSplits(metaClient, commitTimeline,
        fileStatuses, readPartitions, endInstant, instantRange, skipCompaction);
  }

  /**
   * Returns the input splits for the 'hollow' instants.
   */
  private Result getHollowInputSplits(
      HoodieTableMetaClient metaClient,
      org.apache.hadoop.conf.Configuration hadoopConf,
      @Nullable String issuedInstant,
      @Nullable String issuedOffset,
      HoodieTimeline commitTimeline,
      boolean cdcEnabled) {
    if (issuedInstant == null || issuedOffset == null) {
      return Result.EMPTY;
    }
    // find the write commit instants that finish later than the issued instant
    // while having a smaller txn start time.
    List<HoodieInstant> instants = commitTimeline.getInstantsAsStream()
        .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN, issuedInstant))
        .filter(s -> HoodieTimeline.compareTimestamps(s.getStateTransitionTime(), GREATER_THAN, issuedOffset))
        .filter(s -> StreamerUtil.isWriteCommit(metaClient.getTableType(), s, commitTimeline)).collect(Collectors.toList());
    if (instants.isEmpty()) {
      return Result.EMPTY;
    }
    String offsetToIssue = instants.stream().map(HoodieInstant::getStateTransitionTime).max(String::compareTo).orElse(issuedOffset);
    List<MergeOnReadInputSplit> inputSplits = instants.stream().map(instant -> {
      String instantTs = instant.getTimestamp();
      // Assumes we consume from the timeline:
      // c0, c1.inflight, c2(issued instant), c3, c4
      // -> c0, c1, c2(issued instant), c3, c4
      // c1, c3 and c4 are the candidate instants,
      // the c4 data file could include overlapping records from c2,
      // use the (c2, c4] instant range for c3 and c4,
      // the c1 data file could include overlapping records from c0,
      // use the [c1, c1] instant range for c1.
      InstantRange instantRange = InstantRange.builder()
          .startInstant(instantTs)
          .endInstant(instantTs)
          .nullableBoundary(cdcEnabled)
          .rangeType(InstantRange.RangeType.CLOSE_CLOSE).build();
      return getIncInputSplits(metaClient, hadoopConf, commitTimeline, Collections.singletonList(instant), instantRange, instantTs, cdcEnabled);
    }).flatMap(Collection::stream).collect(Collectors.toList());
    return Result.instance(inputSplits, issuedInstant, offsetToIssue);
  }

  @Nullable
  private InstantRange getInstantRange(String issuedInstant, String instantToIssue, boolean nullableBoundary) {
    if (issuedInstant != null) {
      // the streaming reader may record the last issued instant, if the issued instant is present,
      // the instant range should be: (issued instant, the latest instant].
      return InstantRange.builder().startInstant(issuedInstant).endInstant(instantToIssue)
          .nullableBoundary(nullableBoundary).rangeType(InstantRange.RangeType.OPEN_CLOSE).build();
    } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) {
      // first time consuming and with a start commit
      final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
      return startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)
          ? null
          : InstantRange.builder().startInstant(startCommit).endInstant(instantToIssue)
              .nullableBoundary(nullableBoundary).rangeType(InstantRange.RangeType.CLOSE_CLOSE).build();
    } else {
      // first time consuming and no start commit, consumes the latest incremental data set.
      return InstantRange.builder().startInstant(instantToIssue).endInstant(instantToIssue)
          .nullableBoundary(nullableBoundary).rangeType(InstantRange.RangeType.CLOSE_CLOSE).build();
    }
  }

  private List<MergeOnReadInputSplit> getInputSplits(
      HoodieTableMetaClient metaClient,
      HoodieTimeline commitTimeline,
      FileStatus[] fileStatuses,
      Set<String> readPartitions,
      String endInstant,
      InstantRange instantRange,
      boolean skipBaseFiles) {
    final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses);
    final AtomicInteger cnt = new AtomicInteger(0);
    final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
    return readPartitions.stream()
        .map(relPartitionPath -> getFileSlices(fsView, relPartitionPath, endInstant, skipBaseFiles)
            .map(fileSlice -> {
              Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                  .sorted(HoodieLogFile.getLogFileComparator())
                  .map(logFile -> logFile.getPath().toString())
                  .filter(logPath -> !logPath.endsWith(HoodieCDCUtils.CDC_LOGFILE_SUFFIX))
                  .collect(Collectors.toList()));
              String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
              return new MergeOnReadInputSplit(cnt.getAndAdd(1),
                  basePath, logPaths, endInstant,
                  metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange, fileSlice.getFileId());
            }).collect(Collectors.toList()))
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
  }

  private List<MergeOnReadInputSplit> getCdcInputSplits(
      HoodieTableMetaClient metaClient,
      InstantRange instantRange) {
    HoodieCDCExtractor extractor = new HoodieCDCExtractor(metaClient, instantRange);
    Map<HoodieFileGroupId, List<HoodieCDCFileSplit>> fileSplits = extractor.extractCDCFileSplits();

    if (fileSplits.isEmpty()) {
      LOG.warn("No change logs found for reading in path: " + path);
      return Collections.emptyList();
    }

    final AtomicInteger cnt = new AtomicInteger(0);
    return fileSplits.entrySet().stream()
        .map(splits ->
            new CdcInputSplit(cnt.getAndAdd(1), metaClient.getBasePath(), maxCompactionMemoryInBytes,
                splits.getKey().getFileId(), splits.getValue().stream().sorted().toArray(HoodieCDCFileSplit[]::new)))
        .collect(Collectors.toList());
  }
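  /**
   * Returns the file slices in the given partition with instant time before or on the given end instant,
   * only the merged log file slices are returned when the base files are skipped.
   */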
  private static Stream<FileSlice> getFileSlices(
      HoodieTableFileSystemView fsView,
      String relPartitionPath,
      String endInstant,
      boolean skipBaseFiles) {
    return skipBaseFiles ? fsView.getAllLogsMergedFileSliceBeforeOrOn(relPartitionPath, endInstant)
        : fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant);
  }

  private FileIndex getFileIndex() {
    return FileIndex.builder()
        .path(new org.apache.hadoop.fs.Path(path.toUri()))
        .conf(conf)
        .rowType(rowType)
        .partitionPruner(partitionPruner)
        .build();
  }

  /**
   * Returns the partitions to read with the given metadata list.
   * The partitions would be filtered by the pushed down required partitions.
   *
   * @param metadataList The metadata list
   * @return the set of read partitions
   */
  private Set<String> getReadPartitions(List<HoodieCommitMetadata> metadataList) {
    Set<String> partitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);
    // apply partition push down
    if (this.partitionPruner != null) {
      Set<String> selectedPartitions = this.partitionPruner.filter(partitions);
      double total = partitions.size();
      double selectedNum = selectedPartitions.size();
      double percentPruned = total == 0 ? 0 : (1 - selectedNum / total) * 100;
      LOG.info("Selected " + selectedNum + " partitions out of " + total
          + ", pruned " + percentPruned + "% partitions.");
      return selectedPartitions;
    }
    return partitions;
  }

  /**
   * Returns the archived metadata in case the reader consumption is untimely or it wants
   * to read from the earliest.
   *
   * <p>Note: should improve it with the metadata table when the metadata table is stable enough.
   *
   * @param metaClient     The meta client
   * @param instantRange   The instant range to filter the timeline instants
   * @param commitTimeline The commit timeline
   * @param tableName      The table name
   * @return the list of archived metadata, or empty if there is no need to read the archived timeline
   */
  private List<HoodieCommitMetadata> getArchivedMetadata(
      HoodieTableMetaClient metaClient,
      InstantRange instantRange,
      HoodieTimeline commitTimeline,
      String tableName) {
    if (commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) {
      // read the archived metadata if the start instant is archived.
      HoodieTimeline archivedTimeline = getArchivedReadTimeline(metaClient, instantRange.getStartInstant());
      if (!archivedTimeline.empty()) {
        return archivedTimeline.getInstantsAsStream()
            .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline)).collect(Collectors.toList());
      }
    }
    return Collections.emptyList();
  }
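  /**
   * Returns the active read timeline holding the completed and compaction instants,
   * filtered as per the user specified configs.
   */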
  private HoodieTimeline getReadTimeline(HoodieTableMetaClient metaClient) {
    HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
    return filterInstantsAsPerUserConfigs(timeline);
  }

  private HoodieTimeline getArchivedReadTimeline(HoodieTableMetaClient metaClient, String startInstant) {
    HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(startInstant, false);
    HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants();
    return filterInstantsAsPerUserConfigs(archivedCompleteTimeline);
  }

  /**
   * Returns the instants with a given issuedInstant to start from.
   *
   * @param commitTimeline The completed commits timeline
   * @param issuedInstant  The last issued instant that has already been delivered to downstream
   *
   * @return the filtered hoodie instants
   */
  @VisibleForTesting
  public List<HoodieInstant> filterInstantsWithRange(
      HoodieTimeline commitTimeline,
      @Nullable final String issuedInstant) {
    HoodieTimeline completedTimeline = commitTimeline.filterCompletedInstants();
    if (issuedInstant != null) {
      // returns early for streaming mode
      return completedTimeline
          .getInstantsAsStream()
          .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN, issuedInstant))
          .collect(Collectors.toList());
    }

    Stream<HoodieInstant> instantStream = completedTimeline.getInstantsAsStream();

    if (OptionsResolver.hasNoSpecificReadCommits(this.conf)) {
      // by default read from the latest commit
      return completedTimeline.lastInstant().map(Collections::singletonList).orElseGet(Collections::emptyList);
    }

    if (OptionsResolver.isSpecificStartCommit(this.conf)) {
      final String startCommit = this.conf.get(FlinkOptions.READ_START_COMMIT);
      instantStream = instantStream
          .filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, startCommit));
    }
    if (this.conf.getOptional(FlinkOptions.READ_END_COMMIT).isPresent()) {
      final String endCommit = this.conf.get(FlinkOptions.READ_END_COMMIT);
      instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN_OR_EQUALS, endCommit));
    }
    return instantStream.collect(Collectors.toList());
  }

  /**
   * Filters out the unnecessary instants as per the user specified configs.
   *
   * @param timeline The timeline
   *
   * @return the filtered timeline
   */
  @VisibleForTesting
  public HoodieTimeline filterInstantsAsPerUserConfigs(HoodieTimeline timeline) {
    final HoodieTimeline oriTimeline = timeline;
    if (OptionsResolver.isMorTable(this.conf) && this.skipCompaction) {
      // the compaction commit uses 'commit' as action which is tricky
      timeline = timeline.filter(instant -> !instant.getAction().equals(HoodieTimeline.COMMIT_ACTION));
    }
    if (this.skipClustering) {
      timeline = timeline.filter(instant -> !ClusteringUtil.isClusteringInstant(instant, oriTimeline));
    }
    return timeline;
  }

  private static <T> List<T> mergeList(List<T> list1, List<T> list2) {
    if (list1.isEmpty()) {
      return list2;
    }
    if (list2.isEmpty()) {
      return list1;
    }
    List<T> merged = new ArrayList<>(list1);
    merged.addAll(list2);
    return merged;
  }

  // -------------------------------------------------------------------------
  //  Inner Class
  // -------------------------------------------------------------------------

  /**
   * Represents a result of calling {@link #inputSplits}.
   */
  public static class Result {
    private final List<MergeOnReadInputSplit> inputSplits; // input splits
    private final String endInstant;                       // end instant to consume to
    private final String offset;                           // monotonically increasing consumption offset

    public static final Result EMPTY = instance(Collections.emptyList(), "");

    public boolean isEmpty() {
      return this.inputSplits.size() == 0;
    }

    public List<MergeOnReadInputSplit> getInputSplits() {
      return this.inputSplits;
    }

    public String getEndInstant() {
      return this.endInstant;
    }

    @Nullable
    public String getOffset() {
      return offset;
    }

    private Result(List<MergeOnReadInputSplit> inputSplits, String endInstant, @Nullable String offset) {
      this.inputSplits = inputSplits;
      this.endInstant = endInstant;
      this.offset = offset;
    }

    public static Result instance(List<MergeOnReadInputSplit> inputSplits, String endInstant) {
      return new Result(inputSplits, endInstant, null);
    }

    public static Result instance(List<MergeOnReadInputSplit> inputSplits, String endInstant, String offset) {
      return new Result(inputSplits, endInstant, offset);
    }
  }

  /**
   * Builder for {@link IncrementalInputSplits}.
   */
  public static class Builder {
    private Configuration conf;
    private Path path;
    private RowType rowType;
    private long maxCompactionMemoryInBytes;
    // for partition pruning
    private PartitionPruners.PartitionPruner partitionPruner;
    // skip compaction
    private boolean skipCompaction = false;
    // skip clustering
    private boolean skipClustering = false;

    public Builder() {
    }

    public Builder conf(Configuration conf) {
      this.conf = conf;
      return this;
    }

    public Builder path(Path path) {
      this.path = path;
      return this;
    }

    public Builder rowType(RowType rowType) {
      this.rowType = rowType;
      return this;
    }

    public Builder maxCompactionMemoryInBytes(long maxCompactionMemoryInBytes) {
      this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
      return this;
    }

    public Builder partitionPruner(@Nullable PartitionPruners.PartitionPruner partitionPruner) {
      this.partitionPruner = partitionPruner;
      return this;
    }

    public Builder skipCompaction(boolean skipCompaction) {
      this.skipCompaction = skipCompaction;
      return this;
    }

    public Builder skipClustering(boolean skipClustering) {
      this.skipClustering = skipClustering;
      return this;
    }

    public IncrementalInputSplits build() {
      return new IncrementalInputSplits(
          Objects.requireNonNull(this.conf), Objects.requireNonNull(this.path), Objects.requireNonNull(this.rowType),
          this.maxCompactionMemoryInBytes, this.partitionPruner, this.skipCompaction, this.skipClustering);
    }
  }
}




