org.apache.hudi.io.HoodieMergedReadHandle
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.io;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.table.HoodieTable;

import org.apache.avro.Schema;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static java.util.stream.Collectors.toList;
import static org.apache.hudi.common.util.StringUtils.nonEmpty;
import static org.apache.hudi.common.util.ValidationUtils.checkState;

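/**
 * A read handle that produces the merged view of a single file slice: records from the base file
 * are combined with the corresponding delta records in the log files via the configured
 * {@link HoodieRecordMerger}, yielding the latest version of every record in the file group.
 *
 * <p>A minimal usage sketch, assuming an already-built {@code HoodieWriteConfig}, {@code HoodieTable},
 * completed instant time, and a partition-path/file-id pair:
 * <pre>{@code
 *   List<HoodieRecord<T>> merged = new HoodieMergedReadHandle<>(
 *       config, Option.of(instantTime), hoodieTable, Pair.of(partitionPath, fileId))
 *       .getMergedRecords();
 * }</pre>
 */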
public class HoodieMergedReadHandle<T, I, K, O> extends HoodieReadHandle<T, I, K, O> {

  protected final Schema readerSchema;
  protected final Schema baseFileReaderSchema;
  private final Option<FileSlice> fileSliceOpt;

  public HoodieMergedReadHandle(HoodieWriteConfig config,
                                Option<String> instantTime,
                                HoodieTable<T, I, K, O> hoodieTable,
                                Pair<String, String> partitionPathFileIDPair) {
    this(config, instantTime, hoodieTable, partitionPathFileIDPair, Option.empty());
  }

  public HoodieMergedReadHandle(HoodieWriteConfig config,
                                Option<String> instantTime,
                                HoodieTable<T, I, K, O> hoodieTable,
                                Pair<String, String> partitionPathFileIDPair,
                                Option<FileSlice> fileSliceOption) {
    super(config, instantTime, hoodieTable, partitionPathFileIDPair);
    readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
    // config.getSchema is not canonicalized, while config.getWriteSchema is, so the canonicalized schema must be used to read the existing data.
    baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
    fileSliceOpt = fileSliceOption.isPresent() ? fileSliceOption : getLatestFileSlice();
  }

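  /**
   * Reads the file slice and returns the merged records, each stamped with its current
   * {@link HoodieRecordLocation}. Returns an empty list when no file slice is available.
   */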
  public List<HoodieRecord<T>> getMergedRecords() {
    if (!fileSliceOpt.isPresent()) {
      return Collections.emptyList();
    }
    checkState(nonEmpty(instantTime), String.format("Expected a valid instant time but got `%s`", instantTime));
    final FileSlice fileSlice = fileSliceOpt.get();
    String baseFileInstantTime = fileSlice.getBaseFile().get().getCommitTime();
    final HoodieRecordLocation currentLocation = new HoodieRecordLocation(baseFileInstantTime, fileSlice.getFileId());
    Option<HoodieFileReader> baseFileReader = Option.empty();
    HoodieMergedLogRecordScanner logRecordScanner = null;
    try {
      baseFileReader = getBaseFileReader(fileSlice);
      logRecordScanner = getLogRecordScanner(fileSlice);
      List<HoodieRecord<T>> mergedRecords = new ArrayList<>();
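      // Stamp each merged record with its current location (base file instant time + file id).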
      doMergedRead(baseFileReader, logRecordScanner).forEach(r -> {
        r.unseal();
        r.setCurrentLocation(currentLocation);
        r.seal();
        mergedRecords.add(r);
      });
      return mergedRecords;
    } catch (IOException e) {
      throw new HoodieIndexException("Error in reading " + fileSlice, e);
    } finally {
      if (baseFileReader.isPresent()) {
        baseFileReader.get().close();
      }
      if (logRecordScanner != null) {
        logRecordScanner.close();
      }
    }
  }

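  /**
   * Looks up the latest file slice for the handle's partition path and file id, provided at least
   * one completed commit exists on the timeline.
   */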
  private Option<FileSlice> getLatestFileSlice() {
    if (nonEmpty(instantTime)
        && hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant().isPresent()) {
      return Option.fromJavaOptional(hoodieTable
          .getHoodieView()
          .getLatestFileSlices(partitionPathFileIDPair.getLeft())
          .filter(fileSlice -> fileSlice.getFileId().equals(partitionPathFileIDPair.getRight()))
          .findFirst());
    }
    return Option.empty();
  }

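  /**
   * Opens a {@link HoodieFileReader} for the base file of the given file slice, if the slice has one.
   */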
  private Option<HoodieFileReader> getBaseFileReader(FileSlice fileSlice) throws IOException {
    if (fileSlice.getBaseFile().isPresent()) {
      return Option.of(createNewFileReader(fileSlice.getBaseFile().get()));
    }
    return Option.empty();
  }

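  /**
   * Builds a {@link HoodieMergedLogRecordScanner} over the slice's log files (sorted by the log file
   * comparator), using the handle's reader schema and the write config's merge and spill settings.
   */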
  private HoodieMergedLogRecordScanner getLogRecordScanner(FileSlice fileSlice) {
    List<String> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
        .map(l -> l.getPath().toString()).collect(toList());
    return HoodieMergedLogRecordScanner.newBuilder()
        .withStorage(storage)
        .withBasePath(hoodieTable.getMetaClient().getBasePath())
        .withLogFilePaths(logFilePaths)
        .withReaderSchema(readerSchema)
        .withLatestInstantTime(instantTime)
        .withMaxMemorySizeInBytes(IOUtils.getMaxMemoryPerCompaction(hoodieTable.getTaskContextSupplier(), config))
        .withReverseReader(config.getCompactionReverseLogReadEnabled())
        .withBufferSize(config.getMaxDFSStreamBufferSize())
        .withSpillableMapBasePath(config.getSpillableMapBasePath())
        .withPartition(fileSlice.getPartitionPath())
        .withOptimizedLogBlocksScan(config.enableOptimizedLogBlocksScan())
        .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
        .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
        .withRecordMerger(config.getRecordMerger())
        .withTableMetaClient(hoodieTable.getMetaClient())
        .build();
  }

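  /**
   * Merges the base file records with the delta records from the log scanner: records sharing a key
   * are combined via the configured {@link HoodieRecordMerger}, and delta records with no base file
   * counterpart are appended as-is.
   */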
  private List<HoodieRecord<T>> doMergedRead(Option<HoodieFileReader> baseFileReaderOpt, HoodieMergedLogRecordScanner logRecordScanner) throws IOException {
    List<HoodieRecord<T>> mergedRecords = new ArrayList<>();
    Map<String, HoodieRecord> deltaRecordMap = logRecordScanner.getRecords();
    Set<String> deltaRecordKeys = new HashSet<>(deltaRecordMap.keySet());

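    // Merge base file records against any overlapping delta records from the log files.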
    if (baseFileReaderOpt.isPresent()) {
      HoodieFileReader baseFileReader = baseFileReaderOpt.get();
      HoodieRecordMerger recordMerger = config.getRecordMerger();
      ClosableIterator<HoodieRecord> baseFileItr = baseFileReader.getRecordIterator(baseFileReaderSchema);
      HoodieTableConfig tableConfig = hoodieTable.getMetaClient().getTableConfig();
      Option<Pair<String, String>> simpleKeyGenFieldsOpt =
          tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()));
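      // Wrap each base file record with its key/payload metadata, then merge it with the matching delta record, if any.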
      while (baseFileItr.hasNext()) {
        HoodieRecord record = baseFileItr.next().wrapIntoHoodieRecordPayloadWithParams(readerSchema,
            config.getProps(), simpleKeyGenFieldsOpt, logRecordScanner.isWithOperationField(), logRecordScanner.getPartitionNameOverride(), false, Option.empty());
        String key = record.getRecordKey();
        if (deltaRecordMap.containsKey(key)) {
          deltaRecordKeys.remove(key);
          Option<Pair<HoodieRecord, Schema>> mergeResult = recordMerger
              .merge(record, readerSchema, deltaRecordMap.get(key), readerSchema, config.getPayloadConfig().getProps());
          if (!mergeResult.isPresent()) {
            continue;
          }
          HoodieRecord r = mergeResult.get().getLeft().wrapIntoHoodieRecordPayloadWithParams(readerSchema,
              config.getProps(), simpleKeyGenFieldsOpt, logRecordScanner.isWithOperationField(), logRecordScanner.getPartitionNameOverride(), false, Option.empty());
          mergedRecords.add(r);
        } else {
          mergedRecords.add(record.copy());
        }
      }
    }

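    // Remaining delta records had no base file counterpart; include them as-is.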
    for (String key : deltaRecordKeys) {
      mergedRecords.add(deltaRecordMap.get(key));
    }

    return mergedRecords;
  }
}




