/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop.realtime;
import org.apache.hudi.common.model.HoodieAvroIndexedRecord;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.storage.HoodieStorageUtils;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import static org.apache.hudi.common.config.HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED;
import static org.apache.hudi.common.config.HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE;
import static org.apache.hudi.common.config.HoodieMemoryConfig.DEFAULT_MR_MAX_DFS_STREAM_BUFFER_SIZE;
import static org.apache.hudi.common.config.HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE;
import static org.apache.hudi.common.config.HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH;
import static org.apache.hudi.common.config.HoodieReaderConfig.ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN;
import static org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getBaseFileReader;
import static org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes;
import static org.apache.hudi.internal.schema.InternalSchema.getEmptyInternalSchema;
/**
* An implementation of {@link AbstractRealtimeRecordReader} that reads from base parquet files and log files,
* and merges the records on the fly. It differs from {@link HoodieRealtimeRecordReader} in that it does not
* implement Hadoop's RecordReader interface; instead it implements the Iterator interface and yields
* {@link HoodieRecord}s that are {@link HoodieAvroIndexedRecord}s. This can be used by query engines like
* Trino that do not use Hadoop's RecordReader interface. However, the engine must support reading from iterators
* and also support Avro (de)serialization.
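* <p>
* A minimal consumption sketch; {@code createReader()} below is a hypothetical helper standing in for the
* constructor call documented on the constructor of this class:
* <pre>{@code
* try (HoodieMergeOnReadSnapshotReader reader = createReader()) { // hypothetical helper building the reader
*   while (reader.hasNext()) {
*     HoodieRecord record = reader.next();
*     // hand the merged record to the engine-specific Avro deserialization path
*   }
* }
* }</pre>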
*/
public class HoodieMergeOnReadSnapshotReader extends AbstractRealtimeRecordReader implements Iterator<HoodieRecord>, AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(HoodieMergeOnReadSnapshotReader.class);
private final String tableBasePath;
private final List<HoodieLogFile> logFilePaths;
private final String latestInstantTime;
private final Schema readerSchema;
private final JobConf jobConf;
private final HoodieMergedLogRecordScanner logRecordScanner;
private final HoodieFileReader baseFileReader;
private final Map<String, HoodieRecord> logRecordsByKey;
private final Iterator<HoodieRecord> recordsIterator;
private final ExternalSpillableMap<String, HoodieRecord> mergedRecordsByKey;
/**
* In order to instantiate this record reader, one needs to provide the following parameters.
* An example usage is demonstrated in TestHoodieMergeOnReadSnapshotReader.
*
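* <p>A rough construction sketch; the paths, schema, and configuration values below are illustrative
* placeholders rather than values taken from a real table:
* <pre>{@code
* List<HoodieLogFile> logFiles = ...; // log files of the latest file slice for the base file's file group
* HoodieMergeOnReadSnapshotReader reader = new HoodieMergeOnReadSnapshotReader(
*     "/tmp/hudi_table",                        // tableBasePath (placeholder)
*     "/tmp/hudi_table/p1/base_file.parquet",   // baseFilePath (placeholder)
*     logFiles,
*     "20240101000000000",                      // latestInstantTime (placeholder)
*     readerSchema,                             // Avro schema to read with
*     jobConf,                                  // Hadoop JobConf carrying the Hudi read configs
*     0L,                                       // start offset of the split
*     baseFileLength);                          // length of the split
* }</pre>
*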
* @param tableBasePath Base path of the Hudi table
* @param baseFilePath Path of the base file as of the latest instant time for the split being processed
* @param logFilePaths Paths of the log files as of the latest file slices pertaining to file group id of the base file
* @param latestInstantTime Latest instant time
* @param readerSchema Schema of the reader
* @param jobConf Any job configuration
* @param start Start offset
* @param length Length of the split
*/
public HoodieMergeOnReadSnapshotReader(String tableBasePath,
String baseFilePath,
List<HoodieLogFile> logFilePaths,
String latestInstantTime,
Schema readerSchema,
JobConf jobConf,
long start,
long length) throws IOException {
super(getRealtimeSplit(tableBasePath, baseFilePath, logFilePaths, latestInstantTime, start, length, new String[0]), jobConf);
this.tableBasePath = tableBasePath;
this.logFilePaths = logFilePaths;
this.latestInstantTime = latestInstantTime;
this.readerSchema = readerSchema;
this.jobConf = jobConf;
HoodieTimer timer = new HoodieTimer().startTimer();
this.logRecordScanner = getMergedLogRecordScanner();
LOG.debug("Time taken to scan log records: {}", timer.endTimer());
this.baseFileReader = getBaseFileReader(new Path(baseFilePath), jobConf);
this.logRecordsByKey = logRecordScanner.getRecords();
Set<String> logRecordKeys = new HashSet<>(this.logRecordsByKey.keySet());
this.mergedRecordsByKey = new ExternalSpillableMap<>(
getMaxCompactionMemoryInBytes(jobConf),
jobConf.get(SPILLABLE_MAP_BASE_PATH.key(),
FileIOUtils.getDefaultSpillableMapBasePath()),
new DefaultSizeEstimator(),
new HoodieRecordSizeEstimator(readerSchema),
jobConf.getEnum(SPILLABLE_DISK_MAP_TYPE.key(), SPILLABLE_DISK_MAP_TYPE.defaultValue()),
jobConf.getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()));
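// For every record key that appears in both the base file and the scanner's merged log records,
// render the log-side record and keep a copy in the spillable map, keyed by record key.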
try (ClosableIterator<String> baseFileIterator = baseFileReader.getRecordKeyIterator()) {
timer.startTimer();
while (baseFileIterator.hasNext()) {
String key = baseFileIterator.next();
if (logRecordKeys.contains(key)) {
logRecordKeys.remove(key);
Option<HoodieAvroIndexedRecord> mergedRecord = buildGenericRecordWithCustomPayload(logRecordsByKey.get(key));
if (mergedRecord.isPresent()) {
HoodieRecord hoodieRecord = mergedRecord.get().copy();
mergedRecordsByKey.put(key, hoodieRecord);
}
}
}
}
LOG.debug("Time taken to merge base file and log file records: {}", timer.endTimer());
this.recordsIterator = mergedRecordsByKey.values().iterator();
}
@Override
public boolean hasNext() {
return recordsIterator.hasNext();
}
@Override
public HoodieRecord next() {
return recordsIterator.next();
}
public Map<String, HoodieRecord> getRecordsByKey() {
return mergedRecordsByKey;
}
public Iterator<HoodieRecord> getRecordsIterator() {
return recordsIterator;
}
public Map<String, HoodieRecord> getLogRecordsByKey() {
return logRecordsByKey;
}
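// Builds the HoodieRealtimeFileSplit handed to the parent AbstractRealtimeRecordReader in the constructor,
// wrapping the base file path, its log files, and the latest instant time.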
private static HoodieRealtimeFileSplit getRealtimeSplit(String tableBasePath, String baseFilePath,
List<HoodieLogFile> logFilePaths,
String latestInstantTime,
long start, long length, String[] hosts) {
HoodieRealtimePath realtimePath = new HoodieRealtimePath(
new Path(baseFilePath).getParent(),
baseFilePath,
tableBasePath,
logFilePaths,
latestInstantTime,
false, // TODO: Fix this to support incremental queries
Option.empty());
return HoodieInputFormatUtils.createRealtimeFileSplit(realtimePath, start, length, hosts);
}
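// Scans the log files of the file slice and merges records per key, spilling to disk once the memory
// budget from getMaxCompactionMemoryInBytes(jobConf) is exhausted; buffer size, spill path, disk map
// type, and compression settings are all read from the JobConf.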
private HoodieMergedLogRecordScanner getMergedLogRecordScanner() {
return HoodieMergedLogRecordScanner.newBuilder()
.withStorage(HoodieStorageUtils.getStorage(
split.getPath().toString(), HadoopFSUtils.getStorageConf(jobConf)))
.withBasePath(tableBasePath)
.withLogFilePaths(logFilePaths.stream().map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()))
.withReaderSchema(readerSchema)
.withLatestInstantTime(latestInstantTime)
.withMaxMemorySizeInBytes(getMaxCompactionMemoryInBytes(jobConf))
.withReverseReader(false)
.withBufferSize(jobConf.getInt(MAX_DFS_STREAM_BUFFER_SIZE.key(),
DEFAULT_MR_MAX_DFS_STREAM_BUFFER_SIZE))
.withSpillableMapBasePath(jobConf.get(SPILLABLE_MAP_BASE_PATH.key(),
FileIOUtils.getDefaultSpillableMapBasePath()))
.withDiskMapType(jobConf.getEnum(SPILLABLE_DISK_MAP_TYPE.key(), SPILLABLE_DISK_MAP_TYPE.defaultValue()))
.withBitCaskDiskMapCompressionEnabled(jobConf.getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()))
.withOptimizedLogBlocksScan(jobConf.getBoolean(ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN.key(),
Boolean.parseBoolean(ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN.defaultValue())))
.withInternalSchema(schemaEvolutionContext.internalSchemaOption.orElse(getEmptyInternalSchema()))
.build();
}
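// Renders a merged log record as an Avro IndexedRecord, using the writer schema when a custom payload
// class is configured for the table and the reader schema otherwise.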
private Option<HoodieAvroIndexedRecord> buildGenericRecordWithCustomPayload(HoodieRecord record) throws IOException {
if (usesCustomPayload) {
return record.toIndexedRecord(getWriterSchema(), payloadProps);
} else {
return record.toIndexedRecord(readerSchema, payloadProps);
}
}
@Override
public void close() throws Exception {
if (baseFileReader != null) {
baseFileReader.close();
}
if (logRecordScanner != null) {
logRecordScanner.close();
}
if (mergedRecordsByKey != null) {
mergedRecordsByKey.close();
}
}
}