/*
 *  Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.uber.hoodie.io.compact;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieAvroPayload;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieCompactedLogRecordScanner;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable;
import java.util.Collection;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import static java.util.stream.Collectors.*;

/**
 * HoodieRealtimeTableCompactor compacts a Hoodie table with merge-on-read storage.
 * It computes all possible compactions, passes them through a CompactionFilter, executes
 * the selected compactions, writes a new version of the base files, and makes a normal commit.
 *
 * @see HoodieCompactor
 */
public class HoodieRealtimeTableCompactor implements HoodieCompactor {

  private static Logger log = LogManager.getLogger(HoodieRealtimeTableCompactor.class);

  @Override
  public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
      HoodieTable hoodieTable) throws IOException {
    Preconditions.checkArgument(
        hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
        "HoodieRealtimeTableCompactor can only compact table of type "
            + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient()
            .getTableType().name());

    // TODO - rollback any compactions in flight

    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
    String compactionCommit = startCompactionCommit(hoodieTable);
    log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommit);
    List<String> partitionPaths =
        FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning());

    log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    List<CompactionOperation> operations =
        jsc.parallelize(partitionPaths, partitionPaths.size())
            .flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> hoodieTable
                .getRTFileSystemView()
                .getLatestFileSlices(partitionPath)
                .map(s -> new CompactionOperation(s.getDataFile().get(),
                        partitionPath, s.getLogFiles().collect(Collectors.toList()), config))
                .collect(toList()).iterator()).collect();
    log.info("Total of " + operations.size() + " compactions are retrieved");

    // Filter the compactions with the passed-in filter. This lets us choose only the most effective compactions.
    operations = config.getCompactionStrategy().orderAndFilter(config, operations);
    if (operations.isEmpty()) {
      log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
      return null;
    }

    log.info("After filtering, Compacting " + operations + " files");
    List<CompactionWriteStat> updateStatusMap =
        jsc.parallelize(operations, operations.size())
            .map(s -> executeCompaction(metaClient, config, s, compactionCommit))
            .flatMap(new FlatMapFunction<List<CompactionWriteStat>, CompactionWriteStat>() {
              @Override
              public Iterator<CompactionWriteStat> call(
                  List<CompactionWriteStat> compactionWriteStats)
                  throws Exception {
                return compactionWriteStats.iterator();
              }
            }).collect();

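    // Aggregate the per-file write stats into a single compaction metadata object, keyed by partition path.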
    HoodieCompactionMetadata metadata = new HoodieCompactionMetadata();
    for (CompactionWriteStat stat : updateStatusMap) {
      metadata.addWriteStat(stat.getPartitionPath(), stat);
    }

    log.info("Compaction finished with result " + metadata);

    //noinspection ConstantConditions
    if (isCompactionSucceeded(metadata)) {
      log.info("Compaction succeeded " + compactionCommit);
      commitCompaction(compactionCommit, metaClient, metadata);
    } else {
      log.info("Compaction failed " + compactionCommit);
    }
    return metadata;
  }

  private boolean isCompactionSucceeded(HoodieCompactionMetadata result) {
    //TODO figure out a success factor for a compaction
    return true;
  }

  private List<CompactionWriteStat> executeCompaction(HoodieTableMetaClient metaClient,
      HoodieWriteConfig config, CompactionOperation operation, String commitTime)
      throws IOException {
    FileSystem fs = FSUtils.getFs();
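    // Build the reader schema with Hoodie's metadata fields included, so records scanned from the
    // log files line up with the records already present in the base file.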
    Schema readerSchema =
        HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));

    log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation
        .getDeltaFilePaths() + " for commit " + commitTime);
    // TODO - FIX THIS
    // This currently reads the entire avro log file; only the specific blocks required should be
    // read from it (for failure recovery). Ideally, load all the delta commits since the last
    // compaction commit, collect the blocks to be loaded, and load them with CompositeAvroLogReader.
    // Since a DeltaCommit is not defined yet, all records are read. Revisit this soon.

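    // Merge-read all records from the delta (log) files; if the scan yields no records, there is
    // nothing to apply on top of the base file.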
    HoodieCompactedLogRecordScanner scanner = new HoodieCompactedLogRecordScanner(fs, operation.getDeltaFilePaths(), readerSchema);
    if (!scanner.iterator().hasNext()) {
      return Lists.newArrayList();
    }

    // Compacting is very similar to applying updates to existing file
    HoodieCopyOnWriteTable<HoodieAvroPayload> table =
        new HoodieCopyOnWriteTable<>(config, metaClient);
    Iterator<List<WriteStatus>> result = table
        .handleUpdate(commitTime, operation.getFileId(), scanner.iterator());
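    // handleUpdate returns a lazy iterator of WriteStatus batches; stream it, flatten the
    // batches, and turn each WriteStatus into a CompactionWriteStat enriched with the
    // log-scan counters collected above.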
    Iterable<List<WriteStatus>> resultIterable = () -> result;
    return StreamSupport.stream(resultIterable.spliterator(), false)
        .flatMap(Collection::stream)
        .map(WriteStatus::getStat)
        .map(s -> CompactionWriteStat.newBuilder().withHoodieWriteStat(s)
            .setTotalRecordsToUpdate(scanner.getTotalRecordsToUpdate())
            .setTotalLogFiles(scanner.getTotalLogFiles())
            .setTotalLogRecords(scanner.getTotalLogRecords())
            .onPartition(operation.getPartitionPath()).build())
        .collect(toList());
  }

  public boolean commitCompaction(String commitTime, HoodieTableMetaClient metaClient,
      HoodieCompactionMetadata metadata) {
    log.info("Committing Compaction " + commitTime);
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();

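    // Transition the inflight compaction instant to completed, persisting the serialized
    // compaction metadata as the instant's payload.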
    try {
      activeTimeline.saveAsComplete(
          new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime),
          Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    } catch (IOException e) {
      throw new HoodieCompactionException(
          "Failed to commit " + metaClient.getBasePath() + " at time " + commitTime, e);
    }
    return true;
  }
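
  // NOTE: startCompactionCommit is referenced in compact() above but its body is not shown in
  // this listing. The sketch below is an assumed reconstruction: it creates a new commit time and
  // marks the compaction instant as inflight on the active timeline. The calls to
  // HoodieActiveTimeline.createNewCommitTime() and createInflight(...) are assumptions based on
  // how commits are started elsewhere in Hoodie.
  private String startCompactionCommit(HoodieTable hoodieTable) {
    String commitTime = HoodieActiveTimeline.createNewCommitTime();
    hoodieTable.getActiveTimeline()
        .createInflight(new HoodieInstant(true, HoodieTimeline.COMPACTION_ACTION, commitTime));
    return commitTime;
  }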

}