All downloads are FREE. The search and download functionality uses the official Maven repository.

com.uber.hoodie.common.model.HoodieFileGroup Maven / Gradle / Ivy

There is a newer version: 0.4.7
Show newest version
/*
 *  Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */

package com.uber.hoodie.common.model;

import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;

import org.apache.commons.lang3.tuple.Pair;

import java.io.Serializable;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.TreeMap;
import java.util.stream.Stream;

/**
 * A set of data/base files plus a set of log files that together make up a unit for all operations.
 *
 * <p>Files are organised into {@link FileSlice}s keyed by their base commit time. All getters only
 * expose slices that pass {@link #isFileSliceCommitted(FileSlice)} against the timeline supplied at
 * construction.
 */
public class HoodieFileGroup implements Serializable {

    /**
     * Comparator that orders commit times (compared as plain strings) in reverse natural order,
     * so that the greatest commit time sorts first.
     *
     * @return a comparator equivalent to {@code (o1, o2) -> o2.compareTo(o1)}
     */
    public static Comparator<String> getReverseCommitTimeComparator() {
        // Same ordering as a hand-written reversing lambda, but the JDK-provided comparator is
        // Serializable, which matters because it is stored inside the TreeMap field below.
        return Comparator.reverseOrder();
    }


    /**
     * Partition containing the file group.
     */
    private final String partitionPath;

    /**
     * Uniquely identifies the file group.
     */
    private final String id;

    /**
     * Slices of files in this group keyed by base commit time, sorted with greater commit first.
     */
    private final TreeMap<String, FileSlice> fileSlices;

    /**
     * Timeline, based on which all getters work.
     */
    private final HoodieTimeline timeline;

    /**
     * The last instant of the timeline, captured at construction; acts as a high watermark for
     * all getters.
     *
     * NOTE(review): java.util.Optional does not implement Serializable, so Java-native
     * serialization of this class would fail on this field. Presumably instances are serialized
     * by other means - confirm before relying on ObjectOutputStream.
     */
    private final Optional<HoodieInstant> lastInstant;

    /**
     * Creates an empty file group.
     *
     * @param partitionPath partition containing the file group
     * @param id            identifier of the file group
     * @param timeline      timeline used to decide which slices are visible through the getters
     */
    public HoodieFileGroup(String partitionPath, String id, HoodieTimeline timeline) {
        this.partitionPath = partitionPath;
        this.id = id;
        this.fileSlices = new TreeMap<>(HoodieFileGroup.getReverseCommitTimeComparator());
        this.timeline = timeline;
        this.lastInstant = timeline.lastInstant();
    }

    /**
     * Returns the slice registered for the given base commit time, creating and registering an
     * empty one first if none exists yet.
     */
    private FileSlice getOrCreateSlice(String baseCommitTime) {
        return fileSlices.computeIfAbsent(baseCommitTime, time -> new FileSlice(time, id));
    }

    /**
     * Add a new data file into the file group. The file is attached to the slice keyed by the
     * data file's commit time; the slice is created if it does not exist yet.
     *
     * @param dataFile data file to add
     */
    public void addDataFile(HoodieDataFile dataFile) {
        getOrCreateSlice(dataFile.getCommitTime()).setDataFile(dataFile);
    }

    /**
     * Add a new log file into the group. The file is attached to the slice keyed by the log
     * file's base commit time; the slice is created if it does not exist yet.
     *
     * @param logFile log file to add
     */
    public void addLogFile(HoodieLogFile logFile) {
        getOrCreateSlice(logFile.getBaseCommitTime()).addLogFile(logFile);
    }

    /**
     * @return the identifier of this file group
     */
    public String getId() {
        return id;
    }

    /**
     * @return the partition path containing this file group
     */
    public String getPartitionPath() {
        return partitionPath;
    }

    /**
     * A FileSlice is considered committed if both of the following hold:
     *  - the timeline reports {@code containsOrBeforeTimelineStarts} for its base commit time
     *  - its base commit time is {@code <=} the timestamp of the last instant captured at
     *    construction
     *
     * @param slice slice to check
     * @return {@code true} if the slice is committed; {@code false} otherwise, including when no
     *         last instant was captured
     */
    private boolean isFileSliceCommitted(FileSlice slice) {
        if (!lastInstant.isPresent()) {
            // Without a high watermark nothing can be considered committed; avoids an
            // unchecked Optional.get().
            return false;
        }
        String maxCommitTime = lastInstant.get().getTimestamp();
        String baseCommitTime = slice.getBaseCommitTime();
        return timeline.containsOrBeforeTimelineStarts(baseCommitTime)
                && HoodieTimeline.compareTimestamps(baseCommitTime,
                        maxCommitTime,
                        HoodieTimeline.LESSER_OR_EQUAL);
    }

    /**
     * Provides a stream of committed file slices, sorted by reverse base commit time.
     *
     * @return committed slices, greatest base commit time first; empty if the timeline is empty
     */
    public Stream<FileSlice> getAllFileSlices() {
        if (timeline.empty()) {
            return Stream.empty();
        }
        // values() iterates in the TreeMap's key order, i.e. greatest commit time first.
        return fileSlices.values().stream()
                .filter(this::isFileSliceCommitted);
    }

    /**
     * Gets the latest committed slice - this can contain either
     *
     *  - just the log files without data file
     *  - (or) data file with 0 or more log files
     *
     * @return the committed slice with the greatest base commit time, or empty if there is none
     */
    public Optional<FileSlice> getLatestFileSlice() {
        return getAllFileSlices().findFirst();
    }

    /**
     * Obtain the latest committed file slice up to a commit time, i.e. with base commit time
     * {@code <= maxCommitTime}.
     *
     * @param maxCommitTime inclusive upper bound on the slice's base commit time
     * @return the matching slice with the greatest base commit time, or empty if there is none
     */
    public Optional<FileSlice> getLatestFileSliceBeforeOrOn(String maxCommitTime) {
        return getAllFileSlices()
                .filter(slice ->
                        HoodieTimeline.compareTimestamps(slice.getBaseCommitTime(),
                                maxCommitTime,
                                HoodieTimeline.LESSER_OR_EQUAL))
                .findFirst();
    }

    /**
     * Obtain the latest committed file slice whose base commit time is contained in the provided
     * commit range.
     *
     * @param commitRange commit times to match against
     * @return the matching slice with the greatest base commit time, or empty if there is none
     */
    public Optional<FileSlice> getLatestFileSliceInRange(List<String> commitRange) {
        return getAllFileSlices()
                .filter(slice -> commitRange.contains(slice.getBaseCommitTime()))
                .findFirst();
    }

    /**
     * Stream of data files belonging to committed slices, in the same order as
     * {@link #getAllFileSlices()}. Slices without a data file are skipped.
     *
     * @return data files of committed slices
     */
    public Stream<HoodieDataFile> getAllDataFiles() {
        return getAllFileSlices()
                .filter(slice -> slice.getDataFile().isPresent())
                .map(slice -> slice.getDataFile().get());
    }

    /**
     * Get the latest committed data file.
     *
     * @return the first data file of {@link #getAllDataFiles()}, or empty if there is none
     */
    public Optional<HoodieDataFile> getLatestDataFile() {
        return getAllDataFiles().findFirst();
    }

    /**
     * Get the latest data file whose commit time is {@code <= maxCommitTime}.
     *
     * @param maxCommitTime inclusive upper bound on the data file's commit time
     * @return the first matching data file of {@link #getAllDataFiles()}, or empty if none
     */
    public Optional<HoodieDataFile> getLatestDataFileBeforeOrOn(String maxCommitTime) {
        return getAllDataFiles()
                .filter(dataFile ->
                        HoodieTimeline.compareTimestamps(dataFile.getCommitTime(),
                                maxCommitTime,
                                HoodieTimeline.LESSER_OR_EQUAL))
                .findFirst();
    }

    /**
     * Get the latest data file that is contained within the provided commit range.
     *
     * @param commitRange commit times to match against
     * @return the first matching data file of {@link #getAllDataFiles()}, or empty if none
     */
    public Optional<HoodieDataFile> getLatestDataFileInRange(List<String> commitRange) {
        return getAllDataFiles()
                .filter(dataFile -> commitRange.contains(dataFile.getCommitTime()))
                .findFirst();
    }

    /**
     * Obtain the first log file of the latest committed file slice.
     *
     * NOTE(review): presumably FileSlice streams its log files latest-first, making this the log
     * file currently being appended to - confirm against FileSlice.
     *
     * @return logfile if present, empty if there is no committed slice or it has no log files
     */
    public Optional<HoodieLogFile> getLatestLogFile() {
        // findFirst() already yields empty for a slice without log files, so no separate
        // count() pass over the stream is needed.
        return getLatestFileSlice().flatMap(slice -> slice.getLogFiles().findFirst());
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("HoodieFileGroup {");
        sb.append("id=").append(id);
        sb.append(", fileSlices='").append(fileSlices).append('\'');
        sb.append('}');
        return sb.toString();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy