/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.hive.mapred;

import org.apache.paimon.fs.FileIO;
import org.apache.paimon.fs.Path;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.BatchTableCommit;
import org.apache.paimon.table.sink.BatchTableWrite;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.CommitMessage;

import org.apache.paimon.shade.guava30.com.google.common.collect.ImmutableMap;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static org.apache.paimon.hive.utils.HiveUtils.createFileStoreTable;

/**
 * A Paimon table committer for adding data files to the Paimon table. Each task persists its
 * prepared {@link CommitMessage}s to a temporary preCommit file, and the job-level commit reads
 * these files back to perform a single atomic table commit.
 */
public class PaimonOutputCommitter extends OutputCommitter {

    private static final String PRE_COMMIT = ".preCommit";

    private static final Logger LOG = LoggerFactory.getLogger(PaimonOutputCommitter.class);

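    /** No job-level setup is required for Paimon. */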
    @Override
    public void setupJob(JobContext jobContext) throws IOException {}

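    /** No task-level setup is required for Paimon. */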
    @Override
    public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {}

    @Override
    public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
        // We need to commit only in the final phase of the job: reduce tasks, or map tasks
        // when the job is map-only
        return TaskType.REDUCE.equals(
                        taskAttemptContext.getTaskAttemptID().getTaskID().getTaskType())
                || taskAttemptContext.getJobConf().getNumReduceTasks() == 0;
    }

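    /**
     * Collects the commit messages prepared by this task's writer and persists them to a
     * temporary preCommit file, to be picked up by {@link #commitJob(JobContext)}.
     */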
    @Override
    public void commitTask(TaskAttemptContext originalContext) throws IOException {
        TaskAttemptContext taskAttemptContext =
                TezUtil.enrichContextWithAttemptWrapper(originalContext);
        TaskAttemptID attemptID = taskAttemptContext.getTaskAttemptID();
        JobConf jobConf = taskAttemptContext.getJobConf();
        FileStoreTable table = createFileStoreTable(jobConf);

        Map<String, PaimonRecordWriter> writers =
                Optional.ofNullable(PaimonRecordWriter.getWriters(attemptID))
                        .orElseGet(
                                () -> {
                                    LOG.info(
                                            "CommitTask found no writers for output table: {}, attemptID: {}",
                                            table.name(),
                                            attemptID);
                                    return ImmutableMap.of();
                                });
        PaimonRecordWriter writer = writers.get(table.name());
        if (writer != null) {
            try (BatchTableWrite batchTableWrite = writer.batchTableWrite()) {
                List<CommitMessage> commitTables = batchTableWrite.prepareCommit();
                createPreCommitFile(
                        commitTables,
                        generatePreCommitFileLocation(
                                table.location(),
                                attemptID.getJobID(),
                                attemptID.getTaskID().getId()),
                        table.fileIO());
                writer.close(true);
            } catch (Exception e) {
                LOG.error(
                        "CommitTask prepareCommit error for specific table: {}, attemptID: {}",
                        table.name(),
                        attemptID,
                        e);
                throw new RuntimeException(e);
            }
        } else {
            LOG.info(
                    "CommitTask found no writer for specific table: {}, attemptID: {}",
                    table.name(),
                    attemptID);
        }
        PaimonRecordWriter.removeWriters(attemptID);
    }

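    /**
     * Removes the writers registered for the aborted task attempt and closes them, deleting the
     * files they produced.
     */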
    @Override
    public void abortTask(TaskAttemptContext originalContext) throws IOException {
        TaskAttemptContext taskAttemptContext =
                TezUtil.enrichContextWithAttemptWrapper(originalContext);
        Map<String, PaimonRecordWriter> writers =
                PaimonRecordWriter.removeWriters(taskAttemptContext.getTaskAttemptID());

        // close writer and delete files
        if (writers != null) {
            for (PaimonRecordWriter writer : writers.values()) {
                writer.close(true);
            }
        }
    }

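    /**
     * Reads back the preCommit files written by all tasks, commits the collected commit messages
     * to the Paimon table in one batch commit, and deletes the job's temporary files.
     */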
    @Override
    public void commitJob(JobContext originalContext) throws IOException {
        JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
        JobConf jobConf = jobContext.getJobConf();

        long startTime = System.currentTimeMillis();
        LOG.info("CommitJob {} has started", jobContext.getJobID());
        FileStoreTable table = createFileStoreTable(jobConf);

        if (table != null) {
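            // Restore the commit messages persisted by each task and commit them all in a
            // single batch commit.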
            BatchWriteBuilder batchWriteBuilder = table.newBatchWriteBuilder();
            List<CommitMessage> commitMessagesList =
                    getAllPreCommitMessage(table.location(), jobContext, table.fileIO());
            try (BatchTableCommit batchTableCommit = batchWriteBuilder.newCommit()) {
                batchTableCommit.commit(commitMessagesList);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            deleteTemporaryFile(
                    jobContext,
                    generateJobLocation(table.location(), jobContext.getJobID()),
                    table.fileIO());
        } else {
            LOG.info("CommitJob not found table, Skipping job commit.");
        }

        LOG.info(
                "Commit took {} ms for job {}",
                System.currentTimeMillis() - startTime,
                jobContext.getJobID());
    }

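    /**
     * Aborts the job: the commit messages restored from the preCommit files are aborted and the
     * job's temporary files are deleted.
     */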
    @Override
    public void abortJob(JobContext originalContext, int status) throws IOException {
        JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
        FileStoreTable table = createFileStoreTable(jobContext.getJobConf());
        if (table != null) {
            LOG.info("AbortJob {} has started", jobContext.getJobID());
            List<CommitMessage> commitMessagesList =
                    getAllPreCommitMessage(table.location(), jobContext, table.fileIO());
            BatchWriteBuilder batchWriteBuilder = table.newBatchWriteBuilder();
            try (BatchTableCommit batchTableCommit = batchWriteBuilder.newCommit()) {
                batchTableCommit.abort(commitMessagesList);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            deleteTemporaryFile(
                    jobContext,
                    generateJobLocation(table.location(), jobContext.getJobID()),
                    table.fileIO());
            LOG.info("Job {} is aborted. preCommit file has deleted", jobContext.getJobID());
        }
    }

    /**
     * Deletes the job's temporary location.
     *
     * @param jobContext The job context
     * @param location The location to clean up
     * @param fileIO The FileIO used to delete the files
     */
    private void deleteTemporaryFile(JobContext jobContext, Path location, FileIO fileIO) {
        LOG.info("Deleting temporary file for job {} started", jobContext.getJobID());

        LOG.info("The deleted file is located in : {}", location);
        try {
            fileIO.delete(location, true);
        } catch (IOException e) {
            LOG.debug("Failed to delete directory {} ", location, e);
        }
        LOG.info("Deleting temporary file for job {} finished", jobContext.getJobID());
    }

    /**
     * Gets all CommitMessages persisted by the job's tasks.
     *
     * @param location The location of the table
     * @param jobContext The job context
     * @param io The FileIO used for reading the files generated for commit
     * @return The list of commit messages to commit
     */
    private static List<CommitMessage> getAllPreCommitMessage(
            Path location, JobContext jobContext, FileIO io) {
        JobConf conf = jobContext.getJobConf();

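        // One preCommit file is expected per final-phase task: reducers when the job has a
        // reduce phase, otherwise mappers. This mirrors the logic in needsTaskCommit().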
        int totalCommitMessagesSize =
                conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();

        List<CommitMessage> commitMessagesList = Collections.synchronizedList(new ArrayList<>());

        for (int i = 0; i < totalCommitMessagesSize; i++) {
            Path commitFileLocation =
                    generatePreCommitFileLocation(location, jobContext.getJobID(), i);
            commitMessagesList.addAll(readPreCommitFile(commitFileLocation, io));
        }

        return commitMessagesList;
    }

    /**
     * Generates the job's temporary location based on the job configuration.
     *
     * @param location The location of the table
     * @param jobId The JobID of the job
     * @return The temporary directory under which the job's preCommit files are stored
     */
    static Path generateJobLocation(Path location, JobID jobId) {
        return new Path(new Path(location, "temp"), jobId.toString());
    }

    /**
     * Generates the preCommit file location based on the configuration and a specific task id. To
     * keep the final commit atomic, each task persists its prepared commit messages to this file;
     * {@link PaimonOutputCommitter#commitJob(JobContext)} restores them to complete the final
     * commit and deletes the temporary files at the end.
     *
     * @param location The location of the table
     * @param jobId jobId
     * @param taskId taskId
     * @return The location of preCommit file path
     */
    private static Path generatePreCommitFileLocation(Path location, JobID jobId, int taskId) {
        return new Path(generateJobLocation(location, jobId), "task_" + taskId + PRE_COMMIT);
    }

    /**
     * * Create a temp preCommitFile to store {@link BatchTableWrite#prepareCommit()}'s
     * result @Param commitTables The commitMessages of the table preCommit @Param location The temp
     * file's location @Param io The FileIO of the table.
     */
    private static void createPreCommitFile(
            List<CommitMessage> commitTables, Path location, FileIO io) throws IOException {
        try (ObjectOutputStream objectOutputStream =
                new ObjectOutputStream(io.newOutputStream(location, true))) {
            objectOutputStream.writeObject(commitTables);
        }
    }

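    /**
     * Reads a preCommit file back into the commit messages it stores; the counterpart of
     * {@link #createPreCommitFile(List, Path, FileIO)}.
     */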
    @SuppressWarnings("unchecked")
    private static List<CommitMessage> readPreCommitFile(Path location, FileIO io) {
        try (ObjectInputStream objectInputStream =
                new ObjectInputStream(io.newInputStream(location))) {
            return (List<CommitMessage>) objectInputStream.readObject();
        } catch (ClassNotFoundException | IOException e) {
            throw new RuntimeException(
                    String.format("Cannot read or parse CommitMessage file: %s", location), e);
        }
    }
}