// org.apache.spark.sql.sources.v2.writer.DataWriter (artifact-listing header from Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.sources.v2.writer;
import java.io.IOException;
import org.apache.spark.annotation.InterfaceStability;
/**
 * A data writer returned by {@link DataWriterFactory#createDataWriter(int, int)} and is
 * responsible for writing data for an input RDD partition.
 *
 * One Spark task has one exclusive data writer, so there is no thread-safe concern.
 *
 * {@link #write(Object)} is called for each record in the input RDD partition. If one record fails
 * the {@link #write(Object)}, {@link #abort()} is called afterwards and the remaining records will
 * not be processed. If all records are successfully written, {@link #commit()} is called.
 *
 * If this data writer succeeds (all records are successfully written and {@link #commit()}
 * succeeds), a {@link WriterCommitMessage} will be sent to the driver side and passed to
 * {@link DataSourceWriter#commit(WriterCommitMessage[])} with commit messages from other data
 * writers. If this data writer fails (one record fails to write or {@link #commit()} fails), an
 * exception will be sent to the driver side, and Spark may retry this writing task several times.
 * Each retry, {@link DataWriterFactory#createDataWriter(int, int)} gets a different
 * `attemptNumber`, and Spark finally calls {@link DataSourceWriter#abort(WriterCommitMessage[])}
 * if all retries fail.
 *
 * Besides the retry mechanism, Spark may launch speculative tasks if the existing writing task
 * takes too long to finish. Different from retried tasks, which are launched one by one after the
 * previous one fails, speculative tasks are running simultaneously. It's possible that one input
 * RDD partition has multiple data writers with different `attemptNumber` running at the same time,
 * and data sources should guarantee that these data writers don't conflict and can work together.
 * Implementations can coordinate with driver during {@link #commit()} to make sure only one of
 * these data writers can commit successfully. Or implementations can allow all of them to commit
 * successfully, and have a way to revert committed data writers without the commit message, because
 * Spark only accepts the commit message that arrives first and ignores others.
 *
 * Note that currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data
 * source writers, or {@link org.apache.spark.sql.catalyst.InternalRow} for data source writers
 * that mix in {@link SupportsWriteInternalRow}.
 *
 * @param <T> the type of the records this writer accepts
 */
@InterfaceStability.Evolving
public interface DataWriter<T> {

  /**
   * Writes one record.
   *
   * If this method fails (by throwing an exception), {@link #abort()} will be called and this
   * data writer is considered to have been failed.
   *
   * @param record the record to write
   * @throws IOException if failure happens during disk/network IO like writing files.
   */
  void write(T record) throws IOException;

  /**
   * Commits this writer after all records are written successfully, returns a commit message which
   * will be sent back to driver side and passed to
   * {@link DataSourceWriter#commit(WriterCommitMessage[])}.
   *
   * The written data should only be visible to data source readers after
   * {@link DataSourceWriter#commit(WriterCommitMessage[])} succeeds, which means this method
   * should still "hide" the written data and ask the {@link DataSourceWriter} at driver side to
   * do the final commit via {@link WriterCommitMessage}.
   *
   * If this method fails (by throwing an exception), {@link #abort()} will be called and this
   * data writer is considered to have been failed.
   *
   * @return the commit message to pass to the driver-side {@link DataSourceWriter}
   * @throws IOException if failure happens during disk/network IO like writing files.
   */
  WriterCommitMessage commit() throws IOException;

  /**
   * Aborts this writer if it is failed. Implementations should clean up the data for already
   * written records.
   *
   * This method will only be called if there is one record failed to write, or {@link #commit()}
   * failed.
   *
   * If this method fails (by throwing an exception), the underlying data source may have garbage
   * that needs to be cleaned by {@link DataSourceWriter#abort(WriterCommitMessage[])} or manually,
   * but this garbage should not be visible to data source readers.
   *
   * @throws IOException if failure happens during disk/network IO like writing files.
   */
  void abort() throws IOException;
}