com.clickzetta.platform.bulkload.BulkLoadWriterImpl

package com.clickzetta.platform.bulkload;

import com.clickzetta.platform.client.CZClient;
import com.clickzetta.platform.client.Table;
import com.clickzetta.platform.client.api.ArrowRow;
import com.clickzetta.platform.client.api.BulkLoadOperation;
import com.clickzetta.platform.client.api.BulkLoadState;
import com.clickzetta.platform.client.api.BulkLoadWriter;
import com.clickzetta.platform.client.api.Row;
import com.clickzetta.platform.common.CZException;
import com.clickzetta.platform.common.Constant;
import com.clickzetta.platform.common.Schema;
import com.google.common.base.Preconditions;
import cz.proto.ingestion.v2.IngestionV2;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.OutputFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;

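/**
 * Default {@link BulkLoadWriter} implementation for a single partition of a bulk load stream.
 *
 * <p>Rows are appended to staged files through a {@link RowAppender}. A file is rolled over once
 * it exceeds the configured maximum size or record count, and a fresh staging token is fetched
 * before the next file is opened. On {@link #close()} the accumulated file names and sizes are
 * reported back to the server via {@code finishBulkLoadStreamWriterV2}.
 *
 * <p>Rough usage sketch (how the writer instance is obtained from the client is outside this
 * class and only assumed here):
 * <pre>{@code
 *   Row row = writer.createRow();
 *   // ... set column values on the row ...
 *   writer.write(row);
 *   writer.close();
 * }</pre>
 */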
class BulkLoadWriterImpl implements BulkLoadWriter {

  private static final Logger LOG = LoggerFactory.getLogger(BulkLoadWriterImpl.class);
  private final CZClient client;
  private final BulkLoadMetaData metaData;
  private final boolean useHttp;
  private final int partitionId;
  // File format to write.
  private final FileFormatType fileFormat;
  // Max file size to write.
  private final long maxFileSize;
  // Max number of records to write in a single file.
  private final long maxFileRecordCount;
  // Static partition values to write.
  private final Map partitionValues;
  private StagingConfig stagingConfig;
  private FileIO fileIO;
  // Root dir to write files.
  private String location;
  // File writer responsible for a single file.
  private RowAppender applier;
  // Total number of rows written.
  private long totalNumRows = 0;
  // List of finished files.
  private List<String> finishedFiles = new ArrayList<>();
  // List of finished file sizes.
  private List<Long> finishedFileSizes = new ArrayList<>();
  // File name uuid to differentiate files from parallel streams.
  private String fileNameUuid = UUID.randomUUID().toString();
  // Sequential file id used to build file names.
  private int fileId = 0;
  private boolean closed = false;

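  // Whether arrow rows should pre-check complex type values; driven by the
  // Constant.ARROW_ROW_FORMAT_CHECK client property (defaults to true).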
  private boolean complexTypeRreCheck;

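  /**
   * Creates a writer for one partition of a bulk load stream. The stream must still be in the
   * {@link BulkLoadState#CREATED} state. The constructor builds a {@link FileIO} from the staging
   * config, resolves the static partition values from the partition spec, and reads the
   * {@code ARROW_ROW_FORMAT_CHECK} client property (defaulting to true).
   */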
  public BulkLoadWriterImpl(CZClient client,
                            BulkLoadMetaData metaData,
                            BulkLoadConfig config,
                            int partitionId) throws IOException {
    Preconditions.checkArgument(metaData.getState() == BulkLoadState.CREATED,
        "Failed to create BulkLoadWriter due to invalid state: " + metaData.getState());
    this.client = client;
    this.metaData = metaData;
    this.useHttp = client.getClientContext().url().startsWith("http://");
    this.partitionId = partitionId;
    this.stagingConfig = config.getStagingConfig();
    this.fileIO = stagingConfig.createFileIO(metaData.preferInternalEndpoint(), useHttp, metaData.getEncryptionOptions());
    this.location = stagingConfig.getPath();
    this.fileFormat = config.getFileFormat();
    this.maxFileRecordCount = config.getMaxNumRowsPerFile();
    this.maxFileSize = config.getMaxFileSizeInBytesPerFile();
    this.partitionValues = PartitionSpecUtils.parsePartitionSpec(
        metaData.getArrowTable().getStreamSchema(), metaData.getPartitionSpecs());
    Object obj = client.getClientContext().getProperties().getOrDefault(Constant.ARROW_ROW_FORMAT_CHECK, true);
    this.complexTypeRreCheck = obj instanceof String ? Boolean.parseBoolean((String) obj) : (boolean) obj;
  }

  @Override
  public String getStreamId() {
    return metaData.getStreamId();
  }

  @Override
  public BulkLoadOperation getOperation() {
    return metaData.getOperation();
  }

  @Override
  public Schema getSchema() {
    return getTable().getSchema();
  }

  @Override
  public Table getTable() {
    return metaData.getArrowTable();
  }

  @Override
  public Optional getPartitionSpecs() {
    return metaData.getPartitionSpecs();
  }

  @Override
  public long getPartitionId() {
    return partitionId;
  }

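  /**
   * Creates an empty {@link ArrowRow} with the INSERT operation type. Only APPEND, OVERWRITE and
   * UPSERT bulk load operations are supported; any other operation raises a {@link CZException}.
   */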
  @Override
  public Row createRow() {
    switch (getOperation()) {
      case APPEND:
      case OVERWRITE:
      case UPSERT:
        return new ArrowRow(metaData.getArrowTable(), IngestionV2.OperationType.INSERT, complexTypeRreCheck);
      default:
        throw new CZException("Unsupported bulk load operation " + getOperation());
    }
  }

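  /**
   * Writes one or more rows. The current file is rolled over first if it has reached its size or
   * record limit, then the static partition values are filled into each row before appending.
   */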
  @Override
  public void write(Row... rows) throws IOException {
    checkFileStatus();
    PartitionSpecUtils.fillPartitionValues(partitionValues, rows);
    applier.append(rows);
    totalNumRows += rows.length;
  }

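  /**
   * Finalizes the current file and reports the finished file names and sizes to the server for
   * this partition. Calling close more than once is a no-op.
   */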
  @Override
  public void close() throws IOException {
    if (closed) {
      return;
    }

    closeCurrentFile();

    client.finishBulkLoadStreamWriterV2(
        metaData.getInstanceId(),
        metaData.getWorkspace(),
        metaData.getSchemaName(),
        metaData.getTableName(),
        metaData.getStreamId(),
        partitionId,
        finishedFiles,
        finishedFileSizes);

    LOG.info("Flush bulk load stream {} partitionId {} with {} files",
        metaData.getStreamId(), partitionId, finishedFiles.size());

    finishedFiles.clear();
    finishedFileSizes.clear();
    closed = true;
  }

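  // Builds the current file path as <location>/<uuid>-<fileId padded to 6 digits>.<format>.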
  private String getCurrentFileName() {
    // Remove the trailing slash, if present.
    String path = location.endsWith("/") ? location.substring(0, location.length() - 1) : location;
    return String.format("%s/%s-%06d.%s", path, fileNameUuid, fileId, fileFormat.name().toLowerCase(Locale.ENGLISH));
  }

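  // Finalizes the active appender, records its file name and size, advances the file id, and
  // drops the FileIO and staging config so the next file is opened with a freshly issued token.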
  private void closeCurrentFile() throws IOException {
    if (applier != null) {
      applier.close();

      String fileName = getCurrentFileName();
      finishedFiles.add(fileName);
      finishedFileSizes.add(applier.length());

      fileId++;
      applier = null;

      // Reset staging config to force refreshing token.
      fileIO.close();
      fileIO = null;
      stagingConfig = null;
    }
  }

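  // Re-fetches the staging STS token (rebuilding the FileIO and location) if the previous file
  // invalidated it, then creates the output file for the next file name.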
  private OutputFile createNextFile() throws IOException {
    if (stagingConfig == null) {
      stagingConfig = client.getBulkLoadStreamStsTokenV2(
          metaData.getInstanceId(),
          metaData.getWorkspace(),
          metaData.getSchemaName(),
          metaData.getTableName(),
          metaData.getStreamId());
      fileIO = stagingConfig.createFileIO(metaData.preferInternalEndpoint(), useHttp, metaData.getEncryptionOptions());
      location = stagingConfig.getPath();
    }
    String fileName = getCurrentFileName();
    return fileIO.newOutputFile(fileName);
  }

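  // Rolls over the current file when it exceeds the size or record limits and makes sure an
  // appender is open before rows are written.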
  private void checkFileStatus() throws IOException {
    if (applier != null) {
      // Roll over the current file if it has reached its size or record limit.
      if (applier.length() > maxFileSize || applier.recordCount() >= maxFileRecordCount) {
        closeCurrentFile();
      }
    }

    if (applier == null) {
      OutputFile outputFile = createNextFile();
      applier = RowAppenderFactory.create(outputFile, metaData.getArrowTable(), fileFormat);
    }
  }

}