All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.cdap.plugin.common.LineageRecorder Maven / Gradle / Ivy

There is a newer version: 2.12.3
Show newest version
/*
 * Copyright © 2018-2019 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package io.cdap.plugin.common;

import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.DatasetManagementException;
import io.cdap.cdap.api.dataset.DatasetProperties;
import io.cdap.cdap.api.dataset.InstanceConflictException;
import io.cdap.cdap.api.lineage.field.EndPoint;
import io.cdap.cdap.api.lineage.field.ReadOperation;
import io.cdap.cdap.api.lineage.field.WriteOperation;
import io.cdap.cdap.etl.api.batch.BatchContext;
import io.cdap.cdap.etl.api.lineage.field.FieldReadOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation;

import java.util.Collections;
import java.util.List;
import javax.annotation.Nullable;

/**
 * A helper class for creating an external dataset and recording its lineage.
 *
 * <p>Each instance is bound to a single {@link BatchContext} and a single dataset name; every operation
 * recorded through it uses an {@link EndPoint} built from the context's namespace and that dataset name.
 */
public class LineageRecorder {

  private final BatchContext context;
  private final String dataset;

  /**
   * Creates a recorder bound to the given context and dataset name.
   *
   * @param context the batch context used to check for and create the dataset and to record operations
   * @param dataset the name of the external dataset whose lineage is recorded
   */
  public LineageRecorder(BatchContext context, String dataset) {
    this.context = context;
    this.dataset = dataset;
  }

  /**
   * Creates an external dataset if a dataset with the given {@link LineageRecorder#dataset} name does not already
   * exist. If a non-null schema is provided then the external dataset will have the given schema upon creation;
   * otherwise it is created with {@link DatasetProperties#EMPTY}.
   *
   * @param schema the schema of the external dataset, or {@code null} to create it without a schema property
   * @throws RuntimeException if the dataset creation fails for a reason other than the dataset already existing
   */
  public void createExternalDataset(@Nullable Schema schema) {
    DatasetProperties datasetProperties;
    if (schema == null) {
      datasetProperties = DatasetProperties.EMPTY;
    } else {
      datasetProperties = DatasetProperties.of(Collections.singletonMap(DatasetProperties.SCHEMA, schema.toString()));
    }
    try {
      if (!context.datasetExists(dataset)) {
        // Only create the dataset when it does not exist yet; an existing dataset is left untouched.
        context.createDataset(dataset, Constants.EXTERNAL_DATASET_TYPE, datasetProperties);
      }
    } catch (InstanceConflictException ignored) {
      // Deliberately ignored. This happens when multiple pipelines run simultaneously and try to create the same
      // external dataset: both may pass the existence check, one creation succeeds and the other receives an
      // InstanceConflictException. The dataset exists either way, so there is nothing left to do.
      // This handler must stay ahead of the DatasetManagementException one so the conflict is not rethrown.
    } catch (DatasetManagementException e) {
      throw new RuntimeException(String.format("Failed to create dataset %s with schema %s.", dataset, schema), e);
    }
  }

  /**
   * Records a {@link ReadOperation} against the dataset this recorder is bound to.
   *
   * @param operationName the name of the operation
   * @param operationDescription description for the operation
   * @param fields output fields of this read operation
   */
  public void recordRead(String operationName, String operationDescription, List<String> fields) {
    EndPoint source = EndPoint.of(context.getNamespace(), dataset);
    context.record(Collections.singletonList(
      new FieldReadOperation(operationName, operationDescription, source, fields)));
  }

  /**
   * Records a {@link WriteOperation} against the dataset this recorder is bound to.
   *
   * @param operationName the name of the operation
   * @param operationDescription description for the operation
   * @param fields input fields of this write operation
   */
  public void recordWrite(String operationName, String operationDescription, List<String> fields) {
    EndPoint sink = EndPoint.of(context.getNamespace(), dataset);
    context.record(Collections.singletonList(
      new FieldWriteOperation(operationName, operationDescription, sink, fields)));
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy