/*
* Copyright © 2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.wrangler.lineage;

import com.google.common.collect.Sets;
import io.cdap.cdap.etl.api.lineage.field.FieldOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation;
import io.cdap.wrangler.api.Directive;
import io.cdap.wrangler.api.lineage.Lineage;
import io.cdap.wrangler.api.lineage.Mutation;
import io.cdap.wrangler.api.lineage.Relation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * This class {@link LineageOperations} generates the transformation operations used to compute field lineage.
*/
public final class LineageOperations {
  private static final Logger LOG = LoggerFactory.getLogger(LineageOperations.class);
  private static final String OPERATION_NAME_PREFIX = "operation_";
  private final Set<String> input;
  private final Set<String> output;
  private final List<Directive> directives;

  /**
   * A constructor for generating the transformation operations for lineage.
   *
   * @param input An input {@link Set} of columns.
   * @param output An output {@link Set} of columns.
   * @param directives A {@link List} of directives.
   */
  public LineageOperations(Set<String> input, Set<String> output, List<Directive> directives) {
    this.input = Collections.unmodifiableSet(new HashSet<>(input));
    this.output = Collections.unmodifiableSet(new HashSet<>(output));
    this.directives = Collections.unmodifiableList(new ArrayList<>(directives));
  }
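
  // A minimal usage sketch (the column names and the way the directive list is obtained are illustrative
  // assumptions, not taken from this class):
  //
  //   Set<String> input = new HashSet<>(Arrays.asList("body"));
  //   Set<String> output = new HashSet<>(Arrays.asList("first_name", "last_name"));
  //   List<Directive> directives = ...;  // e.g. directives compiled from a recipe
  //   List<FieldOperation> operations = new LineageOperations(input, output, directives).generate();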

  /**
   * Generates the list of {@link FieldOperation} required for generating lineage.
   *
   * @return a {@link List} of {@link FieldOperation}
   */
  public List<FieldOperation> generate() {
    List<FieldOperation> operations = new ArrayList<>();
    // This set contains all the fields that are used as input fields in the transformations.
    Set<String> definedSources = new HashSet<>();
    // The following three fields are needed to track the guessed transformations.
    // Due to a current limitation of some directives, the columns actually operated on are unknown until the
    // actual data arrives. This happens mostly in parse scenarios, and it will happen for ALL-type relations.
    // Therefore, we guess these parse directives using the given input and output fields.
    // However, this has a big drawback: the directives that follow can use columns the parse directive actually
    // generates, and they will generate something like [actual columns] -> [...]. In this case, the traceback
    // logic will not be able to read the operations, since the actual columns never appear in the output fields
    // of any transformation. To solve this issue, each time we see an input field that has not appeared before,
    // we add it to the last ALL operation, since we can be sure that it is an actual field and it should have
    // been generated by one of the ALL operations.
    // This set contains all the available fields that can be used as inputs in a field transformation. If one of
    // the fields of a relation is not in this set, it was generated by an ALL operation, and we will, by guess,
    // add it to the output fields of the last ALL operation.
    Set<String> availableSources = new HashSet<>(input);
    // This int is the index, within operations, of the field transformation generated by the last ALL operation;
    // it is needed to update that transformation operation to contain the actual columns.
    int lastAllOperationIndex = -1;
    // These are the output fields that should be in the last ALL operation; basically, the output fields that are
    // not present in the input schema plus all actual fields used by the following directives.
    Set<String> lastAllOutput = new HashSet<>();
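
    // For example, assume a recipe whose first step is a parse-style directive emitting an ALL relation (the
    // column names here are invented for illustration): the parse step may really produce columns 'a' and 'b',
    // but at lineage-generation time only the schema-level input and output columns are known. If a later
    // directive then reads 'a', checkAndSetActualFields notices that 'a' was never declared as an output and
    // retroactively adds it to the outputs of that last ALL operation.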
    for (Directive directive : directives) {
      if (directive instanceof Lineage) {
        Mutation mutation = ((Lineage) directive).lineage();
        String readable = mutation.readable();
        for (Relation relation : mutation.relations()) {
          // We won't use a uuid in the relation name since, in CDAP, we compute the checksum of the operations
          // and only write to the table if the operations are different.
          String name = OPERATION_NAME_PREFIX + operations.size();
          List<String> sources = relation.getSources();
          List<String> targets = relation.getTargets();
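          // The handling below can be summarized as follows (derived from the code itself, not from external
          // documentation):
          //   ALL      - the directive may touch every column; sources default to the full input schema
          //   GENERATE - targets are produced from the full input schema
          //   DROP     - sources are removed and nothing is produced
          //   CREATE   - targets are produced without any particular source column
          //   STANDARD - an ordinary sources -> targets mapping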
          switch (relation.getType()) {
            case ALL:
              sources = sources.isEmpty() ? new ArrayList<>(input) : sources;
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              // add the output fields that are not yet in the available sources as the output targets
              Set<String> outputs = new HashSet<>(Sets.difference(output, availableSources));
              // if any of the sources are themselves output fields, add them to the targets
              outputs.addAll(Sets.intersection(new HashSet<>(sources), output));
              // add all the targets that this relation provides to the outputs
              outputs.addAll(targets);
              targets = new ArrayList<>(outputs);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources,
                  targets
                )
              );
              lastAllOperationIndex = operations.size() - 1;
              lastAllOutput = new HashSet<>(targets);
              break;
            case GENERATE:
              sources = new ArrayList<>(input);
              operations.add(new FieldTransformOperation(
                name,
                readable,
                sources,
                targets
              ));
              break;
            case DROP:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources
                )
              );
              break;
            case CREATE:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  Collections.emptyList(),
                  targets
                )
              );
              break;
            case STANDARD:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources,
                  targets
                )
              );
          }
          availableSources.addAll(targets);
          definedSources.addAll(relation.getSources());
        }
      }
    }
    // We iterate through all the input fields in the schema and check whether there is a corresponding field in
    // the output schema. If both exist, an identity-mapping FieldTransformOperation is added.
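    // For example (illustrative name): if a column 'id' appears in both the input and output schemas and no
    // directive ever listed it as a source, the loop below records an identity mapping 'id' -> 'id' so the
    // column still shows up in the field lineage.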
    Set<String> difference = Sets.difference(input, definedSources);
    for (String next : difference) {
      if (output.contains(next)) {
        FieldTransformOperation transformation =
          new FieldTransformOperation(
            OPERATION_NAME_PREFIX + operations.size(),
            String.format("Mapping column '%s' to column '%s'", next, next),
            Collections.singletonList(next),
            next
          );
        operations.add(transformation);
      }
    }
    return operations;
  }

  // Checks whether the given input fields contain any actual fields (fields not seen before); if they do,
  // modifies the previous ALL operation to include them as output fields.
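  // Illustrative walk-through (names invented): if the last ALL operation currently reports outputs {x, y} and a
  // later directive consumes {y, z}, then z is an "actual" field; it is appended to the ALL operation's outputs
  // (now {x, y, z}) and to availableSources, so later directives can resolve it normally.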
  private void checkAndSetActualFields(List<FieldOperation> operations, Set<String> availableSources,
                                       int lastAllIndex, Set<String> lastAllOutputFields,
                                       List<String> inputFields) {
    // if the available sources contain all the input fields, we are fine since the field lineage can be computed
    if (availableSources.containsAll(inputFields)) {
      return;
    }
    // This condition should never happen, since actual fields can only be generated by an ALL operation.
    if (lastAllIndex == -1) {
      LOG.warn("The input fields {} contain fields that are neither in the input schema nor generated by " +
                 "other field operations; field operations might not be recorded.", inputFields);
      return;
    }
    // the output fields of the last ALL operation should contain all these actual fields
    Sets.SetView<String> actualFields = Sets.difference(new HashSet<>(inputFields), availableSources);
    lastAllOutputFields.addAll(actualFields);
    availableSources.addAll(actualFields);
    FieldTransformOperation operation = (FieldTransformOperation) operations.get(lastAllIndex);
    operations.set(lastAllIndex, new FieldTransformOperation(operation.getName(), operation.getDescription(),
                                                             operation.getInputFields(),
                                                             new ArrayList<>(lastAllOutputFields)));
  }
}