/*
* Copyright © 2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.wrangler.lineage;

import com.google.common.collect.Sets;
import io.cdap.cdap.etl.api.lineage.field.FieldOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation;
import io.cdap.wrangler.api.Directive;
import io.cdap.wrangler.api.lineage.Lineage;
import io.cdap.wrangler.api.lineage.Mutation;
import io.cdap.wrangler.api.lineage.Relation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * This class {@link LineageOperations} generates the transformation operations used to compute field lineage.
*/
public final class LineageOperations {
  private static final Logger LOG = LoggerFactory.getLogger(LineageOperations.class);
  private static final String OPERATION_NAME_PREFIX = "operation_";
  private final Set<String> input;
  private final Set<String> output;
  private final List<Directive> directives;

  /**
   * A constructor for generating the transformation operations for lineage.
   *
   * @param input An input {@link Set} of columns.
   * @param output An output {@link Set} of columns.
   * @param directives A {@link List} of directives.
   */
  public LineageOperations(Set<String> input, Set<String> output, List<Directive> directives) {
    this.input = Collections.unmodifiableSet(new HashSet<>(input));
    this.output = Collections.unmodifiableSet(new HashSet<>(output));
    this.directives = Collections.unmodifiableList(new ArrayList<>(directives));
  }
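
  // A minimal usage sketch (the column names and the way the directive list is obtained are illustrative
  // assumptions, not taken from this class):
  //
  //   Set<String> input = new HashSet<>(Arrays.asList("body"));
  //   Set<String> output = new HashSet<>(Arrays.asList("first_name", "last_name"));
  //   List<Directive> directives = ...;  // e.g. directives compiled from a recipe
  //   List<FieldOperation> operations = new LineageOperations(input, output, directives).generate();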

  /**
   * Generates the list of {@link FieldOperation} required for generating lineage.
   *
   * @return a {@link List} of {@link FieldOperation}
   */
  public List<FieldOperation> generate() {
    List<FieldOperation> operations = new ArrayList<>();
    // This set contains all the fields that are used as input fields in the transformations.
    Set<String> definedSources = new HashSet<>();
    // The following three fields are needed to track the guessed transformations.
    // Due to a current limitation of some directives, the columns actually operated on are unknown until the
    // actual data arrives. This happens mostly in parse scenarios, and it will happen for ALL-type relations.
    // Therefore, we guess these parse directives using the given input and output fields.
    // However, this has a big drawback: the directives that follow can use columns the parse directive actually
    // generates, and they will generate something like [actual columns] -> [...]. In this case, the traceback
    // logic will not be able to read the operations, since the actual columns never appear in the output fields
    // of any transformation. To solve this issue, each time we see an input field that has not appeared before,
    // we add it to the last ALL operation, since we can be sure that it is an actual field and it should have
    // been generated by one of the ALL operations.
    // This set contains all the available fields that can be used as inputs in a field transformation. If one of
    // the fields of a relation is not in this set, it was generated by an ALL operation, and we will, by guess,
    // add it to the output fields of the last ALL operation.
    Set<String> availableSources = new HashSet<>(input);
    // This int is the index, within operations, of the field transformation generated by the last ALL operation;
    // it is needed to update that transformation operation to contain the actual columns.
    int lastAllOperationIndex = -1;
    // These are the output fields that should be in the last ALL operation; basically, the output fields that are
    // not present in the input schema plus all actual fields used by the following directives.
    Set<String> lastAllOutput = new HashSet<>();
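
    // For example, assume a recipe whose first step is a parse-style directive emitting an ALL relation (the
    // column names here are invented for illustration): the parse step may really produce columns 'a' and 'b',
    // but at lineage-generation time only the schema-level input and output columns are known. If a later
    // directive then reads 'a', checkAndSetActualFields notices that 'a' was never declared as an output and
    // retroactively adds it to the outputs of that last ALL operation.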
    for (Directive directive : directives) {
      if (directive instanceof Lineage) {
        Mutation mutation = ((Lineage) directive).lineage();
        String readable = mutation.readable();
        for (Relation relation : mutation.relations()) {
          // We won't use a uuid in the relation name since, in CDAP, we compute the checksum of the operations
          // and only write to the table if the operations are different.
          String name = OPERATION_NAME_PREFIX + operations.size();
          List<String> sources = relation.getSources();
          List<String> targets = relation.getTargets();
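          // The handling below can be summarized as follows (derived from the code itself, not from external
          // documentation):
          //   ALL      - the directive may touch every column; sources default to the full input schema
          //   GENERATE - targets are produced from the full input schema
          //   DROP     - sources are removed and nothing is produced
          //   CREATE   - targets are produced without any particular source column
          //   STANDARD - an ordinary sources -> targets mapping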
          switch (relation.getType()) {
            case ALL:
              sources = sources.isEmpty() ? new ArrayList<>(input) : sources;
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              // add the output fields that are not yet in the available sources as the output targets
              Set<String> outputs = new HashSet<>(Sets.difference(output, availableSources));
              // if any of the sources are themselves output fields, add them to the targets
              outputs.addAll(Sets.intersection(new HashSet<>(sources), output));
              // add all the targets that this relation provides to the outputs
              outputs.addAll(targets);
              targets = new ArrayList<>(outputs);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources,
                  targets
                )
              );
              lastAllOperationIndex = operations.size() - 1;
              lastAllOutput = new HashSet<>(targets);
              break;
            case GENERATE:
              sources = new ArrayList<>(input);
              operations.add(new FieldTransformOperation(
                name,
                readable,
                sources,
                targets
              ));
              break;
            case DROP:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources
                )
              );
              break;
            case CREATE:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  Collections.emptyList(),
                  targets
                )
              );
              break;
            case STANDARD:
              checkAndSetActualFields(operations, availableSources, lastAllOperationIndex, lastAllOutput, sources);
              operations.add(
                new FieldTransformOperation(
                  name,
                  readable,
                  sources,
                  targets
                )
              );
          }
          availableSources.addAll(targets);
          definedSources.addAll(relation.getSources());
        }
      }
    }
    // We iterate through all the input fields in the schema and check whether there is a corresponding field in
    // the output schema. If both exist, an identity-mapping FieldTransformOperation is added.
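    // For example (illustrative name): if a column 'id' appears in both the input and output schemas and no
    // directive ever listed it as a source, the loop below records an identity mapping 'id' -> 'id' so the
    // column still shows up in the field lineage.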
    Set<String> difference = Sets.difference(input, definedSources);
    for (String next : difference) {
      if (output.contains(next)) {
        FieldTransformOperation transformation =
          new FieldTransformOperation(
            OPERATION_NAME_PREFIX + operations.size(),
            String.format("Mapping column '%s' to column '%s'", next, next),
            Collections.singletonList(next),
            next
          );
        operations.add(transformation);
      }
    }
    return operations;
  }

  // Checks whether the given input fields contain any actual fields (fields not seen before); if they do,
  // modifies the previous ALL operation to include them as output fields.
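  // Illustrative walk-through (names invented): if the last ALL operation currently reports outputs {x, y} and a
  // later directive consumes {y, z}, then z is an "actual" field; it is appended to the ALL operation's outputs
  // (now {x, y, z}) and to availableSources, so later directives can resolve it normally.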
  private void checkAndSetActualFields(List<FieldOperation> operations, Set<String> availableSources,
                                       int lastAllIndex, Set<String> lastAllOutputFields,
                                       List<String> inputFields) {
    // if the available sources contain all the input fields, we are fine since the field lineage can be computed
    if (availableSources.containsAll(inputFields)) {
      return;
    }
    // This condition should never happen, since actual fields can only be generated by an ALL operation.
    if (lastAllIndex == -1) {
      LOG.warn("The input fields {} contain fields that are neither in the input schema nor generated by " +
                 "other field operations; field operations might not be recorded.", inputFields);
      return;
    }
    // the output fields of the last ALL operation should contain all these actual fields
    Sets.SetView<String> actualFields = Sets.difference(new HashSet<>(inputFields), availableSources);
    lastAllOutputFields.addAll(actualFields);
    availableSources.addAll(actualFields);
    FieldTransformOperation operation = (FieldTransformOperation) operations.get(lastAllIndex);
    operations.set(lastAllIndex, new FieldTransformOperation(operation.getName(), operation.getDescription(),
                                                             operation.getInputFields(),
                                                             new ArrayList<>(lastAllOutputFields)));
  }
}