
/*
/* Copyright 2018-2024 contributors to the OpenLineage project
/* SPDX-License-Identifier: Apache-2.0
*/
package io.openlineage.spark.agent.util;
import static io.openlineage.spark.agent.lifecycle.ExecutionContext.CAMEL_TO_SNAKE_CASE;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import datahub.spark.conf.SparkLineageConf;
import io.datahubproject.openlineage.dataset.HdfsPathDataset;
import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.Versions;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkEnv;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.PartialFunction;
import scala.PartialFunction$;
/**
* Utility functions for traversing a {@link
* org.apache.spark.sql.catalyst.plans.logical.LogicalPlan}.
*/
@Slf4j
public class PlanUtils {
/**
* Given a list of {@link PartialFunction}s merge to produce a single function that will test the
* input against each function one by one until a match is found or {@link
* PartialFunction$#empty()} is returned.
*
* @param fns the collection of partial functions to merge
* @param <T> the input type of the partial functions
* @param <D> the element type of the collections they return
* @return a single partial function that applies the first matching function in the collection
*/
public static <T, D> OpenLineageAbstractPartialFunction<T, Collection<D>> merge(
Collection<? extends PartialFunction<T, ? extends Collection<D>>> fns) {
return new OpenLineageAbstractPartialFunction<T, Collection<D>>() {
String appliedClassName;
@Override
public boolean isDefinedAt(T x) {
return fns.stream()
.filter(pfn -> PlanUtils.safeIsDefinedAt(pfn, x))
.findFirst()
.isPresent();
}
private boolean isDefinedAt(T x, PartialFunction<T, ? extends Collection<D>> pfn) {
return PlanUtils.safeIsDefinedAt(pfn, x);
}
@Override
public Collection<D> apply(T x) {
return fns.stream()
.filter(pfn -> PlanUtils.safeIsDefinedAt(pfn, x))
.map(
pfn -> {
try {
Collection<D> collection = pfn.apply(x);
if (log.isDebugEnabled()) {
log.debug(
"Visitor {} visited {}, returned {}",
pfn.getClass().getCanonicalName(),
x.getClass().getCanonicalName(),
collection);
}
appliedClassName = x.getClass().getName();
return collection;
} catch (RuntimeException | NoClassDefFoundError | NoSuchMethodError e) {
log.error("Apply failed:", e);
return null;
}
})
.filter(Objects::nonNull)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
@Override
String appliedName() {
return appliedClassName;
}
};
}
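// Illustrative usage sketch (added for documentation, not part of the original source):
// merging dataset visitors and applying the result to a logical plan. The "visitors"
// collection and the "plan" variable below are hypothetical placeholders.
//
//   Collection<PartialFunction<LogicalPlan, List<OpenLineage.InputDataset>>> visitors = ...;
//   PartialFunction<LogicalPlan, Collection<OpenLineage.InputDataset>> merged = PlanUtils.merge(visitors);
//   Collection<OpenLineage.InputDataset> inputs =
//       merged.isDefinedAt(plan) ? merged.apply(plan) : Collections.emptyList();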
/**
* Given a schema, construct a valid {@link OpenLineage.SchemaDatasetFacet}.
*
* @param openLineage the OpenLineage client used to build the facet
* @param structType the Spark schema to convert
* @return the schema dataset facet describing the schema's fields
*/
public static OpenLineage.SchemaDatasetFacet schemaFacet(
OpenLineage openLineage, StructType structType) {
return openLineage
.newSchemaDatasetFacetBuilder()
.fields(transformFields(openLineage, structType.fields()))
.build();
}
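// Illustrative usage sketch (added for documentation, not part of the original source);
// the schema below is a hypothetical example:
//
//   StructType schema = new StructType()
//       .add("id", DataTypes.LongType)
//       .add("name", DataTypes.StringType);
//   OpenLineage ol = new OpenLineage(Versions.OPEN_LINEAGE_PRODUCER_URI);
//   OpenLineage.SchemaDatasetFacet facet = PlanUtils.schemaFacet(ol, schema);
//   // facet fields: name="id" type="long", name="name" type="string"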
private static List<OpenLineage.SchemaDatasetFacetFields> transformFields(
OpenLineage openLineage, StructField... fields) {
List<OpenLineage.SchemaDatasetFacetFields> list = new ArrayList<>();
for (StructField field : fields) {
list.add(
openLineage
.newSchemaDatasetFacetFieldsBuilder()
.name(field.name())
.type(field.dataType().typeName())
.build());
}
return list;
}
/**
* Given a list of Spark catalyst attributes, constructs the corresponding {@link StructType}.
*
* @param attributes the attributes to convert
* @return a StructType with one field per attribute
*/
public static StructType toStructType(List<Attribute> attributes) {
return new StructType(
attributes.stream()
.map(
attr ->
new StructField(attr.name(), attr.dataType(), attr.nullable(), attr.metadata()))
.collect(Collectors.toList())
.toArray(new StructField[0]));
}
public static String namespaceUri(URI outputPath) {
return Optional.ofNullable(outputPath.getAuthority())
.map(a -> String.format("%s://%s", outputPath.getScheme(), a))
.orElse(outputPath.getScheme());
}
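// Illustrative examples (added for documentation, not part of the original source) of the
// namespaces produced; the URIs below are hypothetical:
//
//   namespaceUri(URI.create("s3://my-bucket/path/to/data"))    -> "s3://my-bucket"
//   namespaceUri(URI.create("hdfs://namenode:8020/warehouse")) -> "hdfs://namenode:8020"
//   namespaceUri(URI.create("file:/tmp/output"))               -> "file"   (no authority)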
/**
* Construct a {@link OpenLineage.DatasourceDatasetFacet} given a namespace for the datasource.
*
* @param openLineage the OpenLineage client used to build the facet
* @param namespaceUri the namespace URI of the datasource
* @return the datasource dataset facet
*/
public static OpenLineage.DatasourceDatasetFacet datasourceFacet(
OpenLineage openLineage, String namespaceUri) {
return openLineage
.newDatasourceDatasetFacetBuilder()
.uri(URI.create(namespaceUri))
.name(namespaceUri)
.build();
}
/**
* Construct a {@link OpenLineage.ParentRunFacet} given the parent job's parentRunId, job name,
* and namespace.
*
* @param parentRunId the run id of the parent run
* @param parentJob the parent job name
* @param parentJobNamespace the namespace of the parent job
* @return the parent run facet
*/
public static OpenLineage.ParentRunFacet parentRunFacet(
UUID parentRunId, String parentJob, String parentJobNamespace) {
return new OpenLineage(Versions.OPEN_LINEAGE_PRODUCER_URI)
.newParentRunFacetBuilder()
.run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build())
.job(
new OpenLineage.ParentRunFacetJobBuilder()
.name(parentJob.replaceAll(CAMEL_TO_SNAKE_CASE, "_$1").toLowerCase(Locale.ROOT))
.namespace(parentJobNamespace)
.build())
.build();
}
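// Illustrative usage sketch (added for documentation, not part of the original source);
// the run id, job name and namespace below are hypothetical. Note that the parent job name
// is normalized from camelCase to snake_case, e.g. "myParentJob" becomes "my_parent_job":
//
//   OpenLineage.ParentRunFacet parent =
//       PlanUtils.parentRunFacet(UUID.randomUUID(), "myParentJob", "my_namespace");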
public static Path getDirectoryPathOl(Path p, Configuration hadoopConf) {
try {
if (p.getFileSystem(hadoopConf).getFileStatus(p).isFile()) {
return p.getParent();
} else {
return p;
}
} catch (IOException e) {
log.warn("Unable to get file system for path ", e);
return p;
}
}
// This method was replaced to support Datahub PathSpecs
public static Path getDirectoryPath(Path p, Configuration hadoopConf) {
SparkConf conf = SparkEnv.get().conf();
String propertiesString =
Arrays.stream(conf.getAllWithPrefix("spark.datahub."))
.map(tup -> tup._1 + "= \"" + tup._2 + "\"")
.collect(Collectors.joining("\n"));
Config datahubConfig = ConfigFactory.parseString(propertiesString);
SparkLineageConf sparkLineageConf =
SparkLineageConf.toSparkLineageConf(datahubConfig, null, null);
HdfsPathDataset hdfsPath = null;
try {
URI uri = new URI(p.toString());
hdfsPath = HdfsPathDataset.create(uri, sparkLineageConf.getOpenLineageConf());
log.debug("Path {} transformed to {}", p, hdfsPath.getDatasetPath());
return new Path(hdfsPath.getDatasetPath());
} catch (InstantiationException | URISyntaxException e) {
log.warn("Unable to convert path to hdfs path {} the exception was {}", p, e.getMessage());
return p;
}
}
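// Illustrative sketch (added for documentation, not part of the original source): every
// SparkConf property prefixed with "spark.datahub." is collected, rendered as HOCON
// (one key = "value" pair per line) and used to build the DataHub SparkLineageConf. With a
// matching path spec configured there, a deep file path such as
// s3://my-bucket/foo/orders/year=2024/part-0000.parquet (hypothetical) can be resolved to
// its dataset path (e.g. s3://my-bucket/foo/orders); if the conversion fails, the original
// path is returned unchanged.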
/**
* Given a list of RDDs, collects their data location directories. For each RDD the parent
* directory is taken, and the distinct locations are returned.
*
* @param fileRdds the RDDs to inspect
* @return the distinct list of data location paths
*/
public static List<Path> findRDDPaths(List<RDD<?>> fileRdds) {
return fileRdds.stream()
.flatMap(RddPathUtils::findRDDPaths)
.distinct()
.collect(Collectors.toList());
}
/**
* An instanceof-like check that does not fail when the target class is missing from the
* classpath.
*
* @param instance the object to test
* @param classCanonicalName the canonical name of the class to test against
* @return true if the class can be loaded and the instance's runtime class is assignable from it
*/
public static boolean safeIsInstanceOf(Object instance, String classCanonicalName) {
try {
Class c = Class.forName(classCanonicalName);
return instance.getClass().isAssignableFrom(c);
} catch (ClassNotFoundException e) {
return false;
}
}
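// Illustrative usage sketch (added for documentation, not part of the original source);
// the class name below is a hypothetical optional dependency:
//
//   if (PlanUtils.safeIsInstanceOf(relation, "com.example.optional.SpecialRelation")) {
//     // handle the optional relation type without a hard dependency on it being present
//   }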
/**
* An isDefinedAt wrapper that should never throw an error or exception.
*
* @param pfn the partial function to test
* @param x the input to test against
* @return true if the partial function is defined at the input; false otherwise or on failure
*/
public static boolean safeIsDefinedAt(PartialFunction pfn, Object x) {
try {
return pfn.isDefinedAt(x);
} catch (ClassCastException e) {
// do nothing
return false;
} catch (Exception e) {
if (e != null) {
log.info("isDefinedAt method failed on {}", e);
}
return false;
} catch (NoClassDefFoundError e) {
log.info("isDefinedAt method failed on {}", e.getMessage());
return false;
}
}
/**
* An apply wrapper that should never throw an error or exception.
*
* @param pfn the partial function to apply
* @param x the input to apply the function to
* @param <T> the element type of the returned list
* @param <D> the input type of the partial function
* @return the result of applying the function, or an empty list on failure
*/
public static <T, D> List<T> safeApply(PartialFunction<D, ? extends List<T>> pfn, D x) {
try {
return pfn.apply(x);
} catch (Exception | NoClassDefFoundError | NoSuchMethodError e) {
log.info("apply method failed with", e);
return Collections.emptyList();
}
}
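// Illustrative usage sketch (added for documentation, not part of the original source);
// "datasetBuilder" and "plan" are hypothetical placeholders:
//
//   List<OpenLineage.OutputDataset> outputs = PlanUtils.safeApply(datasetBuilder, plan);
//   // a failing visitor yields an empty list instead of breaking event emission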
}