/*
/* Copyright 2018-2024 contributors to the OpenLineage project
/* SPDX-License-Identifier: Apache-2.0
*/

package io.openlineage.spark.agent.util;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import datahub.spark.conf.SparkAppContext;
import datahub.spark.conf.SparkConfigParser;
import io.datahubproject.openlineage.config.DatahubOpenlineageConfig;
import io.datahubproject.openlineage.dataset.HdfsPathDataset;
import io.openlineage.client.OpenLineage.InputDataset;
import io.openlineage.client.OpenLineage.OutputDataset;
import io.openlineage.spark.api.OpenLineageContext;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

/**
 * Utility class for removing path patterns from dataset names. Given a configured regex pattern
 * with a "remove" group defined, the class methods run regex replacements on all datasets
 * available within the event.
 */
@Slf4j
public class RemovePathPatternUtils {
  public static final String REMOVE_PATTERN_GROUP = "remove";
  public static final String SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN =
      "spark.openlineage.dataset.removePath.pattern";

  // Cached SparkConf; see loadSparkConf() and the note about SPARK-29046 below
  private static Optional<SparkConf> sparkConf = Optional.empty();

  public static List<OutputDataset> removeOutputsPathPattern_ol(
      OpenLineageContext context, List<OutputDataset> outputs) {
    return getPattern(context)
        .map(
            pattern ->
                outputs.stream()
                    .map(
                        dataset -> {
                          String newName = removePath(pattern, dataset.getName());
                           if (!Objects.equals(newName, dataset.getName())) {
                             return context
                                 .getOpenLineage()
                                 .newOutputDatasetBuilder()
                                 .name(newName)
                                .namespace(dataset.getNamespace())
                                .facets(dataset.getFacets())
                                .outputFacets(dataset.getOutputFacets())
                                .build();
                          } else {
                            return dataset;
                          }
                        })
                    .collect(Collectors.toList()))
        .orElse(outputs);
  }

  // This method was replaced to support DataHub PathSpecs
  public static List<OutputDataset> removeOutputsPathPattern(
      OpenLineageContext context, List<OutputDataset> outputs) {
    return outputs.stream()
        .map(
            dataset -> {
              String newName = removePathPattern(dataset.getName());
              if (!Objects.equals(newName, dataset.getName())) {
                return context
                    .getOpenLineage()
                    .newOutputDatasetBuilder()
                    .name(newName)
                    .namespace(dataset.getNamespace())
                    .facets(dataset.getFacets())
                    .outputFacets(dataset.getOutputFacets())
                    .build();
              } else {
                return dataset;
              }
            })
        .collect(Collectors.toList());
  }

  // This method was replaced to support DataHub PathSpecs
  public static List<InputDataset> removeInputsPathPattern(
      OpenLineageContext context, List<InputDataset> inputs) {
    return inputs.stream()
        .map(
            dataset -> {
              String newName = removePathPattern(dataset.getName());
              if (!Objects.equals(newName, dataset.getName())) {
                return context
                    .getOpenLineage()
                    .newInputDatasetBuilder()
                    .name(newName)
                    .namespace(dataset.getNamespace())
                    .facets(dataset.getFacets())
                    .inputFacets(dataset.getInputFacets())
                    .build();
              } else {
                return dataset;
              }
            })
        .collect(Collectors.toList());
  }

  private static Optional<Pattern> getPattern(OpenLineageContext context) {
    return context
        .getSparkContext()
        .map(sparkContext -> sparkContext.conf())
        .filter(conf -> conf.contains(SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN))
        .map(conf -> conf.get(SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN))
        .map(Pattern::compile);
  }

  private static String removePath(Pattern pattern, String name) {
    return Optional.of(pattern.matcher(name))
        .filter(matcher -> matcher.find())
        .filter(
            matcher -> {
              // Confirm the pattern defines a "remove" group and that it matched
              try {
                matcher.group(REMOVE_PATTERN_GROUP);
                return true;
              } catch (IllegalStateException | IllegalArgumentException e) {
                return false;
              }
            })
        .filter(matcher -> StringUtils.isNotEmpty(matcher.group(REMOVE_PATTERN_GROUP)))
        .map(
            matcher ->
                // Splice out the text captured by the "remove" group
                name.substring(0, matcher.start(REMOVE_PATTERN_GROUP))
                    + name.substring(matcher.end(REMOVE_PATTERN_GROUP)))
        .orElse(name);
  }
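
  // Worked example (hypothetical inputs): with the pattern "(?<remove>/version=[^/]*)",
  // removePath(pattern, "s3://bucket/table/version=1/part") returns "s3://bucket/table/part".
  // If the pattern has no "remove" group, or the group does not match, the original
  // name is returned unchanged.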

  /**
   * SparkConf does not change during the job's lifetime, but it can get lost once the session is
   * closed. Caching it here guards against SPARK-29046.
   */
  private static Optional<SparkConf> loadSparkConf() {
    if (!sparkConf.isPresent() && SparkSession.getDefaultSession().isDefined()) {
      sparkConf = Optional.of(SparkSession.getDefaultSession().get().sparkContext().getConf());
    }
    return sparkConf;
  }

  private static String removePathPattern(String datasetName) {
    // TODO: The reliance on global-mutable state here should be changed
    //  this led to problems in the PathUtilsTest class, where some tests interfered with others
    log.info("Removing path pattern from dataset name {}", datasetName);
    Optional<SparkConf> conf = loadSparkConf();
    if (!conf.isPresent()) {
      return datasetName;
    }
    try {
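      // Rebuild the spark.datahub.* properties as a HOCON string and parse them into
      // a typed DataHub OpenLineage configuration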
      String propertiesString =
          Arrays.stream(conf.get().getAllWithPrefix("spark.datahub."))
              .map(tup -> tup._1 + "= \"" + tup._2 + "\"")
              .collect(Collectors.joining("\n"));
      Config datahubConfig = ConfigFactory.parseString(propertiesString);
      DatahubOpenlineageConfig datahubOpenlineageConfig =
          SparkConfigParser.sparkConfigToDatahubOpenlineageConf(
              datahubConfig, new SparkAppContext());
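      // Apply any configured DataHub path specs to the dataset URI; this may rewrite
      // the path (e.g. collapsing partition directories) or leave it untouched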
      HdfsPathDataset hdfsPath =
          HdfsPathDataset.create(new URI(datasetName), datahubOpenlineageConfig);
      log.debug("Transformed path is {}", hdfsPath.getDatasetPath());
      return hdfsPath.getDatasetPath();
    } catch (InstantiationException e) {
      log.warn(
          "Unable to convert dataset {} to path; the exception was: {}",
          datasetName,
          e.getMessage());
      return datasetName;
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }
}