
package datahub.spark.conf;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.common.FabricType;
import com.linkedin.common.urn.DataJobUrn;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import io.datahubproject.openlineage.config.DatahubOpenlineageConfig;
import io.datahubproject.openlineage.dataset.PathSpec;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkEnv;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
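/**
 * Utility for extracting DataHub-specific settings from Spark configuration.
 * Keys are supplied under the "spark.datahub." prefix (plus the Databricks
 * clusterAllTags key) and are exposed as a Typesafe
 * {@link com.typesafe.config.Config} with the prefix stripped.
 */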
public class SparkConfigParser {
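// Configuration key constants; most are resolved relative to the
// "spark.datahub." prefix after parsePropertiesToConfig/parseSparkConfig
// have stripped it.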
public static final String PARENT_JOB_KEY = "parent.datajob_urn";
public static final String TRANSPORT_KEY = "transport";
public static final String GMS_URL_KEY = "rest.server";
public static final String GMS_AUTH_TOKEN = "rest.token";
public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification";
public static final String MAX_RETRIES = "rest.max_retries";
public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec";
public static final String COALESCE_KEY = "coalesce_jobs";
public static final String PATCH_ENABLED = "patch.enabled";
public static final String STAGE_METADATA_COALESCING = "stage_metadata_coalescing";
public static final String STREAMING_JOB = "streaming_job";
public static final String STREAMING_HEARTBEAT = "streaming_heartbeat";
public static final String DATAHUB_FLOW_NAME = "flow_name";
public static final String DATASET_ENV_KEY = "metadata.dataset.env";
public static final String DATASET_HIVE_PLATFORM_ALIAS = "metadata.dataset.hivePlatformAlias";
public static final String DATASET_MATERIALIZE_KEY = "metadata.dataset.materialize";
public static final String DATASET_PLATFORM_INSTANCE_KEY = "metadata.dataset.platformInstance";
public static final String DATASET_INCLUDE_SCHEMA_METADATA =
"metadata.dataset.experimental_include_schema_metadata";
public static final String SPARK_PLATFORM_INSTANCE_KEY = "platformInstance";
public static final String REMOVE_PARTITION_PATTERN = "metadata.remove_partition_pattern";
public static final String SPARK_APP_NAME = "spark.app.name";
public static final String SPARK_MASTER = "spark.master";
public static final String PLATFORM_KEY = "platform";
public static final String PATH_SPEC_LIST_KEY = "path_spec_list";
public static final String FILE_PARTITION_REGEXP_PATTERN = "file_partition_regexp";
public static final String FABRIC_TYPE_KEY = "env";
public static final String PLATFORM_INSTANCE_KEY = "platformInstance";
public static final String DATABRICKS_CLUSTER_KEY = "databricks.cluster";
public static final String PIPELINE_KEY = "metadata.pipeline";
public static final String PIPELINE_PLATFORM_INSTANCE_KEY = PIPELINE_KEY + ".platformInstance";
public static final String TAGS_KEY = "tags";
public static final String DOMAINS_KEY = "domains";
private static final Logger log = LoggerFactory.getLogger(SparkConfigParser.class);
public static final String SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS =
"spark.databricks.clusterUsageTags.clusterAllTags";
private static final ObjectMapper mapper = new ObjectMapper();
private SparkConfigParser() {}
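/**
 * Returns a copy of {@code properties} in which every key that starts with
 * {@code prefix} is re-registered with the prefix removed; keys without the
 * prefix are copied unchanged. Each resulting key/value pair is logged at
 * INFO level.
 */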
public static Properties moveKeysToRoot(Properties properties, String prefix) {
Properties newProperties = new Properties();
Enumeration<?> propertyNames = properties.propertyNames();
while (propertyNames.hasMoreElements()) {
String key = (String) propertyNames.nextElement();
String value = properties.getProperty(key);
if (key.startsWith(prefix)) {
key = key.substring(prefix.length());
}
newProperties.setProperty(key, value);
log.info("Setting property {} to {}", key, value);
}
return newProperties;
}
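/**
 * Filters {@code properties} down to the "spark.datahub." namespace (plus the
 * Databricks clusterAllTags key), strips the "spark.datahub." prefix via
 * {@link #moveKeysToRoot}, and parses the result into a Typesafe {@link Config}.
 * Note that the filtering step mutates the passed-in {@link Properties}
 * instance through {@code keySet().removeIf}.
 */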
public static Config parsePropertiesToConfig(Properties properties) {
properties
.keySet()
.removeIf(
o ->
(!o.toString().startsWith("spark.datahub.")
&& !o.toString()
.startsWith(SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS)));
properties = SparkConfigParser.moveKeysToRoot(properties, "spark.datahub.");
return ConfigFactory.parseProperties(properties);
}
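/**
 * Reads the live {@link SparkConf} from {@link SparkEnv} and converts every
 * entry under the "spark.datahub." prefix into a {@link Config}. The prefix is
 * already stripped by {@code SparkConf#getAllWithPrefix}, so a job submitted
 * with {@code --conf spark.datahub.rest.server=http://localhost:8080} yields
 * the key "rest.server". Returns an empty Config when no SparkEnv exists, for
 * example before a Spark application has started.
 */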
public static Config parseSparkConfig() {
if (SparkEnv.get() == null) {
return ConfigFactory.empty();
}
SparkConf conf = SparkEnv.get().conf();
String propertiesString =
Arrays.stream(conf.getAllWithPrefix("spark.datahub."))
.map(tup -> tup._1 + "= \"" + tup._2 + "\"")
.collect(Collectors.joining("\n"));
return ConfigFactory.parseString(propertiesString);
}
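// Illustrative sketch only (the local names below are not part of this class):
// the two entry points compose as follows.
//
//   Properties props = new Properties();
//   props.setProperty("spark.datahub.rest.server", "http://localhost:8080");
//   props.setProperty("spark.datahub.metadata.dataset.env", "PROD");
//   Config conf = SparkConfigParser.parsePropertiesToConfig(props);
//   conf.getString("rest.server");          // "http://localhost:8080"
//   conf.getString("metadata.dataset.env"); // "PROD"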
public static Optional