marquez.db.JobVersionDao Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of marquez-api Show documentation
Show all versions of marquez-api Show documentation
Collect, aggregate, and visualize a data ecosystem's metadata
/*
* Copyright 2018-2023 contributors to the Marquez project
* SPDX-License-Identifier: Apache-2.0
*/
package marquez.db;
import static marquez.db.Columns.stringOrThrow;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSortedSet;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Collectors;
import lombok.NonNull;
import lombok.Value;
import marquez.api.models.JobVersion;
import marquez.common.Utils;
import marquez.common.models.DatasetId;
import marquez.common.models.DatasetName;
import marquez.common.models.JobName;
import marquez.common.models.NamespaceName;
import marquez.common.models.RunState;
import marquez.common.models.Version;
import marquez.db.mappers.ExtendedJobVersionRowMapper;
import marquez.db.mappers.JobDataMapper;
import marquez.db.mappers.JobVersionMapper;
import marquez.db.models.DatasetVersionRow;
import marquez.db.models.ExtendedDatasetVersionRow;
import marquez.db.models.ExtendedJobVersionRow;
import marquez.db.models.JobRow;
import marquez.db.models.JobVersionRow;
import marquez.db.models.NamespaceRow;
import marquez.db.models.UpdateLineageRow.DatasetRecord;
import marquez.service.models.Run;
import org.jdbi.v3.core.mapper.RowMapper;
import org.jdbi.v3.core.statement.StatementContext;
import org.jdbi.v3.sqlobject.config.RegisterRowMapper;
import org.jdbi.v3.sqlobject.statement.SqlQuery;
import org.jdbi.v3.sqlobject.statement.SqlUpdate;
/** The DAO for {@code JobVersion}. */
@RegisterRowMapper(ExtendedJobVersionRowMapper.class)
@RegisterRowMapper(JobVersionMapper.class)
@RegisterRowMapper(JobDataMapper.class)
public interface JobVersionDao extends BaseDao {
/** An {@code enum} used to determine the input / output dataset type for a given job version. */
enum IoType {
INPUT,
OUTPUT
}
/**
* Returns JobVersion fields, along with Run-related fields, prefixed with "run_". Input and
* Output datasets are constructed as JSON strings that can be deserialized into DatasetIds.
*/
String BASE_SELECT_ON_JOB_VERSIONS =
"""
WITH job_version_io AS (
SELECT io.job_version_uuid,
JSON_AGG(json_build_object('namespace', ds.namespace_name,
'name', ds.name))
FILTER (WHERE io.io_type = 'INPUT') AS input_datasets,
JSON_AGG(json_build_object('namespace', ds.namespace_name,
'name', ds.name))
FILTER (WHERE io.io_type = 'OUTPUT') AS output_datasets
FROM job_versions_io_mapping io
INNER JOIN job_versions jv ON jv.uuid = io.job_version_uuid
INNER JOIN datasets_view ds ON ds.uuid = io.dataset_uuid
INNER JOIN jobs_view j ON j.uuid=jv.job_uuid
WHERE j.namespace_name = :namespaceName
AND j.name = :jobName
GROUP BY io.job_version_uuid
), relevant_job_versions AS (
SELECT jv.uuid, jv.created_at, jv.updated_at, jv.job_uuid, jv.version,\s
jv.location, jv.latest_run_uuid, j.namespace_uuid,\s
j.namespace_name, j.name AS job_name
FROM job_versions jv
INNER JOIN jobs_view j ON j.uuid=jv.job_uuid
WHERE j.name = :jobName AND j.namespace_name=:namespaceName
ORDER BY jv.created_at DESC
)
SELECT jv.*,
dsio.input_datasets,
dsio.output_datasets,
r.uuid AS run_uuid,
r.created_at AS run_created_at,
r.updated_at AS run_updated_at,
r.nominal_start_time AS run_nominal_start_time,
r.nominal_end_time AS run_nominal_end_time,
r.current_run_state AS run_current_run_state,
r.started_at AS run_started_at,
r.ended_at AS run_ended_at,
r.namespace_name AS run_namespace_name,
r.job_name AS run_job_name,
jv.version AS run_job_version,
r.location AS run_location,
ra.args AS run_args,
f.facets AS run_facets,
ri.input_versions AS run_input_versions,
ro.output_versions AS run_output_versions
FROM relevant_job_versions AS jv
LEFT JOIN job_version_io dsio ON dsio.job_version_uuid = jv.uuid
LEFT OUTER JOIN runs r ON r.uuid = jv.latest_run_uuid
LEFT JOIN LATERAL (
SELECT jf.run_uuid, JSON_AGG(jf.facet ORDER BY jf.lineage_event_time ASC) AS facets
FROM job_facets_view AS jf
WHERE jf.run_uuid=jv.latest_run_uuid AND jf.job_uuid = jv.job_uuid
GROUP BY jf.run_uuid
) AS f ON r.uuid = f.run_uuid
LEFT OUTER JOIN run_args AS ra ON ra.uuid = r.run_args_uuid
LEFT JOIN LATERAL (
SELECT im.run_uuid,
JSON_AGG(json_build_object('namespace', dv.namespace_name,
'name', dv.dataset_name,
'version', dv.version)) AS input_versions
FROM runs_input_mapping im
INNER JOIN dataset_versions dv on im.dataset_version_uuid = dv.uuid
WHERE im.run_uuid=jv.latest_run_uuid
GROUP BY im.run_uuid
) ri ON ri.run_uuid = r.uuid
LEFT OUTER JOIN (
SELECT run_uuid,
JSON_AGG(json_build_object('namespace', namespace_name,
'name', dataset_name,
'version', version)) AS output_versions
FROM dataset_versions
GROUP BY run_uuid
) ro ON ro.run_uuid = r.uuid
""";
@SqlQuery(BASE_SELECT_ON_JOB_VERSIONS + "WHERE jv.version = :jobVersionUuid")
Optional findJobVersion(String namespaceName, String jobName, UUID jobVersionUuid);
@SqlQuery(BASE_SELECT_ON_JOB_VERSIONS + "LIMIT :limit OFFSET :offset")
List findAllJobVersions(String namespaceName, String jobName, int limit, int offset);
/**
* Used to upsert a {@link JobVersionRow} object; on version conflict, the job version object is
* returned with the {@code updated_at} column set to the last modified timestamp.
*
* @param jobVersionUuid The unique ID of the job version.
* @param now The last modified timestamp of the job version.
* @param jobUuid The unique ID of the job associated with the version.
* @param jobLocation The source code location for the job.
* @param version The version of the job; for internal use only.
* @param jobName The name of the job.
* @param namespaceUuid The unique ID of the namespace associated with the job version.
* @param namespaceName The namespace associated with the job version.
* @return The {@link ExtendedJobVersionRow} object inserted into the {@code job_versions} table.
*/
// TODO: A JobVersionRow object should be immutable; replace with JobVersionDao.insertJobVersion()
@SqlQuery(
"""
INSERT INTO job_versions (
uuid,
created_at,
updated_at,
job_uuid,
location,
version,
job_name,
namespace_uuid,
namespace_name
) VALUES (
:jobVersionUuid,
:now,
:now,
:jobUuid,
:jobLocation,
:version,
:jobName,
:namespaceUuid,
:namespaceName)
ON CONFLICT(version) DO
UPDATE SET updated_at = EXCLUDED.updated_at
RETURNING *
""")
ExtendedJobVersionRow upsertJobVersion(
UUID jobVersionUuid,
Instant now,
UUID jobUuid,
String jobLocation,
UUID version,
String jobName,
UUID namespaceUuid,
String namespaceName);
/**
* Used to upsert an input or output dataset to a given job version.
*
* @param jobVersionUuid The unique ID of the job version.
* @param datasetUuid The unique ID of the output dataset
* @param ioType The {@link IoType} of the dataset.
* @param jobUuid The unique ID of the job.
*/
@SqlUpdate(
"""
INSERT INTO job_versions_io_mapping (
job_version_uuid, dataset_uuid, io_type, job_uuid, job_symlink_target_uuid, is_current_job_version, made_current_at)
VALUES (:jobVersionUuid, :datasetUuid, :ioType, :jobUuid, :symlinkTargetJobUuid, TRUE, NOW())
ON CONFLICT (job_version_uuid, dataset_uuid, io_type, job_uuid) DO UPDATE SET is_current_job_version = TRUE
""")
void upsertCurrentInputOrOutputDatasetFor(
UUID jobVersionUuid,
UUID datasetUuid,
UUID jobUuid,
UUID symlinkTargetJobUuid,
IoType ioType);
@SqlUpdate(
"""
UPDATE job_versions_io_mapping
SET is_current_job_version = FALSE
WHERE (job_uuid = :jobUuid OR job_symlink_target_uuid = :jobUuid)
AND job_version_uuid != :jobVersionUuid
AND io_type = :ioType
AND is_current_job_version = TRUE;
""")
void markInputOrOutputDatasetAsPreviousFor(UUID jobVersionUuid, UUID jobUuid, IoType ioType);
@SqlUpdate(
"""
UPDATE job_versions_io_mapping
SET is_current_job_version = FALSE
WHERE (job_uuid = :jobUuid OR job_symlink_target_uuid = :jobUuid)
AND io_type = :ioType
AND is_current_job_version = TRUE;
""")
void markInputOrOutputDatasetAsPreviousFor(UUID jobUuid, IoType ioType);
/**
* Used to link an input dataset to a given job version.
*
* @param inputDatasetUuid The unique ID of the input dataset.
* @param jobUuid The unique ID of the job.
*/
default void upsertInputDatasetFor(
UUID jobVersionUuid, UUID inputDatasetUuid, UUID jobUuid, UUID symlinkTargetJobUuid) {
markInputOrOutputDatasetAsPreviousFor(jobVersionUuid, jobUuid, IoType.INPUT);
upsertCurrentInputOrOutputDatasetFor(
jobVersionUuid, inputDatasetUuid, jobUuid, symlinkTargetJobUuid, IoType.INPUT);
}
/**
* Used to link an output dataset to a given job version.
*
* @param outputDatasetUuid The unique ID of the output dataset.
* @param jobUuid The unique ID of the job.
*/
default void upsertOutputDatasetFor(
UUID jobVersionUuid, UUID outputDatasetUuid, UUID jobUuid, UUID symlinkTargetJobUuid) {
markInputOrOutputDatasetAsPreviousFor(jobVersionUuid, jobUuid, IoType.OUTPUT);
upsertCurrentInputOrOutputDatasetFor(
jobVersionUuid, outputDatasetUuid, jobUuid, symlinkTargetJobUuid, IoType.OUTPUT);
}
/**
* Returns the input datasets to a given job version.
*
* @param jobVersionUuid The unique ID of the job version.
* @return The input datasets for the job version.
*/
default List findInputDatasetsFor(UUID jobVersionUuid) {
return findInputOrOutputDatasetsFor(jobVersionUuid, IoType.INPUT);
}
/**
* Returns the output datasets to a given job version.
*
* @param jobVersionUuid The unique ID of the job version.
* @return The output datasets for the job version.
*/
default List findOutputDatasetsFor(UUID jobVersionUuid) {
return findInputOrOutputDatasetsFor(jobVersionUuid, IoType.OUTPUT);
}
/**
* Verifies if a job with a specified job version is present in table.
*
* @param version Version identifier
*/
@SqlQuery("SELECT EXISTS (SELECT 1 FROM job_versions WHERE version = :version)")
boolean versionExists(UUID version);
/**
* Returns the input or output datasets for a given job version.
*
* @param jobVersionUuid The unique ID of the job version.
* @param ioType The {@link IoType} of the dataset.
*/
@SqlQuery(
"""
SELECT dataset_uuid
FROM job_versions_io_mapping
WHERE job_version_uuid = :jobVersionUuid
AND io_type = :ioType
""")
List findInputOrOutputDatasetsFor(UUID jobVersionUuid, IoType ioType);
@SqlQuery(
"""
SELECT d.namespace_name, d.name, io.io_type
FROM job_versions_io_mapping io
INNER JOIN jobs_view j ON j.current_version_uuid = io.job_version_uuid
INNER JOIN datasets_view d on d.uuid = io.dataset_uuid
WHERE j.name = :jobName AND j.namespace_name=:jobNamespace
""")
List findCurrentInputOutputDatasetsFor(String jobNamespace, String jobName);
/**
* Used to associate a {@link Run} to a given job version. A run is an instance of a job version.
* When a run object is instantiated, the {@code latest_run_uuid} column in the {@code
* job_versions} table is updated and set to the unique ID of the latest run of the version. Note,
* multiple run instances may be linked to a job version as runs are based on a version.
*
* @param jobVersionUuid The unique ID of the job version.
* @param updatedAt The last modified timestamp of the job version.
* @param latestRunUuid The unique ID of the {@link Run} associated with the job version.
*/
@SqlUpdate(
"""
UPDATE job_versions
SET updated_at = :updatedAt,
latest_run_uuid = :latestRunUuid
WHERE uuid = :jobVersionUuid
""")
void updateLatestRunFor(UUID jobVersionUuid, Instant updatedAt, UUID latestRunUuid);
/** Returns the unique ID of the latest {@link Run} for a given job version. */
@SqlQuery("SELECT latest_run_uuid FROM job_versions WHERE uuid = :jobVersionUuid")
Optional findLatestRunFor(UUID jobVersionUuid);
/** Returns the {@link JobVersionRow} object for a given the unique run ID . */
@SqlQuery("SELECT * FROM job_versions WHERE latest_run_uuid = :runUuid")
Optional findJobVersionFor(UUID runUuid);
/** Returns the total row count for the {@code job_versions} table; used for testing only. */
@VisibleForTesting
@SqlQuery("SELECT COUNT(*) FROM job_versions")
int count();
/**
* Links facets of the given run to
*
* @param runUuid
* @param jobVersionUuid
*/
@SqlUpdate(
"""
UPDATE job_facets
SET job_version_uuid = :jobVersionUuid
WHERE run_uuid = :runUuid
""")
void linkJobFacetsToJobVersion(UUID runUuid, UUID jobVersionUuid);
/**
* Used to upsert an immutable {@link JobVersionRow}. A {@link Version} is generated using {@link
* Utils#newJobVersionFor(NamespaceName, JobName, ImmutableSet, ImmutableSet, String)} based on
* the jobs inputs and inputs, source code location, and context.
*
* @param jobRow The job.
* @return A {@link BagOfJobVersionInfo} object.
*/
default BagOfJobVersionInfo upsertRunlessJobVersion(
@NonNull JobRow jobRow, List inputs, List outputs) {
// Get the job.
final JobDao jobDao = createJobDao();
// Get the namespace for the job.
final NamespaceRow namespaceRow =
createNamespaceDao().findNamespaceByName(jobRow.getNamespaceName()).get();
// Generate the version for the job; the version may already exist.
final Version jobVersion =
Utils.newJobVersionFor(
NamespaceName.of(jobRow.getNamespaceName()),
JobName.of(
Optional.ofNullable(jobRow.getParentJobName())
.map(pn -> pn + "." + jobRow.getSimpleName())
.orElse(jobRow.getName())),
toDatasetIds(
inputs.stream().map(i -> i.getDatasetVersionRow()).collect(Collectors.toList())),
toDatasetIds(
outputs.stream().map(i -> i.getDatasetVersionRow()).collect(Collectors.toList())),
jobRow.getLocation());
// Add the job version.
final JobVersionDao jobVersionDao = createJobVersionDao();
final JobVersionRow jobVersionRow =
jobVersionDao.upsertJobVersion(
UUID.randomUUID(),
jobRow.getCreatedAt(),
jobRow.getUuid(),
jobRow.getLocation(),
jobVersion.getValue(),
jobRow.getName(),
namespaceRow.getUuid(),
jobRow.getNamespaceName());
// Link the input datasets to the job version.
inputs.forEach(
i -> {
jobVersionDao.upsertInputDatasetFor(
jobVersionRow.getUuid(),
i.getDatasetVersionRow().getDatasetUuid(),
jobVersionRow.getJobUuid(),
jobRow.getSymlinkTargetId());
});
// Link the output datasets to the job version.
outputs.forEach(
o -> {
jobVersionDao.upsertOutputDatasetFor(
jobVersionRow.getUuid(),
o.getDatasetVersionRow().getDatasetUuid(),
jobVersionRow.getJobUuid(),
jobRow.getSymlinkTargetId());
});
jobDao.updateVersionFor(jobRow.getUuid(), jobRow.getCreatedAt(), jobVersionRow.getUuid());
return new BagOfJobVersionInfo(
jobRow,
jobVersionRow,
inputs.stream()
.map(JobVersionDao::toExtendedDatasetVersionRow)
.collect(Collectors.toList()),
outputs.stream()
.map(JobVersionDao::toExtendedDatasetVersionRow)
.collect(Collectors.toList()));
}
private static ExtendedDatasetVersionRow toExtendedDatasetVersionRow(DatasetRecord d) {
return new ExtendedDatasetVersionRow(
d.getDatasetRow().getUuid(),
d.getDatasetRow().getCreatedAt(),
d.getDatasetVersionRow().getDatasetUuid(),
d.getDatasetVersionRow().getVersion(),
null,
null,
d.getDatasetRow().getNamespaceName(),
d.getDatasetRow().getName());
}
/**
* Used to upsert an immutable {@link JobVersionRow} object when a {@link Run} has transitioned. A
* {@link Version} is generated using {@link Utils#newJobVersionFor(NamespaceName, JobName,
* ImmutableSet, ImmutableSet, String)} based on the jobs inputs and inputs, source code location,
* and context. A version for a given job is created only when a {@link Run} transitions
* into a {@code COMPLETED}, {@code ABORTED}, or {@code FAILED} state.
*
* @param jobRowRunDetails The job row with run details.
* @param runState The current run state.
* @param transitionedAt The timestamp of the run state transition.
* @return A {@link BagOfJobVersionInfo} object.
*/
default BagOfJobVersionInfo upsertJobVersionOnRunTransition(
@NonNull JobRowRunDetails jobRowRunDetails,
@NonNull RunState runState,
@NonNull Instant transitionedAt,
boolean linkJobToJobVersion) {
// Get the job.
final JobDao jobDao = createJobDao();
// Add the job version.
final JobVersionDao jobVersionDao = createJobVersionDao();
final JobVersionRow jobVersionRow =
jobVersionDao.upsertJobVersion(
UUID.randomUUID(),
transitionedAt, // Use the timestamp of when the run state transitioned.
jobRowRunDetails.jobRow.getUuid(),
jobRowRunDetails.jobRow.getLocation(),
jobRowRunDetails.jobVersion.getValue(),
jobRowRunDetails.jobRow.getName(),
jobRowRunDetails.namespaceRow.getUuid(),
jobRowRunDetails.jobRow.getNamespaceName());
// Link the input datasets to the job version.
jobRowRunDetails.jobVersionInputs.forEach(
jobVersionInput -> {
jobVersionDao.upsertInputDatasetFor(
jobVersionRow.getUuid(),
jobVersionInput.getDatasetUuid(),
jobVersionRow.getJobUuid(),
jobRowRunDetails.jobRow.getSymlinkTargetId());
});
// Link the output datasets to the job version.
jobRowRunDetails.jobVersionOutputs.forEach(
jobVersionOutput -> {
jobVersionDao.upsertOutputDatasetFor(
jobVersionRow.getUuid(),
jobVersionOutput.getDatasetUuid(),
jobVersionRow.getJobUuid(),
jobRowRunDetails.jobRow.getSymlinkTargetId());
});
// Link the job version to the run.
createRunDao().updateJobVersion(jobRowRunDetails.runUuid, jobVersionRow.getUuid());
// Link the run to the job version; multiple run instances may be linked to a job version.
jobVersionDao.updateLatestRunFor(
jobVersionRow.getUuid(), transitionedAt, jobRowRunDetails.runUuid);
// Link the job facets to this job version
jobVersionDao.linkJobFacetsToJobVersion(jobRowRunDetails.runUuid, jobVersionRow.getUuid());
if (linkJobToJobVersion) {
jobDao.updateVersionFor(
jobRowRunDetails.jobRow.getUuid(), transitionedAt, jobVersionRow.getUuid());
}
return new BagOfJobVersionInfo(
jobRowRunDetails.jobRow,
jobVersionRow,
jobRowRunDetails.jobVersionInputs,
jobRowRunDetails.jobVersionOutputs);
}
/** Returns the specified {@link ExtendedDatasetVersionRow}s as {@link DatasetId}s. */
default ImmutableSortedSet toDatasetIds(
@NonNull final List datasetVersionRows) {
final ImmutableSortedSet.Builder datasetIds = ImmutableSortedSet.naturalOrder();
for (final DatasetVersionRow datasetVersionRow : datasetVersionRows) {
datasetIds.add(toDatasetId(datasetVersionRow));
}
return datasetIds.build();
}
private DatasetId toDatasetId(DatasetVersionRow dataset) {
return new DatasetId(
NamespaceName.of(dataset.getNamespaceName()), DatasetName.of(dataset.getDatasetName()));
}
default JobRowRunDetails loadJobRowRunDetails(JobRow jobRow, UUID runUuid) {
// Get the inputs and outputs dataset versions for the run associated with the job version.
final DatasetVersionDao datasetVersionDao = createDatasetVersionDao();
final List jobVersionInputs =
datasetVersionDao.findInputDatasetVersionsFor(runUuid);
final List jobVersionOutputs =
datasetVersionDao.findOutputDatasetVersionsFor(runUuid);
// Get the namespace for the job.
final NamespaceRow namespaceRow =
createNamespaceDao().findNamespaceByName(jobRow.getNamespaceName()).get();
// Generate the version for the job; the version may already exist.
final Version jobVersion =
Utils.newJobVersionFor(
NamespaceName.of(jobRow.getNamespaceName()),
JobName.of(
Optional.ofNullable(jobRow.getParentJobName())
.map(pn -> pn + "." + jobRow.getSimpleName())
.orElse(jobRow.getName())),
toDatasetIds(
jobVersionInputs.stream()
.map(i -> (DatasetVersionRow) i)
.collect(Collectors.toList())),
toDatasetIds(
jobVersionOutputs.stream()
.map(o -> (DatasetVersionRow) o)
.collect(Collectors.toList())),
jobRow.getLocation());
return new JobRowRunDetails(
jobRow, runUuid, namespaceRow, jobVersionInputs, jobVersionOutputs, jobVersion);
}
/** A container class for job version info. */
@Value
class BagOfJobVersionInfo {
JobRow jobRow;
JobVersionRow jobVersionRow;
List inputs;
List outputs;
}
record JobDataset(String namespace, String name, IoType ioType) {}
record JobRowRunDetails(
JobRow jobRow,
UUID runUuid,
NamespaceRow namespaceRow,
List jobVersionInputs,
List jobVersionOutputs,
Version jobVersion) {}
class JobDatasetMapper implements RowMapper {
@Override
public JobDataset map(ResultSet rs, StatementContext ctx) throws SQLException {
return new JobDataset(
stringOrThrow(rs, Columns.NAMESPACE_NAME),
stringOrThrow(rs, Columns.NAME),
IoType.valueOf(stringOrThrow(rs, Columns.IO_TYPE)));
}
}
}