// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// ml-modules.root.data-hub.5.provenance.prov-lib.mjs Maven / Gradle / Ivy

// There is a newer version: 6.1.1
// Show newest version
/**
 Copyright (c) 2021 MarkLogic Corporation

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

// This library is intended for Data Hub specific provenance functions.

'use strict';

import httpUtils from "/data-hub/5/impl/http-utils.mjs";
import config from "/com.marklogic.hub/config.mjs";
import hubUtils from "/data-hub/5/impl/hub-utils.mjs";

/**
 * Checks the properties of a provenance delete request and converts them to typed values.
 *
 * Invalid values are reported through httpUtils.throwBadRequest (presumably raising an
 * HTTP 400 style error — confirm against http-utils.mjs).
 *
 * @param {Object} request the raw delete request
 * @param {*} request.retainDuration value castable to xs:yearMonthDuration or xs:dayTimeDuration
 * @param {*} [request.batchSize=100] value castable to xs:unsignedInt
 * @returns {{retainDuration: *, batchSize: *}} the request properties as typed values
 */
function validateDeleteRequest({retainDuration, batchSize = 100}) {
  const xsNamespace = "http://www.w3.org/2001/XMLSchema";
  const canCast = (typeName, value) => xdmp.castableAs(xsNamespace, typeName, value);

  // xs:yearMonthDuration is tried first, xs:dayTimeDuration second.
  let typedDuration = retainDuration;
  if (canCast("yearMonthDuration", retainDuration)) {
    typedDuration = xs.yearMonthDuration(retainDuration);
  } else if (canCast("dayTimeDuration", retainDuration)) {
    typedDuration = xs.dayTimeDuration(retainDuration);
  } else {
    httpUtils.throwBadRequest(`The duration format for the retainDuration provided ("${retainDuration}") is unsupported. Format must be in either xs:yearMonthDuration or xs:dayTimeDuration format`);
  }

  // Durations below zero are rejected; a zero duration passes this check.
  const zeroDuration = xs.dayTimeDuration('PT0S');
  if (typedDuration.lt(zeroDuration)) {
    httpUtils.throwBadRequest(`The retainDuration provided ("${typedDuration}") is unsupported. The retain duration must be a positive duration.`);
  }

  let typedBatchSize = batchSize;
  if (canCast("unsignedInt", batchSize)) {
    typedBatchSize = xs.unsignedInt(batchSize);
  } else {
    httpUtils.throwBadRequest(`The value for the batchSize provided ("${batchSize}") is unsupported. batchSize must be an unsigned int.`);
  }

  return {retainDuration: typedDuration, batchSize: typedBatchSize};
}

/**
 * Deletes one batch of provenance records older than the requested retain duration.
 *
 * The caller must hold the "delete-provenance" execute privilege. The deletion runs as an
 * update transaction against the database named by config.JOBDATABASE.
 *
 * @param {Object} deleteRequest raw request; validated by validateDeleteRequest
 * @param {Object|null|undefined} endpointState state returned by a previous call; its lastUri,
 *   when present, restricts this batch to URIs >= lastUri. May be absent on the first call.
 * @returns {{lastUri: *}|null} state for the next call, or null when no further batch is expected
 */
function deleteProvenance(deleteRequest, endpointState) {
  xdmp.securityAssert("http://marklogic.com/data-hub/privileges/delete-provenance", "execute");
  // Work only with the validated, typed request properties.
  const {retainDuration, batchSize} = validateDeleteRequest(deleteRequest);
  const timePruningBegins = fn.currentDateTime().subtract(retainDuration);
  // A missing endpointState is treated the same as a state without a lastUri.
  const previousLastUri = endpointState ? endpointState.lastUri : null;
  return fn.head(xdmp.invokeFunction(function() {
    const collectionQuery = cts.collectionQuery('http://marklogic.com/provenance-services/record');
    const timeQuery = cts.tripleRangeQuery(null, sem.iri('http://www.w3.org/ns/prov#generatedAtTime'), timePruningBegins, '<');
    const queries = [collectionQuery, timeQuery];
    if (previousLastUri) {
      queries.push(cts.rangeQuery(cts.uriReference(), '>=', previousLastUri));
    }
    const finalQuery = cts.andQuery(queries);
    // The estimate is taken before deleting, so it still includes this batch.
    const estimateCount = cts.estimate(finalQuery);
    let lastUri = null;
    for (const uri of cts.uris(null, [`limit=${batchSize}`, "concurrent", "score-zero"], finalQuery)) {
      xdmp.documentDelete(uri);
      lastUri = uri;
    }
    // Continue only if something was deleted and more matches than one batch existed.
    return (lastUri !== null && estimateCount > batchSize) ? {lastUri} : null;
  }, {database: xdmp.database(config.JOBDATABASE), update: 'true', commit: 'auto', ignoreAmps: false}));
}

/**
 * Writes the RecordProvenance template document to both schemas databases
 * (config.STAGINGSCHEMASDATABASE first, then config.FINALSCHEMASDATABASE).
 */
function installProvTemplates() {
  // NOTE(review): the string passed to xdmp.unquote is empty in this copy of the file;
  // presumably the template XML was lost — confirm against the upstream source.
  const templateDocument = xdmp.unquote('');
  const uri = '/hub-template/RecordProvenance.xml';
  const templatePermissions = [
    xdmp.permission('ps-user', 'read'),
    xdmp.permission('ps-internal', 'update'),
    xdmp.permission('data-hub-developer', 'update')
  ];
  const templateCollections = ["hub-template", "http://marklogic.com/xdmp/tde"];

  const schemasDatabases = [config.STAGINGSCHEMASDATABASE, config.FINALSCHEMASDATABASE];
  for (const schemasDatabase of schemasDatabases) {
    hubUtils.writeDocument(uri, templateDocument, templatePermissions, templateCollections, schemasDatabase);
  }
}

// BEGIN document specific PROV queries
/**
 * Builds a cts query matching prov:activity or prov:entity elements whose prov:id
 * attribute equals one of the given provenance IRIs.
 *
 * @param {Array} provIRIs provenance IDs; each is converted with String()
 * @returns the result of cts.elementAttributeValueQuery
 */
function provIRIsToCtsQuery(provIRIs) {
  const provNamespace = "http://www.w3.org/ns/prov#";
  const elementNames = Sequence.from([
    fn.QName(provNamespace, "activity"),
    fn.QName(provNamespace, "entity")
  ]);
  const idAttribute = fn.QName(provNamespace, "id");
  const idValues = provIRIs.map((id) => String(id));
  return cts.elementAttributeValueQuery(elementNames, idAttribute, idValues);
}

/**
 * Collects the provenance IDs (entity IDs and the activity IDs that generated them)
 * associated with a document URI, across the FINAL and STAGING databases.
 *
 * When `database` is config.FINALDATABASE the FINAL database is queried first by document
 * URI, and STAGING is then queried restricted to the provenance IDs found so far. Otherwise
 * only STAGING is queried, by document URI.
 *
 * Each ID is returned once, in first-seen order. Uniqueness is decided on the String() form
 * of the ID, because values returned by separate queries are distinct objects and would
 * never be equal under Array.prototype.includes.
 *
 * @param {string} documentURI URI of the document whose provenance is wanted
 * @param {string} [database=config.FINALDATABASE] name of the database the document lives in
 * @returns {Array} unique provenance and activity IDs
 */
function allAssociatedProvEntities(documentURI, database = config.FINALDATABASE) {
  // NOTE(review): the PREFIX declarations and some predicates in both queries have no IRI in
  // this copy of the file; presumably angle-bracketed text was lost — confirm against upstream.
  // Query is used to get PROV information from the document URI.
  const documentUriSparql = `PREFIX prov:
    PREFIX xs:

    SELECT DISTINCT * WHERE {
      # Using a union of two SELECT statements to get ancestor and self. The * transitive causes a binding error.
      {
        # SELECT statement to return self information about document URI
        SELECT ?ancestorOrSelfProvID ?generatedAtTime ?activityID WHERE {
          $provID  $documentURI;
                    prov:generatedAtTime ?generatedAtTime;
                prov:wasGeneratedBy ?activityID.
          BIND($provID AS ?ancestorOrSelfProvID)
        }
      } UNION {
        # SELECT statement to return ancestor/derivedFrom information about document URI
        SELECT ?ancestorOrSelfProvID ?generatedAtTime ?activityID WHERE {
          $provID  $documentURI;
                  # Using + transitive instead of * transitive to avoid binding error
                  prov:wasDerivedFrom+ ?ancestorOrSelfProvID.
          OPTIONAL {
            ?ancestorOrSelfProvID prov:generatedAtTime ?generatedAtTime.
          }
          OPTIONAL {
            ?ancestorOrSelfProvID prov:wasGeneratedBy ?activityID.
          }
        }
      }
    }
    ORDER BY DESC(?generatedAtTime)`;
  // This query is for the case where the initial query is done in the FINAL DB and we need to gather additional information from staging
  const provIdQuery = `PREFIX prov:
    PREFIX xs:

    SELECT DISTINCT * WHERE {
      # Using a union of two SELECT statements to get ancestor and self. The * transitive causes a binding error.
      {
        # SELECT statement to return self information about document URI
        SELECT ?ancestorOrSelfProvID ?generatedAtTime ?activityID WHERE {
          $provID prov:generatedAtTime ?generatedAtTime;
                prov:wasGeneratedBy ?activityID.
          BIND($provID AS ?ancestorOrSelfProvID)
        }
      } UNION {
        # SELECT statement to return ancestor/derivedFrom information about document URI
        SELECT ?ancestorOrSelfProvID ?generatedAtTime ?activityID WHERE {
          # Using + transitive instead of * transitive to avoid binding error
          $provID prov:wasDerivedFrom+ ?ancestorOrSelfProvID.
          OPTIONAL {
            ?ancestorOrSelfProvID prov:generatedAtTime ?generatedAtTime.
          }
          OPTIONAL {
            ?ancestorOrSelfProvID prov:wasGeneratedBy ?activityID.
          }
        }
      }
    }
    ORDER BY DESC(?generatedAtTime)`;
  const ancestorOrSelfProvIDs = [];
  const seenProvIDs = new Set();

  // Appends an ID unless an ID with the same string form was already collected.
  const addProvID = (provID) => {
    const key = String(provID);
    if (!seenProvIDs.has(key)) {
      seenProvIDs.add(key);
      ancestorOrSelfProvIDs.push(provID);
    }
  };

  // Collects the entity ID and, when bound, the activity ID of every result row.
  const collectProvIDs = (provEntities) => {
    for (const provEntity of provEntities) {
      addProvID(provEntity.ancestorOrSelfProvID);
      if (provEntity.activityID) {
        addProvID(provEntity.activityID);
      }
    }
  };

  // Runs in whichever database it is invoked in; reads ancestorOrSelfProvIDs at call time,
  // so the STAGING follow-up query sees the IDs gathered from FINAL.
  const retrieveProvEntities = function() {
    const currentDatabase = xdmp.databaseName(xdmp.database());
    const isInitialQuery = database === config.STAGINGDATABASE || currentDatabase === config.FINALDATABASE;
    if (isInitialQuery) {
      const bindings = {documentURI};
      return sem.sparql(documentUriSparql, bindings);
    } else {
      return sem.sparql(provIdQuery, null, null, provIRIsToCtsQuery(ancestorOrSelfProvIDs));
    }
  };

  const currentDatabase = xdmp.databaseName(xdmp.database());
  if (database === config.FINALDATABASE) {
    const finalProvEntities = config.FINALDATABASE === currentDatabase
      ? retrieveProvEntities()
      : hubUtils.invokeFunction(retrieveProvEntities, config.FINALDATABASE);
    collectProvIDs(finalProvEntities);
  }
  const stagingProvEntities = config.STAGINGDATABASE === currentDatabase
    ? retrieveProvEntities()
    : hubUtils.invokeFunction(retrieveProvEntities, config.STAGINGDATABASE);
  collectProvIDs(stagingProvEntities);
  return ancestorOrSelfProvIDs;
}

/**
 * Returns the data source name/type rows recorded for the provenance entities associated
 * with a document. The query always runs against config.STAGINGDATABASE.
 *
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database passed on to allAssociatedProvEntities
 * @returns {Array} SPARQL result rows with dataSourceName and dataSourceType
 */
function sourceInformationForDocument(documentURI, database = config.FINALDATABASE) {
  const provIDs = allAssociatedProvEntities(documentURI, database);
  // NOTE(review): the PREFIX IRI and the two predicates are missing in this copy of the
  // query; presumably angle-bracketed text was lost — confirm against upstream.
  const sourceQuery = `PREFIX prov:
  SELECT ?dataSourceName ?dataSourceType WHERE {
    $provID a prov:Entity;
               ?dataSourceName;
               ?dataSourceType.
  }`;
  const activeDatabase = xdmp.databaseName(xdmp.database());
  const querySources = () => sem.sparql(sourceQuery, {provID: provIDs});

  // Run in place when already in staging; otherwise invoke against staging.
  let rows;
  if (config.STAGINGDATABASE === activeDatabase) {
    rows = querySources();
  } else {
    rows = hubUtils.invokeFunction(querySources, config.STAGINGDATABASE);
  }
  return rows.toArray();
}

/**
 * Runs a SPARQL query about a document URI across databases. The SPARQL must use a provID
 * variable; the query is restricted (via a cts store query) to the provenance records
 * associated with the document.
 *
 * The staging database is always queried; the final database is queried only when
 * `database` is config.FINALDATABASE. Final results are placed before staging results.
 *
 * @param {string} sparql SPARQL text to run
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database the document lives in
 * @returns {Array} the combined results as an array
 */
function runCrossDatabaseSparqlForDocumentURI(sparql, documentURI, database = config.FINALDATABASE) {
  const provIDs = allAssociatedProvEntities(documentURI, database);
  const queryProvRecords = () => {
    const provIdStrings = provIDs.map((id) => String(id));
    return sem.sparql(sparql, null, null, provIRIsToCtsQuery(provIdStrings));
  };
  const activeDatabase = xdmp.databaseName(xdmp.database());
  // Run in place when already in the target database; otherwise invoke against it.
  const runIn = (targetDatabase) => (
    targetDatabase === activeDatabase
      ? queryProvRecords()
      : hubUtils.invokeFunction(queryProvRecords, targetDatabase)
  );

  const stagingResults = runIn(config.STAGINGDATABASE);
  const finalResults = database === config.FINALDATABASE ? runIn(config.FINALDATABASE) : null;
  return Sequence.from([finalResults, stagingResults]).toArray();
}

/**
 * Looks up the distinct step names recorded against a document's provenance entities,
 * ordered by activity start time, newest first.
 *
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database the document lives in
 * @returns {Array} result rows from runCrossDatabaseSparqlForDocumentURI
 */
function stepsRunAgainstDocument(documentURI, database = config.FINALDATABASE) {
  // NOTE(review): the PREFIX IRI and the ?stepName predicate are missing in this copy of
  // the query; presumably angle-bracketed text was lost — confirm against upstream.
  const stepNameQuery = `PREFIX prov:
  SELECT DISTINCT ?stepName WHERE {
    $provID  ?stepName;
            prov:wasGeneratedBy ?activityID.
    ?activityID prov:startedAtTime ?activityStartTime.
  }
  ORDER BY DESC(?activityStartTime)`;

  const stepRows = runCrossDatabaseSparqlForDocumentURI(stepNameQuery, documentURI, database);
  return stepRows;
}

/**
 * Looks up the documents (with database and generation time) recorded for a document's
 * provenance entities, newest first and then by document.
 *
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database the document lives in
 * @returns {Array} result rows from runCrossDatabaseSparqlForDocumentURI
 */
function derivedFromDocuments(documentURI, database = config.FINALDATABASE) {
  // NOTE(review): the PREFIX IRI and the ?derivedFromDocument / ?database predicates are
  // missing in this copy of the query; presumably angle-bracketed text was lost — confirm
  // against upstream.
  const derivedFromQuery = `PREFIX prov:
  SELECT ?derivedFromDocument ?database ?generatedAtTime WHERE {
    $provID  ?derivedFromDocument.
    $provID  ?database.
    $provID prov:generatedAtTime ?generatedAtTime.
  }
  ORDER BY DESC(?generatedAtTime) ?derivedFromDocument`;

  const derivedFromRows = runCrossDatabaseSparqlForDocumentURI(derivedFromQuery, documentURI, database);
  return derivedFromRows;
}

/**
 * Looks up the activities (ID, label, start and end time) that generated a document's
 * provenance entities, ordered by start time, newest first.
 *
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database the document lives in
 * @returns {Array} result rows from runCrossDatabaseSparqlForDocumentURI
 */
function documentActivities(documentURI, database = config.FINALDATABASE) {
  // NOTE(review): the PREFIX declaration has no IRI in this copy of the query; presumably
  // angle-bracketed text was lost — confirm against upstream.
  const activityQuery = `PREFIX prov:
  SELECT DISTINCT ?activityID ?activityLabel ?activityStartTime ?activityEndTime WHERE {
    $provID a prov:Entity;
              prov:wasGeneratedBy ?activityID.
    ?activityID prov:label ?activityLabel;
                prov:startedAtTime ?activityStartTime;
                prov:endedAtTime ?activityEndTime.
  }
  ORDER BY DESC(?activityStartTime)`;

  const activityRows = runCrossDatabaseSparqlForDocumentURI(activityQuery, documentURI, database);
  return activityRows;
}

/**
 * Builds a provenance graph for a document: one activity entry per provenance record that
 * has a generating activity, each with its nodes and links.
 *
 * @param {string} documentURI URI of the document
 * @param {string} [database=config.FINALDATABASE] database the document lives in
 * @returns {{activities: Array}} activities, each with activityID, provID, time, nodes and links
 */
function documentProvenanceGraph(documentURI, database = config.FINALDATABASE) {
  // NOTE(review): the PREFIX declarations, several predicates and the FILTER operands have
  // no IRI in this copy of the query; presumably angle-bracketed text was lost — confirm
  // against upstream.
  // NOTE(review): ?entityName is bound in an OPTIONAL clause but is not in the inner SELECT
  // list, so result.entityName below looks like it will always be unbound — confirm.
  const sparql = `PREFIX dh:
PREFIX prov:
SELECT * WHERE {
  {
    SELECT DISTINCT ?activityID ?activityLabel ?stepName ?time ?documentURI ?database ?wasDerivedFrom ?provLabel ?provType ?agentID ?agentLabel WHERE {
      $provID a prov:Entity;
                prov:wasAttributedTo ?agentID;
                prov:wasGeneratedBy ?activityID;
                prov:generatedAtTime ?time;
                 ?database;
                 ?documentURI;
                 ?stepName;
                a ?provType.
      OPTIONAL {
          $provID  ?entityName.
      }
      OPTIONAL {
          $provID prov:wasDerivedFrom ?wasDerivedFrom.
      }
      OPTIONAL {
          $provID prov:label ?provLabel.
      }
      ?activityID prov:label ?activityLabel.
      ?agentID prov:label ?agentLabel.
      FILTER (?provType =  || ?provType = )
    }
  } UNION {
    SELECT DISTINCT ?provLabel ?dataSourceName ?dataSourceType ?time WHERE {
      $provID a dh:Source;
               ?dataSourceName;
              prov:generatedAtTime ?time.
      OPTIONAL {
          $provID  ?dataSourceType.
      }
      OPTIONAL {
          $provID prov:label ?provLabel.
      }
    }
  }
}
ORDER BY DESC(?time)`;
  const sparqlResults = runCrossDatabaseSparqlForDocumentURI(sparql, documentURI, database);
  const activities = [];
  // Nodes keyed by ID so that a node first seen only as a "derived from" target is filled
  // in with details if a later row describes it.
  const nodesByProvID = {};
  sparqlResults.forEach((result) => {
    let provID, provNode;
    if (fn.string(result.dataSourceName)) {
      // Row from the dh:Source branch of the query.
      provID = `external:${result.dataSourceName}#${result.time}`;
      provNode = {
        id: provID,
        label: fn.string(result.provLabel) || fn.string(result.dataSourceName),
        provenanceType: "collection",
        changeType: "Source",
        dataSourceType: result.dataSourceType
      };
    } else {
      provID = `${result.database}:${result.documentURI}#${result.time}`;
      provNode = {
        id: provID,
        label: fn.string(result.provLabel) || fn.string(result.documentURI),
        provenanceType: "collection",
        database: result.database,
        entityName: result.entityName,
        // Compare by string value: an IRI value from the result and a newly built IRI are
        // different objects, so === between them would never be true.
        changeType: fn.string(result.provType) === "http://marklogic.com/dhf#DocCreated" ? "Created" : "Updated"
      };
    }
    if (nodesByProvID[provID]) {
      Object.assign(nodesByProvID[provID], provNode);
    } else {
      nodesByProvID[provID] = provNode;
    }
    const existing = activities.find((activity) => activity.provID === provID);
    const activityID = fn.string(result.activityID);
    if (activityID) {
      let activity;
      if (existing) {
        activity = existing;
      } else {
        activity = {
          activityID,
          provID,
          time: result.time,
          nodes: [
            provNode,
            {
              id: activityID,
              label: fn.string(result.activityLabel),
              used: result.stepName,
              type: "activity"
            }
          ],
          links: [
            {
              from: provID,
              to: activityID,
              label: "generated by"
            }
          ]
        };
        activities.push(activity);
      }
      if (result.wasDerivedFrom) {
        const wasDerivedFrom = fn.string(result.wasDerivedFrom);
        activity.links.push({to: provID, from: wasDerivedFrom, label: "derived from"});
        if (!nodesByProvID[wasDerivedFrom]) {
          // Placeholder until (and unless) a row describing this node is seen.
          nodesByProvID[wasDerivedFrom] = {id: wasDerivedFrom};
        }
        activity.nodes.push(nodesByProvID[wasDerivedFrom]);
      }
    }
  });
  return {
    activities
  };
}

// Public API of the Data Hub provenance library.
const provLib = {
  // deleteProvenance is exported wrapped in import.meta.amp(...) rather than directly.
  // NOTE(review): presumably so a configured amp can apply to it — confirm against the
  // MarkLogic module amp documentation.
  deleteProvenance: import.meta.amp(deleteProvenance),
  installProvTemplates,
  allAssociatedProvEntities,
  sourceInformationForDocument,
  stepsRunAgainstDocument,
  derivedFromDocuments,
  documentActivities,
  documentProvenanceGraph
};

export default provLib;




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy