All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ml-modules.root.data-hub.data-services.bulk.fixCreatedByStep.mjs Maven / Gradle / Ivy

There is a newer version: 6.1.1
Show newest version
/*
  Copyright (c) 2021 MarkLogic Corporation

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
*/
// No privilege required: Use of this is restricted to users who have update permissions on entity instances
// matching the query performed by this endpoint

import config from "/com.marklogic.hub/config.mjs";
import StepDefinition from "/data-hub/5/impl/stepDefinition.mjs";

// State passed in by the caller of this endpoint, if any. When nothing (or a falsy value) is supplied we start
// from an empty object; otherwise the supplied JSON is parsed into a plain object.
const suppliedEndpointState = external.endpointState;
const endpointState = suppliedEndpointState ? fn.head(xdmp.fromJSON(suppliedEndpointState)) : {};

// Collection that marks a document as already handled by this endpoint; it is excluded from later queries
const fixedCollection = "datahubCreatedByStep-fixed";

// The work unit arrives as JSON on the external variable of the same name
const workUnit = fn.head(xdmp.fromJSON(external.workUnit));

// Number of URIs fetched per invocation; 50 is used when the work unit has no (truthy) batchSize
const batchSize = workUnit.batchSize || 50;

// Collect the name of every document in the step-definition collection. A document whose datahubCreatedByStep
// metadata equals one of these names is a candidate for fixing.
const stepDefinitionNames = [];
for (const stepDefinitionDoc of fn.collection('http://marklogic.com/data-hub/step-definition')) {
  stepDefinitionNames.push(stepDefinitionDoc.toObject().name);
}

// Queries that every candidate document must satisfy
const candidateQueries = [
  // A value query is required here so that names containing hyphens still match
  cts.fieldValueQuery("datahubCreatedByStep", stepDefinitionNames),
  // Skip anything this endpoint has already fixed
  cts.notQuery(cts.collectionQuery(fixedCollection))
];

// When a previous invocation recorded where it stopped, only look at URIs after that point
const resumeAfterUri = endpointState.lastProcessedUri;
const documentQueries = resumeAfterUri
  ? candidateQueries.concat([cts.rangeQuery(cts.uriReference(), ">", resumeAfterUri)])
  : candidateQueries;

// Fetch at most batchSize matching URIs from the forest named in the work unit
const uriLexiconOptions = ['limit=' + batchSize, "concurrent", "score-zero"];
const forestIds = [workUnit.forestId];
const uris = cts.uris(null, uriLexiconOptions, cts.andQuery(documentQueries), null, forestIds).toArray();

/**
 * Queries the jobs database for triples with the given subject and predicate.
 *
 * xdmp.eval is used deliberately: the original implementation notes that xdmp.eval works here while
 * xdmp.invokeFunction returned no results.
 *
 * @param {string} subject provenance subject; wrapped with sem.iri before querying
 * @param {string} predicate predicate IRI as a string; wrapped with sem.iri before querying
 * @param jobsDatabase ID of the database the query is evaluated against
 * @returns the result of evaluating cts.triples(subject, predicate, null) in that database
 */
function findProvenanceTriples(subject, predicate, jobsDatabase) {
  return xdmp.eval(
    "var subject, predicate; cts.triples(subject, predicate, null)",
    {subject: sem.iri(subject), predicate: sem.iri(predicate)},
    {database: jobsDatabase}
  );
}

/**
 * Finds, via provenance triples, the value that should replace a document's datahubCreatedByStep metadata.
 *
 * @param {string} subject provenance subject built for the document
 * @param flowName value of the document's datahubCreatedInFlow metadata
 * @param stepDefinitionName name of the step definition currently recorded on the document
 * @param jobsDatabase ID of the database holding the provenance triples
 * @returns the object of the first usable triple, or null when no usable triple exists
 */
function findStepNameInProvenance(subject, flowName, stepDefinitionName, jobsDatabase) {
  // Try wasInfluencedBy first; it won't exist for ingestion steps, but there should be only one occurrence of
  // it if it does exist
  const influencedByTriple = fn.head(
    findProvenanceTriples(subject, "http://www.w3.org/ns/prov#wasInfluencedBy", jobsDatabase)
  );
  if (influencedByTriple) {
    const influencedBy = sem.tripleObject(influencedByTriple);
    if (influencedBy !== null) {
      return influencedBy;
    }
  }

  // If no luck with wasInfluencedBy, try wasAssociatedWith
  // NOTE(review): the two console.log calls below look like leftover debug output; kept so log output is unchanged
  console.log("trying associated");
  const associatedWithTriples = findProvenanceTriples(
    subject, "http://www.w3.org/ns/prov#wasAssociatedWith", jobsDatabase
  );
  if (associatedWithTriples) {
    for (const triple of associatedWithTriples) {
      const object = sem.tripleObject(triple);
      // NOTE(review): loose inequality is kept on purpose - the triple object may not be a JS primitive, in which
      // case !== would always be true. Confirm before tightening.
      if (object != flowName && object != stepDefinitionName) {
        console.log("Using associatedWith!");
        return object;
      }
    }
  }

  return null;
}

// Process the current batch of URIs. The value of the last statement executed is the endpoint's output: null
// when there was nothing to process, otherwise the updated endpoint state.
if (uris.length === 0) {
  null;
} else {
  // Loop-invariant lookups, done once per batch instead of once (or more) per URI
  const stepDefinitions = new StepDefinition();
  const jobsDatabase = xdmp.database(config.JOBDATABASE);

  for (const uri of uris) {
    const metadata = xdmp.documentGetMetadata(uri);

    // Rare, but the stepDef may no longer exist, e.g. if it was a custom one
    const stepDef = stepDefinitions.getStepDefinition(metadata.datahubCreatedByStep);
    if (!stepDef) {
      continue;
    }

    // Every stepDef should have a type, but in case it doesn't, we can't do anything further
    const {type: stepDefType, name: stepDefinitionName} = stepDef.toObject();
    if (!stepDefType) {
      continue;
    }

    const datahubCreatedByJob = metadata.datahubCreatedByJob;
    if (!datahubCreatedByJob) {
      continue;
    }

    // This can have multiple space-delimited values; we need the most recent one. Trimming and splitting on
    // whitespace runs keeps stray leading/trailing/double spaces from producing an empty "latest" job ID.
    const jobIds = datahubCreatedByJob.trim().split(/\s+/);
    const latestJobId = jobIds[jobIds.length - 1];
    const flowName = metadata.datahubCreatedInFlow;

    // This is based on the pattern used in prov.sjs
    const subject = latestJobId + flowName + stepDefType.toLowerCase() + uri;

    const stepName = findStepNameInProvenance(subject, flowName, stepDefinitionName, jobsDatabase);

    // It is not unusual for triples to not exist, e.g provenance may have been disabled
    if (stepName) {
      xdmp.documentPutMetadata(uri, {datahubCreatedByStep: stepName});
      // Marking the document keeps it out of this endpoint's query on later invocations
      xdmp.documentAddCollections(uri, fixedCollection);
    }
  }

  // Record where this batch ended so the next invocation resumes after it, including past documents that
  // could not be fixed and therefore still match the query
  endpointState.lastProcessedUri = uris[uris.length - 1];
  Sequence.from([endpointState]);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy