/*
 * ml-modules.root.data-hub.5.mastering.matching.matchable.mjs Maven / Gradle / Ivy
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of marklogic-data-hub Show documentation
 * Library for Creating an Operational Data Hub on MarkLogic
 */
import consts from "/data-hub/5/impl/consts.mjs";
import core from "/data-hub/5/artifacts/core.mjs";
import httpUtils from "/data-hub/5/impl/http-utils.mjs";
import hubUtils from "/data-hub/5/impl/hub-utils.mjs";
import common from "/data-hub/5/mastering/common.mjs";
import {getEntityModel} from "/data-hub/core/models/entities.mjs";
import temporalLib from "/data-hub/5/temporal/hub-temporal.mjs";
// The XQuery matching module is loaded with require(); per the original note it is used for performance.
const matchingXqy = require("/data-hub/5/mastering/matching/matching.xqy");
// Short local aliases for the query helpers exposed by the mastering common module.
const groupQueries = common.groupQueries;
const optimizeCtsQueries = common.optimizeCtsQueries;
// Trace flags are evaluated once, when this module is loaded.
const matchingDebugTraceEnabled = xdmp.traceEnabled(consts.TRACE_MATCHING_DEBUG);
// Regular match tracing is considered on whenever debug tracing is on.
const matchingTraceEnabled = xdmp.traceEnabled(consts.TRACE_MATCHING) || matchingDebugTraceEnabled;
// Event name used for regular trace output: the matching event when it is enabled, otherwise the debug event.
const matchingTraceEvent = xdmp.traceEnabled(consts.TRACE_MATCHING) ? consts.TRACE_MATCHING : consts.TRACE_MATCHING_DEBUG;
// Predicate IRIs exposed through this module's default export.
const queryHashPredicate = sem.iri("http://marklogic.com/data-hub/mastering#hasMatchingHash");
const hashScorePredicate = sem.iri("http://marklogic.com/data-hub/mastering#hasMatchingScore");
/*
 * Returns a JSON Object with details to pass onto the merge step for use in taking action.
 * The result has a single key, the action URI generated by the threshold, whose value describes
 * the action, the threshold name and score, the distinct sorted URIs involved and the match results.
 * Documents whose URI starts with "/com.marklogic.smart-mastering/merged/" contribute the
 * document URIs listed in their merge headers instead of their own URI.
 * @param {[]ContentObject} matchingDocumentSet
 * @param {ThresholdDefinition} thresholdBucket
 * @return {}
 * @since 5.8.0
 */
export function buildActionDetails(matchingDocumentSet, thresholdBucket) {
  const action = thresholdBucket.action();
  const actionUri = thresholdBucket.generateActionURI(matchingDocumentSet);
  const thresholdName = thresholdBucket.name();
  // A Set keeps first-seen order and de-duplicates in a single pass (previously filter + indexOf).
  const distinctUris = new Set();
  for (const contentObj of matchingDocumentSet) {
    if (fn.startsWith(contentObj.uri, "/com.marklogic.smart-mastering/merged/")) {
      const mergedUris = fn.distinctValues(contentObj.value.xpath("/*:envelope/*:headers/*:merges/*:document-uri")).toArray();
      for (const mergedUri of mergedUris) {
        distinctUris.add(mergedUri);
      }
    } else {
      distinctUris.add(fn.string(contentObj.uri));
    }
  }
  const uris = [...distinctUris].sort();
  const thresholdScore = thresholdBucket.score();
  let actionBody;
  if (action === "custom") {
    actionBody = {
      action: "customActions",
      thresholdName,
      thresholdScore,
      uris,
      actionModuleFunction: thresholdBucket.actionModuleFunction(),
      actionModulePath: thresholdBucket.actionModulePath(),
      // the document the action is keyed on is not reported as one of its own matches
      matchResults: matchingDocumentSet.filter((match) => match.uri !== actionUri)
    };
  } else {
    actionBody = {
      action,
      thresholdName,
      threshold: thresholdName,
      thresholdScore,
      uris,
      // entries without a (truthy) score are labelled "referenceDocument"
      matchResults: matchingDocumentSet.map((match) => ({uri: match.uri, matchedRulesets: match.matchedRulesets, score: match.score || "referenceDocument"}))
    };
  }
  return {
    [actionUri]: actionBody
  };
}
/*
 * A class that encapsulates the configurable portions of the matching process.
 * Most derived values (baseline query, ruleset definitions, threshold definitions, scores,
 * property query functions) are computed lazily and cached on the instance.
 */
export class Matchable {
  /*
   * @param {object} matchStep - match step configuration; a step with a "scoring" property is treated
   *                             as the legacy format and converted with updateMatchOptions
   * @param {object} stepContext - stored on the instance as-is
   */
  constructor(matchStep, stepContext) {
    // update the match step if using the legacy format
    this.isLegacyStep = false;
    if (matchStep.scoring) {
      // keep the original step (JSON and XML form) so custom legacy algorithms can still receive it
      this.legacyStep = matchStep;
      this.legacyStepXML = matchingXqy.xmlOptionsFromJson(xdmp.toJSON(matchStep));
      const updateMatchOptions = hubUtils.requireFunction("/data-hub/data-services/mastering/updateMatchOptionsLib.mjs", "updateMatchOptions");
      this.matchStep = updateMatchOptions(matchStep);
      this.isLegacyStep = true;
    } else {
      this.matchStep = matchStep;
    }
    this.matchStepNode = xdmp.toJSON(this.matchStep).root;
    this.stepContext = stepContext;
    const targetEntityType = this.matchStep.targetEntityType;
    // the generic model's collection is the last path segment of the target entity type
    this._genericModel = new common.GenericMatchModel(this.matchStep, {collection: targetEntityType ? targetEntityType.substring(targetEntityType.lastIndexOf("/") + 1):null});
    if (targetEntityType) {
      this._model = getEntityModel(targetEntityType);
    }
    // fall back to the generic model when no entity model is available
    if (!this._model) {
      this._model = this._genericModel;
    }
    this._propertyQueryFunctions = new Map();
    this._cachedScores = new Map();
    this._cachedRulesetMatches = new Map();
    this._cachedRulesetScores = new Map();
  }

  /*
   * Returns a Model class instance that defines how match queries should be built
   * @return Model class instance
   * @since 5.8.0
   */
  model() {
    return this._model;
  }

  /*
   * Returns a cts.query to represent the entire set of documents that should be matched against.
   * When the model's instance query overlaps any temporal collection, the query is additionally
   * restricted to the "latest" collection. Baseline query interceptors are applied to the result.
   * @return cts.query
   * @since 5.8.0
   */
  baselineQuery() {
    if (!this._baselineQuery) {
      let firstBaseline = this._model.instanceQuery();
      const temporalCollections = temporalLib.getTemporalCollections().toArray();
      const includesTemporalDocuments = temporalCollections.some(collection => cts.exists(cts.andQuery([firstBaseline, cts.collectionQuery(collection)])));
      if (includesTemporalDocuments) {
        firstBaseline = cts.andQuery([firstBaseline, cts.collectionQuery("latest")]);
      }
      this._baselineQuery = common.applyInterceptors("Baseline Query Interceptor", firstBaseline, this.matchStep.baselineQueryInterceptors);
      if (matchingTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `Initializing the baseline match query: ${xdmp.describe(this._baselineQuery, Sequence.from([]), Sequence.from([]))}`);
      }
    }
    return this._baselineQuery;
  }

  /*
   * Returns an array of MatchRulesetDefinition class instances that describe the rule sets for matching,
   * ordered by descending weight.
   * @return []MatchRulesetDefinition
   * @since 5.8.0
   */
  matchRulesetDefinitions() {
    if (!this._matchRulesetDefinitions) {
      this._matchRulesetDefinitions = this.matchStepNode.xpath("matchRulesets").toArray()
        .map((rulesetNode) => new MatchRulesetDefinition(rulesetNode, this))
        .sort((a, b) => b.weight() - a.weight());
      if (matchingTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `Initializing the match ruleset definitions: ${xdmp.toJsonString(this._matchRulesetDefinitions.map((def) => def.raw()))}`);
      }
    }
    return this._matchRulesetDefinitions;
  }

  /*
   * Returns an array of ThresholdDefinition class instances that describe the thresholds matches can be grouped into,
   * ordered by ascending score. A step without thresholds yields an empty array.
   * @return []ThresholdDefinition
   * @since 5.8.0
   */
  thresholdDefinitions() {
    if (!this._thresholdDefinitions) {
      const configured = this.matchStep.thresholds;
      // The list is either nested under a "threshold" property or is the "thresholds" value itself.
      // Default to an empty list so a step without thresholds does not fail on .map().
      const thresholds = (configured && configured.threshold ? configured.threshold : configured) || [];
      this._thresholdDefinitions = thresholds.map((thresholdObj) => {
        return new ThresholdDefinition(thresholdObj, this);
      }).sort((a, b) => a.score() - b.score());
      if (matchingTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `Initializing the threshold definitions: ${xdmp.toJsonString(this._thresholdDefinitions.map((def) => def.raw()))}`);
      }
    }
    return this._thresholdDefinitions;
  }

  /*
   * Returns a cts.query that filters out documents that shouldn't match with an individual document. The default is to
   * return a cts.andNotQuery(cts.query(matchStep.filterQuery), cts.documentQuery([selfDocURI, documentsBlockedByUnmerge]))
   * Filter query interceptors are applied to the result, which may be undefined when the step has no
   * filter query and the document has no URI.
   * @param {Node} documentNode
   * @return cts.query
   * @since 5.8.0
   */
  filterQuery(documentNode) {
    let filterQuery, notDocumentQuery, stepFilterQuery;
    const uri = xdmp.nodeUri(documentNode);
    const getBlocks = hubUtils.requireFunction("/com.marklogic.smart-mastering/matcher-impl/blocks-impl.xqy", "getBlocks");
    // exclude the document itself plus whatever getBlocks reports for it
    const excludeDocuments = fn.exists(uri) ? Sequence.from([uri, Sequence.from(getBlocks(uri))]): null;
    if (fn.exists(excludeDocuments)) {
      notDocumentQuery = cts.documentQuery(excludeDocuments.toArray());
    }
    if (this.matchStep.filterQuery) {
      stepFilterQuery = this.matchStep.filterQuery instanceof cts.query ?
        this.matchStep.filterQuery : cts.query(this.matchStep.filterQuery);
    }
    if (stepFilterQuery && notDocumentQuery) {
      filterQuery = cts.andNotQuery(stepFilterQuery, notDocumentQuery);
    } else if (notDocumentQuery) {
      filterQuery = cts.notQuery(notDocumentQuery);
    } else if (stepFilterQuery) {
      filterQuery = stepFilterQuery;
    }
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `Base filter query set to ${xdmp.describe(filterQuery, Sequence.from([]), Sequence.from([]))} for ${xdmp.describe(documentNode, Sequence.from([]), Sequence.from([]))}`);
    }
    return common.applyInterceptors("Filter Query Interceptor", filterQuery, this.matchStep.filterQueryInterceptors, documentNode);
  }

  /*
   * Returns a score in the form of a double of 2 documents. Every ruleset with a non-zero score
   * contributes its absolute score (negated for "reduce" rulesets) and is recorded on
   * contentObjectB.matchedRulesets. Score document interceptors are applied to the sum.
   * Results are cached per unordered URI pair.
   * @param {ContentObject} contentObjectA
   * @param {ContentObject} contentObjectB
   * @return double
   * @since 5.8.0
   */
  scoreDocument(contentObjectA, contentObjectB) {
    // sort the URIs so (A, B) and (B, A) share one cache entry
    const scoreKey = [contentObjectA.uri, contentObjectB.uri].sort().join(":");
    if (this._cachedScores.has(scoreKey)) {
      contentObjectB.matchedRulesets = this._cachedRulesetMatches.get(scoreKey);
      return this._cachedScores.get(scoreKey);
    }
    const matchingRulesetDefinitions = this.matchRulesetDefinitions();
    let defaultScore = 0;
    contentObjectB.matchedRulesets = [];
    for (const matchRuleset of matchingRulesetDefinitions) {
      let rulesetScore = matchRuleset.score(contentObjectA, contentObjectB);
      if (matchingDebugTraceEnabled) {
        xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Applying rule ${matchRuleset.name()} for ${xdmp.describe(contentObjectA.value, Sequence.from([]), Sequence.from([]))} and ${xdmp.describe(contentObjectB.value, Sequence.from([]), Sequence.from([]))} with score ${rulesetScore}`);
      }
      if (rulesetScore !== 0) {
        if (matchRuleset.reduce()) {
          if (matchingDebugTraceEnabled) {
            xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Decreasing score by ${rulesetScore} for ${xdmp.describe(contentObjectA.value, Sequence.from([]), Sequence.from([]))} and ${xdmp.describe(contentObjectB.value, Sequence.from([]), Sequence.from([]))}`);
          }
          rulesetScore = -Math.abs(rulesetScore);
        } else {
          if (matchingDebugTraceEnabled) {
            xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Increasing score by ${rulesetScore} for ${xdmp.describe(contentObjectA.value, Sequence.from([]), Sequence.from([]))} and ${xdmp.describe(contentObjectB.value, Sequence.from([]), Sequence.from([]))}`);
          }
          rulesetScore = Math.abs(rulesetScore);
        }
        defaultScore += rulesetScore;
        contentObjectB.matchedRulesets.push({rulesetName: matchRuleset.name(), rulesetScore});
      }
    }
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `Base score set to ${defaultScore} for ${xdmp.describe(contentObjectA.value, Sequence.from([]), Sequence.from([]))} and ${xdmp.describe(contentObjectB.value, Sequence.from([]), Sequence.from([]))}`);
    }
    const results = common.applyInterceptors("Score Document Interceptor", defaultScore, this.matchStep.scoreDocumentInterceptors, contentObjectA, contentObjectB, matchingRulesetDefinitions);
    this._cachedRulesetMatches.set(scoreKey, contentObjectB.matchedRulesets);
    this._cachedScores.set(scoreKey, results);
    return results;
  }

  /*
   * Instance-level convenience that delegates to the module-level buildActionDetails function.
   * @param {[]ContentObject} matchingDocumentSet
   * @param {ThresholdDefinition} thresholdBucket
   * @return {}
   */
  buildActionDetails(matchingDocumentSet, thresholdBucket) {
    return buildActionDetails(matchingDocumentSet, thresholdBucket);
  }

  /*
   * Returns a query given a property path a set of values based on the model associated with Matchable.
   * This will likely be reused and so can be moved to a common at some point.
   * Values that are all cts.query instances are returned unchanged. Otherwise a query-building
   * function is created once per property path (range query when indexes exist, value query otherwise)
   * and reused for later calls.
   * @param {string} propertyPath
   * @param {item*} values
   * @return {cts.query}
   * @since 5.8.0
   */
  propertyQuery(propertyPath, values) {
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `Property query for ${propertyPath} with values ${xdmp.describe(values, Sequence.from([]), Sequence.from([]))}`);
    }
    const valuesAreQueries = hubUtils.normalizeToArray(values).every(val => val instanceof cts.query);
    if (matchingDebugTraceEnabled) {
      xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Property query for ${propertyPath} has query values? ${valuesAreQueries}`);
    }
    if (valuesAreQueries) {
      return values;
    }
    if (!this._propertyQueryFunctions.has(propertyPath)) {
      let indexes = this.model().propertyIndexes(propertyPath);
      // when the model reports no indexes, ask the generic model as well
      if (this._genericModel && !(indexes && indexes.length)) {
        indexes = this._genericModel.propertyIndexes(propertyPath);
      }
      if (indexes && indexes.length) {
        const scalarType = cts.referenceScalarType(indexes[0]);
        const collation = (scalarType === "string") ? cts.referenceCollation(indexes[0]) : null;
        this._propertyQueryFunctions.set(propertyPath, (values) => {
          let typedValues = [];
          // only values castable to the index's scalar type can take part in the range query
          for (const value of values) {
            if (xdmp.castableAs("http://www.w3.org/2001/XMLSchema", scalarType, value)) {
              typedValues.push(xs[scalarType](value));
            }
          }
          // NOTE(review): when nothing is castable the raw values (not a query) are returned — confirm callers expect this
          if (typedValues.length === 0) {
            return values;
          }
          if (collation) {
            // string indexes: expand each value to the index values matching it case-insensitively
            const extendedValues = [];
            for (const value of typedValues) {
              extendedValues.push(cts.valueMatch(indexes, value, ["case-insensitive", "score-zero", "concurrent", "eager"]));
            }
            typedValues = Sequence.from(extendedValues);
          }
          return cts.rangeQuery(indexes, "=", typedValues, ["score-function=zero"]);
        });
      } else {
        // no indexes: build scoped value queries from the definition of every step along the dotted path
        const pathParts = propertyPath.split(".").filter((path) => path);
        const propertyDefinitions = pathParts
          .map((propertyPart, index) => this.model().propertyDefinition(pathParts.slice(0, index + 1).join(".")));
        if (matchingDebugTraceEnabled) {
          xdmp.trace(matchingTraceEvent, `Property for ${propertyPath} has property definitions ${xdmp.describe(propertyDefinitions, Sequence.from([]), Sequence.from([]))}`);
        }
        this._propertyQueryFunctions.set(propertyPath, (values) => {
          const propertyQuery = this._buildQueryFromPropertyDefinitionsAndValues(propertyDefinitions, values);
          if (matchingTraceEnabled) {
            xdmp.trace(matchingTraceEvent, `Property query for ${propertyPath} is ${xdmp.describe(propertyQuery, Sequence.from([]), Sequence.from([]))}`);
          }
          return propertyQuery;
        });
      }
    }
    return this._propertyQueryFunctions.get(propertyPath)(values);
  }

  /*
   * Returns a query given an XPath and a set of values based on the model associated with Matchable.
   * This will likely be reused and so can be moved to a common at some point.
   * @param {string} XPath
   * @param {item*} values
   * @param {object} localNamespaces - namespace bindings passed to common.propertyDefinitionsFromXPath
   * @return {cts.query}
   * @since 5.8.0
   */
  xpathQuery(xpath, values, localNamespaces = {}) {
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `XPath query for ${xpath} with values ${xdmp.describe(values, Sequence.from([]), Sequence.from([]))}`);
    }
    const propertyDefinitions = common.propertyDefinitionsFromXPath(xpath, localNamespaces);
    const xpathQuery = this._buildQueryFromPropertyDefinitionsAndValues(propertyDefinitions, values);
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `XPath query for ${xpath} is ${xdmp.describe(xpathQuery, Sequence.from([]), Sequence.from([]))}`);
    }
    return xpathQuery;
  }

  /*
   * Returns the max scan size, taken from matchStep.tuning.maxScan and defaulting to 500.
   * @return {integer}
   * @since 5.8.0
   */
  maxScan() {
    if (!this._maxScan) {
      this._maxScan = fn.number((this.matchStep.tuning && this.matchStep.tuning.maxScan) || 500);
    }
    return this._maxScan;
  }

  /*
   * Builds a value query for the last property definition and wraps it in a scope query for each
   * parent definition that has a localname (innermost first). JSON property queries are used when
   * matchStep.dataFormat is "json", element queries otherwise.
   * Returns the values untouched when they are a function or consist solely of cts.query instances,
   * and null when no non-blank value remains.
   * @param {[]object} propertyDefinitions - objects carrying localname (and namespace for XML)
   * @param {item*} values
   * @return {cts.query}
   */
  _buildQueryFromPropertyDefinitionsAndValues(propertyDefinitions, values) {
    if (values instanceof Function) return values;
    const lastPropertyDefinitionIndex = propertyDefinitions.length - 1;
    const lastPropertyDefinition = propertyDefinitions[lastPropertyDefinitionIndex];
    // reversed so that reduce() below wraps from the innermost parent outwards
    const parentPropertyDefinitions = propertyDefinitions.slice(0, lastPropertyDefinitionIndex).reverse();
    const localname = lastPropertyDefinition.localname;
    // drop values whose string form is blank; queries are always kept
    const valuesArray = hubUtils.normalizeToArray(values).filter(val => val instanceof cts.query || fn.normalizeSpace(fn.string(val)));
    if (valuesArray.length === 0) {
      return null;
    }
    const queryValues = valuesArray.filter(val => (val instanceof cts.query));
    if (queryValues.length === valuesArray.length) {
      return values;
    }
    let atomicValues = valuesArray.filter(val => !(val instanceof cts.query));
    if (matchingDebugTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `Parent property definitions for ${localname}: ${JSON.stringify(parentPropertyDefinitions, null, 2)}.`);
      xdmp.trace(matchingTraceEvent, `Query values for ${localname}: ${xdmp.describe(queryValues, Sequence.from([]), Sequence.from([]))}.`);
      xdmp.trace(matchingTraceEvent, `Atomic values for ${localname}: ${xdmp.describe(atomicValues, Sequence.from([]), Sequence.from([]))}.`);
    }
    if (this.matchStep.dataFormat === "json") {
      const lastQuery = groupQueries(queryValues.concat(atomicValues.length ? [cts.jsonPropertyValueQuery(localname, atomicValues, ["case-insensitive"])]: []), cts.orQuery);
      return parentPropertyDefinitions.reduce((acc, propertyDef) => propertyDef.localname ? cts.jsonPropertyScopeQuery(propertyDef.localname, acc) : acc, lastQuery);
    } else {
      atomicValues = atomicValues.map((val) => fn.string(val));
      const lastQuery = groupQueries(queryValues.concat(atomicValues.length ? [cts.elementValueQuery(fn.QName(lastPropertyDefinition.namespace, localname), atomicValues, ["case-insensitive"])]: []), cts.orQuery);
      return parentPropertyDefinitions.reduce((acc, propertyDef) => propertyDef.localname ? cts.elementQuery(fn.QName(propertyDef.namespace, propertyDef.localname), acc) : acc, lastQuery);
    }
  }
}
// Module-level cache of extracted property values, keyed by document URI, exclusion list names and property path.
const cachedPropertyValues = new Map();

/*
 * Wraps a single match ruleset of a match step: resolves its exclusion lists, builds the cts.query
 * a document contributes for the ruleset and scores pairs of documents against it.
 */
class MatchRulesetDefinition {
  /*
   * @param {Node} matchRulesetNode - node form of the ruleset; its object form is read via toObject()
   * @param {Matchable} matchable - owning Matchable, used for its model, query builders and caches
   */
  constructor(matchRulesetNode, matchable) {
    this.matchRulesetNode = matchRulesetNode;
    this.matchRulesNodes = this.matchRulesetNode.xpath("matchRules");
    this.matchRuleset = matchRulesetNode.toObject();
    this.matchable = matchable;
    // unique id so score cache keys of different rulesets never collide
    this._id = sem.uuidString();
    this._cachedCtsQueries = new Map();
    this._cachedQueryHashes = new Map();
    this._cachedMatchFunctionResults = new Map();
    this.exclusionListNames = [];
    this.excludedValues = new Set();
    if (this.matchRuleset.exclusionLists && this.matchRuleset.exclusionLists.length > 0) {
      this.exclusionListNames = this.exclusionListNames.concat(this.matchRuleset.exclusionLists);
    }
    if (this.matchRuleset.matchRules) {
      // exclusion lists may also be declared on individual match rules
      for (const matchRule of this.matchRuleset.matchRules) {
        if (matchRule.exclusionLists) {
          this.exclusionListNames = this.exclusionListNames.concat(matchRule.exclusionLists);
        }
      }
      if (this.exclusionListNames && this.exclusionListNames.length > 0) {
        for (const excListName of this.exclusionListNames) {
          const excList = core.getArtifact("exclusionList", excListName);
          for (const value of excList.values) {
            this.excludedValues.add(String(value));
          }
        }
      }
    }
    if (matchingTraceEnabled) {
      xdmp.trace(matchingTraceEvent, `Excluded values: ${xdmp.toJsonString([...this.excludedValues])}`);
    }
  }

  /* Returns the ruleset's name. */
  name() {
    return this.matchRuleset.name;
  }

  /* Returns true when the ruleset is flagged as "reduce". */
  reduce() {
    return !!this.matchRuleset.reduce;
  }

  /* Returns the ruleset's match rules in object form. */
  matchRules() {
    return this.matchRuleset.matchRules;
  }

  /*
   * Makes sure the match rule object carries its node form, looked up by 1-based position among
   * the ruleset's "matchRules" nodes.
   */
  _ensureMatchRuleNode(matchRule, position) {
    if (!matchRule.node) {
      matchRule.node = fn.head(fn.subsequence(this.matchRulesNodes, position, 1));
    }
  }

  /*
   * Returns (creating and memoizing on the rule if needed) a function that extracts the rule's values
   * from a content object, via the rule's documentXPath or the model's property values, with
   * excluded values removed. Extracted values are cached in the module-level cache.
   * @param {object} matchRule
   * @param {Model} model
   * @return {function(ContentObject): item*}
   */
  _valueFunction(matchRule, model) {
    if (!matchRule._valueFunction) {
      const pathKey = matchRule.documentXPath || matchRule.entityPropertyPath;
      // Sort a copy once: the names are part of the cache key, but the shared array must not be
      // reordered as a side effect of every lookup.
      const exclusionKey = [...this.exclusionListNames].sort().join(":");
      matchRule._valueFunction = (contentObject) => {
        const key = `${contentObject.uri}:${exclusionKey}:${pathKey}`;
        if (!cachedPropertyValues.has(key)) {
          let values;
          if (matchRule.documentXPath) {
            values = contentObject.value.xpath(matchRule.documentXPath, matchRule.namespaces);
          } else {
            values = model.propertyValues(matchRule.entityPropertyPath, contentObject.value);
          }
          // based on the exclusion lists, remove certain values from querying
          if (this.excludedValues && this.excludedValues.size > 0) {
            if (matchingDebugTraceEnabled) {
              xdmp.trace(matchingTraceEvent, `Filtering out the following values: ${xdmp.toJsonString([...this.excludedValues])}.`);
            }
            values = Sequence.from(values.toArray().filter(val => !this.excludedValues.has(String(val))));
            if (matchingDebugTraceEnabled) {
              xdmp.trace(matchingTraceEvent, `Final values for query: ${xdmp.toJsonString(values)}.`);
            }
          }
          cachedPropertyValues.set(key, values);
        }
        return cachedPropertyValues.get(key);
      };
    }
    return matchRule._valueFunction;
  }

  /*
   * Expands values with the terms and synonym terms of matching thesaurus entries.
   * When options.filter is set, only entries containing a node deep-equal to the filter are used.
   * @param {item*} value
   * @param {Node} passMatchRule - match rule node; options.thesaurusURI and options.filter are read
   * @return {[]string} distinct terms
   */
  synonymMatchFunction(value, passMatchRule) {
    let matchRule = passMatchRule.toObject();
    let thesaurus = matchRule.options.thesaurusURI;
    let expandOptions = hubUtils.normalizeToArray(value).map((val) => fn.string(val).toLowerCase());
    const queryLookup = hubUtils.requireFunction("/MarkLogic/thesaurus.xqy", "queryLookup");
    let entries = queryLookup(thesaurus, cts.elementValueQuery(fn.QName("http://marklogic.com/xdmp/thesaurus", "term"), expandOptions, "case-insensitive"), "elements");
    let options = matchRule.options;
    let allEntries = [];
    let filterNode;
    //check if filter is present in options
    if (fn.exists(options.filter)) {
      filterNode = xdmp.unquote(options.filter);
    }
    for (const entry of entries) {
      let meetsQualifier = false;
      if (filterNode) {
        for (let node of entry.xpath(".//*")) {
          //comparing each node of entry with filterNode
          if (fn.deepEqual(node, fn.head(filterNode).root)) {
            meetsQualifier = true;
            break;
          }
        }
      } else {
        meetsQualifier = true;
      }
      if (meetsQualifier) {
        for (let syn of entry.xpath("(thsr:term|thsr:synonym/thsr:term)", {thsr: "http://marklogic.com/xdmp/thesaurus"})) {
          allEntries.push(fn.string(syn));
        }
      }
    }
    //returning unique values of all matching entries
    return Array.from(new Set(allEntries));
  }

  /*
   * Expands values with the spelling suggestions from the rule's dictionary.
   * Failures are reported through httpUtils.throwNotFound.
   * @param {item*} value
   * @param {Node} passMatchRule - match rule node; options.dictionaryURI and options.distanceThreshold
   *                               (default 100) are read
   * @return {Sequence}
   */
  doubleMetaphoneMatchFunction(value, passMatchRule) {
    let matchRule = passMatchRule.toObject();
    let dictionary = matchRule.options.dictionaryURI;
    let spellOption = {
      distanceThreshold: matchRule.options.distanceThreshold || 100
    };
    let results;
    try {
      const suggest = hubUtils.requireFunction("/MarkLogic/spell.xqy", "suggest");
      results = hubUtils.normalizeToArray(value).map((val) => suggest(dictionary, fn.string(val), spellOption));
    } catch (e) {
      // Not every error carries "data"; dereferencing it blindly would mask the original failure.
      const details = (e.data !== undefined && e.data !== null) ? ": " + e.data.toString() : "";
      httpUtils.throwNotFound(e.message + details);
    }
    return Sequence.from(results);
  }

  /*
   * Expands the first zip value: a 5-character zip additionally gets the wildcard form "<zip>-*",
   * any other non-empty zip additionally gets its first 5 characters.
   * @param {item*} value
   * @param {Node} passMatchRule - unused
   * @return {[]} the original first value followed by the derived value, if any
   */
  zipMatchFunction(value, passMatchRule) {
    const node = fn.head(hubUtils.normalizeToSequence(value));
    const result = [node];
    if (!node) {
      return result;
    }
    // Measure the string value: the head of the sequence is not necessarily a JavaScript string,
    // in which case it has no usable "length" of its own.
    const zip = fn.string(node);
    if (zip.length === 5) {
      result.push(zip + "-*");
    } else {
      result.push(zip.substring(0, 5));
    }
    return result;
  }

  /*
   * Returns (creating and memoizing on the rule if needed) a function (uri, values) that runs the
   * rule's match algorithm and turns its result into a query via the Matchable's property or XPath
   * query builder. Results are cached per URI, path and match type.
   * Reports a bad request through httpUtils.throwBadRequest for an unknown match type.
   * @param {object} matchRule
   * @param {Model} model - unused
   * @return {function(string, item*)}
   */
  _matchFunction(matchRule, model) {
    if (!matchRule._matchFunction) {
      let passMatchRule = matchRule.node;
      let passMatchStep = this.matchable.matchStepNode;
      let matchFunction;
      let propertyQueryFunction = (values) => this.matchable.propertyQuery(matchRule.entityPropertyPath, values);
      if (matchRule.documentXPath) {
        propertyQueryFunction = (values) => this.matchable.xpathQuery(matchRule.documentXPath, values, matchRule.namespaces);
      }
      switch (matchRule.matchType) {
      case "exact":
        matchFunction = propertyQueryFunction;
        break;
      case "doubleMetaphone":
        matchFunction = this.doubleMetaphoneMatchFunction;
        break;
      case "synonym":
        matchFunction = this.synonymMatchFunction;
        break;
      case "zip":
        matchFunction = this.zipMatchFunction;
        break;
      case "custom":
        matchFunction = hubUtils.requireFunction(matchRule.algorithmModulePath, matchRule.algorithmFunction);
        if (this.matchable.isLegacyStep) {
          // legacy custom algorithms receive the legacy step and rule, as XML for XQuery modules
          const isXQuery = fn.endsWith(matchRule.algorithmModulePath, ".xqy");
          passMatchStep = isXQuery ? this.matchable.legacyStepXML: this.matchable.legacyStep;
          let algorithms = this.matchable.legacyStep.algorithms;
          if (algorithms.algorithm) {
            algorithms = algorithms.algorithm;
          }
          const legacyRuleJson = this.matchable.legacyStep.scoring.expand.find(rule => {
            const algorithm = algorithms.find(a => a.name === rule.algorithmRef);
            // rules whose algorithmRef is not a declared algorithm cannot be the custom rule; skip them
            return !!algorithm
              && (rule.propertyName === matchRule.entityPropertyPath || rule.propertyName === matchRule.documentXPath)
              && algorithm.at === matchRule.algorithmModulePath && algorithm.function === matchRule.algorithmFunction;
          });
          passMatchRule = isXQuery ? matchingXqy.convertQuickStartMatchRuleForXqueryModule(legacyRuleJson): legacyRuleJson;
        }
        break;
      default:
        httpUtils.throwBadRequest(`Undefined match type "${matchRule.matchType}" provided.`);
      }
      const pathKey = matchRule.documentXPath || matchRule.entityPropertyPath;
      const matchTypeKey = matchRule.matchType === "custom" ? `${matchRule.algorithmModulePath}:${matchRule.algorithmFunction}`:matchRule.matchType;
      matchRule._matchFunction = (uri, values) => {
        const cacheKey = `${uri}:${pathKey}:${matchTypeKey}`;
        if (this._cachedMatchFunctionResults.has(cacheKey)) {
          return this._cachedMatchFunctionResults.get(cacheKey);
        }
        let results = matchFunction(values, passMatchRule, passMatchStep);
        if (results) {
          results = propertyQueryFunction(results);
        }
        this._cachedMatchFunctionResults.set(cacheKey, results);
        return results;
      };
    }
    return matchRule._matchFunction;
  }

  /*
   * Scores contentObjectB against the query built from contentObjectA: the ruleset's weight when
   * the query (and, for fuzzy rulesets, the query hashes) match and every function-valued rule
   * accepts contentObjectB, otherwise 0. Results are cached on the Matchable per ruleset and
   * unordered URI pair.
   * @param {ContentObject} contentObjectA
   * @param {ContentObject} contentObjectB
   * @return {number}
   */
  score(contentObjectA, contentObjectB) {
    const cacheKey = `${this._id}:${[contentObjectA.uri, contentObjectB.uri].sort().join(":")}`;
    const cachedMatchRuleScores = this.matchable._cachedRulesetScores;
    if (cachedMatchRuleScores.has(cacheKey)) {
      if (matchingDebugTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `Using cached score. key:${cacheKey}, score:${cachedMatchRuleScores.get(cacheKey)}.`);
      }
      return cachedMatchRuleScores.get(cacheKey);
    }
    const query = this.buildCtsQuery(contentObjectA);
    const isFuzzyMatch = this.fuzzyMatch();
    // hashes are only compared for fuzzy rulesets
    const hashesA = Sequence.from(isFuzzyMatch ? this.queryHashes(contentObjectA): []);
    const hashesB = Sequence.from(isFuzzyMatch ? this.queryHashes(contentObjectB): []);
    const queryMatches = matchingXqy.queryMatchScore(
      contentObjectB.value,
      query,
      hashesA,
      hashesB);
    let matchScore = 0;
    if (queryMatches) {
      let pos = 1;
      const model = this.matchable.model();
      // match functions may yield a predicate function instead of a query; those are evaluated here
      const matchRuleFunctions = [];
      for (const matchRule of this.matchRuleset.matchRules) {
        this._ensureMatchRuleNode(matchRule, pos);
        pos++;
        const valueFunction = this._valueFunction(matchRule, model);
        const matchFunction = this._matchFunction(matchRule, model);
        const values = valueFunction(contentObjectA);
        const ruleQuery = (values !== null && fn.exists(values)) ? matchFunction(contentObjectA.uri, values) : null;
        if (ruleQuery instanceof Function) {
          matchRuleFunctions.push(ruleQuery);
        }
      }
      if (matchRuleFunctions.length === 0 || matchRuleFunctions.every((rule) => rule(contentObjectB.value))) {
        if (matchingDebugTraceEnabled) {
          xdmp.trace(consts.TRACE_MATCHING_DEBUG, "Query matched!");
        }
        matchScore = this.weight();
      }
    }
    if (matchingDebugTraceEnabled) {
      xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Scoring ${xdmp.describe(contentObjectA.value)} with ${xdmp.describe(contentObjectB.value)} using cts.query: ${xdmp.describe(query, Sequence.from([]), Sequence.from([]))}.`);
    }
    if (matchingDebugTraceEnabled && matchScore === 0) {
      xdmp.trace(consts.TRACE_MATCHING_DEBUG, "Query didn't match!");
    }
    cachedMatchRuleScores.set(cacheKey, matchScore);
    return matchScore;
  }

  /*
   * Builds (and caches per URI) the optimized and-query of all match rules of this ruleset for the
   * given document. Returns null as soon as one rule yields no query; function-valued rule results
   * are left out of the query.
   * @param {ContentObject} contentObject
   * @return {cts.query}
   */
  buildCtsQuery(contentObject) {
    const uri = contentObject.uri;
    if (!this._cachedCtsQueries.has(uri)) {
      const documentNode = contentObject.value;
      if (matchingTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `Building cts.query for ${xdmp.describe(documentNode)} with match ruleset ${this.name()}`);
      }
      const queries = [];
      const model = this.matchable.model();
      let pos = 1;
      for (const matchRule of this.matchRuleset.matchRules) {
        this._ensureMatchRuleNode(matchRule, pos);
        pos++;
        const valueFunction = this._valueFunction(matchRule, model);
        const matchFunction = this._matchFunction(matchRule, model);
        const values = valueFunction(contentObject);
        if (matchingDebugTraceEnabled) {
          xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Values for ${xdmp.describe(documentNode)} with match rule ${xdmp.toJsonString(matchRule)} are ${xdmp.describe(values, Sequence.from([]), Sequence.from([]))}`);
        }
        const query = fn.exists(values) ? matchFunction(uri, values) : null;
        if (matchingDebugTraceEnabled) {
          xdmp.trace(consts.TRACE_MATCHING_DEBUG, `Query for ${xdmp.describe(documentNode)} with match rule ${xdmp.toJsonString(matchRule)} are ${xdmp.describe(query, Sequence.from([]), Sequence.from([]))}`);
        }
        // every rule of a ruleset must contribute, so a missing query means the ruleset cannot match
        if (query === null || fn.empty(query)) {
          return null;
        }
        if (query && !(query instanceof Function)) {
          queries.push(query);
        }
      }
      if (matchingDebugTraceEnabled) {
        xdmp.trace(consts.TRACE_MATCHING_DEBUG, `cts.query for ${xdmp.describe(documentNode)} with match ruleset ${this.name()} before optimization is ${xdmp.describe(queries)}`);
      }
      const optimizedCtsQuery = optimizeCtsQueries(groupQueries(queries, cts.andQuery));
      if (matchingTraceEnabled) {
        xdmp.trace(matchingTraceEvent, `cts.query for ${xdmp.describe(documentNode)} with match ruleset ${this.name()} returned ${xdmp.describe(optimizedCtsQuery, Sequence.from([]), Sequence.from([]))}`);
      }
      this._cachedCtsQueries.set(uri, optimizedCtsQuery);
    }
    return this._cachedCtsQueries.get(uri);
  }

  /*
   * Returns (and caches per URI) the hashes derived from the document's cts.query for this ruleset;
   * an empty array when the document yields no query.
   * @param {ContentObject} contentObject
   * @return {[]}
   */
  queryHashes(contentObject) {
    const uri = contentObject.uri;
    if (!this._cachedQueryHashes.has(uri)) {
      const query = this.buildCtsQuery(contentObject);
      let hashes = [];
      if (query) {
        hashes = [...matchingXqy.queryToHashes(query, this.fuzzyMatch())];
      }
      this._cachedQueryHashes.set(uri, hashes);
    }
    return this._cachedQueryHashes.get(uri);
  }

  /* Returns the ruleset's weight converted with fn.number. */
  weight() {
    return fn.number(this.matchRuleset.weight);
  }

  /* Returns true when the ruleset is flagged as "fuzzyMatch". */
  fuzzyMatch() {
    return !!this.matchRuleset.fuzzyMatch;
  }

  /* Returns the ruleset in plain object form. */
  raw() {
    return this.matchRuleset;
  }
}
/*
 * Wraps a single threshold of a match step and derives from it the ruleset combinations needed to
 * reach the threshold and the URI its action is stored under.
 */
export class ThresholdDefinition {
  /*
   * @param {object} threshold - threshold configuration (thresholdName, score, action, ...)
   * @param {Matchable} matchable - owning Matchable, used for its match ruleset definitions
   */
  constructor(threshold, matchable) {
    this.threshold = threshold;
    this.matchable = matchable;
  }

  /*
   * Returns a string that is the threshold's name.
   * @return {string}
   * @since 5.8.0
   */
  name() {
    return this.threshold.thresholdName;
  }

  /*
   * Returns a number that is the threshold's score.
   * @return {number}
   * @since 5.8.0
   */
  score() {
    return this.threshold.score;
  }

  // this is a helper function to find combinations of rulesets that match a threshold
  _otherCombinations(remainingRulesets = []) {
    const combinations = [];
    for (let i = 0; i < remainingRulesets.length; i++) {
      const ruleset = remainingRulesets[i];
      let combinationsStartingWith = [ruleset];
      let combinedWeight = ruleset.weight();
      const followingRulesets = remainingRulesets.slice(i + 1);
      for (const followingRuleset of followingRulesets) {
        combinedWeight = combinedWeight + followingRuleset.weight();
        combinationsStartingWith.push(followingRuleset);
        if (combinedWeight >= this.score()) {
          // add the combination to the list because the score is matched.
          combinations.push(combinationsStartingWith);
          // undo the last weight so we can see if later rulesets will also push us to the threshold
          combinationsStartingWith = [...combinationsStartingWith];
          combinationsStartingWith.pop();
          combinedWeight = combinedWeight - followingRuleset.weight();
        }
      }
    }
    return combinations;
  }

  /*
   * Returns an array of arrays with each sub-array being a minimum combination of rulesets needed to meet this threshold.
   * A threshold score of 0 is met by every ruleset on its own. Rulesets without a usable weight are
   * treated as sufficient on their own as well. The result is computed once and cached.
   * @return {[][]MatchRulesetDefinition}
   * @since 5.8.0
   */
  minimumMatchCombinations() {
    if (!this._minimumMatchCombinations) {
      const score = this.score();
      const matchingRulesetDefinitions = this.matchable.matchRulesetDefinitions();
      if (score === 0) {
        // keep the documented array-of-arrays shape: each ruleset is a combination of one
        this._minimumMatchCombinations = matchingRulesetDefinitions.map((rulesetDef) => [rulesetDef]);
      } else {
        const rulesetsGreaterOrEqualToScore = [];
        const rulesetsLessThanScore = [];
        for (const rulesetDef of matchingRulesetDefinitions) {
          const weight = rulesetDef.weight();
          // if weight is not specified, we assume the rule is important and custom scoring after retrieval will take place.
          // weight() is a numeric conversion, so a missing weight can also surface as NaN rather than undefined.
          const weightUnspecified = weight === undefined || weight === null || Number.isNaN(Number(weight));
          if (weightUnspecified || weight >= score) {
            rulesetsGreaterOrEqualToScore.push(rulesetDef);
          } else {
            rulesetsLessThanScore.push(rulesetDef);
          }
        }
        this._minimumMatchCombinations = rulesetsGreaterOrEqualToScore.map((rulesetDef) => [rulesetDef]);
        const lessCombinations = this._otherCombinations(rulesetsLessThanScore);
        this._minimumMatchCombinations = this._minimumMatchCombinations.concat(lessCombinations);
      }
    }
    return this._minimumMatchCombinations;
  }

  /*
   * Returns a string that indicates the threshold's action.
   * @return {string}
   * @since 5.8.0
   */
  action() {
    return this.threshold.action;
  }

  /*
   * Returns the MD5 of the documents' sorted URIs joined with "##", optionally prefixed by a salt.
   * @param {[]ContentObject} documentSet
   * @param {string} salt - optional value placed in front of the URIs
   * @return {string}
   */
  generateMD5Key(documentSet, salt) {
    const values = documentSet.map((contentObj) => contentObj.uri).sort();
    if (salt) {
      values.unshift(salt);
    }
    return xdmp.md5(values.join("##"));
  }

  /*
   * Returns the URI the action for the given document set is keyed on:
   * - "merge": the URI of an existing merged document in the set, otherwise a new merged URI built
   *   from the MD5 key and the extension of the first document's URI
   * - "notify": a notification URI built from the MD5 key salted with the threshold name
   * - anything else: the first document's URI
   * @param {[]ContentObject} matchingDocumentSet - expected to be non-empty
   * @return {string}
   */
  generateActionURI(matchingDocumentSet) {
    const action = this.action();
    const firstUri = matchingDocumentSet[0].uri;
    let key;
    switch (action) {
    case "merge": {
      const currentMergeDoc = matchingDocumentSet.find((contentObj) => fn.startsWith(contentObj.uri, "/com.marklogic.smart-mastering/merged/"));
      if (currentMergeDoc) {
        return currentMergeDoc.uri;
      }
      key = this.generateMD5Key(matchingDocumentSet);
      const format = firstUri.substring(firstUri.lastIndexOf(".") + 1);
      return `/com.marklogic.smart-mastering/merged/${key}.${format}`;
    }
    case "notify":
      key = this.generateMD5Key(matchingDocumentSet, this.name());
      return `/com.marklogic.smart-mastering/matcher/notifications/${key}.xml`;
    default:
      return firstUri;
    }
  }

  /* Returns the threshold configuration as given to the constructor. */
  raw() {
    return this.threshold;
  }

  /* Returns the function name of the custom action module. */
  actionModuleFunction() {
    return this.threshold.actionModuleFunction;
  }

  /* Returns the path of the custom action module. */
  actionModulePath() {
    return this.threshold.actionModulePath;
  }

  /* Returns the namespace of the custom action module. */
  actionModuleNamespace() {
    return this.threshold.actionModuleNamespace;
  }
}
// Default export bundling the module's predicates, classes and the action-details helper.
// MatchRulesetDefinition is only reachable through this object; it has no named export.
export default {
queryHashPredicate,
hashScorePredicate,
Matchable,
MatchRulesetDefinition,
ThresholdDefinition,
buildActionDetails
};
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy