All Downloads are FREE. Search and download functionalities are using the official Maven repository.

be.ugent.rml.Executor Maven / Gradle / Ivy

Go to download

The RMLMapper executes RML rules to generate high quality Linked Data from multiple originally (semi-)structured data sources.

The newest version!
package be.ugent.rml;

import be.ugent.idlab.knows.dataio.access.LocalFileAccess;
import be.ugent.idlab.knows.dataio.access.RemoteFileAccess;
import be.ugent.idlab.knows.dataio.record.Record;
import be.ugent.idlab.knows.functions.agent.Agent;
import be.ugent.knows.idlabFunctions.IDLabFunctions;
import be.ugent.rml.conformer.MappingConformer;
import be.ugent.rml.functions.MultipleRecordsFunctionExecutor;
import be.ugent.rml.metadata.Metadata;
import be.ugent.rml.metadata.MetadataGenerator;
import be.ugent.rml.records.MarkerRecord;
import be.ugent.rml.records.RecordsFactory;
import be.ugent.rml.store.Quad;
import be.ugent.rml.store.QuadStore;
import be.ugent.rml.store.RDF4JStore;
import be.ugent.rml.term.*;
import be.ugent.rml.termgenerator.TermGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.nio.file.NoSuchFileException;
import java.util.*;

public class Executor {

    private static final Logger logger = LoggerFactory.getLogger(Executor.class);

    private final Initializer initializer;
    private final MappingOptimizer mappingOptimizer;
    private final Map> recordsHolders = new HashMap<>();

    /*
     * this map stores for every Triples Map, which is a Term,
     * a map with the record index and the record's corresponding subject, which is a ProvenancedTerm.
     */
    private final Map>> subjectCache;
    private final QuadStore resultingQuads;
    private final QuadStore rmlStore;
    private final Map targetStores;
    private final RecordsFactory recordsFactory;
    private static int blankNodeCounter;
    private final Map mappings;

    /**
     * Indicates whether the data to process contains an End-of-File (EOF) marker.
     * If it is not provided, it will be automatically inserted when necessary.
     * Don't change unless you're generating LDES and know what you're doing!
     */
    private boolean EOFProvidedInData = false;

    public Executor(QuadStore rmlStore, RecordsFactory recordsFactory, String baseIRI, StrictMode strictMode, final Agent functionAgent) throws Exception {
        this(rmlStore, recordsFactory, null, baseIRI, strictMode, functionAgent);
    }

    /**
     * Defaults to best effort operation. For strict mode,
     * use {@link Executor#Executor(QuadStore, RecordsFactory, QuadStore, String, StrictMode, Agent)}
     */
    public Executor(QuadStore rmlStore, RecordsFactory recordsFactory, QuadStore resultingQuads, String baseIRI, final Agent functionAgent) throws Exception {
        this(rmlStore, recordsFactory, resultingQuads, baseIRI, StrictMode.BEST_EFFORT, functionAgent);
    }

    /**
     * Call this if the data to process contains a specific End-of-File (EOF) marker.
     * If it is not provided, it will be automatically inserted when necessary.
     * Don't change unless you're generating LDES and know what you're doing!
     */
    public void setEOFProvidedInData() {
        this.EOFProvidedInData = true;
    }

    public Executor(QuadStore rmlStore, RecordsFactory recordsFactory, QuadStore resultingQuads, String baseIRI, StrictMode strictMode, final Agent functionAgent) throws Exception {
        this(rmlStore, recordsFactory, resultingQuads, baseIRI, strictMode, functionAgent, null);
    }

    public Executor(QuadStore rmlStore, RecordsFactory recordsFactory, QuadStore resultingQuads, String baseIRI, StrictMode strictMode, final Agent functionAgent, Map mappingOptions) throws Exception {
        // Convert mapping file to RML if needed.
        MappingConformer conformer = new MappingConformer(rmlStore, mappingOptions);

        try {
            boolean conversionNeeded = conformer.conform();

            if (conversionNeeded) {
                logger.info("Conversion to RML was needed.");
            }
        } catch (Exception e) {
            logger.error("Failed to make mapping file conformant to RML spec.", e);
        }

        this.mappingOptimizer = new MappingOptimizer(rmlStore);
        this.rmlStore = mappingOptimizer.optimizeMapping();
        this.initializer = new Initializer(this.rmlStore, functionAgent, baseIRI, strictMode);
        this.mappings = this.initializer.getMappings();
        this.recordsFactory = recordsFactory;
        this.subjectCache = new HashMap<>();
        this.targetStores = new HashMap<>();
        Executor.blankNodeCounter = 0;

        // Default store if no Targets are available for a triple
        this.resultingQuads = Objects.requireNonNullElseGet(resultingQuads, RDF4JStore::new);

        // Output stores for Targets in Term Maps
        for (Map.Entry tm: this.mappings.entrySet()) {
            Mapping mapping = tm.getValue();

            // Subject Map
            MappingInfo subjectMapInfo = mapping.getSubjectMappingInfo();
            Set targets = new HashSet<>(subjectMapInfo.getTargets());

            // Predicate, Object and Language Maps
            for(PredicateObjectGraphMapping pog: mapping.getPredicateObjectGraphMappings()) {
                if(pog.getPredicateMappingInfo() != null) {
                    targets.addAll(pog.getPredicateMappingInfo().getTargets());
                }
                if(pog.getObjectMappingInfo() != null) {
                    targets.addAll(pog.getObjectMappingInfo().getTargets());
                }
                if(pog.getGraphMappingInfo() != null) {
                    targets.addAll(pog.getGraphMappingInfo().getTargets());
                }
            }

            // Graph Map (only subjectGraphMap...)
            for(MappingInfo g: mapping.getGraphMappingInfos()) {
                targets.addAll(g.getTargets());
            }

            // Create stores
            for (Term t: targets) {
                logger.debug("Adding target for {}", t);
                this.targetStores.put(t, new RDF4JStore());
            }
        }
    }

    /*
     * New public API for the V5.X.X. releases
     */
    public Map execute(List triplesMaps, boolean removeDuplicates, MetadataGenerator metadataGenerator) throws Exception {

        POGFunction pogFunction;

        if (metadataGenerator != null && metadataGenerator.getDetailLevel().getLevel() >= MetadataGenerator.DETAIL_LEVEL.TRIPLE.getLevel()) {
            pogFunction = (subject, predicate, object, graph, checkEOFMarker) -> {
                if (generateQuad(subject, predicate, object, graph , checkEOFMarker)) {
                    metadataGenerator.insertQuad(new ProvenancedQuad(subject, predicate, object, graph));
                }
            };
        } else {
            pogFunction = this::generateQuad;
        }

        return executeWithFunction(triplesMaps, removeDuplicates, pogFunction);
    }

    public Map executeWithFunction(List triplesMaps, boolean removeDuplicates, POGFunction pogFunction) throws Exception {
        //check if TriplesMaps are provided
        if (triplesMaps == null || triplesMaps.isEmpty()) {
            triplesMaps = this.getTriplesMaps();
        }

        //we execute every mapping
        for (Term triplesMap : triplesMaps) {
            Mapping mapping = this.mappings.get(triplesMap);

            List records = this.getRecords(triplesMap);

            for (int j = 0; j < records.size(); j++) {
                Record record = records.get(j);
                List subjects = getSubject(triplesMap, mapping, record, j);

                if (subjects != null) {
                    generatePredicateObjectsForSubjects(subjects, mapping, record, pogFunction, EOFProvidedInData);
                }
            }

            if (!EOFProvidedInData) {
                // Generate an EOF marker to indicate the end of the data source and run mappings once more (if not provided).
                // This is a hack to call implicitDelete a final time, where it then returns the list of deleted records
                TermGenerator generator = mapping.getSubjectMappingInfo().getTermGenerator();
                boolean needsEOFMarker = generator.needsEOFMarker();
                if (needsEOFMarker) {
                    Record record = new MarkerRecord();
                    List subjects = new ArrayList<>();
                    List nodes = generator.generate(record);

                    if (!nodes.isEmpty()) {
                        List subjectTargets = getAllTargets(mapping.getSubjectMappingInfo(), record);
                        for (Term node : nodes) {
                            subjects.add(new ProvenancedTerm(node, null, subjectTargets));
                        }
                    }
                    // TODO this only works for the constants in the triples map!
                    // TODO `record` is not really used, we only need to generate a subject here
                    generatePredicateObjectsForSubjects(subjects, mapping, null, pogFunction, true);
                }
            }
        }

        if (removeDuplicates) {
            this.resultingQuads.removeDuplicates();
        }

        // Add the legacy store to the list of targets as well
        this.targetStores.put(new NamedNode("rmlmapper://default.store"), this.resultingQuads);
        return this.targetStores;
    }

    public Map execute(List triplesMaps) throws Exception {
        return this.execute(triplesMaps, false, null);
    }

    private boolean generateQuad(ProvenancedTerm subject, ProvenancedTerm predicate, ProvenancedTerm object, ProvenancedTerm graph, boolean checkEOFMarker) {
        Term g = null;
        Set targets = new HashSet<>();

        if (subject != null && predicate != null && object != null) {
            if (graph != null) {
                g = graph.getTerm();
                targets.addAll(graph.getTargets());
            }

            if (checkEOFMarker) {
                if (subject.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER)
                        || subject.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER_ENCODED)
                        || predicate.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER)
                        || predicate.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER_ENCODED)
                        || object.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER)
                        || object.getTerm().getValue().contains(IDLabFunctions.MAGIC_MARKER_ENCODED))
                    return false;

                if (g != null && (g.getValue().contains(IDLabFunctions.MAGIC_MARKER) || g.getValue().contains(IDLabFunctions.MAGIC_MARKER_ENCODED)))
                    return false;
            }

            // Get all possible targets for triple, the Set guarantees that we don't have duplicates
            targets.addAll(subject.getTargets());
            targets.addAll(predicate.getTargets());
            targets.addAll(object.getTargets());

            // If we have targets, write to them
            if (!targets.isEmpty()) {
                for (Term t: targets) {
                    this.targetStores.get(t).addQuad(subject.getTerm(), predicate.getTerm(), object.getTerm(), g);
                }
            }
            // If not, use the default processor target
            else {
                this.resultingQuads.addQuad(subject.getTerm(), predicate.getTerm(), object.getTerm(), g);
            }

            return true;
        }

        return false;
    }

    private List getIRIsWithConditions(Record record, Term triplesMap, List conditions) throws Exception {
        List goodIRIs = new ArrayList<>();
        List> allIRIs = new ArrayList<>();

        for (MultipleRecordsFunctionExecutor condition : conditions) {
            allIRIs.add(this.getIRIsWithTrueCondition(record, triplesMap, condition));
        }

        if (!allIRIs.isEmpty()) {
            goodIRIs.addAll(allIRIs.get(0));

            for(int i = 1; i < allIRIs.size(); i ++) {
                List list = allIRIs.get(i);

                for (int j = 0; j < goodIRIs.size(); j ++) {
                    if (!list.contains(goodIRIs.get(j))) {
                        goodIRIs.remove(j);
                        j --;
                    }
                }
            }
        }

        return goodIRIs;
    }

    private List getIRIsWithTrueCondition(Record child, Term triplesMap, MultipleRecordsFunctionExecutor condition) throws Exception {
        Mapping mapping = this.mappings.get(triplesMap);

        //iterator over all the records corresponding with @triplesMap
        List records = this.getRecords(triplesMap);
        //this array contains all the IRIs that are valid regarding @path and @values
        List iris = new ArrayList<>();

        for (int i = 0; i < records.size(); i++) {
            Record parent = records.get(i);

            Map recordsMap = new HashMap<>();
            recordsMap.put("child", child);
            recordsMap.put("parent", parent);

            Object expectedBoolean = condition.execute(recordsMap);

            if (Boolean.TRUE.equals(expectedBoolean)) {
                List subjects = this.getSubject(triplesMap, mapping, parent, i);
                if (subjects != null)
                    iris.addAll(subjects);
            } else {
                logger.warn("The used condition with the Parent Triples Map does not return a boolean.");
            }
        }

        return iris;
    }

    private List getSubject(Term triplesMap, Mapping mapping, Record record, int i) throws Exception {
        if (!this.subjectCache.containsKey(triplesMap)) {
            this.subjectCache.put(triplesMap, new HashMap<>());
        }

        if (!this.subjectCache.get(triplesMap).containsKey(i)) {
            TermGenerator generator = mapping.getSubjectMappingInfo().getTermGenerator();
            List nodes = generator.generate(record);

            if (!nodes.isEmpty()) {
                List subjectTargets = getAllTargets(mapping.getSubjectMappingInfo(), record);
                List terms = new ArrayList<>();
                Metadata meta = new Metadata(triplesMap, mapping.getSubjectMappingInfo().getTerm());

                // TODO: only create metadata when it's required
                for (Term node : nodes) {
                    terms.add(new ProvenancedTerm(node, meta, subjectTargets));
                }
                this.subjectCache.get(triplesMap).put(i, terms);
                return terms;
            }
        }

        return this.subjectCache.get(triplesMap).get(i);
    }

    private List getAllIRIs(Term triplesMap) throws Exception {
        Mapping mapping = this.mappings.get(triplesMap);

        List records = getRecords(triplesMap);
        List iris = new ArrayList<>();

        for (int i = 0; i < records.size(); i++) {
            Record record = records.get(i);
            List subjects = getSubject(triplesMap, mapping, record, i);
            if (subjects != null)
                iris.addAll(subjects);
        }

        return iris;
    }

    private List getRecords(Term triplesMap) throws Exception {
        if (!this.recordsHolders.containsKey(triplesMap)) {
            this.recordsHolders.put(triplesMap, this.recordsFactory.createRecords(triplesMap, this.rmlStore));
        }

        return this.recordsHolders.get(triplesMap);
    }

    private List combineMultiplePOGs(List predicates, List objects, List graphs) {
        List results = new ArrayList<>();

        if (graphs.isEmpty()) {
            graphs.add(null);
        }

        predicates.forEach(
                p -> objects.forEach(
                        o -> graphs.forEach(
                                g -> results.add(new PredicateObjectGraph(p, o, g))
                        )
                )
        );

        return results;
    }

    public static String getNewBlankNodeID() {
        String temp = "" + Executor.blankNodeCounter;
        Executor.blankNodeCounter++;

        return temp;
    }

    public List getTriplesMaps() {
        List withSubjectMaps = rmlStore.getQuads(null, new NamedNode(NAMESPACES.RML2 + "subjectMap"), null);

        return withSubjectMaps.stream()
                .map(Quad::getSubject)
                .filter(subject -> rmlStore.contains(subject, new NamedNode(NAMESPACES.RML2 + "logicalSource"), null)).toList();
    }

    public QuadStore getRMLStore() {
        return this.rmlStore;
    }

    public Map getTargets(){
        if (this.targetStores.isEmpty()){
            return null;
        }
        return this.targetStores;
    }

    public void verifySources(String basepath, String mappingPath) throws Exception {
        for (Term triplesMap : this.getTriplesMaps()) {
            List logicalSources = Utils.getObjectsFromQuads(rmlStore.getQuads(triplesMap, new NamedNode(NAMESPACES.RML2 + "logicalSource"), null));
            Term logicalSource = logicalSources.get(0);
            List sources = Utils.getObjectsFromQuads(rmlStore.getQuads(logicalSource, new NamedNode(NAMESPACES.RML2 + "source"), null));
            for (Term source : sources) {
                String value = source.getValue();
                if (source instanceof Literal) {
                    InputStream is;
                    if (Utils.isRemoteFile(value)) {
                        is = new RemoteFileAccess(value).getInputStream();
                    } else {
                        try {
                            is = new LocalFileAccess(value, basepath, ((Literal) source).getDatatype().stringValue()).getInputStream();
                        } catch (NoSuchFileException e) {
                            is = new LocalFileAccess(value, mappingPath, ((Literal) source).getDatatype().stringValue()).getInputStream();
                        }
                    }
                    is.close(); // close resources.
                }
            }
        }
    }

    private void generatePredicateObjectsForSubjects(final List subjects,
                                                     final Mapping mapping,
                                                     final Record record,
                                                     final POGFunction pogFunction,
                                                     final boolean checkEOFMarker) throws Exception {
        for (ProvenancedTerm subject: subjects) {
            //TODO validate subject or check if blank node
            if (subject != null) {
                List subjectGraphs = new ArrayList<>();

                mapping.getGraphMappingInfos().forEach(mappingInfo -> {
                    List terms = null;

                    try {
                        terms = mappingInfo.getTermGenerator().generate(record);
                    } catch (Exception e) {
                        //todo be more nice and gentle
                        logger.error("Could not generate graph term for record {}", record, e);
                    }

                    if (terms != null) {
                        terms.forEach(term -> {
                            if (!term.equals(new NamedNode(NAMESPACES.RML2 + "defaultGraph"))) {
                                List subjectGraphTargets = getAllTargets(mappingInfo, record);
                                subjectGraphs.add(new ProvenancedTerm(term, null, subjectGraphTargets));
                            }
                        });
                    }
                });

                List pogs = new ArrayList<>();
                List predicateObjectGraphMappings = mapping.getPredicateObjectGraphMappings();

                for (PredicateObjectGraphMapping pogMapping : predicateObjectGraphMappings) {
                    ArrayList predicates = new ArrayList<>();
                    MappingInfo pogGraphMappingInfo = pogMapping.getGraphMappingInfo();
                    MappingInfo pogPredicateMappingInfo = pogMapping.getPredicateMappingInfo();
                    MappingInfo pogObjectMappingInfo = pogMapping.getObjectMappingInfo();

                    ArrayList poGraphs = new ArrayList<>(subjectGraphs);

                    if (pogGraphMappingInfo != null) {
                        TermGenerator pogGraphGenerator = pogGraphMappingInfo.getTermGenerator();
                        if (pogGraphGenerator != null) {
                            pogGraphGenerator.generate(record).forEach(term -> {
                                if (!term.equals(new NamedNode(NAMESPACES.RML2 + "defaultGraph"))) {
                                    List graphTargets = getAllTargets(pogGraphMappingInfo, record);
                                    poGraphs.add(new ProvenancedTerm(term, null, graphTargets));
                                }
                            });
                        }
                    }

                    /* Predicates */
                    if (pogPredicateMappingInfo != null) {
                        TermGenerator pogPredicateGenerator = pogPredicateMappingInfo.getTermGenerator();
                        List predicateTargets = getAllTargets(pogPredicateMappingInfo, record);
                        pogPredicateGenerator.generate(record).forEach(p -> {
                            Metadata meta = new ProvenancedTerm(p, pogPredicateMappingInfo).getMetadata();
                            predicates.add(new ProvenancedTerm(p, meta, predicateTargets));
                        });
                    }

                    /* Objects */
                    if (pogObjectMappingInfo != null) {
                        TermGenerator pogObjectGenerator = pogObjectMappingInfo.getTermGenerator();
                        if (pogObjectGenerator != null) {
                            List objects = pogObjectGenerator.generate(record);
                            List objectTargets = getAllTargets(pogObjectMappingInfo, record);
                            List provenancedObjects = new ArrayList<>();
                            objects.forEach(object -> {
                                Metadata meta = new ProvenancedTerm(object, pogObjectMappingInfo).getMetadata();
                                provenancedObjects.add(new ProvenancedTerm(object, meta, objectTargets));
                            });

                            if (!objects.isEmpty()) {
                                //add pogs
                                pogs.addAll(combineMultiplePOGs(predicates, provenancedObjects, poGraphs));
                            }
                        }

                    //check if we are dealing with a parentTriplesMap (RefObjMap)
                    } else if (pogMapping.getParentTriplesMap() != null) {
                        List objects;

                        //check if need to apply a join condition
                        if (!pogMapping.getJoinConditions().isEmpty()) {
                            logger.debug("mapping {}'s join conditions are not empty", pogMapping.toString());
                            objects = this.getIRIsWithConditions(record, pogMapping.getParentTriplesMap(), pogMapping.getJoinConditions());
                            //this.generateTriples(subject, po.getPredicateGenerator(), objects, record, combinedGraphs);
                        } else {
                            logger.debug("mapping {}'s join conditions are empty", pogMapping.toString());
                            objects = this.getAllIRIs(pogMapping.getParentTriplesMap());
                        }

                        pogs.addAll(combineMultiplePOGs(predicates, objects, poGraphs));
                    }
                }

                pogs.forEach(pog -> pogFunction.generateQuad(subject, pog.getPredicate(), pog.getObject(), pog.getGraph(), checkEOFMarker));
            }
        }
    }

    private List getAllTargets(MappingInfo mappingInfo, Record record) {
        List allTargets = new ArrayList<>();
        allTargets.addAll(generateTargetsAndAddToTargetStore(mappingInfo, record));
        allTargets.addAll(mappingInfo.getTargets());
        return allTargets;
    }

    private List generateTargetsAndAddToTargetStore(MappingInfo mappingInfo, Record record) {
        List targetGenerators = mappingInfo.getTargetGenerators();
        List generatedTargets = new ArrayList<>();
        for (TermGenerator targetGenerator : targetGenerators) {
            try {
                generatedTargets.addAll(targetGenerator.generate(record));
            } catch (Exception e) {
                logger.error("Error occurred when generating target", e);
            }
        }
        for (Term generatedTarget : generatedTargets){
            if(!targetStores.containsKey(generatedTarget)){
                targetStores.put(generatedTarget, new RDF4JStore());
            }
        }
        return generatedTargets;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy