
package it.unibz.inf.ontop.cli;
import com.github.rvesse.airline.annotations.Command;
import com.github.rvesse.airline.annotations.Option;
import com.github.rvesse.airline.annotations.OptionType;
import com.github.rvesse.airline.annotations.restrictions.AllowedEnumValues;
import com.github.rvesse.airline.parser.errors.ParseArgumentsIllegalValueException;
import com.google.common.collect.ImmutableSet;
import it.unibz.inf.ontop.exception.InvalidOntopConfigurationException;
import it.unibz.inf.ontop.exception.OBDASpecificationException;
import it.unibz.inf.ontop.injection.OntopSQLOWLAPIConfiguration;
import it.unibz.inf.ontop.injection.OntopSQLOWLAPIConfiguration.Builder;
import it.unibz.inf.ontop.injection.impl.OntopModelConfigurationImpl;
import it.unibz.inf.ontop.materialization.MaterializationParams;
import it.unibz.inf.ontop.rdf4j.materialization.RDF4JMaterializer;
import org.apache.commons.rdf.api.IRI;
import org.eclipse.rdf4j.query.GraphQueryResult;
import org.eclipse.rdf4j.rio.RDFHandler;
import javax.annotation.Nullable;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import static it.unibz.inf.ontop.injection.OntopSQLCoreSettings.JDBC_URL;
import static it.unibz.inf.ontop.injection.OntopSQLCredentialSettings.JDBC_PASSWORD;
import static it.unibz.inf.ontop.injection.OntopSQLCredentialSettings.JDBC_USER;
import static it.unibz.inf.ontop.injection.OntopSystemSQLSettings.FETCH_SIZE;
import static org.apache.commons.io.FilenameUtils.removeExtension;
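/**
 * CLI command that materializes the virtual RDF graph defined by the mapping
 * (and, optionally, the OWL ontology) into an RDF file.
 *
 * A typical invocation might look as follows (the -m, -t and -p option names
 * are assumed to be the ones inherited from OntopMappingOntologyRelatedCommand):
 *
 *   ontop materialize -m mapping.obda -t ontology.owl -p db.properties -f turtle -o output.ttl
 */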
@Command(name = "materialize",
description = "Materialize the RDF graph exposed by the mapping and the OWL ontology")
public class OntopMaterialize extends OntopMappingOntologyRelatedCommand {
private enum PredicateType {
CLASS("C"),
PROPERTY("P");
private final String code;
PredicateType(String code) {
this.code = code;
}
String getCode() {
return code;
}
}
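    // Maximum number of triples written to a single file in --separate-files mode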
private static final int TRIPLE_LIMIT_PER_FILE = 500000;
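    // JDBC fetch size applied by default when results are streamed (the default mode)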
private static final String DEFAULT_FETCH_SIZE = "50000";
@Option(type = OptionType.COMMAND, override = true, name = {"-o", "--output"},
title = "output", description = "output file (default) or prefix (only for --separate-files)")
//@BashCompletion(behaviour = CompletionBehaviour.FILENAMES)
private String outputFile;
@Option(type = OptionType.COMMAND, name = {"-f", "--format"}, title = "output format",
description = "The format of the materialized RDF graph. " +
//" Options: rdfxml, turtle. " +
"Default: rdfxml")
@AllowedEnumValues(RDFFormatTypes.class)
public RDFFormatTypes format = RDFFormatTypes.rdfxml;
@Option(type = OptionType.COMMAND, name = {"--compression"}, title = "output compression",
description = "The compression format of the materialized RDF graph. " +
"Default: no compression")
@AllowedEnumValues(Compression.class)
public Compression compression = Compression.no_compression;
@Option(type = OptionType.COMMAND, name = {"--separate-files"}, title = "output to separate files",
description = "generating separate files for different classes/properties. This is useful for" +
" materializing large OBDA setting. Default: false.")
public boolean separate = false;
@Option(type = OptionType.COMMAND, name = {"--no-streaming"}, title = "do not execute streaming of results",
description = "All the SQL results of one big query will be stored in memory. Not recommended. Default: false.")
private boolean noStream = false;
public OntopMaterialize() {
}
@Override
public void run() {
RDF4JMaterializer materializer = createMaterializer();
if (separate) {
runWithSeparateFiles(materializer);
} else {
runWithSingleFile(materializer);
}
}
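    /**
     * Builds an RDF4JMaterializer from the CLI arguments, using the default
     * materialization parameters.
     */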
private RDF4JMaterializer createMaterializer() {
try {
            Builder<?> configurationBuilder = createAndInitConfigurationBuilder();
return RDF4JMaterializer.defaultMaterializer(
configurationBuilder.build(),
MaterializationParams.defaultBuilder()
.build()
);
} catch (OBDASpecificationException e) {
throw new RuntimeException(e);
}
}
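    /**
     * Materializes the whole graph into a single file, or to stdout when no
     * output file is given (in which case the statistics below are suppressed
     * so that the RDF stream is not polluted).
     */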
private void runWithSingleFile(RDF4JMaterializer materializer) {
long tripleCount = 0;
final long startTime = System.currentTimeMillis();
GraphQueryResult result = materializer.materialize().evaluate();
try {
BufferedWriter writer = createWriter(Optional.empty());
tripleCount += serializeTripleBatch(
result,
Optional.empty(),
writer,
format.createRDFHandler(writer)
);
} catch (Exception e) {
throw new RuntimeException(e);
}
if (outputFile != null)
System.out.println("NR of TRIPLES: " + tripleCount);
final long endTime = System.currentTimeMillis();
final long time = endTime - startTime;
if (outputFile != null)
System.out.println("Elapsed time to materialize: " + time + " {ms}");
}
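    /**
     * Materializes each class and property into its own file (or sequence of
     * files once TRIPLE_LIMIT_PER_FILE is exceeded) under the output directory.
     */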
private void runWithSeparateFiles(RDF4JMaterializer materializer) {
try {
validateBaseDirectory();
materializeClassesByFile(materializer);
materializePropertiesByFile(materializer);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
    /**
     * If --separate-files is set, this method checks that the outputFile path
     * denotes a directory. If the directory does not exist but its parent does,
     * it is created. Otherwise, a ParseArgumentsIllegalValueException is thrown.
     *
     * @throws IOException if the directory cannot be created
     * @throws ParseArgumentsIllegalValueException if neither the directory nor its parent exists
     */
private void validateBaseDirectory() throws IOException, ParseArgumentsIllegalValueException {
Path outputPath = Paths.get(removeExtension(outputFile));
        if (!Files.isDirectory(outputPath)) {
            if (Files.isDirectory(outputPath.getParent())) {
Files.createDirectory(outputPath);
}
else {
throw new ParseArgumentsIllegalValueException("output", outputFile, Set.of("output must be an existing directory if '--separate-files' is set."));
}
}
}
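    // One materialization pass per class; progress is reported on stderr by serializePredicate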
private void materializeClassesByFile(RDF4JMaterializer materializer) throws Exception {
        ImmutableSet<IRI> classes = materializer.getClasses();
int total = classes.size();
AtomicInteger i = new AtomicInteger();
for (IRI c : classes) {
serializePredicate(materializer, c, PredicateType.CLASS, i.incrementAndGet(), total);
}
}
private void materializePropertiesByFile(RDF4JMaterializer materializer) throws Exception {
        ImmutableSet<IRI> properties = materializer.getProperties();
int total = properties.size();
AtomicInteger i = new AtomicInteger();
for (IRI p : properties) {
serializePredicate(materializer, p, PredicateType.PROPERTY, i.incrementAndGet(), total);
}
}
/**
     * Serializes the A-box corresponding to a predicate into one or more files.
*/
private void serializePredicate(RDF4JMaterializer materializer, IRI predicateIRI,
PredicateType predicateType, int index, int total) throws Exception {
final long startTime = System.currentTimeMillis();
System.err.println(String.format("Materializing %s (%d/%d)", predicateIRI, index, total));
System.err.println("Starts writing triples into files.");
long tripleCount = 0;
int fileCount = 0;
String fileSubstring = predicateIRI.toString().replaceAll("[^a-zA-Z0-9]", "_")
+ predicateType.getCode() + "_";
GraphQueryResult result = materializer.materialize(ImmutableSet.of(predicateIRI)).evaluate();
while (result.hasNext()) {
BufferedWriter writer = createWriter(Optional.of(fileSubstring + fileCount));
tripleCount += serializeTripleBatch(
result,
Optional.of(TRIPLE_LIMIT_PER_FILE),
writer,
format.createRDFHandler(writer)
);
fileCount++;
}
if (outputFile != null)
System.out.println("NR of TRIPLES: " + tripleCount);
final long endTime = System.currentTimeMillis();
final long time = endTime - startTime;
if (outputFile != null)
System.out.println("Elapsed time to materialize: " + time + " {ms}");
}
// We need direct access to the writer to close it (cannot be done via the RDFHandler)
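    // The file name is the output prefix plus the format and compression extensions;
    // for ZIP output, the single archive entry is named after the uncompressed file.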
    private BufferedWriter createWriter(Optional<String> prefixExtension) throws IOException {
OutputStream outputStream;
if (outputFile != null) {
String prefix = removeExtension(outputFile);
String suffix = format.getExtension() + compression.getExtension();
var path = prefixExtension
.map(s -> Paths.get(prefix, s + suffix))
.orElseGet(() -> Paths.get(prefix + suffix));
var fileOutputStream = Files.newOutputStream(path);
var fileName = path.getFileName().toString();
outputStream = getCompressingOutputStream(fileOutputStream,
compression == Compression.no_compression
? fileName
: removeExtension(fileName));
}
else
outputStream = getCompressingOutputStream(System.out, "data" + compression.getExtension());
return new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8));
}
private OutputStream getCompressingOutputStream(OutputStream outputStream, String fileName) throws IOException {
switch(compression) {
case gzip:
return new GZIPOutputStream(outputStream);
case zip:
var zipOutputStream = new ZipOutputStream(outputStream);
zipOutputStream.putNextEntry(new ZipEntry(fileName));
return zipOutputStream;
case no_compression:
default:
return outputStream;
}
}
    /**
     * Serializes a batch of triples into one file and closes the writer.
     * At most {@code limitPerFile} triples are written when a limit is given.
     */
    private long serializeTripleBatch(GraphQueryResult result, Optional<Integer> limitPerFile, BufferedWriter writer, RDFHandler handler) throws IOException {
long tripleCount = 0;
handler.startRDF();
while (result.hasNext() && (!limitPerFile.isPresent() || tripleCount < limitPerFile.get())) {
handler.handleStatement(result.next());
tripleCount++;
}
handler.endRDF();
writer.close();
return tripleCount;
}
    /**
     * Initializes the configuration builder from the CLI arguments: mapping and
     * ontology files, optional facts, constraints and lenses, and the JDBC
     * connection settings.
     */
    private Builder<?> createAndInitConfigurationBuilder() {
        final Builder<?> configBuilder = OntopSQLOWLAPIConfiguration.defaultBuilder();
if (owlFile != null)
configBuilder.ontologyFile(owlFile);
if (factFile != null)
configBuilder.factsFile(factFile);
if (factFormat != null)
configBuilder.factFormat(factFormat.getExtension());
if (factsBaseIRI != null)
configBuilder.factsBaseIRI(factsBaseIRI);
if (isR2rmlFile(mappingFile)) {
configBuilder.r2rmlMappingFile(mappingFile);
} else {
configBuilder.nativeOntopMappingFile(mappingFile);
}
if (constraintFile != null)
configBuilder.basicImplicitConstraintFile(constraintFile);
if (dbMetadataFile != null)
configBuilder.dbMetadataFile(dbMetadataFile);
if (ontopLensesFile != null)
configBuilder.lensesFile(ontopLensesFile);
if (sparqlRulesFile != null)
configBuilder.sparqlRulesFile(sparqlRulesFile);
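        // Streaming (the default) requires a positive JDBC fetch size so that the
        // driver does not load the whole result set into memory; --no-streaming
        // requires a non-positive one. A user-supplied FETCH_SIZE must therefore
        // be consistent with the selected mode.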
Properties properties = OntopModelConfigurationImpl.extractProperties(
OntopModelConfigurationImpl.extractPropertyFile(propertiesFile));
@Nullable
String userFetchSizeStr = properties.getProperty(FETCH_SIZE);
if (userFetchSizeStr != null) {
try {
int userFetchSize = Integer.parseInt(userFetchSizeStr);
if (noStream && userFetchSize > 0)
throw new InvalidOntopConfigurationException("Do not provide a positive " + FETCH_SIZE
+ " together with no streaming option");
else if ((!noStream) && userFetchSize <= 0) {
throw new InvalidOntopConfigurationException("Do not provide a non-positive " + FETCH_SIZE
+ " together with the streaming option");
}
            } catch (NumberFormatException e) {
                throw new InvalidOntopConfigurationException(FETCH_SIZE + " is expected to be an integer");
}
}
        /*
         * No user-provided fetch size: set the default FETCH_SIZE for the materializer
         */
else if (!noStream)
properties.setProperty(FETCH_SIZE, DEFAULT_FETCH_SIZE);
else
properties.setProperty(FETCH_SIZE, "-1");
if (dbPassword != null)
properties.setProperty(JDBC_PASSWORD, dbPassword);
if (dbUrl != null)
properties.setProperty(JDBC_URL, dbUrl);
if (dbUser != null)
properties.setProperty(JDBC_USER, dbUser);
configBuilder
.properties(properties)
.enableOntologyAnnotationQuerying(true);
return configBuilder;
}
}