All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.clerezza.rdf.stable.serializer.StableSerializerProvider Maven / Gradle / Ivy

Go to download

A SerializingProvider that produces a stable output. This means that if the graph to be serialized changes a little, the output also changes only a little.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.clerezza.rdf.stable.serializer;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringReader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.clerezza.commons.rdf.BlankNode;
import org.apache.clerezza.commons.rdf.ImmutableGraph;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.clerezza.rdf.core.serializedform.SerializingProvider;
import org.apache.clerezza.rdf.core.serializedform.SupportedFormat;
import org.apache.clerezza.rdf.utils.GraphNode;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * Credits:
 *
 * Blank node labeling algorithm by Jeremy J. Carroll (see "Signing RDF Graphs",
 * HP technical report 2003)
 *
 * Minimum Self-contained Graph (MSG) decomposition algorithm by
 * Giovanni Tummarello, Christian Morbidoni, Paolo Puliti, Francesco Piazza,
 * Università Politecnica delle Marche, Italy
 * (see "Signing individual fragments of an RDF graph", 14th International
 * World Wide Web Conference WWW2005, Poster track, May 2005, Chiba, Japan)
 */

/**
 * A {@link org.apache.clerezza.rdf.core.serializedform.SerializingProvider} that tries
 * to provide similar results when serializing graphs. Specifically it tries to
 * label blank nodes deterministically with reasonable complexity.
 *
 * This serializer does not guarantee a deterministic result but it may minimize
 * the amount of modified lines in serialized output.
 *
 * <p>Processing outline, as implemented below:
 * <ol>
 *   <li>the graph is split into groups of triples (MSGs); triples touching a
 *       blank node are grouped by the context of that node, every other
 *       triple forms its own group;</li>
 *   <li>each group containing blank nodes gets a hash that is used as prefix
 *       of the blank node labels generated for that group;</li>
 *   <li>each group is serialized with {@code NTriplesSerializer}; blank node
 *       identifiers are replaced by the placeholder {@code ~} and remembered
 *       in a trailing {@code #identifier} comment;</li>
 *   <li>placeholders are replaced by generated labels, lines that are unique
 *       in the sorted output first;</li>
 *   <li>all lines are sorted and written to the output stream.</li>
 * </ol>
 *
 * @author Daniel Spicar ([email protected])
 */
@Component
@Service(SerializingProvider.class)
@SupportedFormat({SupportedFormat.N_TRIPLE})
public class StableSerializerProvider implements SerializingProvider {

    @Property(description="Specifies maximum amount of blank nodes " +
    "labeling recursions, may increase performance at the expense of stability " +
            "(0 = no limit).", intValue=0)
    public static final String MAX_LABELING_ITERATIONS = "max_labeling_iterations";

    /**
     * Upper bound for the number of labeling passes. The pass counter starts
     * at 1, so zero and negative values never match and mean "no limit".
     */
    private int maxLabelingIterations = -1;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Reads {@link #MAX_LABELING_ITERATIONS} from the component configuration.
     * Numeric values and numeric strings are accepted; a missing or
     * non-numeric value leaves the current setting untouched.
     *
     * @param cCtx the component context supplying the configuration properties
     */
    protected void activate(ComponentContext cCtx) {
        Object configured = cCtx.getProperties().get(MAX_LABELING_ITERATIONS);
        if (configured instanceof Number) {
            maxLabelingIterations = ((Number) configured).intValue();
        } else if (configured != null) {
            try {
                maxLabelingIterations =
                        Integer.parseInt(configured.toString().trim());
            } catch (NumberFormatException ex) {
                logger.warn("Ignoring non-numeric value '{}' for property {}",
                        configured, MAX_LABELING_ITERATIONS);
            }
        }
        logger.info("StableSerializerProvider activated");
    }

    /**
     * Writes the triples of {@code tc} to {@code os}, one sorted line per
     * triple, with generated blank node labels.
     *
     * <p>I/O and digest errors are logged and not propagated.
     *
     * @param os the stream the serialized lines are written to
     * @param tc the graph to serialize
     * @param formatIdentifier not evaluated by this implementation
     */
    @Override
    public void serialize(OutputStream os, Graph tc,
            String formatIdentifier) {

        try {
            List<String> lines = new ArrayList<String>();
            List<MSG> msgs = decomposeGraphToMSGs(tc);
            NTriplesSerializer serializer = new NTriplesSerializer();

            computeMsgHashes(msgs, "MD5");

            for (MSG msg : msgs) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                serializer.serialize(baos, msg.tc);
                // NOTE(review): decoding and encoding both rely on the
                // platform default charset - confirm what NTriplesSerializer
                // emits before pinning an explicit charset here.
                BufferedReader serializedGraph =
                        new BufferedReader(new StringReader(baos.toString()));
                lines.addAll(labelBlankNodes(serializedGraph, msg.hash));
            }

            Collections.sort(lines);
            for (String l : lines) {
                os.write((l + "\n").getBytes());
            }
        } catch (IOException ex) {
            logger.error("Exception while parsing serialized graph", ex);
        } catch (NoSuchAlgorithmException ex) {
            logger.error("Exception while trying to generate graph hash", ex);
        }
    }

    /**
     * Splits a copy of {@code tc} into MSGs; {@code tc} itself is not modified.
     *
     * @param tc the graph to decompose
     * @return the MSGs in the order they were extracted
     */
    private List<MSG> decomposeGraphToMSGs(Graph tc) {

        Graph tmp = new SimpleGraph();
        tmp.addAll(tc);

        List<MSG> msgSet = new ArrayList<MSG>();

        while (tmp.size() > 0) {
            Triple triple = tmp.iterator().next();
            Graph msgTc = new SimpleGraph();

            boolean containsBlankNode = fillMSG(triple, tmp, msgTc);
            MSG msg = new MSG(msgTc);
            msg.containsBlankNodes = containsBlankNode;
            msgSet.add(msg);
        }

        return msgSet;
    }

    /**
     * Moves the MSG that {@code triple} belongs to from {@code tc} to
     * {@code msg}. If the subject (or otherwise the object) is a blank node,
     * the node context of that blank node is moved; otherwise only the
     * triple itself.
     *
     * @param triple a triple contained in {@code tc}
     * @param tc the graph triples are removed from
     * @param msg the graph triples are added to
     * @return {@code true} if the triple has a blank node subject or object
     */
    private boolean fillMSG(Triple triple, Graph tc,
            Graph msg) {

        boolean containsBlankNode = false;

        RDFTerm resource = triple.getSubject();
        if (resource instanceof BlankNode) {
            containsBlankNode = true;
        } else {
            resource = triple.getObject();
            if (resource instanceof BlankNode) {
                containsBlankNode = true;
            }
        }
        if (containsBlankNode) {
            GraphNode gn = new GraphNode(resource, tc);
            ImmutableGraph context = gn.getNodeContext();
            msg.addAll(context);
            tc.removeAll(context);
        } else {
            msg.add(triple);
            tc.remove(triple);
        }
        return containsBlankNode;
    }

    /**
     * Reads the serialized lines of one MSG and replaces its blank node
     * identifiers by generated labels.
     *
     * <p>Lines that cannot be parsed are logged and left out of the result.
     *
     * @param serializedGraph reader over the N-Triples lines of the MSG
     * @param prefix text inserted between {@code _:} and the generated number
     * @return the relabeled lines, sorted
     * @throws IOException if reading from {@code serializedGraph} fails
     */
    private List<String> labelBlankNodes(BufferedReader serializedGraph,
            String prefix) throws IOException {

        String line;
        // ArrayList because the labeling passes access the lines by index.
        List<String> lines = new ArrayList<String>();

        long commentedIdentifiers = 0;
        while ((line = serializedGraph.readLine()) != null) {
            try {
                commentedIdentifiers = commentBlankNodeLabels(line,
                        commentedIdentifiers, lines);
            } catch (IOException ex) {
                logger.error("Exception while trying to parse line: " + line,
                        ex);
            }
        }

        Collections.sort(lines);

        Map<String, Long> labels = new HashMap<String, Long>();
        long[] counters = {1, commentedIdentifiers}; //counter[0] = genSymCounter
        long commentedIdentifierBefore;
        int ctr = 0;
        do {
            commentedIdentifierBefore = counters[1];
            counters = generateBlankNodeLabels(lines, labels, counters[0],
                    counters[1], prefix);
            applyLabels(lines, labels, prefix);
            Collections.sort(lines);
            if (++ctr == maxLabelingIterations) {
                break;
            }
            // stop as soon as a pass did not label any further placeholder
        } while (counters[1] > 0 && commentedIdentifierBefore != counters[1]);

        if (counters[1] > 0) {
            labelBlankNodesNonDeterministically(lines, counters, labels, prefix);
        }

        return lines;
    }


    /**
     * Labels the placeholders of every line whose triple part differs from
     * the triple part of both neighbouring lines. Lines are updated in place
     * and newly assigned labels are recorded in {@code labels}.
     *
     * @param lines the sorted lines of one MSG
     * @param labels maps blank node identifiers to their generated number
     * @param genSymCounter the next number to assign
     * @param comments the count of placeholders not yet labeled
     * @param prefix text inserted between {@code _:} and the generated number
     * @return {@code {genSymCounter, comments}} after this pass
     */
    private long[] generateBlankNodeLabels(List<String> lines,
            Map<String, Long> labels, long genSymCounter, long comments,
            String prefix) {

        for (int i = 0; i < lines.size(); ++i) {
            String previousTriple = i > 0 ? stripComments(lines.get(i - 1)) : "";
            String currentTriple = stripComments(lines.get(i));
            String nextTriple = i + 1 < lines.size()
                    ? stripComments(lines.get(i + 1)) : "";

            if (previousTriple.equals(currentTriple)
                    || nextTriple.equals(currentTriple)) {
                continue;
            }

            StringBuilder currentLine = new StringBuilder(lines.get(i));

            // Object first: its comment is the last one on the line, which is
            // the one applyGenSymIdentifier consumes.
            int indexOfObject = checkObject(currentTriple);
            if (indexOfObject != -1) {
                genSymCounter = applyGenSymIdentifier(labels, genSymCounter,
                        currentLine, indexOfObject, prefix);
                --comments;
            }

            int indexOfSubject = checkSubject(currentTriple);
            if (indexOfSubject != -1) {
                genSymCounter = applyGenSymIdentifier(labels, genSymCounter,
                        currentLine, indexOfSubject, prefix);
                --comments;
            }

            lines.set(i, currentLine.toString());
        }

        return new long[] {genSymCounter, comments};
    }

    /**
     * Replaces placeholders whose identifier already has an entry in
     * {@code labels}; placeholders with unknown identifiers are left alone.
     *
     * @param lines the lines of one MSG, updated in place
     * @param labels maps blank node identifiers to their generated number
     * @param prefix text inserted between {@code _:} and the generated number
     */
    private void applyLabels(List<String> lines, Map<String, Long> labels,
            String prefix) {

        for (int i = 0; i < lines.size(); ++i) {
            StringBuilder line = new StringBuilder(lines.get(i));

            int indexOfObject = checkObject(stripComments(line.toString()));
            if (indexOfObject != -1) {
                // The object's comment is always the last one on the line.
                replaceWithKnownLabel(line, indexOfObject,
                        line.lastIndexOf("#_:"), line.length(), labels, prefix);
            }

            // Recomputed because labeling the object lengthens the triple part.
            String triple = stripComments(line.toString());
            int indexOfSubject = checkSubject(triple);
            if (indexOfSubject != -1) {
                // The subject's comment is always the first one behind the
                // triple; the object's comment may still follow it.
                int commentStart = line.indexOf("#_:", triple.length());
                int nextComment = commentStart == -1
                        ? -1 : line.indexOf("#_:", commentStart + 1);
                replaceWithKnownLabel(line, indexOfSubject, commentStart,
                        nextComment == -1 ? line.length() : nextComment,
                        labels, prefix);
            }

            lines.set(i, line.toString());
        }
    }

    /**
     * Replaces the placeholder at {@code where} by its label if the
     * identifier held by the comment in {@code [commentStart, commentEnd)}
     * is known; the comment is removed in that case.
     */
    private void replaceWithKnownLabel(StringBuilder line, int where,
            int commentStart, int commentEnd, Map<String, Long> labels,
            String prefix) {

        if (commentStart <= where) {
            // no comment behind the placeholder
            return;
        }
        // skip the leading '#'; trimmed like in applyGenSymIdentifier
        String identifier = line.substring(commentStart + 1, commentEnd).trim();
        Long label = labels.get(identifier);
        if (label != null) {
            line.delete(commentStart, commentEnd);
            line.replace(where, where + 1, "_:" + prefix + label);
        }
    }

    /**
     * Replaces every blank node identifier of {@code line} by the placeholder
     * {@code ~}, appends a {@code " #identifier"} comment per replaced
     * identifier (in order of appearance) and adds the result to
     * {@code lines}. Text inside {@code <...>} is skipped and scanning stops
     * at the first {@code "}.
     *
     * @param line one serialized triple
     * @param commentedIdentifiers the count of identifiers replaced so far
     * @param lines the list the rewritten line is added to
     * @return {@code commentedIdentifiers} plus the identifiers replaced here
     * @throws IOException if a {@code <} is not closed or a blank node
     *         identifier is not followed by a space; nothing is added then
     */
    private long commentBlankNodeLabels(String line, long commentedIdentifiers,
            List<String> lines) throws IOException {

        StringBuilder rewritten = new StringBuilder(line.length() + 16);
        StringBuilder comments = new StringBuilder();
        final int length = line.length();
        int pos = 0;

        while (pos < length) {
            char current = line.charAt(pos);
            if (current == '"') {
                // start of a literal: the rest of the line is copied verbatim
                break;
            }
            if (current == '<') {
                int close = line.indexOf('>', pos + 1);
                checkForEndOfStream(close);
                rewritten.append(line, pos, close + 1);
                pos = close + 1;
            } else if (current == '_' && pos + 1 < length
                    && line.charAt(pos + 1) == ':') {
                int end = line.indexOf(' ', pos + 2);
                checkForEndOfStream(end);
                // Replace by position instead of by text so that neither
                // regex meta characters in the identifier nor an equal
                // substring elsewhere on the line can interfere.
                comments.append(" #").append(line, pos, end);
                rewritten.append('~');
                ++commentedIdentifiers;
                pos = end;
            } else {
                rewritten.append(current);
                ++pos;
            }
        }
        rewritten.append(line, pos, length).append(comments);

        lines.add(rewritten.toString());
        return commentedIdentifiers;
    }

    /**
     * Replaces the placeholder at {@code where} by the label of the
     * identifier held by the last comment of {@code currentLine}, assigning
     * the next free number if the identifier has no label yet. The comment
     * is removed.
     *
     * @param labels maps blank node identifiers to their generated number
     * @param genSymCounter the next number to assign
     * @param currentLine the line to modify
     * @param where the index of the placeholder
     * @param prefix text inserted between {@code _:} and the generated number
     * @return the next number to assign after this call
     */
    private long applyGenSymIdentifier(Map<String, Long> labels,
            long genSymCounter, StringBuilder currentLine, int where,
            String prefix) {

        int index = currentLine.lastIndexOf("#_:");
        String identifier =
                currentLine.substring(index + 1, currentLine.length()).trim();
        currentLine.delete(index, currentLine.length());
        if (!labels.containsKey(identifier)) {
            labels.put(identifier, genSymCounter++);
        }
        currentLine.replace(where, where + 1,
                "_:" + prefix + labels.get(identifier));

        return genSymCounter;
    }

    /**
     * Labels every remaining placeholder in list order, regardless of
     * whether its line is unique, and sorts the lines afterwards.
     *
     * @param lines the lines of one MSG, updated in place
     * @param counters {@code {genSymCounter, remaining placeholders}},
     *        updated in place
     * @param labels maps blank node identifiers to their generated number
     * @param prefix text inserted between {@code _:} and the generated number
     */
    private void labelBlankNodesNonDeterministically(List<String> lines,
            long[] counters, Map<String, Long> labels, String prefix) {

        for (int i = 0; i < lines.size(); ++i) {
            StringBuilder currentLine = new StringBuilder(lines.get(i));
            String currentLineWithoutComments = stripComments(lines.get(i));
            int indexOfObject = checkObject(currentLineWithoutComments);
            if (indexOfObject != -1) {
                counters[0] = applyGenSymIdentifier(labels, counters[0],
                        currentLine, indexOfObject, prefix);
                --(counters[1]);
            }
            int indexOfSubject = checkSubject(currentLineWithoutComments);
            if (indexOfSubject != -1) {
                counters[0] = applyGenSymIdentifier(labels, counters[0],
                        currentLine, indexOfSubject, prefix);
                --(counters[1]);
            }
            lines.set(i, currentLine.toString());
        }
        Collections.sort(lines);
    }

    /**
     * @throws IOException if {@code data} is {@code -1}
     */
    private void checkForEndOfStream(int data) throws IOException {
        if (data == -1) {
            throw new IOException("Parsing Error!");
        }
    }

    /**
     * @param line a line without comments, ending in {@code " ."}
     * @return the index of the object placeholder (third character from the
     *         end), or {@code -1} if there is none
     */
    private int checkObject(String line) {
        int index = line.length() - 3;
        if (index >= 0 && line.charAt(index) == '~') {
            return index;
        }
        return -1;
    }

    /**
     * @param line a line without comments
     * @return {@code 0} if the line starts with the placeholder, else
     *         {@code -1}
     */
    private int checkSubject(String line) {
        if (line.length() > 0 && line.charAt(0) == '~') {
            return 0;
        }
        return -1;
    }


    /**
     * Returns the line up to and including its last {@code " ."}; the empty
     * string if the line is shorter than three characters or has no
     * {@code " ."} at all.
     */
    private String stripComments(String line) {
        if (line.length() < 3) {
            return "";
        }
        int terminator = line.lastIndexOf(" .");
        if (terminator == -1) {
            return "";
        }
        return line.substring(0, terminator + 2);
    }

    /**
     * Assigns a hash, unique within {@code msgs}, to every MSG that contains
     * blank nodes. The hash is the digest of the sorted per-triple hash
     * strings, which are built from the {@code hashCode()} of the non-blank
     * subject, the predicate and the non-blank object of each triple.
     *
     * <p>If two MSGs yield the same digest, later ones are re-hashed with an
     * increasing counter appended to the input until the result is unused,
     * so the outcome depends only on the content and order of {@code msgs}.
     *
     * @param msgs the MSGs to process; {@code hash} is set in place
     * @param algorithm the {@link MessageDigest} algorithm name
     * @throws NoSuchAlgorithmException if {@code algorithm} is not available
     */
    private void computeMsgHashes(List<MSG> msgs, String algorithm)
            throws NoSuchAlgorithmException {

        MessageDigest md = MessageDigest.getInstance(algorithm);
        HashSet<String> computedHashes = new HashSet<String>(msgs.size());

        for (MSG msg : msgs) {
            if (!msg.containsBlankNodes) {
                //hash is needed only for b-node labelling
                continue;
            }
            List<String> tripleHashes = new ArrayList<String>(msg.tc.size());
            for (Triple t : msg.tc) {
                StringBuilder tripleHash = new StringBuilder();
                if (!(t.getSubject() instanceof BlankNode)) {
                    tripleHash.append(((IRI) t.getSubject()).hashCode());
                }
                tripleHash.append(t.getPredicate().hashCode());
                if (!(t.getObject() instanceof BlankNode)) {
                    if (t.getObject() instanceof Literal) {
                        tripleHash.append(((Literal) t.getObject()).
                                toString().hashCode());
                    } else {
                        tripleHash.append(((IRI) t.getObject()).hashCode());
                    }
                }
                tripleHashes.add(tripleHash.toString());
            }
            Collections.sort(tripleHashes);
            StringBuilder msgHash = new StringBuilder();
            for (String tripleHash : tripleHashes) {
                msgHash.append(tripleHash);
            }

            String hashInput = msgHash.toString();
            // digest(byte[]) updates, finishes and resets the digest
            String hexString =
                    getHashHexString(md.digest(hashInput.getBytes()));
            for (long salt = 1; !computedHashes.add(hexString); ++salt) {
                hexString = getHashHexString(
                        md.digest((hashInput + '#' + salt).getBytes()));
            }
            msg.hash = hexString;
        }
    }

    /**
     * @return the bytes of {@code hash} as lower-case hex, two digits per byte
     */
    private String getHashHexString(byte[] hash) {
        StringBuilder hexString = new StringBuilder(hash.length * 2);
        for (int i = 0; i < hash.length; i++) {
            String hex = Integer.toHexString(0xFF & hash[i]);
            if (hex.length() == 1) {
                hexString.append('0');
            }
            hexString.append(hex);
        }
        return hexString.toString();
    }

    /**
     * One group of triples produced by the decomposition, together with the
     * label prefix computed for it.
     */
    private static class MSG {

        /** The triples of this group. */
        final Graph tc;
        /** Label prefix; stays {@code null} for groups without blank nodes. */
        String hash = null;
        boolean containsBlankNodes = false;

        MSG(Graph tc) {
            this.tc = tc;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy