// com.github.jsonldjava.impl.TurtleRDFParser Maven / Gradle / Ivy
//
// Go to download
package com.github.jsonldjava.impl;

import static com.github.jsonldjava.core.JsonLdConsts.RDF_FIRST;
import static com.github.jsonldjava.core.JsonLdConsts.RDF_LANGSTRING;
import static com.github.jsonldjava.core.JsonLdConsts.RDF_NIL;
import static com.github.jsonldjava.core.JsonLdConsts.RDF_REST;
import static com.github.jsonldjava.core.JsonLdConsts.RDF_TYPE;
import static com.github.jsonldjava.core.JsonLdConsts.XSD_BOOLEAN;
import static com.github.jsonldjava.core.JsonLdConsts.XSD_DECIMAL;
import static com.github.jsonldjava.core.JsonLdConsts.XSD_DOUBLE;
import static com.github.jsonldjava.core.JsonLdConsts.XSD_INTEGER;
import static com.github.jsonldjava.core.RDFDatasetUtils.unescape;
import static com.github.jsonldjava.core.Regex.BLANK_NODE_LABEL;
import static com.github.jsonldjava.core.Regex.DECIMAL;
import static com.github.jsonldjava.core.Regex.DOUBLE;
import static com.github.jsonldjava.core.Regex.INTEGER;
import static com.github.jsonldjava.core.Regex.IRIREF;
import static com.github.jsonldjava.core.Regex.LANGTAG;
import static com.github.jsonldjava.core.Regex.PNAME_LN;
import static com.github.jsonldjava.core.Regex.PNAME_NS;
import static com.github.jsonldjava.core.Regex.STRING_LITERAL_LONG_QUOTE;
import static com.github.jsonldjava.core.Regex.STRING_LITERAL_LONG_SINGLE_QUOTE;
import static com.github.jsonldjava.core.Regex.STRING_LITERAL_QUOTE;
import static com.github.jsonldjava.core.Regex.STRING_LITERAL_SINGLE_QUOTE;
import static com.github.jsonldjava.core.Regex.UCHAR;
import static com.github.jsonldjava.core.Regex.WS;
import static com.github.jsonldjava.core.Regex.WS_0_N;
import static com.github.jsonldjava.core.Regex.WS_1_N;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.github.jsonldjava.core.JsonLdError;
import com.github.jsonldjava.core.RDFDataset;
import com.github.jsonldjava.core.RDFParser;
import com.github.jsonldjava.core.UniqueNamer;

/**
 * A (probably terribly slow) Parser for turtle. Turtle is the internal
 * RDFDataset used by JSONLD-Java
 * 
 * TODO: this probably needs to be changed to use a proper parser/lexer
 * 
 * @author Tristan
 * 
 */
public class TurtleRDFParser implements RDFParser {

    /**
     * Compiled patterns for the Turtle productions recognised by this parser.
     *
     * <p>
     * The patterns are assembled by string concatenation from the building
     * blocks statically imported from {@code com.github.jsonldjava.core.Regex}
     * (a {@link Pattern} concatenated into a string contributes its source
     * text). The parser reads capture groups by absolute index, so the order
     * of the alternatives below must not change, and only non-capturing
     * groups may be added.
     */
    static class Regex {
        // directives: "@prefix ns: <iri> ." / "@base <iri> ." and their
        // case-insensitive SPARQL-style forms without the trailing dot
        final public static Pattern PREFIX_ID = Pattern.compile("@prefix" + WS_1_N + PNAME_NS
                + WS_1_N + IRIREF + WS_0_N + "\\." + WS_0_N);
        final public static Pattern BASE = Pattern.compile("@base" + WS_1_N + IRIREF + WS_0_N
                + "\\." + WS_0_N);
        final public static Pattern SPARQL_PREFIX = Pattern.compile("[Pp][Rr][Ee][Ff][Ii][Xx]" + WS
                + PNAME_NS + WS + IRIREF + WS_0_N);
        final public static Pattern SPARQL_BASE = Pattern.compile("[Bb][Aa][Ss][Ee]" + WS + IRIREF
                + WS_0_N);

        // terms
        final public static Pattern PREFIXED_NAME = Pattern.compile("(?:" + PNAME_LN + "|"
                + PNAME_NS + ")");
        final public static Pattern IRI = Pattern.compile("(?:" + IRIREF + "|" + PREFIXED_NAME
                + ")");
        // "[" optional whitespace "]"
        final public static Pattern ANON = Pattern.compile("(?:\\[" + WS + "*\\])");
        final public static Pattern BLANK_NODE = Pattern.compile(BLANK_NODE_LABEL + "|" + ANON);
        // the long (triple-quoted) forms are listed first so they are tried
        // before the single-quoted forms
        final public static Pattern STRING = Pattern.compile("(" + STRING_LITERAL_LONG_SINGLE_QUOTE
                + "|" + STRING_LITERAL_LONG_QUOTE + "|" + STRING_LITERAL_QUOTE + "|"
                + STRING_LITERAL_SINGLE_QUOTE + ")");
        final public static Pattern BOOLEAN_LITERAL = Pattern.compile("(true|false)");
        // a string optionally followed by a language tag or "^^" datatype IRI
        final public static Pattern RDF_LITERAL = Pattern.compile(STRING + "(?:" + LANGTAG
                + "|\\^\\^" + IRI + ")?");
        // capture order is DOUBLE, DECIMAL, INTEGER
        final public static Pattern NUMERIC_LITERAL = Pattern.compile("(" + DOUBLE + ")|("
                + DECIMAL + ")|(" + INTEGER + ")");
        final public static Pattern LITERAL = Pattern.compile(RDF_LITERAL + "|" + NUMERIC_LITERAL
                + "|" + BOOLEAN_LITERAL);

        // Statement-level patterns. Each one is anchored as a whole with
        // "^(?:...)": alternation binds more loosely than "^", so without the
        // group only the first alternative would be anchored and find() could
        // match a later token while the parser advances from offset 0.
        final public static Pattern DIRECTIVE = Pattern.compile("^(?:" + PREFIX_ID + "|" + BASE
                + "|" + SPARQL_PREFIX + "|" + SPARQL_BASE + ")");
        final public static Pattern SUBJECT = Pattern.compile("^(?:" + IRI + "|" + BLANK_NODE
                + ")");
        final public static Pattern PREDICATE = Pattern.compile("^(?:" + IRI + "|a" + WS_1_N
                + ")");
        final public static Pattern OBJECT = Pattern.compile("^(?:" + IRI + "|" + BLANK_NODE + "|"
                + LITERAL + ")");

        // others
        // final public static Pattern WS_AT_LINE_START = Pattern.compile("^" +
        // WS_1_N);
        final public static Pattern EOLN = Pattern.compile("(?:\r\n)|(?:\n)|(?:\r)");
        final public static Pattern NEXT_EOLN = Pattern.compile("^.*(?:" + EOLN + ")" + WS_0_N);
        // final public static Pattern EMPTY_LINE = Pattern.compile("^" + WS +
        // "*$");

        // A comment runs to the end of the line, or to the end of the input
        // when the last line has no terminator (hence the "|$").
        final public static Pattern COMMENT_OR_WS = Pattern.compile("^(?:(?:[#].*(?:" + EOLN
                + "|$)" + WS_0_N + ")|(?:" + WS_1_N + "))");
    }

    private class State {
        String baseIri = "";
        Map namespaces = new LinkedHashMap();
        String curSubject = null;
        String curPredicate = null;

        String line = null;

        int lineNumber = 0;
        int linePosition = 0;

        // int bnodes = 0;
        UniqueNamer namer = new UniqueNamer("_:b");// {{ getName(); }}; // call
                                                   // getName() after
                                                   // construction to make
                                                   // first active bnode _:b1

        private final Stack> stack = new Stack>();
        public boolean expectingBnodeClose = false;

        public State(String input) throws JsonLdError {
            line = input;
            lineNumber = 1;
            advanceLinePosition(0);
        }

        public void push() {
            stack.push(new LinkedHashMap() {
                {
                    put(curSubject, curPredicate);
                }
            });
            expectingBnodeClose = true;
            curSubject = null;
            curPredicate = null;
        }

        public void pop() {
            if (stack.size() > 0) {
                for (final Entry x : stack.pop().entrySet()) {
                    curSubject = x.getKey();
                    curPredicate = x.getValue();
                }
            }
            if (stack.size() == 0) {
                expectingBnodeClose = false;
            }
        }

        private void advanceLineNumber() throws JsonLdError {
            final Matcher match = Regex.NEXT_EOLN.matcher(line);
            if (match.find()) {
                final String[] split = match.group(0).split("" + Regex.EOLN);
                lineNumber += (split.length - 1);
                linePosition += split[split.length - 1].length();
                line = line.substring(match.group(0).length());
            }
        }

        public void advanceLinePosition(int len) throws JsonLdError {
            if (len > 0) {
                linePosition += len;
                line = line.substring(len);
            }

            while (!"".equals(line)) {
                // clear any whitespace
                final Matcher match = Regex.COMMENT_OR_WS.matcher(line);
                if (match.find() && match.group(0).length() > 0) {
                    final Matcher eoln = Regex.EOLN.matcher(match.group(0));
                    int end = 0;
                    while (eoln.find()) {
                        lineNumber += 1;
                        end = eoln.end();
                    }
                    linePosition = match.group(0).length() - end;
                    line = line.substring(match.group(0).length());
                } else {
                    break;
                }
            }
            if ("".equals(line) && !endIsOK()) {
                throw new JsonLdError(JsonLdError.Error.PARSE_ERROR,
                        "Error while parsing Turtle; unexpected end of input. {line: " + lineNumber
                                + ", position:" + linePosition + "}");
            }
        }

        private boolean endIsOK() {
            return curSubject == null && stack.size() == 0;
        }

        public String expandIRI(String ns, String name) throws JsonLdError {
            if (namespaces.containsKey(ns)) {
                return namespaces.get(ns) + name;
            } else {
                throw new JsonLdError(JsonLdError.Error.PARSE_ERROR, "No prefix found for: " + ns
                        + " {line: " + lineNumber + ", position:" + linePosition + "}");
            }
        }
    }

    /**
     * Parses a Turtle document into an {@link RDFDataset}.
     *
     * <p>
     * The input is consumed token by token in a single loop. Each iteration
     * handles a directive, or reads (as needed) a subject, a predicate and
     * one object, then interprets the punctuation that follows. Nested blank
     * nodes and collections are handled by suspending the current
     * subject/predicate on the state's stack.
     *
     * @param input
     *            the Turtle document; must be a {@link String}
     * @return the dataset holding the parsed triples and declared prefixes
     * @throws JsonLdError
     *             with {@code INVALID_INPUT} if {@code input} is not a
     *             String, or {@code PARSE_ERROR} if the document is malformed
     */
    @Override
    public RDFDataset parse(Object input) throws JsonLdError {
        if (!(input instanceof String)) {
            throw new JsonLdError(JsonLdError.Error.INVALID_INPUT,
                    "Invalid input; Triple RDF Parser requires a string input");
        }
        final RDFDataset result = new RDFDataset();
        final State state = new State((String) input);

        while (!"".equals(state.line)) {
            // check if line is a directive
            Matcher match = Regex.DIRECTIVE.matcher(state.line);
            if (match.find()) {
                // groups: 1,2 = @prefix ns/iri; 3 = @base iri;
                // 4,5 = PREFIX ns/iri; 6 = BASE iri
                if (match.group(1) != null || match.group(4) != null) {
                    final String ns = match.group(1) != null ? match.group(1) : match.group(4);
                    String iri = match.group(1) != null ? match.group(2) : match.group(5);
                    if (!iri.contains(":")) {
                        iri = state.baseIri + iri;
                    }
                    iri = unescape(iri);
                    validateIRI(state, iri);
                    state.namespaces.put(ns, iri);
                    result.setNamespace(ns, iri);
                } else {
                    String base = match.group(3) != null ? match.group(3) : match.group(6);
                    base = unescape(base);
                    validateIRI(state, base);
                    if (!base.contains(":")) {
                        state.baseIri = state.baseIri + base;
                    } else {
                        state.baseIri = base;
                    }
                }
                state.advanceLinePosition(match.group(0).length());
                continue;
            }

            if (state.curSubject == null) {
                // we need to match a subject
                match = Regex.SUBJECT.matcher(state.line);
                if (match.find()) {
                    String iri = matchedIri(state, match);
                    if (iri == null) {
                        if (match.group(5) != null) {
                            // matched BNODE
                            iri = state.namer.getName(match.group(0).trim());
                        } else {
                            // matched anon node
                            iri = state.namer.getName();
                        }
                    }
                    // make sure IRI still matches an IRI after escaping
                    validateIRI(state, iri);
                    state.curSubject = iri;
                    state.advanceLinePosition(match.group(0).length());
                }
                // handle blank nodes
                else if (state.line.startsWith("[")) {
                    final String bnode = state.namer.getName();
                    state.advanceLinePosition(1);
                    state.push();
                    state.curSubject = bnode;
                }
                // handle collections
                else if (state.line.startsWith("(")) {
                    final String bnode = state.namer.getName();
                    // so we know we want a predicate if the collection close
                    // isn't followed by a subject end
                    state.curSubject = bnode;
                    state.advanceLinePosition(1);
                    state.push();
                    state.curSubject = bnode;
                    state.curPredicate = RDF_FIRST;
                }
                // make sure we have a subject already
                else {
                    throw parseError(state, "missing expected subject.");
                }
            }

            if (state.curPredicate == null) {
                // match predicate
                match = Regex.PREDICATE.matcher(state.line);
                if (match.find()) {
                    String iri = matchedIri(state, match);
                    if (iri == null) {
                        // matched "a"
                        iri = RDF_TYPE;
                    }
                    validateIRI(state, iri);
                    state.curPredicate = iri;
                    state.advanceLinePosition(match.group(0).length());
                } else {
                    throw parseError(state, "missing expected predicate.");
                }
            }

            // expecting bnode or object

            // match BNODE values
            if (state.line.startsWith("[")) {
                final String bnode = state.namer.getName();
                result.addTriple(state.curSubject, state.curPredicate, bnode);
                state.advanceLinePosition(1);
                // check for anonymous objects
                if (state.line.startsWith("]")) {
                    state.advanceLinePosition(1);
                    // next we expect a statement or object separator
                }
                // otherwise we're inside the blank node
                else {
                    state.push();
                    state.curSubject = bnode;
                    // next we expect a predicate
                    continue;
                }
            }
            // match collections
            else if (state.line.startsWith("(")) {
                state.advanceLinePosition(1);
                // check for empty collection
                if (state.line.startsWith(")")) {
                    state.advanceLinePosition(1);
                    result.addTriple(state.curSubject, state.curPredicate, RDF_NIL);
                    // next we expect a statement or object separator
                }
                // otherwise we're inside the collection
                else {
                    final String bnode = state.namer.getName();
                    result.addTriple(state.curSubject, state.curPredicate, bnode);
                    state.push();
                    state.curSubject = bnode;
                    state.curPredicate = RDF_FIRST;
                    continue;
                }
            } else {
                // match object
                match = Regex.OBJECT.matcher(state.line);
                if (match.find()) {
                    String iri = matchedIri(state, match);
                    if (iri == null && match.group(5) != null) {
                        // matched BNODE
                        iri = state.namer.getName(match.group(0).trim());
                    }
                    if (iri != null) {
                        validateIRI(state, iri);
                        // we have a object
                        result.addTriple(state.curSubject, state.curPredicate, iri);
                    } else {
                        // we have a literal: group 6 = string, 7 = language
                        // tag, 8-11 = datatype IRI alternatives, 12-14 =
                        // numeric alternatives, 15 = boolean
                        String value = match.group(6);
                        String lang = null;
                        String datatype = null;
                        if (value != null) {
                            // we have a string literal
                            value = unquoteString(value);
                            value = unescape(value);
                            lang = match.group(7);
                            if (lang == null) {
                                if (match.group(8) != null) {
                                    datatype = unescape(match.group(8));
                                    if (!datatype.contains(":")) {
                                        datatype = state.baseIri + datatype;
                                    }
                                    validateIRI(state, datatype);
                                } else if (match.group(9) != null) {
                                    datatype = state.expandIRI(match.group(9),
                                            unescapeReserved(match.group(10)));
                                } else if (match.group(11) != null) {
                                    datatype = state.expandIRI(match.group(11), "");
                                }
                            } else {
                                datatype = RDF_LANGSTRING;
                            }
                        } else if (match.group(12) != null) {
                            // double literal (first NUMERIC_LITERAL alternative)
                            value = match.group(12);
                            datatype = XSD_DOUBLE;
                        } else if (match.group(13) != null) {
                            // decimal literal
                            value = match.group(13);
                            datatype = XSD_DECIMAL;
                        } else if (match.group(14) != null) {
                            // integer literal (last NUMERIC_LITERAL alternative)
                            value = match.group(14);
                            datatype = XSD_INTEGER;
                        } else if (match.group(15) != null) {
                            // boolean literal
                            value = match.group(15);
                            datatype = XSD_BOOLEAN;
                        }
                        result.addTriple(state.curSubject, state.curPredicate, value, datatype,
                                lang);
                    }
                    state.advanceLinePosition(match.group(0).length());
                } else {
                    throw parseError(state, "missing expected object or blank node.");
                }
            }

            // close collection
            boolean collectionClosed = false;
            while (state.line.startsWith(")")) {
                if (!RDF_FIRST.equals(state.curPredicate)) {
                    throw parseError(state, "unexpected ).");
                }
                result.addTriple(state.curSubject, RDF_REST, RDF_NIL);
                state.pop();
                state.advanceLinePosition(1);
                collectionClosed = true;
            }

            boolean expectDotOrPred = false;

            // match end of bnode
            if (state.line.startsWith("]")) {
                final String bnode = state.curSubject;
                state.pop();
                state.advanceLinePosition(1);
                if (state.curSubject == null) {
                    // this is a bnode as a subject and we
                    // expect either a . or a predicate
                    state.curSubject = bnode;
                    expectDotOrPred = true;
                }
            }

            // match list separator
            if (!expectDotOrPred && state.line.startsWith(",")) {
                state.advanceLinePosition(1);
                // now we expect another object/bnode
                continue;
            }

            // match predicate end
            if (!expectDotOrPred) {
                while (state.line.startsWith(";")) {
                    state.curPredicate = null;
                    state.advanceLinePosition(1);
                    // now we expect another predicate, or a dot
                    expectDotOrPred = true;
                }
            }

            if (state.line.startsWith(".")) {
                if (state.expectingBnodeClose) {
                    // a nesting level opened by "[" or "(" is still open
                    throw parseError(state, "missing expected \")\" or \"]\".");
                }
                state.curSubject = null;
                state.curPredicate = null;
                state.advanceLinePosition(1);
                // this can now be the end of the document.
                continue;
            } else if (expectDotOrPred) {
                // we're expecting another predicate since we didn't find a dot
                continue;
            }

            // if we're in a collection
            if (RDF_FIRST.equals(state.curPredicate)) {
                // link the current list cell to a fresh one for the next item
                final String bnode = state.namer.getName();
                result.addTriple(state.curSubject, RDF_REST, bnode);
                state.curSubject = bnode;
                continue;
            }

            if (collectionClosed) {
                // we expect another object
                // TODO: it's not clear yet if this is valid
                continue;
            }

            // if we get here, we're missing a close statement
            throw parseError(state, "missing expected \"]\" \",\" \";\" or \".\".");
        }

        return result;
    }

    /**
     * Resolves the IRI alternatives shared by the SUBJECT, PREDICATE and
     * OBJECT patterns: group 1 is an IRI reference, groups 2/3 are the prefix
     * and local part of a prefixed name, and group 4 is a bare prefix.
     *
     * @return the resolved IRI, or null if none of those groups matched
     */
    private String matchedIri(State state, Matcher match) throws JsonLdError {
        if (match.group(1) != null) {
            // matched IRI
            String iri = unescape(match.group(1));
            if (!iri.contains(":")) {
                iri = state.baseIri + iri;
            }
            return iri;
        }
        if (match.group(2) != null) {
            // matched NS:NAME
            return state.expandIRI(match.group(2), unescapeReserved(match.group(3)));
        }
        if (match.group(4) != null) {
            // matched ns: only
            return state.expandIRI(match.group(4), "");
        }
        return null;
    }

    /**
     * Builds a PARSE_ERROR carrying {@code detail} followed by the current
     * line number and position, separated so both values stay readable.
     */
    private JsonLdError parseError(State state, String detail) {
        return new JsonLdError(JsonLdError.Error.PARSE_ERROR, "Error while parsing Turtle; "
                + detail + " {line: " + state.lineNumber + ", position: " + state.linePosition
                + "}");
    }

    /**
     * Accepts either the content of an IRI reference without its enclosing
     * angle brackets (any run of characters outside the excluded set, or
     * UCHAR escapes), or a prefixed name.
     */
    final public static Pattern IRIREF_MINUS_CONTAINER = Pattern
            .compile("(?:(?:[^\\x00-\\x20<>\"{}|\\^`\\\\]|" + UCHAR + ")*)|" + Regex.PREFIXED_NAME);

    /**
     * Checks that {@code iri} matches {@link #IRIREF_MINUS_CONTAINER} in
     * full.
     *
     * @param state
     *            supplies the line number and position for the error message
     * @param iri
     *            the value to check
     * @throws JsonLdError
     *             with {@code PARSE_ERROR} if {@code iri} does not match
     */
    private void validateIRI(State state, String iri) throws JsonLdError {
        if (!IRIREF_MINUS_CONTAINER.matcher(iri).matches()) {
            // ", " keeps the line number and "position" from running together
            throw new JsonLdError(JsonLdError.Error.PARSE_ERROR,
                    "Error while parsing Turtle; invalid IRI after escaping. {line: "
                            + state.lineNumber + ", position: " + state.linePosition + "}");
        }
    }

    /**
     * A backslash followed by one of the reserved characters
     * {@code _ ~ . - ! $ & ' ( ) * + , ; = / ? # @ %}; the character itself
     * is captured as group 1.
     */
    final private static Pattern PN_LOCAL_ESC_MATCHED = Pattern
            .compile("[\\\\]([_~\\.\\-!$&'\\(\\)*+,;=/?#@%])");

    /**
     * Drops the backslash from every escaped reserved character in
     * {@code str}.
     *
     * @param str
     *            the text to unescape; may be null
     * @return {@code str} with each escape replaced by its bare character,
     *         or {@code str} itself when it is null or contains no escape
     */
    static String unescapeReserved(String str) {
        if (str == null) {
            return null;
        }
        final Matcher escapes = PN_LOCAL_ESC_MATCHED.matcher(str);
        return escapes.find() ? escapes.replaceAll("$1") : str;
    }

    /**
     * Strips the enclosing quotes from a matched string literal: three
     * characters from each end for the triple-quoted forms, one for the
     * single-quoted forms. A value that starts with neither kind of quote is
     * returned unchanged.
     */
    private String unquoteString(String value) {
        final int quoteLength;
        if (value.startsWith("\"\"\"") || value.startsWith("'''")) {
            quoteLength = 3;
        } else if (value.startsWith("\"") || value.startsWith("'")) {
            quoteLength = 1;
        } else {
            return value;
        }
        return value.substring(quoteLength, value.length() - quoteLength);
    }

}