All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.rdf4j.rio.turtle.TurtleParser Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/
package org.eclipse.rdf4j.rio.turtle;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.BOMInputStream;
import org.eclipse.rdf4j.common.text.ASCIIUtil;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Triple;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.base.CoreDatatype;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.XSD;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;

/**
 * RDF parser for RDF-1.1 Turtle files. This parser is not thread-safe,
 * therefore its public methods are synchronized.
 * 

*

  • Normalization of integer, floating point and boolean values is dependent on the specified datatype handling. * According to the specification, integers and booleans should be normalized, but floats don't.
  • *
  • Comments can be used anywhere in the document, and extend to the end of the line. The Turtle grammar doesn't * allow comments to be used inside triple constructs that extend over multiple lines, but the author's own parser * deviates from this too.
  • * * * @author Arjohn Kampman * @author Peter Ansell */ public class TurtleParser extends AbstractRDFParser { /*-----------* * Variables * *-----------*/ private PushbackReader reader; protected Resource subject; protected IRI predicate; protected Value object; private int lineNumber = 1; private final StringBuilder parsingBuilder = new StringBuilder(); /** * The most recently read complete statement. */ private Statement previousStatement; /*--------------* * Constructors * *--------------*/ /** * Creates a new TurtleParser that will use a {@link SimpleValueFactory} to create RDF model objects. */ public TurtleParser() { super(); } /** * Creates a new TurtleParser that will use the supplied ValueFactory to create RDF model objects. * * @param valueFactory A ValueFactory. */ public TurtleParser(ValueFactory valueFactory) { super(valueFactory); } /*---------* * Methods * *---------*/ @Override public RDFFormat getRDFFormat() { return RDFFormat.TURTLE; } @Override public Collection> getSupportedSettings() { Set> result = new HashSet<>(super.getSupportedSettings()); result.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES); result.add(TurtleParserSettings.ACCEPT_TURTLESTAR); return result; } @Override public synchronized void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException { if (in == null) { throw new IllegalArgumentException("Input stream must not be 'null'"); } try { parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI); } catch (UnsupportedEncodingException e) { // Every platform should support the UTF-8 encoding... throw new RuntimeException(e); } } @Override public synchronized void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException { clear(); try { if (reader == null) { throw new IllegalArgumentException("Reader must not be 'null'"); } if (rdfHandler != null) { rdfHandler.startRDF(); } // Start counting lines at 1: lineNumber = 1; // Allow at most 8 characters to be pushed back: this.reader = new PushbackReader(reader, 10); if (baseURI != null) { // Store normalized base URI setBaseURI(baseURI); } reportLocation(); int c = skipWSC(); while (c != -1) { parseStatement(); c = skipWSC(); } } finally { clear(); } if (rdfHandler != null) { rdfHandler.endRDF(); } } protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException { StringBuilder sb = new StringBuilder(8); int codePoint; // longest valid directive @prefix do { codePoint = readCodePoint(); if (codePoint == -1 || TurtleUtil.isWhitespace(codePoint)) { unread(codePoint); break; } appendCodepoint(sb, codePoint); } while (sb.length() < 8); String directive = sb.toString(); if (directive.startsWith("@") || directive.equalsIgnoreCase("prefix") || directive.equalsIgnoreCase("base")) { parseDirective(directive); skipWSC(); // SPARQL BASE and PREFIX lines do not end in . if (directive.startsWith("@")) { verifyCharacterOrFail(readCodePoint(), "."); } } else { unread(directive); parseTriples(); skipWSC(); verifyCharacterOrFail(readCodePoint(), "."); } } protected void parseDirective(String directive) throws IOException, RDFParseException, RDFHandlerException { if (directive.length() >= 7 && directive.substring(0, 7).equals("@prefix")) { if (directive.length() > 7) { unread(directive.substring(7)); } parsePrefixID(); } else if (directive.length() >= 5 && directive.substring(0, 5).equals("@base")) { if (directive.length() > 5) { unread(directive.substring(5)); } parseBase(); } else if (directive.length() >= 6 && directive.substring(0, 6).equalsIgnoreCase("prefix")) { // SPARQL doesn't require whitespace after directive, so must unread // if // we found part of the prefixID if (directive.length() > 6) { unread(directive.substring(6)); } parsePrefixID(); } else if ((directive.length() >= 4 && directive.substring(0, 4).equalsIgnoreCase("base"))) { if (directive.length() > 4) { unread(directive.substring(4)); } parseBase(); } else if (directive.length() >= 7 && directive.substring(0, 7).equalsIgnoreCase("@prefix")) { if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) { reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode."); } if (directive.length() > 7) { unread(directive.substring(7)); } parsePrefixID(); } else if (directive.length() >= 5 && directive.substring(0, 5).equalsIgnoreCase("@base")) { if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) { reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode."); } if (directive.length() > 5) { unread(directive.substring(5)); } parseBase(); } else if (directive.isEmpty()) { reportFatalError("Directive name is missing, expected @prefix or @base"); } else { reportFatalError("Unknown directive \"" + directive + "\""); } } protected void parsePrefixID() throws IOException, RDFParseException, RDFHandlerException { skipWSC(); // Read prefix ID (e.g. "rdf:" or ":") StringBuilder prefixID = new StringBuilder(8); while (true) { int c = readCodePoint(); if (c == ':') { unread(c); break; } else if (TurtleUtil.isWhitespace(c)) { break; } else if (c == -1) { throwEOFException(); } appendCodepoint(prefixID, c); } skipWSC(); verifyCharacterOrFail(readCodePoint(), ":"); skipWSC(); // Read the namespace URI String namespaceStr = parseURI().toString(); String prefixStr = prefixID.toString(); // Store and report this namespace mapping setNamespace(prefixStr, namespaceStr); if (rdfHandler != null) { rdfHandler.handleNamespace(prefixStr, namespaceStr); } } protected void parseBase() throws IOException, RDFParseException, RDFHandlerException { skipWSC(); IRI baseURI = parseURI(); setBaseURI(baseURI.toString()); } protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); // If the first character is an open bracket we need to decide which of // the two parsing methods for blank nodes to use if (c == '[') { c = readCodePoint(); skipWSC(); c = peekCodePoint(); if (c == ']') { c = readCodePoint(); subject = createNode(); skipWSC(); parsePredicateObjectList(); } else { unread('['); subject = parseImplicitBlank(); } skipWSC(); c = peekCodePoint(); // if this is not the end of the statement, recurse into the list of // predicate and objects, using the subject parsed above as the // subject // of the statement. if (c != '.') { parsePredicateObjectList(); } } else { parseSubject(); skipWSC(); parsePredicateObjectList(); } subject = null; predicate = null; object = null; } protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException { predicate = parsePredicate(); skipWSC(); parseObjectList(); while (skipWSC() == ';') { readCodePoint(); int c = skipWSC(); if (c == '.' || // end of triple c == ']' || c == '}') // end of predicateObjectList inside // blank // node { break; } else if (c == ';') { // empty predicateObjectList, skip to next continue; } predicate = parsePredicate(); skipWSC(); parseObjectList(); } } protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException { parseObject(); if (skipWSC() == '{') { parseAnnotation(); } while (skipWSC() == ',') { readCodePoint(); skipWSC(); parseObject(); if (skipWSC() == '{') { parseAnnotation(); } } } protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); if (c == '(') { subject = parseCollection(); } else if (c == '[') { subject = parseImplicitBlank(); } else { Value value = parseValue(); if (value instanceof Resource) { subject = (Resource) value; } else if (value != null) { reportFatalError("Illegal subject value: " + value); } } } protected IRI parsePredicate() throws IOException, RDFParseException, RDFHandlerException { // Check if the short-cut 'a' is used int c1 = readCodePoint(); if (c1 == 'a') { int c2 = readCodePoint(); if (TurtleUtil.isWhitespace(c2)) { // Short-cut is used, return the rdf:type URI return RDF.TYPE; } // Short-cut is not used, unread all characters unread(c2); } unread(c1); // Predicate is a normal resource Value predicate = parseValue(); if (predicate instanceof IRI) { return (IRI) predicate; } else { reportFatalError("Illegal predicate value: " + predicate); return null; } } /** * Parse an object * * @throws IOException * @throws RDFParseException * @throws RDFHandlerException */ protected void parseObject() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); switch (c) { case '(': object = parseCollection(); break; case '[': object = parseImplicitBlank(); break; default: object = parseValue(); reportStatement(subject, predicate, object); break; } } /** * Parses a collection, e.g. ( item1 item2 item3 ). */ protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException { verifyCharacterOrFail(readCodePoint(), "("); int c = skipWSC(); if (c == ')') { // Empty list readCodePoint(); if (subject != null) { reportStatement(subject, predicate, RDF.NIL); } return RDF.NIL; } else { Resource listRoot = createNode(); if (subject != null) { reportStatement(subject, predicate, listRoot); } // Remember current subject and predicate Resource oldSubject = subject; IRI oldPredicate = predicate; // generated bNode becomes subject, predicate becomes rdf:first subject = listRoot; predicate = RDF.FIRST; parseObject(); Resource bNode = listRoot; while (skipWSC() != ')') { // Create another list node and link it to the previous Resource newNode = createNode(); reportStatement(bNode, RDF.REST, newNode); // New node becomes the current subject = bNode = newNode; parseObject(); } // Skip ')' readCodePoint(); // Close the list reportStatement(bNode, RDF.REST, RDF.NIL); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; return listRoot; } } /** * Parses an implicit blank node. This method parses the token [] and predicateObjectLists that are * surrounded by square brackets. */ protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException { verifyCharacterOrFail(readCodePoint(), "["); Resource bNode = createNode(); if (subject != null) { reportStatement(subject, predicate, bNode); } skipWSC(); int c = readCodePoint(); if (c != ']') { unread(c); // Remember current subject and predicate Resource oldSubject = subject; IRI oldPredicate = predicate; // generated bNode becomes subject subject = bNode; // Enter recursion with nested predicate-object list skipWSC(); parsePredicateObjectList(); skipWSC(); // Read closing bracket verifyCharacterOrFail(readCodePoint(), "]"); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; } return bNode; } /** * Parses an RDF value. This method parses uriref, qname, node ID, quoted literal, integer, double and boolean. */ protected Value parseValue() throws IOException, RDFParseException, RDFHandlerException { if (getParserConfig().get(TurtleParserSettings.ACCEPT_TURTLESTAR) && peekIsTripleValue()) { return parseTripleValue(); } int c = peekCodePoint(); if (c == '<') { // uriref, e.g. return parseURI(); } else if (c == ':' || TurtleUtil.isPrefixStartChar(c)) { // qname or boolean return parseQNameOrBoolean(); } else if (c == '_') { // node ID, e.g. _:n1 return parseNodeID(); } else if (c == '"' || c == '\'') { // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo''' return parseQuotedLiteral(); } else if (ASCIIUtil.isNumber(c) || c == '.' || c == '+' || c == '-') { // integer or double, e.g. 123 or 1.2e3 return parseNumber(); } else if (c == -1) { throwEOFException(); return null; } else { reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(c)) + "'"); return null; } } /** * Parses a quoted string, optionally followed by a language tag or datatype. */ protected Literal parseQuotedLiteral() throws IOException, RDFParseException, RDFHandlerException { String label = parseQuotedString(); // Check for presence of a language tag or datatype int c = peekCodePoint(); if (c == '@') { readCodePoint(); // Read language StringBuilder lang = getBuilder(); c = readCodePoint(); if (c == -1) { throwEOFException(); } boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS); if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) { reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_LANGUAGE_TAGS); } appendCodepoint(lang, c); c = readCodePoint(); while (!TurtleUtil.isWhitespace(c)) { // SES-1887 : Flexibility introduced for SES-1985 and SES-1821 // needs // to be counterbalanced against legitimate situations where // Turtle // language tags do not need whitespace following the language // tag if (c == '.' || c == ';' || c == ',' || c == ')' || c == ']' || c == '>' || c == -1) { break; } if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) { reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_LANGUAGE_TAGS); } appendCodepoint(lang, c); c = readCodePoint(); } unread(c); return createLiteral(label, lang.toString(), ((IRI) null), getLineNumber(), -1); } else if (c == '^') { readCodePoint(); // next character should be another '^' verifyCharacterOrFail(readCodePoint(), "^"); skipWSC(); // Read datatype Value datatype = parseValue(); if (datatype == null) { // the datatype IRI could not be parsed. report as error only if VERIFY_URI_SYNTAX is enabled, silently // skip otherwise. reportError("Invalid datatype IRI for literal '" + label + "'", BasicParserSettings.VERIFY_URI_SYNTAX); return null; } else if (!(datatype instanceof IRI)) { reportFatalError("Illegal datatype value: " + datatype); } return createLiteral(label, null, (IRI) datatype, getLineNumber(), -1); } else { return createLiteral(label, null, ((IRI) null), getLineNumber(), -1); } } /** * Parses a quoted string, which is either a "normal string" or a """long string""". * * @return string * @throws IOException * @throws RDFParseException */ protected String parseQuotedString() throws IOException, RDFParseException { String result; int c1 = readCodePoint(); // First character should be '"' or "'" verifyCharacterOrFail(c1, "\"\'"); // Check for long-string, which starts and ends with three double quotes int c2 = readCodePoint(); int c3 = readCodePoint(); if ((c1 == '"' && c2 == '"' && c3 == '"') || (c1 == '\'' && c2 == '\'' && c3 == '\'')) { // Long string result = parseLongString(c2); } else { // Normal string unread(c3); unread(c2); result = parseString(c1); } // Unescape any escape sequences try { result = TurtleUtil.decodeString(result); } catch (IllegalArgumentException e) { reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES); } return result; } /** * Parses a "normal string". This method requires that the opening character has already been parsed. * * @return parsed string * @throws IOException * @throws RDFParseException */ protected String parseString(int closingCharacter) throws IOException, RDFParseException { StringBuilder sb = getBuilder(); while (true) { int c = readCodePoint(); if (c == closingCharacter) { break; } else if (c == -1) { throwEOFException(); } if (c == '\r' || c == '\n') { reportFatalError("Illegal carriage return or new line in literal"); } if (c == '\r' || c == '\n') { reportFatalError("Illegal carriage return or new line in literal"); } appendCodepoint(sb, c); if (c == '\\') { // This escapes the next character, which might be a '"' c = readCodePoint(); if (c == -1) { throwEOFException(); } appendCodepoint(sb, c); } } return sb.toString(); } /** * Parses a """long string""". This method requires that the first three characters have already been parsed. */ protected String parseLongString(int closingCharacter) throws IOException, RDFParseException { StringBuilder sb = getBuilder(); int doubleQuoteCount = 0; int c; while (doubleQuoteCount < 3) { c = readCodePoint(); if (c == -1) { throwEOFException(); } else if (c == closingCharacter) { doubleQuoteCount++; } else { doubleQuoteCount = 0; } appendCodepoint(sb, c); if (c == '\n') { lineNumber++; reportLocation(); } if (c == '\\') { // This escapes the next character, which might be a '"' c = readCodePoint(); if (c == -1) { throwEOFException(); } appendCodepoint(sb, c); } } return sb.substring(0, sb.length() - 3); } protected Literal parseNumber() throws IOException, RDFParseException { StringBuilder value = getBuilder(); IRI datatype = XSD.INTEGER; int c = readCodePoint(); // read optional sign character if (c == '+' || c == '-') { appendCodepoint(value, c); c = readCodePoint(); } while (ASCIIUtil.isNumber(c)) { appendCodepoint(value, c); c = readCodePoint(); } if (c == '.' || c == 'e' || c == 'E') { // read optional fractional digits if (c == '.') { if (TurtleUtil.isWhitespace(peekCodePoint())) { // We're parsing an integer that did not have a space before // the // period to end the statement } else { appendCodepoint(value, c); c = readCodePoint(); while (ASCIIUtil.isNumber(c)) { appendCodepoint(value, c); c = readCodePoint(); } if (value.length() == 1) { // We've only parsed a '.' reportFatalError("Object for statement missing"); } // We're parsing a decimal or a double datatype = XSD.DECIMAL; } } else { if (value.length() == 0) { // We've only parsed an 'e' or 'E' reportFatalError("Object for statement missing"); } } // read optional exponent if (c == 'e' || c == 'E') { datatype = XSD.DOUBLE; appendCodepoint(value, c); c = readCodePoint(); if (c == '+' || c == '-') { appendCodepoint(value, c); c = readCodePoint(); } if (!ASCIIUtil.isNumber(c)) { reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES); } appendCodepoint(value, c); c = readCodePoint(); while (ASCIIUtil.isNumber(c)) { appendCodepoint(value, c); c = readCodePoint(); } } } // Unread last character, it isn't part of the number unread(c); // String label = value.toString(); // if (datatype.equals(XMLSchema.INTEGER)) { // try { // label = XMLDatatypeUtil.normalizeInteger(label); // } // catch (IllegalArgumentException e) { // // Note: this should never happen because of the parse constraints // reportError("Illegal integer value: " + label); // } // } // return createLiteral(label, null, datatype); // Return result as a typed literal return createLiteral(value.toString(), null, datatype, getLineNumber(), -1); } protected IRI parseURI() throws IOException, RDFParseException { StringBuilder uriBuf = getBuilder(); // First character should be '<' int c = readCodePoint(); verifyCharacterOrFail(c, "<"); boolean uriIsIllegal = false; // Read up to the next '>' character while (true) { c = readCodePoint(); if (c == '>') { break; } else if (c == -1) { throwEOFException(); } if (c == ' ') { reportError("IRI included an unencoded space: '" + c + "'", BasicParserSettings.VERIFY_URI_SYNTAX); uriIsIllegal = true; } appendCodepoint(uriBuf, c); if (c == '\\') { // This escapes the next character, which might be a '>' c = readCodePoint(); if (c == -1) { throwEOFException(); } if (c != 'u' && c != 'U') { reportError("IRI includes string escapes: '\\" + c + "'", BasicParserSettings.VERIFY_URI_SYNTAX); uriIsIllegal = true; } appendCodepoint(uriBuf, c); } } if (c == '.') { reportError("IRI must not end in a '.'", BasicParserSettings.VERIFY_URI_SYNTAX); uriIsIllegal = true; } // do not report back the actual URI if it's illegal and the parser is // configured to verify URI syntax. if (!(uriIsIllegal && getParserConfig().get(BasicParserSettings.VERIFY_URI_SYNTAX))) { String uri = uriBuf.toString(); // Unescape any escape sequences try { // FIXME: The following decodes \n and similar in URIs, which // should // be // invalid according to test uri = TurtleUtil.decodeString(uri); } catch (IllegalArgumentException e) { reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES); } return super.resolveURI(uri); } return null; } /** * Parses qnames and boolean values, which have equivalent starting characters. */ protected Value parseQNameOrBoolean() throws IOException, RDFParseException { // First character should be a ':' or a letter int c = readCodePoint(); if (c == -1) { throwEOFException(); } if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) { reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_RELATIVE_URIS); } String namespace; if (c == ':') { // qname using default namespace namespace = getNamespace(""); } else { // c is the first letter of the prefix StringBuilder prefix = new StringBuilder(8); appendCodepoint(prefix, c); int previousChar = c; c = readCodePoint(); while (TurtleUtil.isPrefixChar(c)) { appendCodepoint(prefix, c); previousChar = c; c = readCodePoint(); } while (previousChar == '.' && prefix.length() > 0) { // '.' is a legal prefix name char, but can not appear at the end unread(c); c = previousChar; prefix.setLength(prefix.length() - 1); previousChar = prefix.codePointAt(prefix.codePointCount(0, prefix.length()) - 1); } if (c != ':') { // prefix may actually be a boolean value String value = prefix.toString(); if (value.equals("true")) { unread(c); return createLiteral("true", null, CoreDatatype.XSD.BOOLEAN, getLineNumber(), -1); } else if (value.equals("false")) { unread(c); return createLiteral("false", null, CoreDatatype.XSD.BOOLEAN, getLineNumber(), -1); } } verifyCharacterOrFail(c, ":"); namespace = getNamespace(prefix.toString()); } // c == ':', read optional local name StringBuilder localName = new StringBuilder(16); c = readCodePoint(); if (TurtleUtil.isNameStartChar(c)) { if (c == '\\') { localName.append(readLocalEscapedChar()); } else { appendCodepoint(localName, c); } int previousChar = c; c = readCodePoint(); while (TurtleUtil.isNameChar(c)) { if (c == '\\') { localName.append(readLocalEscapedChar()); } else { appendCodepoint(localName, c); } previousChar = c; c = readCodePoint(); } // Unread last character unread(c); if (previousChar == '.') { // '.' is a legal name char, but can not appear at the end, so // is // not actually part of the name unread(previousChar); localName.deleteCharAt(localName.length() - 1); } } else { // Unread last character unread(c); } String localNameString = localName.toString(); for (int i = 0; i < localNameString.length(); i++) { if (localNameString.charAt(i) == '%') { if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1)) || !ASCIIUtil.isHex(localNameString.charAt(i + 2))) { reportFatalError("Found incomplete percent-encoded sequence: " + localNameString); } } } // if (c == '.') { // reportFatalError("Blank node identifier must not end in a '.'"); // } // Note: namespace has already been resolved return createURI(namespace + localNameString); } private char readLocalEscapedChar() throws RDFParseException, IOException { int c = readCodePoint(); if (TurtleUtil.isLocalEscapedChar(c)) { return (char) c; } else { throw new RDFParseException("found '" + new String(Character.toChars(c)) + "', expected one of: " + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS)); } } /** * Parses a blank node ID, e.g. _:node1. */ protected Resource parseNodeID() throws IOException, RDFParseException { // Node ID should start with "_:" verifyCharacterOrFail(readCodePoint(), "_"); verifyCharacterOrFail(readCodePoint(), ":"); // Read the node ID int c = readCodePoint(); if (c == -1) { throwEOFException(); } else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) { reportError("Expected a letter, found '" + (char) c + "'", BasicParserSettings.PRESERVE_BNODE_IDS); } StringBuilder name = getBuilder(); appendCodepoint(name, c); // Read all following letter and numbers, they are part of the name c = readCodePoint(); // If we would never go into the loop we must unread now if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { unread(c); } while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { int previous = c; c = readCodePoint(); if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) { unread(c); unread(previous); break; } appendCodepoint(name, previous); if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { unread(c); } } return createNode(name.toString()); } protected void reportStatement(Resource subj, IRI pred, Value obj) throws RDFParseException, RDFHandlerException { if (subj != null && pred != null && obj != null) { previousStatement = createStatement(subj, pred, obj); if (rdfHandler != null) { rdfHandler.handleStatement(previousStatement); } } } /** * Verifies that the supplied character code point codePoint is one of the expected characters specified * in expected. This method will throw a ParseException if this is not the case. */ protected void verifyCharacterOrFail(int codePoint, String expected) throws RDFParseException { if (codePoint == -1) { throwEOFException(); } final String supplied = new String(Character.toChars(codePoint)); if (expected.indexOf(supplied) == -1) { StringBuilder msg = new StringBuilder(32); msg.append("Expected "); for (int i = 0; i < expected.length(); i++) { if (i > 0) { msg.append(" or "); } msg.append('\''); msg.append(expected.charAt(i)); msg.append('\''); } msg.append(", found '"); msg.append(supplied); msg.append("'"); reportFatalError(msg.toString()); } } /** * Consumes any white space characters (space, tab, line feed, newline) and comments (#-style) from * reader. After this method has been called, the first character that is returned by reader * is either a non-ignorable character, or EOF. For convenience, this character is also returned by this method. * * @return The next character code point that will be returned by reader. */ protected int skipWSC() throws IOException, RDFHandlerException { int c = readCodePoint(); while (TurtleUtil.isWhitespace(c) || c == '#') { if (c == '#') { processComment(); } else if (c == '\n') { // we only count line feeds (LF), not carriage return (CR), as // normally a CR is immediately followed by a LF. lineNumber++; reportLocation(); } c = readCodePoint(); } unread(c); return c; } /** * Consumes characters from reader until the first EOL has been read. This line of text is then passed to the * {@link #rdfHandler} as a comment. */ protected void processComment() throws IOException, RDFHandlerException { StringBuilder comment = getBuilder(); int c = readCodePoint(); while (c != -1 && c != 0xD && c != 0xA) { appendCodepoint(comment, c); c = readCodePoint(); } if (c == 0xA) { lineNumber++; } // c is equal to -1, \r or \n. // In case c is equal to \r, we should also read a following \n. if (c == 0xD) { c = readCodePoint(); lineNumber++; if (c != 0xA) { unread(c); } } if (rdfHandler != null) { rdfHandler.handleComment(comment.toString()); } reportLocation(); } /** * Reads the next Unicode code point. * * @return the next Unicode code point, or -1 if the end of the stream has been reached. * @throws IOException */ protected int readCodePoint() throws IOException { int next = reader.read(); if (Character.isHighSurrogate((char) next)) { next = Character.toCodePoint((char) next, (char) reader.read()); } return next; } /** * Pushes back a single code point by copying it to the front of the buffer. After this method returns, a call to * {@link #readCodePoint()} will return the same code point c again. * * @param codePoint a single Unicode code point. * @throws IOException */ protected void unread(int codePoint) throws IOException { if (codePoint != -1) { if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); } else { reader.unread(codePoint); } } } /** * Pushes back the supplied string by copying it to the front of the buffer. After this method returns, successive * calls to {@link #readCodePoint()} will return the code points in the supplied string again, starting at the first * in the String.. * * @param string the string to un-read. * @throws IOException */ protected void unread(String string) throws IOException { int i = string.length(); while (i > 0) { final int codePoint = string.codePointBefore(i); if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); i -= surrogatePair.length; } else { reader.unread(codePoint); i--; } } } /** * Peeks at the next Unicode code point without advancing the reader, and returns its value. * * @return the next Unicode code point, or -1 if the end of the stream has been reached. * @throws IOException */ protected int peekCodePoint() throws IOException { int result = readCodePoint(); unread(result); return result; } protected void reportLocation() { reportLocation(getLineNumber(), -1); } /** * Overrides {@link AbstractRDFParser#reportWarning(String)}, adding line number information to the error. */ @Override protected void reportWarning(String msg) { reportWarning(msg, getLineNumber(), -1); } /** * Overrides {@link AbstractRDFParser#reportError(String, RioSetting)}, adding line number information to the error. */ @Override protected void reportError(String msg, RioSetting setting) throws RDFParseException { reportError(msg, getLineNumber(), -1, setting); } /** * Overrides {@link AbstractRDFParser#reportFatalError(String)}, adding line number information to the error. */ @Override protected void reportFatalError(String msg) throws RDFParseException { reportFatalError(msg, getLineNumber(), -1); } /** * Overrides {@link AbstractRDFParser#reportFatalError(Exception)}, adding line number information to the error. */ @Override protected void reportFatalError(Exception e) throws RDFParseException { reportFatalError(e, getLineNumber(), -1); } protected void throwEOFException() throws RDFParseException { throw new RDFParseException("Unexpected end of file"); } protected int getLineNumber() { return lineNumber; } private StringBuilder getBuilder() { parsingBuilder.setLength(0); return parsingBuilder; } /** * Appends the characters from codepoint into the string builder. This is the same as Character#toChars but prevents * the additional char array garbage for BMP codepoints. * * @param dst the destination in which to append the characters * @param codePoint the codepoint to be appended */ private static void appendCodepoint(StringBuilder dst, int codePoint) { if (Character.isBmpCodePoint(codePoint)) { dst.append((char) codePoint); } else if (Character.isValidCodePoint(codePoint)) { dst.append(Character.highSurrogate(codePoint)); dst.append(Character.lowSurrogate(codePoint)); } else { throw new IllegalArgumentException("Invalid codepoint " + codePoint); } } /** * Peeks at the next two Unicode code points without advancing the reader and returns true if they indicate the * start of an RDF-star triple value. Such values start with '<<'. * * @return true if the next code points indicate the beginning of an RDF-star triple value, false otherwise * @throws IOException */ protected boolean peekIsTripleValue() throws IOException { int c0 = readCodePoint(); int c1 = readCodePoint(); unread(c1); unread(c0); return c0 == '<' && c1 == '<'; } /** * Parser an RDF-star triple value and returns it. * * @return An RDF-star triple. * @throws IOException */ protected Triple parseTripleValue() throws IOException { verifyCharacterOrFail(readCodePoint(), "<"); verifyCharacterOrFail(readCodePoint(), "<"); skipWSC(); Value subject = parseValue(); if (subject instanceof Resource) { skipWSC(); Value predicate = parseValue(); if (predicate instanceof IRI) { skipWSC(); Value object = parseValue(); if (object != null) { skipWSC(); verifyCharacterOrFail(readCodePoint(), ">"); verifyCharacterOrFail(readCodePoint(), ">"); return valueFactory.createTriple((Resource) subject, (IRI) predicate, object); } else { reportFatalError("Missing object in RDF-star triple"); } } else { reportFatalError("Illegal predicate value in RDF-star triple: " + predicate); } } else { reportFatalError("Illegal subject val in RDF-star triple: " + subject); } return null; } protected void parseAnnotation() throws IOException { verifyCharacterOrFail(readCodePoint(), "{"); verifyCharacterOrFail(readCodePoint(), "|"); skipWSC(); // keep reference to original subject and predicate while processing the annotation content final Resource currentSubject = subject; final IRI currentPredicate = predicate; subject = Values.triple(previousStatement); parsePredicateObjectList(); verifyCharacterOrFail(readCodePoint(), "|"); verifyCharacterOrFail(readCodePoint(), "}"); subject = currentSubject; predicate = currentPredicate; } }




    © 2015 - 2025 Weber Informatics LLC | Privacy Policy