// org.eclipse.rdf4j.rio.turtle.TurtleParser Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.turtle;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.input.BOMInputStream;
import org.eclipse.rdf4j.common.text.ASCIIUtil;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Triple;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.base.CoreDatatype;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.XSD;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
/**
 * RDF parser for RDF-1.1 Turtle files. This parser is not thread-safe,
 * therefore its {@code parse} methods are synchronized.
 * <ul>
 * <li>Normalization of integer, floating point and boolean values is dependent on the specified datatype handling.
 * According to the specification, integers and booleans should be normalized, but floats should not.</li>
 * <li>Comments can be used anywhere in the document, and extend to the end of the line. The Turtle grammar doesn't
 * allow comments to be used inside triple constructs that extend over multiple lines, but the author's own parser
 * deviates from this too.</li>
 * </ul>
 *
 * @author Arjohn Kampman
 * @author Peter Ansell
 */
public class TurtleParser extends AbstractRDFParser {
/*-----------*
* Variables *
*-----------*/
/** Character source of the current parse; code points are read from and pushed back onto it. */
private PushbackReader reader;
/** Subject of the triple currently being parsed; reset to null at the end of parseTriples(). */
protected Resource subject;
/** Predicate of the triple currently being parsed; reset to null at the end of parseTriples(). */
protected IRI predicate;
/** Object of the triple currently being parsed; reset to null at the end of parseTriples(). */
protected Value object;
/** Current line number, starting at 1; used for location and error reporting. */
private int lineNumber = 1;
/** Scratch buffer handed out (after being emptied) by getBuilder(). */
private final StringBuilder parsingBuilder = new StringBuilder();
/**
* The most recently read complete statement.
*/
private Statement previousStatement;
/*--------------*
* Constructors *
*--------------*/
/**
* Creates a new TurtleParser that will use a {@link SimpleValueFactory} to create RDF model objects.
*/
public TurtleParser() {
super();
}
/**
* Creates a new TurtleParser that will use the supplied ValueFactory to create RDF model objects.
*
* @param valueFactory A ValueFactory; it is passed unchanged to the superclass constructor.
*/
public TurtleParser(ValueFactory valueFactory) {
super(valueFactory);
}
/*---------*
* Methods *
*---------*/
/**
 * Identifies the format handled by this parser.
 *
 * @return always {@link RDFFormat#TURTLE}
 */
@Override
public RDFFormat getRDFFormat() {
return RDFFormat.TURTLE;
}
@Override
public Collection> getSupportedSettings() {
Set> result = new HashSet<>(super.getSupportedSettings());
result.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES);
result.add(TurtleParserSettings.ACCEPT_TURTLESTAR);
return result;
}
/**
 * Parses Turtle data from the supplied stream. The stream is wrapped in a {@link BOMInputStream} and decoded as
 * UTF-8 before being handed to {@link #parse(Reader, String)}.
 *
 * @param in      the stream to read from; must not be {@code null}
 * @param baseURI the base URI to resolve relative IRIs against; may be {@code null}
 * @throws IllegalArgumentException if {@code in} is {@code null}
 * @throws IOException              if reading from the stream fails
 * @throws RDFParseException        if the parser reports a fatal parse error
 * @throws RDFHandlerException      if the configured handler fails
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
        throws IOException, RDFParseException, RDFHandlerException {
    if (in == null) {
        throw new IllegalArgumentException("Input stream must not be 'null'");
    }
    // The Charset-based InputStreamReader constructor cannot fail on an unknown encoding, so no
    // UnsupportedEncodingException handling is needed; genuine I/O failures propagate as IOException.
    parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI);
}
/**
 * Parses Turtle data from the supplied reader, statement by statement, until end of input.
 * Parser state is cleared before parsing and again (in a finally block) afterwards.
 * {@code rdfHandler.endRDF()} is only reached when parsing finished without an exception.
 *
 * @param reader  the reader to read from; must not be {@code null}
 * @param baseURI the base URI to resolve relative IRIs against; ignored when {@code null}
 * @throws IllegalArgumentException if {@code reader} is {@code null}
 */
@Override
public synchronized void parse(Reader reader, String baseURI)
throws IOException, RDFParseException, RDFHandlerException {
clear();
try {
if (reader == null) {
throw new IllegalArgumentException("Reader must not be 'null'");
}
if (rdfHandler != null) {
rdfHandler.startRDF();
}
// Start counting lines at 1:
lineNumber = 1;
// Allow up to 10 characters to be pushed back:
this.reader = new PushbackReader(reader, 10);
if (baseURI != null) {
// Store normalized base URI
setBaseURI(baseURI);
}
reportLocation();
// skipWSC() returns the next significant code point without consuming it; -1 means end of input
int c = skipWSC();
while (c != -1) {
parseStatement();
c = skipWSC();
}
} finally {
clear();
}
if (rdfHandler != null) {
rdfHandler.endRDF();
}
}
/**
 * Parses one statement: either a directive ({@code @prefix}, {@code @base}, or the SPARQL-style
 * {@code PREFIX}/{@code BASE}) or a block of triples.
 */
protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException {
    // Look ahead far enough to recognise the longest directive keyword, "@prefix".
    final StringBuilder head = new StringBuilder(8);
    while (true) {
        final int cp = readCodePoint();
        if (cp == -1 || TurtleUtil.isWhitespace(cp)) {
            unread(cp);
            break;
        }
        appendCodepoint(head, cp);
        if (head.length() >= 8) {
            break;
        }
    }

    final String directive = head.toString();
    final boolean atDirective = directive.startsWith("@");
    final boolean isDirective = atDirective || directive.equalsIgnoreCase("prefix")
            || directive.equalsIgnoreCase("base");

    if (!isDirective) {
        // Not a directive after all: put the look-ahead back and parse ordinary triples.
        unread(directive);
        parseTriples();
        skipWSC();
        verifyCharacterOrFail(readCodePoint(), ".");
        return;
    }

    parseDirective(directive);
    skipWSC();
    // Only the '@' forms are terminated by '.'; SPARQL BASE and PREFIX lines are not.
    if (atDirective) {
        verifyCharacterOrFail(readCodePoint(), ".");
    }
}
/**
 * Dispatches on the directive keyword at the start of {@code directive}. Any characters read beyond the keyword
 * are pushed back before the directive body is parsed. Case-variant {@code @prefix}/{@code @base} are only
 * accepted when {@code CASE_INSENSITIVE_DIRECTIVES} is enabled.
 *
 * @param directive the look-ahead text that starts with (or is) the directive keyword
 */
protected void parseDirective(String directive) throws IOException, RDFParseException, RDFHandlerException {
    // Unreading an empty remainder is a no-op, so no length guard is needed before unread().
    if (directive.startsWith("@prefix")) {
        unread(directive.substring(7));
        parsePrefixID();
    } else if (directive.startsWith("@base")) {
        unread(directive.substring(5));
        parseBase();
    } else if (directive.regionMatches(true, 0, "prefix", 0, 6)) {
        // SPARQL doesn't require whitespace after the keyword, so part of the prefix ID may have been read
        unread(directive.substring(6));
        parsePrefixID();
    } else if (directive.regionMatches(true, 0, "base", 0, 4)) {
        unread(directive.substring(4));
        parseBase();
    } else if (directive.regionMatches(true, 0, "@prefix", 0, 7)) {
        if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
            reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode.");
        }
        unread(directive.substring(7));
        parsePrefixID();
    } else if (directive.regionMatches(true, 0, "@base", 0, 5)) {
        if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
            reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode.");
        }
        unread(directive.substring(5));
        parseBase();
    } else if (directive.isEmpty()) {
        reportFatalError("Directive name is missing, expected @prefix or @base");
    } else {
        reportFatalError("Unknown directive \"" + directive + "\"");
    }
}
/**
 * Parses the body of a prefix directive: a prefix label, a ':' and a namespace IRI. The mapping is stored in this
 * parser and reported to the RDF handler, if one is set.
 */
protected void parsePrefixID() throws IOException, RDFParseException, RDFHandlerException {
    skipWSC();

    // Collect the prefix label (e.g. "rdf" or the empty string) up to ':' or whitespace.
    final StringBuilder label = new StringBuilder(8);
    int cp = readCodePoint();
    while (cp != ':' && !TurtleUtil.isWhitespace(cp)) {
        if (cp == -1) {
            throwEOFException();
        }
        appendCodepoint(label, cp);
        cp = readCodePoint();
    }
    if (cp == ':') {
        // leave the ':' in the stream so it can be verified below
        unread(cp);
    }

    skipWSC();
    verifyCharacterOrFail(readCodePoint(), ":");
    skipWSC();

    final String namespaceStr = parseURI().toString();
    final String prefixStr = label.toString();

    setNamespace(prefixStr, namespaceStr);
    if (rdfHandler != null) {
        rdfHandler.handleNamespace(prefixStr, namespaceStr);
    }
}
/**
 * Parses the IRI of a base directive and installs it as the new base URI.
 */
protected void parseBase() throws IOException, RDFParseException, RDFHandlerException {
    skipWSC();
    setBaseURI(parseURI().toString());
}
/**
 * Parses a subject followed by its predicate-object list. A leading '[' is inspected to distinguish the empty
 * blank node {@code []} from a blank node with a nested property list. The subject, predicate and object fields
 * are reset to {@code null} when done.
 */
protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException {
    if (peekCodePoint() != '[') {
        parseSubject();
        skipWSC();
        parsePredicateObjectList();
    } else {
        // consume '[' and look at what follows to pick the blank node form
        readCodePoint();
        skipWSC();
        if (peekCodePoint() == ']') {
            readCodePoint();
            subject = createNode();
            skipWSC();
            parsePredicateObjectList();
        } else {
            unread('[');
            subject = parseImplicitBlank();
        }
        skipWSC();
        // unless the statement ends here, a predicate-object list for the subject parsed above follows
        if (peekCodePoint() != '.') {
            parsePredicateObjectList();
        }
    }

    subject = null;
    predicate = null;
    object = null;
}
/**
 * Parses one or more {@code predicate objectList} pairs separated by ';'. Repeated or trailing ';' are tolerated.
 */
protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException {
    predicate = parsePredicate();
    skipWSC();
    parseObjectList();

    while (skipWSC() == ';') {
        readCodePoint();
        final int next = skipWSC();
        if (next == '.' || next == ']' || next == '}') {
            // end of the triple, or end of a list nested inside a blank node / annotation
            break;
        }
        if (next != ';') {
            predicate = parsePredicate();
            skipWSC();
            parseObjectList();
        }
        // next == ';' means an empty entry: loop round and consume it
    }
}
/**
 * Parses a comma-separated list of objects, each optionally followed by an annotation block starting with '{'.
 */
protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException {
    while (true) {
        parseObject();
        if (skipWSC() == '{') {
            parseAnnotation();
        }
        if (skipWSC() != ',') {
            return;
        }
        // consume the ',' and move to the next object
        readCodePoint();
        skipWSC();
    }
}
/**
 * Parses the subject of a triple (collection, bracketed blank node or plain value) into the {@code subject}
 * field. A non-null value that is not a {@link Resource} is reported as a fatal error.
 */
protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException {
    switch (peekCodePoint()) {
    case '(':
        subject = parseCollection();
        return;
    case '[':
        subject = parseImplicitBlank();
        return;
    default:
        break;
    }

    final Value parsed = parseValue();
    if (parsed instanceof Resource) {
        subject = (Resource) parsed;
    } else if (parsed != null) {
        reportFatalError("Illegal subject value: " + parsed);
    }
}
/**
 * Parses a predicate: either the shortcut {@code a} (followed by whitespace) for {@code rdf:type}, or an IRI.
 *
 * @return the predicate IRI, or {@code null} after a fatal error has been reported for a non-IRI value
 */
protected IRI parsePredicate() throws IOException, RDFParseException, RDFHandlerException {
    final int first = readCodePoint();
    if (first == 'a') {
        final int second = readCodePoint();
        if (TurtleUtil.isWhitespace(second)) {
            return RDF.TYPE;
        }
        // not the shortcut: restore the look-ahead
        unread(second);
    }
    unread(first);

    final Value parsed = parseValue();
    if (!(parsed instanceof IRI)) {
        reportFatalError("Illegal predicate value: " + parsed);
        return null;
    }
    return (IRI) parsed;
}
/**
 * Parses one object into the {@code object} field. Plain values are reported here as a statement with the
 * current subject and predicate; for collections and bracketed blank nodes no statement is reported in this
 * method.
 *
 * @throws IOException
 * @throws RDFParseException
 * @throws RDFHandlerException
 */
protected void parseObject() throws IOException, RDFParseException, RDFHandlerException {
    final int next = peekCodePoint();
    if (next == '(') {
        object = parseCollection();
    } else if (next == '[') {
        object = parseImplicitBlank();
    } else {
        object = parseValue();
        reportStatement(subject, predicate, object);
    }
}
/**
 * Parses a collection, e.g. ( item1 item2 item3 ), emitting the rdf:first / rdf:rest statements for it.
 *
 * @return {@code rdf:nil} for an empty collection, otherwise the node heading the list
 */
protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException {
    verifyCharacterOrFail(readCodePoint(), "(");

    if (skipWSC() == ')') {
        // empty list
        readCodePoint();
        if (subject != null) {
            reportStatement(subject, predicate, RDF.NIL);
        }
        return RDF.NIL;
    }

    final Resource head = createNode();
    if (subject != null) {
        reportStatement(subject, predicate, head);
    }

    // Save the enclosing subject/predicate; list nodes take over as subject with rdf:first as predicate.
    final Resource outerSubject = subject;
    final IRI outerPredicate = predicate;
    subject = head;
    predicate = RDF.FIRST;
    parseObject();

    Resource tail = head;
    while (skipWSC() != ')') {
        // link a fresh list node behind the current tail and make it the current subject
        final Resource next = createNode();
        reportStatement(tail, RDF.REST, next);
        tail = next;
        subject = next;
        parseObject();
    }
    // consume ')'
    readCodePoint();

    // terminate the list
    reportStatement(tail, RDF.REST, RDF.NIL);

    subject = outerSubject;
    predicate = outerPredicate;
    return head;
}
/**
 * Parses a bracketed blank node: either the bare token [] or a predicate-object list enclosed in square
 * brackets. When a subject is currently set, a statement linking it to the new node is reported first.
 *
 * @return the newly created blank node
 */
protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException {
    verifyCharacterOrFail(readCodePoint(), "[");

    final Resource node = createNode();
    if (subject != null) {
        reportStatement(subject, predicate, node);
    }

    skipWSC();
    final int next = readCodePoint();
    if (next == ']') {
        return node;
    }
    unread(next);

    // Save the enclosing subject/predicate while the nested list is parsed with the new node as subject.
    final Resource outerSubject = subject;
    final IRI outerPredicate = predicate;
    subject = node;

    skipWSC();
    parsePredicateObjectList();
    skipWSC();
    verifyCharacterOrFail(readCodePoint(), "]");

    subject = outerSubject;
    predicate = outerPredicate;
    return node;
}
/**
 * Parses an RDF value, choosing the concrete parse routine from the next code point: IRI reference, prefixed
 * name or boolean, blank node label, quoted literal, or number. With {@code ACCEPT_TURTLESTAR} enabled, a
 * leading {@code <<} is parsed as an RDF-star triple.
 *
 * @return the parsed value; {@code null} may be returned by the delegated routine or after a reported error
 */
protected Value parseValue() throws IOException, RDFParseException, RDFHandlerException {
    if (getParserConfig().get(TurtleParserSettings.ACCEPT_TURTLESTAR) && peekIsTripleValue()) {
        return parseTripleValue();
    }

    final int next = peekCodePoint();
    if (next == '<') {
        return parseURI();
    }
    if (next == ':' || TurtleUtil.isPrefixStartChar(next)) {
        return parseQNameOrBoolean();
    }
    if (next == '_') {
        // e.g. _:n1
        return parseNodeID();
    }
    if (next == '"' || next == '\'') {
        // e.g. "foo" or """foo""" or 'foo' or '''foo'''
        return parseQuotedLiteral();
    }
    if (ASCIIUtil.isNumber(next) || next == '.' || next == '+' || next == '-') {
        // e.g. 123 or 1.2e3
        return parseNumber();
    }
    if (next == -1) {
        throwEOFException();
        return null;
    }
    reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(next)) + "'");
    return null;
}
/**
 * Parses a quoted string, optionally followed by a language tag ({@code @lang}) or a datatype
 * ({@code ^^value}).
 *
 * @return the literal, or {@code null} when the datatype could not be parsed (parseValue() returned null)
 */
protected Literal parseQuotedLiteral() throws IOException, RDFParseException, RDFHandlerException {
String label = parseQuotedString();
// Check for presence of a language tag or datatype
int c = peekCodePoint();
if (c == '@') {
readCodePoint();
// Read language
StringBuilder lang = getBuilder();
c = readCodePoint();
if (c == -1) {
throwEOFException();
}
boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS);
if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) {
reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
BasicParserSettings.VERIFY_LANGUAGE_TAGS);
}
appendCodepoint(lang, c);
c = readCodePoint();
while (!TurtleUtil.isWhitespace(c)) {
// SES-1887 : Flexibility introduced for SES-1985 and SES-1821 needs to be counterbalanced
// against legitimate situations where Turtle language tags do not need whitespace following
// the language tag: the tag also ends at any of the delimiters below, or at end of input.
if (c == '.' || c == ';' || c == ',' || c == ')' || c == ']' || c == '>' || c == -1) {
break;
}
if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) {
reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'",
BasicParserSettings.VERIFY_LANGUAGE_TAGS);
}
appendCodepoint(lang, c);
c = readCodePoint();
}
// The terminating character is not part of the tag; push it back
unread(c);
return createLiteral(label, lang.toString(), ((IRI) null), getLineNumber(), -1);
} else if (c == '^') {
readCodePoint();
// next character should be another '^'
verifyCharacterOrFail(readCodePoint(), "^");
skipWSC();
// Read datatype
Value datatype = parseValue();
if (datatype == null) {
// the datatype IRI could not be parsed. report as error only if VERIFY_URI_SYNTAX is enabled, silently
// skip otherwise.
reportError("Invalid datatype IRI for literal '" + label + "'", BasicParserSettings.VERIFY_URI_SYNTAX);
return null;
} else if (!(datatype instanceof IRI)) {
reportFatalError("Illegal datatype value: " + datatype);
}
return createLiteral(label, null, (IRI) datatype, getLineNumber(), -1);
} else {
// Plain literal without language tag or datatype
return createLiteral(label, null, ((IRI) null), getLineNumber(), -1);
}
}
/**
 * Parses a quoted string, which is either a "normal string" or a """long string""" (single-quoted forms are
 * accepted as well), and decodes its escape sequences.
 *
 * @return the decoded string, or the undecoded text if decoding failed and the error was not fatal
 * @throws IOException
 * @throws RDFParseException
 */
protected String parseQuotedString() throws IOException, RDFParseException {
    final int open = readCodePoint();
    // The opening character must be '"' or "'"
    verifyCharacterOrFail(open, "\"\'");

    // Three identical quote characters in a row start a long string
    final int second = readCodePoint();
    final int third = readCodePoint();
    final boolean isQuote = open == '"' || open == '\'';
    final boolean longString = isQuote && second == open && third == open;

    final String raw;
    if (longString) {
        raw = parseLongString(second);
    } else {
        unread(third);
        unread(second);
        raw = parseString(open);
    }

    try {
        return TurtleUtil.decodeString(raw);
    } catch (IllegalArgumentException e) {
        reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
        return raw;
    }
}
/**
 * Parses a "normal string". This method requires that the opening character has already been parsed. Escape
 * sequences are kept verbatim (backslash plus following character); a raw carriage return or line feed is a
 * fatal error.
 *
 * @param closingCharacter the quote character that terminates the string
 * @return the string content without the closing quote, escapes not yet decoded
 * @throws IOException
 * @throws RDFParseException
 */
protected String parseString(int closingCharacter) throws IOException, RDFParseException {
    StringBuilder sb = getBuilder();
    while (true) {
        int c = readCodePoint();
        if (c == closingCharacter) {
            break;
        } else if (c == -1) {
            throwEOFException();
        }
        if (c == '\r' || c == '\n') {
            reportFatalError("Illegal carriage return or new line in literal");
        }
        appendCodepoint(sb, c);
        if (c == '\\') {
            // This escapes the next character, which might be a '"'
            c = readCodePoint();
            if (c == -1) {
                throwEOFException();
            }
            appendCodepoint(sb, c);
        }
    }
    return sb.toString();
}
/**
 * Parses a """long string""". This method requires that the first three characters have already been parsed.
 * Reading stops after three consecutive closing characters, which are stripped from the result.
 *
 * @param closingCharacter the quote character whose triple occurrence ends the string
 * @return the string content, escapes not yet decoded
 */
protected String parseLongString(int closingCharacter) throws IOException, RDFParseException {
    final StringBuilder buf = getBuilder();
    int closingRun = 0;

    while (closingRun < 3) {
        final int cp = readCodePoint();
        if (cp == -1) {
            throwEOFException();
        } else if (cp == closingCharacter) {
            closingRun++;
        } else {
            closingRun = 0;
        }
        appendCodepoint(buf, cp);

        switch (cp) {
        case '\n':
            lineNumber++;
            reportLocation();
            break;
        case '\\':
            // the backslash escapes the next character, which might be a quote
            final int escaped = readCodePoint();
            if (escaped == -1) {
                throwEOFException();
            }
            appendCodepoint(buf, escaped);
            break;
        default:
            break;
        }
    }

    // drop the three closing quote characters
    return buf.substring(0, buf.length() - 3);
}
/**
 * Parses a numeric literal. The datatype starts as xsd:integer, becomes xsd:decimal when a fractional part is
 * read, and xsd:double when an exponent is read. The first character that is not part of the number is pushed
 * back.
 *
 * @return a typed literal carrying the lexical form exactly as read
 */
protected Literal parseNumber() throws IOException, RDFParseException {
StringBuilder value = getBuilder();
IRI datatype = XSD.INTEGER;
int c = readCodePoint();
// read optional sign character
if (c == '+' || c == '-') {
appendCodepoint(value, c);
c = readCodePoint();
}
// read the integer digits
while (ASCIIUtil.isNumber(c)) {
appendCodepoint(value, c);
c = readCodePoint();
}
if (c == '.' || c == 'e' || c == 'E') {
// read optional fractional digits
if (c == '.') {
if (TurtleUtil.isWhitespace(peekCodePoint())) {
// We're parsing an integer that did not have a space before
// the period to end the statement; the '.' is pushed back at the end of this method
} else {
appendCodepoint(value, c);
c = readCodePoint();
while (ASCIIUtil.isNumber(c)) {
appendCodepoint(value, c);
c = readCodePoint();
}
if (value.length() == 1) {
// We've only parsed a '.'
reportFatalError("Object for statement missing");
}
// We're parsing a decimal or a double
datatype = XSD.DECIMAL;
}
} else {
if (value.length() == 0) {
// We've only parsed an 'e' or 'E'
reportFatalError("Object for statement missing");
}
}
// read optional exponent
if (c == 'e' || c == 'E') {
datatype = XSD.DOUBLE;
appendCodepoint(value, c);
c = readCodePoint();
if (c == '+' || c == '-') {
appendCodepoint(value, c);
c = readCodePoint();
}
if (!ASCIIUtil.isNumber(c)) {
reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES);
}
appendCodepoint(value, c);
c = readCodePoint();
while (ASCIIUtil.isNumber(c)) {
appendCodepoint(value, c);
c = readCodePoint();
}
}
}
// Unread last character, it isn't part of the number
unread(c);
// Return result as a typed literal
return createLiteral(value.toString(), null, datatype, getLineNumber(), -1);
}
/**
 * Parses an IRI reference enclosed in angle brackets, decodes its escape sequences and resolves it.
 *
 * @return the resolved IRI, or {@code null} when the IRI was found to be illegal and
 *         {@code VERIFY_URI_SYNTAX} is enabled
 * @throws IOException
 * @throws RDFParseException
 */
protected IRI parseURI() throws IOException, RDFParseException {
    StringBuilder uriBuf = getBuilder();
    // First character should be '<'
    int c = readCodePoint();
    verifyCharacterOrFail(c, "<");
    boolean uriIsIllegal = false;
    // Read up to the next '>' character
    while (true) {
        c = readCodePoint();
        if (c == '>') {
            break;
        } else if (c == -1) {
            throwEOFException();
        }
        if (c == ' ') {
            // Render the character itself; concatenating the int would print its numeric value.
            reportError("IRI included an unencoded space: '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_URI_SYNTAX);
            uriIsIllegal = true;
        }
        appendCodepoint(uriBuf, c);
        if (c == '\\') {
            // This escapes the next character, which might be a '>'
            c = readCodePoint();
            if (c == -1) {
                throwEOFException();
            }
            if (c != 'u' && c != 'U') {
                reportError("IRI includes string escapes: '\\" + new String(Character.toChars(c)) + "'",
                        BasicParserSettings.VERIFY_URI_SYNTAX);
                uriIsIllegal = true;
            }
            appendCodepoint(uriBuf, c);
        }
    }
    // NOTE(review): the loop above is only left when c == '>', so this check cannot fire as written.
    // It is kept unchanged; confirm whether a trailing-'.' check on the IRI text was intended.
    if (c == '.') {
        reportError("IRI must not end in a '.'", BasicParserSettings.VERIFY_URI_SYNTAX);
        uriIsIllegal = true;
    }
    // do not report back the actual URI if it's illegal and the parser is
    // configured to verify URI syntax.
    if (!(uriIsIllegal && getParserConfig().get(BasicParserSettings.VERIFY_URI_SYNTAX))) {
        String uri = uriBuf.toString();
        // Unescape any escape sequences
        try {
            // FIXME: The following decodes \n and similar in URIs, which should be invalid according to test
            uri = TurtleUtil.decodeString(uri);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
        }
        return super.resolveURI(uri);
    }
    return null;
}
/**
 * Parses qnames and boolean values, which have equivalent starting characters.
 *
 * @return an xsd:boolean literal for the bare words {@code true} / {@code false}, otherwise the IRI formed from
 *         the namespace bound to the prefix and the local name
 * @throws IOException
 * @throws RDFParseException
 */
protected Value parseQNameOrBoolean() throws IOException, RDFParseException {
    // First character should be a ':' or a letter
    int c = readCodePoint();
    if (c == -1) {
        throwEOFException();
    }
    if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) {
        reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'",
                BasicParserSettings.VERIFY_RELATIVE_URIS);
    }
    String namespace;
    if (c == ':') {
        // qname using default namespace
        namespace = getNamespace("");
    } else {
        // c is the first letter of the prefix
        StringBuilder prefix = new StringBuilder(8);
        appendCodepoint(prefix, c);
        int previousChar = c;
        c = readCodePoint();
        while (TurtleUtil.isPrefixChar(c)) {
            appendCodepoint(prefix, c);
            previousChar = c;
            c = readCodePoint();
        }
        while (previousChar == '.' && prefix.length() > 0) {
            // '.' is a legal prefix name char, but can not appear at the end
            unread(c);
            c = previousChar;
            prefix.setLength(prefix.length() - 1);
            // Look at the last code point by char index (codePointBefore), not by code point count, and
            // stop cleanly when nothing is left.
            previousChar = prefix.length() > 0 ? prefix.codePointBefore(prefix.length()) : -1;
        }
        if (c != ':') {
            // prefix may actually be a boolean value
            String value = prefix.toString();
            if (value.equals("true")) {
                unread(c);
                return createLiteral("true", null, CoreDatatype.XSD.BOOLEAN, getLineNumber(), -1);
            } else if (value.equals("false")) {
                unread(c);
                return createLiteral("false", null, CoreDatatype.XSD.BOOLEAN, getLineNumber(), -1);
            }
        }
        verifyCharacterOrFail(c, ":");
        namespace = getNamespace(prefix.toString());
    }
    // c == ':', read optional local name
    StringBuilder localName = new StringBuilder(16);
    c = readCodePoint();
    if (TurtleUtil.isNameStartChar(c)) {
        if (c == '\\') {
            localName.append(readLocalEscapedChar());
        } else {
            appendCodepoint(localName, c);
        }
        int previousChar = c;
        c = readCodePoint();
        while (TurtleUtil.isNameChar(c)) {
            if (c == '\\') {
                localName.append(readLocalEscapedChar());
            } else {
                appendCodepoint(localName, c);
            }
            previousChar = c;
            c = readCodePoint();
        }
        // Unread last character
        unread(c);
        if (previousChar == '.') {
            // '.' is a legal name char, but can not appear at the end, so is not actually part of the name
            unread(previousChar);
            localName.deleteCharAt(localName.length() - 1);
        }
    } else {
        // Unread last character
        unread(c);
    }
    // Every '%' in the local name must be followed by two hex digits
    String localNameString = localName.toString();
    for (int i = 0; i < localNameString.length(); i++) {
        if (localNameString.charAt(i) == '%') {
            if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1))
                    || !ASCIIUtil.isHex(localNameString.charAt(i + 2))) {
                reportFatalError("Found incomplete percent-encoded sequence: " + localNameString);
            }
        }
    }
    // Note: namespace has already been resolved
    return createURI(namespace + localNameString);
}
/**
 * Reads the character following a backslash in a local name and checks that it is one of the characters that
 * may be escaped there.
 *
 * @return the escaped character
 * @throws RDFParseException if the end of input is reached or the character may not be escaped
 * @throws IOException
 */
private char readLocalEscapedChar() throws RDFParseException, IOException {
    int c = readCodePoint();
    if (c == -1) {
        // Without this check Character.toChars(-1) below would fail with an IllegalArgumentException
        throwEOFException();
    }
    if (TurtleUtil.isLocalEscapedChar(c)) {
        return (char) c;
    } else {
        throw new RDFParseException("found '" + new String(Character.toChars(c)) + "', expected one of: "
                + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS));
    }
}
/**
 * Parses a blank node ID, e.g. _:node1. A '.' that is followed by end of input, whitespace, '&lt;' or '_' is
 * treated as the statement terminator rather than as part of the label.
 *
 * @return the blank node for the parsed label
 * @throws IOException
 * @throws RDFParseException
 */
protected Resource parseNodeID() throws IOException, RDFParseException {
    // Node ID should start with "_:"
    verifyCharacterOrFail(readCodePoint(), "_");
    verifyCharacterOrFail(readCodePoint(), ":");
    // Read the node ID
    int c = readCodePoint();
    if (c == -1) {
        throwEOFException();
    } else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) {
        // Render the full code point; a (char) cast would truncate supplementary characters.
        reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                BasicParserSettings.PRESERVE_BNODE_IDS);
    }
    StringBuilder name = getBuilder();
    appendCodepoint(name, c);
    // Read all following letter and numbers, they are part of the name
    c = readCodePoint();
    // If we would never go into the loop we must unread now
    if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
        unread(c);
    }
    while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
        int previous = c;
        c = readCodePoint();
        if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) {
            // the '.' ends the statement: push back both the look-ahead and the '.'
            unread(c);
            unread(previous);
            break;
        }
        appendCodepoint(name, previous);
        if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            unread(c);
        }
    }
    return createNode(name.toString());
}
/**
 * Creates a statement from the given parts, remembers it as the previous statement and passes it to the RDF
 * handler, if one is set. Nothing happens when any part is {@code null}.
 */
protected void reportStatement(Resource subj, IRI pred, Value obj) throws RDFParseException, RDFHandlerException {
    if (subj == null || pred == null || obj == null) {
        return;
    }
    previousStatement = createStatement(subj, pred, obj);
    if (rdfHandler != null) {
        rdfHandler.handleStatement(previousStatement);
    }
}
/**
 * Checks that {@code codePoint} occurs in {@code expected}. End of input (-1) is reported via
 * {@link #throwEOFException()}; any other mismatch is reported as a fatal error that lists the expected
 * characters and the one actually found.
 *
 * @param codePoint the code point that was read
 * @param expected  the acceptable characters
 */
protected void verifyCharacterOrFail(int codePoint, String expected) throws RDFParseException {
    if (codePoint == -1) {
        throwEOFException();
    }
    final String found = new String(Character.toChars(codePoint));
    if (expected.contains(found)) {
        return;
    }

    final StringBuilder msg = new StringBuilder(32).append("Expected ");
    for (int i = 0; i < expected.length(); i++) {
        msg.append(i == 0 ? "'" : " or '").append(expected.charAt(i)).append('\'');
    }
    msg.append(", found '").append(found).append("'");
    reportFatalError(msg.toString());
}
/**
 * Skips white space and #-style comments. On return the reader is positioned at the first code point that is
 * neither, and that code point (or -1 at end of input) is also returned without being consumed.
 *
 * @return the next code point that will be returned by the reader, or -1
 */
protected int skipWSC() throws IOException, RDFHandlerException {
    int cp;
    for (cp = readCodePoint(); TurtleUtil.isWhitespace(cp) || cp == '#'; cp = readCodePoint()) {
        if (cp == '#') {
            processComment();
        } else if (cp == '\n') {
            // only LF is counted here, not CR, as a CR is normally followed directly by a LF
            lineNumber++;
            reportLocation();
        }
    }
    unread(cp);
    return cp;
}
/**
 * Reads the rest of a comment line (up to CR, LF or end of input), consumes the line terminator, and passes the
 * comment text to the RDF handler, if one is set.
 */
protected void processComment() throws IOException, RDFHandlerException {
    final StringBuilder text = getBuilder();
    int cp = readCodePoint();
    while (cp != -1 && cp != '\r' && cp != '\n') {
        appendCodepoint(text, cp);
        cp = readCodePoint();
    }

    // cp is now -1, CR or LF
    if (cp == '\n') {
        lineNumber++;
    } else if (cp == '\r') {
        // a CR may be followed by a LF that belongs to the same line break
        cp = readCodePoint();
        lineNumber++;
        if (cp != '\n') {
            unread(cp);
        }
    }

    if (rdfHandler != null) {
        rdfHandler.handleComment(text.toString());
    }
    reportLocation();
}
/**
 * Reads the next Unicode code point. A high surrogate is combined with the following char only when that char
 * is a low surrogate; an unpaired high surrogate is returned on its own and the char read after it (if any) is
 * pushed back.
 *
 * @return the next Unicode code point, or -1 if the end of the stream has been reached.
 * @throws IOException
 */
protected int readCodePoint() throws IOException {
    final int next = reader.read();
    if (next == -1 || !Character.isHighSurrogate((char) next)) {
        return next;
    }
    final int low = reader.read();
    if (low != -1 && Character.isLowSurrogate((char) low)) {
        return Character.toCodePoint((char) next, (char) low);
    }
    // Unpaired high surrogate: do not merge it with end-of-input or an unrelated char.
    if (low != -1) {
        reader.unread(low);
    }
    return next;
}
/**
 * Pushes a single code point back onto the reader so that the next {@link #readCodePoint()} returns it again.
 * The end-of-input marker -1 is ignored.
 *
 * @param codePoint a single Unicode code point.
 * @throws IOException
 */
protected void unread(int codePoint) throws IOException {
    if (codePoint == -1) {
        return;
    }
    if (Character.isSupplementaryCodePoint(codePoint)) {
        // supplementary code points occupy two chars
        reader.unread(Character.toChars(codePoint));
    } else {
        reader.unread(codePoint);
    }
}
/**
 * Pushes the supplied string back onto the reader. Afterwards, successive calls to {@link #readCodePoint()}
 * return the code points of the string again, starting with its first one.
 *
 * @param string the string to un-read.
 * @throws IOException
 */
protected void unread(String string) throws IOException {
    // Walk the string back to front, one code point at a time.
    for (int end = string.length(); end > 0;) {
        final int cp = string.codePointBefore(end);
        if (Character.isSupplementaryCodePoint(cp)) {
            final char[] pair = Character.toChars(cp);
            reader.unread(pair);
            end -= pair.length;
        } else {
            reader.unread(cp);
            end--;
        }
    }
}
/**
 * Returns the next Unicode code point without consuming it: the code point is read and pushed straight back.
 *
 * @return the next Unicode code point, or -1 if the end of the stream has been reached.
 * @throws IOException
 */
protected int peekCodePoint() throws IOException {
    final int upcoming = readCodePoint();
    unread(upcoming);
    return upcoming;
}
/**
 * Reports the current line number as the parse location; the column is always passed as -1.
 */
protected void reportLocation() {
reportLocation(getLineNumber(), -1);
}
/**
* Overrides {@link AbstractRDFParser#reportWarning(String)}, adding line number information to the warning.
* The column is always passed as -1.
*/
@Override
protected void reportWarning(String msg) {
reportWarning(msg, getLineNumber(), -1);
}
/**
 * Overrides {@link AbstractRDFParser#reportError(String, RioSetting)}, adding line number information to the
 * error. The column is always passed as -1.
 *
 * @param msg     the error message
 * @param setting the boolean setting that is passed on together with the error
 */
@Override
protected void reportError(String msg, RioSetting<Boolean> setting) throws RDFParseException {
    reportError(msg, getLineNumber(), -1, setting);
}
/**
* Overrides {@link AbstractRDFParser#reportFatalError(String)}, adding line number information to the error.
* The column is always passed as -1.
*/
@Override
protected void reportFatalError(String msg) throws RDFParseException {
reportFatalError(msg, getLineNumber(), -1);
}
/**
* Overrides {@link AbstractRDFParser#reportFatalError(Exception)}, adding line number information to the error.
* The column is always passed as -1.
*/
@Override
protected void reportFatalError(Exception e) throws RDFParseException {
reportFatalError(e, getLineNumber(), -1);
}
/**
 * Signals that the input ended in the middle of a construct.
 *
 * @throws RDFParseException always, with the message "Unexpected end of file" (no line number is attached here)
 */
protected void throwEOFException() throws RDFParseException {
throw new RDFParseException("Unexpected end of file");
}
/**
 * @return the current value of the line counter
 */
protected int getLineNumber() {
return lineNumber;
}
/**
 * Empties and returns the shared scratch builder. Every call returns the same instance, so the content from an
 * earlier call is gone once this method is called again.
 */
private StringBuilder getBuilder() {
parsingBuilder.setLength(0);
return parsingBuilder;
}
/**
 * Appends the char(s) of a code point to a string builder. Equivalent to appending the result of
 * Character#toChars, but without allocating a char array for BMP code points.
 *
 * @param dst       the destination in which to append the characters
 * @param codePoint the codepoint to be appended
 * @throws IllegalArgumentException if {@code codePoint} is not a valid Unicode code point
 */
private static void appendCodepoint(StringBuilder dst, int codePoint) {
    if (!Character.isValidCodePoint(codePoint)) {
        throw new IllegalArgumentException("Invalid codepoint " + codePoint);
    }
    if (Character.isBmpCodePoint(codePoint)) {
        dst.append((char) codePoint);
        return;
    }
    // supplementary code point: append its surrogate pair
    dst.append(Character.highSurrogate(codePoint));
    dst.append(Character.lowSurrogate(codePoint));
}
/**
 * Looks at the next two code points without consuming them and tells whether they are '<<', the opening of an
 * RDF-star triple value.
 *
 * @return true if the next two code points are both '<', false otherwise
 * @throws IOException
 */
protected boolean peekIsTripleValue() throws IOException {
    final int first = readCodePoint();
    final int second = readCodePoint();
    // push back in reverse order so the reader is left unchanged
    unread(second);
    unread(first);
    final boolean opensTriple = first == '<' && second == '<';
    return opensTriple;
}
/**
 * Parses an RDF-star triple value of the form {@code << subject predicate object >>} and returns it.
 *
 * @return the RDF-star triple, or {@code null} after a fatal error has been reported for one of its parts
 * @throws IOException
 */
protected Triple parseTripleValue() throws IOException {
    verifyCharacterOrFail(readCodePoint(), "<");
    verifyCharacterOrFail(readCodePoint(), "<");
    skipWSC();

    final Value subj = parseValue();
    if (!(subj instanceof Resource)) {
        reportFatalError("Illegal subject val in RDF-star triple: " + subj);
        return null;
    }
    skipWSC();

    final Value pred = parseValue();
    if (!(pred instanceof IRI)) {
        reportFatalError("Illegal predicate value in RDF-star triple: " + pred);
        return null;
    }
    skipWSC();

    final Value obj = parseValue();
    if (obj == null) {
        reportFatalError("Missing object in RDF-star triple");
        return null;
    }
    skipWSC();

    verifyCharacterOrFail(readCodePoint(), ">");
    verifyCharacterOrFail(readCodePoint(), ">");
    return valueFactory.createTriple((Resource) subj, (IRI) pred, obj);
}
/**
 * Parses an annotation block {@code {| predicateObjectList |}}. While the block is parsed, the triple built from
 * the most recently reported statement serves as subject; the enclosing subject and predicate are restored
 * afterwards.
 */
protected void parseAnnotation() throws IOException {
    verifyCharacterOrFail(readCodePoint(), "{");
    verifyCharacterOrFail(readCodePoint(), "|");
    skipWSC();

    // remember the enclosing context so it can be put back once the annotation content is done
    final Resource outerSubject = subject;
    final IRI outerPredicate = predicate;

    subject = Values.triple(previousStatement);
    parsePredicateObjectList();

    verifyCharacterOrFail(readCodePoint(), "|");
    verifyCharacterOrFail(readCodePoint(), "}");

    subject = outerSubject;
    predicate = outerPredicate;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy