// org.openrdf.rio.turtle.TurtleParser Maven / Gradle / Ivy
/*
* Licensed to Aduna under one or more contributor license agreements.
* See the NOTICE.txt file distributed with this work for additional
* information regarding copyright ownership.
*
* Aduna licenses this file to you under the terms of the Aduna BSD
* License (the "License"); you may not use this file except in compliance
* with the License. See the LICENSE.txt file distributed with this work
* for the full License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package org.openrdf.rio.turtle;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.input.BOMInputStream;
import info.aduna.text.ASCIIUtil;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.BasicParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;
import org.openrdf.rio.helpers.TurtleParserSettings;
/**
* RDF parser for Turtle
* files. This parser is not thread-safe, therefore its public methods are
* synchronized.
*
* This implementation is based on the 2006/01/02 version of the Turtle
* specification, with slight deviations:
*
* - Normalization of integer, floating point and boolean values is dependent
* on the specified datatype handling. According to the specification, integers
* and booleans should be normalized, but floats should not.
* - Comments can be used anywhere in the document, and extend to the end of
* the line. The Turtle grammar doesn't allow comments to be used inside triple
* constructs that extend over multiple lines, but the author's own parser
* deviates from this too.
* - The local name part of a prefixed name is allowed to start with a number
* (cf. the W3C Turtle Working
* Draft).
*
*
* @author Arjohn Kampman
*/
public class TurtleParser extends RDFParserBase {
/*-----------*
* Variables *
*-----------*/
// Pushback-capable reader over the document being parsed; assigned in
// parse(Reader, String) and used by readCodePoint() and the unread() methods.
private PushbackReader reader;
// Subject of the statement currently being parsed; reset to null at the end
// of parseTriples().
protected Resource subject;
// Predicate of the statement currently being parsed.
protected URI predicate;
// Object of the statement currently being parsed.
protected Value object;
// Current line number; reset to 1 at the start of parse(Reader, String) and
// incremented for every line feed consumed by skipWSC().
private int lineNumber = 1;
/*--------------*
* Constructors *
*--------------*/
/**
* Creates a new TurtleParser that will use a {@link ValueFactoryImpl} to
* create RDF model objects.
*/
public TurtleParser() {
// Delegates to the no-argument constructor of RDFParserBase.
super();
}
/**
* Creates a new TurtleParser that will use the supplied ValueFactory to
* create RDF model objects.
*
* @param valueFactory
* A ValueFactory.
*/
public TurtleParser(ValueFactory valueFactory) {
// Hands the factory straight to the RDFParserBase constructor.
super(valueFactory);
}
/*---------*
* Methods *
*---------*/
/**
* Identifies the RDF format handled by this parser.
*
* @return always {@link RDFFormat#TURTLE}.
*/
public RDFFormat getRDFFormat() {
return RDFFormat.TURTLE;
}
@Override
public Collection> getSupportedSettings() {
Set> result = new HashSet>(super.getSupportedSettings());
result.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES);
return result;
}
/**
 * Parses Turtle data from a byte stream. The stream is wrapped in a
 * {@link BOMInputStream} (created with <tt>include = false</tt>), decoded as
 * UTF-8 and handed to {@link #parse(Reader, String)}.
 *
 * @param in
 *        The InputStream from which to read the data, must not be
 *        <tt>null</tt>. It is expected to contain UTF-8 encoded Unicode
 *        characters, as per the Turtle specification.
 * @param baseURI
 *        The URI associated with the data in the InputStream, must not be
 *        <tt>null</tt>.
 * @throws IOException
 *         If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException
 *         If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException
 *         If the configured statement handler encountered an unrecoverable
 *         error.
 * @throws IllegalArgumentException
 *         If the supplied input stream or base URI is <tt>null</tt>.
 */
public synchronized void parse(InputStream in, String baseURI)
    throws IOException, RDFParseException, RDFHandlerException
{
    if (in == null) {
        throw new IllegalArgumentException("Input stream must not be 'null'");
    }

    // The base URI is validated by parse(Reader, String).
    try {
        final InputStream bomFiltered = new BOMInputStream(in, false);
        final Reader utf8Reader = new InputStreamReader(bomFiltered, "UTF-8");
        parse(utf8Reader, baseURI);
    }
    catch (UnsupportedEncodingException unsupported) {
        // UTF-8 is expected to be available on every platform.
        throw new RuntimeException(unsupported);
    }
}
/**
 * Implementation of the <tt>parse(Reader, String)</tt> method defined in the
 * RDFParser interface. Notifies the configured handler of the start of the
 * document, parses statements until the end of the input is reached, and
 * finally notifies the handler of the end of the document. Parser state is
 * cleared via <tt>clear()</tt> whether or not parsing succeeded.
 *
 * @param reader
 *        The Reader from which to read the data, must not be <tt>null</tt>.
 * @param baseURI
 *        The URI associated with the data in the Reader, must not be
 *        <tt>null</tt>.
 * @throws IOException
 *         If an I/O error occurred while data was read from the Reader.
 * @throws RDFParseException
 *         If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException
 *         If the configured statement handler encountered an unrecoverable
 *         error.
 * @throws IllegalArgumentException
 *         If the supplied reader or base URI is <tt>null</tt>.
 */
public synchronized void parse(Reader reader, String baseURI)
    throws IOException, RDFParseException, RDFHandlerException
{
    if (reader == null) {
        throw new IllegalArgumentException("Reader must not be 'null'");
    }
    if (baseURI == null) {
        throw new IllegalArgumentException("base URI must not be 'null'");
    }

    if (rdfHandler != null) {
        rdfHandler.startRDF();
    }

    // Start counting lines at 1:
    lineNumber = 1;

    // parseStatement() reads ahead until its buffer holds at least 8 UTF-16
    // chars and may push all of it back again. When the last code point read
    // is a surrogate pair the look-ahead is 9 chars long, which overflowed
    // the former 8-char pushback buffer. Allow 10 chars to leave headroom.
    this.reader = new PushbackReader(reader, 10);

    // Store normalized base URI
    setBaseURI(baseURI);

    reportLocation();

    try {
        int c = skipWSC();

        while (c != -1) {
            parseStatement();
            c = skipWSC();
        }
    }
    finally {
        clear();
    }

    if (rdfHandler != null) {
        rdfHandler.endRDF();
    }
}
/**
 * Parses a single statement: either a directive (<tt>@prefix</tt>,
 * <tt>@base</tt>, or the SPARQL-style <tt>PREFIX</tt> / <tt>BASE</tt>) or a
 * triples construct terminated by a '.'.
 */
protected void parseStatement()
    throws IOException, RDFParseException, RDFHandlerException
{
    // Look ahead far enough to recognise the longest directive, "@prefix".
    final StringBuilder lookAhead = new StringBuilder(8);
    while (true) {
        final int cp = readCodePoint();
        if (cp == -1 || TurtleUtil.isWhitespace(cp)) {
            unread(cp);
            break;
        }
        lookAhead.append(Character.toChars(cp));
        if (lookAhead.length() >= 8) {
            break;
        }
    }

    final String directive = lookAhead.toString();
    final boolean atStyle = directive.startsWith("@");
    final boolean isDirective = atStyle || directive.equalsIgnoreCase("prefix")
            || directive.equalsIgnoreCase("base");

    if (!isDirective) {
        // Not a directive: give the look-ahead back and parse triples.
        unread(directive);
        parseTriples();
        skipWSC();
        verifyCharacterOrFail(readCodePoint(), ".");
        return;
    }

    parseDirective(directive);
    skipWSC();
    // SPARQL BASE and PREFIX lines do not end in .
    if (atStyle) {
        verifyCharacterOrFail(readCodePoint(), ".");
    }
}
/**
 * Dispatches on the look-ahead text collected by parseStatement(). Exact
 * <tt>@prefix</tt> / <tt>@base</tt> are tried first, then the SPARQL-style
 * keywords in any case, then <tt>@prefix</tt> / <tt>@base</tt> in any case
 * (which additionally requires
 * {@link TurtleParserSettings#CASE_INSENSITIVE_DIRECTIVES} to be enabled).
 * Whatever follows the keyword inside <tt>directive</tt> is pushed back onto
 * the reader before the directive body is parsed.
 *
 * @param directive
 *        the characters read so far for this statement.
 */
protected void parseDirective(String directive)
    throws IOException, RDFParseException, RDFHandlerException
{
    if (directive.startsWith("@prefix")) {
        unreadDirectiveTail(directive, 7);
        parsePrefixID();
        return;
    }
    if (directive.startsWith("@base")) {
        unreadDirectiveTail(directive, 5);
        parseBase();
        return;
    }
    if (directive.regionMatches(true, 0, "prefix", 0, 6)) {
        // SPARQL doesn't require whitespace after the directive, so part of
        // the prefix ID may already have been read.
        unreadDirectiveTail(directive, 6);
        parsePrefixID();
        return;
    }
    if (directive.regionMatches(true, 0, "base", 0, 4)) {
        unreadDirectiveTail(directive, 4);
        parseBase();
        return;
    }
    if (directive.regionMatches(true, 0, "@prefix", 0, 7)) {
        if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
            reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode.");
        }
        unreadDirectiveTail(directive, 7);
        parsePrefixID();
        return;
    }
    if (directive.regionMatches(true, 0, "@base", 0, 5)) {
        if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
            reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode.");
        }
        unreadDirectiveTail(directive, 5);
        parseBase();
        return;
    }
    if (directive.length() == 0) {
        reportFatalError("Directive name is missing, expected @prefix or @base");
        return;
    }
    reportFatalError("Unknown directive \"" + directive + "\"");
}

/**
 * Pushes back the part of <tt>directive</tt> that follows its first
 * <tt>keywordLength</tt> characters, if there is any.
 */
private void unreadDirectiveTail(String directive, int keywordLength)
    throws IOException
{
    if (directive.length() > keywordLength) {
        unread(directive.substring(keywordLength));
    }
}
/**
 * Parses the body of a prefix directive: a prefix label, a ':' and a
 * namespace IRI. The resulting mapping is stored through setNamespace() and
 * reported to the handler, if one is configured.
 */
protected void parsePrefixID()
    throws IOException, RDFParseException, RDFHandlerException
{
    skipWSC();

    // Collect the prefix label (e.g. the "rdf" of "rdf:"); it ends at the
    // first ':' or whitespace character.
    final StringBuilder label = new StringBuilder(8);
    int c = readCodePoint();
    while (c != ':' && !TurtleUtil.isWhitespace(c)) {
        if (c == -1) {
            throwEOFException();
        }
        label.append(Character.toChars(c));
        c = readCodePoint();
    }
    if (c == ':') {
        // Leave the colon for the verification below.
        unread(c);
    }

    skipWSC();
    verifyCharacterOrFail(readCodePoint(), ":");
    skipWSC();

    // Read the namespace URI
    final String namespaceStr = parseURI().toString();
    final String prefixStr = label.toString();

    // Store and report this namespace mapping
    setNamespace(prefixStr, namespaceStr);
    if (rdfHandler != null) {
        rdfHandler.handleNamespace(prefixStr, namespaceStr);
    }
}
/**
 * Parses the body of a base directive: skips whitespace and comments, reads
 * an IRI and installs it as the new base URI.
 */
protected void parseBase()
    throws IOException, RDFParseException, RDFHandlerException
{
    skipWSC();
    final String newBase = parseURI().toString();
    setBaseURI(newBase);
}
/**
* Parses one triples construct: a subject followed by a predicate-object
* list. The subject is stored in the {@link #subject} field while the
* predicate-object list is parsed; subject, predicate and object are reset to
* null once the construct has been processed.
*/
protected void parseTriples()
throws IOException, RDFParseException, RDFHandlerException
{
int c = peekCodePoint();
// If the first character is an open bracket we need to decide which of
// the two parsing methods for blank nodes to use
if (c == '[') {
c = readCodePoint();
skipWSC();
c = peekCodePoint();
if (c == ']') {
// "[]": an anonymous blank node used as the subject
c = readCodePoint();
subject = createBNode();
skipWSC();
parsePredicateObjectList();
}
else {
// Put the '[' back so parseImplicitBlank() can verify it. Whitespace and
// comments consumed by skipWSC() above are not restored.
unread('[');
subject = parseImplicitBlank();
}
skipWSC();
c = peekCodePoint();
// if this is not the end of the statement, recurse into the list of
// predicate and objects, using the subject parsed above as the subject
// of the statement.
if (c != '.') {
parsePredicateObjectList();
}
}
else {
parseSubject();
skipWSC();
parsePredicateObjectList();
}
// Clear the per-statement state
subject = null;
predicate = null;
object = null;
}
/**
 * Parses a predicate followed by its object list, then any further
 * ';'-separated predicate/object-list pairs. Repeated ';' separators are
 * tolerated, and a ';' directly followed by '.', ']' or '}' ends the list.
 */
protected void parsePredicateObjectList()
    throws IOException, RDFParseException, RDFHandlerException
{
    predicate = parsePredicate();
    skipWSC();
    parseObjectList();

    while (skipWSC() == ';') {
        // consume the separator
        readCodePoint();

        final int next = skipWSC();
        final boolean endOfList = next == '.' // end of triple
                || next == ']' || next == '}'; // end of list inside a blank node
        if (endOfList) {
            break;
        }

        if (next != ';') {
            predicate = parsePredicate();
            skipWSC();
            parseObjectList();
        }
        // otherwise: empty predicateObjectList, move on to the next ';'
    }
}
/**
 * Parses one or more objects separated by ','. Each object is reported as a
 * statement by parseObject().
 */
protected void parseObjectList()
    throws IOException, RDFParseException, RDFHandlerException
{
    for (;;) {
        parseObject();
        if (skipWSC() != ',') {
            return;
        }
        // consume the ',' and move to the next object
        readCodePoint();
        skipWSC();
    }
}
/**
 * Parses the subject of a triples construct and stores it in the
 * {@link #subject} field. A subject may be a collection, a bracketed blank
 * node or any value that is a {@link Resource}; any other value is reported
 * as a fatal error.
 */
protected void parseSubject()
    throws IOException, RDFParseException, RDFHandlerException
{
    switch (peekCodePoint()) {
        case '(':
            subject = parseCollection();
            break;
        case '[':
            subject = parseImplicitBlank();
            break;
        default:
            final Value candidate = parseValue();
            if (candidate instanceof Resource) {
                subject = (Resource)candidate;
            }
            else {
                reportFatalError("Illegal subject value: " + candidate);
            }
            break;
    }
}
/**
 * Parses a predicate. The keyword 'a' followed by whitespace is the
 * short-cut for rdf:type; anything else is parsed as a value that must be a
 * {@link URI}.
 *
 * @return the predicate URI, or <tt>null</tt> after an illegal predicate
 *         value has been reported.
 */
protected URI parsePredicate()
    throws IOException, RDFParseException, RDFHandlerException
{
    final int first = readCodePoint();
    if (first == 'a') {
        final int second = readCodePoint();
        if (TurtleUtil.isWhitespace(second)) {
            // The short-cut is used
            return RDF.TYPE;
        }
        // Not the short-cut after all; push both characters back
        unread(second);
    }
    unread(first);

    // Predicate is a normal resource
    final Value candidate = parseValue();
    if (!(candidate instanceof URI)) {
        reportFatalError("Illegal predicate value: " + candidate);
        return null;
    }
    return (URI)candidate;
}
/**
 * Parses a single object (collection, bracketed blank node or plain value),
 * stores it in the {@link #object} field and reports the statement made up
 * of the current subject, predicate and object.
 */
protected void parseObject()
    throws IOException, RDFParseException, RDFHandlerException
{
    switch (peekCodePoint()) {
        case '(':
            object = parseCollection();
            break;
        case '[':
            object = parseImplicitBlank();
            break;
        default:
            object = parseValue();
            break;
    }

    reportStatement(subject, predicate, object);
}
/**
 * Parses a collection, e.g. <tt>( item1 item2 item3 )</tt>, reporting the
 * rdf:first / rdf:rest statements that make up the list.
 *
 * @return rdf:nil for an empty collection, otherwise the blank node at the
 *         head of the list.
 */
protected Resource parseCollection()
    throws IOException, RDFParseException, RDFHandlerException
{
    verifyCharacterOrFail(readCodePoint(), "(");

    if (skipWSC() == ')') {
        // Empty list
        readCodePoint();
        return RDF.NIL;
    }

    final BNode listRoot = createBNode();

    // Save the enclosing statement context; it is restored before returning
    final Resource savedSubject = subject;
    final URI savedPredicate = predicate;

    // The list head becomes the subject, rdf:first the predicate
    subject = listRoot;
    predicate = RDF.FIRST;
    parseObject();

    BNode tail = listRoot;
    while (skipWSC() != ')') {
        // Append a new list node and link it to the previous one
        final BNode next = createBNode();
        reportStatement(tail, RDF.REST, next);

        tail = next;
        subject = next;
        parseObject();
    }

    // Consume ')'
    readCodePoint();

    // Terminate the list
    reportStatement(tail, RDF.REST, RDF.NIL);

    subject = savedSubject;
    predicate = savedPredicate;

    return listRoot;
}
/**
 * Parses an implicit blank node: either the token <tt>[]</tt> or a
 * predicate-object list enclosed in square brackets, whose statements get
 * the new blank node as their subject.
 *
 * @return the newly created blank node.
 */
protected Resource parseImplicitBlank()
    throws IOException, RDFParseException, RDFHandlerException
{
    verifyCharacterOrFail(readCodePoint(), "[");

    final BNode bNode = createBNode();

    final int c = readCodePoint();
    if (c == ']') {
        return bNode;
    }
    unread(c);

    // Save the enclosing statement context
    final Resource savedSubject = subject;
    final URI savedPredicate = predicate;

    // The new blank node is the subject of the nested list
    subject = bNode;

    skipWSC();
    parsePredicateObjectList();
    skipWSC();

    // Read closing bracket
    verifyCharacterOrFail(readCodePoint(), "]");

    subject = savedSubject;
    predicate = savedPredicate;

    return bNode;
}
/**
 * Parses an RDF value, choosing the concrete parse method from the next
 * code point: IRI reference, prefixed name or boolean, node ID, quoted
 * literal, or number.
 *
 * @return the parsed value, or <tt>null</tt> after an error has been
 *         reported.
 */
protected Value parseValue()
    throws IOException, RDFParseException, RDFHandlerException
{
    final int next = peekCodePoint();

    if (next == '<') {
        // IRI reference
        return parseURI();
    }
    if (next == ':' || TurtleUtil.isPrefixStartChar(next)) {
        // prefixed name or boolean
        return parseQNameOrBoolean();
    }
    if (next == '_') {
        // node ID, e.g. _:n1
        return parseNodeID();
    }
    if (next == '"' || next == '\'') {
        // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo'''
        return parseQuotedLiteral();
    }
    if (ASCIIUtil.isNumber(next) || next == '.' || next == '+' || next == '-') {
        // integer or double, e.g. 123 or 1.2e3
        return parseNumber();
    }
    if (next == -1) {
        throwEOFException();
        return null;
    }

    reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(next)) + "'");
    return null;
}
/**
 * Parses a quoted string that may be followed by a language tag
 * (<tt>@lang</tt>) or a datatype (<tt>^^datatype</tt>).
 *
 * @return the literal, or <tt>null</tt> after an illegal datatype value has
 *         been reported.
 */
protected Literal parseQuotedLiteral()
    throws IOException, RDFParseException, RDFHandlerException
{
    final String label = parseQuotedString();

    final int marker = peekCodePoint();

    if (marker == '^') {
        readCodePoint();

        // the datatype marker is a double '^'
        verifyCharacterOrFail(readCodePoint(), "^");

        skipWSC();

        final Value datatype = parseValue();
        if (!(datatype instanceof URI)) {
            reportFatalError("Illegal datatype value: " + datatype);
            return null;
        }
        return createLiteral(label, null, (URI)datatype, getLineNumber(), -1);
    }

    if (marker != '@') {
        // plain literal
        return createLiteral(label, null, null, getLineNumber(), -1);
    }

    // Language-tagged literal: consume the '@' and collect the tag
    readCodePoint();

    final StringBuilder lang = new StringBuilder(8);

    int c = readCodePoint();
    if (c == -1) {
        throwEOFException();
    }

    final boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS);
    if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) {
        reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                BasicParserSettings.VERIFY_LANGUAGE_TAGS);
    }
    lang.append(Character.toChars(c));

    for (c = readCodePoint(); !TurtleUtil.isWhitespace(c); c = readCodePoint()) {
        // SES-1887: a language tag need not be followed by whitespace, so
        // statement punctuation and end of input also terminate the tag
        if (c == '.' || c == ';' || c == ',' || c == ')' || c == ']' || c == -1) {
            break;
        }
        if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) {
            reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_LANGUAGE_TAGS);
        }
        lang.append(Character.toChars(c));
    }

    // The terminating character is not part of the tag
    unread(c);

    return createLiteral(label, lang.toString(), null, getLineNumber(), -1);
}
/**
 * Parses a quoted string, which is either a "normal string" or a """long
 * string""" (single quotes are accepted as well), and decodes its escape
 * sequences.
 *
 * @return the decoded string; if decoding fails and the reported error does
 *         not abort parsing, the undecoded text is returned.
 */
protected String parseQuotedString()
    throws IOException, RDFParseException
{
    final int quote = readCodePoint();

    // The opening character should be '"' or "'"
    verifyCharacterOrFail(quote, "\"\'");

    // A long string opens with three identical quote characters
    final int second = readCodePoint();
    final int third = readCodePoint();
    final boolean tripleQuoted = (quote == '"' || quote == '\'') && second == quote && third == quote;

    String text;
    if (tripleQuoted) {
        text = parseLongString(second);
    }
    else {
        // Normal string: the two extra characters belong to its content
        unread(third);
        unread(second);
        text = parseString(quote);
    }

    // Unescape any escape sequences
    try {
        text = TurtleUtil.decodeString(text);
    }
    catch (IllegalArgumentException e) {
        reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
    }

    return text;
}
/**
 * Parses a "normal string" up to, and excluding, the closing quote. The
 * opening quote must already have been consumed. Backslash escapes are
 * copied verbatim together with the character they escape.
 *
 * @param closingCharacter
 *        the quote character that terminates the string.
 * @return the raw (still escaped) string content.
 */
protected String parseString(int closingCharacter)
    throws IOException, RDFParseException
{
    final StringBuilder content = new StringBuilder(32);

    for (int c = readCodePoint(); c != closingCharacter; c = readCodePoint()) {
        if (c == -1) {
            throwEOFException();
        }
        content.append(Character.toChars(c));

        if (c == '\\') {
            // The escaped character might be a quote; copy it unexamined
            final int escaped = readCodePoint();
            if (escaped == -1) {
                throwEOFException();
            }
            content.append(Character.toChars(escaped));
        }
    }

    return content.toString();
}
/**
 * Parses a """long string""" up to its terminating run of three quote
 * characters. The three opening quotes must already have been consumed.
 * Backslash escapes are copied verbatim together with the character they
 * escape.
 *
 * @param closingCharacter
 *        the quote character used to delimit the string.
 * @return the raw (still escaped) string content, without the closing
 *         quotes.
 */
protected String parseLongString(int closingCharacter)
    throws IOException, RDFParseException
{
    final StringBuilder content = new StringBuilder(1024);

    int quoteRun = 0;
    while (quoteRun < 3) {
        final int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        }

        quoteRun = (c == closingCharacter) ? quoteRun + 1 : 0;
        content.append(Character.toChars(c));

        if (c == '\\') {
            // The escaped character might be a quote; it does not count
            // towards the closing run
            final int escaped = readCodePoint();
            if (escaped == -1) {
                throwEOFException();
            }
            content.append(Character.toChars(escaped));
        }
    }

    // Drop the three closing quotes that were appended above
    return content.substring(0, content.length() - 3);
}
/**
* Parses a numeric literal: an optional sign, digits, an optional fractional
* part and an optional exponent. The datatype starts out as xsd:integer,
* becomes xsd:decimal once a fractional part has been read, and xsd:double
* once an exponent has been read. The character following the number is
* pushed back.
*
* @return a typed literal carrying the lexical value exactly as read.
*/
protected Literal parseNumber()
throws IOException, RDFParseException
{
StringBuilder value = new StringBuilder(8);
URI datatype = XMLSchema.INTEGER;
int c = readCodePoint();
// read optional sign character
if (c == '+' || c == '-') {
value.append(Character.toChars(c));
c = readCodePoint();
}
// read the integer digits
while (ASCIIUtil.isNumber(c)) {
value.append(Character.toChars(c));
c = readCodePoint();
}
if (c == '.' || c == 'e' || c == 'E') {
// read optional fractional digits
if (c == '.') {
if (TurtleUtil.isWhitespace(peekCodePoint())) {
// We're parsing an integer that did not have a space before the
// period to end the statement
}
else {
value.append(Character.toChars(c));
c = readCodePoint();
while (ASCIIUtil.isNumber(c)) {
value.append(Character.toChars(c));
c = readCodePoint();
}
// NOTE(review): a sign followed by a lone '.' ("+.") has length 2 and is
// not caught by this check.
if (value.length() == 1) {
// We've only parsed a '.'
reportFatalError("Object for statement missing");
}
// We're parsing a decimal or a double
datatype = XMLSchema.DECIMAL;
}
}
else {
if (value.length() == 0) {
// We've only parsed an 'e' or 'E'
reportFatalError("Object for statement missing");
}
}
// read optional exponent
if (c == 'e' || c == 'E') {
datatype = XMLSchema.DOUBLE;
value.append(Character.toChars(c));
c = readCodePoint();
if (c == '+' || c == '-') {
value.append(Character.toChars(c));
c = readCodePoint();
}
if (!ASCIIUtil.isNumber(c)) {
reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES);
}
// NOTE(review): if reportError() returns normally, the non-digit is still
// appended here; at end of input (c == -1) Character.toChars would throw
// IllegalArgumentException.
value.append(Character.toChars(c));
c = readCodePoint();
while (ASCIIUtil.isNumber(c)) {
value.append(Character.toChars(c));
c = readCodePoint();
}
}
}
// Unread last character, it isn't part of the number
unread(c);
// String label = value.toString();
// if (datatype.equals(XMLSchema.INTEGER)) {
// try {
// label = XMLDatatypeUtil.normalizeInteger(label);
// }
// catch (IllegalArgumentException e) {
// // Note: this should never happen because of the parse constraints
// reportError("Illegal integer value: " + label);
// }
// }
// return createLiteral(label, null, datatype);
// Return result as a typed literal
return createLiteral(value.toString(), null, datatype, getLineNumber(), -1);
}
protected URI parseURI()
throws IOException, RDFParseException
{
StringBuilder uriBuf = new StringBuilder(100);
// First character should be '<'
int c = readCodePoint();
verifyCharacterOrFail(c, "<");
// Read up to the next '>' character
while (true) {
c = readCodePoint();
if (c == '>') {
break;
}
else if (c == -1) {
throwEOFException();
}
if (c == ' ') {
reportFatalError("IRI included an unencoded space: '" + c + "'");
}
uriBuf.append(Character.toChars(c));
if (c == '\\') {
// This escapes the next character, which might be a '>'
c = readCodePoint();
if (c == -1) {
throwEOFException();
}
if (c != 'u' && c != 'U') {
reportFatalError("IRI includes string escapes: '\\" + c + "'");
}
uriBuf.append(Character.toChars(c));
}
}
if (c == '.') {
reportFatalError("IRI must not end in a '.'");
}
String uri = uriBuf.toString();
// Unescape any escape sequences
try {
// FIXME: The following decodes \n and similar in URIs, which should be
// invalid according to test
uri = TurtleUtil.decodeString(uri);
}
catch (IllegalArgumentException e) {
reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
}
return super.resolveURI(uri);
}
/**
* Parses qnames and boolean values, which have equivalent starting
* characters. A prefix that is not followed by ':' and spells "true" or
* "false" yields an xsd:boolean literal; otherwise the prefix is looked up
* via getNamespace() and combined with the optional local name into a URI.
*
* @return a boolean literal or the URI denoted by the prefixed name.
*/
protected Value parseQNameOrBoolean()
throws IOException, RDFParseException
{
// First character should be a ':' or a letter
int c = readCodePoint();
if (c == -1) {
throwEOFException();
}
if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) {
reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'",
BasicParserSettings.VERIFY_RELATIVE_URIS);
}
String namespace = null;
if (c == ':') {
// qname using default namespace
namespace = getNamespace("");
}
else {
// c is the first letter of the prefix
StringBuilder prefix = new StringBuilder(8);
prefix.append(Character.toChars(c));
// previousChar tracks the last prefix character so a trailing '.' can be
// detected below
int previousChar = c;
c = readCodePoint();
while (TurtleUtil.isPrefixChar(c)) {
prefix.append(Character.toChars(c));
previousChar = c;
c = readCodePoint();
}
if (c != ':') {
// prefix may actually be a boolean value
String value = prefix.toString();
if (value.equals("true") || value.equals("false")) {
unread(c);
return createLiteral(value, null, XMLSchema.BOOLEAN, getLineNumber(), -1);
}
}
else {
if (previousChar == '.') {
// '.' is a legal prefix name char, but can not appear at the end
reportFatalError("prefix can not end with with '.'");
}
}
// Anything that is neither a boolean nor followed by ':' fails here
verifyCharacterOrFail(c, ":");
namespace = getNamespace(prefix.toString());
}
// c == ':', read optional local name
StringBuilder localName = new StringBuilder(16);
c = readCodePoint();
if (TurtleUtil.isNameStartChar(c)) {
// A backslash introduces an escaped character, read by readLocalEscapedChar()
if (c == '\\') {
localName.append(readLocalEscapedChar());
}
else {
localName.append(Character.toChars(c));
}
int previousChar = c;
c = readCodePoint();
while (TurtleUtil.isNameChar(c)) {
if (c == '\\') {
localName.append(readLocalEscapedChar());
}
else {
localName.append(Character.toChars(c));
}
previousChar = c;
c = readCodePoint();
}
// Unread last character
unread(c);
if (previousChar == '.') {
// '.' is a legal name char, but can not appear at the end, so is
// not actually part of the name
// NOTE(review): only a single trailing '.' is removed here.
unread(previousChar);
localName.deleteCharAt(localName.length() - 1);
}
}
else {
// Unread last character
unread(c);
}
String localNameString = localName.toString();
// Every '%' in the local name must be followed by two hexadecimal digits
for (int i = 0; i < localNameString.length(); i++) {
if (localNameString.charAt(i) == '%') {
if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1))
|| !ASCIIUtil.isHex(localNameString.charAt(i + 2)))
{
reportFatalError("Found incomplete percent-encoded sequence: " + localNameString);
}
}
}
// if (c == '.') {
// reportFatalError("Blank node identifier must not end in a '.'");
// }
// Note: namespace has already been resolved
return createURI(namespace + localNameString);
}
/**
 * Reads the character that follows a backslash inside a local name and
 * checks it with {@link TurtleUtil#isLocalEscapedChar(int)}.
 *
 * @return the escaped character.
 * @throws RDFParseException
 *         If the character may not be escaped in a local name.
 */
private char readLocalEscapedChar()
    throws RDFParseException, IOException
{
    final int escaped = readCodePoint();

    if (!TurtleUtil.isLocalEscapedChar(escaped)) {
        throw new RDFParseException("found '" + new String(Character.toChars(escaped)) + "', expected one of: "
                + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS));
    }

    return (char)escaped;
}
/**
 * Parses a blank node ID, e.g. <tt>_:node1</tt>. A '.' that is followed by
 * end of input, whitespace, '&lt;' or '_' is treated as the end of the
 * statement rather than as part of the label, and is pushed back.
 *
 * @return a blank node created from the parsed label.
 * @throws IOException
 *         If an I/O error occurred while reading.
 * @throws RDFParseException
 *         If the input does not start with "_:" or ends prematurely.
 */
protected BNode parseNodeID()
    throws IOException, RDFParseException
{
    // Node ID should start with "_:"
    verifyCharacterOrFail(readCodePoint(), "_");
    verifyCharacterOrFail(readCodePoint(), ":");

    // Read the node ID
    int c = readCodePoint();
    if (c == -1) {
        throwEOFException();
    }
    else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) {
        // Render the code point in full; a (char) cast would truncate
        // supplementary characters.
        reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                BasicParserSettings.PRESERVE_BNODE_IDS);
    }

    StringBuilder name = new StringBuilder(32);
    name.append(Character.toChars(c));

    // Read all following letter and numbers, they are part of the name
    c = readCodePoint();

    // If we would never go into the loop we must unread now
    if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
        unread(c);
    }

    while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
        int previous = c;
        c = readCodePoint();

        if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) {
            // The '.' terminates the statement; it is not part of the label
            unread(c);
            unread(previous);
            break;
        }

        // Append the whole code point so that characters outside the Basic
        // Multilingual Plane keep both of their surrogates.
        name.append(Character.toChars(previous));

        if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            unread(c);
        }
    }

    return createBNode(name.toString());
}
/**
 * Creates a statement from the supplied components and passes it to the
 * configured handler, if there is one. The statement is created even when
 * no handler is set.
 */
protected void reportStatement(Resource subj, URI pred, Value obj)
    throws RDFParseException, RDFHandlerException
{
    final Statement statement = createStatement(subj, pred, obj);
    if (rdfHandler == null) {
        return;
    }
    rdfHandler.handleStatement(statement);
}
/**
 * Verifies that the supplied code point <tt>codePoint</tt> is one of the
 * characters listed in <tt>expected</tt>. End of input (-1) is reported
 * through throwEOFException(); any other mismatch is reported as a fatal
 * error that lists the expected characters and the one that was found.
 */
protected void verifyCharacterOrFail(int codePoint, String expected)
    throws RDFParseException
{
    if (codePoint == -1) {
        throwEOFException();
    }

    final String found = new String(Character.toChars(codePoint));
    if (expected.contains(found)) {
        return;
    }

    final StringBuilder message = new StringBuilder(32).append("Expected ");
    for (int i = 0; i < expected.length(); i++) {
        if (i > 0) {
            message.append(" or ");
        }
        message.append('\'').append(expected.charAt(i)).append('\'');
    }
    message.append(", found '").append(found).append("'");

    reportFatalError(message.toString());
}
/**
 * Consumes whitespace and #-style comments from the reader and counts the
 * line feeds it encounters. Afterwards the next code point delivered by the
 * reader is either a non-ignorable character or EOF; for convenience that
 * code point is also returned.
 *
 * @return The next code point that will be returned by the reader, or -1 at
 *         end of input.
 */
protected int skipWSC()
    throws IOException, RDFHandlerException
{
    int c;
    for (c = readCodePoint(); TurtleUtil.isWhitespace(c) || c == '#'; c = readCodePoint()) {
        if (c == '#') {
            processComment();
        }
        else if (c == '\n') {
            // only line feeds (LF) are counted, not carriage returns (CR), as
            // a CR is normally followed immediately by a LF.
            lineNumber++;
        }
    }

    unread(c);

    return c;
}
/**
 * Consumes characters from the reader until the first end-of-line (CR, LF or
 * CR LF) or the end of the input has been read. The text before the line
 * terminator is passed to the {@link #rdfHandler} as a comment, and the
 * current location is reported afterwards.
 * <p>
 * The line terminator is consumed here, so the line counter is advanced here
 * as well; otherwise every comment line would make reported line numbers
 * fall behind.
 */
protected void processComment()
    throws IOException, RDFHandlerException
{
    StringBuilder comment = new StringBuilder(64);
    int c = readCodePoint();
    while (c != -1 && c != 0xD && c != 0xA) {
        comment.append(Character.toChars(c));
        c = readCodePoint();
    }

    // c is equal to -1, \r or \n.
    // In case c is equal to \r, we should also read a following \n.
    if (c == 0xD) {
        c = readCodePoint();

        if (c != 0xA) {
            unread(c);
        }
    }

    // Count the consumed line feed. As in skipWSC(), only LF advances the
    // line counter; a CR that is not followed by LF does not.
    if (c == 0xA) {
        lineNumber++;
    }

    if (rdfHandler != null) {
        rdfHandler.handleComment(comment.toString());
    }
    reportLocation();
}
/**
 * Reads the next Unicode code point. A high surrogate that is followed by a
 * low surrogate is combined with it into one supplementary code point. A
 * high surrogate that is not followed by a low surrogate is returned
 * unchanged, and whatever character followed it is pushed back.
 *
 * @return the next Unicode code point, or -1 if the end of the stream has
 *         been reached.
 * @throws IOException
 *         If an I/O error occurred while reading.
 */
protected int readCodePoint()
    throws IOException
{
    int next = reader.read();
    if (next != -1 && Character.isHighSurrogate((char)next)) {
        final int low = reader.read();
        if (low != -1 && Character.isLowSurrogate((char)low)) {
            next = Character.toCodePoint((char)next, (char)low);
        }
        else if (low != -1) {
            // Not a valid pair: keep the lone surrogate as it is and make the
            // following character available again.
            reader.unread(low);
        }
    }
    return next;
}
/**
 * Pushes back a single code point so that the next call to
 * {@link #readCodePoint()} returns it again. The value -1 (end of input) is
 * ignored.
 *
 * @param codePoint
 *        a single Unicode code point, or -1.
 * @throws IOException
 *         If the code point could not be pushed back.
 */
protected void unread(int codePoint)
    throws IOException
{
    if (codePoint == -1) {
        return;
    }

    if (!Character.isSupplementaryCodePoint(codePoint)) {
        reader.unread(codePoint);
        return;
    }

    // Supplementary code points are pushed back as a surrogate pair
    reader.unread(Character.toChars(codePoint));
}
/**
 * Pushes back the supplied string by copying it to the front of the buffer.
 * After this method returns, successive calls to {@link #readCodePoint()}
 * will return the code points in the supplied string again, starting at the
 * first in the String.
 *
 * @param string
 *        the string to un-read.
 * @throws IOException
 *         If the string could not be pushed back.
 */
protected void unread(String string)
    throws IOException
{
    // Walk backwards by char index, not by code point count, so that strings
    // containing surrogate pairs are pushed back completely and pairs are
    // never split.
    int index = string.length();
    while (index > 0) {
        final int codePoint = string.codePointBefore(index);
        if (Character.isSupplementaryCodePoint(codePoint)) {
            final char[] surrogatePair = Character.toChars(codePoint);
            reader.unread(surrogatePair);
        }
        else {
            reader.unread(codePoint);
        }
        index -= Character.charCount(codePoint);
    }
}
/**
 * Returns the next Unicode code point without consuming it: the code point
 * is read and immediately pushed back.
 *
 * @return the next Unicode code point, or -1 if the end of the stream has
 *         been reached.
 * @throws IOException
 *         If an I/O error occurred while reading.
 */
protected int peekCodePoint()
    throws IOException
{
    final int upcoming = readCodePoint();
    unread(upcoming);
    return upcoming;
}
/**
* Reports the current parse location, passing the current line number and
* -1 as the second position argument.
*/
protected void reportLocation() {
// NOTE(review): -1 presumably means "column unknown" - confirm against RDFParserBase.
reportLocation(getLineNumber(), -1);
}
/**
 * Overrides {@link RDFParserBase#reportWarning(String)} so that the warning
 * carries the current line number.
 */
@Override
protected void reportWarning(String msg) {
    final int line = getLineNumber();
    reportWarning(msg, line, -1);
}
/**
 * Overrides {@link RDFParserBase#reportError(String, RioSetting)} so that
 * the error carries the current line number.
 */
@Override
protected void reportError(String msg, RioSetting setting)
    throws RDFParseException
{
    final int line = getLineNumber();
    reportError(msg, line, -1, setting);
}
/**
 * Overrides {@link RDFParserBase#reportFatalError(String)} so that the
 * fatal error carries the current line number.
 */
@Override
protected void reportFatalError(String msg)
    throws RDFParseException
{
    final int line = getLineNumber();
    reportFatalError(msg, line, -1);
}
/**
 * Overrides {@link RDFParserBase#reportFatalError(Exception)} so that the
 * fatal error carries the current line number.
 */
@Override
protected void reportFatalError(Exception e)
    throws RDFParseException
{
    final int line = getLineNumber();
    reportFatalError(e, line, -1);
}
/**
* Signals that the input ended in the middle of a construct.
*
* @throws RDFParseException
* always, with the message "Unexpected end of file". Unlike the report*
* methods of this class, no line number is attached here.
*/
protected void throwEOFException()
throws RDFParseException
{
throw new RDFParseException("Unexpected end of file");
}
/**
* Returns the line number the parser is currently at; lines are counted
* from 1.
*/
private int getLineNumber() {
return lineNumber;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy